From daa4da80376847134ea417aeb36c075ebb50517d Mon Sep 17 00:00:00 2001
From: NotAFile
Date: Tue, 1 Feb 2022 23:14:11 +0100
Subject: [PATCH] add tokenizer for new parser

---
 src/parser.rs        |   1 +
 src/parser/tokens.rs | 199 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 src/parser/tokens.rs

diff --git a/src/parser.rs b/src/parser.rs
index b29ee38..79345e4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,6 +2,7 @@ pub mod error;
 mod literals;
 pub mod module;
 pub mod proc;
+pub mod tokens;
 
 use nom::{
     branch::alt,
diff --git a/src/parser/tokens.rs b/src/parser/tokens.rs
new file mode 100644
index 0000000..51cbcec
--- /dev/null
+++ b/src/parser/tokens.rs
@@ -0,0 +1,199 @@
+//! convert text into a token stream
+
+use std::io;
+use std::fmt;
+use super::{identifier, ws0, IResult, Span};
+use nom::{
+    branch::alt,
+    bytes::complete::tag,
+    character::complete::{digit1, anychar},
+    combinator::{consumed, map, recognize},
+    multi::many0,
+    error::ParseError,
+};
+
+/// A single lexed token: the source span it covers plus its kind.
+pub struct Token<'a> {
+    span: Span<'a>,
+    kind: TokenKind,
+}
+
+impl fmt::Debug for Token<'_> {
+    /// Formats as `<kind> @<location offset> <fragment>`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{:?} @{} {:?}", self.kind, self.span.location_offset(), self.span.fragment())
+    }
+}
+
+impl<'a> Token<'a> {
+    /// Pairs a source span with the kind it was lexed as.
+    fn new(span: Span<'a>, kind: TokenKind) -> Self {
+        Self { span, kind }
+    }
+}
+
+/// Writes the `Debug` rendering of every token in `toks` to `w`, one per line.
+///
+/// # Errors
+///
+/// Returns the first I/O error reported by `w`.
+pub fn pretty_tokens(mut w: impl io::Write, toks: &[Token]) -> io::Result<()> {
+    for tok in toks {
+        writeln!(w, "{:?}", tok)?;
+    }
+    Ok(())
+}
+
+/// Classification of a [`Token`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenKind {
+    // no whitespace, for now
+    // no token trees either, for now
+    // Braces
+    LParen,
+    RParen,
+    LAngle,
+    RAngle,
+    LBrace,
+    RBrace,
+    LSquare,
+    RSquare,
+    // single chars
+    Colon,
+    Semicolon,
+    Comma,
+    Caret,
+    Tilde,
+    Assign,
+    // Multi Chars
+    RArrow,
+    // Literals
+    Ident,
+    Number,
+    // Keywords
+    Module,
+    // Error: any character no lexer rule recognised
+    Error,
+}
+
+/// A view into a token slice that remembers how far into the stream it starts.
+#[derive(Debug)]
+pub struct TokenSpan<'a> {
+    /// tokens covered by this span
+    rest: &'a [Token<'a>],
+    /// number of tokens that precede `rest[0]` in the stream
+    pos: usize,
+}
+
+impl<'a> TokenSpan<'a> {
+    /// Creates a span over `rest` positioned at the start of the stream.
+    pub fn new(rest: &'a [Token<'a>]) -> Self { Self { rest, pos: 0 } }
+    /// Creates a span over `rest` whose first token sits at index `pos`.
+    pub fn with_pos(rest: &'a [Token<'a>], pos: usize) -> Self { Self { rest, pos } }
+}
+
+impl nom::InputTake for TokenSpan<'_> {
+    /// Returns the first `count` tokens. Panics if fewer are available.
+    fn take(&self, count: usize) -> Self {
+        // the taken prefix begins exactly where `self` begins
+        TokenSpan::with_pos(&self.rest[..count], self.pos)
+    }
+
+    /// Splits after `count` tokens and returns `(remaining, taken)`, the order
+    /// nom's own input types use. Panics if fewer than `count` are available.
+    fn take_split(&self, count: usize) -> (Self, Self) {
+        let (taken, remaining) = self.rest.split_at(count);
+        (
+            TokenSpan::with_pos(remaining, self.pos + count),
+            TokenSpan::with_pos(taken, self.pos),
+        )
+    }
+}
+
+impl nom_greedyerror::Position for TokenSpan<'_> {
+    /// Index of this span's first token within the stream.
+    fn position(&self) -> usize {
+        self.pos
+    }
+}
+
+/// combinator that matches a token kind
+///
+/// Succeeds with a reference to the next token when its kind equals `kind`,
+/// advancing the input by one token. A mismatch, or an input with no tokens
+/// left, yields a recoverable `ErrorKind::Tag` error.
+pub fn token<'a, E>(kind: TokenKind) -> impl FnMut(TokenSpan<'a>) -> nom::IResult<TokenSpan<'a>, &'a Token<'a>, E>
+    where E: ParseError<TokenSpan<'a>>
+{
+    move |input: TokenSpan<'a>| {
+        // copy the slice reference out so `input` can still be moved into the error
+        let tokens = input.rest;
+        match tokens.split_first() {
+            Some((next, tail)) if next.kind == kind => {
+                // keep counting from the current position instead of restarting at 0
+                Ok((TokenSpan::with_pos(tail, input.pos + 1), next))
+            }
+            _ => Err(nom::Err::Error(E::from_error_kind(input, nom::error::ErrorKind::Tag))),
+        }
+    }
+}
+
+/// Lexes `input` into a list of tokens.
+///
+/// Every token parser is wrapped in `ws0`. A character that no other rule
+/// accepts becomes a `TokenKind::Error` token instead of aborting the lex.
+pub fn lex(input: Span) -> IResult<Span, Vec<Token>> {
+    many0(ws0(alt((
+        // NOTE(review): `tag` also matches the first six characters of a longer
+        // word such as `modules`; confirm whether a word-boundary check is wanted.
+        map(tag("module"), |span| Token::new(span, TokenKind::Module)),
+        lex_literals,
+        lex_braces,
+        lex_punctuation,
+        map(recognize(anychar), |span| Token::new(span, TokenKind::Error)),
+    ))))(input)
+}
+
+/// Lexes one bracket character: `(`, `)`, `<`, `>`, `{`, `}`, `[` or `]`.
+fn lex_braces(input: Span) -> IResult<Span, Token> {
+    map(
+        consumed(alt((
+            map(tag("("), |_| TokenKind::LParen),
+            map(tag(")"), |_| TokenKind::RParen),
+            map(tag("<"), |_| TokenKind::LAngle),
+            map(tag(">"), |_| TokenKind::RAngle),
+            map(tag("{"), |_| TokenKind::LBrace),
+            map(tag("}"), |_| TokenKind::RBrace),
+            map(tag("["), |_| TokenKind::LSquare),
+            map(tag("]"), |_| TokenKind::RSquare),
+        ))),
+        |(span, kind)| Token::new(span, kind),
+    )(input)
+}
+
+/// Lexes an identifier (as defined by `identifier`) or a run of decimal digits.
+fn lex_literals(input: Span) -> IResult<Span, Token> {
+    map(
+        consumed(alt((
+            map(identifier, |_| TokenKind::Ident),
+            map(digit1, |_| TokenKind::Number),
+        ))),
+        |(span, kind)| Token::new(span, kind),
+    )(input)
+}
+
+/// Lexes one punctuation token: `:`, `;`, `,`, `^`, `->`, `~` or `=`.
+fn lex_punctuation(input: Span) -> IResult<Span, Token> {
+    map(
+        consumed(alt((
+            map(tag(":"), |_| TokenKind::Colon),
+            map(tag(";"), |_| TokenKind::Semicolon),
+            map(tag(","), |_| TokenKind::Comma),
+            map(tag("^"), |_| TokenKind::Caret),
+            map(tag("->"), |_| TokenKind::RArrow),
+            map(tag("~"), |_| TokenKind::Tilde),
+            map(tag("="), |_| TokenKind::Assign),
+        ))),
+        |(span, kind)| Token::new(span, kind),
+    )(input)
+}