add tokenizer for new parser
parent cec4f5fb8d
commit daa4da8037
@@ -2,6 +2,7 @@ pub mod error;
 mod literals;
 pub mod module;
 pub mod proc;
+pub mod tokens;
 
 use nom::{
     branch::alt,
@@ -0,0 +1,164 @@
//! convert text into a token stream

use std::io;
use std::fmt;
use super::{identifier, ws0, IResult, Span};
use nom::{
    branch::alt,
    bytes::complete::tag,
    character::complete::{digit1, anychar},
    combinator::{consumed, map, recognize},
    multi::many0,
    error::ParseError,
};

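// A `Token` pairs a `TokenKind` with the source `Span` it was lexed from.
// `Span`, `IResult`, `ws0` and `identifier` come from the parent module; judging by the
// `location_offset()`/`fragment()` calls below, `Span` is presumably a `nom_locate::LocatedSpan`.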
pub struct Token<'a> {
    span: Span<'a>,
    kind: TokenKind,
}

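// Debug output shows the kind, the byte offset into the source, and the matched text,
// e.g. `Ident @7 "foo"`.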
impl fmt::Debug for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?} @{} {:?}", self.kind, self.span.location_offset(), self.span.fragment())?;
        Ok(())
    }
}

impl<'a> Token<'a> {
    fn new(span: Span<'a>, kind: TokenKind) -> Self {
        Self { span, kind }
    }
}

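/// Write the token stream to `w`, one token per line, using the Debug impl above.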
pub fn pretty_tokens(mut w: impl io::Write, toks: &[Token]) -> io::Result<()> {
    for tok in toks {
        writeln!(w, "{:?}", tok)?;
    }
    Ok(())
}

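// PartialEq is required by the `token` combinator below, which compares kinds;
// Clone is used because that combinator captures a `TokenKind` and clones it on each call.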
#[derive(Debug, PartialEq, Clone)]
pub enum TokenKind {
    // no whitespace, for now
    // no token trees either, for now
    // Braces
    LParen,
    RParen,
    LAngle,
    RAngle,
    LBrace,
    RBrace,
    LSquare,
    RSquare,
    // single chars
    Colon,
    Semicolon,
    Comma,
    Caret,
    Tilde,
    Assign,
    // Multi Chars
    RArrow,
    // Literals
    Ident,
    Number,
    // Keywords
    Module,
    // Error
    Error,
}

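// Input type for the second parsing stage: a window into the token stream together with
// the absolute position of its first token, so errors can still be located.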
#[derive(Debug)]
pub struct TokenSpan<'a> {
    rest: &'a [Token<'a>],
    pos: usize,
}

impl<'a> TokenSpan<'a> {
    pub fn new(rest: &'a [Token<'a>]) -> Self { Self { rest, pos: 0 } }
    pub fn with_pos(rest: &'a [Token<'a>], pos: usize) -> Self { Self { rest, pos } }
}

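// Implementing nom's InputTake lets the standard combinators slice a TokenSpan
// the same way they slice `&str` input.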
impl nom::InputTake for TokenSpan<'_> {
    fn take(&self, count: usize) -> Self {
        TokenSpan::with_pos(&self.rest[..count], self.pos + count)
    }

    fn take_split(&self, count: usize) -> (Self, Self) {
        let (head, tail) = self.rest.split_at(count);
        (TokenSpan::with_pos(head, self.pos), TokenSpan::with_pos(tail, self.pos + count))
    }
}

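// nom_greedyerror uses Position to report how far into the token stream an error occurred.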
impl nom_greedyerror::Position for TokenSpan<'_> {
    fn position(&self) -> usize {
        self.pos
    }
}

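// Token-level combinator used by the second parsing stage: consume exactly one token of
// the requested kind. Note it indexes `input.rest[0]` directly, so it should not be
// applied to an empty TokenSpan.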
/// combinator that matches a token kind
pub fn token<'a, E>(kind: TokenKind) -> impl FnMut(TokenSpan<'a>) -> nom::IResult<TokenSpan, &Token, E>
where
    E: ParseError<TokenSpan<'a>>,
{
    move |input: TokenSpan| {
        let next = &input.rest[0];
        if next.kind == kind.clone() {
            let rest = TokenSpan::new(&input.rest[1..]);
            Ok((rest, next))
        } else {
            Err(nom::Err::Error(E::from_error_kind(input, nom::error::ErrorKind::Tag)))
        }
    }
}

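// The lexer entry point. `ws0` (from the parent module, presumably a whitespace-skipping
// wrapper) is applied around each token, and the alternatives are tried in order: the
// `module` keyword is matched before `lex_literals` so it does not lex as a plain
// identifier, and any otherwise unrecognised character becomes an Error token instead of
// aborting the whole lex.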
pub fn lex(input: Span) -> IResult<Span, Vec<Token>> {
    many0(ws0(alt((
        map(tag("module"), |span| Token::new(span, TokenKind::Module)),
        lex_literals,
        lex_braces,
        lex_punctuation,
        map(recognize(anychar), |span| Token::new(span, TokenKind::Error)),
    ))))(input)
}

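// `consumed` returns the exact input span a sub-parser matched alongside its output,
// which is how each helper below attaches the source span to the TokenKind it recognised.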
fn lex_braces(input: Span) -> IResult<Span, Token> {
    map(
        consumed(alt((
            map(tag("("), |_| TokenKind::LParen),
            map(tag(")"), |_| TokenKind::RParen),
            map(tag("<"), |_| TokenKind::LAngle),
            map(tag(">"), |_| TokenKind::RAngle),
            map(tag("{"), |_| TokenKind::LBrace),
            map(tag("}"), |_| TokenKind::RBrace),
            map(tag("["), |_| TokenKind::LSquare),
            map(tag("]"), |_| TokenKind::RSquare),
        ))),
        |(span, kind)| Token::new(span, kind),
    )(input)
}

fn lex_literals(input: Span) -> IResult<Span, Token> {
    map(
        consumed(alt((
            map(identifier, |_| TokenKind::Ident),
            map(digit1, |_| TokenKind::Number),
        ))),
        |(span, kind)| Token::new(span, kind),
    )(input)
}

fn lex_punctuation(input: Span) -> IResult<Span, Token> {
    map(
        consumed(alt((
            map(tag(":"), |_| TokenKind::Colon),
            map(tag(";"), |_| TokenKind::Semicolon),
            map(tag(","), |_| TokenKind::Comma),
            map(tag("^"), |_| TokenKind::Caret),
            map(tag("->"), |_| TokenKind::RArrow),
            map(tag("~"), |_| TokenKind::Tilde),
            map(tag("="), |_| TokenKind::Assign),
        ))),
        |(span, kind)| Token::new(span, kind),
    )(input)
}
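
A minimal usage sketch of the new tokenizer, not part of this commit: it assumes `Span` is a `nom_locate::LocatedSpan<&str>`, that `lex`, `pretty_tokens`, `token`, `TokenKind` and `TokenSpan` are reachable from the crate root (the `mylang::parser::tokens` path below is hypothetical), and that the crate's `IResult` error type implements `Debug`.

use nom_locate::LocatedSpan;
// Hypothetical import path; the real module path depends on the crate layout.
use mylang::parser::tokens::{lex, pretty_tokens, token, TokenKind, TokenSpan};

fn main() -> std::io::Result<()> {
    // First stage: text -> Vec<Token>.
    let src = LocatedSpan::new("module foo { a -> b; }");
    let (_rest, tokens) = lex(src).expect("unknown characters become Error tokens, so lexing should not fail");
    pretty_tokens(std::io::stdout(), &tokens)?;

    // Second stage: parse over the token stream; `token(kind)` consumes one token of that kind.
    let ts = TokenSpan::new(&tokens);
    let module_kw: nom::IResult<_, _, nom::error::Error<_>> = token(TokenKind::Module)(ts);
    assert!(module_kw.is_ok());
    Ok(())
}

Because whitespace handling lives entirely in `ws0` and unknown input is downgraded to Error tokens, the token stream itself carries no whitespace and the lexing stage never hard-fails, which is what the `expect` above relies on.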