handwritten lexer

2021-07-17 19:41:26 +02:00 · 2021-07-17 19:41:26 +02:00 · 378f463b7c
commit 378f463b7c
parent dacb6e49e2
10 changed files with 200 additions and 61 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -238,7 +238,6 @@ dependencies = [
 "lalrpop",
 "lalrpop-util",
 "lazy_static",
- "logos",
 "maplit",
 "pretty_assertions",
 "regex",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -15,7 +15,6 @@ itertools = "0.10"
 thiserror = "1.0"
 maplit = "1.0"
 codespan-reporting = "0.11"
-logos = "0.12"

 derive_more = "0.99"
 smart-default = "0.6"
--- a/examples/errors.rs
+++ b/examples/errors.rs
@ -4,7 +4,10 @@ fn main() {
    let mut files = codespan_reporting::files::SimpleFiles::new();

    let input = r#"
-        (hi :bar 22 :baz {(foo == bar ? 12.K : 12)} (foo) (baz))"#;
+        (heyho :foo { "foo \" } bar " }
+               :baz {(foo == bar ? 12.2 : 12)}
+          (foo)
+          (baz))"#;

    let file_id = files.add("foo.eww", input);
    let ast = eww_config::parse_string(file_id, input);
--- a/src/lexer.rs
+++ b/src/lexer.rs
@ -1,45 +1,22 @@
-use logos::Logos;
+use regex::{Regex, RegexSet};

 use crate::{ast::Span, parse_error};

-#[derive(Logos, Debug, PartialEq, Eq, Clone)]
+/// A single lexical token.
+///
+/// Variants that carry a `String` hold the source text the lexer extracted for them.
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub enum Token {
-    #[token("(")]
    LPren,
-    #[token(")")]
    RPren,
-    #[token("[")]
    LBrack,
-    #[token("]")]
    RBrack,
-
-    #[token("true")]
    True,
-
-    #[token("false")]
    False,
-
-    #[regex(r#""(?:[^"\\]|\\.)*""#, |x| x.slice().to_string())]
    StrLit(String),
-
-    #[regex(r#"[+-]?(?:[0-9]+[.])?[0-9]+"#, priority = 2, callback = |x| x.slice().to_string())]
    NumLit(String),
-
-    #[regex(r#"[a-zA-Z_!\?<>/.*-+][^\s{}\(\)]*"#, |x| x.slice().to_string())]
    Symbol(String),
-
-    #[regex(r#":\S+"#, |x| x.slice().to_string())]
    Keyword(String),
-
-    #[regex(r#"\{[^}]*\}"#, |x| x.slice().to_string())]
    SimplExpr(String),
-
-    #[regex(r#";.*"#)]
    Comment,
-
-    #[error]
-    #[regex(r"[ \t\n\f]+", logos::skip)]
-    Error,
+    // Produced by the whitespace rule; `Lexer::next` reports a `Skip` that reaches it
+    // as a lexical error.
+    Skip,
 }

 impl std::fmt::Display for Token {
@ -57,33 +34,142 @@ impl std::fmt::Display for Token {
            Token::Keyword(x) => write!(f, "{}", x),
            Token::SimplExpr(x) => write!(f, "{{{}}}", x),
            Token::Comment => write!(f, ""),
-            Token::Error => write!(f, ""),
+            Token::Skip => write!(f, ""),
        }
    }
 }

+/// Iterator state of the hand-written lexer: the text being lexed and a cursor into it.
+pub struct LexIterator {
+    /// Owned copy of the complete input text.
+    source: String,
+    /// Byte offset into `source` at which the next token starts.
+    pos: usize,
+}
+
+// Declares the lexer's rule table from a list of `pattern => constructor` pairs.
+//
+// Expands to three lazily initialised statics that all share the order of the rules:
+// - `LEXER_REGEX_SET`: every pattern, prefixed with `^`, compiled into one `RegexSet`.
+// - `LEXER_REGEXES`: the same `^`-prefixed patterns compiled individually.
+// - `LEXER_FNS`: the closures that turn the matched text into a `Token`.
+//
+// The `unwrap()` calls mean an invalid pattern causes a panic when the statics are
+// initialised.
+macro_rules! regex_rules {
+    ($($regex:literal => $token:expr),*) => {
+        lazy_static::lazy_static! {
+            static ref LEXER_REGEX_SET: RegexSet = RegexSet::new(&[
+                $(format!("^{}", $regex)),*
+            ]).unwrap();
+            static ref LEXER_REGEXES: Vec<Regex> = vec![
+                $(Regex::new(&format!("^{}", $regex)).unwrap()),*
+            ];
+            static ref LEXER_FNS: Vec<Box<dyn Fn(String) -> Token + Sync>> = vec![
+                $(Box::new($token)),*
+            ];
+        }
+    }
+}
+
+// The lexer's rule table. The position of each rule is the index shared by
+// `LEXER_REGEX_SET`, `LEXER_REGEXES` and `LEXER_FNS`; when several rules match,
+// the lexer uses the one listed first.
+regex_rules! {
+        r"\(" => |_| Token::LPren,
+        r"\)" => |_| Token::RPren,
+        r"\[" => |_| Token::LBrack,
+        // A closing bracket must yield `RBrack`; it used to yield `LBrack`.
+        r"\]" => |_| Token::RBrack,
+        r"true" => |_| Token::True,
+        r"false" => |_| Token::False,
+        r#""(?:[^"\\]|\\.)*""# => |x| Token::StrLit(x),
+        r#"[+-]?(?:[0-9]+[.])?[0-9]+"# => |x| Token::NumLit(x),
+        r#"[a-zA-Z_!\?<>/.*-+][^\s{}\(\)]*"# => |x| Token::Symbol(x),
+        r#":\S+"# => |x| Token::Keyword(x),
+        r#";.*"# => |_| Token::Comment,
+        r"[ \t\n\f]+" => |_| Token::Skip
+}
+
+impl Iterator for LexIterator {
+    /// `(start, token, end)`, where `start` and `end` are byte offsets into `source`.
+    type Item = (usize, Token, usize);
+
+    /// Produces the next token, or `None` once the whole input has been consumed.
+    ///
+    /// `{ ... }` blocks are scanned by hand so that a `}` inside a double-quoted string
+    /// does not end the block; the resulting `SimplExpr` holds the text between the
+    /// braces and its span excludes the braces. Everything else goes through the
+    /// regex rule table, and whitespace matches are dropped.
+    ///
+    /// Malformed input never panics. An unterminated `{` block, or a character that no
+    /// rule matches, is returned as `Token::Skip` with the offending span; the `Lexer`
+    /// wrapper reports a `Skip` as a lexical error.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if self.pos >= self.source.len() {
+                return None;
+            }
+            let string = &self.source[self.pos..];
+
+            if string.starts_with('{') {
+                self.pos += 1;
+                let expr_start = self.pos;
+                let mut in_string = false;
+                loop {
+                    let string = &self.source[self.pos..];
+                    let c = match string.chars().next() {
+                        Some(c) => c,
+                        None => {
+                            // Input ended before the closing `}`: report the block
+                            // (including its opening brace) instead of silently
+                            // ending the token stream.
+                            return Some((expr_start - 1, Token::Skip, self.pos));
+                        }
+                    };
+
+                    if c == '}' && !in_string {
+                        let tok_str = &self.source[expr_start..self.pos];
+                        self.pos += 1;
+                        return Some((expr_start, Token::SimplExpr(tok_str.to_string()), self.pos - 1));
+                    } else if c == '"' {
+                        self.pos += 1;
+                        in_string = !in_string;
+                    } else if string.starts_with("\\\"") {
+                        // An escaped quote does not open or close a string.
+                        self.pos += 2;
+                    } else {
+                        // Step over the whole character so `pos` always stays on a
+                        // UTF-8 boundary; a one-byte step would make the next slice
+                        // panic on multi-byte characters.
+                        self.pos += c.len_utf8();
+                    }
+                }
+            } else {
+                let old_pos = self.pos;
+
+                // The rule listed first wins when several rules match.
+                let i = match LEXER_REGEX_SET.matches(string).into_iter().next() {
+                    Some(i) => i,
+                    None => {
+                        // No rule matches here (e.g. an unterminated string literal).
+                        // Consume one character so lexing makes progress and hand the
+                        // span back as an error marker.
+                        self.pos += string.chars().next().map_or(1, char::len_utf8);
+                        return Some((old_pos, Token::Skip, self.pos));
+                    }
+                };
+                let len = LEXER_REGEXES[i]
+                    .find(string)
+                    .expect("LEXER_REGEX_SET and LEXER_REGEXES are built from the same patterns")
+                    .end();
+
+                let tok_str = &self.source[old_pos..old_pos + len];
+                self.pos += len;
+                match LEXER_FNS[i](tok_str.to_string()) {
+                    // Whitespace: keep scanning for a real token.
+                    Token::Skip => {}
+                    token => return Some((old_pos, token, self.pos)),
+                }
+            }
+        }
+    }
+}
+
+// Test helper: for every string literal given, lexes it with a fresh `LexIterator`
+// starting at offset 0, keeps only the token of each `(start, token, end)` triple and
+// checks the resulting `Vec` against an insta debug snapshot.
+macro_rules! test_lexer {
+    ($($text:literal),*) => {{
+        ::insta::with_settings!({sort_maps => true}, {
+            $(
+            ::insta::assert_debug_snapshot!(
+                LexIterator { pos: 0, source: $text.to_string() }.map(|x| x.1).collect::<Vec<_>>()
+            );
+            )*
+        });
+    }}
+}
+
+// Snapshot test with two inputs: a string literal containing an escaped quote, and a
+// `{ ... }` block whose string contains a `}`.
+#[test]
+fn test() {
+    test_lexer!(r#"(test "h\"i")"#, r#"(foo { "}" })"#);
+}
+
+/// Either a `(start, token, end)` triple or an error; the item type yielded by `Lexer`.
 pub type SpannedResult<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;

-pub struct Lexer<'input> {
+/// Wraps a `LexIterator` together with the id of the file being lexed.
+pub struct Lexer {
+    // Stored in the `Span` of every lexical error this lexer reports.
    file_id: usize,
-    lexer: logos::SpannedIter<'input, Token>,
+    // The underlying token iterator.
+    lexer: LexIterator,
 }

-impl<'input> Lexer<'input> {
-    pub fn new(file_id: usize, text: &'input str) -> Self {
-        Lexer { file_id, lexer: logos::Lexer::new(text).spanned() }
+impl Lexer {
+    /// Creates a lexer over `text`, positioned at its start.
+    ///
+    /// `text` is copied into an owned `String`, so the lexer does not borrow from the
+    /// caller. `file_id` is only used for the spans of reported errors.
+    pub fn new(file_id: usize, text: &str) -> Self {
+        Lexer { file_id, lexer: LexIterator { source: text.to_string(), pos: 0 } }
    }
 }

-impl<'input> Iterator for Lexer<'input> {
+// Adapts the inner `LexIterator` to the `Result`-based `SpannedResult` item type.
+impl Iterator for Lexer {
    type Item = SpannedResult<Token, usize, parse_error::ParseError>;

    fn next(&mut self) -> Option<Self::Item> {
-        let (token, range) = self.lexer.next()?;
-        if token == Token::Error {
-            Some(Err(parse_error::ParseError::LexicalError(Span(range.start, range.end, self.file_id))))
+        // `?` ends iteration as soon as the inner iterator is exhausted.
+        let (l, token, r) = self.lexer.next()?;
+        // A `Skip` token arriving here is reported as a lexical error covering its
+        // span; every other token is passed through unchanged.
+        if token == Token::Skip {
+            Some(Err(parse_error::ParseError::LexicalError(Span(l, r, self.file_id))))
        } else {
-            Some(Ok((range.start, token, range.end)))
+            Some(Ok((l, token, r)))
        }
    }
 }
--- a/src/snapshots/eww_configconfigtest__test.snap.new
+++ b/src/snapshots/eww_configconfigtest__test.snap.new
@ -1,21 +0,0 @@
---
-source: src/config.rs
-expression: "Element::<Expr, Expr>::from_expr(parser.parse(0, lexer).unwrap()).unwrap()"
-
---
-Element {
-    name: "box",
-    attrs: {
-        ":baz": Value<18..22>(hi),
-        ":bar": Value<10..12>(12),
-    },
-    children: [
-        Symbol<23..26>(foo),
-        List<27..32>(
-            [
-                Symbol<28..31>(bar),
-            ],
-        ),
-    ],
-    span: 0..33,
-}
--- a/src/snapshots/eww_configlexertest-13.snap
+++ b/src/snapshots/eww_configlexertest-13.snap
@ -0,0 +1,15 @@
+---
+source: src/lexer.rs
+expression: "LexIterator{pos: 0,\n            source:\n                r#\"(test \" hi \")\"#.to_string(),}.map(|x|\n                                                         x.1).collect::<Vec<_>>()"
+
+---
+[
+    LPren,
+    Symbol(
+        "test",
+    ),
+    StrLit(
+        "\" hi \"",
+    ),
+    RPren,
+]
--- a/src/snapshots/eww_configlexertest-2.snap
+++ b/src/snapshots/eww_configlexertest-2.snap
@ -0,0 +1,15 @@
+---
+source: src/lexer.rs
+expression: "LexIterator{pos: 0,\n            source:\n                r#\"(foo { \"}\" })\"#.to_string(),}.map(|x|\n                                                         x.1).collect::<Vec<_>>()"
+
+---
+[
+    LPren,
+    Symbol(
+        "foo",
+    ),
+    SimplExpr(
+        " \"}\" ",
+    ),
+    RPren,
+]
--- a/src/snapshots/eww_configlexertest-4.snap
+++ b/src/snapshots/eww_configlexertest-4.snap
@ -0,0 +1,10 @@
+---
+source: src/lexer.rs
+expression: "LexIterator{pos: 0,\n            source: \"-1.2\".to_string(),}.map(|x| x.1).collect::<Vec<_>>()"
+
+---
+[
+    NumLit(
+        "-1.2",
+    ),
+]
--- a/src/snapshots/eww_configlexertest-6.snap
+++ b/src/snapshots/eww_configlexertest-6.snap
@ -0,0 +1,18 @@
+---
+source: src/lexer.rs
+expression: "LexIterator{pos: 0,\n            source:\n                \"(1 :foo 1)\".to_string(),}.map(|x| x.1).collect::<Vec<_>>()"
+
+---
+[
+    LPren,
+    NumLit(
+        "1",
+    ),
+    Keyword(
+        ":foo",
+    ),
+    NumLit(
+        "1",
+    ),
+    RPren,
+]
--- a/src/snapshots/eww_configlexertest.snap
+++ b/src/snapshots/eww_configlexertest.snap
@ -0,0 +1,15 @@
+---
+source: src/lexer.rs
+expression: "LexIterator{pos: 0,\n            source:\n                r#\"(test \"h\\\"i\")\"#.to_string(),}.map(|x|\n                                                         x.1).collect::<Vec<_>>()"
+
+---
+[
+    LPren,
+    Symbol(
+        "test",
+    ),
+    StrLit(
+        "\"h\\\"i\"",
+    ),
+    RPren,
+]