keyword parser + test of multiline lexing

Phillip Davis created

- add birdie for more sane testing
- parsing '\n' results in col_end=0 always, and also row_end = row_start
  + 1 always. This is kind of annoying, but I think it's fine.

Change summary

.gitignore                                                       |  1 
birdie_snapshots/quasi_lexer_spans_with_multiline_input.accepted | 20 
birdie_snapshots/should_fail_to_parse_'lt'_as_node_let.accepted  | 10 
gleam.toml                                                       |  1 
manifest.toml                                                    | 16 
src/ghall.gleam                                                  |  7 
src/node.gleam                                                   |  4 
src/parser.gleam                                                 | 37 +
src/quasi_lexer.gleam                                            |  6 
test/ghall_test.gleam                                            | 61 ++
10 files changed, 157 insertions(+), 6 deletions(-)

Detailed changes

.gitignore 🔗

@@ -2,3 +2,4 @@
 *.ez
 /build
 erl_crash.dump
+.mcp.json

birdie_snapshots/quasi_lexer_spans_with_multiline_input.accepted 🔗

@@ -0,0 +1,20 @@
+---
+version: 1.4.1
+title: Quasi lexer spans with multiline input
+file: ./test/ghall_test.gleam
+test_name: quasi_lexer_off_by_one_test
+---
+Token 0: 'l' at Span(row_start: 1, col_start: 1, row_end: 1, col_end: 2)
+Token 1: 'e' at Span(row_start: 1, col_start: 2, row_end: 1, col_end: 3)
+Token 2: 't' at Span(row_start: 1, col_start: 3, row_end: 1, col_end: 4)
+Token 3: ' ' at Span(row_start: 1, col_start: 4, row_end: 1, col_end: 5)
+Token 4: 'x' at Span(row_start: 1, col_start: 5, row_end: 1, col_end: 6)
+Token 5: '1' at Span(row_start: 1, col_start: 6, row_end: 1, col_end: 7)
+Token 6: ' ' at Span(row_start: 1, col_start: 7, row_end: 1, col_end: 8)
+Token 7: '=' at Span(row_start: 1, col_start: 8, row_end: 1, col_end: 9)
+Token 8: '
+' at Span(row_start: 1, col_start: 9, row_end: 2, col_end: 1)
+Token 9: ' ' at Span(row_start: 2, col_start: 1, row_end: 2, col_end: 2)
+Token 10: 'e' at Span(row_start: 2, col_start: 2, row_end: 2, col_end: 3)
+Token 11: '1' at Span(row_start: 2, col_start: 3, row_end: 2, col_end: 4)
+

gleam.toml 🔗

@@ -18,3 +18,4 @@ nibble = ">= 1.1.4 and < 2.0.0"
 
 [dev-dependencies]
 gleeunit = ">= 1.0.0 and < 2.0.0"
+birdie = ">= 1.4.1 and < 2.0.0"

manifest.toml 🔗

@@ -2,15 +2,31 @@
 # You typically do not need to edit this file
 
 packages = [
+  { name = "argv", version = "1.0.2", build_tools = ["gleam"], requirements = [], otp_app = "argv", source = "hex", outer_checksum = "BA1FF0929525DEBA1CE67256E5ADF77A7CDDFE729E3E3F57A5BDCAA031DED09D" },
+  { name = "birdie", version = "1.4.1", build_tools = ["gleam"], requirements = ["argv", "edit_distance", "filepath", "glance", "gleam_community_ansi", "gleam_stdlib", "justin", "rank", "simplifile", "term_size", "trie_again"], otp_app = "birdie", source = "hex", outer_checksum = "18599E478C14BD9EBD2465F0561F96EB9B58A24DB44AF86F103EF81D4B9834BF" },
+  { name = "edit_distance", version = "3.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "edit_distance", source = "hex", outer_checksum = "7DC465C34695F9E57D79FC65670C53C992CE342BF29E0AA41FF44F61AF62FC56" },
+  { name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" },
+  { name = "glance", version = "5.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib", "glexer"], otp_app = "glance", source = "hex", outer_checksum = "7F216D97935465FF4AC46699CD1C3E0FB19CB678B002E4ACAFCE256E96312F14" },
+  { name = "gleam_community_ansi", version = "1.4.3", build_tools = ["gleam"], requirements = ["gleam_community_colour", "gleam_regexp", "gleam_stdlib"], otp_app = "gleam_community_ansi", source = "hex", outer_checksum = "8A62AE9CC6EA65BEA630D95016D6C07E4F9973565FA3D0DE68DC4200D8E0DD27" },
+  { name = "gleam_community_colour", version = "2.0.2", build_tools = ["gleam"], requirements = ["gleam_json", "gleam_stdlib"], otp_app = "gleam_community_colour", source = "hex", outer_checksum = "E34DD2C896AC3792151EDA939DA435FF3B69922F33415ED3C4406C932FBE9634" },
+  { name = "gleam_json", version = "3.0.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_json", source = "hex", outer_checksum = "874FA3C3BB6E22DD2BB111966BD40B3759E9094E05257899A7C08F5DE77EC049" },
   { name = "gleam_regexp", version = "1.1.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_regexp", source = "hex", outer_checksum = "9C215C6CA84A5B35BB934A9B61A9A306EC743153BE2B0425A0D032E477B062A9" },
   { name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" },
   { name = "gleam_yielder", version = "1.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_yielder", source = "hex", outer_checksum = "8E4E4ECFA7982859F430C57F549200C7749823C106759F4A19A78AEA6687717A" },
   { name = "gleeunit", version = "1.6.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "FDC68A8C492B1E9B429249062CD9BAC9B5538C6FBF584817205D0998C42E1DAC" },
+  { name = "glexer", version = "2.3.0", build_tools = ["gleam"], requirements = ["gleam_stdlib", "splitter"], otp_app = "glexer", source = "hex", outer_checksum = "40A1FB0919FA080AD6C5809B4C7DBA545841CAAC8168FACDFA0B0667C22475CC" },
   { name = "iv", version = "1.3.2", build_tools = ["gleam"], requirements = ["gleam_stdlib", "gleam_yielder"], otp_app = "iv", source = "hex", outer_checksum = "1FE22E047705BE69EA366E3A2E73C2E1310CBCB27DDE767DE17AE3FA86499947" },
+  { name = "justin", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "justin", source = "hex", outer_checksum = "7FA0C6DB78640C6DC5FBFD59BF3456009F3F8B485BF6825E97E1EB44E9A1E2CD" },
   { name = "nibble", version = "1.1.4", build_tools = ["gleam"], requirements = ["gleam_regexp", "gleam_stdlib", "iv"], otp_app = "nibble", source = "hex", outer_checksum = "06397501730FF486AE6F99299982A33F5EA9F8945B5A25920C82C8F924CEA481" },
+  { name = "rank", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "rank", source = "hex", outer_checksum = "5660E361F0E49CBB714CC57CC4C89C63415D8986F05B2DA0C719D5642FAD91C9" },
+  { name = "simplifile", version = "2.3.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0A868DAC6063D9E983477981839810DC2E553285AB4588B87E3E9C96A7FB4CB4" },
+  { name = "splitter", version = "1.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "splitter", source = "hex", outer_checksum = "05564A381580395DCDEFF4F88A64B021E8DAFA6540AE99B4623962F52976AA9D" },
+  { name = "term_size", version = "1.0.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "term_size", source = "hex", outer_checksum = "D00BD2BC8FB3EBB7E6AE076F3F1FF2AC9D5ED1805F004D0896C784D06C6645F1" },
+  { name = "trie_again", version = "1.1.4", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "trie_again", source = "hex", outer_checksum = "E3BD66B4E126EF567EA8C4944EAB216413392ADF6C16C36047AF79EE5EF13466" },
 ]
 
 [requirements]
+birdie = { version = ">= 1.4.1 and < 2.0.0" }
 gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" }
 gleeunit = { version = ">= 1.0.0 and < 2.0.0" }
 nibble = { version = ">= 1.1.4 and < 2.0.0" }

src/ghall.gleam 🔗

@@ -1,10 +1,11 @@
+import gleam/io
 import gleam/list
+import gleam/string
+import nibble
 import nibble/lexer.{type Token, Token}
+import parser
 import quasi_lexer
 
 pub fn main() -> Nil {
-  let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: "let x1 = e1")
-  echo tokens
-
   Nil
 }

src/parser.gleam 🔗

@@ -0,0 +1,37 @@
+import gleam/option.{None, Some}
+import gleam/string
+import nibble.{type Parser, do, return}
+import node.{type Node}
+
+/// Parse a keyword string and return the specified Node on success
+pub fn keyword(expected: String, node: Node) -> Parser(Node, String, ctx) {
+  use _ <- do(string.to_graphemes(expected) |> match_chars(expected))
+  return(node)
+}
+
+/// Recursively match each character in the list
+fn match_chars(chars: List(String), context: String) -> Parser(Nil, String, ctx) {
+  case chars {
+    [] -> return(Nil)
+
+    [first, ..rest] -> {
+      use _ <- do(
+        nibble.take_map(
+          "expected '" <> first <> "' in keyword '" <> context <> "'",
+          fn(tok) {
+            case tok == first {
+              True -> Some(Nil)
+              False -> None
+            }
+          },
+        ),
+      )
+      match_chars(rest, context)
+    }
+  }
+}
+
+/// Parse the "let" keyword
+pub fn let_keyword() -> Parser(Node, String, ctx) {
+  keyword("let", node.Let)
+}

src/quasi_lexer.gleam 🔗

@@ -12,6 +12,8 @@ pub fn run(
   let assert Ok(tokens) = lexer.run(input, lexer)
 
   // Nibble's lexer prepends an empty string to the quasi_lexer's
-  // otherwise acceptable output
-  tokens |> list.drop(1)
+  // otherwise acceptable output. After dropping it, we need to decrement
+  // column values on the first row since the empty token advanced the column counter
+  tokens
+  |> list.drop(1)
 }

test/ghall_test.gleam 🔗

@@ -1,6 +1,11 @@
+import birdie
+import gleam/int
 import gleam/list
 import gleeunit
+import nibble.{Expected}
 import nibble/lexer.{Span, Token}
+import node
+import parser
 import quasi_lexer
 
 pub fn main() -> Nil {
@@ -20,3 +25,59 @@ pub fn simple_quasi_lexer_test() {
     assert col_end == col_start + 1
   })
 }
+
+pub fn quasi_lexer_off_by_one_test() {
+  let input = "let x1 =\n e1"
+  let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input)
+
+  let snap =
+    tokens
+    |> list.index_map(fn(token, index) {
+      let Token(Span(rs, cs, re, ce), lexeme, _) = token
+      "Token "
+      <> int.to_string(index)
+      <> ": '"
+      <> lexeme
+      <> "' at Span(row_start: "
+      <> int.to_string(rs)
+      <> ", col_start: "
+      <> int.to_string(cs)
+      <> ", row_end: "
+      <> int.to_string(re)
+      <> ", col_end: "
+      <> int.to_string(ce)
+      <> ")\n"
+    })
+    |> list.fold("", fn(acc, line) { acc <> line })
+
+  birdie.snap(snap, title: "Quasi lexer spans with multiline input")
+}
+
+pub fn parse_let_successfully_test() {
+  let input = "let"
+  let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input)
+  let parser = parser.keyword("let", node.Let)
+  let assert Ok(_) = nibble.run(tokens, parser)
+}
+
+pub fn parse_let_failing_test() {
+  let input = "lt"
+  let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input)
+  let parser = parser.keyword("let", node.Let)
+  let assert Error(error) = nibble.run(tokens, parser)
+  let assert [nibble.DeadEnd(Span(_, cs, _, _), Expected(msg, got: got), [])] =
+    error
+
+  let snap =
+    "Msg: "
+    <> msg
+    <> "\n"
+    <> "Got: "
+    <> got
+    <> "\n"
+    <> "At column: "
+    <> int.to_string(cs)
+    <> "\n"
+
+  birdie.snap(snap, title: "Should fail to parse 'lt' as node.Let")
+}