diff --git a/src/node.gleam b/src/node.gleam index 948084d064c51c24b2a289b4a4d3132ad123cd45..4e2f07c7525ade976545674a011c0fa2c88a41d0 100644 --- a/src/node.gleam +++ b/src/node.gleam @@ -1,4 +1,9 @@ pub type Node { Let EndOfLine + Tab + LineCommentPrefix + ValidNonAscii(String) + Printable(String) + NotEndOfLine(Node) } diff --git a/src/parser.gleam b/src/parser.gleam index aa3b599c1f4cd4df859213e59d14160e30100439..9d83150291c9954e8ea47348d80e362f2c45c694 100644 --- a/src/parser.gleam +++ b/src/parser.gleam @@ -1,7 +1,10 @@ import gleam/option.{None, Some} import gleam/string import nibble.{type Parser, do, return} -import node.{type Node, EndOfLine} +import node.{ + type Node, EndOfLine, LineCommentPrefix, NotEndOfLine, Printable, Tab, + ValidNonAscii, +} pub fn exact_string(expected: String, node: Node) -> Parser(Node, String, ctx) { use _ <- do(string.to_graphemes(expected) |> match_chars(expected)) @@ -47,3 +50,89 @@ pub fn end_of_line() -> Parser(Node, String, ctx) { return(EndOfLine) } + +pub fn tab() -> Parser(Node, String, ctx) { + use _ <- nibble.do(nibble.token("\t")) + + return(Tab) +} + +pub fn line_comment_prefix() -> Parser(Node, String, ctx) { + use _ <- nibble.do(consume_exact_string("--")) + + return(LineCommentPrefix) +} + +/// Helper function to parse a grapheme made of exactly one codepoint when that codepoint satisfies a predicate; the callback returns None for graphemes with any other number of codepoints +fn codepoint_satisfies( + predicate: fn(Int) -> Bool, + error_msg: String, +) -> Parser(String, String, ctx) { + nibble.take_map(error_msg, fn(grapheme) { + case string.to_utf_codepoints(grapheme) { + [codepoint] -> { + let cp_value = string.utf_codepoint_to_int(codepoint) + case predicate(cp_value) { + True -> Some(grapheme) + False -> None + } + } + _ -> None + } + }) +} + +/// Helper function to check if a codepoint is in the valid-non-ascii ranges +/// as defined by the Dhall ABNF specification (0x80-0x10FFFD, excluding the surrogates 0xD800-0xDFFF and the last two codepoints of every plane) +fn is_valid_non_ascii_codepoint(codepoint: Int) -> Bool { + case codepoint { + _ if codepoint >= 0x80 && codepoint <= 0xD7FF -> True + _ if codepoint >= 
0xE000 && codepoint <= 0xFFFD -> True + _ if codepoint >= 0x10000 && codepoint <= 0x1FFFD -> True + _ if codepoint >= 0x20000 && codepoint <= 0x2FFFD -> True + _ if codepoint >= 0x30000 && codepoint <= 0x3FFFD -> True + _ if codepoint >= 0x40000 && codepoint <= 0x4FFFD -> True + _ if codepoint >= 0x50000 && codepoint <= 0x5FFFD -> True + _ if codepoint >= 0x60000 && codepoint <= 0x6FFFD -> True + _ if codepoint >= 0x70000 && codepoint <= 0x7FFFD -> True + _ if codepoint >= 0x80000 && codepoint <= 0x8FFFD -> True + _ if codepoint >= 0x90000 && codepoint <= 0x9FFFD -> True + _ if codepoint >= 0xA0000 && codepoint <= 0xAFFFD -> True + _ if codepoint >= 0xB0000 && codepoint <= 0xBFFFD -> True + _ if codepoint >= 0xC0000 && codepoint <= 0xCFFFD -> True + _ if codepoint >= 0xD0000 && codepoint <= 0xDFFFD -> True + _ if codepoint >= 0xE0000 && codepoint <= 0xEFFFD -> True + _ if codepoint >= 0xF0000 && codepoint <= 0xFFFFD -> True + _ if codepoint >= 0x100000 && codepoint <= 0x10FFFD -> True + _ -> False + } +} + +/// Parser for valid-non-ascii as defined by Dhall ABNF +pub fn valid_non_ascii() -> Parser(Node, String, ctx) { + use char <- nibble.do( + codepoint_satisfies(is_valid_non_ascii_codepoint, "valid-non-ascii character"), + ) + + return(ValidNonAscii(char)) +} + +/// Parser for the ASCII range %x20-7F (the printable characters plus DEL, 0x7F) +pub fn printable() -> Parser(Node, String, ctx) { + use char <- nibble.do( + codepoint_satisfies( + fn(cp) { cp >= 0x20 && cp <= 0x7F }, + "ASCII printable character (0x20-0x7F)", + ), + ) + + return(Printable(char)) +} + +/// Parser for not-end-of-line as defined by Dhall ABNF: +/// not-end-of-line = %x20-7F / valid-non-ascii / tab +pub fn not_end_of_line() -> Parser(Node, String, ctx) { + use inner <- nibble.do(nibble.one_of([printable(), valid_non_ascii(), tab()])) + + return(NotEndOfLine(inner)) +} diff --git a/test/ghall_test.gleam b/test/ghall_test.gleam index bf3be6a7fbe6126be80aebc041585ac1421c4415..0db8be2fec3e0d795ad63ec65b22f1cc5edb7faa 
100644 --- a/test/ghall_test.gleam +++ b/test/ghall_test.gleam @@ -183,3 +183,134 @@ pub fn parse_multiple_line_endings_test() { title: "Multiple line endings should all parse as EndOfLine", ) } + +// Tests for printable parser (%x20-7F) + +pub fn parse_printable_space_test() { + let input = " " + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Ok(node.Printable(" ")) = nibble.run(tokens, parser) +} + +pub fn parse_printable_ascii_test() { + let input = "a" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Ok(node.Printable("a")) = nibble.run(tokens, parser) +} + +pub fn parse_printable_tilde_test() { + let input = "~" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Ok(node.Printable("~")) = nibble.run(tokens, parser) +} + +pub fn parse_printable_rejects_tab_test() { + let input = "\t" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Error(_) = nibble.run(tokens, parser) +} + +pub fn parse_printable_rejects_newline_test() { + let input = "\n" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Error(_) = nibble.run(tokens, parser) +} + +pub fn parse_printable_rejects_non_ascii_test() { + let input = "é" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.printable() + let assert Error(_) = nibble.run(tokens, parser) +} + +// Tests for valid-non-ascii parser + +pub fn parse_valid_non_ascii_latin_test() { + let input = "é" + // é is U+00E9, in range 0x80-0xD7FF + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.valid_non_ascii() + let assert Ok(node.ValidNonAscii("é")) = nibble.run(tokens, parser) +} + +pub fn parse_valid_non_ascii_emoji_test() { + let input = "🎉" + // 🎉 is U+1F389, in range 
0x10000-0x1FFFD + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.valid_non_ascii() + let assert Ok(node.ValidNonAscii("🎉")) = nibble.run(tokens, parser) +} + +pub fn parse_valid_non_ascii_chinese_test() { + let input = "中" + // 中 is U+4E2D, in range 0x80-0xD7FF + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.valid_non_ascii() + let assert Ok(node.ValidNonAscii("中")) = nibble.run(tokens, parser) +} + +pub fn parse_valid_non_ascii_rejects_ascii_test() { + let input = "a" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.valid_non_ascii() + let assert Error(_) = nibble.run(tokens, parser) +} + +pub fn parse_valid_non_ascii_rejects_tab_test() { + let input = "\t" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.valid_non_ascii() + let assert Error(_) = nibble.run(tokens, parser) +} + +// Tests for not-end-of-line parser + +pub fn parse_not_end_of_line_printable_test() { + let input = "a" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Ok(node.NotEndOfLine(node.Printable("a"))) = + nibble.run(tokens, parser) +} + +pub fn parse_not_end_of_line_space_test() { + let input = " " + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Ok(node.NotEndOfLine(node.Printable(" "))) = + nibble.run(tokens, parser) +} + +pub fn parse_not_end_of_line_tab_test() { + let input = "\t" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Ok(node.NotEndOfLine(node.Tab)) = nibble.run(tokens, parser) +} + +pub fn parse_not_end_of_line_valid_non_ascii_test() { + let input = "λ" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Ok(node.NotEndOfLine(node.ValidNonAscii("λ"))) = + 
nibble.run(tokens, parser) +} + +pub fn parse_not_end_of_line_rejects_newline_test() { + let input = "\n" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Error(_) = nibble.run(tokens, parser) +} + +pub fn parse_not_end_of_line_rejects_carriage_return_test() { + let input = "\r" + let tokens = quasi_lexer.chars() |> quasi_lexer.run(on: input) + let parser = parser.not_end_of_line() + let assert Error(_) = nibble.run(tokens, parser) +}