tokenize.rs

  1use std::{iter::Peekable, str::CharIndices};
  2
  3#[derive(Clone, Copy, PartialEq, Eq)]
  4enum CharClass {
  5    Identifier,
  6    Newline,
  7    Whitespace,
  8    Punctuation,
  9}
 10
/// Multi-character punctuation sequences recognized as single tokens.
///
/// Ordering is load-bearing: entries are sorted by descending length
/// (4-char, then 3-char, then 2-char) because the matcher takes the FIRST
/// entry that prefixes the remaining input — longest-match only works if
/// longer sequences appear before their prefixes (e.g. ">>=" before ">>").
const MULTI_CHAR_PUNCTUATION: &[&str] = &[
    ">>>=", "<<=", ">>=", "...", "..=", "??=", "**=", ">>>", "::", "->", "=>", "==", "!=", "<=",
    ">=", "&&", "||", "<<", ">>", "..", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "++", "--",
    "**", "??", "?.", ":=", "<-", "//", "/*", "*/",
];
 16
 17fn char_class(character: char) -> CharClass {
 18    if character == '\n' || character == '\r' {
 19        CharClass::Newline
 20    } else if character.is_whitespace() {
 21        CharClass::Whitespace
 22    } else if character.is_alphanumeric() || character == '_' {
 23        CharClass::Identifier
 24    } else {
 25        CharClass::Punctuation
 26    }
 27}
 28
 29fn is_identifier_boundary(previous: char, current: char, next: Option<char>) -> bool {
 30    (current.is_uppercase() && (previous.is_lowercase() || previous.is_numeric()))
 31        || (current.is_uppercase()
 32            && previous.is_uppercase()
 33            && next.is_some_and(|next| next.is_lowercase()))
 34}
 35
 36fn push_identifier_tokens<'a>(identifier: &'a str, tokens: &mut Vec<&'a str>) {
 37    let characters: Vec<(usize, char)> = identifier.char_indices().collect();
 38    let mut segment_start = 0;
 39    let mut index = 0;
 40
 41    while index < characters.len() {
 42        let (byte_index, character) = characters[index];
 43
 44        if character == '_' {
 45            if segment_start < byte_index {
 46                tokens.push(&identifier[segment_start..byte_index]);
 47            }
 48
 49            let mut underscore_end = byte_index + character.len_utf8();
 50            index += 1;
 51
 52            while index < characters.len() && characters[index].1 == '_' {
 53                underscore_end = characters[index].0 + characters[index].1.len_utf8();
 54                index += 1;
 55            }
 56
 57            tokens.push(&identifier[byte_index..underscore_end]);
 58            segment_start = underscore_end;
 59            continue;
 60        }
 61
 62        if byte_index > segment_start {
 63            let previous = characters[index - 1].1;
 64            let next = characters.get(index + 1).map(|(_, character)| *character);
 65
 66            if is_identifier_boundary(previous, character, next) {
 67                tokens.push(&identifier[segment_start..byte_index]);
 68                segment_start = byte_index;
 69            }
 70        }
 71
 72        index += 1;
 73    }
 74
 75    if segment_start < identifier.len() {
 76        tokens.push(&identifier[segment_start..]);
 77    }
 78}
 79
 80fn push_punctuation_token<'a>(
 81    text: &'a str,
 82    start: usize,
 83    character: char,
 84    characters: &mut Peekable<CharIndices<'a>>,
 85    tokens: &mut Vec<&'a str>,
 86) {
 87    let remaining = &text[start..];
 88
 89    for punctuation in MULTI_CHAR_PUNCTUATION {
 90        if remaining.starts_with(punctuation) {
 91            for _ in punctuation.chars().skip(1) {
 92                characters.next();
 93            }
 94
 95            tokens.push(&remaining[..punctuation.len()]);
 96            return;
 97        }
 98    }
 99
100    let end = start + character.len_utf8();
101    tokens.push(&text[start..end]);
102}
103
104pub(crate) fn tokenize(text: &str) -> Vec<&str> {
105    let mut tokens = Vec::new();
106    let mut characters = text.char_indices().peekable();
107
108    while let Some((start, character)) = characters.next() {
109        match char_class(character) {
110            CharClass::Identifier => {
111                let mut end = start + character.len_utf8();
112
113                while let Some(&(next_start, next_character)) = characters.peek() {
114                    if char_class(next_character) != CharClass::Identifier {
115                        break;
116                    }
117
118                    end = next_start + next_character.len_utf8();
119                    characters.next();
120                }
121
122                push_identifier_tokens(&text[start..end], &mut tokens);
123            }
124            CharClass::Newline => {
125                let mut end = start + character.len_utf8();
126
127                while let Some(&(next_start, next_character)) = characters.peek() {
128                    if char_class(next_character) != CharClass::Newline {
129                        break;
130                    }
131
132                    end = next_start + next_character.len_utf8();
133                    characters.next();
134                }
135
136                tokens.push(&text[start..end]);
137            }
138            CharClass::Whitespace => {
139                let mut end = start + character.len_utf8();
140
141                while let Some(&(next_start, next_character)) = characters.peek() {
142                    if char_class(next_character) != CharClass::Whitespace {
143                        break;
144                    }
145
146                    end = next_start + next_character.len_utf8();
147                    characters.next();
148                }
149
150                tokens.push(&text[start..end]);
151            }
152            CharClass::Punctuation => {
153                push_punctuation_token(text, start, character, &mut characters, &mut tokens);
154            }
155        }
156    }
157
158    tokens
159}
160
161#[cfg(test)]
162mod tests {
163    use super::tokenize;
164
165    #[test]
166    fn tokenizes_code() {
167        assert_eq!(tokenize("hello world"), vec!["hello", " ", "world"]);
168        assert_eq!(
169            tokenize("foo_bar123 + baz"),
170            vec!["foo", "_", "bar123", " ", "+", " ", "baz"]
171        );
172        assert_eq!(
173            tokenize("print(\"hello\")"),
174            vec!["print", "(", "\"", "hello", "\"", ")"]
175        );
176        assert_eq!(tokenize("hello_world"), vec!["hello", "_", "world"]);
177        assert_eq!(tokenize("fn();"), vec!["fn", "(", ")", ";"]);
178    }
179
180    #[test]
181    fn tokenizes_identifier_case_styles() {
182        assert_eq!(
183            tokenize("camelCase PascalCase snake_case"),
184            vec![
185                "camel", "Case", " ", "Pascal", "Case", " ", "snake", "_", "case"
186            ]
187        );
188        assert_eq!(
189            tokenize("myHTTPServer __private_value foo__bar"),
190            vec![
191                "my", "HTTP", "Server", " ", "__", "private", "_", "value", " ", "foo", "__", "bar"
192            ]
193        );
194        assert_eq!(
195            tokenize("XMLHttpRequest Version2Update"),
196            vec!["XML", "Http", "Request", " ", "Version2", "Update"]
197        );
198    }
199
200    #[test]
201    fn tokenizes_grouped_punctuation() {
202        assert_eq!(
203            tokenize("a::b -> c != d ..= e"),
204            vec![
205                "a", "::", "b", " ", "->", " ", "c", " ", "!=", " ", "d", " ", "..=", " ", "e"
206            ]
207        );
208        assert_eq!(
209            tokenize("foo?.bar ?? baz"),
210            vec!["foo", "?.", "bar", " ", "??", " ", "baz"]
211        );
212    }
213
214    #[test]
215    fn tokenize_whitespace_runs() {
216        assert_eq!(tokenize("  "), vec!["  "]);
217        assert_eq!(tokenize("  \n   foo"), vec!["  ", "\n", "   ", "foo"]);
218        assert_eq!(tokenize("\r\n\nfoo"), vec!["\r\n\n", "foo"]);
219    }
220}