//! Tokenizer: splits text into word, whitespace, and symbol tokens.

 1fn char_class(character: char) -> u8 {
 2    if character.is_alphanumeric() || character == '_' {
 3        0
 4    } else if character.is_whitespace() {
 5        1
 6    } else {
 7        2
 8    }
 9}
10
11pub(crate) fn tokenize(text: &str) -> Vec<&str> {
12    let mut tokens = Vec::new();
13    let mut characters = text.char_indices().peekable();
14
15    while let Some((start, character)) = characters.next() {
16        let class = char_class(character);
17        if class == 2 {
18            tokens.push(&text[start..start + character.len_utf8()]);
19            continue;
20        }
21
22        let mut end = start + character.len_utf8();
23        while let Some(&(_, next_character)) = characters.peek() {
24            if char_class(next_character) != class {
25                break;
26            }
27            end += next_character.len_utf8();
28            characters.next();
29        }
30        tokens.push(&text[start..end]);
31    }
32
33    tokens
34}
35
#[cfg(test)]
mod tests {
    use super::tokenize;

    #[test]
    fn tokenizes_code_like_text() {
        // Table-driven: each pair is (input, expected token sequence).
        let cases: &[(&str, &[&str])] = &[
            ("hello world", &["hello", " ", "world"]),
            ("foo_bar123 + baz", &["foo_bar123", " ", "+", " ", "baz"]),
            ("print(\"hello\")", &["print", "(", "\"", "hello", "\"", ")"]),
            ("hello_world", &["hello_world"]),
            ("fn();", &["fn", "(", ")", ";"]),
        ];
        for (input, expected) in cases {
            assert_eq!(tokenize(input), *expected, "input: {:?}", input);
        }
    }
}