/// Coarse character classification used by the tokenizer:
/// `0` = word character (alphanumeric or `_`),
/// `1` = whitespace,
/// `2` = anything else (symbols / punctuation).
fn char_class(character: char) -> u8 {
    match character {
        '_' => 0,
        c if c.is_alphanumeric() => 0,
        c if c.is_whitespace() => 1,
        _ => 2,
    }
}
10
/// Splits `text` into tokens: maximal runs of word characters
/// (alphanumeric / `_`), maximal runs of whitespace, and individual symbol
/// characters. The returned slices borrow from `text`; concatenated in
/// order they reproduce the input exactly.
pub(crate) fn tokenize(text: &str) -> Vec<&str> {
    // Local classification so the scan below is self-contained:
    // 0 = word, 1 = whitespace, 2 = symbol.
    fn classify(c: char) -> u8 {
        if c.is_alphanumeric() || c == '_' {
            0
        } else if c.is_whitespace() {
            1
        } else {
            2
        }
    }

    let mut tokens = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((start, first)) = chars.next() {
        let class = classify(first);
        let mut end = start + first.len_utf8();

        // Symbols are emitted one character at a time; word and whitespace
        // runs are extended for as long as the class stays the same.
        if class != 2 {
            while let Some(&(_, next)) = chars.peek() {
                if classify(next) != class {
                    break;
                }
                end += next.len_utf8();
                chars.next();
            }
        }

        tokens.push(&text[start..end]);
    }

    tokens
}
35
#[cfg(test)]
mod tests {
    use super::tokenize;

    /// Table-driven check: each entry pairs an input with the exact token
    /// sequence `tokenize` must produce for it.
    #[test]
    fn tokenizes_code_like_text() {
        let cases: &[(&str, &[&str])] = &[
            ("hello world", &["hello", " ", "world"]),
            ("foo_bar123 + baz", &["foo_bar123", " ", "+", " ", "baz"]),
            ("print(\"hello\")", &["print", "(", "\"", "hello", "\"", ")"]),
            ("hello_world", &["hello_world"]),
            ("fn();", &["fn", "(", ")", ";"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(tokenize(input), expected, "input: {:?}", input);
        }
    }
}