1use std::{iter::Peekable, str::CharIndices};
2
/// Coarse lexical category assigned to a single character.
///
/// The tokenizer uses the class of a character to decide how it is grouped:
/// identifier, newline and whitespace characters are collected into runs of
/// the same class, while punctuation is matched against a fixed operator
/// table.
///
/// `Debug` is derived so values can be printed and compared with
/// `assert_eq!`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CharClass {
    /// Alphanumeric characters and `_`.
    Identifier,
    /// Line feed (`\n`) and carriage return (`\r`).
    Newline,
    /// Any whitespace other than `\n` and `\r`.
    Whitespace,
    /// Every character that fits none of the other classes.
    Punctuation,
}
10
/// Punctuation sequences longer than one character that are emitted as a
/// single token.
///
/// The table is scanned front to back and the first entry that prefixes the
/// remaining input wins. Every entry must therefore come before any shorter
/// entry that is a prefix of it (`>>>=` before `>>>` before `>>`), which the
/// longest-first grouping below guarantees.
const MULTI_CHAR_PUNCTUATION: &[&str] = &[
    // Four characters.
    ">>>=",
    // Three characters.
    "<<=", ">>=", "...", "..=", "??=", "**=", ">>>",
    // Two characters.
    "::", "->", "=>", "==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "..", "+=", "-=", "*=",
    "/=", "%=", "&=", "|=", "^=", "++", "--", "**", "??", "?.", ":=", "<-", "//", "/*", "*/",
];
16
17fn char_class(character: char) -> CharClass {
18 if character == '\n' || character == '\r' {
19 CharClass::Newline
20 } else if character.is_whitespace() {
21 CharClass::Whitespace
22 } else if character.is_alphanumeric() || character == '_' {
23 CharClass::Identifier
24 } else {
25 CharClass::Punctuation
26 }
27}
28
/// Reports whether a new sub-word starts at `current` inside an identifier.
///
/// A boundary exists only in front of an uppercase `current`, and only when
/// either
/// * `previous` is lowercase or numeric (`fooBar`, `Version2Update`), or
/// * `previous` is uppercase and `next` is lowercase, i.e. `current` is the
///   first letter of a word that follows an acronym (`HTTPServer` splits
///   before `S`).
fn is_identifier_boundary(previous: char, current: char, next: Option<char>) -> bool {
    if !current.is_uppercase() {
        return false;
    }

    if previous.is_lowercase() || previous.is_numeric() {
        return true;
    }

    previous.is_uppercase() && matches!(next, Some(following) if following.is_lowercase())
}
35
36fn push_identifier_tokens<'a>(identifier: &'a str, tokens: &mut Vec<&'a str>) {
37 let characters: Vec<(usize, char)> = identifier.char_indices().collect();
38 let mut segment_start = 0;
39 let mut index = 0;
40
41 while index < characters.len() {
42 let (byte_index, character) = characters[index];
43
44 if character == '_' {
45 if segment_start < byte_index {
46 tokens.push(&identifier[segment_start..byte_index]);
47 }
48
49 let mut underscore_end = byte_index + character.len_utf8();
50 index += 1;
51
52 while index < characters.len() && characters[index].1 == '_' {
53 underscore_end = characters[index].0 + characters[index].1.len_utf8();
54 index += 1;
55 }
56
57 tokens.push(&identifier[byte_index..underscore_end]);
58 segment_start = underscore_end;
59 continue;
60 }
61
62 if byte_index > segment_start {
63 let previous = characters[index - 1].1;
64 let next = characters.get(index + 1).map(|(_, character)| *character);
65
66 if is_identifier_boundary(previous, character, next) {
67 tokens.push(&identifier[segment_start..byte_index]);
68 segment_start = byte_index;
69 }
70 }
71
72 index += 1;
73 }
74
75 if segment_start < identifier.len() {
76 tokens.push(&identifier[segment_start..]);
77 }
78}
79
80fn push_punctuation_token<'a>(
81 text: &'a str,
82 start: usize,
83 character: char,
84 characters: &mut Peekable<CharIndices<'a>>,
85 tokens: &mut Vec<&'a str>,
86) {
87 let remaining = &text[start..];
88
89 for punctuation in MULTI_CHAR_PUNCTUATION {
90 if remaining.starts_with(punctuation) {
91 for _ in punctuation.chars().skip(1) {
92 characters.next();
93 }
94
95 tokens.push(&remaining[..punctuation.len()]);
96 return;
97 }
98 }
99
100 let end = start + character.len_utf8();
101 tokens.push(&text[start..end]);
102}
103
104pub(crate) fn tokenize(text: &str) -> Vec<&str> {
105 let mut tokens = Vec::new();
106 let mut characters = text.char_indices().peekable();
107
108 while let Some((start, character)) = characters.next() {
109 match char_class(character) {
110 CharClass::Identifier => {
111 let mut end = start + character.len_utf8();
112
113 while let Some(&(next_start, next_character)) = characters.peek() {
114 if char_class(next_character) != CharClass::Identifier {
115 break;
116 }
117
118 end = next_start + next_character.len_utf8();
119 characters.next();
120 }
121
122 push_identifier_tokens(&text[start..end], &mut tokens);
123 }
124 CharClass::Newline => {
125 let mut end = start + character.len_utf8();
126
127 while let Some(&(next_start, next_character)) = characters.peek() {
128 if char_class(next_character) != CharClass::Newline {
129 break;
130 }
131
132 end = next_start + next_character.len_utf8();
133 characters.next();
134 }
135
136 tokens.push(&text[start..end]);
137 }
138 CharClass::Whitespace => {
139 let mut end = start + character.len_utf8();
140
141 while let Some(&(next_start, next_character)) = characters.peek() {
142 if char_class(next_character) != CharClass::Whitespace {
143 break;
144 }
145
146 end = next_start + next_character.len_utf8();
147 characters.next();
148 }
149
150 tokens.push(&text[start..end]);
151 }
152 CharClass::Punctuation => {
153 push_punctuation_token(text, start, character, &mut characters, &mut tokens);
154 }
155 }
156 }
157
158 tokens
159}
160
#[cfg(test)]
mod tests {
    use super::tokenize;

    /// Words, whitespace and single-character punctuation in ordinary code.
    #[test]
    fn tokenizes_code() {
        assert_eq!(tokenize("hello world"), vec!["hello", " ", "world"]);
        assert_eq!(
            tokenize("foo_bar123 + baz"),
            vec!["foo", "_", "bar123", " ", "+", " ", "baz"]
        );
        assert_eq!(
            tokenize("print(\"hello\")"),
            vec!["print", "(", "\"", "hello", "\"", ")"]
        );
        assert_eq!(tokenize("hello_world"), vec!["hello", "_", "world"]);
        assert_eq!(tokenize("fn();"), vec!["fn", "(", ")", ";"]);
    }

    /// Identifiers are split at case boundaries and around underscore runs.
    #[test]
    fn tokenizes_identifier_case_styles() {
        assert_eq!(
            tokenize("camelCase PascalCase snake_case"),
            vec![
                "camel", "Case", " ", "Pascal", "Case", " ", "snake", "_", "case"
            ]
        );
        assert_eq!(
            tokenize("myHTTPServer __private_value foo__bar"),
            vec![
                "my", "HTTP", "Server", " ", "__", "private", "_", "value", " ", "foo", "__", "bar"
            ]
        );
        assert_eq!(
            tokenize("XMLHttpRequest Version2Update"),
            vec!["XML", "Http", "Request", " ", "Version2", "Update"]
        );
    }

    /// Known multi-character operators come out as one token each.
    #[test]
    fn tokenizes_grouped_punctuation() {
        assert_eq!(
            tokenize("a::b -> c != d ..= e"),
            vec![
                "a", "::", "b", " ", "->", " ", "c", " ", "!=", " ", "d", " ", "..=", " ", "e"
            ]
        );
        assert_eq!(
            tokenize("foo?.bar ?? baz"),
            vec!["foo", "?.", "bar", " ", "??", " ", "baz"]
        );
    }

    /// When several operators share a prefix, the longest one that matches wins.
    #[test]
    fn tokenizes_longest_punctuation_match() {
        assert_eq!(
            tokenize("a >>>= b >>> c >> d"),
            vec![
                "a", " ", ">>>=", " ", "b", " ", ">>>", " ", "c", " ", ">>", " ", "d"
            ]
        );
        assert_eq!(
            tokenize("x..=y...z..w"),
            vec!["x", "..=", "y", "...", "z", "..", "w"]
        );
    }

    /// Whitespace and newlines form separate runs; `\r` and `\n` share a run.
    #[test]
    fn tokenizes_whitespace_runs() {
        assert_eq!(tokenize(" "), vec![" "]);
        assert_eq!(tokenize(" \t foo"), vec![" \t ", "foo"]);
        assert_eq!(tokenize(" \n foo"), vec![" ", "\n", " ", "foo"]);
        assert_eq!(tokenize("\r\n\nfoo"), vec!["\r\n\n", "foo"]);
    }

    /// Empty input produces no tokens.
    #[test]
    fn tokenizes_empty_input() {
        assert!(tokenize("").is_empty());
    }

    /// Multi-byte characters are sliced on character boundaries.
    #[test]
    fn tokenizes_non_ascii_text() {
        assert_eq!(
            tokenize("naïve → café"),
            vec!["naïve", " ", "→", " ", "café"]
        );
        assert_eq!(tokenize("größeÄnderung"), vec!["größe", "Änderung"]);
    }

    /// Tokenization is lossless: the tokens concatenate back to the input.
    #[test]
    fn tokens_concatenate_to_input() {
        for input in [
            "",
            "let fooBar_baz = myHTTPServer?.start() ?? 42;\r\n\treturn __x >>>= 1;",
            "naïve → café",
        ] {
            assert_eq!(tokenize(input).concat(), input);
        }
    }
}