search.rs

  1use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
  2use anyhow::{Context, Result};
  3use client::proto;
  4use globset::{Glob, GlobMatcher};
  5use itertools::Itertools;
  6use language::{char_kind, Rope};
  7use regex::{Regex, RegexBuilder};
  8use smol::future::yield_now;
  9use std::{
 10    io::{BufRead, BufReader, Read},
 11    ops::Range,
 12    path::{Path, PathBuf},
 13    sync::Arc,
 14};
 15
 16#[derive(Clone, Debug)]
 17pub enum SearchQuery {
 18    Text {
 19        search: Arc<AhoCorasick<usize>>,
 20        query: Arc<str>,
 21        whole_word: bool,
 22        case_sensitive: bool,
 23        files_to_include: Vec<PathMatcher>,
 24        files_to_exclude: Vec<PathMatcher>,
 25    },
 26    Regex {
 27        regex: Regex,
 28        query: Arc<str>,
 29        multiline: bool,
 30        whole_word: bool,
 31        case_sensitive: bool,
 32        files_to_include: Vec<PathMatcher>,
 33        files_to_exclude: Vec<PathMatcher>,
 34    },
 35}
 36
 37#[derive(Clone, Debug)]
 38pub struct PathMatcher {
 39    maybe_path: PathBuf,
 40    glob: GlobMatcher,
 41}
 42
 43impl std::fmt::Display for PathMatcher {
 44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 45        self.maybe_path.to_string_lossy().fmt(f)
 46    }
 47}
 48
 49impl PathMatcher {
 50    pub fn new(maybe_glob: &str) -> Result<Self, globset::Error> {
 51        Ok(PathMatcher {
 52            glob: Glob::new(&maybe_glob)?.compile_matcher(),
 53            maybe_path: PathBuf::from(maybe_glob),
 54        })
 55    }
 56
 57    pub fn is_match<P: AsRef<Path>>(&self, other: P) -> bool {
 58        other.as_ref().starts_with(&self.maybe_path) || self.glob.is_match(other)
 59    }
 60}
 61
 62impl SearchQuery {
 63    pub fn text(
 64        query: impl ToString,
 65        whole_word: bool,
 66        case_sensitive: bool,
 67        files_to_include: Vec<PathMatcher>,
 68        files_to_exclude: Vec<PathMatcher>,
 69    ) -> Self {
 70        let query = query.to_string();
 71        let search = AhoCorasickBuilder::new()
 72            .auto_configure(&[&query])
 73            .ascii_case_insensitive(!case_sensitive)
 74            .build(&[&query]);
 75        Self::Text {
 76            search: Arc::new(search),
 77            query: Arc::from(query),
 78            whole_word,
 79            case_sensitive,
 80            files_to_include,
 81            files_to_exclude,
 82        }
 83    }
 84
 85    pub fn regex(
 86        query: impl ToString,
 87        whole_word: bool,
 88        case_sensitive: bool,
 89        files_to_include: Vec<PathMatcher>,
 90        files_to_exclude: Vec<PathMatcher>,
 91    ) -> Result<Self> {
 92        let mut query = query.to_string();
 93        let initial_query = Arc::from(query.as_str());
 94        if whole_word {
 95            let mut word_query = String::new();
 96            word_query.push_str("\\b");
 97            word_query.push_str(&query);
 98            word_query.push_str("\\b");
 99            query = word_query
100        }
101
102        let multiline = query.contains('\n') || query.contains("\\n");
103        let regex = RegexBuilder::new(&query)
104            .case_insensitive(!case_sensitive)
105            .multi_line(multiline)
106            .build()?;
107        Ok(Self::Regex {
108            regex,
109            query: initial_query,
110            multiline,
111            whole_word,
112            case_sensitive,
113            files_to_include,
114            files_to_exclude,
115        })
116    }
117
118    pub fn from_proto(message: proto::SearchProject) -> Result<Self> {
119        if message.regex {
120            Self::regex(
121                message.query,
122                message.whole_word,
123                message.case_sensitive,
124                deserialize_path_matches(&message.files_to_include)?,
125                deserialize_path_matches(&message.files_to_exclude)?,
126            )
127        } else {
128            Ok(Self::text(
129                message.query,
130                message.whole_word,
131                message.case_sensitive,
132                deserialize_path_matches(&message.files_to_include)?,
133                deserialize_path_matches(&message.files_to_exclude)?,
134            ))
135        }
136    }
137
138    pub fn to_proto(&self, project_id: u64) -> proto::SearchProject {
139        proto::SearchProject {
140            project_id,
141            query: self.as_str().to_string(),
142            regex: self.is_regex(),
143            whole_word: self.whole_word(),
144            case_sensitive: self.case_sensitive(),
145            files_to_include: self
146                .files_to_include()
147                .iter()
148                .map(|matcher| matcher.to_string())
149                .join(","),
150            files_to_exclude: self
151                .files_to_exclude()
152                .iter()
153                .map(|matcher| matcher.to_string())
154                .join(","),
155        }
156    }
157
158    pub fn detect<T: Read>(&self, stream: T) -> Result<bool> {
159        if self.as_str().is_empty() {
160            return Ok(false);
161        }
162
163        match self {
164            Self::Text { search, .. } => {
165                let mat = search.stream_find_iter(stream).next();
166                match mat {
167                    Some(Ok(_)) => Ok(true),
168                    Some(Err(err)) => Err(err.into()),
169                    None => Ok(false),
170                }
171            }
172            Self::Regex {
173                regex, multiline, ..
174            } => {
175                let mut reader = BufReader::new(stream);
176                if *multiline {
177                    let mut text = String::new();
178                    if let Err(err) = reader.read_to_string(&mut text) {
179                        Err(err.into())
180                    } else {
181                        Ok(regex.find(&text).is_some())
182                    }
183                } else {
184                    for line in reader.lines() {
185                        let line = line?;
186                        if regex.find(&line).is_some() {
187                            return Ok(true);
188                        }
189                    }
190                    Ok(false)
191                }
192            }
193        }
194    }
195
196    pub async fn search(&self, rope: &Rope) -> Vec<Range<usize>> {
197        const YIELD_INTERVAL: usize = 20000;
198
199        if self.as_str().is_empty() {
200            return Default::default();
201        }
202
203        let mut matches = Vec::new();
204        match self {
205            Self::Text {
206                search, whole_word, ..
207            } => {
208                for (ix, mat) in search
209                    .stream_find_iter(rope.bytes_in_range(0..rope.len()))
210                    .enumerate()
211                {
212                    if (ix + 1) % YIELD_INTERVAL == 0 {
213                        yield_now().await;
214                    }
215
216                    let mat = mat.unwrap();
217                    if *whole_word {
218                        let prev_kind = rope.reversed_chars_at(mat.start()).next().map(char_kind);
219                        let start_kind = char_kind(rope.chars_at(mat.start()).next().unwrap());
220                        let end_kind = char_kind(rope.reversed_chars_at(mat.end()).next().unwrap());
221                        let next_kind = rope.chars_at(mat.end()).next().map(char_kind);
222                        if Some(start_kind) == prev_kind || Some(end_kind) == next_kind {
223                            continue;
224                        }
225                    }
226                    matches.push(mat.start()..mat.end())
227                }
228            }
229            Self::Regex {
230                regex, multiline, ..
231            } => {
232                if *multiline {
233                    let text = rope.to_string();
234                    for (ix, mat) in regex.find_iter(&text).enumerate() {
235                        if (ix + 1) % YIELD_INTERVAL == 0 {
236                            yield_now().await;
237                        }
238
239                        matches.push(mat.start()..mat.end());
240                    }
241                } else {
242                    let mut line = String::new();
243                    let mut line_offset = 0;
244                    for (chunk_ix, chunk) in rope.chunks().chain(["\n"]).enumerate() {
245                        if (chunk_ix + 1) % YIELD_INTERVAL == 0 {
246                            yield_now().await;
247                        }
248
249                        for (newline_ix, text) in chunk.split('\n').enumerate() {
250                            if newline_ix > 0 {
251                                for mat in regex.find_iter(&line) {
252                                    let start = line_offset + mat.start();
253                                    let end = line_offset + mat.end();
254                                    matches.push(start..end);
255                                }
256
257                                line_offset += line.len() + 1;
258                                line.clear();
259                            }
260                            line.push_str(text);
261                        }
262                    }
263                }
264            }
265        }
266        matches
267    }
268
269    pub fn as_str(&self) -> &str {
270        match self {
271            Self::Text { query, .. } => query.as_ref(),
272            Self::Regex { query, .. } => query.as_ref(),
273        }
274    }
275
276    pub fn whole_word(&self) -> bool {
277        match self {
278            Self::Text { whole_word, .. } => *whole_word,
279            Self::Regex { whole_word, .. } => *whole_word,
280        }
281    }
282
283    pub fn case_sensitive(&self) -> bool {
284        match self {
285            Self::Text { case_sensitive, .. } => *case_sensitive,
286            Self::Regex { case_sensitive, .. } => *case_sensitive,
287        }
288    }
289
290    pub fn is_regex(&self) -> bool {
291        matches!(self, Self::Regex { .. })
292    }
293
294    pub fn files_to_include(&self) -> &[PathMatcher] {
295        match self {
296            Self::Text {
297                files_to_include, ..
298            } => files_to_include,
299            Self::Regex {
300                files_to_include, ..
301            } => files_to_include,
302        }
303    }
304
305    pub fn files_to_exclude(&self) -> &[PathMatcher] {
306        match self {
307            Self::Text {
308                files_to_exclude, ..
309            } => files_to_exclude,
310            Self::Regex {
311                files_to_exclude, ..
312            } => files_to_exclude,
313        }
314    }
315
316    pub fn file_matches(&self, file_path: Option<&Path>) -> bool {
317        match file_path {
318            Some(file_path) => {
319                !self
320                    .files_to_exclude()
321                    .iter()
322                    .any(|exclude_glob| exclude_glob.is_match(file_path))
323                    && (self.files_to_include().is_empty()
324                        || self
325                            .files_to_include()
326                            .iter()
327                            .any(|include_glob| include_glob.is_match(file_path)))
328            }
329            None => self.files_to_include().is_empty(),
330        }
331    }
332}
333
334fn deserialize_path_matches(glob_set: &str) -> anyhow::Result<Vec<PathMatcher>> {
335    glob_set
336        .split(',')
337        .map(str::trim)
338        .filter(|glob_str| !glob_str.is_empty())
339        .map(|glob_str| {
340            PathMatcher::new(glob_str)
341                .with_context(|| format!("deserializing path match glob {glob_str}"))
342        })
343        .collect()
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain paths must be accepted as matchers and must match themselves.
    #[test]
    fn path_matcher_creation_for_valid_paths() {
        let valid_paths = [
            "file",
            "Cargo.toml",
            ".DS_Store",
            "~/dir/another_dir/",
            "./dir/file",
            "dir/[a-z].txt",
            "../dir/filé",
        ];

        for valid_path in valid_paths {
            let path_matcher = match PathMatcher::new(valid_path) {
                Ok(matcher) => matcher,
                Err(e) => panic!("Valid path {valid_path} should be accepted, but got: {e}"),
            };
            assert!(
                path_matcher.is_match(valid_path),
                "Path matcher for valid path {valid_path} should match itself"
            )
        }
    }

    /// Malformed globs must be rejected; well-formed globs must be accepted.
    #[test]
    fn path_matcher_creation_for_globs() {
        let invalid_globs = ["dir/[].txt", "dir/[a-z.txt", "dir/{file"];
        for invalid_glob in invalid_globs {
            if PathMatcher::new(invalid_glob).is_ok() {
                panic!("Invalid glob {invalid_glob} should not be accepted");
            }
        }

        let valid_globs = [
            "dir/?ile",
            "dir/*.txt",
            "dir/**/file",
            "dir/[a-z].txt",
            "{dir,file}",
        ];
        for valid_glob in valid_globs {
            if let Err(e) = PathMatcher::new(valid_glob) {
                panic!("Valid glob {valid_glob} should be accepted, but got: {e}");
            }
        }
    }
}