search.rs

  1use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
  2use anyhow::{Context, Result};
  3use client::proto;
  4use globset::{Glob, GlobMatcher};
  5use itertools::Itertools;
  6use language::{char_kind, BufferSnapshot};
  7use regex::{Regex, RegexBuilder};
  8use smol::future::yield_now;
  9use std::{
 10    borrow::Cow,
 11    io::{BufRead, BufReader, Read},
 12    ops::Range,
 13    path::{Path, PathBuf},
 14    sync::Arc,
 15};
 16
 17#[derive(Clone, Debug)]
 18pub struct SearchInputs {
 19    query: Arc<str>,
 20    files_to_include: Vec<PathMatcher>,
 21    files_to_exclude: Vec<PathMatcher>,
 22}
 23
 24impl SearchInputs {
 25    pub fn as_str(&self) -> &str {
 26        self.query.as_ref()
 27    }
 28    pub fn files_to_include(&self) -> &[PathMatcher] {
 29        &self.files_to_include
 30    }
 31    pub fn files_to_exclude(&self) -> &[PathMatcher] {
 32        &self.files_to_exclude
 33    }
 34}
 35#[derive(Clone, Debug)]
 36pub enum SearchQuery {
 37    Text {
 38        search: Arc<AhoCorasick<usize>>,
 39        replacement: Option<String>,
 40        whole_word: bool,
 41        case_sensitive: bool,
 42        inner: SearchInputs,
 43    },
 44
 45    Regex {
 46        regex: Regex,
 47        replacement: Option<String>,
 48        multiline: bool,
 49        whole_word: bool,
 50        case_sensitive: bool,
 51        inner: SearchInputs,
 52    },
 53}
 54
 55#[derive(Clone, Debug)]
 56pub struct PathMatcher {
 57    maybe_path: PathBuf,
 58    glob: GlobMatcher,
 59}
 60
 61impl std::fmt::Display for PathMatcher {
 62    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 63        self.maybe_path.to_string_lossy().fmt(f)
 64    }
 65}
 66
 67impl PathMatcher {
 68    pub fn new(maybe_glob: &str) -> Result<Self, globset::Error> {
 69        Ok(PathMatcher {
 70            glob: Glob::new(&maybe_glob)?.compile_matcher(),
 71            maybe_path: PathBuf::from(maybe_glob),
 72        })
 73    }
 74
 75    pub fn is_match<P: AsRef<Path>>(&self, other: P) -> bool {
 76        other.as_ref().starts_with(&self.maybe_path) || self.glob.is_match(other)
 77    }
 78}
 79
 80impl SearchQuery {
 81    pub fn text(
 82        query: impl ToString,
 83        whole_word: bool,
 84        case_sensitive: bool,
 85        files_to_include: Vec<PathMatcher>,
 86        files_to_exclude: Vec<PathMatcher>,
 87    ) -> Self {
 88        let query = query.to_string();
 89        let search = AhoCorasickBuilder::new()
 90            .auto_configure(&[&query])
 91            .ascii_case_insensitive(!case_sensitive)
 92            .build(&[&query]);
 93        let inner = SearchInputs {
 94            query: query.into(),
 95            files_to_exclude,
 96            files_to_include,
 97        };
 98        Self::Text {
 99            search: Arc::new(search),
100            replacement: None,
101            whole_word,
102            case_sensitive,
103            inner,
104        }
105    }
106
107    pub fn regex(
108        query: impl ToString,
109        whole_word: bool,
110        case_sensitive: bool,
111        files_to_include: Vec<PathMatcher>,
112        files_to_exclude: Vec<PathMatcher>,
113    ) -> Result<Self> {
114        let mut query = query.to_string();
115        let initial_query = Arc::from(query.as_str());
116        if whole_word {
117            let mut word_query = String::new();
118            word_query.push_str("\\b");
119            word_query.push_str(&query);
120            word_query.push_str("\\b");
121            query = word_query
122        }
123
124        let multiline = query.contains('\n') || query.contains("\\n");
125        let regex = RegexBuilder::new(&query)
126            .case_insensitive(!case_sensitive)
127            .multi_line(multiline)
128            .build()?;
129        let inner = SearchInputs {
130            query: initial_query,
131            files_to_exclude,
132            files_to_include,
133        };
134        Ok(Self::Regex {
135            regex,
136            replacement: None,
137            multiline,
138            whole_word,
139            case_sensitive,
140            inner,
141        })
142    }
143
144    pub fn from_proto(message: proto::SearchProject) -> Result<Self> {
145        if message.regex {
146            Self::regex(
147                message.query,
148                message.whole_word,
149                message.case_sensitive,
150                deserialize_path_matches(&message.files_to_include)?,
151                deserialize_path_matches(&message.files_to_exclude)?,
152            )
153        } else {
154            Ok(Self::text(
155                message.query,
156                message.whole_word,
157                message.case_sensitive,
158                deserialize_path_matches(&message.files_to_include)?,
159                deserialize_path_matches(&message.files_to_exclude)?,
160            ))
161        }
162    }
163    pub fn with_replacement(mut self, new_replacement: Option<String>) -> Self {
164        match self {
165            Self::Text {
166                ref mut replacement,
167                ..
168            }
169            | Self::Regex {
170                ref mut replacement,
171                ..
172            } => {
173                *replacement = new_replacement;
174                self
175            }
176        }
177    }
178    pub fn to_proto(&self, project_id: u64) -> proto::SearchProject {
179        proto::SearchProject {
180            project_id,
181            query: self.as_str().to_string(),
182            regex: self.is_regex(),
183            whole_word: self.whole_word(),
184            case_sensitive: self.case_sensitive(),
185            files_to_include: self
186                .files_to_include()
187                .iter()
188                .map(|matcher| matcher.to_string())
189                .join(","),
190            files_to_exclude: self
191                .files_to_exclude()
192                .iter()
193                .map(|matcher| matcher.to_string())
194                .join(","),
195        }
196    }
197
198    pub fn detect<T: Read>(&self, stream: T) -> Result<bool> {
199        if self.as_str().is_empty() {
200            return Ok(false);
201        }
202
203        match self {
204            Self::Text { search, .. } => {
205                let mat = search.stream_find_iter(stream).next();
206                match mat {
207                    Some(Ok(_)) => Ok(true),
208                    Some(Err(err)) => Err(err.into()),
209                    None => Ok(false),
210                }
211            }
212            Self::Regex {
213                regex, multiline, ..
214            } => {
215                let mut reader = BufReader::new(stream);
216                if *multiline {
217                    let mut text = String::new();
218                    if let Err(err) = reader.read_to_string(&mut text) {
219                        Err(err.into())
220                    } else {
221                        Ok(regex.find(&text).is_some())
222                    }
223                } else {
224                    for line in reader.lines() {
225                        let line = line?;
226                        if regex.find(&line).is_some() {
227                            return Ok(true);
228                        }
229                    }
230                    Ok(false)
231                }
232            }
233        }
234    }
235    pub fn replacement<'a>(&self, text: &'a str) -> Option<Cow<'a, str>> {
236        match self {
237            SearchQuery::Text { replacement, .. } => replacement.clone().map(Cow::from),
238            SearchQuery::Regex {
239                regex, replacement, ..
240            } => {
241                if let Some(replacement) = replacement {
242                    Some(regex.replace(text, replacement))
243                } else {
244                    None
245                }
246            }
247        }
248    }
249    pub async fn search(
250        &self,
251        buffer: &BufferSnapshot,
252        subrange: Option<Range<usize>>,
253    ) -> Vec<Range<usize>> {
254        const YIELD_INTERVAL: usize = 20000;
255
256        if self.as_str().is_empty() {
257            return Default::default();
258        }
259
260        let range_offset = subrange.as_ref().map(|r| r.start).unwrap_or(0);
261        let rope = if let Some(range) = subrange {
262            buffer.as_rope().slice(range)
263        } else {
264            buffer.as_rope().clone()
265        };
266
267        let mut matches = Vec::new();
268        match self {
269            Self::Text {
270                search, whole_word, ..
271            } => {
272                for (ix, mat) in search
273                    .stream_find_iter(rope.bytes_in_range(0..rope.len()))
274                    .enumerate()
275                {
276                    if (ix + 1) % YIELD_INTERVAL == 0 {
277                        yield_now().await;
278                    }
279
280                    let mat = mat.unwrap();
281                    if *whole_word {
282                        let scope = buffer.language_scope_at(range_offset + mat.start());
283                        let kind = |c| char_kind(&scope, c);
284
285                        let prev_kind = rope.reversed_chars_at(mat.start()).next().map(kind);
286                        let start_kind = kind(rope.chars_at(mat.start()).next().unwrap());
287                        let end_kind = kind(rope.reversed_chars_at(mat.end()).next().unwrap());
288                        let next_kind = rope.chars_at(mat.end()).next().map(kind);
289                        if Some(start_kind) == prev_kind || Some(end_kind) == next_kind {
290                            continue;
291                        }
292                    }
293                    matches.push(mat.start()..mat.end())
294                }
295            }
296
297            Self::Regex {
298                regex, multiline, ..
299            } => {
300                if *multiline {
301                    let text = rope.to_string();
302                    for (ix, mat) in regex.find_iter(&text).enumerate() {
303                        if (ix + 1) % YIELD_INTERVAL == 0 {
304                            yield_now().await;
305                        }
306
307                        matches.push(mat.start()..mat.end());
308                    }
309                } else {
310                    let mut line = String::new();
311                    let mut line_offset = 0;
312                    for (chunk_ix, chunk) in rope.chunks().chain(["\n"]).enumerate() {
313                        if (chunk_ix + 1) % YIELD_INTERVAL == 0 {
314                            yield_now().await;
315                        }
316
317                        for (newline_ix, text) in chunk.split('\n').enumerate() {
318                            if newline_ix > 0 {
319                                for mat in regex.find_iter(&line) {
320                                    let start = line_offset + mat.start();
321                                    let end = line_offset + mat.end();
322                                    matches.push(start..end);
323                                }
324
325                                line_offset += line.len() + 1;
326                                line.clear();
327                            }
328                            line.push_str(text);
329                        }
330                    }
331                }
332            }
333        }
334
335        matches
336    }
337
338    pub fn as_str(&self) -> &str {
339        self.as_inner().as_str()
340    }
341
342    pub fn whole_word(&self) -> bool {
343        match self {
344            Self::Text { whole_word, .. } => *whole_word,
345            Self::Regex { whole_word, .. } => *whole_word,
346        }
347    }
348
349    pub fn case_sensitive(&self) -> bool {
350        match self {
351            Self::Text { case_sensitive, .. } => *case_sensitive,
352            Self::Regex { case_sensitive, .. } => *case_sensitive,
353        }
354    }
355
356    pub fn is_regex(&self) -> bool {
357        matches!(self, Self::Regex { .. })
358    }
359
360    pub fn files_to_include(&self) -> &[PathMatcher] {
361        self.as_inner().files_to_include()
362    }
363
364    pub fn files_to_exclude(&self) -> &[PathMatcher] {
365        self.as_inner().files_to_exclude()
366    }
367
368    pub fn file_matches(&self, file_path: Option<&Path>) -> bool {
369        match file_path {
370            Some(file_path) => {
371                !self
372                    .files_to_exclude()
373                    .iter()
374                    .any(|exclude_glob| exclude_glob.is_match(file_path))
375                    && (self.files_to_include().is_empty()
376                        || self
377                            .files_to_include()
378                            .iter()
379                            .any(|include_glob| include_glob.is_match(file_path)))
380            }
381            None => self.files_to_include().is_empty(),
382        }
383    }
384    pub fn as_inner(&self) -> &SearchInputs {
385        match self {
386            Self::Regex { inner, .. } | Self::Text { inner, .. } => inner,
387        }
388    }
389}
390
391fn deserialize_path_matches(glob_set: &str) -> anyhow::Result<Vec<PathMatcher>> {
392    glob_set
393        .split(',')
394        .map(str::trim)
395        .filter(|glob_str| !glob_str.is_empty())
396        .map(|glob_str| {
397            PathMatcher::new(glob_str)
398                .with_context(|| format!("deserializing path match glob {glob_str}"))
399        })
400        .collect()
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain paths must be accepted as patterns and must match themselves.
    #[test]
    fn path_matcher_creation_for_valid_paths() {
        let valid_paths = [
            "file",
            "Cargo.toml",
            ".DS_Store",
            "~/dir/another_dir/",
            "./dir/file",
            "dir/[a-z].txt",
            "../dir/filé",
        ];

        for valid_path in valid_paths {
            let path_matcher = match PathMatcher::new(valid_path) {
                Ok(matcher) => matcher,
                Err(e) => panic!("Valid path {valid_path} should be accepted, but got: {e}"),
            };
            assert!(
                path_matcher.is_match(valid_path),
                "Path matcher for valid path {valid_path} should match itself"
            );
        }
    }

    /// Malformed globs must be rejected and well-formed globs accepted.
    #[test]
    fn path_matcher_creation_for_globs() {
        let invalid_globs = ["dir/[].txt", "dir/[a-z.txt", "dir/{file"];
        for invalid_glob in invalid_globs {
            assert!(
                PathMatcher::new(invalid_glob).is_err(),
                "Invalid glob {invalid_glob} should not be accepted"
            );
        }

        let valid_globs = [
            "dir/?ile",
            "dir/*.txt",
            "dir/**/file",
            "dir/[a-z].txt",
            "{dir,file}",
        ];
        for valid_glob in valid_globs {
            if let Err(e) = PathMatcher::new(valid_glob) {
                panic!("Valid glob {valid_glob} should be accepted, but got: {e}");
            }
        }
    }
}