search.rs

  1use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
  2use anyhow::Result;
  3use client::proto;
  4use fancy_regex::{Captures, Regex, RegexBuilder};
  5use gpui::Entity;
  6use itertools::Itertools as _;
  7use language::{Buffer, BufferSnapshot, CharKind};
  8use smol::future::yield_now;
  9use std::{
 10    borrow::Cow,
 11    io::{BufRead, BufReader, Read},
 12    ops::Range,
 13    sync::{Arc, LazyLock},
 14};
 15use text::Anchor;
 16use util::{
 17    paths::{PathMatcher, PathStyle},
 18    rel_path::RelPath,
 19};
 20
 21#[derive(Debug)]
 22pub enum SearchResult {
 23    Buffer {
 24        buffer: Entity<Buffer>,
 25        ranges: Vec<Range<Anchor>>,
 26    },
 27    LimitReached,
 28}
 29
 30#[derive(Clone, Copy, PartialEq)]
 31pub enum SearchInputKind {
 32    Query,
 33    Include,
 34    Exclude,
 35}
 36
 37#[derive(Clone, Debug)]
 38pub struct SearchInputs {
 39    query: Arc<str>,
 40    files_to_include: PathMatcher,
 41    files_to_exclude: PathMatcher,
 42    match_full_paths: bool,
 43    buffers: Option<Vec<Entity<Buffer>>>,
 44}
 45
 46impl SearchInputs {
 47    pub fn as_str(&self) -> &str {
 48        self.query.as_ref()
 49    }
 50    pub fn files_to_include(&self) -> &PathMatcher {
 51        &self.files_to_include
 52    }
 53    pub fn files_to_exclude(&self) -> &PathMatcher {
 54        &self.files_to_exclude
 55    }
 56    pub fn buffers(&self) -> &Option<Vec<Entity<Buffer>>> {
 57        &self.buffers
 58    }
 59}
 60#[derive(Clone, Debug)]
 61pub enum SearchQuery {
 62    Text {
 63        search: AhoCorasick,
 64        replacement: Option<String>,
 65        whole_word: bool,
 66        case_sensitive: bool,
 67        include_ignored: bool,
 68        inner: SearchInputs,
 69    },
 70    Regex {
 71        regex: Regex,
 72        replacement: Option<String>,
 73        multiline: bool,
 74        whole_word: bool,
 75        case_sensitive: bool,
 76        include_ignored: bool,
 77        one_match_per_line: bool,
 78        inner: SearchInputs,
 79    },
 80}
 81
 82static WORD_MATCH_TEST: LazyLock<Regex> = LazyLock::new(|| {
 83    RegexBuilder::new(r"\B")
 84        .build()
 85        .expect("Failed to create WORD_MATCH_TEST")
 86});
 87
 88impl SearchQuery {
 89    /// Create a text query
 90    ///
 91    /// If `match_full_paths` is true, include/exclude patterns will always be matched against fully qualified project paths beginning with a project root.
 92    /// If `match_full_paths` is false, patterns will be matched against full paths only when the project has multiple roots.
 93    pub fn text(
 94        query: impl ToString,
 95        whole_word: bool,
 96        case_sensitive: bool,
 97        include_ignored: bool,
 98        files_to_include: PathMatcher,
 99        files_to_exclude: PathMatcher,
100        match_full_paths: bool,
101        buffers: Option<Vec<Entity<Buffer>>>,
102    ) -> Result<Self> {
103        let query = query.to_string();
104        if !case_sensitive && !query.is_ascii() {
105            // AhoCorasickBuilder doesn't support case-insensitive search with unicode characters
106            // Fallback to regex search as recommended by
107            // https://docs.rs/aho-corasick/1.1/aho_corasick/struct.AhoCorasickBuilder.html#method.ascii_case_insensitive
108            return Self::regex(
109                regex::escape(&query),
110                whole_word,
111                case_sensitive,
112                include_ignored,
113                false,
114                files_to_include,
115                files_to_exclude,
116                false,
117                buffers,
118            );
119        }
120        let search = AhoCorasickBuilder::new()
121            .ascii_case_insensitive(!case_sensitive)
122            .build([&query])?;
123        let inner = SearchInputs {
124            query: query.into(),
125            files_to_exclude,
126            files_to_include,
127            match_full_paths,
128            buffers,
129        };
130        Ok(Self::Text {
131            search,
132            replacement: None,
133            whole_word,
134            case_sensitive,
135            include_ignored,
136            inner,
137        })
138    }
139
140    /// Create a regex query
141    ///
142    /// If `match_full_paths` is true, include/exclude patterns will be matched against fully qualified project paths
143    /// beginning with a project root name. If false, they will be matched against project-relative paths (which don't start
144    /// with their respective project root).
145    pub fn regex(
146        query: impl ToString,
147        whole_word: bool,
148        mut case_sensitive: bool,
149        include_ignored: bool,
150        one_match_per_line: bool,
151        files_to_include: PathMatcher,
152        files_to_exclude: PathMatcher,
153        match_full_paths: bool,
154        buffers: Option<Vec<Entity<Buffer>>>,
155    ) -> Result<Self> {
156        let mut query = query.to_string();
157        let initial_query = Arc::from(query.as_str());
158
159        if let Some((case_sensitive_from_pattern, new_query)) =
160            Self::case_sensitive_from_pattern(&query)
161        {
162            case_sensitive = case_sensitive_from_pattern;
163            query = new_query
164        }
165
166        if whole_word {
167            let mut word_query = String::new();
168            if let Some(first) = query.get(0..1)
169                && WORD_MATCH_TEST.is_match(first).is_ok_and(|x| !x)
170            {
171                word_query.push_str("\\b");
172            }
173            word_query.push_str(&query);
174            if let Some(last) = query.get(query.len() - 1..)
175                && WORD_MATCH_TEST.is_match(last).is_ok_and(|x| !x)
176            {
177                word_query.push_str("\\b");
178            }
179            query = word_query
180        }
181
182        let multiline = query.contains('\n') || query.contains("\\n");
183        if multiline {
184            query.insert_str(0, "(?m)");
185        }
186
187        let regex = RegexBuilder::new(&query)
188            .case_insensitive(!case_sensitive)
189            .build()?;
190        let inner = SearchInputs {
191            query: initial_query,
192            files_to_exclude,
193            files_to_include,
194            match_full_paths,
195            buffers,
196        };
197        Ok(Self::Regex {
198            regex,
199            replacement: None,
200            multiline,
201            whole_word,
202            case_sensitive,
203            include_ignored,
204            inner,
205            one_match_per_line,
206        })
207    }
208
209    /// Extracts case sensitivity settings from pattern items in the provided
210    /// query and returns the same query, with the pattern items removed.
211    ///
212    /// The following pattern modifiers are supported:
213    ///
214    /// - `\c` (case_sensitive: false)
215    /// - `\C` (case_sensitive: true)
216    ///
217    /// If no pattern item were found, `None` will be returned.
218    fn case_sensitive_from_pattern(query: &str) -> Option<(bool, String)> {
219        if !(query.contains("\\c") || query.contains("\\C")) {
220            return None;
221        }
222
223        let mut was_escaped = false;
224        let mut new_query = String::new();
225        let mut is_case_sensitive = None;
226
227        for c in query.chars() {
228            if was_escaped {
229                if c == 'c' {
230                    is_case_sensitive = Some(false);
231                } else if c == 'C' {
232                    is_case_sensitive = Some(true);
233                } else {
234                    new_query.push('\\');
235                    new_query.push(c);
236                }
237                was_escaped = false
238            } else if c == '\\' {
239                was_escaped = true
240            } else {
241                new_query.push(c);
242            }
243        }
244
245        is_case_sensitive.map(|c| (c, new_query))
246    }
247
248    pub fn from_proto(message: proto::SearchQuery, path_style: PathStyle) -> Result<Self> {
249        let files_to_include = if message.files_to_include.is_empty() {
250            message
251                .files_to_include_legacy
252                .split(',')
253                .map(str::trim)
254                .filter(|&glob_str| !glob_str.is_empty())
255                .map(|s| s.to_string())
256                .collect()
257        } else {
258            message.files_to_include
259        };
260
261        let files_to_exclude = if message.files_to_exclude.is_empty() {
262            message
263                .files_to_exclude_legacy
264                .split(',')
265                .map(str::trim)
266                .filter(|&glob_str| !glob_str.is_empty())
267                .map(|s| s.to_string())
268                .collect()
269        } else {
270            message.files_to_exclude
271        };
272
273        if message.regex {
274            Self::regex(
275                message.query,
276                message.whole_word,
277                message.case_sensitive,
278                message.include_ignored,
279                false,
280                PathMatcher::new(files_to_include, path_style)?,
281                PathMatcher::new(files_to_exclude, path_style)?,
282                message.match_full_paths,
283                None, // search opened only don't need search remote
284            )
285        } else {
286            Self::text(
287                message.query,
288                message.whole_word,
289                message.case_sensitive,
290                message.include_ignored,
291                PathMatcher::new(files_to_include, path_style)?,
292                PathMatcher::new(files_to_exclude, path_style)?,
293                false,
294                None, // search opened only don't need search remote
295            )
296        }
297    }
298
299    pub fn with_replacement(mut self, new_replacement: String) -> Self {
300        match self {
301            Self::Text {
302                ref mut replacement,
303                ..
304            }
305            | Self::Regex {
306                ref mut replacement,
307                ..
308            } => {
309                *replacement = Some(new_replacement);
310                self
311            }
312        }
313    }
314
315    pub fn to_proto(&self) -> proto::SearchQuery {
316        let mut files_to_include = self.files_to_include().sources();
317        let mut files_to_exclude = self.files_to_exclude().sources();
318        proto::SearchQuery {
319            query: self.as_str().to_string(),
320            regex: self.is_regex(),
321            whole_word: self.whole_word(),
322            case_sensitive: self.case_sensitive(),
323            include_ignored: self.include_ignored(),
324            files_to_include: files_to_include.clone().map(ToOwned::to_owned).collect(),
325            files_to_exclude: files_to_exclude.clone().map(ToOwned::to_owned).collect(),
326            match_full_paths: self.match_full_paths(),
327            // Populate legacy fields for backwards compatibility
328            files_to_include_legacy: files_to_include.join(","),
329            files_to_exclude_legacy: files_to_exclude.join(","),
330        }
331    }
332
333    pub(crate) async fn detect(
334        &self,
335        mut reader: BufReader<Box<dyn Read + Send + Sync>>,
336    ) -> Result<bool> {
337        let query_str = self.as_str();
338        let needle_len = query_str.len();
339        if needle_len == 0 {
340            return Ok(false);
341        }
342        if self.as_str().is_empty() {
343            return Ok(false);
344        }
345
346        let mut text = String::new();
347        let mut bytes_read = 0;
348        // Yield from this function every 128 bytes scanned.
349        const YIELD_THRESHOLD: usize = 128;
350        match self {
351            Self::Text { search, .. } => {
352                if query_str.contains('\n') {
353                    reader.read_to_string(&mut text)?;
354                    Ok(search.is_match(&text))
355                } else {
356                    // Yield from this function every 128 bytes scanned.
357                    const YIELD_THRESHOLD: usize = 128;
358                    while reader.read_line(&mut text)? > 0 {
359                        if search.is_match(&text) {
360                            return Ok(true);
361                        }
362                        bytes_read += text.len();
363                        if bytes_read >= YIELD_THRESHOLD {
364                            bytes_read = 0;
365                            smol::future::yield_now().await;
366                        }
367                        text.clear();
368                    }
369                    Ok(false)
370                }
371            }
372            Self::Regex {
373                regex, multiline, ..
374            } => {
375                if *multiline {
376                    if let Err(err) = reader.read_to_string(&mut text) {
377                        Err(err.into())
378                    } else {
379                        Ok(regex.is_match(&text)?)
380                    }
381                } else {
382                    while reader.read_line(&mut text)? > 0 {
383                        if regex.is_match(&text)? {
384                            return Ok(true);
385                        }
386                        bytes_read += text.len();
387                        if bytes_read >= YIELD_THRESHOLD {
388                            bytes_read = 0;
389                            smol::future::yield_now().await;
390                        }
391                        text.clear();
392                    }
393                    Ok(false)
394                }
395            }
396        }
397    }
398    /// Returns the replacement text for this `SearchQuery`.
399    pub fn replacement(&self) -> Option<&str> {
400        match self {
401            SearchQuery::Text { replacement, .. } | SearchQuery::Regex { replacement, .. } => {
402                replacement.as_deref()
403            }
404        }
405    }
406    /// Replaces search hits if replacement is set. `text` is assumed to be a string that matches this `SearchQuery` exactly, without any leftovers on either side.
407    pub fn replacement_for<'a>(&self, text: &'a str) -> Option<Cow<'a, str>> {
408        match self {
409            SearchQuery::Text { replacement, .. } => replacement.clone().map(Cow::from),
410            SearchQuery::Regex {
411                regex, replacement, ..
412            } => {
413                if let Some(replacement) = replacement {
414                    static TEXT_REPLACEMENT_SPECIAL_CHARACTERS_REGEX: LazyLock<Regex> =
415                        LazyLock::new(|| Regex::new(r"\\\\|\\n|\\t").unwrap());
416                    let replacement = TEXT_REPLACEMENT_SPECIAL_CHARACTERS_REGEX.replace_all(
417                        replacement,
418                        |c: &Captures| match c.get(0).unwrap().as_str() {
419                            r"\\" => "\\",
420                            r"\n" => "\n",
421                            r"\t" => "\t",
422                            x => unreachable!("Unexpected escape sequence: {}", x),
423                        },
424                    );
425                    Some(regex.replace(text, replacement))
426                } else {
427                    None
428                }
429            }
430        }
431    }
432
433    pub async fn search(
434        &self,
435        buffer: &BufferSnapshot,
436        subrange: Option<Range<usize>>,
437    ) -> Vec<Range<usize>> {
438        const YIELD_INTERVAL: usize = 20000;
439
440        if self.as_str().is_empty() {
441            return Default::default();
442        }
443
444        let range_offset = subrange.as_ref().map(|r| r.start).unwrap_or(0);
445        let rope = if let Some(range) = subrange {
446            buffer.as_rope().slice(range)
447        } else {
448            buffer.as_rope().clone()
449        };
450
451        let mut matches = Vec::new();
452        match self {
453            Self::Text {
454                search, whole_word, ..
455            } => {
456                for (ix, mat) in search
457                    .stream_find_iter(rope.bytes_in_range(0..rope.len()))
458                    .enumerate()
459                {
460                    if (ix + 1) % YIELD_INTERVAL == 0 {
461                        yield_now().await;
462                    }
463
464                    let mat = mat.unwrap();
465                    if *whole_word {
466                        let classifier = buffer.char_classifier_at(range_offset + mat.start());
467
468                        let prev_kind = rope
469                            .reversed_chars_at(mat.start())
470                            .next()
471                            .map(|c| classifier.kind(c));
472                        let start_kind =
473                            classifier.kind(rope.chars_at(mat.start()).next().unwrap());
474                        let end_kind =
475                            classifier.kind(rope.reversed_chars_at(mat.end()).next().unwrap());
476                        let next_kind = rope.chars_at(mat.end()).next().map(|c| classifier.kind(c));
477                        if (Some(start_kind) == prev_kind && start_kind == CharKind::Word)
478                            || (Some(end_kind) == next_kind && end_kind == CharKind::Word)
479                        {
480                            continue;
481                        }
482                    }
483                    matches.push(mat.start()..mat.end())
484                }
485            }
486
487            Self::Regex {
488                regex, multiline, ..
489            } => {
490                if *multiline {
491                    let text = rope.to_string();
492                    for (ix, mat) in regex.find_iter(&text).enumerate() {
493                        if (ix + 1) % YIELD_INTERVAL == 0 {
494                            yield_now().await;
495                        }
496
497                        if let Ok(mat) = mat {
498                            matches.push(mat.start()..mat.end());
499                        }
500                    }
501                } else {
502                    let mut line = String::new();
503                    let mut line_offset = 0;
504                    for (chunk_ix, chunk) in rope.chunks().chain(["\n"]).enumerate() {
505                        if (chunk_ix + 1) % YIELD_INTERVAL == 0 {
506                            yield_now().await;
507                        }
508
509                        for (newline_ix, text) in chunk.split('\n').enumerate() {
510                            if newline_ix > 0 {
511                                for mat in regex.find_iter(&line).flatten() {
512                                    let start = line_offset + mat.start();
513                                    let end = line_offset + mat.end();
514                                    matches.push(start..end);
515                                    if self.one_match_per_line() == Some(true) {
516                                        break;
517                                    }
518                                }
519
520                                line_offset += line.len() + 1;
521                                line.clear();
522                            }
523                            line.push_str(text);
524                        }
525                    }
526                }
527            }
528        }
529
530        matches
531    }
532
533    pub fn is_empty(&self) -> bool {
534        self.as_str().is_empty()
535    }
536
537    pub fn as_str(&self) -> &str {
538        self.as_inner().as_str()
539    }
540
541    pub fn whole_word(&self) -> bool {
542        match self {
543            Self::Text { whole_word, .. } => *whole_word,
544            Self::Regex { whole_word, .. } => *whole_word,
545        }
546    }
547
548    pub fn case_sensitive(&self) -> bool {
549        match self {
550            Self::Text { case_sensitive, .. } => *case_sensitive,
551            Self::Regex { case_sensitive, .. } => *case_sensitive,
552        }
553    }
554
555    pub fn include_ignored(&self) -> bool {
556        match self {
557            Self::Text {
558                include_ignored, ..
559            } => *include_ignored,
560            Self::Regex {
561                include_ignored, ..
562            } => *include_ignored,
563        }
564    }
565
566    pub fn is_regex(&self) -> bool {
567        matches!(self, Self::Regex { .. })
568    }
569
570    pub fn files_to_include(&self) -> &PathMatcher {
571        self.as_inner().files_to_include()
572    }
573
574    pub fn files_to_exclude(&self) -> &PathMatcher {
575        self.as_inner().files_to_exclude()
576    }
577
578    pub fn buffers(&self) -> Option<&Vec<Entity<Buffer>>> {
579        self.as_inner().buffers.as_ref()
580    }
581
582    pub fn is_opened_only(&self) -> bool {
583        self.as_inner().buffers.is_some()
584    }
585
586    pub fn filters_path(&self) -> bool {
587        !(self.files_to_exclude().sources().next().is_none()
588            && self.files_to_include().sources().next().is_none())
589    }
590
591    pub fn match_full_paths(&self) -> bool {
592        self.as_inner().match_full_paths
593    }
594
595    /// Check match full paths to determine whether you're required to pass a fully qualified
596    /// project path (starts with a project root).
597    pub fn match_path(&self, file_path: &RelPath) -> bool {
598        let mut path = file_path.to_rel_path_buf();
599        loop {
600            if self.files_to_exclude().is_match(&path) {
601                return false;
602            } else if self.files_to_include().sources().next().is_none()
603                || self.files_to_include().is_match(&path)
604            {
605                return true;
606            } else if !path.pop() {
607                return false;
608            }
609        }
610    }
611    pub fn as_inner(&self) -> &SearchInputs {
612        match self {
613            Self::Regex { inner, .. } | Self::Text { inner, .. } => inner,
614        }
615    }
616
617    /// Whether this search should replace only one match per line, instead of
618    /// all matches.
619    /// Returns `None` for text searches, as only regex searches support this
620    /// option.
621    pub fn one_match_per_line(&self) -> Option<bool> {
622        match self {
623            Self::Regex {
624                one_match_per_line, ..
625            } => Some(*one_match_per_line),
626            Self::Text { .. } => None,
627        }
628    }
629}
630
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a regex query with default path filters and every option other
    /// than `whole_word` / `case_sensitive` switched off.
    fn regex_query(pattern: &str, whole_word: bool, case_sensitive: bool) -> SearchQuery {
        SearchQuery::regex(
            pattern,
            whole_word,
            case_sensitive,
            false,
            false,
            Default::default(),
            Default::default(),
            false,
            None,
        )
        .expect("Should be able to create a regex SearchQuery")
    }

    #[test]
    fn path_matcher_creation_for_valid_paths() {
        let valid_paths = [
            "file",
            "Cargo.toml",
            ".DS_Store",
            "~/dir/another_dir/",
            "./dir/file",
            "dir/[a-z].txt",
        ];
        for valid_path in valid_paths {
            let path_matcher = match PathMatcher::new(&[valid_path.to_owned()], PathStyle::local())
            {
                Ok(matcher) => matcher,
                Err(e) => panic!("Valid path {valid_path} should be accepted, but got: {e}"),
            };
            let rel_path = RelPath::new(valid_path.as_ref(), PathStyle::local()).unwrap();
            assert!(
                path_matcher.is_match(&rel_path),
                "Path matcher for valid path {valid_path} should match itself"
            )
        }
    }

    #[test]
    fn path_matcher_creation_for_globs() {
        for invalid_glob in ["dir/[].txt", "dir/[a-z.txt", "dir/{file"] {
            if PathMatcher::new(&[invalid_glob.to_owned()], PathStyle::local()).is_ok() {
                panic!("Invalid glob {invalid_glob} should not be accepted");
            }
        }

        let valid_globs = [
            "dir/?ile",
            "dir/*.txt",
            "dir/**/file",
            "dir/[a-z].txt",
            "{dir,file}",
        ];
        for valid_glob in valid_globs {
            if let Err(e) = PathMatcher::new(&[valid_glob.to_owned()], PathStyle::local()) {
                panic!("Valid glob should be accepted, but got: {e}");
            }
        }
    }

    #[test]
    fn test_case_sensitive_pattern_items() {
        // (pattern, whole_word, initial case_sensitive, expected case_sensitive, failure message)
        let cases = [
            (
                "test\\C",
                false,
                false,
                true,
                "Case sensitivity should be enabled when \\C pattern item is present in the query.",
            ),
            (
                "test\\c",
                true,
                true,
                false,
                "Case sensitivity should be disabled when \\c pattern item is present, even if initially set to true.",
            ),
            (
                "test\\c\\C",
                false,
                false,
                true,
                "Case sensitivity should be enabled when \\C is the last pattern item, even after a \\c.",
            ),
            (
                "tests\\\\C",
                false,
                false,
                false,
                "Case sensitivity should not be enabled when \\C pattern item is preceded by a backslash.",
            ),
        ];

        for (pattern, whole_word, case_sensitive, expected, message) in cases {
            let search_query = regex_query(pattern, whole_word, case_sensitive);
            assert_eq!(search_query.case_sensitive(), expected, "{}", message);
        }
    }

    #[gpui::test]
    async fn test_multiline_regex(cx: &mut gpui::TestAppContext) {
        let search_query = regex_query("^hello$\n", false, false);

        use language::Buffer;
        let text = crate::Rope::from("hello\nworld\nhello\nworld");
        let snapshot = cx
            .update(|app| Buffer::build_snapshot(text, None, None, app))
            .await;

        let results = search_query.search(&snapshot, None).await;
        assert_eq!(results, vec![0..6, 12..18]);
    }
}