search.rs

  1use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
  2use anyhow::Result;
  3use client::proto;
  4use fancy_regex::{Captures, Regex, RegexBuilder};
  5use gpui::Entity;
  6use itertools::Itertools as _;
  7use language::{Buffer, BufferSnapshot, CharKind};
  8use smol::future::yield_now;
  9use std::{
 10    borrow::Cow,
 11    io::{BufRead, BufReader, Read},
 12    ops::Range,
 13    sync::{Arc, LazyLock},
 14};
 15use text::Anchor;
 16use util::{
 17    paths::{PathMatcher, PathStyle},
 18    rel_path::RelPath,
 19};
 20
 21#[derive(Debug)]
 22pub enum SearchResult {
 23    Buffer {
 24        buffer: Entity<Buffer>,
 25        ranges: Vec<Range<Anchor>>,
 26    },
 27    LimitReached,
 28    WaitingForScan,
 29}
 30
 31#[derive(Clone, Copy, PartialEq)]
 32pub enum SearchInputKind {
 33    Query,
 34    Include,
 35    Exclude,
 36}
 37
 38#[derive(Clone, Debug)]
 39pub struct SearchInputs {
 40    query: Arc<str>,
 41    files_to_include: PathMatcher,
 42    files_to_exclude: PathMatcher,
 43    match_full_paths: bool,
 44    buffers: Option<Vec<Entity<Buffer>>>,
 45}
 46
 47impl SearchInputs {
 48    pub fn as_str(&self) -> &str {
 49        self.query.as_ref()
 50    }
 51    pub fn files_to_include(&self) -> &PathMatcher {
 52        &self.files_to_include
 53    }
 54    pub fn files_to_exclude(&self) -> &PathMatcher {
 55        &self.files_to_exclude
 56    }
 57    pub fn buffers(&self) -> &Option<Vec<Entity<Buffer>>> {
 58        &self.buffers
 59    }
 60}
 61#[derive(Clone, Debug)]
 62pub enum SearchQuery {
 63    Text {
 64        search: AhoCorasick,
 65        replacement: Option<String>,
 66        whole_word: bool,
 67        case_sensitive: bool,
 68        include_ignored: bool,
 69        inner: SearchInputs,
 70    },
 71    Regex {
 72        regex: Regex,
 73        replacement: Option<String>,
 74        multiline: bool,
 75        whole_word: bool,
 76        case_sensitive: bool,
 77        include_ignored: bool,
 78        one_match_per_line: bool,
 79        inner: SearchInputs,
 80    },
 81}
 82
 83static WORD_MATCH_TEST: LazyLock<Regex> = LazyLock::new(|| {
 84    RegexBuilder::new(r"\B")
 85        .build()
 86        .expect("Failed to create WORD_MATCH_TEST")
 87});
 88
 89impl SearchQuery {
 90    /// Create a text query
 91    ///
 92    /// If `match_full_paths` is true, include/exclude patterns will always be matched against fully qualified project paths beginning with a project root.
 93    /// If `match_full_paths` is false, patterns will be matched against worktree-relative paths.
 94    pub fn text(
 95        query: impl ToString,
 96        whole_word: bool,
 97        case_sensitive: bool,
 98        include_ignored: bool,
 99        files_to_include: PathMatcher,
100        files_to_exclude: PathMatcher,
101        match_full_paths: bool,
102        buffers: Option<Vec<Entity<Buffer>>>,
103    ) -> Result<Self> {
104        let query = query.to_string();
105        if !case_sensitive && !query.is_ascii() {
106            // AhoCorasickBuilder doesn't support case-insensitive search with unicode characters
107            // Fallback to regex search as recommended by
108            // https://docs.rs/aho-corasick/1.1/aho_corasick/struct.AhoCorasickBuilder.html#method.ascii_case_insensitive
109            return Self::escaped_regex(
110                query,
111                whole_word,
112                case_sensitive,
113                include_ignored,
114                files_to_include,
115                files_to_exclude,
116                false,
117                buffers,
118            );
119        }
120        let search = AhoCorasickBuilder::new()
121            .ascii_case_insensitive(!case_sensitive)
122            .build([&query])?;
123        let inner = SearchInputs {
124            query: query.into(),
125            files_to_exclude,
126            files_to_include,
127            match_full_paths,
128            buffers,
129        };
130        Ok(Self::Text {
131            search,
132            replacement: None,
133            whole_word,
134            case_sensitive,
135            include_ignored,
136            inner,
137        })
138    }
139
140    /// Create a regex query
141    ///
142    /// If `match_full_paths` is true, include/exclude patterns will be matched against fully qualified project paths
143    /// beginning with a project root name. If false, they will be matched against project-relative paths (which don't start
144    /// with their respective project root).
145    pub fn regex(
146        query: impl ToString,
147        whole_word: bool,
148        case_sensitive: bool,
149        include_ignored: bool,
150        one_match_per_line: bool,
151        files_to_include: PathMatcher,
152        files_to_exclude: PathMatcher,
153        match_full_paths: bool,
154        buffers: Option<Vec<Entity<Buffer>>>,
155    ) -> Result<Self> {
156        let query = query.to_string();
157        let inner = SearchInputs {
158            query: Arc::from(query.as_str()),
159            files_to_include,
160            files_to_exclude,
161            match_full_paths,
162            buffers,
163        };
164        Self::build_regex(
165            query,
166            whole_word,
167            case_sensitive,
168            include_ignored,
169            one_match_per_line,
170            inner,
171        )
172    }
173
174    /// Create a regex query from a literal string, escaping any regex
175    /// metacharacters so that the resulting query matches the literal text.
176    ///
177    /// Unlike `regex`, the query stored on the resulting `SearchQuery` is the
178    /// original unescaped text, so `as_str` returns what the user typed.
179    pub fn escaped_regex(
180        query: impl ToString,
181        whole_word: bool,
182        case_sensitive: bool,
183        include_ignored: bool,
184        files_to_include: PathMatcher,
185        files_to_exclude: PathMatcher,
186        match_full_paths: bool,
187        buffers: Option<Vec<Entity<Buffer>>>,
188    ) -> Result<Self> {
189        let query = query.to_string();
190        let inner = SearchInputs {
191            query: Arc::from(query.as_str()),
192            files_to_include,
193            files_to_exclude,
194            match_full_paths,
195            buffers,
196        };
197        Self::build_regex(
198            regex::escape(&query),
199            whole_word,
200            case_sensitive,
201            include_ignored,
202            false,
203            inner,
204        )
205    }
206
207    fn build_regex(
208        mut pattern: String,
209        whole_word: bool,
210        mut case_sensitive: bool,
211        include_ignored: bool,
212        one_match_per_line: bool,
213        inner: SearchInputs,
214    ) -> Result<Self> {
215        if let Some((case_sensitive_from_pattern, new_pattern)) =
216            Self::case_sensitive_from_pattern(&pattern)
217        {
218            case_sensitive = case_sensitive_from_pattern;
219            pattern = new_pattern
220        }
221
222        if whole_word {
223            let mut word_pattern = String::new();
224            if let Some(first) = pattern.get(0..1)
225                && WORD_MATCH_TEST.is_match(first).is_ok_and(|x| !x)
226            {
227                word_pattern.push_str("\\b");
228            }
229            word_pattern.push_str(&pattern);
230            if let Some(last) = pattern.get(pattern.len() - 1..)
231                && WORD_MATCH_TEST.is_match(last).is_ok_and(|x| !x)
232            {
233                word_pattern.push_str("\\b");
234            }
235            pattern = word_pattern
236        }
237
238        let multiline = pattern.contains('\n') || pattern.contains("\\n");
239        if multiline {
240            pattern.insert_str(0, "(?m)");
241        }
242
243        let regex = RegexBuilder::new(&pattern)
244            .case_insensitive(!case_sensitive)
245            .build()?;
246        Ok(Self::Regex {
247            regex,
248            replacement: None,
249            multiline,
250            whole_word,
251            case_sensitive,
252            include_ignored,
253            inner,
254            one_match_per_line,
255        })
256    }
257
258    /// Extracts case sensitivity settings from pattern items in the provided
259    /// query and returns the same query, with the pattern items removed.
260    ///
261    /// The following pattern modifiers are supported:
262    ///
263    /// - `\c` (case_sensitive: false)
264    /// - `\C` (case_sensitive: true)
265    ///
266    /// If no pattern item were found, `None` will be returned.
267    fn case_sensitive_from_pattern(query: &str) -> Option<(bool, String)> {
268        if !(query.contains("\\c") || query.contains("\\C")) {
269            return None;
270        }
271
272        let mut was_escaped = false;
273        let mut new_query = String::new();
274        let mut is_case_sensitive = None;
275
276        for c in query.chars() {
277            if was_escaped {
278                if c == 'c' {
279                    is_case_sensitive = Some(false);
280                } else if c == 'C' {
281                    is_case_sensitive = Some(true);
282                } else {
283                    new_query.push('\\');
284                    new_query.push(c);
285                }
286                was_escaped = false
287            } else if c == '\\' {
288                was_escaped = true
289            } else {
290                new_query.push(c);
291            }
292        }
293
294        is_case_sensitive.map(|c| (c, new_query))
295    }
296
297    pub fn from_proto(message: proto::SearchQuery, path_style: PathStyle) -> Result<Self> {
298        let files_to_include = if message.files_to_include.is_empty() {
299            message
300                .files_to_include_legacy
301                .split(',')
302                .map(str::trim)
303                .filter(|&glob_str| !glob_str.is_empty())
304                .map(|s| s.to_string())
305                .collect()
306        } else {
307            message.files_to_include
308        };
309
310        let files_to_exclude = if message.files_to_exclude.is_empty() {
311            message
312                .files_to_exclude_legacy
313                .split(',')
314                .map(str::trim)
315                .filter(|&glob_str| !glob_str.is_empty())
316                .map(|s| s.to_string())
317                .collect()
318        } else {
319            message.files_to_exclude
320        };
321
322        if message.regex {
323            Self::regex(
324                message.query,
325                message.whole_word,
326                message.case_sensitive,
327                message.include_ignored,
328                false,
329                PathMatcher::new(files_to_include, path_style)?,
330                PathMatcher::new(files_to_exclude, path_style)?,
331                message.match_full_paths,
332                None, // search opened only don't need search remote
333            )
334        } else {
335            Self::text(
336                message.query,
337                message.whole_word,
338                message.case_sensitive,
339                message.include_ignored,
340                PathMatcher::new(files_to_include, path_style)?,
341                PathMatcher::new(files_to_exclude, path_style)?,
342                message.match_full_paths,
343                None, // search opened only don't need search remote
344            )
345        }
346    }
347
348    pub fn with_replacement(mut self, new_replacement: String) -> Self {
349        match self {
350            Self::Text {
351                ref mut replacement,
352                ..
353            }
354            | Self::Regex {
355                ref mut replacement,
356                ..
357            } => {
358                *replacement = Some(new_replacement);
359                self
360            }
361        }
362    }
363
364    pub fn to_proto(&self) -> proto::SearchQuery {
365        let mut files_to_include = self.files_to_include().sources();
366        let mut files_to_exclude = self.files_to_exclude().sources();
367        proto::SearchQuery {
368            query: self.as_str().to_string(),
369            regex: self.is_regex(),
370            whole_word: self.whole_word(),
371            case_sensitive: self.case_sensitive(),
372            include_ignored: self.include_ignored(),
373            files_to_include: files_to_include.clone().map(ToOwned::to_owned).collect(),
374            files_to_exclude: files_to_exclude.clone().map(ToOwned::to_owned).collect(),
375            match_full_paths: self.match_full_paths(),
376            // Populate legacy fields for backwards compatibility
377            files_to_include_legacy: files_to_include.join(","),
378            files_to_exclude_legacy: files_to_exclude.join(","),
379        }
380    }
381
382    pub(crate) async fn detect(
383        &self,
384        mut reader: BufReader<Box<dyn Read + Send + Sync>>,
385    ) -> Result<bool> {
386        let query_str = self.as_str();
387        if query_str.is_empty() {
388            return Ok(false);
389        }
390
391        // Yield from this function every 20KB scanned.
392        const YIELD_THRESHOLD: usize = 20 * 1024;
393
394        match self {
395            Self::Text { search, .. } => {
396                let mut text = String::new();
397                if query_str.contains('\n') {
398                    reader.read_to_string(&mut text)?;
399                    Ok(search.is_match(&text))
400                } else {
401                    let mut bytes_read = 0;
402                    while reader.read_line(&mut text)? > 0 {
403                        if search.is_match(&text) {
404                            return Ok(true);
405                        }
406                        bytes_read += text.len();
407                        if bytes_read >= YIELD_THRESHOLD {
408                            bytes_read = 0;
409                            smol::future::yield_now().await;
410                        }
411                        text.clear();
412                    }
413                    Ok(false)
414                }
415            }
416            Self::Regex {
417                regex, multiline, ..
418            } => {
419                let mut text = String::new();
420                if *multiline {
421                    reader.read_to_string(&mut text)?;
422                    Ok(regex.is_match(&text)?)
423                } else {
424                    let mut bytes_read = 0;
425                    while reader.read_line(&mut text)? > 0 {
426                        if regex.is_match(&text)? {
427                            return Ok(true);
428                        }
429                        bytes_read += text.len();
430                        if bytes_read >= YIELD_THRESHOLD {
431                            bytes_read = 0;
432                            smol::future::yield_now().await;
433                        }
434                        text.clear();
435                    }
436                    Ok(false)
437                }
438            }
439        }
440    }
441    /// Returns the replacement text for this `SearchQuery`.
442    pub fn replacement(&self) -> Option<&str> {
443        match self {
444            SearchQuery::Text { replacement, .. } | SearchQuery::Regex { replacement, .. } => {
445                replacement.as_deref()
446            }
447        }
448    }
449    /// Replaces search hits if replacement is set. `text` is assumed to be a string that matches this `SearchQuery` exactly, without any leftovers on either side.
450    pub fn replacement_for<'a>(&self, text: &'a str) -> Option<Cow<'a, str>> {
451        match self {
452            SearchQuery::Text { replacement, .. } => replacement.clone().map(Cow::from),
453            SearchQuery::Regex {
454                regex, replacement, ..
455            } => {
456                if let Some(replacement) = replacement {
457                    static TEXT_REPLACEMENT_SPECIAL_CHARACTERS_REGEX: LazyLock<Regex> =
458                        LazyLock::new(|| Regex::new(r"\\\\|\\n|\\t").unwrap());
459                    let replacement = TEXT_REPLACEMENT_SPECIAL_CHARACTERS_REGEX.replace_all(
460                        replacement,
461                        |c: &Captures| match c.get(0).unwrap().as_str() {
462                            r"\\" => "\\",
463                            r"\n" => "\n",
464                            r"\t" => "\t",
465                            x => unreachable!("Unexpected escape sequence: {}", x),
466                        },
467                    );
468                    Some(regex.replace(text, replacement))
469                } else {
470                    None
471                }
472            }
473        }
474    }
475
476    pub async fn search(
477        &self,
478        buffer: &BufferSnapshot,
479        subrange: Option<Range<usize>>,
480    ) -> Vec<Range<usize>> {
481        const YIELD_INTERVAL: usize = 20000;
482
483        if self.as_str().is_empty() {
484            return Default::default();
485        }
486
487        let range_offset = subrange.as_ref().map(|r| r.start).unwrap_or(0);
488        let rope = if let Some(range) = subrange {
489            buffer.as_rope().slice(range)
490        } else {
491            buffer.as_rope().clone()
492        };
493
494        let mut matches = Vec::new();
495        match self {
496            Self::Text {
497                search, whole_word, ..
498            } => {
499                for (ix, mat) in search
500                    .stream_find_iter(rope.bytes_in_range(0..rope.len()))
501                    .enumerate()
502                {
503                    if (ix + 1) % YIELD_INTERVAL == 0 {
504                        yield_now().await;
505                    }
506
507                    let mat = mat.unwrap();
508                    if *whole_word {
509                        let classifier = buffer.char_classifier_at(range_offset + mat.start());
510
511                        let prev_kind = rope
512                            .reversed_chars_at(mat.start())
513                            .next()
514                            .map(|c| classifier.kind(c));
515                        let start_kind =
516                            classifier.kind(rope.chars_at(mat.start()).next().unwrap());
517                        let end_kind =
518                            classifier.kind(rope.reversed_chars_at(mat.end()).next().unwrap());
519                        let next_kind = rope.chars_at(mat.end()).next().map(|c| classifier.kind(c));
520                        if (Some(start_kind) == prev_kind && start_kind == CharKind::Word)
521                            || (Some(end_kind) == next_kind && end_kind == CharKind::Word)
522                        {
523                            continue;
524                        }
525                    }
526                    matches.push(mat.start()..mat.end())
527                }
528            }
529
530            Self::Regex {
531                regex, multiline, ..
532            } => {
533                if *multiline {
534                    let text = rope.to_string();
535                    for (ix, mat) in regex.find_iter(&text).enumerate() {
536                        if (ix + 1) % YIELD_INTERVAL == 0 {
537                            yield_now().await;
538                        }
539
540                        if let Ok(mat) = mat {
541                            matches.push(mat.start()..mat.end());
542                        }
543                    }
544                } else {
545                    let mut line = String::new();
546                    let mut line_offset = 0;
547                    for (chunk_ix, chunk) in rope.chunks().chain(["\n"]).enumerate() {
548                        if (chunk_ix + 1) % YIELD_INTERVAL == 0 {
549                            yield_now().await;
550                        }
551
552                        for (newline_ix, text) in chunk.split('\n').enumerate() {
553                            if newline_ix > 0 {
554                                for mat in regex.find_iter(&line).flatten() {
555                                    let start = line_offset + mat.start();
556                                    let end = line_offset + mat.end();
557                                    matches.push(start..end);
558                                    if self.one_match_per_line() == Some(true) {
559                                        break;
560                                    }
561                                }
562
563                                line_offset += line.len() + 1;
564                                line.clear();
565                            }
566                            line.push_str(text);
567                        }
568                    }
569                }
570            }
571        }
572
573        matches
574    }
575
576    pub fn is_empty(&self) -> bool {
577        self.as_str().is_empty()
578    }
579
580    pub fn as_str(&self) -> &str {
581        self.as_inner().as_str()
582    }
583
584    pub fn whole_word(&self) -> bool {
585        match self {
586            Self::Text { whole_word, .. } => *whole_word,
587            Self::Regex { whole_word, .. } => *whole_word,
588        }
589    }
590
591    pub fn case_sensitive(&self) -> bool {
592        match self {
593            Self::Text { case_sensitive, .. } => *case_sensitive,
594            Self::Regex { case_sensitive, .. } => *case_sensitive,
595        }
596    }
597
598    pub fn include_ignored(&self) -> bool {
599        match self {
600            Self::Text {
601                include_ignored, ..
602            } => *include_ignored,
603            Self::Regex {
604                include_ignored, ..
605            } => *include_ignored,
606        }
607    }
608
609    pub fn is_regex(&self) -> bool {
610        matches!(self, Self::Regex { .. })
611    }
612
613    pub fn files_to_include(&self) -> &PathMatcher {
614        self.as_inner().files_to_include()
615    }
616
617    pub fn files_to_exclude(&self) -> &PathMatcher {
618        self.as_inner().files_to_exclude()
619    }
620
621    pub fn buffers(&self) -> Option<&Vec<Entity<Buffer>>> {
622        self.as_inner().buffers.as_ref()
623    }
624
625    pub fn is_opened_only(&self) -> bool {
626        self.as_inner().buffers.is_some()
627    }
628
629    pub fn filters_path(&self) -> bool {
630        !(self.files_to_exclude().sources().next().is_none()
631            && self.files_to_include().sources().next().is_none())
632    }
633
634    pub fn match_full_paths(&self) -> bool {
635        self.as_inner().match_full_paths
636    }
637
638    /// Check match full paths to determine whether you're required to pass a fully qualified
639    /// project path (starts with a project root).
640    pub fn match_path(&self, file_path: &RelPath) -> bool {
641        let mut path = file_path.to_rel_path_buf();
642        loop {
643            if self.files_to_exclude().is_match(&path) {
644                return false;
645            } else if self.files_to_include().sources().next().is_none()
646                || self.files_to_include().is_match(&path)
647            {
648                return true;
649            } else if !path.pop() {
650                return false;
651            }
652        }
653    }
654    pub fn as_inner(&self) -> &SearchInputs {
655        match self {
656            Self::Regex { inner, .. } | Self::Text { inner, .. } => inner,
657        }
658    }
659
660    /// Whether this search should replace only one match per line, instead of
661    /// all matches.
662    /// Returns `None` for text searches, as only regex searches support this
663    /// option.
664    pub fn one_match_per_line(&self) -> Option<bool> {
665        match self {
666            Self::Regex {
667                one_match_per_line, ..
668            } => Some(*one_match_per_line),
669            Self::Text { .. } => None,
670        }
671    }
672
673    pub fn search_str(&self, text: &str) -> Vec<Range<usize>> {
674        if self.as_str().is_empty() {
675            return Vec::new();
676        }
677
678        let is_word_char = |c: char| c.is_alphanumeric() || c == '_';
679
680        let mut matches = Vec::new();
681        match self {
682            Self::Text {
683                search, whole_word, ..
684            } => {
685                for mat in search.find_iter(text.as_bytes()) {
686                    if *whole_word {
687                        let prev_char = text[..mat.start()].chars().last();
688                        let next_char = text[mat.end()..].chars().next();
689                        if prev_char.is_some_and(&is_word_char)
690                            || next_char.is_some_and(&is_word_char)
691                        {
692                            continue;
693                        }
694                    }
695                    matches.push(mat.start()..mat.end());
696                }
697            }
698            Self::Regex {
699                regex,
700                multiline,
701                one_match_per_line,
702                ..
703            } => {
704                if *multiline {
705                    for mat in regex.find_iter(text).flatten() {
706                        matches.push(mat.start()..mat.end());
707                    }
708                } else {
709                    let mut line_offset = 0;
710                    for line in text.split('\n') {
711                        for mat in regex.find_iter(line).flatten() {
712                            matches.push((line_offset + mat.start())..(line_offset + mat.end()));
713                            if *one_match_per_line {
714                                break;
715                            }
716                        }
717                        line_offset += line.len() + 1;
718                    }
719                }
720            }
721        }
722        matches
723    }
724}