project_search.rs

   1use std::{
   2    cell::LazyCell,
   3    collections::BTreeSet,
   4    io::{BufReader, Cursor, Read},
   5    ops::Range,
   6    path::{Path, PathBuf},
   7    pin::pin,
   8    sync::Arc,
   9};
  10
  11use anyhow::Context;
  12use collections::HashSet;
  13use fs::Fs;
  14use futures::{SinkExt, StreamExt, select_biased, stream::FuturesOrdered};
  15use gpui::{App, AppContext, AsyncApp, Entity, Task};
  16use language::{Buffer, BufferSnapshot};
  17use parking_lot::Mutex;
  18use postage::oneshot;
  19use rpc::{AnyProtoClient, proto};
  20use smol::{
  21    channel::{Receiver, Sender, bounded, unbounded},
  22    future::FutureExt,
  23};
  24
  25use text::BufferId;
  26use util::{ResultExt, maybe, paths::compare_rel_paths, rel_path::RelPath};
  27use worktree::{Entry, ProjectEntryId, Snapshot, Worktree, WorktreeSettings};
  28
  29use crate::{
  30    Project, ProjectItem, ProjectPath, RemotelyCreatedModels,
  31    buffer_store::BufferStore,
  32    search::{SearchQuery, SearchResult},
  33    worktree_store::WorktreeStore,
  34};
  35
  36pub struct Search {
  37    buffer_store: Entity<BufferStore>,
  38    worktree_store: Entity<WorktreeStore>,
  39    limit: usize,
  40    kind: SearchKind,
  41}
  42
  43/// Represents search setup, before it is actually kicked off with Search::into_results
  44enum SearchKind {
  45    /// Search for candidates by inspecting file contents on file system, avoiding loading the buffer unless we know that a given file contains a match.
  46    Local {
  47        fs: Arc<dyn Fs>,
  48        worktrees: Vec<Entity<Worktree>>,
  49    },
  50    /// Query remote host for candidates. As of writing, the host runs a local search in "buffers with matches only" mode.
  51    Remote {
  52        client: AnyProtoClient,
  53        remote_id: u64,
  54        models: Arc<Mutex<RemotelyCreatedModels>>,
  55    },
  56    /// Run search against a known set of candidates. Even when working with a remote host, this won't round-trip to host.
  57    OpenBuffersOnly,
  58}
  59
  60/// Represents results of project search and allows one to either obtain match positions OR
  61/// just the handles to buffers that may match the search. Grabbing the handles is cheaper than obtaining full match positions, because in that case we'll look for
  62/// at most one match in each file.
  63#[must_use]
  64pub struct SearchResultsHandle {
  65    results: Receiver<SearchResult>,
  66    matching_buffers: Receiver<Entity<Buffer>>,
  67    trigger_search: Box<dyn FnOnce(&mut App) -> Task<()> + Send + Sync>,
  68}
  69
  70impl SearchResultsHandle {
  71    pub fn results(self, cx: &mut App) -> (Receiver<SearchResult>, Task<()>) {
  72        let task = (self.trigger_search)(cx);
  73        (self.results, task)
  74    }
  75
  76    pub fn matching_buffers(self, cx: &mut App) -> (Receiver<Entity<Buffer>>, Task<()>) {
  77        let task = (self.trigger_search)(cx);
  78        (self.matching_buffers, task)
  79    }
  80}
  81
  82#[derive(Clone)]
  83enum FindSearchCandidates {
  84    Local {
  85        fs: Arc<dyn Fs>,
  86        /// Start off with all paths in project and filter them based on:
  87        /// - Include filters
  88        /// - Exclude filters
  89        /// - Only open buffers
  90        /// - Scan ignored files
  91        /// Put another way: filter out files that can't match (without looking at file contents)
  92        input_paths_rx: Receiver<InputPath>,
  93        /// After that, if the buffer is not yet loaded, we'll figure out if it contains at least one match
  94        /// based on disk contents of a buffer. This step is not performed for buffers we already have in memory.
  95        confirm_contents_will_match_tx: Sender<MatchingEntry>,
  96        confirm_contents_will_match_rx: Receiver<MatchingEntry>,
  97    },
  98    Remote,
  99    OpenBuffersOnly,
 100}
 101
 102impl Search {
 103    pub fn local(
 104        fs: Arc<dyn Fs>,
 105        buffer_store: Entity<BufferStore>,
 106        worktree_store: Entity<WorktreeStore>,
 107        limit: usize,
 108        cx: &mut App,
 109    ) -> Self {
 110        let worktrees = worktree_store.read(cx).visible_worktrees(cx).collect();
 111        Self {
 112            kind: SearchKind::Local { fs, worktrees },
 113            buffer_store,
 114            worktree_store,
 115            limit,
 116        }
 117    }
 118
 119    pub(crate) fn remote(
 120        buffer_store: Entity<BufferStore>,
 121        worktree_store: Entity<WorktreeStore>,
 122        limit: usize,
 123        client_state: (AnyProtoClient, u64, Arc<Mutex<RemotelyCreatedModels>>),
 124    ) -> Self {
 125        Self {
 126            kind: SearchKind::Remote {
 127                client: client_state.0,
 128                remote_id: client_state.1,
 129                models: client_state.2,
 130            },
 131            buffer_store,
 132            worktree_store,
 133            limit,
 134        }
 135    }
 136    pub(crate) fn open_buffers_only(
 137        buffer_store: Entity<BufferStore>,
 138        worktree_store: Entity<WorktreeStore>,
 139        limit: usize,
 140    ) -> Self {
 141        Self {
 142            kind: SearchKind::OpenBuffersOnly,
 143            buffer_store,
 144            worktree_store,
 145            limit,
 146        }
 147    }
 148
 149    pub(crate) const MAX_SEARCH_RESULT_FILES: usize = 5_000;
 150    pub(crate) const MAX_SEARCH_RESULT_RANGES: usize = 10_000;
 151    /// Prepares a project search run. The resulting [`SearchResultsHandle`] has to be used to specify whether you're interested in matching buffers
 152    /// or full search results.
 153    pub fn into_handle(mut self, query: SearchQuery, cx: &mut App) -> SearchResultsHandle {
 154        let mut open_buffers = HashSet::default();
 155        let mut unnamed_buffers = Vec::new();
 156        const MAX_CONCURRENT_BUFFER_OPENS: usize = 64;
 157        let buffers = self.buffer_store.read(cx);
 158        for handle in buffers.buffers() {
 159            let buffer = handle.read(cx);
 160            if !buffers.is_searchable(&buffer.remote_id()) {
 161                continue;
 162            } else if let Some(entry_id) = buffer.entry_id(cx) {
 163                open_buffers.insert(entry_id);
 164            } else {
 165                self.limit = self.limit.saturating_sub(1);
 166                unnamed_buffers.push(handle)
 167            };
 168        }
 169        let executor = cx.background_executor().clone();
 170        let (tx, rx) = unbounded();
 171        let (grab_buffer_snapshot_tx, grab_buffer_snapshot_rx) = unbounded();
 172        let matching_buffers = grab_buffer_snapshot_rx.clone();
 173        let trigger_search = Box::new(move |cx: &mut App| {
 174            cx.spawn(async move |cx| {
 175                for buffer in unnamed_buffers {
 176                    _ = grab_buffer_snapshot_tx.send(buffer).await;
 177                }
 178
 179                let (find_all_matches_tx, find_all_matches_rx) =
 180                    bounded(MAX_CONCURRENT_BUFFER_OPENS);
 181                let query = Arc::new(query);
 182                let (candidate_searcher, tasks) = match self.kind {
 183                    SearchKind::OpenBuffersOnly => {
 184                        let Ok(open_buffers) = cx.update(|cx| self.all_loaded_buffers(&query, cx))
 185                        else {
 186                            return;
 187                        };
 188                        let fill_requests = cx
 189                            .background_spawn(async move {
 190                                for buffer in open_buffers {
 191                                    if let Err(_) = grab_buffer_snapshot_tx.send(buffer).await {
 192                                        return;
 193                                    }
 194                                }
 195                            })
 196                            .boxed_local();
 197                        (FindSearchCandidates::OpenBuffersOnly, vec![fill_requests])
 198                    }
 199                    SearchKind::Local {
 200                        fs,
 201                        ref mut worktrees,
 202                    } => {
 203                        let (get_buffer_for_full_scan_tx, get_buffer_for_full_scan_rx) =
 204                            unbounded();
 205                        let (confirm_contents_will_match_tx, confirm_contents_will_match_rx) =
 206                            bounded(64);
 207                        let (sorted_search_results_tx, sorted_search_results_rx) = unbounded();
 208
 209                        let (input_paths_tx, input_paths_rx) = unbounded();
 210                        let tasks = vec![
 211                            cx.spawn(Self::provide_search_paths(
 212                                std::mem::take(worktrees),
 213                                query.clone(),
 214                                input_paths_tx,
 215                                sorted_search_results_tx,
 216                            ))
 217                            .boxed_local(),
 218                            Self::open_buffers(
 219                                &self.buffer_store,
 220                                get_buffer_for_full_scan_rx,
 221                                grab_buffer_snapshot_tx,
 222                                cx.clone(),
 223                            )
 224                            .boxed_local(),
 225                            cx.background_spawn(Self::maintain_sorted_search_results(
 226                                sorted_search_results_rx,
 227                                get_buffer_for_full_scan_tx,
 228                                self.limit,
 229                            ))
 230                            .boxed_local(),
 231                        ];
 232                        (
 233                            FindSearchCandidates::Local {
 234                                fs,
 235                                confirm_contents_will_match_tx,
 236                                confirm_contents_will_match_rx,
 237                                input_paths_rx,
 238                            },
 239                            tasks,
 240                        )
 241                    }
 242                    SearchKind::Remote {
 243                        client,
 244                        remote_id,
 245                        models,
 246                    } => {
 247                        let request = client.request(proto::FindSearchCandidates {
 248                            project_id: remote_id,
 249                            query: Some(query.to_proto()),
 250                            limit: self.limit as _,
 251                        });
 252                        let Ok(guard) = cx.update(|cx| {
 253                            Project::retain_remotely_created_models_impl(
 254                                &models,
 255                                &self.buffer_store,
 256                                &self.worktree_store,
 257                                cx,
 258                            )
 259                        }) else {
 260                            return;
 261                        };
 262                        let buffer_store = self.buffer_store.downgrade();
 263                        let issue_remote_buffers_request = cx
 264                            .spawn(async move |cx| {
 265                                let _ = maybe!(async move {
 266                                    let response = request.await?;
 267                                    for buffer_id in response.buffer_ids {
 268                                        let buffer_id = BufferId::new(buffer_id)?;
 269                                        let buffer = buffer_store
 270                                            .update(cx, |buffer_store, cx| {
 271                                                buffer_store.wait_for_remote_buffer(buffer_id, cx)
 272                                            })?
 273                                            .await?;
 274                                        let _ = grab_buffer_snapshot_tx.send(buffer).await;
 275                                    }
 276
 277                                    drop(guard);
 278                                    anyhow::Ok(())
 279                                })
 280                                .await
 281                                .log_err();
 282                            })
 283                            .boxed_local();
 284                        (
 285                            FindSearchCandidates::Remote,
 286                            vec![issue_remote_buffers_request],
 287                        )
 288                    }
 289                };
 290
 291                let should_find_all_matches = !tx.is_closed();
 292
 293                let worker_pool = executor.scoped(|scope| {
 294                    let num_cpus = executor.num_cpus();
 295
 296                    assert!(num_cpus > 0);
 297                    for _ in 0..executor.num_cpus() - 1 {
 298                        let worker = Worker {
 299                            query: &query,
 300                            open_buffers: &open_buffers,
 301                            candidates: candidate_searcher.clone(),
 302                            find_all_matches_rx: find_all_matches_rx.clone(),
 303                        };
 304                        scope.spawn(worker.run());
 305                    }
 306
 307                    drop(find_all_matches_rx);
 308                    drop(candidate_searcher);
 309                });
 310
 311                let (sorted_matches_tx, sorted_matches_rx) = unbounded();
 312                // The caller of `into_handle` decides whether they're interested in all matches (files that matched + all matching ranges) or
 313                // just the files. *They are using the same stream as the guts of the project search do*.
 314                // This means that we cannot grab values off of that stream unless it's strictly needed for making a progress in project search.
 315                //
 316                // Grabbing buffer snapshots is only necessary when we're looking for all matches. If the caller decided that they're not interested
 317                // in all matches, running that task unconditionally would hinder caller's ability to observe all matching file paths.
 318                let buffer_snapshots = if should_find_all_matches {
 319                    Some(
 320                        Self::grab_buffer_snapshots(
 321                            grab_buffer_snapshot_rx,
 322                            find_all_matches_tx,
 323                            sorted_matches_tx,
 324                            cx.clone(),
 325                        )
 326                        .boxed_local(),
 327                    )
 328                } else {
 329                    drop(find_all_matches_tx);
 330
 331                    None
 332                };
 333                let ensure_matches_are_reported_in_order = if should_find_all_matches {
 334                    Some(
 335                        Self::ensure_matched_ranges_are_reported_in_order(sorted_matches_rx, tx)
 336                            .boxed_local(),
 337                    )
 338                } else {
 339                    drop(tx);
 340                    None
 341                };
 342
 343                futures::future::join_all(
 344                    [worker_pool.boxed_local()]
 345                        .into_iter()
 346                        .chain(buffer_snapshots)
 347                        .chain(ensure_matches_are_reported_in_order)
 348                        .chain(tasks),
 349                )
 350                .await;
 351            })
 352        });
 353
 354        SearchResultsHandle {
 355            results: rx,
 356            matching_buffers,
 357            trigger_search,
 358        }
 359    }
 360
 361    fn provide_search_paths(
 362        worktrees: Vec<Entity<Worktree>>,
 363        query: Arc<SearchQuery>,
 364        tx: Sender<InputPath>,
 365        results: Sender<oneshot::Receiver<ProjectPath>>,
 366    ) -> impl AsyncFnOnce(&mut AsyncApp) {
 367        async move |cx| {
 368            _ = maybe!(async move {
 369                let gitignored_tracker = PathInclusionMatcher::new(query.clone());
 370                for worktree in worktrees {
 371                    let (mut snapshot, worktree_settings) = worktree
 372                        .read_with(cx, |this, _| {
 373                            Some((this.snapshot(), this.as_local()?.settings()))
 374                        })?
 375                        .context("The worktree is not local")?;
 376                    if query.include_ignored() {
 377                        // Pre-fetch all of the ignored directories as they're going to be searched.
 378                        let mut entries_to_refresh = vec![];
 379
 380                        for entry in snapshot.entries(query.include_ignored(), 0) {
 381                            if gitignored_tracker.should_scan_gitignored_dir(
 382                                entry,
 383                                &snapshot,
 384                                &worktree_settings,
 385                            ) {
 386                                entries_to_refresh.push(entry.path.clone());
 387                            }
 388                        }
 389                        let barrier = worktree.update(cx, |this, _| {
 390                            let local = this.as_local_mut()?;
 391                            let barrier = entries_to_refresh
 392                                .into_iter()
 393                                .map(|path| local.add_path_prefix_to_scan(path).into_future())
 394                                .collect::<Vec<_>>();
 395                            Some(barrier)
 396                        })?;
 397                        if let Some(barriers) = barrier {
 398                            futures::future::join_all(barriers).await;
 399                        }
 400                        snapshot = worktree.read_with(cx, |this, _| this.snapshot())?;
 401                    }
 402                    cx.background_executor()
 403                        .scoped(|scope| {
 404                            scope.spawn(async {
 405                                for entry in snapshot.files(query.include_ignored(), 0) {
 406                                    let (should_scan_tx, should_scan_rx) = oneshot::channel();
 407
 408                                    let Ok(_) = tx
 409                                        .send(InputPath {
 410                                            entry: entry.clone(),
 411                                            snapshot: snapshot.clone(),
 412                                            should_scan_tx,
 413                                        })
 414                                        .await
 415                                    else {
 416                                        return;
 417                                    };
 418                                    if results.send(should_scan_rx).await.is_err() {
 419                                        return;
 420                                    };
 421                                }
 422                            })
 423                        })
 424                        .await;
 425                }
 426                anyhow::Ok(())
 427            })
 428            .await;
 429        }
 430    }
 431
 432    async fn maintain_sorted_search_results(
 433        rx: Receiver<oneshot::Receiver<ProjectPath>>,
 434        paths_for_full_scan: Sender<ProjectPath>,
 435        limit: usize,
 436    ) {
 437        let mut rx = pin!(rx);
 438        let mut matched = 0;
 439        while let Some(mut next_path_result) = rx.next().await {
 440            let Some(successful_path) = next_path_result.next().await else {
 441                // This file did not produce a match, hence skip it.
 442                continue;
 443            };
 444            if paths_for_full_scan.send(successful_path).await.is_err() {
 445                return;
 446            };
 447            matched += 1;
 448            if matched >= limit {
 449                break;
 450            }
 451        }
 452    }
 453
 454    /// Background workers cannot open buffers by themselves, hence main thread will do it on their behalf.
 455    async fn open_buffers(
 456        buffer_store: &Entity<BufferStore>,
 457        rx: Receiver<ProjectPath>,
 458        find_all_matches_tx: Sender<Entity<Buffer>>,
 459        mut cx: AsyncApp,
 460    ) {
 461        let mut rx = pin!(rx.ready_chunks(64));
 462        _ = maybe!(async move {
 463            while let Some(requested_paths) = rx.next().await {
 464                let mut buffers = buffer_store.update(&mut cx, |this, cx| {
 465                    requested_paths
 466                        .into_iter()
 467                        .map(|path| this.open_buffer(path, cx))
 468                        .collect::<FuturesOrdered<_>>()
 469                })?;
 470
 471                while let Some(buffer) = buffers.next().await {
 472                    if let Some(buffer) = buffer.log_err() {
 473                        find_all_matches_tx.send(buffer).await?;
 474                    }
 475                }
 476            }
 477            Result::<_, anyhow::Error>::Ok(())
 478        })
 479        .await;
 480    }
 481
 482    async fn grab_buffer_snapshots(
 483        rx: Receiver<Entity<Buffer>>,
 484        find_all_matches_tx: Sender<(
 485            Entity<Buffer>,
 486            BufferSnapshot,
 487            oneshot::Sender<(Entity<Buffer>, Vec<Range<language::Anchor>>)>,
 488        )>,
 489        results: Sender<oneshot::Receiver<(Entity<Buffer>, Vec<Range<language::Anchor>>)>>,
 490        mut cx: AsyncApp,
 491    ) {
 492        _ = maybe!(async move {
 493            while let Ok(buffer) = rx.recv().await {
 494                let snapshot = buffer.read_with(&mut cx, |this, _| this.snapshot())?;
 495                let (tx, rx) = oneshot::channel();
 496                find_all_matches_tx.send((buffer, snapshot, tx)).await?;
 497                results.send(rx).await?;
 498            }
 499            debug_assert!(rx.is_empty());
 500            Result::<_, anyhow::Error>::Ok(())
 501        })
 502        .await;
 503    }
 504
 505    async fn ensure_matched_ranges_are_reported_in_order(
 506        rx: Receiver<oneshot::Receiver<(Entity<Buffer>, Vec<Range<language::Anchor>>)>>,
 507        tx: Sender<SearchResult>,
 508    ) {
 509        use postage::stream::Stream;
 510        _ = maybe!(async move {
 511            let mut matched_buffers = 0;
 512            let mut matches = 0;
 513            while let Ok(mut next_buffer_matches) = rx.recv().await {
 514                let Some((buffer, ranges)) = next_buffer_matches.recv().await else {
 515                    continue;
 516                };
 517
 518                if matched_buffers > Search::MAX_SEARCH_RESULT_FILES
 519                    || matches > Search::MAX_SEARCH_RESULT_RANGES
 520                {
 521                    _ = tx.send(SearchResult::LimitReached).await;
 522                    break;
 523                }
 524                matched_buffers += 1;
 525                matches += ranges.len();
 526
 527                _ = tx.send(SearchResult::Buffer { buffer, ranges }).await?;
 528            }
 529            anyhow::Ok(())
 530        })
 531        .await;
 532    }
 533
 534    fn all_loaded_buffers(&self, search_query: &SearchQuery, cx: &App) -> Vec<Entity<Buffer>> {
 535        let worktree_store = self.worktree_store.read(cx);
 536        let mut buffers = search_query
 537            .buffers()
 538            .into_iter()
 539            .flatten()
 540            .filter(|buffer| {
 541                let b = buffer.read(cx);
 542                if let Some(file) = b.file() {
 543                    if !search_query.match_path(file.path()) {
 544                        return false;
 545                    }
 546                    if !search_query.include_ignored()
 547                        && let Some(entry) = b
 548                            .entry_id(cx)
 549                            .and_then(|entry_id| worktree_store.entry_for_id(entry_id, cx))
 550                        && entry.is_ignored
 551                    {
 552                        return false;
 553                    }
 554                }
 555                true
 556            })
 557            .cloned()
 558            .collect::<Vec<_>>();
 559        buffers.sort_by(|a, b| {
 560            let a = a.read(cx);
 561            let b = b.read(cx);
 562            match (a.file(), b.file()) {
 563                (None, None) => a.remote_id().cmp(&b.remote_id()),
 564                (None, Some(_)) => std::cmp::Ordering::Less,
 565                (Some(_), None) => std::cmp::Ordering::Greater,
 566                (Some(a), Some(b)) => compare_rel_paths((a.path(), true), (b.path(), true)),
 567            }
 568        });
 569
 570        buffers
 571    }
 572}
 573
 574struct Worker<'search> {
 575    query: &'search SearchQuery,
 576    open_buffers: &'search HashSet<ProjectEntryId>,
 577    candidates: FindSearchCandidates,
 578    /// Ok, we're back in background: run full scan & find all matches in a given buffer snapshot.
 579    /// Then, when you're done, share them via the channel you were given.
 580    find_all_matches_rx: Receiver<(
 581        Entity<Buffer>,
 582        BufferSnapshot,
 583        oneshot::Sender<(Entity<Buffer>, Vec<Range<language::Anchor>>)>,
 584    )>,
 585}
 586
 587impl Worker<'_> {
 588    async fn run(self) {
 589        let (
 590            input_paths_rx,
 591            confirm_contents_will_match_rx,
 592            mut confirm_contents_will_match_tx,
 593            fs,
 594        ) = match self.candidates {
 595            FindSearchCandidates::Local {
 596                fs,
 597                input_paths_rx,
 598                confirm_contents_will_match_rx,
 599                confirm_contents_will_match_tx,
 600            } => (
 601                input_paths_rx,
 602                confirm_contents_will_match_rx,
 603                confirm_contents_will_match_tx,
 604                Some(fs),
 605            ),
 606            FindSearchCandidates::Remote | FindSearchCandidates::OpenBuffersOnly => {
 607                (unbounded().1, unbounded().1, unbounded().0, None)
 608            }
 609        };
 610        // WorkerA: grabs a request for "find all matches in file/a" <- takes 5 minutes
 611        // right after: WorkerB: grabs a request for "find all matches in file/b" <- takes 5 seconds
 612        let mut find_all_matches = pin!(self.find_all_matches_rx.fuse());
 613        let mut find_first_match = pin!(confirm_contents_will_match_rx.fuse());
 614        let mut scan_path = pin!(input_paths_rx.fuse());
 615
 616        loop {
 617            let handler = RequestHandler {
 618                query: self.query,
 619                open_entries: &self.open_buffers,
 620                fs: fs.as_deref(),
 621                confirm_contents_will_match_tx: &confirm_contents_will_match_tx,
 622            };
 623            // Whenever we notice that some step of a pipeline is closed, we don't want to close subsequent
 624            // steps straight away. Another worker might be about to produce a value that will
 625            // be pushed there, thus we'll replace current worker's pipe with a dummy one.
 626            // That way, we'll only ever close a next-stage channel when ALL workers do so.
 627            select_biased! {
 628                find_all_matches = find_all_matches.next() => {
 629                    let Some(matches) = find_all_matches else {
 630                        continue;
 631                    };
 632                    handler.handle_find_all_matches(matches).await;
 633                },
 634                find_first_match = find_first_match.next() => {
 635                    if let Some(buffer_with_at_least_one_match) = find_first_match {
 636                        handler.handle_find_first_match(buffer_with_at_least_one_match).await;
 637                    }
 638                },
 639                scan_path = scan_path.next() => {
 640                    if let Some(path_to_scan) = scan_path {
 641                        handler.handle_scan_path(path_to_scan).await;
 642                    } else {
 643                        // If we're the last worker to notice that this is not producing values, close the upstream.
 644                        confirm_contents_will_match_tx = bounded(1).0;
 645                    }
 646
 647                 }
 648                 complete => {
 649                     break
 650                },
 651
 652            }
 653        }
 654    }
 655}
 656
 657struct RequestHandler<'worker> {
 658    query: &'worker SearchQuery,
 659    fs: Option<&'worker dyn Fs>,
 660    open_entries: &'worker HashSet<ProjectEntryId>,
 661    confirm_contents_will_match_tx: &'worker Sender<MatchingEntry>,
 662}
 663
 664impl RequestHandler<'_> {
 665    async fn handle_find_all_matches(
 666        &self,
 667        (buffer, snapshot, mut report_matches): (
 668            Entity<Buffer>,
 669            BufferSnapshot,
 670            oneshot::Sender<(Entity<Buffer>, Vec<Range<language::Anchor>>)>,
 671        ),
 672    ) {
 673        let ranges = self
 674            .query
 675            .search(&snapshot, None)
 676            .await
 677            .iter()
 678            .map(|range| snapshot.anchor_before(range.start)..snapshot.anchor_after(range.end))
 679            .collect::<Vec<_>>();
 680
 681        _ = report_matches.send((buffer, ranges)).await;
 682    }
 683
 684    async fn handle_find_first_match(&self, mut entry: MatchingEntry) {
 685        _ = (async move || -> anyhow::Result<()> {
 686            let abs_path = entry.worktree_root.join(entry.path.path.as_std_path());
 687
 688            // Avoid blocking IO here: cancellation of the search is implemented via task drop, and a
 689            // synchronous `std::fs::File::open` / `Read::read` can delay task cancellation for a long time.
 690            let contents = self
 691                .fs
 692                .context("Trying to query filesystem in remote project search")?
 693                .load_bytes(&abs_path)
 694                .await?;
 695
 696            // Before attempting to match the file content, throw away files that have invalid UTF-8 sequences early on;
 697            // That way we can still match files without having to look at "obviously binary" files.
 698            if let Err(error) = std::str::from_utf8(&contents) {
 699                if let Some(starting_position) = error.error_len() {
 700                    log::debug!(
 701                        "Invalid UTF-8 sequence in file {abs_path:?} at byte position {starting_position}"
 702                    );
 703                    return Ok(());
 704                }
 705            }
 706
 707            let file: Box<dyn Read + Send + Sync> = Box::new(Cursor::new(contents));
 708            let file = BufReader::new(file);
 709
 710            if self.query.detect(file).unwrap_or(false) {
 711                // Yes, we should scan the whole file.
 712                entry.should_scan_tx.send(entry.path).await?;
 713            }
 714
 715            Ok(())
 716        })()
 717        .await
 718        .log_err();
 719    }
 720
 721    async fn handle_scan_path(&self, req: InputPath) {
 722        _ = maybe!(async move {
 723            let InputPath {
 724                entry,
 725                snapshot,
 726                mut should_scan_tx,
 727            } = req;
 728
 729            if entry.is_fifo || !entry.is_file() {
 730                return Ok(());
 731            }
 732
 733            if self.query.filters_path() {
 734                let matched_path = if self.query.match_full_paths() {
 735                    let mut full_path = snapshot.root_name().to_owned();
 736                    full_path.push(&entry.path);
 737                    self.query.match_path(&full_path)
 738                } else {
 739                    self.query.match_path(&entry.path)
 740                };
 741                if !matched_path {
 742                    return Ok(());
 743                }
 744            }
 745
 746            if self.open_entries.contains(&entry.id) {
 747                // The buffer is already in memory and that's the version we want to scan;
 748                // hence skip the dilly-dally and look for all matches straight away.
 749                should_scan_tx
 750                    .send(ProjectPath {
 751                        worktree_id: snapshot.id(),
 752                        path: entry.path.clone(),
 753                    })
 754                    .await?;
 755            } else {
 756                self.confirm_contents_will_match_tx
 757                    .send(MatchingEntry {
 758                        should_scan_tx: should_scan_tx,
 759                        worktree_root: snapshot.abs_path().clone(),
 760                        path: ProjectPath {
 761                            worktree_id: snapshot.id(),
 762                            path: entry.path.clone(),
 763                        },
 764                    })
 765                    .await?;
 766            }
 767
 768            anyhow::Ok(())
 769        })
 770        .await;
 771    }
 772}
 773
/// A unit of work for `handle_scan_path`: one worktree entry to triage for the search.
struct InputPath {
    /// The worktree entry under consideration.
    entry: Entry,
    /// The snapshot that supplies the worktree id, root name and absolute root path for `entry`.
    snapshot: Snapshot,
    /// One-shot channel on which the entry's [`ProjectPath`] is sent if the entry should be scanned.
    should_scan_tx: oneshot::Sender<ProjectPath>,
}
 779
/// A file that passed path filtering and whose contents still need to be checked
/// for a first match (see `handle_find_first_match`).
struct MatchingEntry {
    /// Absolute path of the worktree root; joined with `path.path` to locate the file.
    worktree_root: Arc<Path>,
    /// Project path of the file; this is the value sent on `should_scan_tx` when a match is detected.
    path: ProjectPath,
    /// One-shot channel used to request a full scan of the file.
    should_scan_tx: oneshot::Sender<ProjectPath>,
}
 785
/// This struct encapsulates the logic to decide whether a given gitignored directory should be
/// scanned based on the include/exclude patterns of a search query (as include/exclude parameters may match paths inside it).
/// It effectively does the inverse of a glob match: given a glob pattern like `src/**/` and a parent path like `src`,
/// we need to decide whether the parent may contain glob hits.
struct PathInclusionMatcher {
    /// Prefixes obtained by partitioning each of the query's include globs (see `new`).
    included: BTreeSet<PathBuf>,
    /// The search query whose include/exclude settings are consulted.
    query: Arc<SearchQuery>,
}
 794
 795impl PathInclusionMatcher {
 796    fn new(query: Arc<SearchQuery>) -> Self {
 797        let mut included = BTreeSet::new();
 798        // To do an inverse glob match, we split each glob into it's prefix and the glob part.
 799        // For example, `src/**/*.rs` becomes `src/` and `**/*.rs`. The glob part gets dropped.
 800        // Then, when checking whether a given directory should be scanned, we check whether it is a non-empty substring of any glob prefix.
 801        if query.filters_path() {
 802            included.extend(
 803                query
 804                    .files_to_include()
 805                    .sources()
 806                    .flat_map(|glob| Some(wax::Glob::new(glob).ok()?.partition().0)),
 807            );
 808        }
 809        Self { included, query }
 810    }
 811
 812    fn should_scan_gitignored_dir(
 813        &self,
 814        entry: &Entry,
 815        snapshot: &Snapshot,
 816        worktree_settings: &WorktreeSettings,
 817    ) -> bool {
 818        if !entry.is_ignored || !entry.kind.is_unloaded() {
 819            return false;
 820        }
 821        if !self.query.include_ignored() {
 822            return false;
 823        }
 824        if worktree_settings.is_path_excluded(&entry.path) {
 825            return false;
 826        }
 827        if !self.query.filters_path() {
 828            return true;
 829        }
 830
 831        let as_abs_path = LazyCell::new(move || snapshot.absolutize(&entry.path));
 832        let entry_path = &entry.path;
 833        // 3. Check Exclusions (Pruning)
 834        // If the current path is a child of an excluded path, we stop.
 835        let is_excluded = self.path_is_definitely_excluded(&entry_path, snapshot);
 836
 837        if is_excluded {
 838            return false;
 839        }
 840
 841        // 4. Check Inclusions (Traversal)
 842        if self.included.is_empty() {
 843            return true;
 844        }
 845
 846        // We scan if the current path is a descendant of an include prefix
 847        // OR if the current path is an ancestor of an include prefix (we need to go deeper to find it).
 848        let is_included = self.included.iter().any(|prefix| {
 849            let (prefix_matches_entry, entry_matches_prefix) = if prefix.is_absolute() {
 850                (
 851                    prefix.starts_with(&**as_abs_path),
 852                    as_abs_path.starts_with(prefix),
 853                )
 854            } else {
 855                RelPath::new(prefix, snapshot.path_style()).map_or((false, false), |prefix| {
 856                    (
 857                        prefix.starts_with(entry_path),
 858                        entry_path.starts_with(&prefix),
 859                    )
 860                })
 861            };
 862
 863            // Logic:
 864            // 1. entry_matches_prefix: We are inside the target zone (e.g. glob: src/, current: src/lib/). Keep scanning.
 865            // 2. prefix_matches_entry: We are above the target zone (e.g. glob: src/foo/, current: src/). Keep scanning to reach foo.
 866            prefix_matches_entry || entry_matches_prefix
 867        });
 868
 869        is_included
 870    }
 871    fn path_is_definitely_excluded(&self, path: &RelPath, snapshot: &Snapshot) -> bool {
 872        if !self.query.files_to_exclude().sources().next().is_none() {
 873            let mut path = if self.query.match_full_paths() {
 874                let mut full_path = snapshot.root_name().to_owned();
 875                full_path.push(path);
 876                full_path
 877            } else {
 878                path.to_owned()
 879            };
 880            loop {
 881                if self.query.files_to_exclude().is_match(&path) {
 882                    return true;
 883                } else if !path.pop() {
 884                    return false;
 885                }
 886            }
 887        } else {
 888            false
 889        }
 890    }
 891}
 892
 893#[cfg(test)]
 894mod tests {
 895    use super::*;
 896    use fs::FakeFs;
 897    use serde_json::json;
 898    use settings::Settings;
 899    use util::{
 900        path,
 901        paths::{PathMatcher, PathStyle},
 902        rel_path::RelPath,
 903    };
 904    use worktree::{Entry, EntryKind, WorktreeSettings};
 905
 906    use crate::{
 907        Project, project_search::PathInclusionMatcher, project_tests::init_test,
 908        search::SearchQuery,
 909    };
 910
 911    #[gpui::test]
 912    async fn test_path_inclusion_matcher(cx: &mut gpui::TestAppContext) {
 913        init_test(cx);
 914
 915        let fs = FakeFs::new(cx.background_executor.clone());
 916        fs.insert_tree(
 917            "/root",
 918            json!({
 919                ".gitignore": "src/data/\n",
 920                "src": {
 921                    "data": {
 922                        "main.csv": "field_1,field_2,field_3",
 923                    },
 924                    "lib": {
 925                        "main.txt": "Are you familiar with fields?",
 926                    },
 927                },
 928            }),
 929        )
 930        .await;
 931
 932        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
 933        let worktree = project.update(cx, |project, cx| project.worktrees(cx).next().unwrap());
 934        let (worktree_settings, worktree_snapshot) = worktree.update(cx, |worktree, cx| {
 935            let settings_location = worktree.settings_location(cx);
 936            return (
 937                WorktreeSettings::get(Some(settings_location), cx).clone(),
 938                worktree.snapshot(),
 939            );
 940        });
 941
 942        // Manually create a test entry for the gitignored directory since it won't
 943        // be loaded by the worktree
 944        let entry = Entry {
 945            id: ProjectEntryId::from_proto(1),
 946            kind: EntryKind::UnloadedDir,
 947            path: Arc::from(RelPath::unix(Path::new("src/data")).unwrap()),
 948            inode: 0,
 949            mtime: None,
 950            canonical_path: None,
 951            is_ignored: true,
 952            is_hidden: false,
 953            is_always_included: false,
 954            is_external: false,
 955            is_private: false,
 956            size: 0,
 957            char_bag: Default::default(),
 958            is_fifo: false,
 959        };
 960
 961        // 1. Test searching for `field`, including ignored files without any
 962        // inclusion and exclusion filters.
 963        let include_ignored = true;
 964        let files_to_include = PathMatcher::default();
 965        let files_to_exclude = PathMatcher::default();
 966        let match_full_paths = false;
 967        let search_query = SearchQuery::text(
 968            "field",
 969            false,
 970            false,
 971            include_ignored,
 972            files_to_include,
 973            files_to_exclude,
 974            match_full_paths,
 975            None,
 976        )
 977        .unwrap();
 978
 979        let path_matcher = PathInclusionMatcher::new(Arc::new(search_query));
 980        assert!(path_matcher.should_scan_gitignored_dir(
 981            &entry,
 982            &worktree_snapshot,
 983            &worktree_settings
 984        ));
 985
 986        // 2. Test searching for `field`, including ignored files but updating
 987        // `files_to_include` to only include files under `src/lib`.
 988        let include_ignored = true;
 989        let files_to_include = PathMatcher::new(vec!["src/lib"], PathStyle::Posix).unwrap();
 990        let files_to_exclude = PathMatcher::default();
 991        let match_full_paths = false;
 992        let search_query = SearchQuery::text(
 993            "field",
 994            false,
 995            false,
 996            include_ignored,
 997            files_to_include,
 998            files_to_exclude,
 999            match_full_paths,
1000            None,
1001        )
1002        .unwrap();
1003
1004        let path_matcher = PathInclusionMatcher::new(Arc::new(search_query));
1005        assert!(!path_matcher.should_scan_gitignored_dir(
1006            &entry,
1007            &worktree_snapshot,
1008            &worktree_settings
1009        ));
1010    }
1011}