search: Add heuristic for discarding matching of binary files (#23581)

Piotr Osiewicz created

Fixes #23398 
Closes #23398

We'll bail on searches of files that we know are binary (thus even if we
were to find a match in them, they'd be thrown away by buffer loader).

Release Notes:

- Improved project search performance in worktrees with binary files

Change summary

crates/fs/src/fs.rs                  |  6 +++---
crates/project/src/search.rs         |  8 +++++---
crates/project/src/worktree_store.rs | 16 +++++++++++++++-
3 files changed, 23 insertions(+), 7 deletions(-)

Detailed changes

crates/fs/src/fs.rs 🔗

@@ -101,7 +101,7 @@ pub trait Fs: Send + Sync {
         self.remove_file(path, options).await
     }
     async fn open_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>>;
-    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>>;
+    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>>;
     async fn load(&self, path: &Path) -> Result<String> {
         Ok(String::from_utf8(self.load_bytes(path).await?)?)
     }
@@ -499,7 +499,7 @@ impl Fs for RealFs {
         Ok(())
     }
 
-    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>> {
+    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>> {
         Ok(Box::new(std::fs::File::open(path)?))
     }
 
@@ -1746,7 +1746,7 @@ impl Fs for FakeFs {
         Ok(())
     }
 
-    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read>> {
+    async fn open_sync(&self, path: &Path) -> Result<Box<dyn io::Read + Send + Sync>> {
         let bytes = self.load_internal(path).await?;
         Ok(Box::new(io::Cursor::new(bytes)))
     }

crates/project/src/search.rs 🔗

@@ -210,14 +210,17 @@ impl SearchQuery {
         }
     }
 
-    pub fn detect<T: Read>(&self, stream: T) -> Result<bool> {
+    pub(crate) fn detect(
+        &self,
+        mut reader: BufReader<Box<dyn Read + Send + Sync>>,
+    ) -> Result<bool> {
         if self.as_str().is_empty() {
             return Ok(false);
         }
 
         match self {
             Self::Text { search, .. } => {
-                let mat = search.stream_find_iter(stream).next();
+                let mat = search.stream_find_iter(reader).next();
                 match mat {
                     Some(Ok(_)) => Ok(true),
                     Some(Err(err)) => Err(err.into()),
@@ -227,7 +230,6 @@ impl SearchQuery {
             Self::Regex {
                 regex, multiline, ..
             } => {
-                let mut reader = BufReader::new(stream);
                 if *multiline {
                     let mut text = String::new();
                     if let Err(err) = reader.read_to_string(&mut text) {

crates/project/src/worktree_store.rs 🔗

@@ -1,4 +1,5 @@
 use std::{
+    io::{BufRead, BufReader},
     path::{Path, PathBuf},
     pin::pin,
     sync::{atomic::AtomicUsize, Arc},
@@ -985,7 +986,6 @@ impl WorktreeStore {
                     }
 
                     repo.change_branch(&new_branch)?;
-
                     Ok(())
                 });
 
@@ -1020,6 +1020,20 @@ impl WorktreeStore {
             let Some(file) = fs.open_sync(&abs_path).await.log_err() else {
                 continue;
             };
+
+            let mut file = BufReader::new(file);
+            let file_start = file.fill_buf()?;
+
+            if let Err(Some(starting_position)) =
+                std::str::from_utf8(file_start).map_err(|e| e.error_len())
+            {
+                // Before attempting to match the file content, throw away files that have invalid UTF-8 sequences early on;
+                // That way we can still match files in a streaming fashion without having look at "obviously binary" files.
+                return Err(anyhow!(
+                    "Invalid UTF-8 sequence at position {starting_position}"
+                ));
+            }
+
             if query.detect(file).unwrap_or(false) {
                 entry.respond.send(entry.path).await?
             }