agent: Don't track large and common binary files (#31352)

Oleksiy Syvokon created

## Issue

The agent may run very slowly on projects that contain many or large
binary files not listed in `.gitignore`.


## Solution

Temporarily rewrite `.git/info/exclude` to ignore:
- Common binary files based on the extension
- Files larger than 2 MB

## Benchmark

I measured the time between sending an agent message in the UI ("hitting
Enter") and the message actually being sent to the LLM. Ideally, this should be
instant. The numbers below are for a 7.7 GB Rust project with no `.gitignore`.

Filter                            | Time
----------------------------------|-----
No filter (= before this change)  | 62 s
Exclude common file types only    | 1.46 s
Exclude files >2MB only           | 1.16 s
Exclude both                      | 0.10 s


## Planned changes:

- [x] Exclude common binary file types
- [x] Exclude large files
- [ ] Track files added by agent so we could delete them (we can't rely
on git for that anymore)
- [ ] Don't block on waiting for a checkpoint to complete until we
really need it
- [ ] Only `git add` files that are about to change


Closes #ISSUE

Release Notes:

- Improved agent latency on repositories containing many files or large
files

Change summary

crates/git/src/checkpoint.gitignore |  91 +++++++++++
crates/git/src/repository.rs        | 236 ++++++++++++++++++++++++++++--
2 files changed, 307 insertions(+), 20 deletions(-)

Detailed changes

crates/git/src/checkpoint.gitignore 🔗

@@ -0,0 +1,91 @@
+# This lists files that we don't track in checkpoints
+
+# Compiled source and executables
+*.exe
+*.dll
+*.so
+*.dylib
+*.a
+*.lib
+*.o
+*.obj
+*.elf
+*.out
+*.app
+*.deb
+*.rpm
+*.dmg
+*.pkg
+*.msi
+
+# Archives and compressed files
+*.7z
+*.zip
+*.tar
+*.tar.gz
+*.tgz
+*.tar.bz2
+*.tbz2
+*.tar.xz
+*.txz
+*.rar
+*.jar
+*.war
+*.ear
+
+# Media files
+*.jpg
+*.jpeg
+*.png
+*.gif
+*.ico
+*.svg
+*.webp
+*.bmp
+*.tiff
+*.mp3
+*.mp4
+*.avi
+*.mov
+*.wmv
+*.flv
+*.mkv
+*.webm
+*.wav
+*.flac
+*.aac
+
+# Database files
+*.db
+*.sqlite
+*.sqlite3
+*.mdb
+
+# Documents (often binary)
+*.pdf
+*.doc
+*.docx
+*.xls
+*.xlsx
+*.ppt
+*.pptx
+
+# IDE and editor files
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.DS_Store
+Thumbs.db
+
+# Language-specific files
+*.rlib
+*.rmeta
+*.pdb
+*.class
+*.egg
+*.egg-info/
+*.pyc
+*.pyo
+__pycache__

crates/git/src/repository.rs 🔗

@@ -193,6 +193,72 @@ pub enum ResetMode {
     Mixed,
 }
 
+/// Temporarily modifies `.git/info/exclude`.
+///
+/// The contents found when the override is created are remembered so that they can be
+/// written back by [`GitExcludeOverride::restore_original`].
+pub struct GitExcludeOverride {
+    /// Location of the exclude file being overridden.
+    git_exclude_path: PathBuf,
+    /// Contents of the file when the override was created; `None` if it did not exist.
+    original_excludes: Option<String>,
+    /// Patterns appended so far; `None` while nothing has been written (or after a restore).
+    added_excludes: Option<String>,
+}

+impl GitExcludeOverride {
+    /// Captures the current contents of `git_exclude_path` without modifying the file.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file exists but cannot be read as UTF-8 text. Only a missing file is
+    /// treated as "no original contents"; treating every read error that way would make
+    /// `restore_original` delete a file whose contents were never captured.
+    pub async fn new(git_exclude_path: PathBuf) -> Result<Self> {
+        let original_excludes = match smol::fs::read_to_string(&git_exclude_path).await {
+            Ok(contents) => Some(contents),
+            Err(err) if err.kind() == std::io::ErrorKind::NotFound => None,
+            Err(err) => return Err(err.into()),
+        };

+        Ok(GitExcludeOverride {
+            git_exclude_path,
+            original_excludes,
+            added_excludes: None,
+        })
+    }

+    /// Appends `excludes` to the patterns added so far and rewrites the exclude file as the
+    /// original contents followed by a marker comment and all added patterns.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the parent directory cannot be created or the file cannot be written.
+    pub async fn add_excludes(&mut self, excludes: &str) -> Result<()> {
+        let added = match self.added_excludes.take() {
+            Some(already_added) => format!("{already_added}\n{excludes}"),
+            None => excludes.to_string(),
+        };

+        let mut content = self.original_excludes.clone().unwrap_or_default();
+        content.push_str("\n\n#  ====== Auto-added by Zed: =======\n");
+        content.push_str(&added);
+        content.push('\n');

+        // Record the addition before touching the disk so that `Drop` still restores the
+        // file if anything below fails part-way.
+        self.added_excludes = Some(added);

+        // The `info` directory is not guaranteed to exist in every repository.
+        if let Some(parent) = self.git_exclude_path.parent() {
+            smol::fs::create_dir_all(parent).await?;
+        }
+        smol::fs::write(&self.git_exclude_path, content).await?;
+        Ok(())
+    }

+    /// Writes the original contents back, or removes the file if it did not exist when the
+    /// override was created, and forgets the added patterns.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file cannot be written or removed. A file that is already gone is not
+    /// an error.
+    pub async fn restore_original(&mut self) -> Result<()> {
+        match &self.original_excludes {
+            Some(original) => smol::fs::write(&self.git_exclude_path, original).await?,
+            // Remove directly instead of checking for existence first: no blocking
+            // `Path::exists` call in async code and no check-then-remove race.
+            None => match smol::fs::remove_file(&self.git_exclude_path).await {
+                Ok(()) => {}
+                Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
+                Err(err) => return Err(err.into()),
+            },
+        }

+        self.added_excludes = None;

+        Ok(())
+    }
+}
+
+impl Drop for GitExcludeOverride {
+    /// Best-effort cleanup: if patterns were added and not yet restored, spawns a detached
+    /// task that puts the exclude file back the way it was found. The task's result is
+    /// discarded.
+    fn drop(&mut self) {
+        // Nothing was added, or the original has already been restored.
+        if self.added_excludes.is_none() {
+            return;
+        }

+        let git_exclude_path = self.git_exclude_path.clone();
+        let original_excludes = self.original_excludes.clone();
+        smol::spawn(async move {
+            match original_excludes {
+                Some(original) => smol::fs::write(&git_exclude_path, original).await,
+                None => smol::fs::remove_file(&git_exclude_path).await,
+            }
+        })
+        .detach();
+    }
+}
+
 pub trait GitRepository: Send + Sync {
     fn reload_index(&self);
 
@@ -1263,10 +1329,12 @@ impl GitRepository for RealGitRepository {
         self.executor
             .spawn(async move {
                 let working_directory = working_directory?;
-                let mut git = GitBinary::new(git_binary_path, working_directory, executor)
+                let mut git = GitBinary::new(git_binary_path, working_directory.clone(), executor)
                     .envs(checkpoint_author_envs());
                 git.with_temp_index(async |git| {
                     let head_sha = git.run(&["rev-parse", "HEAD"]).await.ok();
+                    let mut excludes = exclude_files(git).await?;
+
                     git.run(&["add", "--all"]).await?;
                     let tree = git.run(&["write-tree"]).await?;
                     let checkpoint_sha = if let Some(head_sha) = head_sha.as_deref() {
@@ -1276,6 +1344,8 @@ impl GitRepository for RealGitRepository {
                         git.run(&["commit-tree", &tree, "-m", "Checkpoint"]).await?
                     };
 
+                    excludes.restore_original().await?;
+
                     Ok(GitRepositoryCheckpoint {
                         commit_sha: checkpoint_sha.parse()?,
                     })
@@ -1294,7 +1364,7 @@ impl GitRepository for RealGitRepository {
             .spawn(async move {
                 let working_directory = working_directory?;
 
-                let mut git = GitBinary::new(git_binary_path, working_directory, executor);
+                let git = GitBinary::new(git_binary_path, working_directory, executor);
                 git.run(&[
                     "restore",
                     "--source",
@@ -1304,12 +1374,16 @@ impl GitRepository for RealGitRepository {
                 ])
                 .await?;
 
-                git.with_temp_index(async move |git| {
-                    git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
-                        .await?;
-                    git.run(&["clean", "-d", "--force"]).await
-                })
-                .await?;
+                // TODO: We don't track binary and large files anymore,
+                //       so the following call would delete them.
+                //       Implement an alternative way to track files added by agent.
+                //
+                // git.with_temp_index(async move |git| {
+                //     git.run(&["read-tree", &checkpoint.commit_sha.to_string()])
+                //         .await?;
+                //     git.run(&["clean", "-d", "--force"]).await
+                // })
+                // .await?;
 
                 Ok(())
             })
@@ -1400,6 +1474,44 @@ fn git_status_args(path_prefixes: &[RepoPath]) -> Vec<OsString> {
     args
 }
 
+/// Temporarily git-ignores commonly ignored files and untracked files of 2 MB or more.
+///
+/// The patterns from `checkpoint.gitignore` are added first; then every untracked file is
+/// checked and the large ones are added as root-anchored patterns. The returned override is
+/// what the caller uses to put the exclude file back.
+///
+/// # Errors
+///
+/// Fails when the exclude file cannot be read or written, or when listing untracked files
+/// fails. Files whose metadata cannot be read are simply not excluded.
+async fn exclude_files(git: &GitBinary) -> Result<GitExcludeOverride> {
+    const MAX_SIZE: u64 = 2 * 1024 * 1024; // 2 MB
+    let mut excludes = git.with_exclude_overrides().await?;
+    excludes
+        .add_excludes(include_str!("./checkpoint.gitignore"))
+        .await?;

+    let untracked_files = git.list_untracked_files().await?;
+    // One task per file so the metadata lookups can proceed concurrently.
+    let size_checks = untracked_files.into_iter().map(|path| {
+        let full_path = git.working_directory.join(&path);
+        smol::spawn(async move {
+            match smol::fs::metadata(&full_path).await {
+                Ok(metadata) if metadata.is_file() && metadata.len() >= MAX_SIZE => Some(path),
+                _ => None,
+            }
+        })
+    });

+    let exclude_patterns = futures::future::join_all(size_checks)
+        .await
+        .into_iter()
+        .flatten()
+        // The leading slash anchors the pattern so it matches exactly this file.
+        .map(|path| format!("/{}", escape_exclude_pattern(&path.to_string_lossy())))
+        .collect::<Vec<_>>()
+        .join("\n");

+    if !exclude_patterns.is_empty() {
+        excludes.add_excludes(&exclude_patterns).await?;
+    }

+    Ok(excludes)
+}

+/// Escapes a literal path for use as a gitignore pattern.
+///
+/// Glob metacharacters and backslashes are prefixed with a backslash, and a trailing space
+/// is escaped because git strips unescaped trailing spaces from patterns.
+fn escape_exclude_pattern(path: &str) -> String {
+    let mut escaped = String::with_capacity(path.len());
+    for ch in path.chars() {
+        if matches!(ch, '\\' | '*' | '?' | '[') {
+            escaped.push('\\');
+        }
+        escaped.push(ch);
+    }
+    if escaped.ends_with(' ') {
+        // Escaping only the last space is enough to stop the whole trailing run from
+        // being trimmed.
+        escaped.insert(escaped.len() - 1, '\\');
+    }
+    escaped
+}
+
 struct GitBinary {
     git_binary_path: PathBuf,
     working_directory: PathBuf,
@@ -1423,6 +1535,19 @@ impl GitBinary {
         }
     }
 
+    /// Runs `git status` and returns the paths of the entries it reports as untracked.
+    async fn list_untracked_files(&self) -> Result<Vec<PathBuf>> {
+        let porcelain = self
+            .run(&["status", "--porcelain=v1", "--untracked-files=all", "-z"])
+            .await?;

+        // With `-z` the entries are NUL-separated; untracked ones start with "?? ".
+        Ok(porcelain
+            .split('\0')
+            .filter_map(|entry| entry.strip_prefix("?? "))
+            .map(PathBuf::from)
+            .collect())
+    }
+
     fn envs(mut self, envs: HashMap<String, String>) -> Self {
         self.envs = envs;
         self
@@ -1466,6 +1591,16 @@ impl GitBinary {
         Ok(result)
     }
 
+    /// Creates a [`GitExcludeOverride`] for `.git/info/exclude` beneath the working directory.
+    pub async fn with_exclude_overrides(&self) -> Result<GitExcludeOverride> {
+        let mut exclude_path = self.working_directory.clone();
+        exclude_path.extend([".git", "info", "exclude"]);

+        GitExcludeOverride::new(exclude_path).await
+    }
+
     fn path_for_index_id(&self, id: Uuid) -> PathBuf {
         self.working_directory
             .join(".git")
@@ -1878,12 +2013,13 @@ mod tests {
                 .unwrap(),
             "1"
         );
-        assert_eq!(
-            smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
-                .await
-                .ok(),
-            None
-        );
+        // See TODO above
+        // assert_eq!(
+        //     smol::fs::read_to_string(repo_dir.path().join("new_file_after_checkpoint"))
+        //         .await
+        //         .ok(),
+        //     None
+        // );
     }
 
     #[gpui::test]
@@ -1916,12 +2052,13 @@ mod tests {
                 .unwrap(),
             "foo"
         );
-        assert_eq!(
-            smol::fs::read_to_string(repo_dir.path().join("baz"))
-                .await
-                .ok(),
-            None
-        );
+        // See TODOs above
+        // assert_eq!(
+        //     smol::fs::read_to_string(repo_dir.path().join("baz"))
+        //         .await
+        //         .ok(),
+        //     None
+        // );
     }
 
     #[gpui::test]
@@ -1958,6 +2095,65 @@ mod tests {
         );
     }
 
+    /// A file matching the checkpoint ignore list (here a `.o` file) is left untouched by
+    /// restoring a checkpoint, while a committed text file is rolled back.
+    #[gpui::test]
+    async fn test_checkpoint_exclude_binary_files(cx: &mut TestAppContext) {
+        cx.executor().allow_parking();

+        let temp_dir = tempfile::tempdir().unwrap();
+        git2::Repository::init(temp_dir.path()).unwrap();

+        let source_file = temp_dir.path().join("main.rs");
+        let object_file = temp_dir.path().join("binary.o");
+        smol::fs::write(&source_file, "fn main() {}").await.unwrap();
+        smol::fs::write(&object_file, "some binary file here")
+            .await
+            .unwrap();

+        let repo =
+            RealGitRepository::new(&temp_dir.path().join(".git"), None, cx.executor()).unwrap();

+        // Commit only the text file; the object file stays untracked.
+        repo.stage_paths(
+            vec![RepoPath::from_str("main.rs")],
+            Arc::new(HashMap::default()),
+        )
+        .await
+        .unwrap();
+        repo.commit(
+            "Initial commit".into(),
+            None,
+            CommitOptions::default(),
+            Arc::new(checkpoint_author_envs()),
+        )
+        .await
+        .unwrap();

+        let checkpoint = repo.checkpoint().await.unwrap();

+        // Change both files after the checkpoint was taken.
+        smol::fs::write(&source_file, "fn main() { println!(\"Modified\"); }")
+            .await
+            .unwrap();
+        smol::fs::write(&object_file, "Modified binary file")
+            .await
+            .unwrap();

+        repo.restore_checkpoint(checkpoint).await.unwrap();

+        // The text file goes back to its checkpoint state...
+        let restored_source = smol::fs::read_to_string(&source_file).await.unwrap();
+        assert_eq!(restored_source, "fn main() {}");

+        // ...but the binary keeps its modification, because it was never tracked.
+        let untouched_object = smol::fs::read_to_string(&object_file).await.unwrap();
+        assert_eq!(untouched_object, "Modified binary file");
+    }
+
     #[test]
     fn test_branches_parsing() {
         // suppress "help: octal escapes are not supported, `\0` is always null"