More git status optimizations (#2779)

Max Brunsfeld created

Follow-up to https://github.com/zed-industries/zed/pull/2777
Refs https://github.com/zed-industries/community/issues/1770

In this PR, I reworked the way that git statuses are retrieved. In a
huge repository like `WebKit`, the really slow part of computing a list
of git statuses is the *unstaged* portion of the diff. For the *staged*
diff, `git` can avoid comparing the contents of unchanged directories,
because the index contains hashes of every tree. But for the *unstaged*
portion, Git needs to compare every file in the worktree against the
index. In the common case, when there are no changes, it's enough to
check the `mtime` of every file (because the index stores the mtimes of
files when they are added). But this still requires an `lstat` call to
retrieve each file's metadata.

I realized that this is redundant work, because the worktree is
*already* calling `lstat` on every file, and caching their metadata. So
in this PR, I've changed the `Repository` API so that there are separate
methods for retrieving a file's *staged* and *unstaged* statuses. The
*staged* statuses are retrieved in one giant batch, like before, to
reduce our git calls (which also have an inherent cost). But the
`unstaged` statuses are retrieved one-by-one, after we load files'
mtimes. Often, all that's required is an index lookup, and an mtime
comparison.

With this optimization, it once again becomes pretty responsive to open
`WebKit` or `chromium` in Zed.

Release Notes:

- Optimized the loading of project file when working in very large git
repositories

Change summary

crates/fs/src/repository.rs    | 128 +++++++++++++++++++++++++++--------
crates/project/src/worktree.rs |  97 +++++++++++++++++++-------
2 files changed, 169 insertions(+), 56 deletions(-)

Detailed changes

crates/fs/src/repository.rs 🔗

@@ -1,6 +1,6 @@
 use anyhow::Result;
 use collections::HashMap;
-use git2::{BranchType, ErrorCode};
+use git2::{BranchType, StatusShow};
 use parking_lot::Mutex;
 use rpc::proto;
 use serde_derive::{Deserialize, Serialize};
@@ -10,6 +10,7 @@ use std::{
     os::unix::prelude::OsStrExt,
     path::{Component, Path, PathBuf},
     sync::Arc,
+    time::SystemTime,
 };
 use sum_tree::{MapSeekTarget, TreeMap};
 use util::ResultExt;
@@ -27,8 +28,25 @@ pub trait GitRepository: Send {
     fn reload_index(&self);
     fn load_index_text(&self, relative_file_path: &Path) -> Option<String>;
     fn branch_name(&self) -> Option<String>;
-    fn statuses(&self) -> TreeMap<RepoPath, GitFileStatus>;
-    fn status(&self, path: &RepoPath) -> Result<Option<GitFileStatus>>;
+
+    /// Get the statuses of all of the files in the index that start with the given
+    /// path and have changes with resepect to the HEAD commit. This is fast because
+    /// the index stores hashes of trees, so that unchanged directories can be skipped.
+    fn staged_statuses(&self, path_prefix: &Path) -> TreeMap<RepoPath, GitFileStatus>;
+
+    /// Get the status of a given file in the working directory with respect to
+    /// the index. In the common case, when there are no changes, this only requires
+    /// an index lookup. The index stores the mtime of each file when it was added,
+    /// so there's no work to do if the mtime matches.
+    fn unstaged_status(&self, path: &RepoPath, mtime: SystemTime) -> Option<GitFileStatus>;
+
+    /// Get the status of a given file in the working directory with respect to
+    /// the HEAD commit. In the common case, when there are no changes, this only
+    /// requires an index lookup and blob comparison between the index and the HEAD
+    /// commit. The index stores the mtime of each file when it was added, so there's
+    /// no need to consider the working directory file if the mtime matches.
+    fn status(&self, path: &RepoPath, mtime: SystemTime) -> Option<GitFileStatus>;
+
     fn branches(&self) -> Result<Vec<Branch>>;
     fn change_branch(&self, _: &str) -> Result<()>;
     fn create_branch(&self, _: &str) -> Result<()>;
@@ -40,7 +58,6 @@ impl std::fmt::Debug for dyn GitRepository {
     }
 }
 
-#[async_trait::async_trait]
 impl GitRepository for LibGitRepository {
     fn reload_index(&self) {
         if let Ok(mut index) = self.index() {
@@ -78,37 +95,67 @@ impl GitRepository for LibGitRepository {
         Some(branch.to_string())
     }
 
-    fn statuses(&self) -> TreeMap<RepoPath, GitFileStatus> {
+    fn staged_statuses(&self, path_prefix: &Path) -> TreeMap<RepoPath, GitFileStatus> {
         let mut map = TreeMap::default();
-        if let Some(statuses) = self.statuses(None).log_err() {
-            for status in statuses
-                .iter()
-                .filter(|status| !status.status().contains(git2::Status::IGNORED))
-            {
-                let path = RepoPath(PathBuf::from(OsStr::from_bytes(status.path_bytes())));
-                let Some(status) = read_status(status.status()) else {
-                    continue
-                };
 
-                map.insert(path, status)
+        let mut options = git2::StatusOptions::new();
+        options.pathspec(path_prefix);
+        options.show(StatusShow::Index);
+
+        if let Some(statuses) = self.statuses(Some(&mut options)).log_err() {
+            for status in statuses.iter() {
+                let path = RepoPath(PathBuf::from(OsStr::from_bytes(status.path_bytes())));
+                let status = status.status();
+                if !status.contains(git2::Status::IGNORED) {
+                    if let Some(status) = read_status(status) {
+                        map.insert(path, status)
+                    }
+                }
             }
         }
         map
     }
 
-    fn status(&self, path: &RepoPath) -> Result<Option<GitFileStatus>> {
-        let status = self.status_file(path);
-        match status {
-            Ok(status) => Ok(read_status(status)),
-            Err(e) => {
-                if e.code() == ErrorCode::NotFound {
-                    Ok(None)
-                } else {
-                    Err(e.into())
-                }
-            }
+    fn unstaged_status(&self, path: &RepoPath, mtime: SystemTime) -> Option<GitFileStatus> {
+        // If the file has not changed since it was added to the index, then
+        // there can't be any changes.
+        if matches_index(self, path, mtime) {
+            return None;
         }
+
+        let mut options = git2::StatusOptions::new();
+        options.pathspec(&path.0);
+        options.disable_pathspec_match(true);
+        options.include_untracked(true);
+        options.recurse_untracked_dirs(true);
+        options.include_unmodified(true);
+        options.show(StatusShow::Workdir);
+
+        let statuses = self.statuses(Some(&mut options)).log_err()?;
+        let status = statuses.get(0).and_then(|s| read_status(s.status()));
+        status
     }
+
+    fn status(&self, path: &RepoPath, mtime: SystemTime) -> Option<GitFileStatus> {
+        let mut options = git2::StatusOptions::new();
+        options.pathspec(&path.0);
+        options.disable_pathspec_match(true);
+        options.include_untracked(true);
+        options.recurse_untracked_dirs(true);
+        options.include_unmodified(true);
+
+        // If the file has not changed since it was added to the index, then
+        // there's no need to examine the working directory file: just compare
+        // the blob in the index to the one in the HEAD commit.
+        if matches_index(self, path, mtime) {
+            options.show(StatusShow::Index);
+        }
+
+        let statuses = self.statuses(Some(&mut options)).log_err()?;
+        let status = statuses.get(0).and_then(|s| read_status(s.status()));
+        status
+    }
+
     fn branches(&self) -> Result<Vec<Branch>> {
         let local_branches = self.branches(Some(BranchType::Local))?;
         let valid_branches = local_branches
@@ -151,6 +198,21 @@ impl GitRepository for LibGitRepository {
     }
 }
 
+fn matches_index(repo: &LibGitRepository, path: &RepoPath, mtime: SystemTime) -> bool {
+    if let Some(index) = repo.index().log_err() {
+        if let Some(entry) = index.get_path(&path, 0) {
+            if let Some(mtime) = mtime.duration_since(SystemTime::UNIX_EPOCH).log_err() {
+                if entry.mtime.seconds() == mtime.as_secs() as i32
+                    && entry.mtime.nanoseconds() == mtime.subsec_nanos()
+                {
+                    return true;
+                }
+            }
+        }
+    }
+    false
+}
+
 fn read_status(status: git2::Status) -> Option<GitFileStatus> {
     if status.contains(git2::Status::CONFLICTED) {
         Some(GitFileStatus::Conflict)
@@ -200,18 +262,24 @@ impl GitRepository for FakeGitRepository {
         state.branch_name.clone()
     }
 
-    fn statuses(&self) -> TreeMap<RepoPath, GitFileStatus> {
+    fn staged_statuses(&self, path_prefix: &Path) -> TreeMap<RepoPath, GitFileStatus> {
         let mut map = TreeMap::default();
         let state = self.state.lock();
         for (repo_path, status) in state.worktree_statuses.iter() {
-            map.insert(repo_path.to_owned(), status.to_owned());
+            if repo_path.0.starts_with(path_prefix) {
+                map.insert(repo_path.to_owned(), status.to_owned());
+            }
         }
         map
     }
 
-    fn status(&self, path: &RepoPath) -> Result<Option<GitFileStatus>> {
+    fn unstaged_status(&self, _path: &RepoPath, _mtime: SystemTime) -> Option<GitFileStatus> {
+        None
+    }
+
+    fn status(&self, path: &RepoPath, _mtime: SystemTime) -> Option<GitFileStatus> {
         let state = self.state.lock();
-        Ok(state.worktree_statuses.get(path).cloned())
+        state.worktree_statuses.get(path).cloned()
     }
 
     fn branches(&self) -> Result<Vec<Branch>> {

crates/project/src/worktree.rs 🔗

@@ -2162,10 +2162,18 @@ impl BackgroundScannerState {
         let path = entry.path.clone();
         let ignore_stack = self.snapshot.ignore_stack_for_abs_path(&abs_path, true);
         let mut ancestor_inodes = self.snapshot.ancestor_inodes_for_path(&path);
-        let containing_repository = self
-            .snapshot
-            .local_repo_for_path(&path)
-            .map(|(path, repo)| (path, repo.repo_ptr.lock().statuses()));
+        let mut containing_repository = None;
+        if !ignore_stack.is_all() {
+            if let Some((workdir_path, repo)) = self.snapshot.local_repo_for_path(&path) {
+                if let Ok(repo_path) = path.strip_prefix(&workdir_path.0) {
+                    containing_repository = Some((
+                        workdir_path,
+                        repo.repo_ptr.clone(),
+                        repo.repo_ptr.lock().staged_statuses(repo_path),
+                    ));
+                }
+            }
+        }
         if !ancestor_inodes.contains(&entry.inode) {
             ancestor_inodes.insert(entry.inode);
             scan_job_tx
@@ -2355,8 +2363,7 @@ impl BackgroundScannerState {
                         .repository_entries
                         .update(&work_dir, |entry| entry.branch = branch.map(Into::into));
 
-                    let statuses = repository.statuses();
-                    self.update_git_statuses(&work_dir, &statuses);
+                    self.update_git_statuses(&work_dir, &*repository);
                 }
             }
         }
@@ -2381,7 +2388,11 @@ impl BackgroundScannerState {
         &mut self,
         dot_git_path: Arc<Path>,
         fs: &dyn Fs,
-    ) -> Option<(RepositoryWorkDirectory, TreeMap<RepoPath, GitFileStatus>)> {
+    ) -> Option<(
+        RepositoryWorkDirectory,
+        Arc<Mutex<dyn GitRepository>>,
+        TreeMap<RepoPath, GitFileStatus>,
+    )> {
         log::info!("build git repository {:?}", dot_git_path);
 
         let work_dir_path: Arc<Path> = dot_git_path.parent().unwrap().into();
@@ -2413,27 +2424,28 @@ impl BackgroundScannerState {
             },
         );
 
-        let statuses = repo_lock.statuses();
-        self.update_git_statuses(&work_directory, &statuses);
+        let staged_statuses = self.update_git_statuses(&work_directory, &*repo_lock);
         drop(repo_lock);
 
         self.snapshot.git_repositories.insert(
             work_dir_id,
             LocalRepositoryEntry {
                 git_dir_scan_id: 0,
-                repo_ptr: repository,
+                repo_ptr: repository.clone(),
                 git_dir_path: dot_git_path.clone(),
             },
         );
 
-        Some((work_directory, statuses))
+        Some((work_directory, repository, staged_statuses))
     }
 
     fn update_git_statuses(
         &mut self,
         work_directory: &RepositoryWorkDirectory,
-        statuses: &TreeMap<RepoPath, GitFileStatus>,
-    ) {
+        repo: &dyn GitRepository,
+    ) -> TreeMap<RepoPath, GitFileStatus> {
+        let staged_statuses = repo.staged_statuses(Path::new(""));
+
         let mut changes = vec![];
         let mut edits = vec![];
 
@@ -2446,7 +2458,10 @@ impl BackgroundScannerState {
                 continue;
             };
             let repo_path = RepoPath(repo_path.to_path_buf());
-            let git_file_status = statuses.get(&repo_path).copied();
+            let git_file_status = combine_git_statuses(
+                staged_statuses.get(&repo_path).copied(),
+                repo.unstaged_status(&repo_path, entry.mtime),
+            );
             if entry.git_status != git_file_status {
                 entry.git_status = git_file_status;
                 changes.push(entry.path.clone());
@@ -2456,6 +2471,7 @@ impl BackgroundScannerState {
 
         self.snapshot.entries_by_path.edit(edits, &());
         util::extend_sorted(&mut self.changed_paths, changes, usize::MAX, Ord::cmp);
+        staged_statuses
     }
 }
 
@@ -3517,10 +3533,19 @@ impl BackgroundScanner {
                 }
             } else {
                 child_entry.is_ignored = ignore_stack.is_abs_path_ignored(&child_abs_path, false);
-
-                if let Some((repository_dir, statuses)) = &job.containing_repository {
-                    if let Ok(repo_path) = child_entry.path.strip_prefix(&repository_dir.0) {
-                        child_entry.git_status = statuses.get(&RepoPath(repo_path.into())).copied();
+                if !child_entry.is_ignored {
+                    if let Some((repository_dir, repository, staged_statuses)) =
+                        &job.containing_repository
+                    {
+                        if let Ok(repo_path) = child_entry.path.strip_prefix(&repository_dir.0) {
+                            let repo_path = RepoPath(repo_path.into());
+                            child_entry.git_status = combine_git_statuses(
+                                staged_statuses.get(&repo_path).copied(),
+                                repository
+                                    .lock()
+                                    .unstaged_status(&repo_path, child_entry.mtime),
+                            );
+                        }
                     }
                 }
             }
@@ -3630,13 +3655,10 @@ impl BackgroundScanner {
                             if let Some((work_dir, repo)) =
                                 state.snapshot.local_repo_for_path(&path)
                             {
-                                if let Ok(path) = path.strip_prefix(work_dir.0) {
-                                    fs_entry.git_status = repo
-                                        .repo_ptr
-                                        .lock()
-                                        .status(&RepoPath(path.into()))
-                                        .log_err()
-                                        .flatten()
+                                if let Ok(repo_path) = path.strip_prefix(work_dir.0) {
+                                    let repo_path = RepoPath(repo_path.into());
+                                    let repo = repo.repo_ptr.lock();
+                                    fs_entry.git_status = repo.status(&repo_path, fs_entry.mtime);
                                 }
                             }
                         }
@@ -3990,7 +4012,11 @@ struct ScanJob {
     scan_queue: Sender<ScanJob>,
     ancestor_inodes: TreeSet<u64>,
     is_external: bool,
-    containing_repository: Option<(RepositoryWorkDirectory, TreeMap<RepoPath, GitFileStatus>)>,
+    containing_repository: Option<(
+        RepositoryWorkDirectory,
+        Arc<Mutex<dyn GitRepository>>,
+        TreeMap<RepoPath, GitFileStatus>,
+    )>,
 }
 
 struct UpdateIgnoreStatusJob {
@@ -4317,3 +4343,22 @@ impl<'a> TryFrom<(&'a CharBag, proto::Entry)> for Entry {
         }
     }
 }
+
+fn combine_git_statuses(
+    staged: Option<GitFileStatus>,
+    unstaged: Option<GitFileStatus>,
+) -> Option<GitFileStatus> {
+    if let Some(staged) = staged {
+        if let Some(unstaged) = unstaged {
+            if unstaged != staged {
+                Some(GitFileStatus::Modified)
+            } else {
+                Some(staged)
+            }
+        } else {
+            Some(staged)
+        }
+    } else {
+        unstaged
+    }
+}