updated file comparison in the semantic indexing engine to work off of modified system times instead of file hashes

Created by KCaverly and maxbrunsfeld

Co-authored-by: maxbrunsfeld <max@zed.dev>

Change summary

Cargo.lock                              | 25 ------
crates/vector_store/Cargo.toml          |  4 ++--
crates/vector_store/src/db.rs           | 99 +++++++++------------------
crates/vector_store/src/vector_store.rs | 78 ++++++++++----------
4 files changed, 75 insertions(+), 131 deletions(-)
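
The heart of the change: rather than loading every file and comparing a SHA-1 of its contents against the database, the indexer now compares the modification time reported by the worktree against the mtime stored with each `files` row, and only re-embeds a file when they differ. A minimal sketch of that check using std types only; `needs_reindex` is a hypothetical helper, not code from this diff:

use std::{collections::HashMap, path::Path, path::PathBuf, time::SystemTime};

// Hypothetical helper: decide whether a file must be re-embedded.
// `stored` mirrors what `VectorDatabase::get_file_mtimes` returns.
fn needs_reindex(
    stored: &HashMap<PathBuf, SystemTime>,
    path: &Path,
    mtime: SystemTime,
) -> bool {
    // Any mismatch, or a file we have never seen, triggers re-indexing;
    // exact equality means the stored embeddings are still current.
    stored.get(path) != Some(&mtime)
}

Because the worktree already tracks mtimes, unchanged files can now be skipped without any disk reads, whereas the old hash-based comparison had to load every file's contents before it could tell.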

Detailed changes

Cargo.lock

@@ -4232,19 +4232,6 @@ dependencies = [
  "tempfile",
 ]
 
-[[package]]
-name = "ndarray"
-version = "0.15.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "rawpointer",
-]
-
 [[package]]
 name = "net2"
 version = "0.2.38"
@@ -4353,15 +4340,6 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "num-complex"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -8050,14 +8028,13 @@ dependencies = [
  "lazy_static",
  "log",
  "matrixmultiply",
- "ndarray",
  "picker",
  "project",
  "rand 0.8.5",
+ "rpc",
  "rusqlite",
  "serde",
  "serde_json",
- "sha-1 0.10.1",
  "smol",
  "tempdir",
  "theme",

crates/vector_store/Cargo.toml

@@ -17,6 +17,7 @@ util = { path = "../util" }
 picker = { path = "../picker" }
 theme = { path = "../theme" }
 editor = { path = "../editor" }
+rpc = { path = "../rpc" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
@@ -29,14 +30,13 @@ serde.workspace = true
 serde_json.workspace = true
 async-trait.workspace = true
 bincode = "1.3.3"
-ndarray = "0.15.6"
-sha-1 = "0.10.1"
 matrixmultiply = "0.3.7"
 
 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
 language = { path = "../language", features = ["test-support"] }
 project = { path = "../project", features = ["test-support"] }
+rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
 tree-sitter-rust = "*"
 rand.workspace = true

crates/vector_store/src/db.rs

@@ -2,18 +2,17 @@ use std::{
     collections::HashMap,
     path::{Path, PathBuf},
     rc::Rc,
+    time::SystemTime,
 };
 
 use anyhow::{anyhow, Result};
 
+use crate::IndexedFile;
+use rpc::proto::Timestamp;
 use rusqlite::{
     params,
-    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
-    ToSql,
+    types::{FromSql, FromSqlResult, ValueRef},
 };
-use sha1::{Digest, Sha1};
-
-use crate::IndexedFile;
 
 // Note this is not an appropriate document
 #[derive(Debug)]
@@ -29,60 +28,7 @@ pub struct DocumentRecord {
 pub struct FileRecord {
     pub id: usize,
     pub relative_path: String,
-    pub sha1: FileSha1,
-}
-
-#[derive(Debug)]
-pub struct FileSha1(pub Vec<u8>);
-
-impl FileSha1 {
-    pub fn from_str(content: String) -> Self {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-        return FileSha1(sha1);
-    }
-
-    pub fn equals(&self, content: &String) -> bool {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-
-        let equal = self
-            .0
-            .clone()
-            .into_iter()
-            .zip(sha1)
-            .filter(|&(a, b)| a == b)
-            .count()
-            == self.0.len();
-
-        equal
-    }
-}
-
-impl ToSql for FileSha1 {
-    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
-        return self.0.to_sql();
-    }
-}
-
-impl FromSql for FileSha1 {
-    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
-        let bytes = value.as_blob()?;
-        Ok(FileSha1(
-            bytes
-                .into_iter()
-                .map(|val| val.to_owned())
-                .collect::<Vec<u8>>(),
-        ))
-    }
+    pub mtime: Timestamp,
 }
 
 #[derive(Debug)]
@@ -133,7 +79,8 @@ impl VectorDatabase {
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 worktree_id INTEGER NOT NULL,
                 relative_path VARCHAR NOT NULL,
-                sha1 BLOB NOT NULL,
+                mtime_seconds INTEGER NOT NULL,
+                mtime_nanos INTEGER NOT NULL,
                 FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
             )",
             [],
@@ -170,11 +117,20 @@ impl VectorDatabase {
             ",
             params![worktree_id, indexed_file.path.to_str()],
         )?;
+        let mtime = Timestamp::from(indexed_file.mtime);
         self.db.execute(
             "
-            INSERT INTO files (worktree_id, relative_path, sha1) VALUES (?1, ?2, $3);
+            INSERT INTO files
+            (worktree_id, relative_path, mtime_seconds, mtime_nanos)
+            VALUES
+            (?1, ?2, $3, $4);
             ",
-            params![worktree_id, indexed_file.path.to_str(), indexed_file.sha1],
+            params![
+                worktree_id,
+                indexed_file.path.to_str(),
+                mtime.seconds,
+                mtime.nanos
+            ],
         )?;
 
         let file_id = self.db.last_insert_rowid();
@@ -224,13 +180,24 @@ impl VectorDatabase {
         Ok(self.db.last_insert_rowid())
     }
 
-    pub fn get_file_hashes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, FileSha1>> {
+    pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
         let mut statement = self.db.prepare(
-            "SELECT relative_path, sha1 FROM files WHERE worktree_id = ?1 ORDER BY relative_path",
+            "
+            SELECT relative_path, mtime_seconds, mtime_nanos
+            FROM files
+            WHERE worktree_id = ?1
+            ORDER BY relative_path",
         )?;
-        let mut result: HashMap<PathBuf, FileSha1> = HashMap::new();
+        let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
         for row in statement.query_map(params![worktree_id], |row| {
-            Ok((row.get::<_, String>(0)?.into(), row.get(1)?))
+            Ok((
+                row.get::<_, String>(0)?.into(),
+                Timestamp {
+                    seconds: row.get(1)?,
+                    nanos: row.get(2)?,
+                }
+                .into(),
+            ))
         })? {
             let row = row?;
             result.insert(row.0, row.1);
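
The `files` table now stores the mtime split across two INTEGER columns, round-tripped through `rpc::proto::Timestamp`. A sketch of the same round trip using only std types, assuming (as the `Timestamp` conversion in the diff implies) that mtimes fall at or after the Unix epoch; both helper functions are illustrative, not part of the crate:

use std::time::{Duration, SystemTime, UNIX_EPOCH};

// Split a SystemTime into the (mtime_seconds, mtime_nanos) column pair.
fn mtime_to_columns(mtime: SystemTime) -> (u64, u32) {
    // Assumption: the mtime is not earlier than the epoch, so
    // duration_since cannot fail here.
    let since_epoch = mtime.duration_since(UNIX_EPOCH).expect("pre-epoch mtime");
    (since_epoch.as_secs(), since_epoch.subsec_nanos())
}

// Rebuild the SystemTime read back out of the two columns.
fn columns_to_mtime(seconds: u64, nanos: u32) -> SystemTime {
    UNIX_EPOCH + Duration::new(seconds, nanos)
}

fn main() {
    let now = SystemTime::now();
    let (secs, nanos) = mtime_to_columns(now);
    // The round trip is exact at nanosecond resolution, which is what
    // makes the equality check in vector_store.rs reliable.
    assert_eq!(columns_to_mtime(secs, nanos), now);
}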

crates/vector_store/src/vector_store.rs

@@ -6,7 +6,7 @@ mod modal;
 mod vector_store_tests;
 
 use anyhow::{anyhow, Result};
-use db::{FileSha1, VectorDatabase};
+use db::VectorDatabase;
 use embedding::{EmbeddingProvider, OpenAIEmbeddings};
 use gpui::{AppContext, Entity, ModelContext, ModelHandle, Task, ViewContext};
 use language::{Language, LanguageRegistry};
@@ -15,9 +15,10 @@ use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
     cmp::Ordering,
-    collections::{HashMap, HashSet},
+    collections::HashMap,
     path::{Path, PathBuf},
     sync::Arc,
+    time::SystemTime,
 };
 use tree_sitter::{Parser, QueryCursor};
 use util::{
@@ -46,6 +47,7 @@ pub fn init(
         VectorStore::new(
             fs,
             db_file_path,
+            // Arc::new(embedding::DummyEmbeddings {}),
             Arc::new(OpenAIEmbeddings {
                 client: http_client,
             }),
@@ -91,7 +93,7 @@ pub fn init(
 #[derive(Debug)]
 pub struct IndexedFile {
     path: PathBuf,
-    sha1: FileSha1,
+    mtime: SystemTime,
     documents: Vec<Document>,
 }
 
@@ -131,9 +133,10 @@ impl VectorStore {
         cursor: &mut QueryCursor,
         parser: &mut Parser,
         embedding_provider: &dyn EmbeddingProvider,
+        fs: &Arc<dyn Fs>,
         language: Arc<Language>,
         file_path: PathBuf,
-        content: String,
+        mtime: SystemTime,
     ) -> Result<IndexedFile> {
         let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
         let embedding_config = grammar
@@ -141,6 +144,8 @@ impl VectorStore {
             .as_ref()
             .ok_or_else(|| anyhow!("no outline query"))?;
 
+        let content = fs.load(&file_path).await?;
+
         parser.set_language(grammar.ts_language).unwrap();
         let tree = parser
             .parse(&content, None)
@@ -184,11 +189,9 @@ impl VectorStore {
             }
         }
 
-        let sha1 = FileSha1::from_str(content);
-
         return Ok(IndexedFile {
             path: file_path,
-            sha1,
+            mtime,
             documents,
         });
     }
@@ -231,38 +234,36 @@ impl VectorStore {
 
             // Here we query the worktree ids, and yet we dont have them elsewhere
             // We likely want to clean up these datastructures
-            let (db, worktree_hashes, worktree_db_ids) = cx
+            let (db, mut worktree_file_times, worktree_db_ids) = cx
                 .background()
                 .spawn({
                     let worktrees = worktrees.clone();
                     async move {
                         let mut worktree_db_ids: HashMap<WorktreeId, i64> = HashMap::new();
-                        let mut hashes: HashMap<WorktreeId, HashMap<PathBuf, FileSha1>> =
+                        let mut file_times: HashMap<WorktreeId, HashMap<PathBuf, SystemTime>> =
                             HashMap::new();
                         for worktree in worktrees {
                             let worktree_db_id =
                                 db.find_or_create_worktree(worktree.abs_path().as_ref())?;
                             worktree_db_ids.insert(worktree.id(), worktree_db_id);
-                            hashes.insert(worktree.id(), db.get_file_hashes(worktree_db_id)?);
+                            file_times.insert(worktree.id(), db.get_file_mtimes(worktree_db_id)?);
                         }
-                        anyhow::Ok((db, hashes, worktree_db_ids))
+                        anyhow::Ok((db, file_times, worktree_db_ids))
                     }
                 })
                 .await?;
 
             let (paths_tx, paths_rx) =
-                channel::unbounded::<(i64, PathBuf, String, Arc<Language>)>();
+                channel::unbounded::<(i64, PathBuf, Arc<Language>, SystemTime)>();
             let (delete_paths_tx, delete_paths_rx) = channel::unbounded::<(i64, PathBuf)>();
             let (indexed_files_tx, indexed_files_rx) = channel::unbounded::<(i64, IndexedFile)>();
             cx.background()
                 .spawn({
-                    let fs = fs.clone();
                     let worktree_db_ids = worktree_db_ids.clone();
                     async move {
                         for worktree in worktrees.into_iter() {
-                            let file_hashes = &worktree_hashes[&worktree.id()];
-                            let mut files_included =
-                                file_hashes.keys().collect::<HashSet<&PathBuf>>();
+                            let mut file_mtimes =
+                                worktree_file_times.remove(&worktree.id()).unwrap();
                             for file in worktree.files(false, 0) {
                                 let absolute_path = worktree.absolutize(&file.path);
 
@@ -278,30 +279,26 @@ impl VectorStore {
                                         continue;
                                     }
 
-                                    if let Some(content) = fs.load(&absolute_path).await.log_err() {
-                                        let path_buf = file.path.to_path_buf();
-                                        let already_stored = file_hashes.get(&path_buf).map_or(
-                                            false,
-                                            |existing_hash| {
-                                                files_included.remove(&path_buf);
-                                                existing_hash.equals(&content)
-                                            },
-                                        );
-
-                                        if !already_stored {
-                                            paths_tx
-                                                .try_send((
-                                                    worktree_db_ids[&worktree.id()],
-                                                    path_buf,
-                                                    content,
-                                                    language,
-                                                ))
-                                                .unwrap();
-                                        }
+                                    let path_buf = file.path.to_path_buf();
+                                    let stored_mtime = file_mtimes.remove(&file.path.to_path_buf());
+                                    let already_stored = stored_mtime
+                                        .map_or(false, |existing_mtime| {
+                                            existing_mtime == file.mtime
+                                        });
+
+                                    if !already_stored {
+                                        paths_tx
+                                            .try_send((
+                                                worktree_db_ids[&worktree.id()],
+                                                path_buf,
+                                                language,
+                                                file.mtime,
+                                            ))
+                                            .unwrap();
                                     }
                                 }
                             }
-                            for file in files_included {
+                            for file in file_mtimes.keys() {
                                 delete_paths_tx
                                     .try_send((worktree_db_ids[&worktree.id()], file.to_owned()))
                                     .unwrap();
@@ -336,16 +333,17 @@ impl VectorStore {
                         scope.spawn(async {
                             let mut parser = Parser::new();
                             let mut cursor = QueryCursor::new();
-                            while let Ok((worktree_id, file_path, content, language)) =
+                            while let Ok((worktree_id, file_path, language, mtime)) =
                                 paths_rx.recv().await
                             {
                                 if let Some(indexed_file) = Self::index_file(
                                     &mut cursor,
                                     &mut parser,
                                     embedding_provider.as_ref(),
+                                    &fs,
                                     language,
                                     file_path,
-                                    content,
+                                    mtime,
                                 )
                                 .await
                                 .log_err()
@@ -395,6 +393,8 @@ impl VectorStore {
             })
             .collect::<Vec<_>>();
 
+        log::info!("Searching for: {:?}", phrase);
+
         let embedding_provider = self.embedding_provider.clone();
         let database_url = self.database_url.clone();
         cx.spawn(|this, cx| async move {