From 3408b98167481aa54c70839f6024bdc5cdfb2aec Mon Sep 17 00:00:00 2001
From: KCaverly
Date: Fri, 30 Jun 2023 16:53:23 -0400
Subject: [PATCH] updated file comparison in the semantic indexing engine to
 work off of modified system times as opposed to file hashes

Co-authored-by: maxbrunsfeld
---
 Cargo.lock                              | 25 +------
 crates/vector_store/Cargo.toml          |  4 +-
 crates/vector_store/src/db.rs           | 99 +++++++++----------
 crates/vector_store/src/vector_store.rs | 78 +++++++++----------
 4 files changed, 75 insertions(+), 131 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 85599036a1a23bb515d7f61fa71ba5a25e774b8e..59cf30001e1625b8a46af84ca4f5e6f87ead9bac 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4232,19 +4232,6 @@ dependencies = [
  "tempfile",
 ]
 
-[[package]]
-name = "ndarray"
-version = "0.15.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "rawpointer",
-]
-
 [[package]]
 name = "net2"
 version = "0.2.38"
@@ -4353,15 +4340,6 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "num-complex"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -8050,14 +8028,13 @@ dependencies = [
  "lazy_static",
  "log",
  "matrixmultiply",
- "ndarray",
  "picker",
  "project",
  "rand 0.8.5",
+ "rpc",
  "rusqlite",
  "serde",
  "serde_json",
- "sha-1 0.10.1",
  "smol",
  "tempdir",
  "theme",
diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml
index 4ecd46cb92533aaf6ba230a2a57f36ade18c2a4d..d1ad8a0f9b16a0cb2736ae9f0bf5a0bad8c331cc 100644
--- a/crates/vector_store/Cargo.toml
+++ b/crates/vector_store/Cargo.toml
@@ -17,6 +17,7 @@ util = { path = "../util" }
 picker = { path = "../picker" }
 theme = { path = "../theme" }
 editor = { path = "../editor" }
+rpc = { path = "../rpc" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
@@ -29,14 +30,13 @@ serde.workspace = true
 serde_json.workspace = true
 async-trait.workspace = true
 bincode = "1.3.3"
-ndarray = "0.15.6"
-sha-1 = "0.10.1"
 matrixmultiply = "0.3.7"
 
 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
 language = { path = "../language", features = ["test-support"] }
 project = { path = "../project", features = ["test-support"] }
+rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
 tree-sitter-rust = "*"
 rand.workspace = true
diff --git a/crates/vector_store/src/db.rs b/crates/vector_store/src/db.rs
index fec2980550e1721a0741afdc82c5008054cb7344..f822cca77eaf3c63a1a4928f73e98a85126c20d0 100644
--- a/crates/vector_store/src/db.rs
+++ b/crates/vector_store/src/db.rs
@@ -2,18 +2,17 @@ use std::{
     collections::HashMap,
     path::{Path, PathBuf},
     rc::Rc,
+    time::SystemTime,
 };
 
 use anyhow::{anyhow, Result};
+use crate::IndexedFile;
+use rpc::proto::Timestamp;
 use rusqlite::{
     params,
-    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
-    ToSql,
+    types::{FromSql, FromSqlResult, ValueRef},
 };
-use sha1::{Digest, Sha1};
-
-use crate::IndexedFile;
 
 // Note this is not an appropriate document
 #[derive(Debug)]
@@ -29,60 +28,7 @@ pub struct DocumentRecord {
 pub struct FileRecord {
     pub id: usize,
     pub relative_path: String,
-    pub sha1: FileSha1,
-}
-
-#[derive(Debug)]
-pub struct FileSha1(pub Vec<u8>);
-
-impl FileSha1 {
-    pub fn from_str(content: String) -> Self {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-        return FileSha1(sha1);
-    }
-
-    pub fn equals(&self, content: &String) -> bool {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-
-        let equal = self
-            .0
-            .clone()
-            .into_iter()
-            .zip(sha1)
-            .filter(|&(a, b)| a == b)
-            .count()
-            == self.0.len();
-
-        equal
-    }
-}
-
-impl ToSql for FileSha1 {
-    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
-        return self.0.to_sql();
-    }
-}
-
-impl FromSql for FileSha1 {
-    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
-        let bytes = value.as_blob()?;
-        Ok(FileSha1(
-            bytes
-                .into_iter()
-                .map(|val| val.to_owned())
-                .collect::<Vec<u8>>(),
-        ))
-    }
+    pub mtime: Timestamp,
 }
 
 #[derive(Debug)]
@@ -133,7 +79,8 @@
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             worktree_id INTEGER NOT NULL,
             relative_path VARCHAR NOT NULL,
-            sha1 BLOB NOT NULL,
+            mtime_seconds INTEGER NOT NULL,
+            mtime_nanos INTEGER NOT NULL,
             FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
             )",
             [],
@@ -170,11 +117,20 @@
             ",
             params![worktree_id, indexed_file.path.to_str()],
         )?;
 
+        let mtime = Timestamp::from(indexed_file.mtime);
         self.db.execute(
             "
-            INSERT INTO files (worktree_id, relative_path, sha1) VALUES (?1, ?2, $3);
+            INSERT INTO files
+                (worktree_id, relative_path, mtime_seconds, mtime_nanos)
+            VALUES
+                (?1, ?2, $3, $4);
             ",
-            params![worktree_id, indexed_file.path.to_str(), indexed_file.sha1],
+            params![
+                worktree_id,
+                indexed_file.path.to_str(),
+                mtime.seconds,
+                mtime.nanos
+            ],
         )?;
 
         let file_id = self.db.last_insert_rowid();
@@ -224,13 +180,24 @@
         Ok(self.db.last_insert_rowid())
     }
 
-    pub fn get_file_hashes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, FileSha1>> {
+    pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
         let mut statement = self.db.prepare(
-            "SELECT relative_path, sha1 FROM files WHERE worktree_id = ?1 ORDER BY relative_path",
+            "
+            SELECT relative_path, mtime_seconds, mtime_nanos
+            FROM files
+            WHERE worktree_id = ?1
+            ORDER BY relative_path",
         )?;
-        let mut result: HashMap<PathBuf, FileSha1> = HashMap::new();
+        let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
         for row in statement.query_map(params![worktree_id], |row| {
-            Ok((row.get::<_, String>(0)?.into(), row.get(1)?))
+            Ok((
+                row.get::<_, String>(0)?.into(),
+                Timestamp {
+                    seconds: row.get(1)?,
+                    nanos: row.get(2)?,
+                }
+                .into(),
+            ))
         })? {
             let row = row?;
             result.insert(row.0, row.1);
diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs
index 35a467b82f6ba10358d7eca6379d5524aaa5b15c..c329206c4bb2c2cd23b8cf34fcfb39d29202525a 100644
--- a/crates/vector_store/src/vector_store.rs
+++ b/crates/vector_store/src/vector_store.rs
@@ -6,7 +6,7 @@ mod modal;
 mod vector_store_tests;
 
 use anyhow::{anyhow, Result};
-use db::{FileSha1, VectorDatabase};
+use db::VectorDatabase;
 use embedding::{EmbeddingProvider, OpenAIEmbeddings};
 use gpui::{AppContext, Entity, ModelContext, ModelHandle, Task, ViewContext};
 use language::{Language, LanguageRegistry};
@@ -15,9 +15,10 @@ use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
     cmp::Ordering,
-    collections::{HashMap, HashSet},
+    collections::HashMap,
     path::{Path, PathBuf},
     sync::Arc,
+    time::SystemTime,
 };
 use tree_sitter::{Parser, QueryCursor};
 use util::{
@@ -46,6 +47,7 @@ pub fn init(
         VectorStore::new(
             fs,
             db_file_path,
+            // Arc::new(embedding::DummyEmbeddings {}),
             Arc::new(OpenAIEmbeddings {
                 client: http_client,
             }),
@@ -91,7 +93,7 @@ pub fn init(
 #[derive(Debug)]
 pub struct IndexedFile {
     path: PathBuf,
-    sha1: FileSha1,
+    mtime: SystemTime,
     documents: Vec<Document>,
 }
 
@@ -131,9 +133,10 @@ impl VectorStore {
         cursor: &mut QueryCursor,
         parser: &mut Parser,
         embedding_provider: &dyn EmbeddingProvider,
+        fs: &Arc<dyn Fs>,
         language: Arc<Language>,
         file_path: PathBuf,
-        content: String,
+        mtime: SystemTime,
     ) -> Result<IndexedFile> {
         let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
         let embedding_config = grammar
             .embedding_config
@@ -141,6 +144,8 @@
             .as_ref()
             .ok_or_else(|| anyhow!("no outline query"))?;
 
+        let content = fs.load(&file_path).await?;
+
         parser.set_language(grammar.ts_language).unwrap();
         let tree = parser
             .parse(&content, None)
             .ok_or_else(|| anyhow!("parsing failed"))?;
@@ -184,11 +189,9 @@
             }
         }
 
-        let sha1 = FileSha1::from_str(content);
-
         return Ok(IndexedFile {
             path: file_path,
-            sha1,
+            mtime,
             documents,
         });
     }
@@ -231,38 +234,36 @@
 
             // Here we query the worktree ids, and yet we dont have them elsewhere
             // We likely want to clean up these datastructures
-            let (db, worktree_hashes, worktree_db_ids) = cx
+            let (db, mut worktree_file_times, worktree_db_ids) = cx
                 .background()
                 .spawn({
                     let worktrees = worktrees.clone();
                     async move {
                         let mut worktree_db_ids: HashMap<WorktreeId, i64> = HashMap::new();
-                        let mut hashes: HashMap<WorktreeId, HashMap<PathBuf, FileSha1>> =
+                        let mut file_times: HashMap<WorktreeId, HashMap<PathBuf, SystemTime>> =
                             HashMap::new();
                         for worktree in worktrees {
                             let worktree_db_id =
                                 db.find_or_create_worktree(worktree.abs_path().as_ref())?;
                             worktree_db_ids.insert(worktree.id(), worktree_db_id);
-                            hashes.insert(worktree.id(), db.get_file_hashes(worktree_db_id)?);
+                            file_times.insert(worktree.id(), db.get_file_mtimes(worktree_db_id)?);
                         }
-                        anyhow::Ok((db, hashes, worktree_db_ids))
+                        anyhow::Ok((db, file_times, worktree_db_ids))
                     }
                 })
                 .await?;
 
             let (paths_tx, paths_rx) =
-                channel::unbounded::<(i64, PathBuf, String, Arc<Language>)>();
+                channel::unbounded::<(i64, PathBuf, Arc<Language>, SystemTime)>();
             let (delete_paths_tx, delete_paths_rx) = channel::unbounded::<(i64, PathBuf)>();
             let (indexed_files_tx, indexed_files_rx) = channel::unbounded::<(i64, IndexedFile)>();
             cx.background()
                 .spawn({
-                    let fs = fs.clone();
                     let worktree_db_ids = worktree_db_ids.clone();
                     async move {
                         for worktree in worktrees.into_iter() {
-                            let file_hashes = &worktree_hashes[&worktree.id()];
-                            let mut files_included =
-                                file_hashes.keys().collect::<HashSet<_>>();
+                            let mut file_mtimes =
+                                worktree_file_times.remove(&worktree.id()).unwrap();
                             for file in worktree.files(false, 0) {
                                 let absolute_path = worktree.absolutize(&file.path);
@@ -278,30 +279,26 @@
                                     continue;
                                 }
 
-                                if let Some(content) = fs.load(&absolute_path).await.log_err() {
-                                    let path_buf = file.path.to_path_buf();
-                                    let already_stored = file_hashes.get(&path_buf).map_or(
-                                        false,
-                                        |existing_hash| {
-                                            files_included.remove(&path_buf);
-                                            existing_hash.equals(&content)
-                                        },
-                                    );
-
-                                    if !already_stored {
-                                        paths_tx
-                                            .try_send((
-                                                worktree_db_ids[&worktree.id()],
-                                                path_buf,
-                                                content,
-                                                language,
-                                            ))
-                                            .unwrap();
-                                    }
+                                let path_buf = file.path.to_path_buf();
+                                let stored_mtime = file_mtimes.remove(&file.path.to_path_buf());
+                                let already_stored = stored_mtime
+                                    .map_or(false, |existing_mtime| {
+                                        existing_mtime == file.mtime
+                                    });
+
+                                if !already_stored {
+                                    paths_tx
+                                        .try_send((
+                                            worktree_db_ids[&worktree.id()],
+                                            path_buf,
+                                            language,
+                                            file.mtime,
+                                        ))
+                                        .unwrap();
                                 }
                             }
                         }
-                        for file in files_included {
+                        for file in file_mtimes.keys() {
                             delete_paths_tx
                                 .try_send((worktree_db_ids[&worktree.id()], file.to_owned()))
                                 .unwrap();
@@ -336,16 +333,17 @@
                 scope.spawn(async {
                     let mut parser = Parser::new();
                     let mut cursor = QueryCursor::new();
-                    while let Ok((worktree_id, file_path, content, language)) =
+                    while let Ok((worktree_id, file_path, language, mtime)) =
                         paths_rx.recv().await
                     {
                         if let Some(indexed_file) = Self::index_file(
                             &mut cursor,
                             &mut parser,
                             embedding_provider.as_ref(),
+                            &fs,
                             language,
                             file_path,
-                            content,
+                            mtime,
                         )
                         .await
                         .log_err()
@@ -395,6 +393,8 @@
             })
             .collect::<Vec<_>>();
 
+        log::info!("Searching for: {:?}", phrase);
+
         let embedding_provider = self.embedding_provider.clone();
         let database_url = self.database_url.clone();
         cx.spawn(|this, cx| async move {
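
Reviewer note (not part of the patch): the schema change from a single
sha1 BLOB to the mtime_seconds / mtime_nanos column pair works because a
SystemTime round-trips losslessly through a (seconds, nanos) integer
pair, which is what the rpc::proto::Timestamp conversions above rely on.
A minimal standalone sketch of that round trip, using hypothetical
helper names (to_columns / from_columns) in place of the Timestamp
impls:

    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    // Split an mtime into the pair stored in the `mtime_seconds` and
    // `mtime_nanos` columns (illustrative stand-in for the
    // `Timestamp::from(SystemTime)` conversion used in `insert_file`).
    fn to_columns(mtime: SystemTime) -> (u64, u32) {
        let since_epoch = mtime
            .duration_since(UNIX_EPOCH)
            .expect("mtime predates the unix epoch");
        (since_epoch.as_secs(), since_epoch.subsec_nanos())
    }

    // Rebuild the mtime from the stored columns (stand-in for the
    // `Timestamp` -> `SystemTime` conversion used in `get_file_mtimes`).
    fn from_columns(seconds: u64, nanos: u32) -> SystemTime {
        UNIX_EPOCH + Duration::new(seconds, nanos)
    }

    fn main() {
        let mtime = SystemTime::now();
        let (seconds, nanos) = to_columns(mtime);
        // Because the round trip is exact, the cheap equality check in
        // vector_store.rs (`existing_mtime == file.mtime`) can replace
        // hashing file contents without false "already indexed" hits.
        assert_eq!(from_columns(seconds, nanos), mtime);
    }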