@@ -4232,19 +4232,6 @@ dependencies = [
"tempfile",
]
-[[package]]
-name = "ndarray"
-version = "0.15.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "rawpointer",
-]
-
[[package]]
name = "net2"
version = "0.2.38"
@@ -4353,15 +4340,6 @@ dependencies = [
"zeroize",
]
-[[package]]
-name = "num-complex"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
-dependencies = [
- "num-traits",
-]
-
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -8050,14 +8028,13 @@ dependencies = [
"lazy_static",
"log",
"matrixmultiply",
- "ndarray",
"picker",
"project",
"rand 0.8.5",
+ "rpc",
"rusqlite",
"serde",
"serde_json",
- "sha-1 0.10.1",
"smol",
"tempdir",
"theme",
@@ -2,18 +2,17 @@ use std::{
collections::HashMap,
path::{Path, PathBuf},
rc::Rc,
+ time::SystemTime,
};
use anyhow::{anyhow, Result};
+use crate::IndexedFile;
+use rpc::proto::Timestamp;
use rusqlite::{
params,
- types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
- ToSql,
+ types::{FromSql, FromSqlResult, ValueRef},
};
-use sha1::{Digest, Sha1};
-
-use crate::IndexedFile;
// Note: this is not an appropriate document representation; revisit it.
#[derive(Debug)]
@@ -29,60 +28,7 @@ pub struct DocumentRecord {
pub struct FileRecord {
pub id: usize,
pub relative_path: String,
- pub sha1: FileSha1,
-}
-
-#[derive(Debug)]
-pub struct FileSha1(pub Vec<u8>);
-
-impl FileSha1 {
- pub fn from_str(content: String) -> Self {
- let mut hasher = Sha1::new();
- hasher.update(content);
- let sha1 = hasher.finalize()[..]
- .into_iter()
- .map(|val| val.to_owned())
- .collect::<Vec<u8>>();
- return FileSha1(sha1);
- }
-
- pub fn equals(&self, content: &String) -> bool {
- let mut hasher = Sha1::new();
- hasher.update(content);
- let sha1 = hasher.finalize()[..]
- .into_iter()
- .map(|val| val.to_owned())
- .collect::<Vec<u8>>();
-
- let equal = self
- .0
- .clone()
- .into_iter()
- .zip(sha1)
- .filter(|&(a, b)| a == b)
- .count()
- == self.0.len();
-
- equal
- }
-}
-
-impl ToSql for FileSha1 {
- fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
- return self.0.to_sql();
- }
-}
-
-impl FromSql for FileSha1 {
- fn column_result(value: ValueRef) -> FromSqlResult<Self> {
- let bytes = value.as_blob()?;
- Ok(FileSha1(
- bytes
- .into_iter()
- .map(|val| val.to_owned())
- .collect::<Vec<u8>>(),
- ))
- }
+ pub mtime: Timestamp,
}
#[derive(Debug)]
@@ -133,7 +79,8 @@ impl VectorDatabase {
id INTEGER PRIMARY KEY AUTOINCREMENT,
worktree_id INTEGER NOT NULL,
relative_path VARCHAR NOT NULL,
- sha1 BLOB NOT NULL,
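+            -- mtime is stored as two integers so the original SystemTime round-trips exactly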
+ mtime_seconds INTEGER NOT NULL,
+ mtime_nanos INTEGER NOT NULL,
FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
)",
[],
@@ -170,11 +117,20 @@ impl VectorDatabase {
",
params![worktree_id, indexed_file.path.to_str()],
)?;
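+        // Convert the file's SystemTime into a proto Timestamp (seconds, nanos) for storage.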
+ let mtime = Timestamp::from(indexed_file.mtime);
self.db.execute(
"
- INSERT INTO files (worktree_id, relative_path, sha1) VALUES (?1, ?2, $3);
+ INSERT INTO files
+ (worktree_id, relative_path, mtime_seconds, mtime_nanos)
+ VALUES
+            (?1, ?2, ?3, ?4);
",
- params![worktree_id, indexed_file.path.to_str(), indexed_file.sha1],
+ params![
+ worktree_id,
+ indexed_file.path.to_str(),
+ mtime.seconds,
+ mtime.nanos
+ ],
)?;
let file_id = self.db.last_insert_rowid();
@@ -224,13 +180,24 @@ impl VectorDatabase {
Ok(self.db.last_insert_rowid())
}
- pub fn get_file_hashes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, FileSha1>> {
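+    /// Returns the last-indexed mtime for every file in the given worktree, keyed by relative path.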
+ pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
let mut statement = self.db.prepare(
- "SELECT relative_path, sha1 FROM files WHERE worktree_id = ?1 ORDER BY relative_path",
+ "
+ SELECT relative_path, mtime_seconds, mtime_nanos
+ FROM files
+ WHERE worktree_id = ?1
+ ORDER BY relative_path",
)?;
- let mut result: HashMap<PathBuf, FileSha1> = HashMap::new();
+ let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
for row in statement.query_map(params![worktree_id], |row| {
- Ok((row.get::<_, String>(0)?.into(), row.get(1)?))
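+                // Rebuild each file's SystemTime from the stored (seconds, nanos) pair.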
+ Ok((
+ row.get::<_, String>(0)?.into(),
+ Timestamp {
+ seconds: row.get(1)?,
+ nanos: row.get(2)?,
+ }
+ .into(),
+ ))
})? {
let row = row?;
result.insert(row.0, row.1);
@@ -6,7 +6,7 @@ mod modal;
mod vector_store_tests;
use anyhow::{anyhow, Result};
-use db::{FileSha1, VectorDatabase};
+use db::VectorDatabase;
use embedding::{EmbeddingProvider, OpenAIEmbeddings};
use gpui::{AppContext, Entity, ModelContext, ModelHandle, Task, ViewContext};
use language::{Language, LanguageRegistry};
@@ -15,9 +15,10 @@ use project::{Fs, Project, WorktreeId};
use smol::channel;
use std::{
cmp::Ordering,
- collections::{HashMap, HashSet},
+ collections::HashMap,
path::{Path, PathBuf},
sync::Arc,
+ time::SystemTime,
};
use tree_sitter::{Parser, QueryCursor};
use util::{
@@ -46,6 +47,7 @@ pub fn init(
VectorStore::new(
fs,
db_file_path,
+            // Alternative provider, e.g. for testing without calling the OpenAI API:
+            // Arc::new(embedding::DummyEmbeddings {}),
Arc::new(OpenAIEmbeddings {
client: http_client,
}),
@@ -91,7 +93,7 @@ pub fn init(
#[derive(Debug)]
pub struct IndexedFile {
path: PathBuf,
- sha1: FileSha1,
+ mtime: SystemTime,
documents: Vec<Document>,
}
@@ -131,9 +133,10 @@ impl VectorStore {
cursor: &mut QueryCursor,
parser: &mut Parser,
embedding_provider: &dyn EmbeddingProvider,
+ fs: &Arc<dyn Fs>,
language: Arc<Language>,
file_path: PathBuf,
- content: String,
+ mtime: SystemTime,
) -> Result<IndexedFile> {
let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
let embedding_config = grammar
@@ -141,6 +144,8 @@ impl VectorStore {
.as_ref()
.ok_or_else(|| anyhow!("no outline query"))?;
+ let content = fs.load(&file_path).await?;
+
parser.set_language(grammar.ts_language).unwrap();
let tree = parser
.parse(&content, None)
@@ -184,11 +189,9 @@ impl VectorStore {
}
}
- let sha1 = FileSha1::from_str(content);
-
return Ok(IndexedFile {
path: file_path,
- sha1,
+ mtime,
documents,
});
}
@@ -231,38 +234,36 @@ impl VectorStore {
            // Here we query the worktree ids, and yet we don't have them elsewhere.
            // We likely want to clean up these data structures.
- let (db, worktree_hashes, worktree_db_ids) = cx
+ let (db, mut worktree_file_times, worktree_db_ids) = cx
.background()
.spawn({
let worktrees = worktrees.clone();
async move {
let mut worktree_db_ids: HashMap<WorktreeId, i64> = HashMap::new();
- let mut hashes: HashMap<WorktreeId, HashMap<PathBuf, FileSha1>> =
+ let mut file_times: HashMap<WorktreeId, HashMap<PathBuf, SystemTime>> =
HashMap::new();
for worktree in worktrees {
let worktree_db_id =
db.find_or_create_worktree(worktree.abs_path().as_ref())?;
worktree_db_ids.insert(worktree.id(), worktree_db_id);
- hashes.insert(worktree.id(), db.get_file_hashes(worktree_db_id)?);
+ file_times.insert(worktree.id(), db.get_file_mtimes(worktree_db_id)?);
}
- anyhow::Ok((db, hashes, worktree_db_ids))
+ anyhow::Ok((db, file_times, worktree_db_ids))
}
})
.await?;
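+            // Worker input is now (worktree_db_id, path, language, mtime); file contents are loaded by the workers themselves.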
let (paths_tx, paths_rx) =
- channel::unbounded::<(i64, PathBuf, String, Arc<Language>)>();
+ channel::unbounded::<(i64, PathBuf, Arc<Language>, SystemTime)>();
let (delete_paths_tx, delete_paths_rx) = channel::unbounded::<(i64, PathBuf)>();
let (indexed_files_tx, indexed_files_rx) = channel::unbounded::<(i64, IndexedFile)>();
cx.background()
.spawn({
- let fs = fs.clone();
let worktree_db_ids = worktree_db_ids.clone();
async move {
for worktree in worktrees.into_iter() {
- let file_hashes = &worktree_hashes[&worktree.id()];
- let mut files_included =
- file_hashes.keys().collect::<HashSet<&PathBuf>>();
+ let mut file_mtimes =
+ worktree_file_times.remove(&worktree.id()).unwrap();
for file in worktree.files(false, 0) {
let absolute_path = worktree.absolutize(&file.path);
@@ -278,30 +279,26 @@ impl VectorStore {
continue;
}
- if let Some(content) = fs.load(&absolute_path).await.log_err() {
- let path_buf = file.path.to_path_buf();
- let already_stored = file_hashes.get(&path_buf).map_or(
- false,
- |existing_hash| {
- files_included.remove(&path_buf);
- existing_hash.equals(&content)
- },
- );
-
- if !already_stored {
- paths_tx
- .try_send((
- worktree_db_ids[&worktree.id()],
- path_buf,
- content,
- language,
- ))
- .unwrap();
- }
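+                        // A file is unchanged if its current mtime matches the one recorded when it was last indexed.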
+                        let path_buf = file.path.to_path_buf();
+                        let stored_mtime = file_mtimes.remove(&path_buf);
+                        let already_stored = stored_mtime
+                            .map_or(false, |existing_mtime| existing_mtime == file.mtime);
+
+ if !already_stored {
+ paths_tx
+ .try_send((
+ worktree_db_ids[&worktree.id()],
+ path_buf,
+ language,
+ file.mtime,
+ ))
+ .unwrap();
}
}
}
- for file in files_included {
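+                    // Anything left in file_mtimes was not seen on disk, so remove it from the index.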
+ for file in file_mtimes.keys() {
delete_paths_tx
.try_send((worktree_db_ids[&worktree.id()], file.to_owned()))
.unwrap();
@@ -336,16 +333,17 @@ impl VectorStore {
scope.spawn(async {
let mut parser = Parser::new();
let mut cursor = QueryCursor::new();
- while let Ok((worktree_id, file_path, content, language)) =
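+                        // Each worker receives a path plus its mtime, then loads and parses the file.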
+ while let Ok((worktree_id, file_path, language, mtime)) =
paths_rx.recv().await
{
if let Some(indexed_file) = Self::index_file(
&mut cursor,
&mut parser,
embedding_provider.as_ref(),
+ &fs,
language,
file_path,
- content,
+ mtime,
)
.await
.log_err()
@@ -395,6 +393,8 @@ impl VectorStore {
})
.collect::<Vec<_>>();
+ log::info!("Searching for: {:?}", phrase);
+
let embedding_provider = self.embedding_provider.clone();
let database_url = self.database_url.clone();
cx.spawn(|this, cx| async move {