Detailed changes
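Across these hunks, the `Sha1` newtype in `parsing` becomes `DocumentDigest` (now deriving `Eq` and `Hash` in addition to `Debug`, `PartialEq`, and `Clone`), the `sha1` field on `Document` and the `sha1` column in the `documents` table become `digest`, and `SEMANTIC_INDEX_VERSION` is bumped from 7 to 8.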
@@ -1,4 +1,8 @@
-use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
+use crate::{
+ embedding::Embedding,
+ parsing::{Document, DocumentDigest},
+ SEMANTIC_INDEX_VERSION,
+};
use anyhow::{anyhow, Context, Result};
use futures::channel::oneshot;
use gpui::executor;
@@ -165,7 +169,7 @@ impl VectorDatabase {
end_byte INTEGER NOT NULL,
name VARCHAR NOT NULL,
embedding BLOB NOT NULL,
- sha1 BLOB NOT NULL,
+ digest BLOB NOT NULL,
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
)",
[],
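The `digest` column stores the `DocumentDigest` blob for each embedded span, and `DocumentDigest` implements `FromSql`, so stored digests can be read back and compared against freshly parsed documents. A minimal sketch of such a lookup, assuming rusqlite; the helper name and query are illustrative and not part of this diff:

// Hypothetical helper: fetch the digests of spans already embedded for a file.
fn embedded_digests(
    db: &rusqlite::Connection,
    file_id: i64,
) -> rusqlite::Result<Vec<DocumentDigest>> {
    let mut stmt = db.prepare("SELECT digest FROM documents WHERE file_id = ?1")?;
    stmt.query_map([file_id], |row| row.get::<_, DocumentDigest>(0))?
        .collect::<rusqlite::Result<Vec<_>>>()
}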
@@ -225,14 +229,14 @@ impl VectorDatabase {
// I imagine we can speed this up with a bulk insert of some kind.
for document in documents {
db.execute(
- "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+ "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
params![
file_id,
document.range.start.to_string(),
document.range.end.to_string(),
document.name,
document.embedding,
- document.sha1
+ document.digest
],
)?;
}
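The loop above still issues one INSERT per document, and the comment about a bulk insert is left in place. One way to cut per-statement overhead, sketched here under the same rusqlite API the file already uses (not part of this change), is to prepare the statement once outside the loop:

// Sketch only: reuse a prepared statement instead of re-parsing the SQL per document.
let mut insert = db.prepare(
    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest)
     VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
)?;
for document in documents {
    insert.execute(params![
        file_id,
        document.range.start.to_string(),
        document.range.end.to_string(),
        document.name,
        document.embedding,
        document.digest
    ])?;
}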
@@ -1,11 +1,11 @@
-use crate::embedding::{EmbeddingProvider, Embedding};
+use crate::embedding::{Embedding, EmbeddingProvider};
use anyhow::{anyhow, Result};
use language::{Grammar, Language};
use rusqlite::{
types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
ToSql,
};
-use sha1::Digest;
+use sha1::{Digest, Sha1};
use std::{
cmp::{self, Reverse},
collections::HashSet,
@@ -15,10 +15,10 @@ use std::{
};
use tree_sitter::{Parser, QueryCursor};

-#[derive(Debug, PartialEq, Clone)]
-pub struct Sha1([u8; 20]);
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct DocumentDigest([u8; 20]);

-impl FromSql for Sha1 {
+impl FromSql for DocumentDigest {
fn column_result(value: ValueRef) -> FromSqlResult<Self> {
let blob = value.as_blob()?;
let bytes =
@@ -27,19 +27,19 @@ impl FromSql for Sha1 {
expected_size: 20,
blob_size: blob.len(),
})?;
- return Ok(Sha1(bytes));
+ return Ok(DocumentDigest(bytes));
}
}

-impl ToSql for Sha1 {
+impl ToSql for DocumentDigest {
fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
self.0.to_sql()
}
}

-impl From<&'_ str> for Sha1 {
+impl From<&'_ str> for DocumentDigest {
fn from(value: &'_ str) -> Self {
- let mut sha1 = sha1::Sha1::new();
+ let mut sha1 = Sha1::new();
sha1.update(value);
Self(sha1.finalize().into())
}
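With `Eq` and `Hash` now derived, digests built from identical content compare equal and can be collected into a `HashSet`, for example to skip spans whose rendered content has not changed. A small illustrative snippet, not taken from the diff:

use std::collections::HashSet;

// Illustrative only: equal content yields equal digests, and the new Eq/Hash
// derives let digests serve as set or map keys.
fn digest_example() {
    let old = DocumentDigest::from("fn add(a: i32, b: i32) -> i32 { a + b }");
    let new = DocumentDigest::from("fn add(a: i32, b: i32) -> i32 { a + b }");
    assert_eq!(old, new);

    let mut seen: HashSet<DocumentDigest> = HashSet::new();
    seen.insert(old);
    assert!(seen.contains(&new));
}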
@@ -51,7 +51,7 @@ pub struct Document {
pub range: Range<usize>,
pub content: String,
pub embedding: Option<Embedding>,
- pub sha1: Sha1,
+ pub digest: DocumentDigest,
pub token_count: usize,
}
@@ -102,17 +102,14 @@ impl CodeContextRetriever {
.replace("<path>", relative_path.to_string_lossy().as_ref())
.replace("<language>", language_name.as_ref())
.replace("<item>", &content);
-
- let sha1 = Sha1::from(document_span.as_str());
-
+ let digest = DocumentDigest::from(document_span.as_str());
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
-
Ok(vec![Document {
range: 0..content.len(),
content: document_span,
embedding: Default::default(),
name: language_name.to_string(),
- sha1,
+ digest,
token_count,
}])
}
@@ -121,14 +118,14 @@ impl CodeContextRetriever {
let document_span = MARKDOWN_CONTEXT_TEMPLATE
.replace("<path>", relative_path.to_string_lossy().as_ref())
.replace("<item>", &content);
- let sha1 = Sha1::from(document_span.as_str());
+ let digest = DocumentDigest::from(document_span.as_str());
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
Ok(vec![Document {
range: 0..content.len(),
content: document_span,
embedding: None,
name: "Markdown".to_string(),
- sha1,
+ digest,
token_count,
}])
}
@@ -308,13 +305,13 @@ impl CodeContextRetriever {
);
}

- let sha1 = Sha1::from(document_content.as_str());
+ let digest = DocumentDigest::from(document_content.as_str());
documents.push(Document {
name,
content: document_content,
range: item_range.clone(),
embedding: None,
- sha1,
+ digest,
token_count: 0,
})
}
@@ -37,7 +37,7 @@ use util::{
};
use workspace::WorkspaceCreated;

-const SEMANTIC_INDEX_VERSION: usize = 7;
+const SEMANTIC_INDEX_VERSION: usize = 8;
const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);

pub fn init(
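Since the `documents` table now declares a `digest` column instead of `sha1`, the version bump from 7 to 8 presumably causes databases built against the old schema to be discarded and rebuilt.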
@@ -1,7 +1,7 @@
use crate::{
embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
embedding_queue::EmbeddingQueue,
- parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
+ parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
semantic_index_settings::SemanticIndexSettings,
FileToEmbed, JobHandle, SearchResult, SemanticIndex,
};
@@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
.with_simple_text()
.take(content_len)
.collect::<String>();
- let sha1 = Sha1::from(content.as_str());
+ let digest = DocumentDigest::from(content.as_str());
Document {
range: 0..10,
embedding: None,
name: format!("document {document_ix}"),
content,
- sha1,
+ digest,
token_count: rng.gen_range(10..30),
}
})