Rename `Sha1` to `DocumentDigest`

Created by Antonio Scandurra and Kyle Caverly

Co-Authored-By: Kyle Caverly <kyle@zed.dev>

Change summary

crates/semantic_index/src/db.rs                   | 12 +++-
crates/semantic_index/src/parsing.rs              | 35 +++++++---------
crates/semantic_index/src/semantic_index.rs       |  2 +-
crates/semantic_index/src/semantic_index_tests.rs |  6 +-
4 files changed, 28 insertions(+), 27 deletions(-)

Detailed changes

crates/semantic_index/src/db.rs 🔗

@@ -1,4 +1,8 @@
-use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
+use crate::{
+    embedding::Embedding,
+    parsing::{Document, DocumentDigest},
+    SEMANTIC_INDEX_VERSION,
+};
 use anyhow::{anyhow, Context, Result};
 use futures::channel::oneshot;
 use gpui::executor;
@@ -165,7 +169,7 @@ impl VectorDatabase {
                     end_byte INTEGER NOT NULL,
                     name VARCHAR NOT NULL,
                     embedding BLOB NOT NULL,
-                    sha1 BLOB NOT NULL,
+                    digest BLOB NOT NULL,
                     FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
                 )",
                 [],
@@ -225,14 +229,14 @@ impl VectorDatabase {
             // I imagine we can speed this up with a bulk insert of some kind.
             for document in documents {
                 db.execute(
-                    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
                     params![
                         file_id,
                         document.range.start.to_string(),
                         document.range.end.to_string(),
                         document.name,
                         document.embedding,
-                        document.sha1
+                        document.digest
                     ],
                 )?;
             }

crates/semantic_index/src/parsing.rs 🔗

@@ -1,11 +1,11 @@
-use crate::embedding::{EmbeddingProvider, Embedding};
+use crate::embedding::{Embedding, EmbeddingProvider};
 use anyhow::{anyhow, Result};
 use language::{Grammar, Language};
 use rusqlite::{
     types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
     ToSql,
 };
-use sha1::Digest;
+use sha1::{Digest, Sha1};
 use std::{
     cmp::{self, Reverse},
     collections::HashSet,
@@ -15,10 +15,10 @@ use std::{
 };
 use tree_sitter::{Parser, QueryCursor};
 
-#[derive(Debug, PartialEq, Clone)]
-pub struct Sha1([u8; 20]);
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct DocumentDigest([u8; 20]);
 
-impl FromSql for Sha1 {
+impl FromSql for DocumentDigest {
     fn column_result(value: ValueRef) -> FromSqlResult<Self> {
         let blob = value.as_blob()?;
         let bytes =
@@ -27,19 +27,19 @@ impl FromSql for Sha1 {
                     expected_size: 20,
                     blob_size: blob.len(),
                 })?;
-        return Ok(Sha1(bytes));
+        return Ok(DocumentDigest(bytes));
     }
 }
 
-impl ToSql for Sha1 {
+impl ToSql for DocumentDigest {
     fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
         self.0.to_sql()
     }
 }
 
-impl From<&'_ str> for Sha1 {
+impl From<&'_ str> for DocumentDigest {
     fn from(value: &'_ str) -> Self {
-        let mut sha1 = sha1::Sha1::new();
+        let mut sha1 = Sha1::new();
         sha1.update(value);
         Self(sha1.finalize().into())
     }
@@ -51,7 +51,7 @@ pub struct Document {
     pub range: Range<usize>,
     pub content: String,
     pub embedding: Option<Embedding>,
-    pub sha1: Sha1,
+    pub digest: DocumentDigest,
     pub token_count: usize,
 }
 
@@ -102,17 +102,14 @@ impl CodeContextRetriever {
             .replace("<path>", relative_path.to_string_lossy().as_ref())
             .replace("<language>", language_name.as_ref())
             .replace("<item>", &content);
-
-        let sha1 = Sha1::from(document_span.as_str());
-
+        let digest = DocumentDigest::from(document_span.as_str());
         let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
-
         Ok(vec![Document {
             range: 0..content.len(),
             content: document_span,
             embedding: Default::default(),
             name: language_name.to_string(),
-            sha1,
+            digest,
             token_count,
         }])
     }
@@ -121,14 +118,14 @@ impl CodeContextRetriever {
         let document_span = MARKDOWN_CONTEXT_TEMPLATE
             .replace("<path>", relative_path.to_string_lossy().as_ref())
             .replace("<item>", &content);
-        let sha1 = Sha1::from(document_span.as_str());
+        let digest = DocumentDigest::from(document_span.as_str());
         let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
         Ok(vec![Document {
             range: 0..content.len(),
             content: document_span,
             embedding: None,
             name: "Markdown".to_string(),
-            sha1,
+            digest,
             token_count,
         }])
     }
@@ -308,13 +305,13 @@ impl CodeContextRetriever {
                 );
             }
 
-            let sha1 = Sha1::from(document_content.as_str());
+            let sha1 = DocumentDigest::from(document_content.as_str());
             documents.push(Document {
                 name,
                 content: document_content,
                 range: item_range.clone(),
                 embedding: None,
-                sha1,
+                digest: sha1,
                 token_count: 0,
             })
         }

crates/semantic_index/src/semantic_index.rs 🔗

@@ -37,7 +37,7 @@ use util::{
 };
 use workspace::WorkspaceCreated;
 
-const SEMANTIC_INDEX_VERSION: usize = 7;
+const SEMANTIC_INDEX_VERSION: usize = 8;
 const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
 
 pub fn init(

crates/semantic_index/src/semantic_index_tests.rs 🔗

@@ -1,7 +1,7 @@
 use crate::{
     embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
     embedding_queue::EmbeddingQueue,
-    parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
+    parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
     semantic_index_settings::SemanticIndexSettings,
     FileToEmbed, JobHandle, SearchResult, SemanticIndex,
 };
@@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
                         .with_simple_text()
                         .take(content_len)
                         .collect::<String>();
-                    let sha1 = Sha1::from(content.as_str());
+                    let digest = DocumentDigest::from(content.as_str());
                     Document {
                         range: 0..10,
                         embedding: None,
                         name: format!("document {document_ix}"),
                         content,
-                        sha1,
+                        digest,
                         token_count: rng.gen_range(10..30),
                     }
                 })