Updated vector store indexing to only use languages with an embedding.scm Tree-sitter query

KCaverly and maxbrunsfeld created

Co-authored-by: maxbrunsfeld <max@zed.dev>

Change summary

crates/language/src/language.rs               | 44 +++++++++++++++++++++
crates/vector_store/src/vector_store.rs       | 22 +++++++--
crates/vector_store/src/vector_store_tests.rs |  2 
crates/zed/src/languages.rs                   |  1 
crates/zed/src/languages/rust/embedding.scm   | 36 +++++++++++++++++
5 files changed, 98 insertions(+), 7 deletions(-)

Detailed changes

crates/language/src/language.rs 🔗

@@ -350,6 +350,7 @@ pub struct LanguageQueries {
     pub brackets: Option<Cow<'static, str>>,
     pub indents: Option<Cow<'static, str>>,
     pub outline: Option<Cow<'static, str>>,
+    pub embedding: Option<Cow<'static, str>>,
     pub injections: Option<Cow<'static, str>>,
     pub overrides: Option<Cow<'static, str>>,
 }
@@ -495,6 +496,7 @@ pub struct Grammar {
     pub(crate) brackets_config: Option<BracketConfig>,
     pub(crate) indents_config: Option<IndentConfig>,
     pub outline_config: Option<OutlineConfig>,
+    pub embedding_config: Option<EmbeddingConfig>,
     pub(crate) injection_config: Option<InjectionConfig>,
     pub(crate) override_config: Option<OverrideConfig>,
     pub(crate) highlight_map: Mutex<HighlightMap>,
@@ -516,6 +518,15 @@ pub struct OutlineConfig {
     pub extra_context_capture_ix: Option<u32>,
 }
 
+/// A compiled embedding query together with the indices of its captures.
+///
+/// Constructed by `Language::with_embedding_query`, which only stores a config
+/// when both the `item` and `name` capture indices were resolved.
+#[derive(Debug)]
+pub struct EmbeddingConfig {
+    /// The compiled query that the capture indices below refer to.
+    pub query: Query,
+    /// Index of the `item` capture.
+    pub item_capture_ix: u32,
+    /// Index of the `name` capture.
+    pub name_capture_ix: u32,
+    /// Index of the `context` capture, when one was resolved.
+    pub context_capture_ix: Option<u32>,
+    /// Index of the `context.extra` capture, when one was resolved.
+    pub extra_context_capture_ix: Option<u32>,
+}
+
 struct InjectionConfig {
     query: Query,
     content_capture_ix: u32,
@@ -1145,6 +1156,7 @@ impl Language {
                     highlights_query: None,
                     brackets_config: None,
                     outline_config: None,
+                    embedding_config: None,
                     indents_config: None,
                     injection_config: None,
                     override_config: None,
@@ -1181,6 +1193,9 @@ impl Language {
         if let Some(query) = queries.outline {
             self = self.with_outline_query(query.as_ref())?;
         }
+        if let Some(query) = queries.embedding {
+            self = self.with_embedding_query(query.as_ref())?;
+        }
         if let Some(query) = queries.injections {
             self = self.with_injection_query(query.as_ref())?;
         }
@@ -1189,6 +1204,7 @@ impl Language {
         }
         Ok(self)
     }
+
     pub fn with_highlights_query(mut self, source: &str) -> Result<Self> {
         let grammar = self.grammar_mut();
         grammar.highlights_query = Some(Query::new(grammar.ts_language, source)?);
@@ -1223,6 +1239,34 @@ impl Language {
         Ok(self)
     }
 
+    /// Compiles `source` as an embedding query for this language's grammar.
+    ///
+    /// Resolves the `item`, `name`, `context`, and `context.extra` captures via
+    /// `get_capture_indices`. An `EmbeddingConfig` is stored on the grammar only
+    /// when both the `item` and `name` indices were resolved; otherwise
+    /// `embedding_config` is left unchanged and `Ok(self)` is still returned.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by `Query::new` for `source`.
+    pub fn with_embedding_query(mut self, source: &str) -> Result<Self> {
+        let grammar = self.grammar_mut();
+        let query = Query::new(grammar.ts_language, source)?;
+        let mut item_capture_ix = None;
+        let mut name_capture_ix = None;
+        let mut context_capture_ix = None;
+        let mut extra_context_capture_ix = None;
+        // NOTE(review): presumably fills each slot with the index of the
+        // same-named capture in `query`, leaving it `None` when the capture is
+        // absent; confirm against `get_capture_indices`.
+        get_capture_indices(
+            &query,
+            &mut [
+                ("item", &mut item_capture_ix),
+                ("name", &mut name_capture_ix),
+                ("context", &mut context_capture_ix),
+                ("context.extra", &mut extra_context_capture_ix),
+            ],
+        );
+        // `item` and `name` are required; the two context captures are optional.
+        if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) {
+            grammar.embedding_config = Some(EmbeddingConfig {
+                query,
+                item_capture_ix,
+                name_capture_ix,
+                context_capture_ix,
+                extra_context_capture_ix,
+            });
+        }
+        Ok(self)
+    }
+
     pub fn with_brackets_query(mut self, source: &str) -> Result<Self> {
         let grammar = self.grammar_mut();
         let query = Query::new(grammar.ts_language, source)?;

crates/vector_store/src/vector_store.rs 🔗

@@ -136,8 +136,8 @@ impl VectorStore {
         content: String,
     ) -> Result<IndexedFile> {
         let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
-        let outline_config = grammar
-            .outline_config
+        let embedding_config = grammar
+            .embedding_config
             .as_ref()
             .ok_or_else(|| anyhow!("no outline query"))?;
 
@@ -148,13 +148,17 @@ impl VectorStore {
 
         let mut documents = Vec::new();
         let mut context_spans = Vec::new();
-        for mat in cursor.matches(&outline_config.query, tree.root_node(), content.as_bytes()) {
+        for mat in cursor.matches(
+            &embedding_config.query,
+            tree.root_node(),
+            content.as_bytes(),
+        ) {
             let mut item_range = None;
             let mut name_range = None;
             for capture in mat.captures {
-                if capture.index == outline_config.item_capture_ix {
+                if capture.index == embedding_config.item_capture_ix {
                     item_range = Some(capture.node.byte_range());
-                } else if capture.index == outline_config.name_capture_ix {
+                } else if capture.index == embedding_config.name_capture_ix {
                     name_range = Some(capture.node.byte_range());
                 }
             }
@@ -266,7 +270,11 @@ impl VectorStore {
                                     .language_for_file(&absolute_path, None)
                                     .await
                                 {
-                                    if language.name().as_ref() != "Rust" {
+                                    if language
+                                        .grammar()
+                                        .and_then(|grammar| grammar.embedding_config.as_ref())
+                                        .is_none()
+                                    {
                                         continue;
                                     }
 
@@ -359,6 +367,8 @@ impl VectorStore {
                 this.worktree_db_ids.extend(worktree_db_ids);
             });
 
+            log::info!("Semantic Indexing Complete!");
+
             anyhow::Ok(())
         })
     }

crates/vector_store/src/vector_store_tests.rs 🔗

@@ -46,7 +46,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
             },
             Some(tree_sitter_rust::language()),
         )
-        .with_outline_query(
+        .with_embedding_query(
             r#"
             (function_item
                 name: (identifier) @name

crates/zed/src/languages.rs 🔗

@@ -170,6 +170,7 @@ fn load_queries(name: &str) -> LanguageQueries {
         brackets: load_query(name, "/brackets"),
         indents: load_query(name, "/indents"),
         outline: load_query(name, "/outline"),
+        embedding: load_query(name, "/embedding"),
         injections: load_query(name, "/injections"),
         overrides: load_query(name, "/overrides"),
     }

crates/zed/src/languages/rust/embedding.scm 🔗

@@ -0,0 +1,36 @@
+; Embedding query for Rust.
+; Every pattern captures the whole definition node as @item, its name as
+; @name, and leading modifiers/keywords as @context. These capture names are
+; the ones looked up by `Language::with_embedding_query`.
+; Struct definitions.
+(struct_item
+    (visibility_modifier)? @context
+    "struct" @context
+    name: (_) @name) @item

+; Enum definitions.
+(enum_item
+    (visibility_modifier)? @context
+    "enum" @context
+    name: (_) @name) @item

+; Impl blocks. Both the optional trait and the implementing type are captured
+; as @name, so a single match can carry two @name captures.
+(impl_item
+    "impl" @context
+    trait: (_)? @name
+    "for"? @context
+    type: (_) @name) @item

+; Trait definitions.
+(trait_item
+    (visibility_modifier)? @context
+    "trait" @context
+    name: (_) @name) @item

+; Function definitions.
+(function_item
+    (visibility_modifier)? @context
+    (function_modifiers)? @context
+    "fn" @context
+    name: (_) @name) @item

+; Function signature items, matched with the same captures as `function_item`.
+(function_signature_item
+    (visibility_modifier)? @context
+    (function_modifiers)? @context
+    "fn" @context
+    name: (_) @name) @item

+; `macro_rules!` definitions. The leading `.` anchors the keyword as the first
+; child of the node.
+(macro_definition
+    . "macro_rules!" @context
+    name: (_) @name) @item