Update vector_store to accommodate full-file parsing for JSON, TOML and YAML files

KCaverly created

Change summary

Cargo.lock                                    | 14 +++++++++-
crates/vector_store/Cargo.toml                |  2 
crates/vector_store/src/parsing.rs            | 26 +++++++++++++++++++++
crates/vector_store/src/vector_store.rs       | 11 ++++----
crates/vector_store/src/vector_store_tests.rs | 18 +++++++++++++
5 files changed, 62 insertions(+), 9 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -8134,6 +8134,16 @@ dependencies = [
  "tree-sitter",
 ]
 
+[[package]]
+name = "tree-sitter-toml"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8"
+dependencies = [
+ "cc",
+ "tree-sitter",
+]
+
 [[package]]
 name = "tree-sitter-typescript"
 version = "0.20.2"
@@ -8508,8 +8518,8 @@ dependencies = [
  "theme",
  "tiktoken-rs 0.5.0",
  "tree-sitter",
- "tree-sitter-javascript",
  "tree-sitter-rust",
+ "tree-sitter-toml 0.20.0",
  "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "unindent",
  "util",
@@ -9560,7 +9570,7 @@ dependencies = [
  "tree-sitter-ruby",
  "tree-sitter-rust",
  "tree-sitter-scheme",
- "tree-sitter-toml",
+ "tree-sitter-toml 0.5.1",
  "tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)",
  "tree-sitter-yaml",
  "unindent",

crates/vector_store/Cargo.toml 🔗

@@ -51,6 +51,6 @@ tempdir.workspace = true
 ctor.workspace = true
 env_logger.workspace = true
 
-tree-sitter-javascript = "*"
 tree-sitter-typescript = "*"
 tree-sitter-rust = "*"
+tree-sitter-toml = "*"

crates/vector_store/src/parsing.rs 🔗

@@ -13,6 +13,9 @@ pub struct Document {
 
 const CODE_CONTEXT_TEMPLATE: &str =
     "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+const ENTIRE_FILE_TEMPLATE: &str =
+    "The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"];
 
 pub struct CodeContextRetriever {
     pub parser: Parser,
@@ -27,12 +30,35 @@ impl CodeContextRetriever {
         }
     }
 
+    fn _parse_entire_file( // wraps the whole file into a single templated Document
+        &self,
+        relative_path: &Path,
+        language_name: Arc<str>,
+        content: &str,
+    ) -> Result<Vec<Document>> {
+        let document_span = ENTIRE_FILE_TEMPLATE
+            .replace("<path>", relative_path.to_string_lossy().as_ref())
+            .replace("<language>", language_name.as_ref())
+            .replace("<item>", &content); // match the whole placeholder, angle brackets included
+
+        Ok(vec![Document {
+            range: 0..content.len(), // byte range spanning the entire file
+            content: document_span,
+            embedding: Vec::new(),
+            name: language_name.to_string(), // the language name doubles as the document name
+        }])
+    }
+
     pub fn parse_file(
         &mut self,
         relative_path: &Path,
         content: &str,
         language: Arc<Language>,
     ) -> Result<Vec<Document>> {
+        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
+            return self._parse_entire_file(relative_path, language.name(), &content);
+        }
+
         let grammar = language
             .grammar()
             .ok_or_else(|| anyhow!("no grammar for language"))?;

crates/vector_store/src/vector_store.rs 🔗

@@ -19,7 +19,7 @@ use gpui::{
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
 use parking_lot::Mutex;
-use parsing::{CodeContextRetriever, Document};
+use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
@@ -537,10 +537,11 @@ impl VectorStore {
                                 .language_for_file(&absolute_path, None)
                                 .await
                             {
-                                if language
-                                    .grammar()
-                                    .and_then(|grammar| grammar.embedding_config.as_ref())
-                                    .is_none()
+                                if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref())
+                                    && language
+                                        .grammar()
+                                        .and_then(|grammar| grammar.embedding_config.as_ref())
+                                        .is_none()
                                 {
                                     continue;
                                 }

crates/vector_store/src/vector_store_tests.rs 🔗

@@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
                         println!(\"bbbb!\");
                     }
                 ".unindent(),
+                "file3.toml": "
+                    ZZZZZZZ = 5
+                    ".unindent(),
             }
         }),
     )
@@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
 
     let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
     let rust_language = rust_lang();
+    let toml_language = toml_lang();
     languages.add(rust_language);
+    languages.add(toml_language);
 
     let db_dir = tempdir::TempDir::new("vector-store").unwrap();
     let db_path = db_dir.path().join("db.sqlite");
@@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
         .update(cx, |store, cx| store.index_project(project.clone(), cx))
         .await
         .unwrap();
-    assert_eq!(file_count, 2);
+    assert_eq!(file_count, 3);
     cx.foreground().run_until_parked();
     store.update(cx, |store, _cx| {
         assert_eq!(
@@ -578,3 +583,14 @@ fn rust_lang() -> Arc<Language> {
         .unwrap(),
     )
 }
+
+fn toml_lang() -> Arc<Language> { // test helper: builds a TOML `Language` for the registry
+    Arc::new(Language::new(
+        LanguageConfig {
+            name: "TOML".into(), // same spelling as the "TOML" entry in PARSEABLE_ENTIRE_FILE_TYPES
+            path_suffixes: vec!["toml".into()],
+            ..Default::default()
+        },
+        Some(tree_sitter_toml::language()), // grammar from the tree-sitter-toml crate added to Cargo.toml in this change
+    ))
+}