Pulled tree-sitter parsing into its own file for ease of testing and management

Created by KCaverly

Change summary

crates/vector_store/src/db.rs           |   4 
crates/vector_store/src/parsing.rs      |  94 ++++++++++++++++++++
crates/vector_store/src/vector_store.rs | 121 +++-----------------------
3 files changed, 110 insertions(+), 109 deletions(-)
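
The new parsing.rs centralizes the tree-sitter query-matching loop that pairs an item capture with a name capture (see the parse_file hunk under Detailed changes below). For reference, here is a minimal, self-contained sketch of that pattern outside Zed; it assumes the older tree-sitter / tree-sitter-rust crate APIs used in this diff (Language passed by value) and a hand-written query whose @item/@name captures stand in for the grammar's embedding_config:

    use tree_sitter::{Parser, Query, QueryCursor};

    fn main() {
        let language = tree_sitter_rust::language();
        let mut parser = Parser::new();
        parser.set_language(language).unwrap();

        let source = "fn add(a: i32, b: i32) -> i32 { a + b }";
        let tree = parser.parse(source, None).expect("parsing failed");

        // Stand-in for grammar.embedding_config: one pattern, two captures.
        let query =
            Query::new(language, "(function_item name: (identifier) @name) @item").unwrap();
        let item_ix = query.capture_index_for_name("item").unwrap();
        let name_ix = query.capture_index_for_name("name").unwrap();

        let mut cursor = QueryCursor::new();
        for mat in cursor.matches(&query, tree.root_node(), source.as_bytes()) {
            // Pair up the byte ranges of each item and its name, as parse_file does.
            let mut item_range = None;
            let mut name_range = None;
            for capture in mat.captures {
                if capture.index == item_ix {
                    item_range = Some(capture.node.byte_range());
                } else if capture.index == name_ix {
                    name_range = Some(capture.node.byte_range());
                }
            }
            if let Some((item, name)) = item_range.zip(name_range) {
                let offset = item.start;
                println!("{} at byte {}: {}", &source[name], offset, &source[item]);
            }
        }
    }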

Detailed changes

crates/vector_store/src/db.rs

@@ -7,7 +7,7 @@ use std::{
 
 use anyhow::{anyhow, Result};
 
-use crate::IndexedFile;
+use crate::parsing::ParsedFile;
 use rpc::proto::Timestamp;
 use rusqlite::{
     params,
@@ -109,7 +109,7 @@ impl VectorDatabase {
         Ok(())
     }
 
-    pub fn insert_file(&self, worktree_id: i64, indexed_file: IndexedFile) -> Result<()> {
+    pub fn insert_file(&self, worktree_id: i64, indexed_file: ParsedFile) -> Result<()> {
         // Write to files table, and return generated id.
         self.db.execute(
             "

crates/vector_store/src/parsing.rs

@@ -0,0 +1,94 @@
+use std::{ops::Range, path::PathBuf, sync::Arc, time::SystemTime};
+
+use anyhow::{anyhow, Ok, Result};
+use project::Fs;
+use tree_sitter::{Parser, QueryCursor};
+
+use crate::PendingFile;
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct Document {
+    pub offset: usize,
+    pub name: String,
+    pub embedding: Vec<f32>,
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct ParsedFile {
+    pub path: PathBuf,
+    pub mtime: SystemTime,
+    pub documents: Vec<Document>,
+}
+
+pub struct CodeContextRetriever {
+    pub parser: Parser,
+    pub cursor: QueryCursor,
+    pub fs: Arc<dyn Fs>,
+}
+
+impl CodeContextRetriever {
+    pub async fn parse_file(
+        &mut self,
+        pending_file: PendingFile,
+    ) -> Result<(ParsedFile, Vec<String>)> {
+        let grammar = pending_file
+            .language
+            .grammar()
+            .ok_or_else(|| anyhow!("no grammar for language"))?;
+        let embedding_config = grammar
+            .embedding_config
+            .as_ref()
+            .ok_or_else(|| anyhow!("no embedding queries"))?;
+
+        let content = self.fs.load(&pending_file.absolute_path).await?;
+
+        self.parser.set_language(grammar.ts_language).unwrap();
+
+        let tree = self
+            .parser
+            .parse(&content, None)
+            .ok_or_else(|| anyhow!("parsing failed"))?;
+
+        let mut documents = Vec::new();
+        let mut context_spans = Vec::new();
+
+        // Iterate through query matches
+        for mat in self.cursor.matches(
+            &embedding_config.query,
+            tree.root_node(),
+            content.as_bytes(),
+        ) {
+            let mut item_range: Option<Range<usize>> = None;
+            let mut name_range: Option<Range<usize>> = None;
+            for capture in mat.captures {
+                if capture.index == embedding_config.item_capture_ix {
+                    item_range = Some(capture.node.byte_range());
+                } else if capture.index == embedding_config.name_capture_ix {
+                    name_range = Some(capture.node.byte_range());
+                }
+            }
+
+            if let Some((item_range, name_range)) = item_range.zip(name_range) {
+                if let Some((item, name)) =
+                    content.get(item_range.clone()).zip(content.get(name_range))
+                {
+                    context_spans.push(item.to_string());
+                    documents.push(Document {
+                        name: name.to_string(),
+                        offset: item_range.start,
+                        embedding: Vec::new(),
+                    });
+                }
+            }
+        }
+
+        return Ok((
+            ParsedFile {
+                path: pending_file.relative_path,
+                mtime: pending_file.modified_time,
+                documents,
+            },
+            context_spans,
+        ));
+    }
+}

crates/vector_store/src/vector_store.rs

@@ -1,6 +1,7 @@
 mod db;
 mod embedding;
 mod modal;
+mod parsing;
 
 #[cfg(test)]
 mod vector_store_tests;
@@ -15,6 +16,7 @@ use gpui::{
 };
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
+use parsing::{CodeContextRetriever, ParsedFile};
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
@@ -38,13 +40,6 @@ use workspace::{Workspace, WorkspaceCreated};
 const REINDEXING_DELAY_SECONDS: u64 = 3;
 const EMBEDDINGS_BATCH_SIZE: usize = 150;
 
-#[derive(Debug, Clone)]
-pub struct Document {
-    pub offset: usize,
-    pub name: String,
-    pub embedding: Vec<f32>,
-}
-
 pub fn init(
     fs: Arc<dyn Fs>,
     http_client: Arc<dyn HttpClient>,
@@ -113,13 +108,6 @@ pub fn init(
     .detach();
 }
 
-#[derive(Debug, Clone)]
-pub struct IndexedFile {
-    path: PathBuf,
-    mtime: SystemTime,
-    documents: Vec<Document>,
-}
-
 pub struct VectorStore {
     fs: Arc<dyn Fs>,
     database_url: Arc<PathBuf>,
@@ -182,7 +170,7 @@ impl ProjectState {
 }
 
 #[derive(Clone, Debug)]
-struct PendingFile {
+pub struct PendingFile {
     worktree_db_id: i64,
     relative_path: PathBuf,
     absolute_path: PathBuf,
@@ -201,7 +189,7 @@ pub struct SearchResult {
 enum DbWrite {
     InsertFile {
         worktree_id: i64,
-        indexed_file: IndexedFile,
+        indexed_file: ParsedFile,
     },
     Delete {
         worktree_id: i64,
@@ -267,7 +255,7 @@ impl VectorStore {
 
             // embed_tx/rx: Embed Batch and Send to Database
             let (embed_batch_tx, embed_batch_rx) =
-                channel::unbounded::<Vec<(i64, IndexedFile, Vec<String>)>>();
+                channel::unbounded::<Vec<(i64, ParsedFile, Vec<String>)>>();
             let mut _embed_batch_task = Vec::new();
             for _ in 0..1 {
                 //cx.background().num_cpus() {
@@ -324,13 +312,14 @@ impl VectorStore {
 
             // batch_tx/rx: Batch Files to Send for Embeddings
             let (batch_files_tx, batch_files_rx) =
-                channel::unbounded::<(i64, IndexedFile, Vec<String>)>();
+                channel::unbounded::<(i64, ParsedFile, Vec<String>)>();
             let _batch_files_task = cx.background().spawn(async move {
                 let mut queue_len = 0;
                 let mut embeddings_queue = vec![];
                 while let Ok((worktree_id, indexed_file, document_spans)) =
                     batch_files_rx.recv().await
                 {
+                    dbg!("Batching in while loop");
                     queue_len += &document_spans.len();
                     embeddings_queue.push((worktree_id, indexed_file, document_spans));
                     if queue_len >= EMBEDDINGS_BATCH_SIZE {
@@ -339,6 +328,7 @@ impl VectorStore {
                         queue_len = 0;
                     }
                 }
+                // TODO: This never gets called; we need a way to flush the embedding batch when it holds fewer than EMBEDDINGS_BATCH_SIZE spans.
                 if queue_len > 0 {
                     embed_batch_tx.try_send(embeddings_queue).unwrap();
                 }
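
One possible way to address the TODO above (a sketch only, not part of this change): flush a partially filled batch whenever the incoming channel goes idle rather than only when it closes, reusing the variables from the loop above and async-channel's Receiver::is_empty, which smol::channel re-exports:

    while let Ok((worktree_id, indexed_file, document_spans)) = batch_files_rx.recv().await {
        queue_len += document_spans.len();
        embeddings_queue.push((worktree_id, indexed_file, document_spans));
        // Flush when the batch is full, or when nothing else is currently waiting to be batched.
        if queue_len >= EMBEDDINGS_BATCH_SIZE || batch_files_rx.is_empty() {
            embed_batch_tx.try_send(std::mem::take(&mut embeddings_queue)).unwrap();
            queue_len = 0;
        }
    }
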
@@ -353,21 +343,14 @@ impl VectorStore {
                 let parsing_files_rx = parsing_files_rx.clone();
                 let batch_files_tx = batch_files_tx.clone();
                 _parsing_files_tasks.push(cx.background().spawn(async move {
-                    let mut parser = Parser::new();
-                    let mut cursor = QueryCursor::new();
+                    let parser = Parser::new();
+                    let cursor = QueryCursor::new();
+                    let mut retriever = CodeContextRetriever { parser, cursor, fs };
                     while let Ok(pending_file) = parsing_files_rx.recv().await {
                         log::info!("Parsing File: {:?}", &pending_file.relative_path);
-                        if let Some((indexed_file, document_spans)) = Self::index_file(
-                            &mut cursor,
-                            &mut parser,
-                            &fs,
-                            pending_file.language,
-                            pending_file.relative_path.clone(),
-                            pending_file.absolute_path.clone(),
-                            pending_file.modified_time,
-                        )
-                        .await
-                        .log_err()
+
+                        if let Some((indexed_file, document_spans)) =
+                            retriever.parse_file(pending_file.clone()).await.log_err()
                         {
                             batch_files_tx
                                 .try_send((
@@ -397,82 +380,6 @@ impl VectorStore {
         }))
     }
 
-    async fn index_file(
-        cursor: &mut QueryCursor,
-        parser: &mut Parser,
-        fs: &Arc<dyn Fs>,
-        language: Arc<Language>,
-        relative_file_path: PathBuf,
-        absolute_file_path: PathBuf,
-        mtime: SystemTime,
-    ) -> Result<(IndexedFile, Vec<String>)> {
-        let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
-        let embedding_config = grammar
-            .embedding_config
-            .as_ref()
-            .ok_or_else(|| anyhow!("no outline query"))?;
-
-        let content = fs.load(&absolute_file_path).await?;
-
-        parser.set_language(grammar.ts_language).unwrap();
-        let tree = parser
-            .parse(&content, None)
-            .ok_or_else(|| anyhow!("parsing failed"))?;
-
-        let mut documents = Vec::new();
-        let mut context_spans = Vec::new();
-        for mat in cursor.matches(
-            &embedding_config.query,
-            tree.root_node(),
-            content.as_bytes(),
-        ) {
-            let mut item_range = None;
-            let mut name_range = None;
-            let mut context_range = None;
-            for capture in mat.captures {
-                if capture.index == embedding_config.item_capture_ix {
-                    item_range = Some(capture.node.byte_range());
-                } else if capture.index == embedding_config.name_capture_ix {
-                    name_range = Some(capture.node.byte_range());
-                }
-                if let Some(context_capture_ix) = embedding_config.context_capture_ix {
-                    if capture.index == context_capture_ix {
-                        context_range = Some(capture.node.byte_range());
-                    }
-                }
-            }
-
-            if let Some((item_range, name_range)) = item_range.zip(name_range) {
-                let mut context_data = String::new();
-                if let Some(context_range) = context_range {
-                    if let Some(context) = content.get(context_range.clone()) {
-                        context_data.push_str(context);
-                    }
-                }
-
-                if let Some((item, name)) =
-                    content.get(item_range.clone()).zip(content.get(name_range))
-                {
-                    context_spans.push(item.to_string());
-                    documents.push(Document {
-                        name: format!("{} {}", context_data.to_string(), name.to_string()),
-                        offset: item_range.start,
-                        embedding: Vec::new(),
-                    });
-                }
-            }
-        }
-
-        return Ok((
-            IndexedFile {
-                path: relative_file_path,
-                mtime,
-                documents,
-            },
-            context_spans,
-        ));
-    }
-
     fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
         let (tx, rx) = oneshot::channel();
         self.db_update_tx