From a56d454a0760420899d8c6582bde60e851efcf27 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 10:10:24 -0400 Subject: [PATCH 01/34] added semantic search support for c --- crates/zed/src/languages/c/embedding.scm | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 crates/zed/src/languages/c/embedding.scm diff --git a/crates/zed/src/languages/c/embedding.scm b/crates/zed/src/languages/c/embedding.scm new file mode 100644 index 0000000000000000000000000000000000000000..cd1915f62bb5c27f7617bde91327a78129564511 --- /dev/null +++ b/crates/zed/src/languages/c/embedding.scm @@ -0,0 +1,39 @@ +(declaration + (type_qualifier)? @context + type: (_)? @context + declarator: [ + (function_declarator + declarator: (_) @name) + (pointer_declarator + "*" @context + declarator: (function_declarator + declarator: (_) @name)) + (pointer_declarator + "*" @context + declarator: (pointer_declarator + "*" @context + declarator: (function_declarator + declarator: (_) @name))) + ] +) @item + +(function_definition + (type_qualifier)? @context + type: (_)? @context + declarator: [ + (function_declarator + declarator: (_) @name + ) + (pointer_declarator + "*" @context + declarator: (function_declarator + declarator: (_) @name + )) + (pointer_declarator + "*" @context + declarator: (pointer_declarator + "*" @context + declarator: (function_declarator + declarator: (_) @name))) + ] +) @item From 5eab62858004493879172ff4576ca32ced8e6bea Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 14:33:31 -0400 Subject: [PATCH 02/34] Added go parsing for semantic search, and added preceeding comments on go and rust. 
Co-authored-by: Alex Co-authored-by: maxbrunsfeld --- crates/language/src/language.rs | 4 -- crates/vector_store/src/parsing.rs | 23 ++++---- crates/zed/src/languages/go/embedding.scm | 24 +++++++++ crates/zed/src/languages/rust/embedding.scm | 58 ++++++++------------- 4 files changed, 55 insertions(+), 54 deletions(-) create mode 100644 crates/zed/src/languages/go/embedding.scm diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index dbd35f0e87bc602ac91aaf8196b81a4a017fff93..4ec5e88a7edde55c90e975c4bd944e3f38c1bb8b 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -525,7 +525,6 @@ pub struct EmbeddingConfig { pub item_capture_ix: u32, pub name_capture_ix: u32, pub context_capture_ix: Option, - pub extra_context_capture_ix: Option, } struct InjectionConfig { @@ -1246,14 +1245,12 @@ impl Language { let mut item_capture_ix = None; let mut name_capture_ix = None; let mut context_capture_ix = None; - let mut extra_context_capture_ix = None; get_capture_indices( &query, &mut [ ("item", &mut item_capture_ix), ("name", &mut name_capture_ix), ("context", &mut context_capture_ix), - ("context.extra", &mut extra_context_capture_ix), ], ); if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) { @@ -1262,7 +1259,6 @@ impl Language { item_capture_ix, name_capture_ix, context_capture_ix, - extra_context_capture_ix, }); } Ok(self) diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 91dcf699f8c3add9088b2af4f0d4df59b7551ac2..3e697399b1fa5b6dc2a42a29ec97f1e490643613 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -53,7 +53,7 @@ impl CodeContextRetriever { .ok_or_else(|| anyhow!("parsing failed"))?; let mut documents = Vec::new(); - let mut context_spans = Vec::new(); + let mut document_texts = Vec::new(); // Iterate through query matches for mat in self.cursor.matches( @@ -61,11 +61,10 @@ impl 
CodeContextRetriever { tree.root_node(), content.as_bytes(), ) { - // log::info!("-----MATCH-----"); - let mut name: Vec<&str> = vec![]; let mut item: Option<&str> = None; let mut offset: Option = None; + let mut context_spans: Vec<&str> = vec![]; for capture in mat.captures { if capture.index == embedding_config.item_capture_ix { offset = Some(capture.node.byte_range().start); @@ -79,25 +78,21 @@ impl CodeContextRetriever { if let Some(context_capture_ix) = embedding_config.context_capture_ix { if capture.index == context_capture_ix { if let Some(context) = content.get(capture.node.byte_range()) { - name.push(context); + context_spans.push(context); } } } } if item.is_some() && offset.is_some() && name.len() > 0 { - let context_span = CODE_CONTEXT_TEMPLATE + let item = format!("{}\n{}", context_spans.join("\n"), item.unwrap()); + + let document_text = CODE_CONTEXT_TEMPLATE .replace("", pending_file.relative_path.to_str().unwrap()) .replace("", &pending_file.language.name().to_lowercase()) - .replace("", item.unwrap()); - - let mut truncated_span = context_span.clone(); - truncated_span.truncate(100); - - // log::info!("Name: {:?}", name); - // log::info!("Span: {:?}", truncated_span); + .replace("", item.as_str()); - context_spans.push(context_span); + document_texts.push(document_text); documents.push(Document { name: name.join(" "), offset: offset.unwrap(), @@ -112,7 +107,7 @@ impl CodeContextRetriever { mtime: pending_file.modified_time, documents, }, - context_spans, + document_texts, )); } } diff --git a/crates/zed/src/languages/go/embedding.scm b/crates/zed/src/languages/go/embedding.scm new file mode 100644 index 0000000000000000000000000000000000000000..9d8700cdfb57d1008acc09c11013f2046e7bd157 --- /dev/null +++ b/crates/zed/src/languages/go/embedding.scm @@ -0,0 +1,24 @@ +( + (comment)* @context + . + (type_declaration + (type_spec + name: (_) @name) + ) @item +) + +( + (comment)* @context + . 
+ (function_declaration + name: (_) @name + ) @item +) + +( + (comment)* @context + . + (method_declaration + name: (_) @name + ) @item +) diff --git a/crates/zed/src/languages/rust/embedding.scm b/crates/zed/src/languages/rust/embedding.scm index ea8bab9f68113a9b725e094a4d31f3e572c4bed7..3aec101e9fbb5d63a49db52869f34757135b0ab2 100644 --- a/crates/zed/src/languages/rust/embedding.scm +++ b/crates/zed/src/languages/rust/embedding.scm @@ -1,36 +1,22 @@ -(struct_item - (visibility_modifier)? @context - "struct" @context - name: (_) @name) @item - -(enum_item - (visibility_modifier)? @context - "enum" @context - name: (_) @name) @item - -(impl_item - "impl" @context - trait: (_)? @name - "for"? @context - type: (_) @name) @item - -(trait_item - (visibility_modifier)? @context - "trait" @context - name: (_) @name) @item - -(function_item - (visibility_modifier)? @context - (function_modifiers)? @context - "fn" @context - name: (_) @name) @item - -(function_signature_item - (visibility_modifier)? @context - (function_modifiers)? @context - "fn" @context - name: (_) @name) @item - -(macro_definition - . "macro_rules!" @context - name: (_) @name) @item +( + (line_comment)* @context + . + [ + (enum_item + name: (_) @name) @item + (struct_item + name: (_) @name) @item + (impl_item + trait: (_)? @name + "for"? 
@name + type: (_) @name) @item + (trait_item + name: (_) @name) @item + (function_item + name: (_) @name) @item + (macro_definition + name: (_) @name) @item + (function_signature_item + name: (_) @name) @item + ] +) From 0a0e40fb246b3f1e0e8751f24bf008387f223c4b Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 16:34:32 -0400 Subject: [PATCH 03/34] refactored code context retrieval and standardized database migration Co-authored-by: maxbrunsfeld --- Cargo.lock | 2 + crates/vector_store/Cargo.toml | 3 + crates/vector_store/src/db.rs | 132 +++++++++++------ crates/vector_store/src/modal.rs | 2 +- crates/vector_store/src/parsing.rs | 82 +++++----- crates/vector_store/src/vector_store.rs | 140 ++++++++++-------- crates/vector_store/src/vector_store_tests.rs | 21 ++- 7 files changed, 233 insertions(+), 149 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0ac6a2ee890418104d6851a961d321f1ef7e8f36..4359659a53bad7b2b33bca0fa9e41cd6ae09b11f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8483,7 +8483,9 @@ dependencies = [ "anyhow", "async-trait", "bincode", + "ctor", "editor", + "env_logger 0.9.3", "futures 0.3.28", "gpui", "isahc", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 40bff8b95c167e43c9f20d31e47871d52d5ff8b1..8e1dea59fd8c0fe890291388fccaa9ac7cd3443d 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -44,6 +44,9 @@ rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"]} tree-sitter-rust = "*" + rand.workspace = true unindent.workspace = true tempdir.workspace = true +ctor.workspace = true +env_logger.workspace = true diff --git a/crates/vector_store/src/db.rs b/crates/vector_store/src/db.rs index a91a1872b59774a1863ae2a9ff867cf1b7ad39b3..d3d05f8c62c9d5639e641094204caa112e96c54f 100644 --- a/crates/vector_store/src/db.rs +++ 
b/crates/vector_store/src/db.rs @@ -1,21 +1,21 @@ +use crate::{parsing::Document, VECTOR_STORE_VERSION}; +use anyhow::{anyhow, Result}; +use project::Fs; +use rpc::proto::Timestamp; +use rusqlite::{ + params, + types::{FromSql, FromSqlResult, ValueRef}, +}; use std::{ cmp::Ordering, collections::HashMap, + ops::Range, path::{Path, PathBuf}, rc::Rc, + sync::Arc, time::SystemTime, }; -use anyhow::{anyhow, Result}; - -use crate::parsing::ParsedFile; -use crate::VECTOR_STORE_VERSION; -use rpc::proto::Timestamp; -use rusqlite::{ - params, - types::{FromSql, FromSqlResult, ValueRef}, -}; - #[derive(Debug)] pub struct FileRecord { pub id: usize, @@ -42,48 +42,88 @@ pub struct VectorDatabase { } impl VectorDatabase { - pub fn new(path: String) -> Result { + pub async fn new(fs: Arc, path: Arc) -> Result { + if let Some(db_directory) = path.parent() { + fs.create_dir(db_directory).await?; + } + let this = Self { - db: rusqlite::Connection::open(path)?, + db: rusqlite::Connection::open(path.as_path())?, }; this.initialize_database()?; Ok(this) } + fn get_existing_version(&self) -> Result { + let mut version_query = self.db.prepare("SELECT version from vector_store_config")?; + version_query + .query_row([], |row| Ok(row.get::<_, i64>(0)?)) + .map_err(|err| anyhow!("version query failed: {err}")) + } + fn initialize_database(&self) -> Result<()> { rusqlite::vtab::array::load_module(&self.db)?; - // This will create the database if it doesnt exist + if self + .get_existing_version() + .map_or(false, |version| version == VECTOR_STORE_VERSION as i64) + { + return Ok(()); + } + + self.db + .execute( + " + DROP TABLE vector_store_config; + DROP TABLE worktrees; + DROP TABLE files; + DROP TABLE documents; + ", + [], + ) + .ok(); // Initialize Vector Databasing Tables self.db.execute( - "CREATE TABLE IF NOT EXISTS worktrees ( + "CREATE TABLE vector_store_config ( + version INTEGER NOT NULL + )", + [], + )?; + + self.db.execute( + "INSERT INTO vector_store_config (version) VALUES 
(?1)", + params![VECTOR_STORE_VERSION], + )?; + + self.db.execute( + "CREATE TABLE worktrees ( id INTEGER PRIMARY KEY AUTOINCREMENT, absolute_path VARCHAR NOT NULL ); - CREATE UNIQUE INDEX IF NOT EXISTS worktrees_absolute_path ON worktrees (absolute_path); + CREATE UNIQUE INDEX worktrees_absolute_path ON worktrees (absolute_path); ", [], )?; self.db.execute( - "CREATE TABLE IF NOT EXISTS files ( + "CREATE TABLE files ( id INTEGER PRIMARY KEY AUTOINCREMENT, worktree_id INTEGER NOT NULL, relative_path VARCHAR NOT NULL, mtime_seconds INTEGER NOT NULL, mtime_nanos INTEGER NOT NULL, - vector_store_version INTEGER NOT NULL, FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE )", [], )?; self.db.execute( - "CREATE TABLE IF NOT EXISTS documents ( + "CREATE TABLE documents ( id INTEGER PRIMARY KEY AUTOINCREMENT, file_id INTEGER NOT NULL, - offset INTEGER NOT NULL, + start_byte INTEGER NOT NULL, + end_byte INTEGER NOT NULL, name VARCHAR NOT NULL, embedding BLOB NOT NULL, FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE @@ -102,43 +142,44 @@ impl VectorDatabase { Ok(()) } - pub fn insert_file(&self, worktree_id: i64, indexed_file: ParsedFile) -> Result<()> { + pub fn insert_file( + &self, + worktree_id: i64, + path: PathBuf, + mtime: SystemTime, + documents: Vec, + ) -> Result<()> { // Write to files table, and return generated id. 
self.db.execute( " DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2; ", - params![worktree_id, indexed_file.path.to_str()], + params![worktree_id, path.to_str()], )?; - let mtime = Timestamp::from(indexed_file.mtime); + let mtime = Timestamp::from(mtime); self.db.execute( " INSERT INTO files - (worktree_id, relative_path, mtime_seconds, mtime_nanos, vector_store_version) + (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES - (?1, ?2, $3, $4, $5); + (?1, ?2, $3, $4); ", - params![ - worktree_id, - indexed_file.path.to_str(), - mtime.seconds, - mtime.nanos, - VECTOR_STORE_VERSION - ], + params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos], )?; let file_id = self.db.last_insert_rowid(); // Currently inserting at approximately 3400 documents a second // I imagine we can speed this up with a bulk insert of some kind. - for document in indexed_file.documents { + for document in documents { let embedding_blob = bincode::serialize(&document.embedding)?; self.db.execute( - "INSERT INTO documents (file_id, offset, name, embedding) VALUES (?1, ?2, ?3, ?4)", + "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding) VALUES (?1, ?2, ?3, ?4, $5)", params![ file_id, - document.offset.to_string(), + document.range.start.to_string(), + document.range.end.to_string(), document.name, embedding_blob ], @@ -204,7 +245,7 @@ impl VectorDatabase { worktree_ids: &[i64], query_embedding: &Vec, limit: usize, - ) -> Result> { + ) -> Result, String)>> { let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1); self.for_each_document(&worktree_ids, |id, embedding| { let similarity = dot(&embedding, &query_embedding); @@ -248,11 +289,18 @@ impl VectorDatabase { Ok(()) } - fn get_documents_by_ids(&self, ids: &[i64]) -> Result> { + fn get_documents_by_ids( + &self, + ids: &[i64], + ) -> Result, String)>> { let mut statement = self.db.prepare( " SELECT - documents.id, files.worktree_id, files.relative_path, documents.offset, documents.name 
+ documents.id, + files.worktree_id, + files.relative_path, + documents.start_byte, + documents.end_byte, documents.name FROM documents, files WHERE @@ -266,15 +314,15 @@ impl VectorDatabase { row.get::<_, i64>(0)?, row.get::<_, i64>(1)?, row.get::<_, String>(2)?.into(), - row.get(3)?, - row.get(4)?, + row.get(3)?..row.get(4)?, + row.get(5)?, )) })?; - let mut values_by_id = HashMap::::default(); + let mut values_by_id = HashMap::, String)>::default(); for row in result_iter { - let (id, worktree_id, path, offset, name) = row?; - values_by_id.insert(id, (worktree_id, path, offset, name)); + let (id, worktree_id, path, range, name) = row?; + values_by_id.insert(id, (worktree_id, path, range, name)); } let mut results = Vec::with_capacity(ids.len()); diff --git a/crates/vector_store/src/modal.rs b/crates/vector_store/src/modal.rs index 9225fe8786e9173a82b32a9aedf5a7e979ff6f88..b797a208062ee623db11fbbfe40948847639465f 100644 --- a/crates/vector_store/src/modal.rs +++ b/crates/vector_store/src/modal.rs @@ -66,7 +66,7 @@ impl PickerDelegate for SemanticSearchDelegate { }); let workspace = self.workspace.clone(); - let position = search_result.clone().offset; + let position = search_result.clone().byte_range.start; cx.spawn(|_, mut cx| async move { let buffer = buffer.await?; workspace.update(&mut cx, |workspace, cx| { diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 3e697399b1fa5b6dc2a42a29ec97f1e490643613..23dcf505c92896b1eb18499d4b05b633d1c37bf7 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -1,41 +1,39 @@ -use std::{path::PathBuf, sync::Arc, time::SystemTime}; - use anyhow::{anyhow, Ok, Result}; -use project::Fs; +use language::Language; +use std::{ops::Range, path::Path, sync::Arc}; use tree_sitter::{Parser, QueryCursor}; -use crate::PendingFile; - #[derive(Debug, PartialEq, Clone)] pub struct Document { - pub offset: usize, pub name: String, + pub range: Range, + pub content: 
String, pub embedding: Vec, } -#[derive(Debug, PartialEq, Clone)] -pub struct ParsedFile { - pub path: PathBuf, - pub mtime: SystemTime, - pub documents: Vec, -} - const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; pub struct CodeContextRetriever { pub parser: Parser, pub cursor: QueryCursor, - pub fs: Arc, } impl CodeContextRetriever { - pub async fn parse_file( + pub fn new() -> Self { + Self { + parser: Parser::new(), + cursor: QueryCursor::new(), + } + } + + pub fn parse_file( &mut self, - pending_file: PendingFile, - ) -> Result<(ParsedFile, Vec)> { - let grammar = pending_file - .language + relative_path: &Path, + content: &str, + language: Arc, + ) -> Result> { + let grammar = language .grammar() .ok_or_else(|| anyhow!("no grammar for language"))?; let embedding_config = grammar @@ -43,8 +41,6 @@ impl CodeContextRetriever { .as_ref() .ok_or_else(|| anyhow!("no embedding queries"))?; - let content = self.fs.load(&pending_file.absolute_path).await?; - self.parser.set_language(grammar.ts_language).unwrap(); let tree = self @@ -53,7 +49,6 @@ impl CodeContextRetriever { .ok_or_else(|| anyhow!("parsing failed"))?; let mut documents = Vec::new(); - let mut document_texts = Vec::new(); // Iterate through query matches for mat in self.cursor.matches( @@ -63,11 +58,11 @@ impl CodeContextRetriever { ) { let mut name: Vec<&str> = vec![]; let mut item: Option<&str> = None; - let mut offset: Option = None; + let mut byte_range: Option> = None; let mut context_spans: Vec<&str> = vec![]; for capture in mat.captures { if capture.index == embedding_config.item_capture_ix { - offset = Some(capture.node.byte_range().start); + byte_range = Some(capture.node.byte_range()); item = content.get(capture.node.byte_range()); } else if capture.index == embedding_config.name_capture_ix { if let Some(name_content) = content.get(capture.node.byte_range()) { @@ -84,30 +79,25 @@ impl CodeContextRetriever { } } - if item.is_some() && offset.is_some() 
&& name.len() > 0 { - let item = format!("{}\n{}", context_spans.join("\n"), item.unwrap()); - - let document_text = CODE_CONTEXT_TEMPLATE - .replace("", pending_file.relative_path.to_str().unwrap()) - .replace("", &pending_file.language.name().to_lowercase()) - .replace("", item.as_str()); - - document_texts.push(document_text); - documents.push(Document { - name: name.join(" "), - offset: offset.unwrap(), - embedding: Vec::new(), - }) + if let Some((item, byte_range)) = item.zip(byte_range) { + if !name.is_empty() { + let item = format!("{}\n{}", context_spans.join("\n"), item); + + let document_text = CODE_CONTEXT_TEMPLATE + .replace("", relative_path.to_str().unwrap()) + .replace("", &language.name().to_lowercase()) + .replace("", item.as_str()); + + documents.push(Document { + range: byte_range, + content: document_text, + embedding: Vec::new(), + name: name.join(" ").to_string(), + }); + } } } - return Ok(( - ParsedFile { - path: pending_file.relative_path, - mtime: pending_file.modified_time, - documents, - }, - document_texts, - )); + return Ok(documents); } } diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 0a197bc40663034d4156cf025c731449cef725c7..3d9c32875eef17c6cc58b1bed1637c1b920c2b0f 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -18,16 +18,16 @@ use gpui::{ }; use language::{Language, LanguageRegistry}; use modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; -use parsing::{CodeContextRetriever, ParsedFile}; +use parsing::{CodeContextRetriever, Document}; use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId}; use smol::channel; use std::{ collections::HashMap, + ops::Range, path::{Path, PathBuf}, sync::Arc, time::{Duration, Instant, SystemTime}, }; -use tree_sitter::{Parser, QueryCursor}; use util::{ channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}, http::HttpClient, @@ -36,7 +36,7 @@ use util::{ }; use 
workspace::{Workspace, WorkspaceCreated}; -const VECTOR_STORE_VERSION: usize = 0; +const VECTOR_STORE_VERSION: usize = 1; const EMBEDDINGS_BATCH_SIZE: usize = 150; pub fn init( @@ -80,11 +80,11 @@ pub fn init( let vector_store = VectorStore::new( fs, db_file_path, - // Arc::new(embedding::DummyEmbeddings {}), - Arc::new(OpenAIEmbeddings { - client: http_client, - executor: cx.background(), - }), + Arc::new(embedding::DummyEmbeddings {}), + // Arc::new(OpenAIEmbeddings { + // client: http_client, + // executor: cx.background(), + // }), language_registry, cx.clone(), ) @@ -212,14 +212,16 @@ pub struct PendingFile { pub struct SearchResult { pub worktree_id: WorktreeId, pub name: String, - pub offset: usize, + pub byte_range: Range, pub file_path: PathBuf, } enum DbOperation { InsertFile { worktree_id: i64, - indexed_file: ParsedFile, + documents: Vec, + path: PathBuf, + mtime: SystemTime, }, Delete { worktree_id: i64, @@ -238,8 +240,9 @@ enum DbOperation { enum EmbeddingJob { Enqueue { worktree_id: i64, - parsed_file: ParsedFile, - document_spans: Vec, + path: PathBuf, + mtime: SystemTime, + documents: Vec, }, Flush, } @@ -256,18 +259,7 @@ impl VectorStore { let db = cx .background() - .spawn({ - let fs = fs.clone(); - let database_url = database_url.clone(); - async move { - if let Some(db_directory) = database_url.parent() { - fs.create_dir(db_directory).await.log_err(); - } - - let db = VectorDatabase::new(database_url.to_string_lossy().to_string())?; - anyhow::Ok(db) - } - }) + .spawn(VectorDatabase::new(fs.clone(), database_url.clone())) .await?; Ok(cx.add_model(|cx| { @@ -280,9 +272,12 @@ impl VectorStore { match job { DbOperation::InsertFile { worktree_id, - indexed_file, + documents, + path, + mtime, } => { - db.insert_file(worktree_id, indexed_file).log_err(); + db.insert_file(worktree_id, path, mtime, documents) + .log_err(); } DbOperation::Delete { worktree_id, path } => { db.delete_file(worktree_id, path).log_err(); @@ -304,35 +299,45 @@ impl VectorStore 
{ // embed_tx/rx: Embed Batch and Send to Database let (embed_batch_tx, embed_batch_rx) = - channel::unbounded::)>>(); + channel::unbounded::, PathBuf, SystemTime)>>(); let _embed_batch_task = cx.background().spawn({ let db_update_tx = db_update_tx.clone(); let embedding_provider = embedding_provider.clone(); async move { while let Ok(mut embeddings_queue) = embed_batch_rx.recv().await { // Construct Batch - let mut document_spans = vec![]; - for (_, _, document_span) in embeddings_queue.iter() { - document_spans.extend(document_span.iter().map(|s| s.as_str())); + let mut batch_documents = vec![]; + for (_, documents, _, _) in embeddings_queue.iter() { + batch_documents + .extend(documents.iter().map(|document| document.content.as_str())); } - if let Ok(embeddings) = embedding_provider.embed_batch(document_spans).await + if let Ok(embeddings) = + embedding_provider.embed_batch(batch_documents).await { + log::trace!( + "created {} embeddings for {} files", + embeddings.len(), + embeddings_queue.len(), + ); + let mut i = 0; let mut j = 0; for embedding in embeddings.iter() { - while embeddings_queue[i].1.documents.len() == j { + while embeddings_queue[i].1.len() == j { i += 1; j = 0; } - embeddings_queue[i].1.documents[j].embedding = embedding.to_owned(); + embeddings_queue[i].1[j].embedding = embedding.to_owned(); j += 1; } - for (worktree_id, indexed_file, _) in embeddings_queue.into_iter() { - for document in indexed_file.documents.iter() { + for (worktree_id, documents, path, mtime) in + embeddings_queue.into_iter() + { + for document in documents.iter() { // TODO: Update this so it doesn't panic assert!( document.embedding.len() > 0, @@ -343,7 +348,9 @@ impl VectorStore { db_update_tx .send(DbOperation::InsertFile { worktree_id, - indexed_file, + documents, + path, + mtime, }) .await .unwrap(); @@ -362,12 +369,13 @@ impl VectorStore { while let Ok(job) = batch_files_rx.recv().await { let should_flush = match job { EmbeddingJob::Enqueue { - document_spans, + 
documents, worktree_id, - parsed_file, + path, + mtime, } => { - queue_len += &document_spans.len(); - embeddings_queue.push((worktree_id, parsed_file, document_spans)); + queue_len += &documents.len(); + embeddings_queue.push((worktree_id, documents, path, mtime)); queue_len >= EMBEDDINGS_BATCH_SIZE } EmbeddingJob::Flush => true, @@ -385,26 +393,38 @@ impl VectorStore { let (parsing_files_tx, parsing_files_rx) = channel::unbounded::(); let mut _parsing_files_tasks = Vec::new(); - // for _ in 0..cx.background().num_cpus() { - for _ in 0..1 { + for _ in 0..cx.background().num_cpus() { let fs = fs.clone(); let parsing_files_rx = parsing_files_rx.clone(); let batch_files_tx = batch_files_tx.clone(); _parsing_files_tasks.push(cx.background().spawn(async move { - let parser = Parser::new(); - let cursor = QueryCursor::new(); - let mut retriever = CodeContextRetriever { parser, cursor, fs }; + let mut retriever = CodeContextRetriever::new(); while let Ok(pending_file) = parsing_files_rx.recv().await { - if let Some((indexed_file, document_spans)) = - retriever.parse_file(pending_file.clone()).await.log_err() + if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() { - batch_files_tx - .try_send(EmbeddingJob::Enqueue { - worktree_id: pending_file.worktree_db_id, - parsed_file: indexed_file, - document_spans, - }) - .unwrap(); + if let Some(documents) = retriever + .parse_file( + &pending_file.relative_path, + &content, + pending_file.language, + ) + .log_err() + { + log::trace!( + "parsed path {:?}: {} documents", + pending_file.relative_path, + documents.len() + ); + + batch_files_tx + .try_send(EmbeddingJob::Enqueue { + worktree_id: pending_file.worktree_db_id, + path: pending_file.relative_path, + mtime: pending_file.modified_time, + documents, + }) + .unwrap(); + } } if parsing_files_rx.len() == 0 { @@ -543,6 +563,7 @@ impl VectorStore { }); if !already_stored { + log::trace!("sending for parsing: {:?}", path_buf); parsing_files_tx 
.try_send(PendingFile { worktree_db_id: db_ids_by_worktree_id @@ -565,8 +586,8 @@ impl VectorStore { .unwrap(); } } - log::info!( - "Parsing Worktree Completed in {:?}", + log::trace!( + "parsing worktree completed in {:?}", t0.elapsed().as_millis() ); } @@ -622,11 +643,12 @@ impl VectorStore { let embedding_provider = self.embedding_provider.clone(); let database_url = self.database_url.clone(); + let fs = self.fs.clone(); cx.spawn(|this, cx| async move { let documents = cx .background() .spawn(async move { - let database = VectorDatabase::new(database_url.to_string_lossy().into())?; + let database = VectorDatabase::new(fs, database_url).await?; let phrase_embedding = embedding_provider .embed_batch(vec![&phrase]) @@ -648,12 +670,12 @@ impl VectorStore { Ok(documents .into_iter() - .filter_map(|(worktree_db_id, file_path, offset, name)| { + .filter_map(|(worktree_db_id, file_path, byte_range, name)| { let worktree_id = project_state.worktree_id_for_db_id(worktree_db_id)?; Some(SearchResult { worktree_id, name, - offset, + byte_range, file_path, }) }) diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index b6e47e7a2341e71a790a749642770e81cd147aaf..c4349c72808a90ff2baeedd2485a11592a56d87c 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -12,6 +12,13 @@ use settings::SettingsStore; use std::sync::Arc; use unindent::Unindent; +#[ctor::ctor] +fn init_logger() { + if std::env::var("RUST_LOG").is_ok() { + env_logger::init(); + } +} + #[gpui::test] async fn test_vector_store(cx: &mut TestAppContext) { cx.update(|cx| { @@ -95,11 +102,23 @@ async fn test_vector_store(cx: &mut TestAppContext) { .await .unwrap(); - assert_eq!(search_results[0].offset, 0); + assert_eq!(search_results[0].byte_range.start, 0); assert_eq!(search_results[0].name, "aaa"); assert_eq!(search_results[0].worktree_id, worktree_id); } +#[gpui::test] +async fn 
test_code_context_retrieval(cx: &mut TestAppContext) { + // let mut retriever = CodeContextRetriever::new(fs); + + // retriever::parse_file( + // " + // // + // ", + // ); + // +} + #[gpui::test] fn test_dot_product(mut rng: StdRng) { assert_eq!(dot(&[1., 0., 0., 0., 0.], &[0., 1., 0., 0., 0.]), 0.); From 623cb9833c17aaac11d4a2d4bea03295ffa842c4 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 16:58:42 -0400 Subject: [PATCH 04/34] add tests for rust context parsing, and update rust embedding query Co-authored-by: maxbrunsfeld --- crates/vector_store/src/parsing.rs | 6 +- crates/vector_store/src/vector_store_tests.rs | 156 ++++++++++++++---- crates/zed/src/languages/rust/embedding.scm | 64 +++++-- 3 files changed, 179 insertions(+), 47 deletions(-) diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 23dcf505c92896b1eb18499d4b05b633d1c37bf7..8d6e03d6eb29d524db23848adfa15a8ac6b4b164 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -81,7 +81,11 @@ impl CodeContextRetriever { if let Some((item, byte_range)) = item.zip(byte_range) { if !name.is_empty() { - let item = format!("{}\n{}", context_spans.join("\n"), item); + let item = if context_spans.is_empty() { + item.to_string() + } else { + format!("{}\n{}", context_spans.join("\n"), item) + }; let document_text = CODE_CONTEXT_TEMPLATE .replace("", relative_path.to_str().unwrap()) diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index c4349c72808a90ff2baeedd2485a11592a56d87c..ccdd9fdaf07605b36f025d3a4bad63a3a2f516c2 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -1,5 +1,9 @@ use crate::{ - db::dot, embedding::EmbeddingProvider, vector_store_settings::VectorStoreSettings, VectorStore, + db::dot, + embedding::EmbeddingProvider, + parsing::{CodeContextRetriever, Document}, + 
vector_store_settings::VectorStoreSettings, + VectorStore, }; use anyhow::Result; use async_trait::async_trait; @@ -9,7 +13,7 @@ use project::{project_settings::ProjectSettings, FakeFs, Project}; use rand::{rngs::StdRng, Rng}; use serde_json::json; use settings::SettingsStore; -use std::sync::Arc; +use std::{path::Path, sync::Arc}; use unindent::Unindent; #[ctor::ctor] @@ -52,24 +56,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { .await; let languages = Arc::new(LanguageRegistry::new(Task::ready(()))); - let rust_language = Arc::new( - Language::new( - LanguageConfig { - name: "Rust".into(), - path_suffixes: vec!["rs".into()], - ..Default::default() - }, - Some(tree_sitter_rust::language()), - ) - .with_embedding_query( - r#" - (function_item - name: (identifier) @name - body: (block)) @item - "#, - ) - .unwrap(), - ); + let rust_language = rust_lang(); languages.add(rust_language); let db_dir = tempdir::TempDir::new("vector-store").unwrap(); @@ -109,14 +96,59 @@ async fn test_vector_store(cx: &mut TestAppContext) { #[gpui::test] async fn test_code_context_retrieval(cx: &mut TestAppContext) { - // let mut retriever = CodeContextRetriever::new(fs); - - // retriever::parse_file( - // " - // // - // ", - // ); - // + let language = rust_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = " + /// A doc comment + /// that spans multiple lines + fn a() { + b + } + + impl C for D { + } + " + .unindent(); + + let parsed_files = retriever + .parse_file(Path::new("foo.rs"), &text, language) + .unwrap(); + + assert_eq!( + parsed_files, + &[ + Document { + name: "a".into(), + range: text.find("fn a").unwrap()..(text.find("}").unwrap() + 1), + content: " + The below code snippet is from file 'foo.rs' + + ```rust + /// A doc comment + /// that spans multiple lines + fn a() { + b + } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "C for D".into(), + range: text.find("impl C").unwrap()..(text.rfind("}").unwrap() + 1), + 
content: " + The below code snippet is from file 'foo.rs' + + ```rust + impl C for D { + } + ```" + .unindent(), + embedding: vec![], + } + ] + ); } #[gpui::test] @@ -178,3 +210,71 @@ impl EmbeddingProvider for FakeEmbeddingProvider { .collect()) } } + +fn rust_lang() -> Arc { + Arc::new( + Language::new( + LanguageConfig { + name: "Rust".into(), + path_suffixes: vec!["rs".into()], + ..Default::default() + }, + Some(tree_sitter_rust::language()), + ) + .with_embedding_query( + r#" + ( + (line_comment)* @context + . + (enum_item + name: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (struct_item + name: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (impl_item + trait: (_)? @name + "for"? @name + type: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (trait_item + name: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (function_item + name: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (macro_definition + name: (_) @name) @item + ) + + ( + (line_comment)* @context + . + (function_signature_item + name: (_) @name) @item + ) + "#, + ) + .unwrap(), + ) +} diff --git a/crates/zed/src/languages/rust/embedding.scm b/crates/zed/src/languages/rust/embedding.scm index 3aec101e9fbb5d63a49db52869f34757135b0ab2..66e4083de5f0fe8b1adfa2ea657668e4453e4b61 100644 --- a/crates/zed/src/languages/rust/embedding.scm +++ b/crates/zed/src/languages/rust/embedding.scm @@ -1,22 +1,50 @@ ( (line_comment)* @context . - [ - (enum_item - name: (_) @name) @item - (struct_item - name: (_) @name) @item - (impl_item - trait: (_)? @name - "for"? @name - type: (_) @name) @item - (trait_item - name: (_) @name) @item - (function_item - name: (_) @name) @item - (macro_definition - name: (_) @name) @item - (function_signature_item - name: (_) @name) @item - ] + (enum_item + name: (_) @name) @item +) + +( + (line_comment)* @context + . + (struct_item + name: (_) @name) @item +) + +( + (line_comment)* @context + . + (impl_item + trait: (_)? 
@name + "for"? @name + type: (_) @name) @item +) + +( + (line_comment)* @context + . + (trait_item + name: (_) @name) @item +) + +( + (line_comment)* @context + . + (function_item + name: (_) @name) @item +) + +( + (line_comment)* @context + . + (macro_definition + name: (_) @name) @item +) + +( + (line_comment)* @context + . + (function_signature_item + name: (_) @name) @item ) From d8fd0be59832d52ef7e21784a43c697c53e789e9 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 17:01:56 -0400 Subject: [PATCH 05/34] update vector store to remove dummy embeddings --- crates/vector_store/src/vector_store.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 3d9c32875eef17c6cc58b1bed1637c1b920c2b0f..d35798a58db607f4b979e34ac2c00ae1f7bef8bf 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -80,11 +80,10 @@ pub fn init( let vector_store = VectorStore::new( fs, db_file_path, - Arc::new(embedding::DummyEmbeddings {}), - // Arc::new(OpenAIEmbeddings { - // client: http_client, - // executor: cx.background(), - // }), + Arc::new(OpenAIEmbeddings { + client: http_client, + executor: cx.background(), + }), language_registry, cx.clone(), ) From b38e3b804c7e1124c8a41ac3fb471c305e522639 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 13 Jul 2023 18:14:44 -0400 Subject: [PATCH 06/34] remove reindexing subscription, and add status methods for vector store Co-authored-by: maxbrunsfeld --- Cargo.lock | 1 + crates/vector_store/Cargo.toml | 1 + crates/vector_store/src/modal.rs | 2 +- crates/vector_store/src/vector_store.rs | 379 +++++++----------- crates/vector_store/src/vector_store_tests.rs | 78 +++- 5 files changed, 208 insertions(+), 253 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4359659a53bad7b2b33bca0fa9e41cd6ae09b11f..239aa6a302ded4391422e1c2d8752236f4019bb3 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -8493,6 +8493,7 @@ dependencies = [ "lazy_static", "log", "matrixmultiply", + "parking_lot 0.11.2", "picker", "project", "rand 0.8.5", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 8e1dea59fd8c0fe890291388fccaa9ac7cd3443d..bac9cdedfafc4567f24b7502e0f9ea9e4d0e71e3 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -33,6 +33,7 @@ async-trait.workspace = true bincode = "1.3.3" matrixmultiply = "0.3.7" tiktoken-rs = "0.5.0" +parking_lot.workspace = true rand.workspace = true schemars.workspace = true diff --git a/crates/vector_store/src/modal.rs b/crates/vector_store/src/modal.rs index b797a208062ee623db11fbbfe40948847639465f..2981fa4e73ef77ce3b54b68da9b177452f6d245e 100644 --- a/crates/vector_store/src/modal.rs +++ b/crates/vector_store/src/modal.rs @@ -124,7 +124,7 @@ impl PickerDelegate for SemanticSearchDelegate { if let Some(retrieved) = retrieved_cached.log_err() { if !retrieved { let task = vector_store.update(&mut cx, |store, cx| { - store.search(project.clone(), query.to_string(), 10, cx) + store.search_project(project.clone(), query.to_string(), 10, cx) }); if let Some(results) = task.await.log_err() { diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index d35798a58db607f4b979e34ac2c00ae1f7bef8bf..3f7ab5c6cd1b2e0296ee560071377a84a6c527db 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -18,15 +18,19 @@ use gpui::{ }; use language::{Language, LanguageRegistry}; use modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; +use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document}; -use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId}; +use project::{Fs, Project, WorktreeId}; use smol::channel; use std::{ - collections::HashMap, + collections::{HashMap, HashSet}, ops::Range, path::{Path, PathBuf}, - sync::Arc, - time::{Duration, Instant, SystemTime}, + 
sync::{ + atomic::{self, AtomicUsize}, + Arc, Weak, + }, + time::{Instant, SystemTime}, }; use util::{ channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}, @@ -99,7 +103,7 @@ pub fn init( let project = workspace.read(cx).project().clone(); if project.read(cx).is_local() { vector_store.update(cx, |store, cx| { - store.add_project(project, cx).detach(); + store.index_project(project, cx).detach(); }); } } @@ -124,13 +128,20 @@ pub struct VectorStore { _embed_batch_task: Task<()>, _batch_files_task: Task<()>, _parsing_files_tasks: Vec>, + next_job_id: Arc, projects: HashMap, ProjectState>, } struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, - pending_files: HashMap, - _subscription: gpui::Subscription, + outstanding_jobs: Arc>>, +} + +type JobId = usize; + +struct JobHandle { + id: JobId, + set: Weak>>, } impl ProjectState { @@ -157,54 +168,15 @@ impl ProjectState { } }) } - - fn update_pending_files(&mut self, pending_file: PendingFile, indexing_time: SystemTime) { - // If Pending File Already Exists, Replace it with the new one - // but keep the old indexing time - if let Some(old_file) = self - .pending_files - .remove(&pending_file.relative_path.clone()) - { - self.pending_files.insert( - pending_file.relative_path.clone(), - (pending_file, old_file.1), - ); - } else { - self.pending_files.insert( - pending_file.relative_path.clone(), - (pending_file, indexing_time), - ); - }; - } - - fn get_outstanding_files(&mut self) -> Vec { - let mut outstanding_files = vec![]; - let mut remove_keys = vec![]; - for key in self.pending_files.keys().into_iter() { - if let Some(pending_details) = self.pending_files.get(key) { - let (pending_file, index_time) = pending_details; - if index_time <= &SystemTime::now() { - outstanding_files.push(pending_file.clone()); - remove_keys.push(key.clone()); - } - } - } - - for key in remove_keys.iter() { - self.pending_files.remove(key); - } - - return outstanding_files; - } } -#[derive(Clone, Debug)] pub struct 
PendingFile { worktree_db_id: i64, relative_path: PathBuf, absolute_path: PathBuf, language: Arc, modified_time: SystemTime, + job_handle: JobHandle, } #[derive(Debug, Clone)] @@ -221,6 +193,7 @@ enum DbOperation { documents: Vec, path: PathBuf, mtime: SystemTime, + job_handle: JobHandle, }, Delete { worktree_id: i64, @@ -242,6 +215,7 @@ enum EmbeddingJob { path: PathBuf, mtime: SystemTime, documents: Vec, + job_handle: JobHandle, }, Flush, } @@ -274,9 +248,11 @@ impl VectorStore { documents, path, mtime, + job_handle, } => { db.insert_file(worktree_id, path, mtime, documents) .log_err(); + drop(job_handle) } DbOperation::Delete { worktree_id, path } => { db.delete_file(worktree_id, path).log_err(); @@ -298,7 +274,7 @@ impl VectorStore { // embed_tx/rx: Embed Batch and Send to Database let (embed_batch_tx, embed_batch_rx) = - channel::unbounded::, PathBuf, SystemTime)>>(); + channel::unbounded::, PathBuf, SystemTime, JobHandle)>>(); let _embed_batch_task = cx.background().spawn({ let db_update_tx = db_update_tx.clone(); let embedding_provider = embedding_provider.clone(); @@ -306,7 +282,7 @@ impl VectorStore { while let Ok(mut embeddings_queue) = embed_batch_rx.recv().await { // Construct Batch let mut batch_documents = vec![]; - for (_, documents, _, _) in embeddings_queue.iter() { + for (_, documents, _, _, _) in embeddings_queue.iter() { batch_documents .extend(documents.iter().map(|document| document.content.as_str())); } @@ -333,7 +309,7 @@ impl VectorStore { j += 1; } - for (worktree_id, documents, path, mtime) in + for (worktree_id, documents, path, mtime, job_handle) in embeddings_queue.into_iter() { for document in documents.iter() { @@ -350,6 +326,7 @@ impl VectorStore { documents, path, mtime, + job_handle, }) .await .unwrap(); @@ -372,9 +349,16 @@ impl VectorStore { worktree_id, path, mtime, + job_handle, } => { queue_len += &documents.len(); - embeddings_queue.push((worktree_id, documents, path, mtime)); + embeddings_queue.push(( + worktree_id, + 
documents, + path, + mtime, + job_handle, + )); queue_len >= EMBEDDINGS_BATCH_SIZE } EmbeddingJob::Flush => true, @@ -420,6 +404,7 @@ impl VectorStore { worktree_id: pending_file.worktree_db_id, path: pending_file.relative_path, mtime: pending_file.modified_time, + job_handle: pending_file.job_handle, documents, }) .unwrap(); @@ -439,6 +424,7 @@ impl VectorStore { embedding_provider, language_registry, db_update_tx, + next_job_id: Default::default(), parsing_files_tx, _db_update_task, _embed_batch_task, @@ -471,11 +457,11 @@ impl VectorStore { async move { rx.await? } } - fn add_project( + fn index_project( &mut self, project: ModelHandle, cx: &mut ModelContext, - ) -> Task> { + ) -> Task> { let worktree_scans_complete = project .read(cx) .worktrees(cx) @@ -494,21 +480,16 @@ impl VectorStore { }) .collect::>(); - let fs = self.fs.clone(); let language_registry = self.language_registry.clone(); - let database_url = self.database_url.clone(); let db_update_tx = self.db_update_tx.clone(); let parsing_files_tx = self.parsing_files_tx.clone(); + let next_job_id = self.next_job_id.clone(); cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; - if let Some(db_directory) = database_url.parent() { - fs.create_dir(db_directory).await.log_err(); - } - let worktrees = project.read_with(&cx, |project, cx| { project .worktrees(cx) @@ -516,109 +497,115 @@ impl VectorStore { .collect::>() }); - let mut worktree_file_times = HashMap::new(); + let mut worktree_file_mtimes = HashMap::new(); let mut db_ids_by_worktree_id = HashMap::new(); for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { let db_id = db_id?; db_ids_by_worktree_id.insert(worktree.id(), db_id); - worktree_file_times.insert( + worktree_file_mtimes.insert( worktree.id(), this.read_with(&cx, |this, _| this.get_file_mtimes(db_id)) .await?, ); } - cx.background() - .spawn({ - let 
db_ids_by_worktree_id = db_ids_by_worktree_id.clone(); - let db_update_tx = db_update_tx.clone(); - let language_registry = language_registry.clone(); - let parsing_files_tx = parsing_files_tx.clone(); - async move { - let t0 = Instant::now(); - for worktree in worktrees.into_iter() { - let mut file_mtimes = - worktree_file_times.remove(&worktree.id()).unwrap(); - for file in worktree.files(false, 0) { - let absolute_path = worktree.absolutize(&file.path); - - if let Ok(language) = language_registry - .language_for_file(&absolute_path, None) - .await - { - if language - .grammar() - .and_then(|grammar| grammar.embedding_config.as_ref()) - .is_none() - { - continue; - } - - let path_buf = file.path.to_path_buf(); - let stored_mtime = file_mtimes.remove(&file.path.to_path_buf()); - let already_stored = stored_mtime - .map_or(false, |existing_mtime| { - existing_mtime == file.mtime - }); - - if !already_stored { - log::trace!("sending for parsing: {:?}", path_buf); - parsing_files_tx - .try_send(PendingFile { - worktree_db_id: db_ids_by_worktree_id - [&worktree.id()], - relative_path: path_buf, - absolute_path, - language, - modified_time: file.mtime, - }) - .unwrap(); - } - } - } - for file in file_mtimes.keys() { - db_update_tx - .try_send(DbOperation::Delete { - worktree_id: db_ids_by_worktree_id[&worktree.id()], - path: file.to_owned(), - }) - .unwrap(); - } - } - log::trace!( - "parsing worktree completed in {:?}", - t0.elapsed().as_millis() - ); - } - }) - .detach(); - // let mut pending_files: Vec<(PathBuf, ((i64, PathBuf, Arc, SystemTime), SystemTime))> = vec![]; - this.update(&mut cx, |this, cx| { - // The below is managing for updated on save - // Currently each time a file is saved, this code is run, and for all the files that were changed, if the current time is - // greater than the previous embedded time by the REINDEXING_DELAY variable, we will send the file off to be indexed. 
- let _subscription = cx.subscribe(&project, |this, project, event, cx| { - if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { - this.project_entries_changed(project, changes.clone(), cx, worktree_id); - } - }); - + let outstanding_jobs = Arc::new(Mutex::new(HashSet::new())); + this.update(&mut cx, |this, _| { this.projects.insert( project.downgrade(), ProjectState { - pending_files: HashMap::new(), - worktree_db_ids: db_ids_by_worktree_id.into_iter().collect(), - _subscription, + worktree_db_ids: db_ids_by_worktree_id + .iter() + .map(|(a, b)| (*a, *b)) + .collect(), + outstanding_jobs: outstanding_jobs.clone(), }, ); }); - anyhow::Ok(()) + cx.background() + .spawn(async move { + let mut count = 0; + let t0 = Instant::now(); + for worktree in worktrees.into_iter() { + let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); + for file in worktree.files(false, 0) { + let absolute_path = worktree.absolutize(&file.path); + + if let Ok(language) = language_registry + .language_for_file(&absolute_path, None) + .await + { + if language + .grammar() + .and_then(|grammar| grammar.embedding_config.as_ref()) + .is_none() + { + continue; + } + + let path_buf = file.path.to_path_buf(); + let stored_mtime = file_mtimes.remove(&file.path.to_path_buf()); + let already_stored = stored_mtime + .map_or(false, |existing_mtime| existing_mtime == file.mtime); + + if !already_stored { + log::trace!("sending for parsing: {:?}", path_buf); + count += 1; + let job_id = next_job_id.fetch_add(1, atomic::Ordering::SeqCst); + let job_handle = JobHandle { + id: job_id, + set: Arc::downgrade(&outstanding_jobs), + }; + outstanding_jobs.lock().insert(job_id); + parsing_files_tx + .try_send(PendingFile { + worktree_db_id: db_ids_by_worktree_id[&worktree.id()], + relative_path: path_buf, + absolute_path, + language, + job_handle, + modified_time: file.mtime, + }) + .unwrap(); + } + } + } + for file in file_mtimes.keys() { + db_update_tx + 
.try_send(DbOperation::Delete { + worktree_id: db_ids_by_worktree_id[&worktree.id()], + path: file.to_owned(), + }) + .unwrap(); + } + } + log::trace!( + "parsing worktree completed in {:?}", + t0.elapsed().as_millis() + ); + + Ok(count) + }) + .await }) } - pub fn search( + pub fn remaining_files_to_index_for_project( + &self, + project: &ModelHandle, + ) -> Option { + Some( + self.projects + .get(&project.downgrade())? + .outstanding_jobs + .lock() + .len(), + ) + } + + pub fn search_project( &mut self, project: ModelHandle, phrase: String, @@ -682,110 +669,16 @@ impl VectorStore { }) }) } - - fn project_entries_changed( - &mut self, - project: ModelHandle, - changes: Arc<[(Arc, ProjectEntryId, PathChange)]>, - cx: &mut ModelContext<'_, VectorStore>, - worktree_id: &WorktreeId, - ) -> Option<()> { - let reindexing_delay = settings::get::(cx).reindexing_delay_seconds; - - let worktree = project - .read(cx) - .worktree_for_id(worktree_id.clone(), cx)? - .read(cx) - .snapshot(); - - let worktree_db_id = self - .projects - .get(&project.downgrade())? 
- .db_id_for_worktree_id(worktree.id())?; - let file_mtimes = self.get_file_mtimes(worktree_db_id); - - let language_registry = self.language_registry.clone(); - - cx.spawn(|this, mut cx| async move { - let file_mtimes = file_mtimes.await.log_err()?; - - for change in changes.into_iter() { - let change_path = change.0.clone(); - let absolute_path = worktree.absolutize(&change_path); - - // Skip if git ignored or symlink - if let Some(entry) = worktree.entry_for_id(change.1) { - if entry.is_ignored || entry.is_symlink || entry.is_external { - continue; - } - } - - match change.2 { - PathChange::Removed => this.update(&mut cx, |this, _| { - this.db_update_tx - .try_send(DbOperation::Delete { - worktree_id: worktree_db_id, - path: absolute_path, - }) - .unwrap(); - }), - _ => { - if let Ok(language) = language_registry - .language_for_file(&change_path.to_path_buf(), None) - .await - { - if language - .grammar() - .and_then(|grammar| grammar.embedding_config.as_ref()) - .is_none() - { - continue; - } - - let modified_time = - change_path.metadata().log_err()?.modified().log_err()?; - - let existing_time = file_mtimes.get(&change_path.to_path_buf()); - let already_stored = existing_time - .map_or(false, |existing_time| &modified_time != existing_time); - - if !already_stored { - this.update(&mut cx, |this, _| { - let reindex_time = modified_time - + Duration::from_secs(reindexing_delay as u64); - - let project_state = - this.projects.get_mut(&project.downgrade())?; - project_state.update_pending_files( - PendingFile { - relative_path: change_path.to_path_buf(), - absolute_path, - modified_time, - worktree_db_id, - language: language.clone(), - }, - reindex_time, - ); - - for file in project_state.get_outstanding_files() { - this.parsing_files_tx.try_send(file).unwrap(); - } - Some(()) - }); - } - } - } - } - } - - Some(()) - }) - .detach(); - - Some(()) - } } impl Entity for VectorStore { type Event = (); } + +impl Drop for JobHandle { + fn drop(&mut self) { + if let 
Some(set) = self.set.upgrade() { + set.lock().remove(&self.id); + } + } +} diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index ccdd9fdaf07605b36f025d3a4bad63a3a2f516c2..de82bc2f482351166d4f57d37c0a82087dbaa662 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -9,11 +9,17 @@ use anyhow::Result; use async_trait::async_trait; use gpui::{Task, TestAppContext}; use language::{Language, LanguageConfig, LanguageRegistry}; -use project::{project_settings::ProjectSettings, FakeFs, Project}; +use project::{project_settings::ProjectSettings, FakeFs, Fs, Project}; use rand::{rngs::StdRng, Rng}; use serde_json::json; use settings::SettingsStore; -use std::{path::Path, sync::Arc}; +use std::{ + path::Path, + sync::{ + atomic::{self, AtomicUsize}, + Arc, + }, +}; use unindent::Unindent; #[ctor::ctor] @@ -62,29 +68,37 @@ async fn test_vector_store(cx: &mut TestAppContext) { let db_dir = tempdir::TempDir::new("vector-store").unwrap(); let db_path = db_dir.path().join("db.sqlite"); + let embedding_provider = Arc::new(FakeEmbeddingProvider::default()); let store = VectorStore::new( fs.clone(), db_path, - Arc::new(FakeEmbeddingProvider), + embedding_provider.clone(), languages, cx.to_async(), ) .await .unwrap(); - let project = Project::test(fs, ["/the-root".as_ref()], cx).await; + let project = Project::test(fs.clone(), ["/the-root".as_ref()], cx).await; let worktree_id = project.read_with(cx, |project, cx| { project.worktrees(cx).next().unwrap().read(cx).id() }); - store - .update(cx, |store, cx| store.add_project(project.clone(), cx)) + let file_count = store + .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await .unwrap(); + assert_eq!(file_count, 2); cx.foreground().run_until_parked(); + store.update(cx, |store, _cx| { + assert_eq!( + store.remaining_files_to_index_for_project(&project), + Some(0) + ); + }); let search_results = store 
.update(cx, |store, cx| { - store.search(project.clone(), "aaaa".to_string(), 5, cx) + store.search_project(project.clone(), "aaaa".to_string(), 5, cx) }) .await .unwrap(); @@ -92,10 +106,45 @@ async fn test_vector_store(cx: &mut TestAppContext) { assert_eq!(search_results[0].byte_range.start, 0); assert_eq!(search_results[0].name, "aaa"); assert_eq!(search_results[0].worktree_id, worktree_id); + + fs.save( + "/the-root/src/file2.rs".as_ref(), + &" + fn dddd() { println!(\"ddddd!\"); } + struct pqpqpqp {} + " + .unindent() + .into(), + Default::default(), + ) + .await + .unwrap(); + + cx.foreground().run_until_parked(); + + let prev_embedding_count = embedding_provider.embedding_count(); + let file_count = store + .update(cx, |store, cx| store.index_project(project.clone(), cx)) + .await + .unwrap(); + assert_eq!(file_count, 1); + + cx.foreground().run_until_parked(); + store.update(cx, |store, _cx| { + assert_eq!( + store.remaining_files_to_index_for_project(&project), + Some(0) + ); + }); + + assert_eq!( + embedding_provider.embedding_count() - prev_embedding_count, + 2 + ); } #[gpui::test] -async fn test_code_context_retrieval(cx: &mut TestAppContext) { +async fn test_code_context_retrieval() { let language = rust_lang(); let mut retriever = CodeContextRetriever::new(); @@ -181,11 +230,22 @@ fn test_dot_product(mut rng: StdRng) { } } -struct FakeEmbeddingProvider; +#[derive(Default)] +struct FakeEmbeddingProvider { + embedding_count: AtomicUsize, +} + +impl FakeEmbeddingProvider { + fn embedding_count(&self) -> usize { + self.embedding_count.load(atomic::Ordering::SeqCst) + } +} #[async_trait] impl EmbeddingProvider for FakeEmbeddingProvider { async fn embed_batch(&self, spans: Vec<&str>) -> Result>> { + self.embedding_count + .fetch_add(spans.len(), atomic::Ordering::SeqCst); Ok(spans .iter() .map(|span| { From 3a625d15d30ba26c4500f88de7a16dc980bc0019 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 14 Jul 2023 11:33:49 -0400 Subject: [PATCH 07/34] update c 
embedding query for preceding comments --- crates/zed/src/languages/c/embedding.scm | 74 +++++++++++++----------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/crates/zed/src/languages/c/embedding.scm b/crates/zed/src/languages/c/embedding.scm index cd1915f62bb5c27f7617bde91327a78129564511..0178abeb18374771967c09c93b9fcfc504e1e556 100644 --- a/crates/zed/src/languages/c/embedding.scm +++ b/crates/zed/src/languages/c/embedding.scm @@ -1,39 +1,43 @@ -(declaration - (type_qualifier)? @context - type: (_)? @context - declarator: [ - (function_declarator - declarator: (_) @name) - (pointer_declarator - "*" @context - declarator: (function_declarator - declarator: (_) @name)) - (pointer_declarator - "*" @context - declarator: (pointer_declarator - "*" @context +( + (comment)* @context + . + (declaration + declarator: [ + (function_declarator + declarator: (_) @name) + (pointer_declarator + "*" @name declarator: (function_declarator - declarator: (_) @name))) - ] -) @item + declarator: (_) @name)) + (pointer_declarator + "*" @name + declarator: (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name))) + ] + ) @item + ) -(function_definition - (type_qualifier)? @context - type: (_)? @context - declarator: [ - (function_declarator - declarator: (_) @name - ) - (pointer_declarator - "*" @context - declarator: (function_declarator +( + (comment)* @context + . 
+ (function_definition + declarator: [ + (function_declarator declarator: (_) @name - )) - (pointer_declarator - "*" @context - declarator: (pointer_declarator - "*" @context + ) + (pointer_declarator + "*" @name declarator: (function_declarator - declarator: (_) @name))) - ] -) @item + declarator: (_) @name + )) + (pointer_declarator + "*" @name + declarator: (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name))) + ] + ) @item + ) From d4971e9eadebc9e629ca413a0df309230f2d14fc Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 14 Jul 2023 13:47:10 -0400 Subject: [PATCH 08/34] update typescript parsing to manage for leading tsdoc comments --- crates/vector_store/src/parsing.rs | 8 +- .../src/languages/typescript/embedding.scm | 113 +++++++++--------- 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 8d6e03d6eb29d524db23848adfa15a8ac6b4b164..4ce8b6763a33cae8253a1e6518cbc53b19eff030 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -51,6 +51,7 @@ impl CodeContextRetriever { let mut documents = Vec::new(); // Iterate through query matches + let mut name_ranges: Vec> = vec![]; for mat in self.cursor.matches( &embedding_config.query, tree.root_node(), @@ -65,7 +66,12 @@ impl CodeContextRetriever { byte_range = Some(capture.node.byte_range()); item = content.get(capture.node.byte_range()); } else if capture.index == embedding_config.name_capture_ix { - if let Some(name_content) = content.get(capture.node.byte_range()) { + let name_range = capture.node.byte_range(); + if name_ranges.contains(&name_range) { + continue; + } + name_ranges.push(name_range.clone()); + if let Some(name_content) = content.get(name_range.clone()) { name.push(name_content); } } diff --git a/crates/zed/src/languages/typescript/embedding.scm b/crates/zed/src/languages/typescript/embedding.scm index 
f261a0a56577176108dc1ef2b5cf6de3569a0531..d850f9b82307fc9bd0560c866ca149cffe5a1f5e 100644 --- a/crates/zed/src/languages/typescript/embedding.scm +++ b/crates/zed/src/languages/typescript/embedding.scm @@ -1,59 +1,56 @@ -; (internal_module -; "namespace" @context -; name: (_) @name) @item - -(enum_declaration - "enum" @context - name: (_) @name) @item - -; (type_alias_declaration -; "type" @context -; name: (_) @name) @item - -(function_declaration - "async"? @context - "function" @context - name: (_) @name) @item - -(interface_declaration - "interface" @context - name: (_) @name) @item - -; (export_statement -; (lexical_declaration -; ["let" "const"] @context -; (variable_declarator -; name: (_) @name) @item)) - -(program - (lexical_declaration - ["let" "const"] @context - (variable_declarator - name: (_) @name) @item)) - -(class_declaration - "class" @context - name: (_) @name) @item - -(method_definition +( + (comment)* @context + . + (enum_declaration + "enum" @context + name: (_) @name) @item + ) + +( + (comment)* @context + . [ - "get" - "set" - "async" - "*" - "readonly" - "static" - (override_modifier) - (accessibility_modifier) - ]* @context - name: (_) @name) @item - -; (public_field_definition -; [ -; "declare" -; "readonly" -; "abstract" -; "static" -; (accessibility_modifier) -; ]* @context -; name: (_) @name) @item + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name) + ) @item + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ]) + +( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (class_declaration + "class" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . 
+ (method_definition + [ + "get" + "set" + "async" + "*" + "readonly" + "static" + (override_modifier) + (accessibility_modifier) + ]* @name + name: (_) @name) @item + ) From 2dae42b1ba49b4e10fe13826674610774078454f Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 14 Jul 2023 14:25:08 -0400 Subject: [PATCH 09/34] update embedding query for tsx to accomodate for leading comments --- crates/zed/src/languages/tsx/embedding.scm | 83 ++++++++++++++-------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/crates/zed/src/languages/tsx/embedding.scm b/crates/zed/src/languages/tsx/embedding.scm index 305f634e04ba245115907c1f113fe0c64cab1143..1c47a5a238ff9d944dc321b4eb10b0e56d8a6221 100644 --- a/crates/zed/src/languages/tsx/embedding.scm +++ b/crates/zed/src/languages/tsx/embedding.scm @@ -1,35 +1,56 @@ -(enum_declaration - "enum" @context - name: (_) @name) @item +( + (comment)* @context + . + (enum_declaration + "enum" @context + name: (_) @name) @item + ) -(function_declaration - "async"? @context - "function" @context - name: (_) @name) @item - -(interface_declaration - "interface" @context - name: (_) @name) @item +( + (comment)* @context + . + [ + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name) + ) @item + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ]) -(program - (lexical_declaration - ["let" "const"] @context - (variable_declarator - name: (_) @name) @item)) +( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) -(class_declaration - "class" @context - name: (_) @name) @item +( + (comment)* @context + . + (class_declaration + "class" @name + name: (_) @name) @item + ) -(method_definition - [ - "get" - "set" - "async" - "*" - "readonly" - "static" - (override_modifier) - (accessibility_modifier) - ]* @context - name: (_) @name) @item +( + (comment)* @context + . 
+ (method_definition + [ + "get" + "set" + "async" + "*" + "readonly" + "static" + (override_modifier) + (accessibility_modifier) + ]* @name + name: (_) @name) @item + ) From 4bece54655980ac9c2f6ec5266e9bfc9306cc422 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 09:22:37 -0400 Subject: [PATCH 10/34] update jsx family of languages for preceeding comments and nested exports --- Cargo.lock | 2 + crates/vector_store/Cargo.toml | 5 +- crates/vector_store/src/vector_store_tests.rs | 242 +++++++++++++++++- .../src/languages/javascript/embedding.scm | 139 ++++++---- crates/zed/src/languages/tsx/embedding.scm | 85 ++++-- .../src/languages/typescript/embedding.scm | 85 ++++-- 6 files changed, 458 insertions(+), 100 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 239aa6a302ded4391422e1c2d8752236f4019bb3..b6049e611ed0c72a0fc13d822545dc994d80af4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8508,7 +8508,9 @@ dependencies = [ "theme", "tiktoken-rs 0.5.0", "tree-sitter", + "tree-sitter-javascript", "tree-sitter-rust", + "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", "unindent", "util", "workspace", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index bac9cdedfafc4567f24b7502e0f9ea9e4d0e71e3..6b2e77e904016eb9e27584e8fc4e9ede71134d0a 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -44,10 +44,13 @@ project = { path = "../project", features = ["test-support"] } rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"]} -tree-sitter-rust = "*" rand.workspace = true unindent.workspace = true tempdir.workspace = true ctor.workspace = true env_logger.workspace = true + +tree-sitter-javascript = "*" +tree-sitter-typescript = "*" +tree-sitter-rust = "*" diff --git a/crates/vector_store/src/vector_store_tests.rs 
b/crates/vector_store/src/vector_store_tests.rs index de82bc2f482351166d4f57d37c0a82087dbaa662..76465b1aaf95ef98b2305e35dbead3628bc461ed 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -144,7 +144,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { } #[gpui::test] -async fn test_code_context_retrieval() { +async fn test_code_context_retrieval_rust() { let language = rust_lang(); let mut retriever = CodeContextRetriever::new(); @@ -200,6 +200,142 @@ async fn test_code_context_retrieval() { ); } +#[gpui::test] +async fn test_code_context_retrieval_javascript() { + let language = js_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = " +/* globals importScripts, backend */ +function _authorize() {} + +/** + * Sometimes the frontend build is way faster than backend. + */ +export async function authorizeBank() { + _authorize(pushModal, upgradingAccountId, {}); +} + +export class SettingsPage { + /* This is a test setting */ + constructor(page) { + this.page = page; + } +} + +/* This is a test comment */ +class TestClass {} + +/* Schema for editor_events in Clickhouse. */ +export interface ClickhouseEditorEvent { + installation_id: string + operation: string +} +"; + + let parsed_files = retriever + .parse_file(Path::new("foo.js"), &text, language) + .unwrap(); + + let test_documents = &[ + Document { + name: "function _authorize".into(), + range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1), + content: " + The below code snippet is from file 'foo.js' + + ```javascript + /* globals importScripts, backend */ + function _authorize() {} + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "async function authorizeBank".into(), + range: text.find("export async").unwrap()..224, + content: " + The below code snippet is from file 'foo.js' + + ```javascript + /** + * Sometimes the frontend build is way faster than backend. 
+ */ + export async function authorizeBank() { + _authorize(pushModal, upgradingAccountId, {}); + } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "class SettingsPage".into(), + range: 226..344, + content: " + The below code snippet is from file 'foo.js' + + ```javascript + export class SettingsPage { + /* This is a test setting */ + constructor(page) { + this.page = page; + } + } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "constructor".into(), + range: 291..342, + content: " + The below code snippet is from file 'foo.js' + + ```javascript + /* This is a test setting */ + constructor(page) { + this.page = page; + } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "class TestClass".into(), + range: 375..393, + content: " + The below code snippet is from file 'foo.js' + + ```javascript + /* This is a test comment */ + class TestClass {} + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "interface ClickhouseEditorEvent".into(), + range: 441..533, + content: " + The below code snippet is from file 'foo.js' + + ```javascript + /* Schema for editor_events in Clickhouse. */ + export interface ClickhouseEditorEvent { + installation_id: string + operation: string + } + ```" + .unindent(), + embedding: vec![], + }, + ]; + + for idx in 0..test_documents.len() { + assert_eq!(test_documents[idx], parsed_files[idx]); + } +} + #[gpui::test] fn test_dot_product(mut rng: StdRng) { assert_eq!(dot(&[1., 0., 0., 0., 0.], &[0., 1., 0., 0., 0.]), 0.); @@ -271,6 +407,110 @@ impl EmbeddingProvider for FakeEmbeddingProvider { } } +fn js_lang() -> Arc { + Arc::new( + Language::new( + LanguageConfig { + name: "Javascript".into(), + path_suffixes: vec!["js".into()], + ..Default::default() + }, + Some(tree_sitter_typescript::language_tsx()), + ) + .with_embedding_query( + &r#" + + ( + (comment)* @context + . + (export_statement + (function_declaration + "async"? 
@name + "function" @name + name: (_) @name)) @item + ) + + ( + (comment)* @context + . + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . + (export_statement + (class_declaration + "class" @name + name: (_) @name)) @item + ) + + ( + (comment)* @context + . + (class_declaration + "class" @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . + (method_definition + [ + "get" + "set" + "async" + "*" + "static" + ]* @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) @item + ) + + ( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) @item + ) + + ( + (comment)* @context + . + (enum_declaration + "enum" @name + name: (_) @name) @item + ) + + "# + .unindent(), + ) + .unwrap(), + ) +} + fn rust_lang() -> Arc { Arc::new( Language::new( diff --git a/crates/zed/src/languages/javascript/embedding.scm b/crates/zed/src/languages/javascript/embedding.scm index ec6eb5ab1a8be481bc7a9987056ce2d1cb7d2474..a2140400318db95a8d29074402ab2d212561a79b 100644 --- a/crates/zed/src/languages/javascript/embedding.scm +++ b/crates/zed/src/languages/javascript/embedding.scm @@ -1,56 +1,83 @@ -; (internal_module -; "namespace" @context -; name: (_) @name) @item - -(enum_declaration - "enum" @context - name: (_) @name) @item - -(function_declaration - "async"? 
@context - "function" @context - name: (_) @name) @item - -(interface_declaration - "interface" @context - name: (_) @name) @item - -; (program -; (export_statement -; (lexical_declaration -; ["let" "const"] @context -; (variable_declarator -; name: (_) @name) @item))) - -(program - (lexical_declaration - ["let" "const"] @context - (variable_declarator - name: (_) @name) @item)) - -(class_declaration - "class" @context - name: (_) @name) @item - -(method_definition - [ - "get" - "set" - "async" - "*" - "readonly" - "static" - (override_modifier) - (accessibility_modifier) - ]* @context - name: (_) @name) @item - -; (public_field_definition -; [ -; "declare" -; "readonly" -; "abstract" -; "static" -; (accessibility_modifier) -; ]* @context -; name: (_) @name) @item +( + (comment)* @context + . + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (class_declaration + "class" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (class_declaration + "class" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (method_definition + [ + "get" + "set" + "async" + "*" + "static" + ]* @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . 
+ (enum_declaration + "enum" @name + name: (_) @name) @item + ) diff --git a/crates/zed/src/languages/tsx/embedding.scm b/crates/zed/src/languages/tsx/embedding.scm index 1c47a5a238ff9d944dc321b4eb10b0e56d8a6221..4bb4fea254d0cf86f2fbb9d5c8f657e06238971f 100644 --- a/crates/zed/src/languages/tsx/embedding.scm +++ b/crates/zed/src/languages/tsx/embedding.scm @@ -1,33 +1,29 @@ ( (comment)* @context . - (enum_declaration - "enum" @context - name: (_) @name) @item + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) @item ) ( (comment)* @context . - [ - (export_statement - (function_declaration - "async"? @name - "function" @name - name: (_) @name) - ) @item - (function_declaration - "async"? @name - "function" @name - name: (_) @name) @item - ]) + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ) ( (comment)* @context . - (interface_declaration - "interface" @name - name: (_) @name) @item + (export_statement + (class_declaration + "class" @name + name: (_) @name)) @item ) ( @@ -47,10 +43,57 @@ "set" "async" "*" - "readonly" "static" - (override_modifier) - (accessibility_modifier) ]* @name name: (_) @name) @item ) + +( + (comment)* @context + . + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (enum_declaration + "enum" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (type_alias_declaration + "type" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . 
+ (type_alias_declaration + "type" @name + name: (_) @name) @item) diff --git a/crates/zed/src/languages/typescript/embedding.scm b/crates/zed/src/languages/typescript/embedding.scm index d850f9b82307fc9bd0560c866ca149cffe5a1f5e..4bb4fea254d0cf86f2fbb9d5c8f657e06238971f 100644 --- a/crates/zed/src/languages/typescript/embedding.scm +++ b/crates/zed/src/languages/typescript/embedding.scm @@ -1,33 +1,29 @@ ( (comment)* @context . - (enum_declaration - "enum" @context - name: (_) @name) @item + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) @item ) ( (comment)* @context . - [ - (export_statement - (function_declaration - "async"? @name - "function" @name - name: (_) @name) - ) @item - (function_declaration - "async"? @name - "function" @name - name: (_) @name) @item - ]) + (function_declaration + "async"? @name + "function" @name + name: (_) @name) @item + ) ( (comment)* @context . - (interface_declaration - "interface" @name - name: (_) @name) @item + (export_statement + (class_declaration + "class" @name + name: (_) @name)) @item ) ( @@ -47,10 +43,57 @@ "set" "async" "*" - "readonly" "static" - (override_modifier) - (accessibility_modifier) ]* @name name: (_) @name) @item ) + +( + (comment)* @context + . + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (interface_declaration + "interface" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . + (enum_declaration + "enum" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (export_statement + (type_alias_declaration + "type" @name + name: (_) @name)) @item + ) + +( + (comment)* @context + . 
+ (type_alias_declaration + "type" @name + name: (_) @name) @item) From cf0dd09b5cdd9fd18c06d43a6774121cb86ce544 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 10:04:32 -0400 Subject: [PATCH 11/34] update vector_store to accommodate full file parsing for JSON, TOML and YAML files --- Cargo.lock | 14 ++++++++-- crates/vector_store/Cargo.toml | 2 +- crates/vector_store/src/parsing.rs | 26 +++++++++++++++++++ crates/vector_store/src/vector_store.rs | 11 ++++---- crates/vector_store/src/vector_store_tests.rs | 18 ++++++++++++- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b6049e611ed0c72a0fc13d822545dc994d80af4e..afd40fd3081b0948d38afc37a6fd7e37066e625e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8134,6 +8134,16 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "tree-sitter-toml" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-typescript" version = "0.20.2" @@ -8508,8 +8518,8 @@ dependencies = [ "theme", "tiktoken-rs 0.5.0", "tree-sitter", - "tree-sitter-javascript", "tree-sitter-rust", + "tree-sitter-toml 0.20.0", "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", "unindent", "util", @@ -9560,7 +9570,7 @@ dependencies = [ "tree-sitter-ruby", "tree-sitter-rust", "tree-sitter-scheme", - "tree-sitter-toml", + "tree-sitter-toml 0.5.1", "tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)", "tree-sitter-yaml", "unindent", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 6b2e77e904016eb9e27584e8fc4e9ede71134d0a..31119a1ba65363721593d68fa705b75763bafd58 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -51,6 +51,6 @@ 
tempdir.workspace = true ctor.workspace = true env_logger.workspace = true -tree-sitter-javascript = "*" tree-sitter-typescript = "*" tree-sitter-rust = "*" +tree-sitter-toml = "*" diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 4ce8b6763a33cae8253a1e6518cbc53b19eff030..216ef1b5e11697519900dd4445234ebacb5ade21 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -13,6 +13,9 @@ pub struct Document { const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; +const ENTIRE_FILE_TEMPLATE: &str = + "The below snippet is from file ''\n\n```\n\n```"; +pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"]; pub struct CodeContextRetriever { pub parser: Parser, @@ -27,12 +30,35 @@ impl CodeContextRetriever { } } + fn _parse_entire_file( + &self, + relative_path: &Path, + language_name: Arc, + content: &str, + ) -> Result> { + let document_span = ENTIRE_FILE_TEMPLATE + .replace("", relative_path.to_string_lossy().as_ref()) + .replace("", language_name.as_ref()) + .replace("item", &content); + + Ok(vec![Document { + range: 0..content.len(), + content: document_span, + embedding: Vec::new(), + name: language_name.to_string(), + }]) + } + pub fn parse_file( &mut self, relative_path: &Path, content: &str, language: Arc, ) -> Result> { + if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) { + return self._parse_entire_file(relative_path, language.name(), &content); + } + let grammar = language .grammar() .ok_or_else(|| anyhow!("no grammar for language"))?; diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 3f7ab5c6cd1b2e0296ee560071377a84a6c527db..0f55bd9e63f3a95ce113478a406f485600348973 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -19,7 +19,7 @@ use gpui::{ use language::{Language, LanguageRegistry}; use 
modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; use parking_lot::Mutex; -use parsing::{CodeContextRetriever, Document}; +use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; use project::{Fs, Project, WorktreeId}; use smol::channel; use std::{ @@ -537,10 +537,11 @@ impl VectorStore { .language_for_file(&absolute_path, None) .await { - if language - .grammar() - .and_then(|grammar| grammar.embedding_config.as_ref()) - .is_none() + if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) + && language + .grammar() + .and_then(|grammar| grammar.embedding_config.as_ref()) + .is_none() { continue; } diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index 76465b1aaf95ef98b2305e35dbead3628bc461ed..84c9962493a78c4fe7a27fd74581863476440570 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) { println!(\"bbbb!\"); } ".unindent(), + "file3.toml": " + ZZZZZZZ = 5 + ".unindent(), } }), ) @@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) { let languages = Arc::new(LanguageRegistry::new(Task::ready(()))); let rust_language = rust_lang(); + let toml_language = toml_lang(); languages.add(rust_language); + languages.add(toml_language); let db_dir = tempdir::TempDir::new("vector-store").unwrap(); let db_path = db_dir.path().join("db.sqlite"); @@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await .unwrap(); - assert_eq!(file_count, 2); + assert_eq!(file_count, 3); cx.foreground().run_until_parked(); store.update(cx, |store, _cx| { assert_eq!( @@ -578,3 +583,14 @@ fn rust_lang() -> Arc { .unwrap(), ) } + +fn toml_lang() -> Arc { + Arc::new(Language::new( + LanguageConfig { + name: "TOML".into(), + path_suffixes: vec!["toml".into()], + 
..Default::default() + }, + Some(tree_sitter_toml::language()), + )) +} From 1362c5a3d9753702820bc615dcfd4a4b261f0a3f Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 14:43:29 -0400 Subject: [PATCH 12/34] add embedding treesitter query for cpp --- Cargo.lock | 1 + crates/vector_store/Cargo.toml | 1 + crates/vector_store/src/vector_store_tests.rs | 312 ++++++++++++++++-- crates/zed/src/languages/cpp/embedding.scm | 61 ++++ 4 files changed, 347 insertions(+), 28 deletions(-) create mode 100644 crates/zed/src/languages/cpp/embedding.scm diff --git a/Cargo.lock b/Cargo.lock index afd40fd3081b0948d38afc37a6fd7e37066e625e..28a0e76d143086ba6af22d8c8d01a69de47872b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8518,6 +8518,7 @@ dependencies = [ "theme", "tiktoken-rs 0.5.0", "tree-sitter", + "tree-sitter-cpp", "tree-sitter-rust", "tree-sitter-toml 0.20.0", "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 31119a1ba65363721593d68fa705b75763bafd58..0009665e26a1b4e6b57c4aab061f6c457138fd2f 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -54,3 +54,4 @@ env_logger.workspace = true tree-sitter-typescript = "*" tree-sitter-rust = "*" tree-sitter-toml = "*" +tree-sitter-cpp = "*" diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index 84c9962493a78c4fe7a27fd74581863476440570..3a9e1748c54fd576fb1d0b49dada41a39842dad0 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -211,32 +211,33 @@ async fn test_code_context_retrieval_javascript() { let mut retriever = CodeContextRetriever::new(); let text = " -/* globals importScripts, backend */ -function _authorize() {} - -/** - * Sometimes the frontend build is way faster than backend. 
- */ -export async function authorizeBank() { - _authorize(pushModal, upgradingAccountId, {}); -} + /* globals importScripts, backend */ + function _authorize() {} + + /** + * Sometimes the frontend build is way faster than backend. + */ + export async function authorizeBank() { + _authorize(pushModal, upgradingAccountId, {}); + } -export class SettingsPage { - /* This is a test setting */ - constructor(page) { - this.page = page; - } -} + export class SettingsPage { + /* This is a test setting */ + constructor(page) { + this.page = page; + } + } -/* This is a test comment */ -class TestClass {} + /* This is a test comment */ + class TestClass {} -/* Schema for editor_events in Clickhouse. */ -export interface ClickhouseEditorEvent { - installation_id: string - operation: string -} -"; + /* Schema for editor_events in Clickhouse. */ + export interface ClickhouseEditorEvent { + installation_id: string + operation: string + } + " + .unindent(); let parsed_files = retriever .parse_file(Path::new("foo.js"), &text, language) @@ -258,7 +259,7 @@ export interface ClickhouseEditorEvent { }, Document { name: "async function authorizeBank".into(), - range: text.find("export async").unwrap()..224, + range: text.find("export async").unwrap()..223, content: " The below code snippet is from file 'foo.js' @@ -275,7 +276,7 @@ export interface ClickhouseEditorEvent { }, Document { name: "class SettingsPage".into(), - range: 226..344, + range: 225..343, content: " The below code snippet is from file 'foo.js' @@ -292,7 +293,7 @@ export interface ClickhouseEditorEvent { }, Document { name: "constructor".into(), - range: 291..342, + range: 290..341, content: " The below code snippet is from file 'foo.js' @@ -307,7 +308,7 @@ export interface ClickhouseEditorEvent { }, Document { name: "class TestClass".into(), - range: 375..393, + range: 374..392, content: " The below code snippet is from file 'foo.js' @@ -320,7 +321,7 @@ export interface ClickhouseEditorEvent { }, Document { name: 
"interface ClickhouseEditorEvent".into(), - range: 441..533, + range: 440..532, content: " The below code snippet is from file 'foo.js' @@ -341,6 +342,181 @@ export interface ClickhouseEditorEvent { } } +#[gpui::test] +async fn test_code_context_retrieval_cpp() { + let language = cpp_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = " + /** + * @brief Main function + * @returns 0 on exit + */ + int main() { return 0; } + + /** + * This is a test comment + */ + class MyClass { // The class + public: // Access specifier + int myNum; // Attribute (int variable) + string myString; // Attribute (string variable) + }; + + // This is a test comment + enum Color { red, green, blue }; + + /** This is a preceeding block comment + * This is the second line + */ + struct { // Structure declaration + int myNum; // Member (int variable) + string myString; // Member (string variable) + } myStructure; + + /** + * @brief Matrix class. + */ + template ::value || std::is_floating_point::value, + bool>::type> + class Matrix2 { + std::vector> _mat; + + public: + /** + * @brief Constructor + * @tparam Integer ensuring integers are being evaluated and not other + * data types. 
+ * @param size denoting the size of Matrix as size x size + */ + template ::value, + Integer>::type> + explicit Matrix(const Integer size) { + for (size_t i = 0; i < size; ++i) { + _mat.emplace_back(std::vector(size, 0)); + } + } + }" + .unindent(); + + let parsed_files = retriever + .parse_file(Path::new("foo.cpp"), &text, language) + .unwrap(); + + let test_documents = &[ + Document { + name: "int main".into(), + range: 54..78, + content: " + The below code snippet is from file 'foo.cpp' + + ```cpp + /** + * @brief Main function + * @returns 0 on exit + */ + int main() { return 0; } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "class MyClass".into(), + range: 112..295, + content: " + The below code snippet is from file 'foo.cpp' + + ```cpp + /** + * This is a test comment + */ + class MyClass { // The class + public: // Access specifier + int myNum; // Attribute (int variable) + string myString; // Attribute (string variable) + } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "enum Color".into(), + range: 324..355, + content: " + The below code snippet is from file 'foo.cpp' + + ```cpp + // This is a test comment + enum Color { red, green, blue } + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "struct myStructure".into(), + range: 428..581, + content: " + The below code snippet is from file 'foo.cpp' + + ```cpp + /** This is a preceeding block comment + * This is the second line + */ + struct { // Structure declaration + int myNum; // Member (int variable) + string myString; // Member (string variable) + } myStructure; + ```" + .unindent(), + embedding: vec![], + }, + Document { + name: "class Matrix2".into(), + range: 613..1342, + content: " + The below code snippet is from file 'foo.cpp' + + ```cpp + /** + * @brief Matrix class. 
+ */ + template ::value || std::is_floating_point::value, + bool>::type> + class Matrix2 { + std::vector> _mat; + + public: + /** + * @brief Constructor + * @tparam Integer ensuring integers are being evaluated and not other + * data types. + * @param size denoting the size of Matrix as size x size + */ + template ::value, + Integer>::type> + explicit Matrix(const Integer size) { + for (size_t i = 0; i < size; ++i) { + _mat.emplace_back(std::vector(size, 0)); + } + } + } + ```" + .unindent(), + embedding: vec![], + }, + ]; + + for idx in 0..test_documents.len() { + assert_eq!(test_documents[idx], parsed_files[idx]); + } +} + #[gpui::test] fn test_dot_product(mut rng: StdRng) { assert_eq!(dot(&[1., 0., 0., 0., 0.], &[0., 1., 0., 0., 0.]), 0.); @@ -594,3 +770,83 @@ fn toml_lang() -> Arc { Some(tree_sitter_toml::language()), )) } + +fn cpp_lang() -> Arc { + Arc::new( + Language::new( + LanguageConfig { + name: "CPP".into(), + path_suffixes: vec!["cpp".into()], + ..Default::default() + }, + Some(tree_sitter_cpp::language()), + ) + .with_embedding_query( + r#" + ( + (comment)* @context + . + (function_definition + (type_qualifier)? @name + type: (_)? @name + declarator: [ + (function_declarator + declarator: (_) @name) + (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name)) + (pointer_declarator + "*" @name + declarator: (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name))) + (reference_declarator + ["&" "&&"] @name + (function_declarator + declarator: (_) @name)) + ] + (type_qualifier)? @name) @item + ) + + ( + (comment)* @context + . + (template_declaration + (class_specifier + "class" @name + name: (_) @name) + ) @item + ) + + ( + (comment)* @context + . + (class_specifier + "class" @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . + (enum_specifier + "enum" @name + name: (_) @name) @item + ) + + ( + (comment)* @context + . 
+ (declaration + type: (struct_specifier + "struct" @name) + declarator: (_) @name) @item + ) + + "#, + ) + .unwrap(), + ) +} diff --git a/crates/zed/src/languages/cpp/embedding.scm b/crates/zed/src/languages/cpp/embedding.scm new file mode 100644 index 0000000000000000000000000000000000000000..bbd93f20dbdf6eddd097f49b7603ec5e0dc9bc59 --- /dev/null +++ b/crates/zed/src/languages/cpp/embedding.scm @@ -0,0 +1,61 @@ +( + (comment)* @context + . + (function_definition + (type_qualifier)? @name + type: (_)? @name + declarator: [ + (function_declarator + declarator: (_) @name) + (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name)) + (pointer_declarator + "*" @name + declarator: (pointer_declarator + "*" @name + declarator: (function_declarator + declarator: (_) @name))) + (reference_declarator + ["&" "&&"] @name + (function_declarator + declarator: (_) @name)) + ] + (type_qualifier)? @name) @item + ) + +( + (comment)* @context + . + (template_declaration + (class_specifier + "class" @name + name: (_) @name) + ) @item +) + +( + (comment)* @context + . + (class_specifier + "class" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . + (enum_specifier + "enum" @name + name: (_) @name) @item + ) + +( + (comment)* @context + . 
+ (declaration + type: (struct_specifier + "struct" @name) + declarator: (_) @name) @item +) From f0bf60fdedc56ec594a5e60f4442e8eb5a998c0b Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 14:53:57 -0400 Subject: [PATCH 13/34] add css as an embeddable file type in which the entire file is embedded individually --- crates/vector_store/src/parsing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 216ef1b5e11697519900dd4445234ebacb5ade21..663f0f473b63358496c8dcbc337aa7ccbe452c76 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -15,7 +15,7 @@ const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; const ENTIRE_FILE_TEMPLATE: &str = "The below snippet is from file ''\n\n```\n\n```"; -pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"]; +pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 4] = ["TOML", "YAML", "JSON", "CSS"]; pub struct CodeContextRetriever { pub parser: Parser, From e630ff38c4f4099e4e9c8d926c6a75c3e364fc58 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 16:29:25 -0400 Subject: [PATCH 14/34] add embedding treesitter queries for elixir --- Cargo.lock | 13 +- crates/vector_store/Cargo.toml | 1 + crates/vector_store/src/vector_store_tests.rs | 182 ++++++++++++++++++ crates/zed/src/languages/elixir/embedding.scm | 27 +++ 4 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 crates/zed/src/languages/elixir/embedding.scm diff --git a/Cargo.lock b/Cargo.lock index 28a0e76d143086ba6af22d8c8d01a69de47872b6..8fcca507d1fa47309793843f2268a87ff59a2e49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7982,6 +7982,16 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "tree-sitter-elixir" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9a9916f3e1c80b3c8aab8582604e97e8720cb9b893489b347cf999f80f9d469e" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-elixir" version = "0.1.0" @@ -8519,6 +8529,7 @@ dependencies = [ "tiktoken-rs 0.5.0", "tree-sitter", "tree-sitter-cpp", + "tree-sitter-elixir 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "tree-sitter-rust", "tree-sitter-toml 0.20.0", "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -9558,7 +9569,7 @@ dependencies = [ "tree-sitter-c", "tree-sitter-cpp", "tree-sitter-css", - "tree-sitter-elixir", + "tree-sitter-elixir 0.1.0 (git+https://github.com/elixir-lang/tree-sitter-elixir?rev=4ba9dab6e2602960d95b2b625f3386c27e08084e)", "tree-sitter-embedded-template", "tree-sitter-go", "tree-sitter-heex", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 0009665e26a1b4e6b57c4aab061f6c457138fd2f..6808f6c630ca8dda97fce819765995b78f3d2a9a 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -55,3 +55,4 @@ tree-sitter-typescript = "*" tree-sitter-rust = "*" tree-sitter-toml = "*" tree-sitter-cpp = "*" +tree-sitter-elixir = "*" diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index 3a9e1748c54fd576fb1d0b49dada41a39842dad0..d55dfcfc7151eb04efb5a33b119ae33b0875d86d 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -342,6 +342,143 @@ async fn test_code_context_retrieval_javascript() { } } +#[gpui::test] +async fn test_code_context_retrieval_elixir() { + let language = elixir_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = r#" +defmodule File.Stream do + @moduledoc """ + Defines a `File.Stream` struct returned by `File.stream!/3`. 
+ + The following fields are public: + + * `path` - the file path + * `modes` - the file modes + * `raw` - a boolean indicating if bin functions should be used + * `line_or_bytes` - if reading should read lines or a given number of bytes + * `node` - the node the file belongs to + + """ + + defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + + @type t :: %__MODULE__{} + + @doc false + def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end +"# + .unindent(); + + let parsed_files = retriever + .parse_file(Path::new("foo.ex"), &text, language) + .unwrap(); + + let test_documents = &[ + Document{ + name: "defmodule File.Stream".into(), + range: 0..1132, + content: r#" + The below code snippet is from file 'foo.ex' + + ```elixir + defmodule File.Stream do + @moduledoc """ + Defines a `File.Stream` struct returned by `File.stream!/3`. 
+ + The following fields are public: + + * `path` - the file path + * `modes` - the file modes + * `raw` - a boolean indicating if bin functions should be used + * `line_or_bytes` - if reading should read lines or a given number of bytes + * `node` - the node the file belongs to + + """ + + defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + + @type t :: %__MODULE__{} + + @doc false + def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end + ```"#.unindent(), + embedding: vec![], + }, + Document { + name: "def __build__".into(), + range: 574..1132, + content: r#" +The below code snippet is from file 'foo.ex' + +```elixir +@doc false +def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end +```"# + .unindent(), + embedding: vec![], + }]; + + for idx in 0..test_documents.len() { + assert_eq!(test_documents[idx], parsed_files[idx]); + } +} + #[gpui::test] async fn test_code_context_retrieval_cpp() { let language = cpp_lang(); @@ -850,3 +987,48 @@ fn cpp_lang() -> Arc { .unwrap(), ) } + +fn elixir_lang() -> Arc { + Arc::new( + Language::new( + LanguageConfig { + name: "Elixir".into(), + path_suffixes: vec!["rs".into()], + 
..Default::default() + }, + Some(tree_sitter_elixir::language()), + ) + .with_embedding_query( + r#" + ( + (unary_operator + operator: "@" + operand: (call + target: (identifier) @unary + (#match? @unary "^(doc)$")) + ) @context + . + (call + target: (identifier) @name + (arguments + [ + (identifier) @name + (call + target: (identifier) @name) + (binary_operator + left: (call + target: (identifier) @name) + operator: "when") + ]) + (#match? @name "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp)$")) @item + ) + + (call + target: (identifier) @name + (arguments (alias) @name) + (#match? @name "^(defmodule|defprotocol)$")) @item + "#, + ) + .unwrap(), + ) +} diff --git a/crates/zed/src/languages/elixir/embedding.scm b/crates/zed/src/languages/elixir/embedding.scm new file mode 100644 index 0000000000000000000000000000000000000000..16ad20746d4b0c8697ff126fcc5150636cb8b794 --- /dev/null +++ b/crates/zed/src/languages/elixir/embedding.scm @@ -0,0 +1,27 @@ +( + (unary_operator + operator: "@" + operand: (call + target: (identifier) @unary + (#match? @unary "^(doc)$")) + ) @context + . + (call + target: (identifier) @name + (arguments + [ + (identifier) @name + (call + target: (identifier) @name) + (binary_operator + left: (call + target: (identifier) @name) + operator: "when") + ]) + (#match? @name "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp)$")) @item + ) + + (call + target: (identifier) @name + (arguments (alias) @name) + (#match? 
@name "^(defmodule|defprotocol)$")) @item From 8b42f5b1b379e175a599067654724b8d6ea48f35 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 17:06:10 -0400 Subject: [PATCH 15/34] rename vector_store crate to semantic_index --- Cargo.lock | 274 +++++++++--------- Cargo.toml | 2 +- assets/settings/default.json | 4 +- .../Cargo.toml | 4 +- .../README.md | 0 .../src/db.rs | 16 +- .../src/embedding.rs | 0 .../src/modal.rs | 12 +- .../src/parsing.rs | 0 .../src/semantic_index.rs} | 33 ++- .../src/semantic_index_settings.rs} | 10 +- .../src/semantic_index_tests.rs} | 10 +- crates/zed/Cargo.toml | 2 +- crates/zed/src/main.rs | 2 +- 14 files changed, 186 insertions(+), 183 deletions(-) rename crates/{vector_store => semantic_index}/Cargo.toml (96%) rename crates/{vector_store => semantic_index}/README.md (100%) rename crates/{vector_store => semantic_index}/src/db.rs (95%) rename crates/{vector_store => semantic_index}/src/embedding.rs (100%) rename crates/{vector_store => semantic_index}/src/modal.rs (95%) rename crates/{vector_store => semantic_index}/src/parsing.rs (100%) rename crates/{vector_store/src/vector_store.rs => semantic_index/src/semantic_index.rs} (96%) rename crates/{vector_store/src/vector_store_settings.rs => semantic_index/src/semantic_index_settings.rs} (71%) rename crates/{vector_store/src/vector_store_tests.rs => semantic_index/src/semantic_index_tests.rs} (99%) diff --git a/Cargo.lock b/Cargo.lock index 8fcca507d1fa47309793843f2268a87ff59a2e49..430a665f98b2a7f353855b9645c2e148dd02fb4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" [[package]] name = "allocator-api2" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" [[package]] name = "alsa" 
@@ -277,9 +277,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" [[package]] name = "arrayref" @@ -481,7 +481,7 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -529,7 +529,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -572,7 +572,7 @@ checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -680,7 +680,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.8", + "itoa 1.0.9", "matchit", "memchr", "mime", @@ -830,7 +830,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.25", + "syn 2.0.26", "which", ] @@ -1243,20 +1243,20 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.11" +version = "4.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "98330784c494e49850cb23b8e2afcca13587d2500b2e3f1f78ae20248059c9be" dependencies = [ "clap_builder", - "clap_derive 4.3.2", + "clap_derive 4.3.12", "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "e182eb5f2562a67dda37e2c57af64d720a9e010c5e860ed87c056586aeafa52e" dependencies = [ "anstream", "anstyle", @@ -1279,14 +1279,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = 
"4.3.2" +version = "4.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050" dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -1357,7 +1357,7 @@ dependencies = [ "tiny_http", "url", "util", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -2204,9 +2204,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30" +checksum = "304e6508efa593091e97a9abbc10f90aa7ca635b6d2784feff3c89d41dd12272" [[package]] name = "editor" @@ -2319,9 +2319,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "erased-serde" -version = "0.3.27" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f94c0e13118e7d7533271f754a168ae8400e6a1cc043f2bfd53cc7290f1a1de3" +checksum = "da96524cc884f6558f1769b6c46686af2fe8e8b4cd253bd5a3cdba8181b8e070" dependencies = [ "serde", ] @@ -2789,7 +2789,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -3033,7 +3033,7 @@ dependencies = [ "tiny-skia", "usvg", "util", - "uuid 1.4.0", + "uuid 1.4.1", "waker-fn", ] @@ -3235,7 +3235,7 @@ checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes 1.4.0", "fnv", - "itoa 1.0.8", + "itoa 1.0.9", ] [[package]] @@ -3294,7 +3294,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.8", + "itoa 1.0.9", "pin-project-lite 0.2.10", "socket2", "tokio", @@ -3499,7 +3499,7 @@ dependencies = [ "rand 0.7.3", "serde", "tempfile", - "uuid 1.4.0", + "uuid 1.4.1", 
"winapi 0.3.9", ] @@ -3576,9 +3576,9 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "ittapi-rs" @@ -4722,7 +4722,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -4886,9 +4886,9 @@ dependencies = [ [[package]] name = "paste" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4b27ab7be369122c218afc2079489cdcb4b517c0a3fc386ff11e1fedfcc2b35" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pathfinder_color" @@ -4952,9 +4952,9 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pest" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73935e4d55e2abf7f130186537b19e7a4abc886a0252380b59248af473a3fc9" +checksum = "0d2d1d55045829d65aad9d389139882ad623b33b904e7c9f1b10c5b8927298e5" dependencies = [ "thiserror", "ucd-trie", @@ -5010,7 +5010,7 @@ checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -5163,7 +5163,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92139198957b410250d43fad93e630d956499a625c527eda65175c8680f83387" dependencies = [ "proc-macro2", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -5211,9 +5211,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.64" +version = "1.0.66" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] @@ -5491,9 +5491,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.29" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "5fe8a65d69dd0808184ebb5f836ab526bb259db23c657efa38711b1072ee47f0" dependencies = [ "proc-macro2", ] @@ -5879,7 +5879,7 @@ dependencies = [ "rkyv_derive", "seahash", "tinyvec", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -6034,7 +6034,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.25", + "syn 2.0.26", "walkdir", ] @@ -6097,7 +6097,7 @@ dependencies = [ "bitflags 1.3.2", "errno 0.2.8", "io-lifetimes 0.5.3", - "itoa 1.0.8", + "itoa 1.0.9", "libc", "linux-raw-sys 0.0.42", "once_cell", @@ -6167,9 +6167,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc31bd9b61a32c31f9650d18add92aa83a49ba979c143eefd27fe7177b05bd5f" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" [[package]] name = "rustybuzz" @@ -6189,9 +6189,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe232bdf6be8c8de797b22184ee71118d63780ea42ac85b61d1baa6d3b782ae9" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" [[package]] name = "safe_arch" @@ -6267,9 +6267,9 @@ checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scrypt" @@ -6329,7 +6329,7 @@ dependencies = [ "time 0.3.23", "tracing", "url", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -6355,7 +6355,7 @@ dependencies = [ "sea-query-derive", "serde_json", "time 0.3.23", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -6370,7 +6370,7 @@ dependencies = [ "serde_json", "sqlx", "time 0.3.23", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -6465,6 +6465,48 @@ dependencies = [ "libc", ] +[[package]] +name = "semantic_index" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "ctor", + "editor", + "env_logger 0.9.3", + "futures 0.3.28", + "gpui", + "isahc", + "language", + "lazy_static", + "log", + "matrixmultiply", + "parking_lot 0.11.2", + "picker", + "project", + "rand 0.8.5", + "rpc", + "rusqlite", + "schemars", + "serde", + "serde_json", + "settings", + "smol", + "tempdir", + "theme", + "tiktoken-rs 0.5.0", + "tree-sitter", + "tree-sitter-cpp", + "tree-sitter-elixir 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter-rust", + "tree-sitter-toml 0.20.0", + "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", + "unindent", + "util", + "workspace", +] + [[package]] name = "semver" version = "0.11.0" @@ -6506,7 +6548,7 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -6531,12 +6573,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.102" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed" +checksum = "d03b412469450d4404fe8499a268edd7f8b79fecb074b0d812ad64ca21f4031b" dependencies = [ "indexmap 2.0.0", - "itoa 
1.0.8", + "itoa 1.0.9", "ryu", "serde", ] @@ -6561,7 +6603,7 @@ checksum = "1d89a8107374290037607734c0b73a85db7ed80cae314b3c5791f192a496e731" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -6571,7 +6613,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.8", + "itoa 1.0.9", "ryu", "serde", ] @@ -6702,9 +6744,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" +checksum = "b824b6e687aff278cdbf3b36f07aa52d4bd4099699324d5da86a2ebce3aa00b3" dependencies = [ "libc", "signal-hook-registry", @@ -6891,7 +6933,7 @@ dependencies = [ "parking_lot 0.11.2", "smol", "thread_local", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -6957,7 +6999,7 @@ dependencies = [ "hkdf", "hmac 0.12.1", "indexmap 1.9.3", - "itoa 1.0.8", + "itoa 1.0.9", "libc", "libsqlite3-sys", "log", @@ -6983,7 +7025,7 @@ dependencies = [ "time 0.3.23", "tokio-stream", "url", - "uuid 1.4.0", + "uuid 1.4.1", "webpki-roots 0.22.6", "whoami", ] @@ -7041,9 +7083,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "stringprep" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +checksum = "db3737bde7edce97102e0e2b15365bf7a20bfdb5f60f4f9e8d7004258a51a8da" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -7103,7 +7145,7 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dc09e9364c2045ab5fa38f7b04d077b3359d30c4c2b3ec4bae67a358bd64326" dependencies = [ - 
"itoa 1.0.8", + "itoa 1.0.9", "ryu", "sval", ] @@ -7114,7 +7156,7 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ada6f627e38cbb8860283649509d87bc4a5771141daa41c78fd31f2b9485888d" dependencies = [ - "itoa 1.0.8", + "itoa 1.0.9", "ryu", "sval", ] @@ -7229,9 +7271,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.25" +version = "2.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e3fc8c0c74267e2df136e5e5fb656a464158aa57624053375eb9c8c6e25ae2" +checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970" dependencies = [ "proc-macro2", "quote", @@ -7485,7 +7527,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -7562,7 +7604,7 @@ version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59e399c068f43a5d116fedaf73b203fa4f9c519f17e2b34f63221d3792f81446" dependencies = [ - "itoa 1.0.8", + "itoa 1.0.9", "serde", "time-core", "time-macros", @@ -7674,7 +7716,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -7767,9 +7809,9 @@ checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" [[package]] name = "toml_edit" -version = "0.19.12" +version = "0.19.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c500344a19072298cd05a7224b3c0c629348b78692bf48466c5238656e315a78" +checksum = "f8123f27e969974a3dfba720fdb560be359f57b44302d280ba72e76a74480e8a" dependencies = [ "indexmap 2.0.0", "toml_datetime", @@ -7879,7 +7921,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -8285,9 +8327,9 @@ checksum = 
"7f9af028e052a610d99e066b33304625dea9613170a2563314490a4e6ec5cf7f" [[package]] name = "unicode-ident" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" [[package]] name = "unicode-normalization" @@ -8427,9 +8469,9 @@ checksum = "bcc7e3b898aa6f6c08e5295b6c89258d1331e9ac578cc992fb818759951bdc22" [[package]] name = "uuid" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d023da39d1fde5a8a3fe1f3e01ca9632ada0a63e9797de55a879d6e2236277be" +checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" dependencies = [ "getrandom 0.2.10", "serde", @@ -8496,48 +8538,6 @@ dependencies = [ "workspace", ] -[[package]] -name = "vector_store" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-trait", - "bincode", - "ctor", - "editor", - "env_logger 0.9.3", - "futures 0.3.28", - "gpui", - "isahc", - "language", - "lazy_static", - "log", - "matrixmultiply", - "parking_lot 0.11.2", - "picker", - "project", - "rand 0.8.5", - "rpc", - "rusqlite", - "schemars", - "serde", - "serde_json", - "settings", - "smol", - "tempdir", - "theme", - "tiktoken-rs 0.5.0", - "tree-sitter", - "tree-sitter-cpp", - "tree-sitter-elixir 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tree-sitter-rust", - "tree-sitter-toml 0.20.0", - "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", - "unindent", - "util", - "workspace", -] - [[package]] name = "version_check" version = "0.9.4" @@ -8698,7 +8698,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", "wasm-bindgen-shared", ] @@ -8732,7 +8732,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.25", + "syn 2.0.26", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -8745,9 +8745,9 @@ checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-encoder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f8e9778e04cbf44f58acc301372577375a666b966c50b03ef46144f80436a8" +checksum = "06a3d1b4a575ffb873679402b2aedb3117555eb65c27b1b86c8a91e574bc2a2a" dependencies = [ "leb128", ] @@ -8969,9 +8969,9 @@ dependencies = [ [[package]] name = "wast" -version = "61.0.0" +version = "62.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6b347851b52fd500657d301155c79e8c67595501d179cef87b6f04ebd25ac4" +checksum = "c7f7ee878019d69436895f019b65f62c33da63595d8e857cbdc87c13ecb29a32" dependencies = [ "leb128", "memchr", @@ -8981,11 +8981,11 @@ dependencies = [ [[package]] name = "wat" -version = "1.0.67" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459e764d27c3ab7beba1ebd617cc025c7e76dea6e7c5ce3189989a970aea3491" +checksum = "295572bf24aa5b685a971a83ad3e8b6e684aaad8a9be24bc7bf59bed84cc1c08" dependencies = [ - "wast 61.0.0", + "wast 62.0.0", ] [[package]] @@ -9315,9 +9315,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "winnow" -version = "0.4.9" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81a2094c43cc94775293eaa0e499fbc30048a6d824ac82c0351a8c0bf9112529" +checksum = "81fac9742fd1ad1bd9643b991319f72dd031016d44b77039a26977eb667141e7" dependencies = [ "memchr", ] @@ -9399,7 +9399,7 @@ dependencies = [ "terminal", "theme", "util", - "uuid 1.4.0", + "uuid 1.4.1", ] [[package]] @@ -9447,7 +9447,7 @@ name = "xtask" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.11", + "clap 4.3.14", "schemars", "serde_json", "theme", @@ -9548,6 +9548,7 @@ dependencies = [ "rsa", 
"rust-embed", "search", + "semantic_index", "serde", "serde_derive", "serde_json", @@ -9589,8 +9590,7 @@ dependencies = [ "url", "urlencoding", "util", - "uuid 1.4.0", - "vector_store", + "uuid 1.4.1", "vim", "welcome", "workspace", @@ -9621,7 +9621,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 529f297f700d5006df3c169f3e663144bc24f9d1..ce3dd9c46221db7f189a66843162fbb483e68aa4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,7 +63,7 @@ members = [ "crates/theme", "crates/theme_selector", "crates/util", - "crates/vector_store", + "crates/semantic_index", "crates/vim", "crates/vcs_menu", "crates/workspace", diff --git a/assets/settings/default.json b/assets/settings/default.json index 1f8d12a3d9db82a87ca1788fe8369dbfb42b7596..b109b8d595a1e184a668ff2fa226d1ffa7ce2f70 100644 --- a/assets/settings/default.json +++ b/assets/settings/default.json @@ -291,8 +291,8 @@ // the terminal will default to matching the buffer's font family. 
// "font_family": "Zed Mono" }, - // Difference settings for vector_store - "vector_store": { + // Difference settings for semantic_index + "semantic_index": { "enabled": false, "reindexing_delay_seconds": 600 }, diff --git a/crates/vector_store/Cargo.toml b/crates/semantic_index/Cargo.toml similarity index 96% rename from crates/vector_store/Cargo.toml rename to crates/semantic_index/Cargo.toml index 6808f6c630ca8dda97fce819765995b78f3d2a9a..5c5af072c8f614e8eb8111d31c72bf9bbf905ada 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "vector_store" +name = "semantic_index" version = "0.1.0" edition = "2021" publish = false [lib] -path = "src/vector_store.rs" +path = "src/semantic_index.rs" doctest = false [dependencies] diff --git a/crates/vector_store/README.md b/crates/semantic_index/README.md similarity index 100% rename from crates/vector_store/README.md rename to crates/semantic_index/README.md diff --git a/crates/vector_store/src/db.rs b/crates/semantic_index/src/db.rs similarity index 95% rename from crates/vector_store/src/db.rs rename to crates/semantic_index/src/db.rs index d3d05f8c62c9d5639e641094204caa112e96c54f..1d5a9a475ea826cdb7baa91406b83b2189f95587 100644 --- a/crates/vector_store/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -1,4 +1,4 @@ -use crate::{parsing::Document, VECTOR_STORE_VERSION}; +use crate::{parsing::Document, SEMANTIC_INDEX_VERSION}; use anyhow::{anyhow, Result}; use project::Fs; use rpc::proto::Timestamp; @@ -55,7 +55,9 @@ impl VectorDatabase { } fn get_existing_version(&self) -> Result { - let mut version_query = self.db.prepare("SELECT version from vector_store_config")?; + let mut version_query = self + .db + .prepare("SELECT version from semantic_index_config")?; version_query .query_row([], |row| Ok(row.get::<_, i64>(0)?)) .map_err(|err| anyhow!("version query failed: {err}")) @@ -66,7 +68,7 @@ impl VectorDatabase { if self .get_existing_version() - 
.map_or(false, |version| version == VECTOR_STORE_VERSION as i64) + .map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64) { return Ok(()); } @@ -74,7 +76,7 @@ impl VectorDatabase { self.db .execute( " - DROP TABLE vector_store_config; + DROP TABLE semantic_index_config; DROP TABLE worktrees; DROP TABLE files; DROP TABLE documents; @@ -85,15 +87,15 @@ impl VectorDatabase { // Initialize Vector Databasing Tables self.db.execute( - "CREATE TABLE vector_store_config ( + "CREATE TABLE semantic_index_config ( version INTEGER NOT NULL )", [], )?; self.db.execute( - "INSERT INTO vector_store_config (version) VALUES (?1)", - params![VECTOR_STORE_VERSION], + "INSERT INTO semantic_index_config (version) VALUES (?1)", + params![SEMANTIC_INDEX_VERSION], )?; self.db.execute( diff --git a/crates/vector_store/src/embedding.rs b/crates/semantic_index/src/embedding.rs similarity index 100% rename from crates/vector_store/src/embedding.rs rename to crates/semantic_index/src/embedding.rs diff --git a/crates/vector_store/src/modal.rs b/crates/semantic_index/src/modal.rs similarity index 95% rename from crates/vector_store/src/modal.rs rename to crates/semantic_index/src/modal.rs index 2981fa4e73ef77ce3b54b68da9b177452f6d245e..ffc64a195ccfb23009922f71878c17ea90b1e375 100644 --- a/crates/vector_store/src/modal.rs +++ b/crates/semantic_index/src/modal.rs @@ -1,4 +1,4 @@ -use crate::{SearchResult, VectorStore}; +use crate::{SearchResult, SemanticIndex}; use editor::{scroll::autoscroll::Autoscroll, Editor}; use gpui::{ actions, elements::*, AnyElement, AppContext, ModelHandle, MouseState, Task, ViewContext, @@ -20,7 +20,7 @@ pub type SemanticSearch = Picker; pub struct SemanticSearchDelegate { workspace: WeakViewHandle, project: ModelHandle, - vector_store: ModelHandle, + semantic_index: ModelHandle, selected_match_index: usize, matches: Vec, history: HashMap>, @@ -33,12 +33,12 @@ impl SemanticSearchDelegate { pub fn new( workspace: WeakViewHandle, project: ModelHandle, - 
vector_store: ModelHandle, + semantic_index: ModelHandle, ) -> Self { Self { workspace, project, - vector_store, + semantic_index, selected_match_index: 0, matches: vec![], history: HashMap::new(), @@ -105,7 +105,7 @@ impl PickerDelegate for SemanticSearchDelegate { return Task::ready(()); } - let vector_store = self.vector_store.clone(); + let semantic_index = self.semantic_index.clone(); let project = self.project.clone(); cx.spawn(|this, mut cx| async move { cx.background().timer(EMBEDDING_DEBOUNCE_INTERVAL).await; @@ -123,7 +123,7 @@ impl PickerDelegate for SemanticSearchDelegate { if let Some(retrieved) = retrieved_cached.log_err() { if !retrieved { - let task = vector_store.update(&mut cx, |store, cx| { + let task = semantic_index.update(&mut cx, |store, cx| { store.search_project(project.clone(), query.to_string(), 10, cx) }); diff --git a/crates/vector_store/src/parsing.rs b/crates/semantic_index/src/parsing.rs similarity index 100% rename from crates/vector_store/src/parsing.rs rename to crates/semantic_index/src/parsing.rs diff --git a/crates/vector_store/src/vector_store.rs b/crates/semantic_index/src/semantic_index.rs similarity index 96% rename from crates/vector_store/src/vector_store.rs rename to crates/semantic_index/src/semantic_index.rs index 0f55bd9e63f3a95ce113478a406f485600348973..58ffa512ce6e6714cb2af2ea3593d1f7eb96c534 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -2,12 +2,12 @@ mod db; mod embedding; mod modal; mod parsing; -mod vector_store_settings; +mod semantic_index_settings; #[cfg(test)] -mod vector_store_tests; +mod semantic_index_tests; -use crate::vector_store_settings::VectorStoreSettings; +use crate::semantic_index_settings::SemanticIndexSettings; use anyhow::{anyhow, Result}; use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; @@ -40,7 +40,7 @@ use util::{ }; use workspace::{Workspace, WorkspaceCreated}; -const VECTOR_STORE_VERSION: usize = 1; 
+const SEMANTIC_INDEX_VERSION: usize = 1; const EMBEDDINGS_BATCH_SIZE: usize = 150; pub fn init( @@ -49,7 +49,7 @@ pub fn init( language_registry: Arc, cx: &mut AppContext, ) { - settings::register::(cx); + settings::register::(cx); let db_file_path = EMBEDDINGS_DIR .join(Path::new(RELEASE_CHANNEL_NAME.as_str())) @@ -58,14 +58,14 @@ pub fn init( SemanticSearch::init(cx); cx.add_action( |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext| { - if cx.has_global::>() { - let vector_store = cx.global::>().clone(); + if cx.has_global::>() { + let semantic_index = cx.global::>().clone(); workspace.toggle_modal(cx, |workspace, cx| { let project = workspace.project().clone(); let workspace = cx.weak_handle(); cx.add_view(|cx| { SemanticSearch::new( - SemanticSearchDelegate::new(workspace, project, vector_store), + SemanticSearchDelegate::new(workspace, project, semantic_index), cx, ) }) @@ -75,13 +75,14 @@ pub fn init( ); if *RELEASE_CHANNEL == ReleaseChannel::Stable - || !settings::get::(cx).enabled + || !settings::get::(cx).enabled { + log::info!("NOT ENABLED"); return; } cx.spawn(move |mut cx| async move { - let vector_store = VectorStore::new( + let semantic_index = SemanticIndex::new( fs, db_file_path, Arc::new(OpenAIEmbeddings { @@ -94,15 +95,15 @@ pub fn init( .await?; cx.update(|cx| { - cx.set_global(vector_store.clone()); + cx.set_global(semantic_index.clone()); cx.subscribe_global::({ - let vector_store = vector_store.clone(); + let semantic_index = semantic_index.clone(); move |event, cx| { let workspace = &event.0; if let Some(workspace) = workspace.upgrade(cx) { let project = workspace.read(cx).project().clone(); if project.read(cx).is_local() { - vector_store.update(cx, |store, cx| { + semantic_index.update(cx, |store, cx| { store.index_project(project, cx).detach(); }); } @@ -117,7 +118,7 @@ pub fn init( .detach(); } -pub struct VectorStore { +pub struct SemanticIndex { fs: Arc, database_url: Arc, embedding_provider: Arc, @@ -220,7 +221,7 @@ enum 
EmbeddingJob { Flush, } -impl VectorStore { +impl SemanticIndex { async fn new( fs: Arc, database_url: PathBuf, @@ -672,7 +673,7 @@ impl VectorStore { } } -impl Entity for VectorStore { +impl Entity for SemanticIndex { type Event = (); } diff --git a/crates/vector_store/src/vector_store_settings.rs b/crates/semantic_index/src/semantic_index_settings.rs similarity index 71% rename from crates/vector_store/src/vector_store_settings.rs rename to crates/semantic_index/src/semantic_index_settings.rs index e1fa7cc05a362829fae1a361097740d04b115b6c..86872457f841e1bfe1b601d1fb6d5d86a12911dc 100644 --- a/crates/vector_store/src/vector_store_settings.rs +++ b/crates/semantic_index/src/semantic_index_settings.rs @@ -4,21 +4,21 @@ use serde::{Deserialize, Serialize}; use settings::Setting; #[derive(Deserialize, Debug)] -pub struct VectorStoreSettings { +pub struct SemanticIndexSettings { pub enabled: bool, pub reindexing_delay_seconds: usize, } #[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)] -pub struct VectorStoreSettingsContent { +pub struct SemanticIndexSettingsContent { pub enabled: Option, pub reindexing_delay_seconds: Option, } -impl Setting for VectorStoreSettings { - const KEY: Option<&'static str> = Some("vector_store"); +impl Setting for SemanticIndexSettings { + const KEY: Option<&'static str> = Some("semantic_index"); - type FileContent = VectorStoreSettingsContent; + type FileContent = SemanticIndexSettingsContent; fn load( default_value: &Self::FileContent, diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs similarity index 99% rename from crates/vector_store/src/vector_store_tests.rs rename to crates/semantic_index/src/semantic_index_tests.rs index d55dfcfc7151eb04efb5a33b119ae33b0875d86d..ed48cf256bed1bce335c942b2508d486acf82ce0 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -2,8 +2,8 @@ use crate::{ db::dot, 
embedding::EmbeddingProvider, parsing::{CodeContextRetriever, Document}, - vector_store_settings::VectorStoreSettings, - VectorStore, + semantic_index_settings::SemanticIndexSettings, + SemanticIndex, }; use anyhow::Result; use async_trait::async_trait; @@ -30,10 +30,10 @@ fn init_logger() { } #[gpui::test] -async fn test_vector_store(cx: &mut TestAppContext) { +async fn test_semantic_index(cx: &mut TestAppContext) { cx.update(|cx| { cx.set_global(SettingsStore::test(cx)); - settings::register::(cx); + settings::register::(cx); settings::register::(cx); }); @@ -74,7 +74,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { let db_path = db_dir.path().join("db.sqlite"); let embedding_provider = Arc::new(FakeEmbeddingProvider::default()); - let store = VectorStore::new( + let store = SemanticIndex::new( fs.clone(), db_path, embedding_provider.clone(), diff --git a/crates/zed/Cargo.toml b/crates/zed/Cargo.toml index 597e40161fb029eee16cf53208ce0e20d0c0a603..265312bc9a9de5de76465fc2e7e737bc4cb52a4f 100644 --- a/crates/zed/Cargo.toml +++ b/crates/zed/Cargo.toml @@ -64,7 +64,7 @@ terminal_view = { path = "../terminal_view" } theme = { path = "../theme" } theme_selector = { path = "../theme_selector" } util = { path = "../util" } -vector_store = { path = "../vector_store" } +semantic_index = { path = "../semantic_index" } vim = { path = "../vim" } workspace = { path = "../workspace" } welcome = { path = "../welcome" } diff --git a/crates/zed/src/main.rs b/crates/zed/src/main.rs index 4c75d370d517423e395119d2ceb4f3c47b61a21b..3598da5dee2eb9c96b18474bcef5e8e763cbae14 100644 --- a/crates/zed/src/main.rs +++ b/crates/zed/src/main.rs @@ -157,7 +157,7 @@ fn main() { project_panel::init(cx); diagnostics::init(cx); search::init(cx); - vector_store::init(fs.clone(), http.clone(), languages.clone(), cx); + semantic_index::init(fs.clone(), http.clone(), languages.clone(), cx); vim::init(cx); terminal_view::init(cx); copilot::init(http.clone(), node_runtime, cx); From 
d83c4ffb072081d0b07f62f3c90f3bff5be48509 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 17 Jul 2023 17:09:51 -0400 Subject: [PATCH 16/34] remove debug logging for enabled settings --- crates/semantic_index/src/semantic_index.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 58ffa512ce6e6714cb2af2ea3593d1f7eb96c534..b59b20370aff967de5b2c805da5c693993e0c23e 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -77,7 +77,6 @@ pub fn init( if *RELEASE_CHANNEL == ReleaseChannel::Stable || !settings::get::(cx).enabled { - log::info!("NOT ENABLED"); return; } From afc4c10ec1162b151c33a9ffe051233dca10a5e5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Mon, 17 Jul 2023 18:10:51 -0700 Subject: [PATCH 17/34] Start work on exposing semantic search via project search view Co-authored-by: Kyle --- Cargo.lock | 2 + crates/search/Cargo.toml | 1 + crates/search/src/project_search.rs | 156 +++++- crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/db.rs | 12 +- crates/semantic_index/src/embedding.rs | 7 +- crates/semantic_index/src/modal.rs | 172 ------- crates/semantic_index/src/semantic_index.rs | 451 +++++++++--------- .../src/semantic_index_tests.rs | 18 +- 9 files changed, 397 insertions(+), 423 deletions(-) delete mode 100644 crates/semantic_index/src/modal.rs diff --git a/Cargo.lock b/Cargo.lock index 430a665f98b2a7f353855b9645c2e148dd02fb4b..484ef3644b1cdddab26755c4eaf293154bbbcb3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6430,6 +6430,7 @@ dependencies = [ "menu", "postage", "project", + "semantic_index", "serde", "serde_derive", "serde_json", @@ -6484,6 +6485,7 @@ dependencies = [ "matrixmultiply", "parking_lot 0.11.2", "picker", + "postage", "project", "rand 0.8.5", "rpc", diff --git a/crates/search/Cargo.toml b/crates/search/Cargo.toml index 
7ef388f7c087638c1ee3f5c2002ab3d2c3371dc7..f6ed6c3fef4bf3a566e27cdd46b7169405a72c97 100644 --- a/crates/search/Cargo.toml +++ b/crates/search/Cargo.toml @@ -19,6 +19,7 @@ settings = { path = "../settings" } theme = { path = "../theme" } util = { path = "../util" } workspace = { path = "../workspace" } +semantic_index = { path = "../semantic_index" } anyhow.workspace = true futures.workspace = true log.workspace = true diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index ebd504d02c2334aa6876a478937718cb1aa4d496..91d2b142ae27c3b1579d99b04fe6cb8b6f745705 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -2,7 +2,7 @@ use crate::{ SearchOption, SelectNextMatch, SelectPrevMatch, ToggleCaseSensitive, ToggleRegex, ToggleWholeWord, }; -use anyhow::Result; +use anyhow::{Context, Result}; use collections::HashMap; use editor::{ items::active_match_index, scroll::autoscroll::Autoscroll, Anchor, Editor, MultiBuffer, @@ -18,7 +18,9 @@ use gpui::{ Task, View, ViewContext, ViewHandle, WeakModelHandle, WeakViewHandle, }; use menu::Confirm; +use postage::stream::Stream; use project::{search::SearchQuery, Project}; +use semantic_index::SemanticIndex; use smallvec::SmallVec; use std::{ any::{Any, TypeId}, @@ -36,7 +38,10 @@ use workspace::{ ItemNavHistory, Pane, ToolbarItemLocation, ToolbarItemView, Workspace, WorkspaceId, }; -actions!(project_search, [SearchInNew, ToggleFocus, NextField]); +actions!( + project_search, + [SearchInNew, ToggleFocus, NextField, ToggleSemanticSearch] +); #[derive(Default)] struct ActiveSearches(HashMap, WeakViewHandle>); @@ -92,6 +97,7 @@ pub struct ProjectSearchView { case_sensitive: bool, whole_word: bool, regex: bool, + semantic: Option, panels_with_errors: HashSet, active_match_index: Option, search_id: usize, @@ -100,6 +106,13 @@ pub struct ProjectSearchView { excluded_files_editor: ViewHandle, } +struct SemanticSearchState { + file_count: usize, + 
outstanding_file_count: usize, + _progress_task: Task<()>, + search_task: Option>>, +} + pub struct ProjectSearchBar { active_project_search: Option>, subscription: Option, @@ -198,12 +211,25 @@ impl View for ProjectSearchView { let theme = theme::current(cx).clone(); let text = if self.query_editor.read(cx).text(cx).is_empty() { - "" + Cow::Borrowed("") + } else if let Some(semantic) = &self.semantic { + if semantic.search_task.is_some() { + Cow::Borrowed("Searching...") + } else if semantic.outstanding_file_count > 0 { + Cow::Owned(format!( + "Indexing. {} of {}...", + semantic.file_count - semantic.outstanding_file_count, + semantic.file_count + )) + } else { + Cow::Borrowed("Indexing complete") + } } else if model.pending_search.is_some() { - "Searching..." + Cow::Borrowed("Searching...") } else { - "No results" + Cow::Borrowed("No results") }; + MouseEventHandler::::new(0, cx, |_, _| { Label::new(text, theme.search.results_status.clone()) .aligned() @@ -499,6 +525,7 @@ impl ProjectSearchView { case_sensitive, whole_word, regex, + semantic: None, panels_with_errors: HashSet::new(), active_match_index: None, query_editor_was_focused: false, @@ -563,6 +590,35 @@ impl ProjectSearchView { } fn search(&mut self, cx: &mut ViewContext) { + if let Some(semantic) = &mut self.semantic { + if semantic.outstanding_file_count > 0 { + return; + } + + let search_phrase = self.query_editor.read(cx).text(cx); + let project = self.model.read(cx).project.clone(); + if let Some(semantic_index) = SemanticIndex::global(cx) { + let search_task = semantic_index.update(cx, |semantic_index, cx| { + semantic_index.search_project(project, search_phrase, 10, cx) + }); + semantic.search_task = Some(cx.spawn(|this, mut cx| async move { + let results = search_task.await.context("search task")?; + + this.update(&mut cx, |this, cx| { + dbg!(&results); + // TODO: Update results + + if let Some(semantic) = &mut this.semantic { + semantic.search_task = None; + } + })?; + + anyhow::Ok(()) + })); + 
} + return; + } + if let Some(query) = self.build_search_query(cx) { self.model.update(cx, |model, cx| model.search(query, cx)); } @@ -876,6 +932,59 @@ impl ProjectSearchBar { } } + fn toggle_semantic_search(&mut self, cx: &mut ViewContext) -> bool { + if let Some(search_view) = self.active_project_search.as_ref() { + search_view.update(cx, |search_view, cx| { + if search_view.semantic.is_some() { + search_view.semantic = None; + } else if let Some(semantic_index) = SemanticIndex::global(cx) { + // TODO: confirm that it's ok to send this project + + let project = search_view.model.read(cx).project.clone(); + let index_task = semantic_index.update(cx, |semantic_index, cx| { + semantic_index.index_project(project, cx) + }); + + cx.spawn(|search_view, mut cx| async move { + let (files_to_index, mut files_remaining_rx) = index_task.await?; + + search_view.update(&mut cx, |search_view, cx| { + search_view.semantic = Some(SemanticSearchState { + file_count: files_to_index, + outstanding_file_count: files_to_index, + search_task: None, + _progress_task: cx.spawn(|search_view, mut cx| async move { + while let Some(count) = files_remaining_rx.recv().await { + search_view + .update(&mut cx, |search_view, cx| { + if let Some(semantic_search_state) = + &mut search_view.semantic + { + semantic_search_state.outstanding_file_count = + count; + cx.notify(); + if count == 0 { + return; + } + } + }) + .ok(); + } + }), + }); + })?; + anyhow::Ok(()) + }) + .detach_and_log_err(cx); + } + }); + cx.notify(); + true + } else { + false + } + } + fn render_nav_button( &self, icon: &'static str, @@ -953,6 +1062,42 @@ impl ProjectSearchBar { .into_any() } + fn render_semantic_search_button(&self, cx: &mut ViewContext) -> AnyElement { + let tooltip_style = theme::current(cx).tooltip.clone(); + let is_active = if let Some(search) = self.active_project_search.as_ref() { + let search = search.read(cx); + search.semantic.is_some() + } else { + false + }; + + let region_id = 3; + + 
MouseEventHandler::::new(region_id, cx, |state, cx| { + let theme = theme::current(cx); + let style = theme + .search + .option_button + .in_state(is_active) + .style_for(state); + Label::new("Semantic", style.text.clone()) + .contained() + .with_style(style.container) + }) + .on_click(MouseButton::Left, move |_, this, cx| { + this.toggle_semantic_search(cx); + }) + .with_cursor_style(CursorStyle::PointingHand) + .with_tooltip::( + region_id, + format!("Toggle Semantic Search"), + Some(Box::new(ToggleSemanticSearch)), + tooltip_style, + cx, + ) + .into_any() + } + fn is_option_enabled(&self, option: SearchOption, cx: &AppContext) -> bool { if let Some(search) = self.active_project_search.as_ref() { let search = search.read(cx); @@ -1049,6 +1194,7 @@ impl View for ProjectSearchBar { ) .with_child( Flex::row() + .with_child(self.render_semantic_search_button(cx)) .with_child(self.render_option_button( "Case", SearchOption::CaseSensitive, diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 5c5af072c8f614e8eb8111d31c72bf9bbf905ada..2d21ff6c1c42710e597101cd024fdde9183bcbc5 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -20,6 +20,7 @@ editor = { path = "../editor" } rpc = { path = "../rpc" } settings = { path = "../settings" } anyhow.workspace = true +postage.workspace = true futures.workspace = true smol.workspace = true rusqlite = { version = "0.27.0", features = ["blob", "array", "modern_sqlite"] } diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index 1d5a9a475ea826cdb7baa91406b83b2189f95587..a667ff877c2e02c65e669e10b7fdbc07e319653b 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -1,5 +1,5 @@ use crate::{parsing::Document, SEMANTIC_INDEX_VERSION}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use project::Fs; use rpc::proto::Timestamp; use rusqlite::{ @@ -76,14 +76,14 @@ impl VectorDatabase { self.db 
.execute( " - DROP TABLE semantic_index_config; - DROP TABLE worktrees; - DROP TABLE files; - DROP TABLE documents; + DROP TABLE IF EXISTS documents; + DROP TABLE IF EXISTS files; + DROP TABLE IF EXISTS worktrees; + DROP TABLE IF EXISTS semantic_index_config; ", [], ) - .ok(); + .context("failed to drop tables")?; // Initialize Vector Databasing Tables self.db.execute( diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index ea349c8afa4a8d908d60760f8ff1eb6839e3120b..4f49d66ce7eefeb70961fcaab936edf102b715b9 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -86,6 +86,7 @@ impl OpenAIEmbeddings { async fn send_request(&self, api_key: &str, spans: Vec<&str>) -> Result> { let request = Request::post("https://api.openai.com/v1/embeddings") .redirect_policy(isahc::config::RedirectPolicy::Follow) + .timeout(Duration::from_secs(4)) .header("Content-Type", "application/json") .header("Authorization", format!("Bearer {}", api_key)) .body( @@ -133,7 +134,11 @@ impl EmbeddingProvider for OpenAIEmbeddings { self.executor.timer(delay).await; } StatusCode::BAD_REQUEST => { - log::info!("BAD REQUEST: {:?}", &response.status()); + log::info!( + "BAD REQUEST: {:?} {:?}", + &response.status(), + response.body() + ); // Don't worry about delaying bad request, as we can assume // we haven't been rate limited yet. 
for span in spans.iter_mut() { diff --git a/crates/semantic_index/src/modal.rs b/crates/semantic_index/src/modal.rs deleted file mode 100644 index ffc64a195ccfb23009922f71878c17ea90b1e375..0000000000000000000000000000000000000000 --- a/crates/semantic_index/src/modal.rs +++ /dev/null @@ -1,172 +0,0 @@ -use crate::{SearchResult, SemanticIndex}; -use editor::{scroll::autoscroll::Autoscroll, Editor}; -use gpui::{ - actions, elements::*, AnyElement, AppContext, ModelHandle, MouseState, Task, ViewContext, - WeakViewHandle, -}; -use picker::{Picker, PickerDelegate, PickerEvent}; -use project::{Project, ProjectPath}; -use std::{collections::HashMap, sync::Arc, time::Duration}; -use util::ResultExt; -use workspace::Workspace; - -const MIN_QUERY_LEN: usize = 5; -const EMBEDDING_DEBOUNCE_INTERVAL: Duration = Duration::from_millis(500); - -actions!(semantic_search, [Toggle]); - -pub type SemanticSearch = Picker; - -pub struct SemanticSearchDelegate { - workspace: WeakViewHandle, - project: ModelHandle, - semantic_index: ModelHandle, - selected_match_index: usize, - matches: Vec, - history: HashMap>, -} - -impl SemanticSearchDelegate { - // This is currently searching on every keystroke, - // This is wildly overkill, and has the potential to get expensive - // We will need to update this to throttle searching - pub fn new( - workspace: WeakViewHandle, - project: ModelHandle, - semantic_index: ModelHandle, - ) -> Self { - Self { - workspace, - project, - semantic_index, - selected_match_index: 0, - matches: vec![], - history: HashMap::new(), - } - } -} - -impl PickerDelegate for SemanticSearchDelegate { - fn placeholder_text(&self) -> Arc { - "Search repository in natural language...".into() - } - - fn confirm(&mut self, cx: &mut ViewContext) { - if let Some(search_result) = self.matches.get(self.selected_match_index) { - // Open Buffer - let search_result = search_result.clone(); - let buffer = self.project.update(cx, |project, cx| { - project.open_buffer( - ProjectPath { - 
worktree_id: search_result.worktree_id, - path: search_result.file_path.clone().into(), - }, - cx, - ) - }); - - let workspace = self.workspace.clone(); - let position = search_result.clone().byte_range.start; - cx.spawn(|_, mut cx| async move { - let buffer = buffer.await?; - workspace.update(&mut cx, |workspace, cx| { - let editor = workspace.open_project_item::(buffer, cx); - editor.update(cx, |editor, cx| { - editor.change_selections(Some(Autoscroll::center()), cx, |s| { - s.select_ranges([position..position]) - }); - }); - })?; - Ok::<_, anyhow::Error>(()) - }) - .detach_and_log_err(cx); - cx.emit(PickerEvent::Dismiss); - } - } - - fn dismissed(&mut self, _cx: &mut ViewContext) {} - - fn match_count(&self) -> usize { - self.matches.len() - } - - fn selected_index(&self) -> usize { - self.selected_match_index - } - - fn set_selected_index(&mut self, ix: usize, _cx: &mut ViewContext) { - self.selected_match_index = ix; - } - - fn update_matches(&mut self, query: String, cx: &mut ViewContext) -> Task<()> { - log::info!("Searching for {:?}...", query); - if query.len() < MIN_QUERY_LEN { - log::info!("Query below minimum length"); - return Task::ready(()); - } - - let semantic_index = self.semantic_index.clone(); - let project = self.project.clone(); - cx.spawn(|this, mut cx| async move { - cx.background().timer(EMBEDDING_DEBOUNCE_INTERVAL).await; - - let retrieved_cached = this.update(&mut cx, |this, _| { - let delegate = this.delegate_mut(); - if delegate.history.contains_key(&query) { - let historic_results = delegate.history.get(&query).unwrap().to_owned(); - delegate.matches = historic_results.clone(); - true - } else { - false - } - }); - - if let Some(retrieved) = retrieved_cached.log_err() { - if !retrieved { - let task = semantic_index.update(&mut cx, |store, cx| { - store.search_project(project.clone(), query.to_string(), 10, cx) - }); - - if let Some(results) = task.await.log_err() { - log::info!("Not queried previously, searching..."); - 
this.update(&mut cx, |this, _| { - let delegate = this.delegate_mut(); - delegate.matches = results.clone(); - delegate.history.insert(query, results); - }) - .ok(); - } - } else { - log::info!("Already queried, retrieved directly from cached history"); - } - } - }) - } - - fn render_match( - &self, - ix: usize, - mouse_state: &mut MouseState, - selected: bool, - cx: &AppContext, - ) -> AnyElement> { - let theme = theme::current(cx); - let style = &theme.picker.item; - let current_style = style.in_state(selected).style_for(mouse_state); - - let search_result = &self.matches[ix]; - - let path = search_result.file_path.to_string_lossy(); - let name = search_result.name.clone(); - - Flex::column() - .with_child(Text::new(name, current_style.label.text.clone()).with_soft_wrap(false)) - .with_child(Label::new( - path.to_string(), - style.inactive_state().default.label.clone(), - )) - .contained() - .with_style(current_style.container) - .into_any() - } -} diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index b59b20370aff967de5b2c805da5c693993e0c23e..e6443870aa5312b4a7ea4ecfe72c134841923c66 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -1,6 +1,5 @@ mod db; mod embedding; -mod modal; mod parsing; mod semantic_index_settings; @@ -12,25 +11,20 @@ use anyhow::{anyhow, Result}; use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; -use gpui::{ - AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext, - WeakModelHandle, -}; +use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; use language::{Language, LanguageRegistry}; -use modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; +use postage::watch; use project::{Fs, 
Project, WorktreeId}; use smol::channel; use std::{ - collections::{HashMap, HashSet}, + collections::HashMap, + mem, ops::Range, path::{Path, PathBuf}, - sync::{ - atomic::{self, AtomicUsize}, - Arc, Weak, - }, - time::{Instant, SystemTime}, + sync::{Arc, Weak}, + time::SystemTime, }; use util::{ channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}, @@ -38,9 +32,8 @@ use util::{ paths::EMBEDDINGS_DIR, ResultExt, }; -use workspace::{Workspace, WorkspaceCreated}; -const SEMANTIC_INDEX_VERSION: usize = 1; +const SEMANTIC_INDEX_VERSION: usize = 3; const EMBEDDINGS_BATCH_SIZE: usize = 150; pub fn init( @@ -55,25 +48,6 @@ pub fn init( .join(Path::new(RELEASE_CHANNEL_NAME.as_str())) .join("embeddings_db"); - SemanticSearch::init(cx); - cx.add_action( - |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext| { - if cx.has_global::>() { - let semantic_index = cx.global::>().clone(); - workspace.toggle_modal(cx, |workspace, cx| { - let project = workspace.project().clone(); - let workspace = cx.weak_handle(); - cx.add_view(|cx| { - SemanticSearch::new( - SemanticSearchDelegate::new(workspace, project, semantic_index), - cx, - ) - }) - }); - } - }, - ); - if *RELEASE_CHANNEL == ReleaseChannel::Stable || !settings::get::(cx).enabled { @@ -95,21 +69,6 @@ pub fn init( cx.update(|cx| { cx.set_global(semantic_index.clone()); - cx.subscribe_global::({ - let semantic_index = semantic_index.clone(); - move |event, cx| { - let workspace = &event.0; - if let Some(workspace) = workspace.upgrade(cx) { - let project = workspace.read(cx).project().clone(); - if project.read(cx).is_local() { - semantic_index.update(cx, |store, cx| { - store.index_project(project, cx).detach(); - }); - } - } - } - }) - .detach(); }); anyhow::Ok(()) @@ -128,20 +87,17 @@ pub struct SemanticIndex { _embed_batch_task: Task<()>, _batch_files_task: Task<()>, _parsing_files_tasks: Vec>, - next_job_id: Arc, projects: HashMap, ProjectState>, } struct ProjectState { worktree_db_ids: Vec<(WorktreeId, 
i64)>, - outstanding_jobs: Arc>>, + outstanding_job_count_rx: watch::Receiver, + outstanding_job_count_tx: Arc>>, } -type JobId = usize; - struct JobHandle { - id: JobId, - set: Weak>>, + tx: Weak>>, } impl ProjectState { @@ -221,6 +177,14 @@ enum EmbeddingJob { } impl SemanticIndex { + pub fn global(cx: &AppContext) -> Option> { + if cx.has_global::>() { + Some(cx.global::>().clone()) + } else { + None + } + } + async fn new( fs: Arc, database_url: PathBuf, @@ -236,184 +200,69 @@ impl SemanticIndex { .await?; Ok(cx.add_model(|cx| { - // paths_tx -> embeddings_tx -> db_update_tx - - //db_update_tx/rx: Updating Database + // Perform database operations let (db_update_tx, db_update_rx) = channel::unbounded(); - let _db_update_task = cx.background().spawn(async move { - while let Ok(job) = db_update_rx.recv().await { - match job { - DbOperation::InsertFile { - worktree_id, - documents, - path, - mtime, - job_handle, - } => { - db.insert_file(worktree_id, path, mtime, documents) - .log_err(); - drop(job_handle) - } - DbOperation::Delete { worktree_id, path } => { - db.delete_file(worktree_id, path).log_err(); - } - DbOperation::FindOrCreateWorktree { path, sender } => { - let id = db.find_or_create_worktree(&path); - sender.send(id).ok(); - } - DbOperation::FileMTimes { - worktree_id: worktree_db_id, - sender, - } => { - let file_mtimes = db.get_file_mtimes(worktree_db_id); - sender.send(file_mtimes).ok(); - } + let _db_update_task = cx.background().spawn({ + async move { + while let Ok(job) = db_update_rx.recv().await { + Self::run_db_operation(&db, job) } } }); - // embed_tx/rx: Embed Batch and Send to Database + // Group documents into batches and send them to the embedding provider. 
let (embed_batch_tx, embed_batch_rx) = channel::unbounded::, PathBuf, SystemTime, JobHandle)>>(); let _embed_batch_task = cx.background().spawn({ let db_update_tx = db_update_tx.clone(); let embedding_provider = embedding_provider.clone(); async move { - while let Ok(mut embeddings_queue) = embed_batch_rx.recv().await { - // Construct Batch - let mut batch_documents = vec![]; - for (_, documents, _, _, _) in embeddings_queue.iter() { - batch_documents - .extend(documents.iter().map(|document| document.content.as_str())); - } - - if let Ok(embeddings) = - embedding_provider.embed_batch(batch_documents).await - { - log::trace!( - "created {} embeddings for {} files", - embeddings.len(), - embeddings_queue.len(), - ); - - let mut i = 0; - let mut j = 0; - - for embedding in embeddings.iter() { - while embeddings_queue[i].1.len() == j { - i += 1; - j = 0; - } - - embeddings_queue[i].1[j].embedding = embedding.to_owned(); - j += 1; - } - - for (worktree_id, documents, path, mtime, job_handle) in - embeddings_queue.into_iter() - { - for document in documents.iter() { - // TODO: Update this so it doesn't panic - assert!( - document.embedding.len() > 0, - "Document Embedding Not Complete" - ); - } - - db_update_tx - .send(DbOperation::InsertFile { - worktree_id, - documents, - path, - mtime, - job_handle, - }) - .await - .unwrap(); - } - } + while let Ok(embeddings_queue) = embed_batch_rx.recv().await { + Self::compute_embeddings_for_batch( + embeddings_queue, + &embedding_provider, + &db_update_tx, + ) + .await; } } }); - // batch_tx/rx: Batch Files to Send for Embeddings + // Group documents into batches and send them to the embedding provider. 
let (batch_files_tx, batch_files_rx) = channel::unbounded::(); let _batch_files_task = cx.background().spawn(async move { let mut queue_len = 0; let mut embeddings_queue = vec![]; - while let Ok(job) = batch_files_rx.recv().await { - let should_flush = match job { - EmbeddingJob::Enqueue { - documents, - worktree_id, - path, - mtime, - job_handle, - } => { - queue_len += &documents.len(); - embeddings_queue.push(( - worktree_id, - documents, - path, - mtime, - job_handle, - )); - queue_len >= EMBEDDINGS_BATCH_SIZE - } - EmbeddingJob::Flush => true, - }; - - if should_flush { - embed_batch_tx.try_send(embeddings_queue).unwrap(); - embeddings_queue = vec![]; - queue_len = 0; - } + Self::enqueue_documents_to_embed( + job, + &mut queue_len, + &mut embeddings_queue, + &embed_batch_tx, + ); } }); - // parsing_files_tx/rx: Parsing Files to Embeddable Documents + // Parse files into embeddable documents. let (parsing_files_tx, parsing_files_rx) = channel::unbounded::(); - let mut _parsing_files_tasks = Vec::new(); for _ in 0..cx.background().num_cpus() { let fs = fs.clone(); let parsing_files_rx = parsing_files_rx.clone(); let batch_files_tx = batch_files_tx.clone(); + let db_update_tx = db_update_tx.clone(); _parsing_files_tasks.push(cx.background().spawn(async move { let mut retriever = CodeContextRetriever::new(); while let Ok(pending_file) = parsing_files_rx.recv().await { - if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() - { - if let Some(documents) = retriever - .parse_file( - &pending_file.relative_path, - &content, - pending_file.language, - ) - .log_err() - { - log::trace!( - "parsed path {:?}: {} documents", - pending_file.relative_path, - documents.len() - ); - - batch_files_tx - .try_send(EmbeddingJob::Enqueue { - worktree_id: pending_file.worktree_db_id, - path: pending_file.relative_path, - mtime: pending_file.modified_time, - job_handle: pending_file.job_handle, - documents, - }) - .unwrap(); - } - } - - if parsing_files_rx.len() 
== 0 { - batch_files_tx.try_send(EmbeddingJob::Flush).unwrap(); - } + Self::parse_file( + &fs, + pending_file, + &mut retriever, + &batch_files_tx, + &parsing_files_rx, + &db_update_tx, + ) + .await; } })); } @@ -424,7 +273,6 @@ impl SemanticIndex { embedding_provider, language_registry, db_update_tx, - next_job_id: Default::default(), parsing_files_tx, _db_update_task, _embed_batch_task, @@ -435,6 +283,167 @@ impl SemanticIndex { })) } + fn run_db_operation(db: &VectorDatabase, job: DbOperation) { + match job { + DbOperation::InsertFile { + worktree_id, + documents, + path, + mtime, + job_handle, + } => { + db.insert_file(worktree_id, path, mtime, documents) + .log_err(); + drop(job_handle) + } + DbOperation::Delete { worktree_id, path } => { + db.delete_file(worktree_id, path).log_err(); + } + DbOperation::FindOrCreateWorktree { path, sender } => { + let id = db.find_or_create_worktree(&path); + sender.send(id).ok(); + } + DbOperation::FileMTimes { + worktree_id: worktree_db_id, + sender, + } => { + let file_mtimes = db.get_file_mtimes(worktree_db_id); + sender.send(file_mtimes).ok(); + } + } + } + + async fn compute_embeddings_for_batch( + mut embeddings_queue: Vec<(i64, Vec, PathBuf, SystemTime, JobHandle)>, + embedding_provider: &Arc, + db_update_tx: &channel::Sender, + ) { + let mut batch_documents = vec![]; + for (_, documents, _, _, _) in embeddings_queue.iter() { + batch_documents.extend(documents.iter().map(|document| document.content.as_str())); + } + + if let Ok(embeddings) = embedding_provider.embed_batch(batch_documents).await { + log::trace!( + "created {} embeddings for {} files", + embeddings.len(), + embeddings_queue.len(), + ); + + let mut i = 0; + let mut j = 0; + + for embedding in embeddings.iter() { + while embeddings_queue[i].1.len() == j { + i += 1; + j = 0; + } + + embeddings_queue[i].1[j].embedding = embedding.to_owned(); + j += 1; + } + + for (worktree_id, documents, path, mtime, job_handle) in embeddings_queue.into_iter() { + // for 
document in documents.iter() { + // // TODO: Update this so it doesn't panic + // assert!( + // document.embedding.len() > 0, + // "Document Embedding Not Complete" + // ); + // } + + db_update_tx + .send(DbOperation::InsertFile { + worktree_id, + documents, + path, + mtime, + job_handle, + }) + .await + .unwrap(); + } + } + } + + fn enqueue_documents_to_embed( + job: EmbeddingJob, + queue_len: &mut usize, + embeddings_queue: &mut Vec<(i64, Vec, PathBuf, SystemTime, JobHandle)>, + embed_batch_tx: &channel::Sender, PathBuf, SystemTime, JobHandle)>>, + ) { + let should_flush = match job { + EmbeddingJob::Enqueue { + documents, + worktree_id, + path, + mtime, + job_handle, + } => { + *queue_len += &documents.len(); + embeddings_queue.push((worktree_id, documents, path, mtime, job_handle)); + *queue_len >= EMBEDDINGS_BATCH_SIZE + } + EmbeddingJob::Flush => true, + }; + + if should_flush { + embed_batch_tx + .try_send(mem::take(embeddings_queue)) + .unwrap(); + *queue_len = 0; + } + } + + async fn parse_file( + fs: &Arc, + pending_file: PendingFile, + retriever: &mut CodeContextRetriever, + batch_files_tx: &channel::Sender, + parsing_files_rx: &channel::Receiver, + db_update_tx: &channel::Sender, + ) { + if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() { + if let Some(documents) = retriever + .parse_file(&pending_file.relative_path, &content, pending_file.language) + .log_err() + { + log::trace!( + "parsed path {:?}: {} documents", + pending_file.relative_path, + documents.len() + ); + + if documents.len() == 0 { + db_update_tx + .send(DbOperation::InsertFile { + worktree_id: pending_file.worktree_db_id, + documents, + path: pending_file.relative_path, + mtime: pending_file.modified_time, + job_handle: pending_file.job_handle, + }) + .await + .unwrap(); + } else { + batch_files_tx + .try_send(EmbeddingJob::Enqueue { + worktree_id: pending_file.worktree_db_id, + path: pending_file.relative_path, + mtime: pending_file.modified_time, + 
job_handle: pending_file.job_handle, + documents, + }) + .unwrap(); + } + } + } + + if parsing_files_rx.len() == 0 { + batch_files_tx.try_send(EmbeddingJob::Flush).unwrap(); + } + } + fn find_or_create_worktree(&self, path: PathBuf) -> impl Future> { let (tx, rx) = oneshot::channel(); self.db_update_tx @@ -457,11 +466,11 @@ impl SemanticIndex { async move { rx.await? } } - fn index_project( + pub fn index_project( &mut self, project: ModelHandle, cx: &mut ModelContext, - ) -> Task> { + ) -> Task)>> { let worktree_scans_complete = project .read(cx) .worktrees(cx) @@ -483,7 +492,6 @@ impl SemanticIndex { let language_registry = self.language_registry.clone(); let db_update_tx = self.db_update_tx.clone(); let parsing_files_tx = self.parsing_files_tx.clone(); - let next_job_id = self.next_job_id.clone(); cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; @@ -509,8 +517,8 @@ impl SemanticIndex { ); } - // let mut pending_files: Vec<(PathBuf, ((i64, PathBuf, Arc, SystemTime), SystemTime))> = vec![]; - let outstanding_jobs = Arc::new(Mutex::new(HashSet::new())); + let (job_count_tx, job_count_rx) = watch::channel_with(0); + let job_count_tx = Arc::new(Mutex::new(job_count_tx)); this.update(&mut cx, |this, _| { this.projects.insert( project.downgrade(), @@ -519,7 +527,8 @@ impl SemanticIndex { .iter() .map(|(a, b)| (*a, *b)) .collect(), - outstanding_jobs: outstanding_jobs.clone(), + outstanding_job_count_rx: job_count_rx.clone(), + outstanding_job_count_tx: job_count_tx.clone(), }, ); }); @@ -527,7 +536,6 @@ impl SemanticIndex { cx.background() .spawn(async move { let mut count = 0; - let t0 = Instant::now(); for worktree in worktrees.into_iter() { let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); for file in worktree.files(false, 0) { @@ -552,14 +560,11 @@ impl SemanticIndex { .map_or(false, |existing_mtime| existing_mtime == file.mtime); if !already_stored { - log::trace!("sending for parsing: {:?}", 
path_buf); count += 1; - let job_id = next_job_id.fetch_add(1, atomic::Ordering::SeqCst); + *job_count_tx.lock().borrow_mut() += 1; let job_handle = JobHandle { - id: job_id, - set: Arc::downgrade(&outstanding_jobs), + tx: Arc::downgrade(&job_count_tx), }; - outstanding_jobs.lock().insert(job_id); parsing_files_tx .try_send(PendingFile { worktree_db_id: db_ids_by_worktree_id[&worktree.id()], @@ -582,27 +587,22 @@ impl SemanticIndex { .unwrap(); } } - log::trace!( - "parsing worktree completed in {:?}", - t0.elapsed().as_millis() - ); - Ok(count) + anyhow::Ok((count, job_count_rx)) }) .await }) } - pub fn remaining_files_to_index_for_project( + pub fn outstanding_job_count_rx( &self, project: &ModelHandle, - ) -> Option { + ) -> Option> { Some( self.projects .get(&project.downgrade())? - .outstanding_jobs - .lock() - .len(), + .outstanding_job_count_rx + .clone(), ) } @@ -678,8 +678,9 @@ impl Entity for SemanticIndex { impl Drop for JobHandle { fn drop(&mut self) { - if let Some(set) = self.set.upgrade() { - set.lock().remove(&self.id); + if let Some(tx) = self.tx.upgrade() { + let mut tx = tx.lock(); + *tx.borrow_mut() -= 1; } } } diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index ed48cf256bed1bce335c942b2508d486acf82ce0..2ccc52d64b598e56be41a0aae5284517c9f0b36b 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -88,18 +88,13 @@ async fn test_semantic_index(cx: &mut TestAppContext) { let worktree_id = project.read_with(cx, |project, cx| { project.worktrees(cx).next().unwrap().read(cx).id() }); - let file_count = store + let (file_count, outstanding_file_count) = store .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await .unwrap(); assert_eq!(file_count, 3); cx.foreground().run_until_parked(); - store.update(cx, |store, _cx| { - assert_eq!( - store.remaining_files_to_index_for_project(&project), - Some(0) - 
); - }); + assert_eq!(*outstanding_file_count.borrow(), 0); let search_results = store .update(cx, |store, cx| { @@ -128,19 +123,14 @@ async fn test_semantic_index(cx: &mut TestAppContext) { cx.foreground().run_until_parked(); let prev_embedding_count = embedding_provider.embedding_count(); - let file_count = store + let (file_count, outstanding_file_count) = store .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await .unwrap(); assert_eq!(file_count, 1); cx.foreground().run_until_parked(); - store.update(cx, |store, _cx| { - assert_eq!( - store.remaining_files_to_index_for_project(&project), - Some(0) - ); - }); + assert_eq!(*outstanding_file_count.borrow(), 0); assert_eq!( embedding_provider.embedding_count() - prev_embedding_count, From ed1b1a5ccd58610111eba38373b6ff42a1e05792 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 18 Jul 2023 11:00:21 -0400 Subject: [PATCH 18/34] update logging for open ai embedding and remove redundant truncation --- crates/semantic_index/src/embedding.rs | 34 ++++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 5e2025b644c5be8e7089955a80819f7b71229f3e..d41350f321d3b13aea217b322163cf73cca07269 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -67,13 +67,17 @@ impl EmbeddingProvider for DummyEmbeddings { } } -const INPUT_LIMIT: usize = 8190; +const OPENAI_INPUT_LIMIT: usize = 8190; impl OpenAIEmbeddings { + pub fn new(client: Arc, executor: Arc) -> Self { + Self { client, executor } + } + fn truncate(span: String) -> String { let mut tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span.as_ref()); - if tokens.len() > INPUT_LIMIT { - tokens.truncate(INPUT_LIMIT); + if tokens.len() > OPENAI_INPUT_LIMIT { + tokens.truncate(OPENAI_INPUT_LIMIT); let result = OPENAI_BPE_TOKENIZER.decode(tokens.clone()); if result.is_ok() { let transformed = 
result.unwrap(); @@ -115,6 +119,7 @@ impl EmbeddingProvider for OpenAIEmbeddings { .ok_or_else(|| anyhow!("no api key"))?; let mut request_number = 0; + let mut truncated = false; let mut response: Response; let mut spans: Vec = spans.iter().map(|x| x.to_string()).collect(); while request_number < MAX_RETRIES { @@ -136,15 +141,18 @@ impl EmbeddingProvider for OpenAIEmbeddings { self.executor.timer(delay).await; } StatusCode::BAD_REQUEST => { - log::info!( - "BAD REQUEST: {:?} {:?}", - &response.status(), - response.body() - ); - // Don't worry about delaying bad request, as we can assume - // we haven't been rate limited yet. - for span in spans.iter_mut() { - *span = Self::truncate(span.to_string()); + // Only truncate if it hasnt been truncated before + if !truncated { + for span in spans.iter_mut() { + *span = Self::truncate(span.clone()); + } + truncated = true; + } else { + // If failing once already truncated, log the error and break the loop + let mut body = String::new(); + response.body_mut().read_to_string(&mut body).await?; + log::trace!("open ai bad request: {:?} {:?}", &response.status(), body); + break; } } StatusCode::OK => { @@ -152,7 +160,7 @@ impl EmbeddingProvider for OpenAIEmbeddings { response.body_mut().read_to_string(&mut body).await?; let response: OpenAIEmbeddingResponse = serde_json::from_str(&body)?; - log::info!( + log::trace!( "openai embedding completed. 
tokens: {:?}", response.usage.total_tokens ); From 80ef92a3e158618d9dcc255fe8689b8597aacb4d Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 18 Jul 2023 11:14:13 -0400 Subject: [PATCH 19/34] fix db schema update process to ensure all tables are dropped --- crates/semantic_index/src/db.rs | 25 ++++++++++++--------- crates/semantic_index/src/semantic_index.rs | 10 +-------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index a667ff877c2e02c65e669e10b7fdbc07e319653b..74e1021b152f4bdaf36e3b780d9288bed374466b 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -66,24 +66,28 @@ impl VectorDatabase { fn initialize_database(&self) -> Result<()> { rusqlite::vtab::array::load_module(&self.db)?; + // Delete existing tables, if SEMANTIC_INDEX_VERSION is bumped if self .get_existing_version() .map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64) { + log::trace!("vector database schema up to date"); return Ok(()); } + log::trace!("vector database schema out of date. 
updating..."); self.db - .execute( - " - DROP TABLE IF EXISTS documents; - DROP TABLE IF EXISTS files; - DROP TABLE IF EXISTS worktrees; - DROP TABLE IF EXISTS semantic_index_config; - ", - [], - ) - .context("failed to drop tables")?; + .execute("DROP TABLE IF EXISTS documents", []) + .context("failed to drop 'documents' table")?; + self.db + .execute("DROP TABLE IF EXISTS files", []) + .context("failed to drop 'files' table")?; + self.db + .execute("DROP TABLE IF EXISTS worktrees", []) + .context("failed to drop 'worktrees' table")?; + self.db + .execute("DROP TABLE IF EXISTS semantic_index_config", []) + .context("failed to drop 'semantic_index_config' table")?; // Initialize Vector Databasing Tables self.db.execute( @@ -133,6 +137,7 @@ impl VectorDatabase { [], )?; + log::trace!("vector database initialized with updated schema."); Ok(()) } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index e6443870aa5312b4a7ea4ecfe72c134841923c66..f6575f6ad7188dbaf7ba56160d72cc12f678de10 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -33,7 +33,7 @@ use util::{ ResultExt, }; -const SEMANTIC_INDEX_VERSION: usize = 3; +const SEMANTIC_INDEX_VERSION: usize = 4; const EMBEDDINGS_BATCH_SIZE: usize = 150; pub fn init( @@ -344,14 +344,6 @@ impl SemanticIndex { } for (worktree_id, documents, path, mtime, job_handle) in embeddings_queue.into_iter() { - // for document in documents.iter() { - // // TODO: Update this so it doesn't panic - // assert!( - // document.embedding.len() > 0, - // "Document Embedding Not Complete" - // ); - // } - db_update_tx .send(DbOperation::InsertFile { worktree_id, From 8d0614ce741a7cd279777bd16dcff6349105f077 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Jul 2023 11:44:58 -0700 Subject: [PATCH 20/34] Populate project search results multi-buffer from semantic search Co-authored-by: Kyle --- crates/search/src/project_search.rs | 
73 +++++++++++++------ crates/semantic_index/src/db.rs | 16 ++-- crates/semantic_index/src/embedding.rs | 5 -- crates/semantic_index/src/semantic_index.rs | 68 +++++++++-------- .../src/semantic_index_tests.rs | 15 ++-- 5 files changed, 104 insertions(+), 73 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 91d2b142ae27c3b1579d99b04fe6cb8b6f745705..1097969c00efca6c025da6193251e480b943d7aa 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -2,7 +2,7 @@ use crate::{ SearchOption, SelectNextMatch, SelectPrevMatch, ToggleCaseSensitive, ToggleRegex, ToggleWholeWord, }; -use anyhow::{Context, Result}; +use anyhow::Result; use collections::HashMap; use editor::{ items::active_match_index, scroll::autoscroll::Autoscroll, Anchor, Editor, MultiBuffer, @@ -187,6 +187,53 @@ impl ProjectSearch { })); cx.notify(); } + + fn semantic_search(&mut self, query: String, cx: &mut ModelContext) -> Option<()> { + let project = self.project.clone(); + let semantic_index = SemanticIndex::global(cx)?; + let search_task = semantic_index.update(cx, |semantic_index, cx| { + semantic_index.search_project(project, query.clone(), 10, cx) + }); + + self.search_id += 1; + // self.active_query = Some(query); + self.match_ranges.clear(); + self.pending_search = Some(cx.spawn(|this, mut cx| async move { + let results = search_task.await.log_err()?; + + let (_task, mut match_ranges) = this.update(&mut cx, |this, cx| { + this.excerpts.update(cx, |excerpts, cx| { + excerpts.clear(cx); + + let matches = results + .into_iter() + .map(|result| (result.buffer, vec![result.range])) + .collect(); + + excerpts.stream_excerpts_with_context_lines(matches, 3, cx) + }) + }); + + while let Some(match_range) = match_ranges.next().await { + this.update(&mut cx, |this, cx| { + this.match_ranges.push(match_range); + while let Ok(Some(match_range)) = match_ranges.try_next() { + this.match_ranges.push(match_range); + } + 
cx.notify(); + }); + } + + this.update(&mut cx, |this, cx| { + this.pending_search.take(); + cx.notify(); + }); + + None + })); + + Some(()) + } } pub enum ViewEvent { @@ -595,27 +642,9 @@ impl ProjectSearchView { return; } - let search_phrase = self.query_editor.read(cx).text(cx); - let project = self.model.read(cx).project.clone(); - if let Some(semantic_index) = SemanticIndex::global(cx) { - let search_task = semantic_index.update(cx, |semantic_index, cx| { - semantic_index.search_project(project, search_phrase, 10, cx) - }); - semantic.search_task = Some(cx.spawn(|this, mut cx| async move { - let results = search_task.await.context("search task")?; - - this.update(&mut cx, |this, cx| { - dbg!(&results); - // TODO: Update results - - if let Some(semantic) = &mut this.semantic { - semantic.search_task = None; - } - })?; - - anyhow::Ok(()) - })); - } + let query = self.query_editor.read(cx).text(cx); + self.model + .update(cx, |model, cx| model.semantic_search(query, cx)); return; } diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index 74e1021b152f4bdaf36e3b780d9288bed374466b..fd99594aab578919f80bd8236270b352a8540993 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -252,7 +252,7 @@ impl VectorDatabase { worktree_ids: &[i64], query_embedding: &Vec, limit: usize, - ) -> Result, String)>> { + ) -> Result)>> { let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1); self.for_each_document(&worktree_ids, |id, embedding| { let similarity = dot(&embedding, &query_embedding); @@ -296,10 +296,7 @@ impl VectorDatabase { Ok(()) } - fn get_documents_by_ids( - &self, - ids: &[i64], - ) -> Result, String)>> { + fn get_documents_by_ids(&self, ids: &[i64]) -> Result)>> { let mut statement = self.db.prepare( " SELECT @@ -307,7 +304,7 @@ impl VectorDatabase { files.worktree_id, files.relative_path, documents.start_byte, - documents.end_byte, documents.name + documents.end_byte FROM documents, files WHERE @@ 
-322,14 +319,13 @@ impl VectorDatabase { row.get::<_, i64>(1)?, row.get::<_, String>(2)?.into(), row.get(3)?..row.get(4)?, - row.get(5)?, )) })?; - let mut values_by_id = HashMap::, String)>::default(); + let mut values_by_id = HashMap::)>::default(); for row in result_iter { - let (id, worktree_id, path, range, name) = row?; - values_by_id.insert(id, (worktree_id, path, range, name)); + let (id, worktree_id, path, range) = row?; + values_by_id.insert(id, (worktree_id, path, range)); } let mut results = Vec::with_capacity(ids.len()); diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index d41350f321d3b13aea217b322163cf73cca07269..728fc9283a1ebcaf13bd035ac3fd0766c9112913 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -70,10 +70,6 @@ impl EmbeddingProvider for DummyEmbeddings { const OPENAI_INPUT_LIMIT: usize = 8190; impl OpenAIEmbeddings { - pub fn new(client: Arc, executor: Arc) -> Self { - Self { client, executor } - } - fn truncate(span: String) -> String { let mut tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span.as_ref()); if tokens.len() > OPENAI_INPUT_LIMIT { @@ -81,7 +77,6 @@ impl OpenAIEmbeddings { let result = OPENAI_BPE_TOKENIZER.decode(tokens.clone()); if result.is_ok() { let transformed = result.unwrap(); - // assert_ne!(transformed, span); return transformed; } } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index f6575f6ad7188dbaf7ba56160d72cc12f678de10..5c6919d4fd46ee80d2e82515a3710cff044a4e10 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -12,7 +12,7 @@ use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; -use language::{Language, LanguageRegistry}; +use 
language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; use postage::watch; @@ -93,7 +93,7 @@ pub struct SemanticIndex { struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, outstanding_job_count_rx: watch::Receiver, - outstanding_job_count_tx: Arc>>, + _outstanding_job_count_tx: Arc>>, } struct JobHandle { @@ -135,12 +135,9 @@ pub struct PendingFile { job_handle: JobHandle, } -#[derive(Debug, Clone)] pub struct SearchResult { - pub worktree_id: WorktreeId, - pub name: String, - pub byte_range: Range, - pub file_path: PathBuf, + pub buffer: ModelHandle, + pub range: Range, } enum DbOperation { @@ -520,7 +517,7 @@ impl SemanticIndex { .map(|(a, b)| (*a, *b)) .collect(), outstanding_job_count_rx: job_count_rx.clone(), - outstanding_job_count_tx: job_count_tx.clone(), + _outstanding_job_count_tx: job_count_tx.clone(), }, ); }); @@ -623,7 +620,7 @@ impl SemanticIndex { let embedding_provider = self.embedding_provider.clone(); let database_url = self.database_url.clone(); let fs = self.fs.clone(); - cx.spawn(|this, cx| async move { + cx.spawn(|this, mut cx| async move { let documents = cx .background() .spawn(async move { @@ -640,26 +637,39 @@ impl SemanticIndex { }) .await?; - this.read_with(&cx, |this, _| { - let project_state = if let Some(state) = this.projects.get(&project.downgrade()) { - state - } else { - return Err(anyhow!("project not added")); - }; - - Ok(documents - .into_iter() - .filter_map(|(worktree_db_id, file_path, byte_range, name)| { - let worktree_id = project_state.worktree_id_for_db_id(worktree_db_id)?; - Some(SearchResult { - worktree_id, - name, - byte_range, - file_path, - }) - }) - .collect()) - }) + let mut tasks = Vec::new(); + let mut ranges = Vec::new(); + let weak_project = project.downgrade(); + project.update(&mut cx, |project, cx| { + for (worktree_db_id, file_path, byte_range) in documents { + let project_state = + if let 
Some(state) = this.read(cx).projects.get(&weak_project) { + state + } else { + return Err(anyhow!("project not added")); + }; + if let Some(worktree_id) = project_state.worktree_id_for_db_id(worktree_db_id) { + tasks.push(project.open_buffer((worktree_id, file_path), cx)); + ranges.push(byte_range); + } + } + + Ok(()) + })?; + + let buffers = futures::future::join_all(tasks).await; + + Ok(buffers + .into_iter() + .zip(ranges) + .filter_map(|(buffer, range)| { + let buffer = buffer.log_err()?; + let range = buffer.read_with(&cx, |buffer, _| { + buffer.anchor_before(range.start)..buffer.anchor_after(range.end) + }); + Some(SearchResult { buffer, range }) + }) + .collect::>()) }) } } diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 2ccc52d64b598e56be41a0aae5284517c9f0b36b..63b28798ad91d67d6786b4b420900135050dfe5b 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -8,7 +8,7 @@ use crate::{ use anyhow::Result; use async_trait::async_trait; use gpui::{Task, TestAppContext}; -use language::{Language, LanguageConfig, LanguageRegistry}; +use language::{Language, LanguageConfig, LanguageRegistry, ToOffset}; use project::{project_settings::ProjectSettings, FakeFs, Fs, Project}; use rand::{rngs::StdRng, Rng}; use serde_json::json; @@ -85,9 +85,6 @@ async fn test_semantic_index(cx: &mut TestAppContext) { .unwrap(); let project = Project::test(fs.clone(), ["/the-root".as_ref()], cx).await; - let worktree_id = project.read_with(cx, |project, cx| { - project.worktrees(cx).next().unwrap().read(cx).id() - }); let (file_count, outstanding_file_count) = store .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await @@ -103,9 +100,13 @@ async fn test_semantic_index(cx: &mut TestAppContext) { .await .unwrap(); - assert_eq!(search_results[0].byte_range.start, 0); - assert_eq!(search_results[0].name, "aaa"); - 
assert_eq!(search_results[0].worktree_id, worktree_id); + search_results[0].buffer.read_with(cx, |buffer, _cx| { + assert_eq!(search_results[0].range.start.to_offset(buffer), 0); + assert_eq!( + buffer.file().unwrap().path().as_ref(), + Path::new("file1.rs") + ); + }); fs.save( "/the-root/src/file2.rs".as_ref(), From 342dbc69459d771f802d7e77fdd7fb20f7445d1f Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Tue, 18 Jul 2023 12:01:42 -0700 Subject: [PATCH 21/34] Fix rendering of project search while semantic index is indexing or running Co-authored-by: Kyle --- crates/search/src/project_search.rs | 32 +++++++++++++---------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 1097969c00efca6c025da6193251e480b943d7aa..5feb94426eb60c67a756c564982a826699bd20a1 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -110,7 +110,6 @@ struct SemanticSearchState { file_count: usize, outstanding_file_count: usize, _progress_task: Task<()>, - search_task: Option>>, } pub struct ProjectSearchBar { @@ -188,18 +187,17 @@ impl ProjectSearch { cx.notify(); } - fn semantic_search(&mut self, query: String, cx: &mut ModelContext) -> Option<()> { - let project = self.project.clone(); - let semantic_index = SemanticIndex::global(cx)?; - let search_task = semantic_index.update(cx, |semantic_index, cx| { - semantic_index.search_project(project, query.clone(), 10, cx) + fn semantic_search(&mut self, query: String, cx: &mut ModelContext) { + let search = SemanticIndex::global(cx).map(|index| { + index.update(cx, |semantic_index, cx| { + semantic_index.search_project(self.project.clone(), query.clone(), 10, cx) + }) }); - self.search_id += 1; // self.active_query = Some(query); self.match_ranges.clear(); self.pending_search = Some(cx.spawn(|this, mut cx| async move { - let results = search_task.await.log_err()?; + let results = search?.await.log_err()?; let 
(_task, mut match_ranges) = this.update(&mut cx, |this, cx| { this.excerpts.update(cx, |excerpts, cx| { @@ -231,8 +229,7 @@ impl ProjectSearch { None })); - - Some(()) + cx.notify(); } } @@ -257,12 +254,10 @@ impl View for ProjectSearchView { enum Status {} let theme = theme::current(cx).clone(); - let text = if self.query_editor.read(cx).text(cx).is_empty() { - Cow::Borrowed("") + let text = if model.pending_search.is_some() { + Cow::Borrowed("Searching...") } else if let Some(semantic) = &self.semantic { - if semantic.search_task.is_some() { - Cow::Borrowed("Searching...") - } else if semantic.outstanding_file_count > 0 { + if semantic.outstanding_file_count > 0 { Cow::Owned(format!( "Indexing. {} of {}...", semantic.file_count - semantic.outstanding_file_count, @@ -271,8 +266,8 @@ impl View for ProjectSearchView { } else { Cow::Borrowed("Indexing complete") } - } else if model.pending_search.is_some() { - Cow::Borrowed("Searching...") + } else if self.query_editor.read(cx).text(cx).is_empty() { + Cow::Borrowed("") } else { Cow::Borrowed("No results") }; @@ -978,10 +973,10 @@ impl ProjectSearchBar { let (files_to_index, mut files_remaining_rx) = index_task.await?; search_view.update(&mut cx, |search_view, cx| { + cx.notify(); search_view.semantic = Some(SemanticSearchState { file_count: files_to_index, outstanding_file_count: files_to_index, - search_task: None, _progress_task: cx.spawn(|search_view, mut cx| async move { while let Some(count) = files_remaining_rx.recv().await { search_view @@ -1006,6 +1001,7 @@ impl ProjectSearchBar { }) .detach_and_log_err(cx); } + cx.notify(); }); cx.notify(); true From 0e071919a07967f781f87c060c5c94168a844ba6 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 18 Jul 2023 16:09:44 -0400 Subject: [PATCH 22/34] parellelize embedding api calls --- crates/semantic_index/src/embedding.rs | 6 ++- crates/semantic_index/src/semantic_index.rs | 54 ++++++++++++++------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git 
a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 728fc9283a1ebcaf13bd035ac3fd0766c9112913..77457ec7f6e34961ab2a784ef6f0d8068c4c1dbb 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -106,7 +106,7 @@ impl OpenAIEmbeddings { #[async_trait] impl EmbeddingProvider for OpenAIEmbeddings { async fn embed_batch(&self, spans: Vec<&str>) -> Result>> { - const BACKOFF_SECONDS: [usize; 3] = [65, 180, 360]; + const BACKOFF_SECONDS: [usize; 3] = [45, 75, 125]; const MAX_RETRIES: usize = 3; let api_key = OPENAI_API_KEY @@ -133,6 +133,10 @@ impl EmbeddingProvider for OpenAIEmbeddings { match response.status() { StatusCode::TOO_MANY_REQUESTS => { let delay = Duration::from_secs(BACKOFF_SECONDS[request_number - 1] as u64); + log::trace!( + "open ai rate limiting, delaying request by {:?} seconds", + delay.as_secs() + ); self.executor.timer(delay).await; } StatusCode::BAD_REQUEST => { diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 5c6919d4fd46ee80d2e82515a3710cff044a4e10..44ce45f457004c7167f8c61501c2b03ca239d199 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -24,7 +24,7 @@ use std::{ ops::Range, path::{Path, PathBuf}, sync::{Arc, Weak}, - time::SystemTime, + time::{Instant, SystemTime}, }; use util::{ channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}, @@ -34,7 +34,7 @@ use util::{ }; const SEMANTIC_INDEX_VERSION: usize = 4; -const EMBEDDINGS_BATCH_SIZE: usize = 150; +const EMBEDDINGS_BATCH_SIZE: usize = 80; pub fn init( fs: Arc, @@ -84,7 +84,7 @@ pub struct SemanticIndex { db_update_tx: channel::Sender, parsing_files_tx: channel::Sender, _db_update_task: Task<()>, - _embed_batch_task: Task<()>, + _embed_batch_tasks: Vec>, _batch_files_task: Task<()>, _parsing_files_tasks: Vec>, projects: HashMap, ProjectState>, @@ -189,6 +189,7 @@ impl SemanticIndex { 
language_registry: Arc, mut cx: AsyncAppContext, ) -> Result> { + let t0 = Instant::now(); let database_url = Arc::new(database_url); let db = cx @@ -196,7 +197,13 @@ impl SemanticIndex { .spawn(VectorDatabase::new(fs.clone(), database_url.clone())) .await?; + log::trace!( + "db initialization took {:?} milliseconds", + t0.elapsed().as_millis() + ); + Ok(cx.add_model(|cx| { + let t0 = Instant::now(); // Perform database operations let (db_update_tx, db_update_rx) = channel::unbounded(); let _db_update_task = cx.background().spawn({ @@ -210,20 +217,24 @@ impl SemanticIndex { // Group documents into batches and send them to the embedding provider. let (embed_batch_tx, embed_batch_rx) = channel::unbounded::, PathBuf, SystemTime, JobHandle)>>(); - let _embed_batch_task = cx.background().spawn({ - let db_update_tx = db_update_tx.clone(); - let embedding_provider = embedding_provider.clone(); - async move { - while let Ok(embeddings_queue) = embed_batch_rx.recv().await { - Self::compute_embeddings_for_batch( - embeddings_queue, - &embedding_provider, - &db_update_tx, - ) - .await; + let mut _embed_batch_tasks = Vec::new(); + for _ in 0..cx.background().num_cpus() { + let embed_batch_rx = embed_batch_rx.clone(); + _embed_batch_tasks.push(cx.background().spawn({ + let db_update_tx = db_update_tx.clone(); + let embedding_provider = embedding_provider.clone(); + async move { + while let Ok(embeddings_queue) = embed_batch_rx.recv().await { + Self::compute_embeddings_for_batch( + embeddings_queue, + &embedding_provider, + &db_update_tx, + ) + .await; + } } - } - }); + })); + } // Group documents into batches and send them to the embedding provider. 
let (batch_files_tx, batch_files_rx) = channel::unbounded::(); @@ -264,6 +275,10 @@ impl SemanticIndex { })); } + log::trace!( + "semantic index task initialization took {:?} milliseconds", + t0.elapsed().as_millis() + ); Self { fs, database_url, @@ -272,7 +287,7 @@ impl SemanticIndex { db_update_tx, parsing_files_tx, _db_update_task, - _embed_batch_task, + _embed_batch_tasks, _batch_files_task, _parsing_files_tasks, projects: HashMap::new(), @@ -460,6 +475,7 @@ impl SemanticIndex { project: ModelHandle, cx: &mut ModelContext, ) -> Task)>> { + let t0 = Instant::now(); let worktree_scans_complete = project .read(cx) .worktrees(cx) @@ -577,6 +593,10 @@ impl SemanticIndex { } } + log::trace!( + "walking worktree took {:?} milliseconds", + t0.elapsed().as_millis() + ); anyhow::Ok((count, job_count_rx)) }) .await From 9809ec3d706a19cd409a8a7494fabc06803e0ed7 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 19 Jul 2023 15:47:05 -0400 Subject: [PATCH 23/34] update treesitter parsing to accomodate for collapsed nested functions Co-authored-by: maxbrunsfeld --- Cargo.lock | 3 +- Cargo.toml | 2 +- crates/language/src/language.rs | 22 + crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/parsing.rs | 257 +++- crates/semantic_index/src/semantic_index.rs | 8 +- .../src/semantic_index_tests.rs | 1079 +++++++++-------- crates/zed/src/languages/rust/config.toml | 1 + crates/zed/src/languages/rust/embedding.scm | 64 +- 9 files changed, 813 insertions(+), 624 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7719eb24c228613114bf999f207629ba0c6d4664..8ea6f61da04f215b91b31941ccce795be778a204 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6486,6 +6486,7 @@ dependencies = [ "parking_lot 0.11.2", "picker", "postage", + "pretty_assertions", "project", "rand 0.8.5", "rpc", @@ -7991,7 +7992,7 @@ dependencies = [ [[package]] name = "tree-sitter" version = "0.20.10" -source = 
"git+https://github.com/tree-sitter/tree-sitter?rev=49226023693107fba9a1191136a4f47f38cdca73#49226023693107fba9a1191136a4f47f38cdca73" +source = "git+https://github.com/tree-sitter/tree-sitter?rev=1c65ca24bc9a734ab70115188f465e12eecf224e#1c65ca24bc9a734ab70115188f465e12eecf224e" dependencies = [ "cc", "regex", diff --git a/Cargo.toml b/Cargo.toml index 4b6574534845456623d8c1a6510c15817c2b6151..04f2147431ffe183de21e250885ce16b28166ec9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,7 +130,7 @@ tree-sitter-yaml = { git = "https://github.com/zed-industries/tree-sitter-yaml", tree-sitter-lua = "0.0.14" [patch.crates-io] -tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "49226023693107fba9a1191136a4f47f38cdca73" } +tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "1c65ca24bc9a734ab70115188f465e12eecf224e" } async-task = { git = "https://github.com/zed-industries/async-task", rev = "341b57d6de98cdfd7b418567b8de2022ca993a6e" } # TODO - Remove when a version is released with this PR: https://github.com/servo/core-foundation-rs/pull/457 diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index 8c6d6e9c09f17f48c58c42e3d67d144ceb7e56cb..ec233716d6ce5345515600b14e00a212b3dcb3a5 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -339,6 +339,8 @@ pub struct LanguageConfig { #[serde(default)] pub line_comment: Option>, #[serde(default)] + pub collapsed_placeholder: String, + #[serde(default)] pub block_comment: Option<(Arc, Arc)>, #[serde(default)] pub overrides: HashMap, @@ -408,6 +410,7 @@ impl Default for LanguageConfig { line_comment: Default::default(), block_comment: Default::default(), overrides: Default::default(), + collapsed_placeholder: Default::default(), } } } @@ -525,6 +528,8 @@ pub struct EmbeddingConfig { pub item_capture_ix: u32, pub name_capture_ix: u32, pub context_capture_ix: Option, + pub collapse_capture_ix: Option, + pub keep_capture_ix: Option, 
} struct InjectionConfig { @@ -1246,12 +1251,16 @@ impl Language { let mut item_capture_ix = None; let mut name_capture_ix = None; let mut context_capture_ix = None; + let mut collapse_capture_ix = None; + let mut keep_capture_ix = None; get_capture_indices( &query, &mut [ ("item", &mut item_capture_ix), ("name", &mut name_capture_ix), ("context", &mut context_capture_ix), + ("keep", &mut keep_capture_ix), + ("collapse", &mut collapse_capture_ix), ], ); if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) { @@ -1260,6 +1269,8 @@ impl Language { item_capture_ix, name_capture_ix, context_capture_ix, + collapse_capture_ix, + keep_capture_ix, }); } Ok(self) @@ -1544,9 +1555,20 @@ impl Language { pub fn grammar(&self) -> Option<&Arc> { self.grammar.as_ref() } + + pub fn default_scope(self: &Arc) -> LanguageScope { + LanguageScope { + language: self.clone(), + override_id: None, + } + } } impl LanguageScope { + pub fn collapsed_placeholder(&self) -> &str { + self.language.config.collapsed_placeholder.as_ref() + } + pub fn line_comment_prefix(&self) -> Option<&Arc> { Override::as_option( self.config_override().map(|o| &o.line_comment), diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 2d21ff6c1c42710e597101cd024fdde9183bcbc5..1b3169bfe41940eef5c863901c560e106acf4816 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -46,6 +46,7 @@ rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"]} +pretty_assertions.workspace = true rand.workspace = true unindent.workspace = true tempdir.workspace = true diff --git a/crates/semantic_index/src/parsing.rs b/crates/semantic_index/src/parsing.rs index 663f0f473b63358496c8dcbc337aa7ccbe452c76..0d2aeb60fb24dcb347c3a0f870ef6e348e08a88a 100644 --- a/crates/semantic_index/src/parsing.rs +++ 
b/crates/semantic_index/src/parsing.rs @@ -1,6 +1,6 @@ use anyhow::{anyhow, Ok, Result}; -use language::Language; -use std::{ops::Range, path::Path, sync::Arc}; +use language::{Grammar, Language}; +use std::{cmp, collections::HashSet, ops::Range, path::Path, sync::Arc}; use tree_sitter::{Parser, QueryCursor}; #[derive(Debug, PartialEq, Clone)] @@ -22,6 +22,20 @@ pub struct CodeContextRetriever { pub cursor: QueryCursor, } +// Every match has an item, this represents the fundamental treesitter symbol and anchors the search +// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication. +// If there are preceeding comments, we track this with a context capture +// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture +// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture +#[derive(Debug, Clone)] +pub struct CodeContextMatch { + pub start_col: usize, + pub item_range: Range, + pub name_range: Range, + pub context_ranges: Vec>, + pub collapse_ranges: Vec>, +} + impl CodeContextRetriever { pub fn new() -> Self { Self { @@ -49,24 +63,15 @@ impl CodeContextRetriever { }]) } - pub fn parse_file( + fn get_matches_in_file( &mut self, - relative_path: &Path, content: &str, - language: Arc, - ) -> Result> { - if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) { - return self._parse_entire_file(relative_path, language.name(), &content); - } - - let grammar = language - .grammar() - .ok_or_else(|| anyhow!("no grammar for language"))?; + grammar: &Arc, + ) -> Result> { let embedding_config = grammar .embedding_config .as_ref() .ok_or_else(|| anyhow!("no embedding queries"))?; - self.parser.set_language(grammar.ts_language).unwrap(); let tree = self @@ -74,66 +79,204 @@ impl CodeContextRetriever { .parse(&content, None) .ok_or_else(|| anyhow!("parsing failed"))?; - let mut documents = Vec::new(); - - // Iterate 
through query matches - let mut name_ranges: Vec> = vec![]; + let mut captures: Vec = Vec::new(); + let mut collapse_ranges: Vec> = Vec::new(); + let mut keep_ranges: Vec> = Vec::new(); for mat in self.cursor.matches( &embedding_config.query, tree.root_node(), content.as_bytes(), ) { - let mut name: Vec<&str> = vec![]; - let mut item: Option<&str> = None; - let mut byte_range: Option> = None; - let mut context_spans: Vec<&str> = vec![]; + let mut start_col = 0; + let mut item_range: Option> = None; + let mut name_range: Option> = None; + let mut context_ranges: Vec> = Vec::new(); + collapse_ranges.clear(); + keep_ranges.clear(); for capture in mat.captures { if capture.index == embedding_config.item_capture_ix { - byte_range = Some(capture.node.byte_range()); - item = content.get(capture.node.byte_range()); + item_range = Some(capture.node.byte_range()); + start_col = capture.node.start_position().column; } else if capture.index == embedding_config.name_capture_ix { - let name_range = capture.node.byte_range(); - if name_ranges.contains(&name_range) { - continue; - } - name_ranges.push(name_range.clone()); - if let Some(name_content) = content.get(name_range.clone()) { - name.push(name_content); - } + name_range = Some(capture.node.byte_range()); + } else if Some(capture.index) == embedding_config.context_capture_ix { + context_ranges.push(capture.node.byte_range()); + } else if Some(capture.index) == embedding_config.collapse_capture_ix { + collapse_ranges.push(capture.node.byte_range()); + } else if Some(capture.index) == embedding_config.keep_capture_ix { + keep_ranges.push(capture.node.byte_range()); } + } - if let Some(context_capture_ix) = embedding_config.context_capture_ix { - if capture.index == context_capture_ix { - if let Some(context) = content.get(capture.node.byte_range()) { - context_spans.push(context); - } - } + if item_range.is_some() && name_range.is_some() { + let item_range = item_range.unwrap(); + captures.push(CodeContextMatch { + start_col, 
+ item_range, + name_range: name_range.unwrap(), + context_ranges, + collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges), + }); + } + } + Ok(captures) + } + + pub fn parse_file_with_template( + &mut self, + relative_path: &Path, + content: &str, + language: Arc, + ) -> Result> { + let language_name = language.name(); + let mut documents = self.parse_file(relative_path, content, language)?; + for document in &mut documents { + document.content = CODE_CONTEXT_TEMPLATE + .replace("", relative_path.to_string_lossy().as_ref()) + .replace("", language_name.as_ref()) + .replace("item", &document.content); + } + Ok(documents) + } + + pub fn parse_file( + &mut self, + relative_path: &Path, + content: &str, + language: Arc, + ) -> Result> { + if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) { + return self._parse_entire_file(relative_path, language.name(), &content); + } + + let grammar = language + .grammar() + .ok_or_else(|| anyhow!("no grammar for language"))?; + + // Iterate through query matches + let matches = self.get_matches_in_file(content, grammar)?; + + let language_scope = language.default_scope(); + let placeholder = language_scope.collapsed_placeholder(); + + let mut documents = Vec::new(); + let mut collapsed_ranges_within = Vec::new(); + let mut parsed_name_ranges = HashSet::new(); + for (i, context_match) in matches.iter().enumerate() { + if parsed_name_ranges.contains(&context_match.name_range) { + continue; + } + + collapsed_ranges_within.clear(); + for remaining_match in &matches[(i + 1)..] 
{ + if context_match + .item_range + .contains(&remaining_match.item_range.start) + && context_match + .item_range + .contains(&remaining_match.item_range.end) + { + collapsed_ranges_within.extend(remaining_match.collapse_ranges.iter().cloned()); + } else { + break; } } - if let Some((item, byte_range)) = item.zip(byte_range) { - if !name.is_empty() { - let item = if context_spans.is_empty() { - item.to_string() - } else { - format!("{}\n{}", context_spans.join("\n"), item) - }; - - let document_text = CODE_CONTEXT_TEMPLATE - .replace("", relative_path.to_str().unwrap()) - .replace("", &language.name().to_lowercase()) - .replace("", item.as_str()); - - documents.push(Document { - range: byte_range, - content: document_text, - embedding: Vec::new(), - name: name.join(" ").to_string(), - }); + let mut document_content = String::new(); + for context_range in &context_match.context_ranges { + document_content.push_str(&content[context_range.clone()]); + document_content.push_str("\n"); + } + + let mut offset = context_match.item_range.start; + for collapsed_range in &collapsed_ranges_within { + if collapsed_range.start > offset { + add_content_from_range( + &mut document_content, + content, + offset..collapsed_range.start, + context_match.start_col, + ); } + document_content.push_str(placeholder); + offset = collapsed_range.end; + } + + if offset < context_match.item_range.end { + add_content_from_range( + &mut document_content, + content, + offset..context_match.item_range.end, + context_match.start_col, + ); + } + + if let Some(name) = content.get(context_match.name_range.clone()) { + parsed_name_ranges.insert(context_match.name_range.clone()); + documents.push(Document { + name: name.to_string(), + content: document_content, + range: context_match.item_range.clone(), + embedding: vec![], + }) } } return Ok(documents); } } + +pub(crate) fn subtract_ranges( + ranges: &[Range], + ranges_to_subtract: &[Range], +) -> Vec> { + let mut result = Vec::new(); + + let mut 
ranges_to_subtract = ranges_to_subtract.iter().peekable(); + + for range in ranges { + let mut offset = range.start; + + while offset < range.end { + if let Some(range_to_subtract) = ranges_to_subtract.peek() { + if offset < range_to_subtract.start { + let next_offset = cmp::min(range_to_subtract.start, range.end); + result.push(offset..next_offset); + offset = next_offset; + } else { + let next_offset = cmp::min(range_to_subtract.end, range.end); + offset = next_offset; + } + + if offset >= range_to_subtract.end { + ranges_to_subtract.next(); + } + } else { + result.push(offset..range.end); + offset = range.end; + } + } + } + + result +} + +fn add_content_from_range( + output: &mut String, + content: &str, + range: Range, + start_col: usize, +) { + for mut line in content.get(range.clone()).unwrap_or("").lines() { + for _ in 0..start_col { + if line.starts_with(' ') { + line = &line[1..]; + } else { + break; + } + } + output.push_str(line); + output.push('\n'); + } + output.pop(); +} diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 44ce45f457004c7167f8c61501c2b03ca239d199..271fd741a643d1e04d5afe57a50b70b6e391cbf7 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -409,7 +409,11 @@ impl SemanticIndex { ) { if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() { if let Some(documents) = retriever - .parse_file(&pending_file.relative_path, &content, pending_file.language) + .parse_file_with_template( + &pending_file.relative_path, + &content, + pending_file.language, + ) .log_err() { log::trace!( @@ -657,6 +661,8 @@ impl SemanticIndex { }) .await?; + dbg!(&documents); + let mut tasks = Vec::new(); let mut ranges = Vec::new(); let weak_project = project.downgrade(); diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 
63b28798ad91d67d6786b4b420900135050dfe5b..c54d5079d37f3b8ad5ce4dbb788f5eb5f68b02c8 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -1,7 +1,7 @@ use crate::{ db::dot, embedding::EmbeddingProvider, - parsing::{CodeContextRetriever, Document}, + parsing::{subtract_ranges, CodeContextRetriever, Document}, semantic_index_settings::SemanticIndexSettings, SemanticIndex, }; @@ -9,6 +9,7 @@ use anyhow::Result; use async_trait::async_trait; use gpui::{Task, TestAppContext}; use language::{Language, LanguageConfig, LanguageRegistry, ToOffset}; +use pretty_assertions::assert_eq; use project::{project_settings::ProjectSettings, FakeFs, Fs, Project}; use rand::{rngs::StdRng, Rng}; use serde_json::json; @@ -104,7 +105,7 @@ async fn test_semantic_index(cx: &mut TestAppContext) { assert_eq!(search_results[0].range.start.to_offset(buffer), 0); assert_eq!( buffer.file().unwrap().path().as_ref(), - Path::new("file1.rs") + Path::new("src/file1.rs") ); }); @@ -147,503 +148,548 @@ async fn test_code_context_retrieval_rust() { let text = " /// A doc comment /// that spans multiple lines + #[gpui::test] fn a() { b } impl C for D { } + + impl E { + // This is also a preceding comment + pub fn function_1() -> Option<()> { + todo!(); + } + + // This is a preceding comment + fn function_2() -> Result<()> { + todo!(); + } + } " .unindent(); - let parsed_files = retriever + let documents = retriever .parse_file(Path::new("foo.rs"), &text, language) .unwrap(); - assert_eq!( - parsed_files, + assert_documents_eq( + &documents, &[ - Document { - name: "a".into(), - range: text.find("fn a").unwrap()..(text.find("}").unwrap() + 1), - content: " - The below code snippet is from file 'foo.rs' - - ```rust - /// A doc comment - /// that spans multiple lines - fn a() { - b - } - ```" + ( + " + /// A doc comment + /// that spans multiple lines + #[gpui::test] + fn a() { + b + }" .unindent(), - embedding: vec![], - }, - Document { - 
name: "C for D".into(), - range: text.find("impl C").unwrap()..(text.rfind("}").unwrap() + 1), - content: " - The below code snippet is from file 'foo.rs' - - ```rust - impl C for D { - } - ```" + text.find("fn a").unwrap(), + ), + ( + " + impl C for D { + }" .unindent(), - embedding: vec![], - } - ] + text.find("impl C").unwrap(), + ), + ( + " + impl E { + // This is also a preceding comment + pub fn function_1() -> Option<()> { /* ... */ } + + // This is a preceding comment + fn function_2() -> Result<()> { /* ... */ } + }" + .unindent(), + text.find("impl E").unwrap(), + ), + ( + " + // This is also a preceding comment + pub fn function_1() -> Option<()> { + todo!(); + }" + .unindent(), + text.find("pub fn function_1").unwrap(), + ), + ( + " + // This is a preceding comment + fn function_2() -> Result<()> { + todo!(); + }" + .unindent(), + text.find("fn function_2").unwrap(), + ), + ], ); } -#[gpui::test] -async fn test_code_context_retrieval_javascript() { - let language = js_lang(); - let mut retriever = CodeContextRetriever::new(); - - let text = " - /* globals importScripts, backend */ - function _authorize() {} - - /** - * Sometimes the frontend build is way faster than backend. - */ - export async function authorizeBank() { - _authorize(pushModal, upgradingAccountId, {}); - } - - export class SettingsPage { - /* This is a test setting */ - constructor(page) { - this.page = page; - } - } - - /* This is a test comment */ - class TestClass {} - - /* Schema for editor_events in Clickhouse. 
*/ - export interface ClickhouseEditorEvent { - installation_id: string - operation: string - } - " - .unindent(); - - let parsed_files = retriever - .parse_file(Path::new("foo.js"), &text, language) - .unwrap(); - - let test_documents = &[ - Document { - name: "function _authorize".into(), - range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1), - content: " - The below code snippet is from file 'foo.js' - - ```javascript - /* globals importScripts, backend */ - function _authorize() {} - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "async function authorizeBank".into(), - range: text.find("export async").unwrap()..223, - content: " - The below code snippet is from file 'foo.js' - - ```javascript - /** - * Sometimes the frontend build is way faster than backend. - */ - export async function authorizeBank() { - _authorize(pushModal, upgradingAccountId, {}); - } - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "class SettingsPage".into(), - range: 225..343, - content: " - The below code snippet is from file 'foo.js' - - ```javascript - export class SettingsPage { - /* This is a test setting */ - constructor(page) { - this.page = page; - } - } - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "constructor".into(), - range: 290..341, - content: " - The below code snippet is from file 'foo.js' - - ```javascript - /* This is a test setting */ - constructor(page) { - this.page = page; - } - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "class TestClass".into(), - range: 374..392, - content: " - The below code snippet is from file 'foo.js' - - ```javascript - /* This is a test comment */ - class TestClass {} - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "interface ClickhouseEditorEvent".into(), - range: 440..532, - content: " - The below code snippet is from file 'foo.js' - - ```javascript - /* Schema for editor_events in Clickhouse. 
*/ - export interface ClickhouseEditorEvent { - installation_id: string - operation: string - } - ```" - .unindent(), - embedding: vec![], - }, - ]; - - for idx in 0..test_documents.len() { - assert_eq!(test_documents[idx], parsed_files[idx]); - } -} - -#[gpui::test] -async fn test_code_context_retrieval_elixir() { - let language = elixir_lang(); - let mut retriever = CodeContextRetriever::new(); - - let text = r#" -defmodule File.Stream do - @moduledoc """ - Defines a `File.Stream` struct returned by `File.stream!/3`. - - The following fields are public: - - * `path` - the file path - * `modes` - the file modes - * `raw` - a boolean indicating if bin functions should be used - * `line_or_bytes` - if reading should read lines or a given number of bytes - * `node` - the node the file belongs to - - """ - - defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil - - @type t :: %__MODULE__{} - - @doc false - def __build__(path, modes, line_or_bytes) do - raw = :lists.keyfind(:encoding, 1, modes) == false - - modes = - case raw do - true -> - case :lists.keyfind(:read_ahead, 1, modes) do - {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] - {:read_ahead, _} -> [:raw | modes] - false -> [:raw, :read_ahead | modes] - end - - false -> - modes - end - - %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - - end -"# - .unindent(); - - let parsed_files = retriever - .parse_file(Path::new("foo.ex"), &text, language) - .unwrap(); - - let test_documents = &[ - Document{ - name: "defmodule File.Stream".into(), - range: 0..1132, - content: r#" - The below code snippet is from file 'foo.ex' - - ```elixir - defmodule File.Stream do - @moduledoc """ - Defines a `File.Stream` struct returned by `File.stream!/3`. 
- - The following fields are public: - - * `path` - the file path - * `modes` - the file modes - * `raw` - a boolean indicating if bin functions should be used - * `line_or_bytes` - if reading should read lines or a given number of bytes - * `node` - the node the file belongs to - - """ - - defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil - - @type t :: %__MODULE__{} - - @doc false - def __build__(path, modes, line_or_bytes) do - raw = :lists.keyfind(:encoding, 1, modes) == false - - modes = - case raw do - true -> - case :lists.keyfind(:read_ahead, 1, modes) do - {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] - {:read_ahead, _} -> [:raw | modes] - false -> [:raw, :read_ahead | modes] - end - - false -> - modes - end - - %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - - end - ```"#.unindent(), - embedding: vec![], - }, - Document { - name: "def __build__".into(), - range: 574..1132, - content: r#" -The below code snippet is from file 'foo.ex' - -```elixir -@doc false -def __build__(path, modes, line_or_bytes) do - raw = :lists.keyfind(:encoding, 1, modes) == false - - modes = - case raw do - true -> - case :lists.keyfind(:read_ahead, 1, modes) do - {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] - {:read_ahead, _} -> [:raw | modes] - false -> [:raw, :read_ahead | modes] - end - - false -> - modes - end - - %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - - end -```"# - .unindent(), - embedding: vec![], - }]; - - for idx in 0..test_documents.len() { - assert_eq!(test_documents[idx], parsed_files[idx]); - } +fn assert_documents_eq( + documents: &[Document], + expected_contents_and_start_offsets: &[(String, usize)], +) { + assert_eq!( + documents + .iter() + .map(|document| (document.content.clone(), document.range.start)) + .collect::>(), + expected_contents_and_start_offsets + ); } -#[gpui::test] 
-async fn test_code_context_retrieval_cpp() { - let language = cpp_lang(); - let mut retriever = CodeContextRetriever::new(); - - let text = " - /** - * @brief Main function - * @returns 0 on exit - */ - int main() { return 0; } - - /** - * This is a test comment - */ - class MyClass { // The class - public: // Access specifier - int myNum; // Attribute (int variable) - string myString; // Attribute (string variable) - }; - - // This is a test comment - enum Color { red, green, blue }; - - /** This is a preceeding block comment - * This is the second line - */ - struct { // Structure declaration - int myNum; // Member (int variable) - string myString; // Member (string variable) - } myStructure; - - /** - * @brief Matrix class. - */ - template ::value || std::is_floating_point::value, - bool>::type> - class Matrix2 { - std::vector> _mat; - - public: - /** - * @brief Constructor - * @tparam Integer ensuring integers are being evaluated and not other - * data types. - * @param size denoting the size of Matrix as size x size - */ - template ::value, - Integer>::type> - explicit Matrix(const Integer size) { - for (size_t i = 0; i < size; ++i) { - _mat.emplace_back(std::vector(size, 0)); - } - } - }" - .unindent(); - - let parsed_files = retriever - .parse_file(Path::new("foo.cpp"), &text, language) - .unwrap(); - - let test_documents = &[ - Document { - name: "int main".into(), - range: 54..78, - content: " - The below code snippet is from file 'foo.cpp' - - ```cpp - /** - * @brief Main function - * @returns 0 on exit - */ - int main() { return 0; } - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "class MyClass".into(), - range: 112..295, - content: " - The below code snippet is from file 'foo.cpp' - - ```cpp - /** - * This is a test comment - */ - class MyClass { // The class - public: // Access specifier - int myNum; // Attribute (int variable) - string myString; // Attribute (string variable) - } - ```" - .unindent(), - embedding: vec![], - }, - 
Document { - name: "enum Color".into(), - range: 324..355, - content: " - The below code snippet is from file 'foo.cpp' - - ```cpp - // This is a test comment - enum Color { red, green, blue } - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "struct myStructure".into(), - range: 428..581, - content: " - The below code snippet is from file 'foo.cpp' - - ```cpp - /** This is a preceeding block comment - * This is the second line - */ - struct { // Structure declaration - int myNum; // Member (int variable) - string myString; // Member (string variable) - } myStructure; - ```" - .unindent(), - embedding: vec![], - }, - Document { - name: "class Matrix2".into(), - range: 613..1342, - content: " - The below code snippet is from file 'foo.cpp' - - ```cpp - /** - * @brief Matrix class. - */ - template ::value || std::is_floating_point::value, - bool>::type> - class Matrix2 { - std::vector> _mat; - - public: - /** - * @brief Constructor - * @tparam Integer ensuring integers are being evaluated and not other - * data types. - * @param size denoting the size of Matrix as size x size - */ - template ::value, - Integer>::type> - explicit Matrix(const Integer size) { - for (size_t i = 0; i < size; ++i) { - _mat.emplace_back(std::vector(size, 0)); - } - } - } - ```" - .unindent(), - embedding: vec![], - }, - ]; - - for idx in 0..test_documents.len() { - assert_eq!(test_documents[idx], parsed_files[idx]); - } -} +// #[gpui::test] +// async fn test_code_context_retrieval_javascript() { +// let language = js_lang(); +// let mut retriever = CodeContextRetriever::new(); + +// let text = " +// /* globals importScripts, backend */ +// function _authorize() {} + +// /** +// * Sometimes the frontend build is way faster than backend. 
+// */ +// export async function authorizeBank() { +// _authorize(pushModal, upgradingAccountId, {}); +// } + +// export class SettingsPage { +// /* This is a test setting */ +// constructor(page) { +// this.page = page; +// } +// } + +// /* This is a test comment */ +// class TestClass {} + +// /* Schema for editor_events in Clickhouse. */ +// export interface ClickhouseEditorEvent { +// installation_id: string +// operation: string +// } +// " +// .unindent(); + +// let parsed_files = retriever +// .parse_file(Path::new("foo.js"), &text, language) +// .unwrap(); + +// let test_documents = &[ +// Document { +// name: "function _authorize".into(), +// range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1), +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// /* globals importScripts, backend */ +// function _authorize() {} +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "async function authorizeBank".into(), +// range: text.find("export async").unwrap()..223, +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// /** +// * Sometimes the frontend build is way faster than backend. 
+// */ +// export async function authorizeBank() { +// _authorize(pushModal, upgradingAccountId, {}); +// } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "class SettingsPage".into(), +// range: 225..343, +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// export class SettingsPage { +// /* This is a test setting */ +// constructor(page) { +// this.page = page; +// } +// } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "constructor".into(), +// range: 290..341, +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// /* This is a test setting */ +// constructor(page) { +// this.page = page; +// } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "class TestClass".into(), +// range: 374..392, +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// /* This is a test comment */ +// class TestClass {} +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "interface ClickhouseEditorEvent".into(), +// range: 440..532, +// content: " +// The below code snippet is from file 'foo.js' + +// ```javascript +// /* Schema for editor_events in Clickhouse. */ +// export interface ClickhouseEditorEvent { +// installation_id: string +// operation: string +// } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// ]; + +// for idx in 0..test_documents.len() { +// assert_eq!(test_documents[idx], parsed_files[idx]); +// } +// } + +// #[gpui::test] +// async fn test_code_context_retrieval_elixir() { +// let language = elixir_lang(); +// let mut retriever = CodeContextRetriever::new(); + +// let text = r#" +// defmodule File.Stream do +// @moduledoc """ +// Defines a `File.Stream` struct returned by `File.stream!/3`. 
+ +// The following fields are public: + +// * `path` - the file path +// * `modes` - the file modes +// * `raw` - a boolean indicating if bin functions should be used +// * `line_or_bytes` - if reading should read lines or a given number of bytes +// * `node` - the node the file belongs to + +// """ + +// defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + +// @type t :: %__MODULE__{} + +// @doc false +// def __build__(path, modes, line_or_bytes) do +// raw = :lists.keyfind(:encoding, 1, modes) == false + +// modes = +// case raw do +// true -> +// case :lists.keyfind(:read_ahead, 1, modes) do +// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] +// {:read_ahead, _} -> [:raw | modes] +// false -> [:raw, :read_ahead | modes] +// end + +// false -> +// modes +// end + +// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + +// end +// "# +// .unindent(); + +// let parsed_files = retriever +// .parse_file(Path::new("foo.ex"), &text, language) +// .unwrap(); + +// let test_documents = &[ +// Document{ +// name: "defmodule File.Stream".into(), +// range: 0..1132, +// content: r#" +// The below code snippet is from file 'foo.ex' + +// ```elixir +// defmodule File.Stream do +// @moduledoc """ +// Defines a `File.Stream` struct returned by `File.stream!/3`. 
+ +// The following fields are public: + +// * `path` - the file path +// * `modes` - the file modes +// * `raw` - a boolean indicating if bin functions should be used +// * `line_or_bytes` - if reading should read lines or a given number of bytes +// * `node` - the node the file belongs to + +// """ + +// defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + +// @type t :: %__MODULE__{} + +// @doc false +// def __build__(path, modes, line_or_bytes) do +// raw = :lists.keyfind(:encoding, 1, modes) == false + +// modes = +// case raw do +// true -> +// case :lists.keyfind(:read_ahead, 1, modes) do +// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] +// {:read_ahead, _} -> [:raw | modes] +// false -> [:raw, :read_ahead | modes] +// end + +// false -> +// modes +// end + +// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + +// end +// ```"#.unindent(), +// embedding: vec![], +// }, +// Document { +// name: "def __build__".into(), +// range: 574..1132, +// content: r#" +// The below code snippet is from file 'foo.ex' + +// ```elixir +// @doc false +// def __build__(path, modes, line_or_bytes) do +// raw = :lists.keyfind(:encoding, 1, modes) == false + +// modes = +// case raw do +// true -> +// case :lists.keyfind(:read_ahead, 1, modes) do +// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] +// {:read_ahead, _} -> [:raw | modes] +// false -> [:raw, :read_ahead | modes] +// end + +// false -> +// modes +// end + +// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + +// end +// ```"# +// .unindent(), +// embedding: vec![], +// }]; + +// for idx in 0..test_documents.len() { +// assert_eq!(test_documents[idx], parsed_files[idx]); +// } +// } + +// #[gpui::test] +// async fn test_code_context_retrieval_cpp() { +// let language = cpp_lang(); +// let mut retriever = CodeContextRetriever::new(); + +// let text = " 
+// /** +// * @brief Main function +// * @returns 0 on exit +// */ +// int main() { return 0; } + +// /** +// * This is a test comment +// */ +// class MyClass { // The class +// public: // Access specifier +// int myNum; // Attribute (int variable) +// string myString; // Attribute (string variable) +// }; + +// // This is a test comment +// enum Color { red, green, blue }; + +// /** This is a preceding block comment +// * This is the second line +// */ +// struct { // Structure declaration +// int myNum; // Member (int variable) +// string myString; // Member (string variable) +// } myStructure; + +// /** +// * @brief Matrix class. +// */ +// template ::value || std::is_floating_point::value, +// bool>::type> +// class Matrix2 { +// std::vector> _mat; + +// public: +// /** +// * @brief Constructor +// * @tparam Integer ensuring integers are being evaluated and not other +// * data types. +// * @param size denoting the size of Matrix as size x size +// */ +// template ::value, +// Integer>::type> +// explicit Matrix(const Integer size) { +// for (size_t i = 0; i < size; ++i) { +// _mat.emplace_back(std::vector(size, 0)); +// } +// } +// }" +// .unindent(); + +// let parsed_files = retriever +// .parse_file(Path::new("foo.cpp"), &text, language) +// .unwrap(); + +// let test_documents = &[ +// Document { +// name: "int main".into(), +// range: 54..78, +// content: " +// The below code snippet is from file 'foo.cpp' + +// ```cpp +// /** +// * @brief Main function +// * @returns 0 on exit +// */ +// int main() { return 0; } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "class MyClass".into(), +// range: 112..295, +// content: " +// The below code snippet is from file 'foo.cpp' + +// ```cpp +// /** +// * This is a test comment +// */ +// class MyClass { // The class +// public: // Access specifier +// int myNum; // Attribute (int variable) +// string myString; // Attribute (string variable) +// } +// ```" +// .unindent(), +// 
embedding: vec![], +// }, +// Document { +// name: "enum Color".into(), +// range: 324..355, +// content: " +// The below code snippet is from file 'foo.cpp' + +// ```cpp +// // This is a test comment +// enum Color { red, green, blue } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "struct myStructure".into(), +// range: 428..581, +// content: " +// The below code snippet is from file 'foo.cpp' + +// ```cpp +// /** This is a preceding block comment +// * This is the second line +// */ +// struct { // Structure declaration +// int myNum; // Member (int variable) +// string myString; // Member (string variable) +// } myStructure; +// ```" +// .unindent(), +// embedding: vec![], +// }, +// Document { +// name: "class Matrix2".into(), +// range: 613..1342, +// content: " +// The below code snippet is from file 'foo.cpp' + +// ```cpp +// /** +// * @brief Matrix class. +// */ +// template ::value || std::is_floating_point::value, +// bool>::type> +// class Matrix2 { +// std::vector> _mat; + +// public: +// /** +// * @brief Constructor +// * @tparam Integer ensuring integers are being evaluated and not other +// * data types. +// * @param size denoting the size of Matrix as size x size +// */ +// template ::value, +// Integer>::type> +// explicit Matrix(const Integer size) { +// for (size_t i = 0; i < size; ++i) { +// _mat.emplace_back(std::vector(size, 0)); +// } +// } +// } +// ```" +// .unindent(), +// embedding: vec![], +// }, +// ]; + +// for idx in 0..test_documents.len() { +// assert_eq!(test_documents[idx], parsed_files[idx]); +// } +// } #[gpui::test] fn test_dot_product(mut rng: StdRng) { @@ -826,6 +872,7 @@ fn rust_lang() -> Arc { LanguageConfig { name: "Rust".into(), path_suffixes: vec!["rs".into()], + collapsed_placeholder: " /* ... */ ".to_string(), ..Default::default() }, Some(tree_sitter_rust::language()), @@ -833,54 +880,32 @@ fn rust_lang() -> Arc { .with_embedding_query( r#" ( - (line_comment)* @context - . 
- (enum_item - name: (_) @name) @item - ) - - ( - (line_comment)* @context + [(line_comment) (attribute_item)]* @context . - (struct_item - name: (_) @name) @item - ) + [ + (struct_item + name: (_) @name) - ( - (line_comment)* @context - . - (impl_item - trait: (_)? @name - "for"? @name - type: (_) @name) @item - ) + (enum_item + name: (_) @name) - ( - (line_comment)* @context - . - (trait_item - name: (_) @name) @item - ) + (impl_item + trait: (_)? @name + "for"? @name + type: (_) @name) - ( - (line_comment)* @context - . - (function_item - name: (_) @name) @item - ) + (trait_item + name: (_) @name) - ( - (line_comment)* @context - . - (macro_definition - name: (_) @name) @item - ) + (function_item + name: (_) @name + body: (block + "{" @keep + "}" @keep) @collapse) - ( - (line_comment)* @context - . - (function_signature_item - name: (_) @name) @item + (macro_definition + name: (_) @name) + ] @item ) "#, ) @@ -1023,3 +1048,15 @@ fn elixir_lang() -> Arc { .unwrap(), ) } + +#[gpui::test] +fn test_subtract_ranges() { + // collapsed_ranges: Vec>, keep_ranges: Vec> + + assert_eq!( + subtract_ranges(&[0..5, 10..21], &[0..1, 4..5]), + vec![1..4, 10..21] + ); + + assert_eq!(subtract_ranges(&[0..5], &[1..2]), &[0..1, 2..5]); +} diff --git a/crates/zed/src/languages/rust/config.toml b/crates/zed/src/languages/rust/config.toml index 705287f0a758045ce8179bfc8a6bf18e564970b8..8216ba0a74a90a16f2e29be77021f56530649c52 100644 --- a/crates/zed/src/languages/rust/config.toml +++ b/crates/zed/src/languages/rust/config.toml @@ -10,3 +10,4 @@ brackets = [ { start = "\"", end = "\"", close = true, newline = false, not_in = ["string"] }, { start = "/*", end = " */", close = true, newline = false, not_in = ["string", "comment"] }, ] +collapsed_placeholder = " /* ... 
*/ " diff --git a/crates/zed/src/languages/rust/embedding.scm b/crates/zed/src/languages/rust/embedding.scm index 66e4083de5f0fe8b1adfa2ea657668e4453e4b61..e4218382a9b1ceb7e087b0d9247d5a4e66b77236 100644 --- a/crates/zed/src/languages/rust/embedding.scm +++ b/crates/zed/src/languages/rust/embedding.scm @@ -1,50 +1,28 @@ ( - (line_comment)* @context + [(line_comment) (attribute_item)]* @context . - (enum_item - name: (_) @name) @item -) + [ + (struct_item + name: (_) @name) -( - (line_comment)* @context - . - (struct_item - name: (_) @name) @item -) + (enum_item + name: (_) @name) -( - (line_comment)* @context - . - (impl_item - trait: (_)? @name - "for"? @name - type: (_) @name) @item -) + (impl_item + trait: (_)? @name + "for"? @name + type: (_) @name) -( - (line_comment)* @context - . - (trait_item - name: (_) @name) @item -) + (trait_item + name: (_) @name) -( - (line_comment)* @context - . - (function_item - name: (_) @name) @item -) - -( - (line_comment)* @context - . - (macro_definition - name: (_) @name) @item -) + (function_item + name: (_) @name + body: (block + "{" @keep + "}" @keep) @collapse) -( - (line_comment)* @context - . 
- (function_signature_item - name: (_) @name) @item -) + (macro_definition + name: (_) @name) + ] @item + ) From efe973ebe2f6c4c92159542eb4d8e1bc12455df4 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 19 Jul 2023 16:52:44 -0400 Subject: [PATCH 24/34] add embedding query for json with nested arrays and strings Co-authored-by: maxbrunsfeld --- Cargo.lock | 1 + crates/language/src/language.rs | 4 +- crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/parsing.rs | 123 ++++++++++-------- crates/semantic_index/src/semantic_index.rs | 2 +- .../src/semantic_index_tests.rs | 103 ++++++++++++++- crates/zed/src/languages/json/embedding.scm | 14 ++ 7 files changed, 189 insertions(+), 59 deletions(-) create mode 100644 crates/zed/src/languages/json/embedding.scm diff --git a/Cargo.lock b/Cargo.lock index 8ea6f61da04f215b91b31941ccce795be778a204..75f66163e3fbf5048b01cbf5079f00f2e9c5ce46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6502,6 +6502,7 @@ dependencies = [ "tree-sitter", "tree-sitter-cpp", "tree-sitter-elixir 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tree-sitter-json 0.19.0", "tree-sitter-rust", "tree-sitter-toml 0.20.0", "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index ec233716d6ce5345515600b14e00a212b3dcb3a5..e34358c7c5def79e8141e86e54693bcc99188da0 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -526,7 +526,7 @@ pub struct OutlineConfig { pub struct EmbeddingConfig { pub query: Query, pub item_capture_ix: u32, - pub name_capture_ix: u32, + pub name_capture_ix: Option, pub context_capture_ix: Option, pub collapse_capture_ix: Option, pub keep_capture_ix: Option, @@ -1263,7 +1263,7 @@ impl Language { ("collapse", &mut collapse_capture_ix), ], ); - if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) { + if let Some(item_capture_ix) = 
item_capture_ix { grammar.embedding_config = Some(EmbeddingConfig { query, item_capture_ix, diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 1b3169bfe41940eef5c863901c560e106acf4816..35b97245124e8922d5e7a46a369e26c71af7731a 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -54,6 +54,7 @@ ctor.workspace = true env_logger.workspace = true tree-sitter-typescript = "*" +tree-sitter-json = "*" tree-sitter-rust = "*" tree-sitter-toml = "*" tree-sitter-cpp = "*" diff --git a/crates/semantic_index/src/parsing.rs b/crates/semantic_index/src/parsing.rs index 0d2aeb60fb24dcb347c3a0f870ef6e348e08a88a..c952ef3a4edf939ddc8ad17c9bab6e17ec5e0cce 100644 --- a/crates/semantic_index/src/parsing.rs +++ b/crates/semantic_index/src/parsing.rs @@ -1,6 +1,12 @@ use anyhow::{anyhow, Ok, Result}; use language::{Grammar, Language}; -use std::{cmp, collections::HashSet, ops::Range, path::Path, sync::Arc}; +use std::{ + cmp::{self, Reverse}, + collections::HashSet, + ops::Range, + path::Path, + sync::Arc, +}; use tree_sitter::{Parser, QueryCursor}; #[derive(Debug, PartialEq, Clone)] @@ -15,7 +21,7 @@ const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; const ENTIRE_FILE_TEMPLATE: &str = "The below snippet is from file ''\n\n```\n\n```"; -pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 4] = ["TOML", "YAML", "JSON", "CSS"]; +pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &["TOML", "YAML", "CSS"]; pub struct CodeContextRetriever { pub parser: Parser, @@ -30,8 +36,8 @@ pub struct CodeContextRetriever { #[derive(Debug, Clone)] pub struct CodeContextMatch { pub start_col: usize, - pub item_range: Range, - pub name_range: Range, + pub item_range: Option>, + pub name_range: Option>, pub context_ranges: Vec>, pub collapse_ranges: Vec>, } @@ -44,7 +50,7 @@ impl CodeContextRetriever { } } - fn _parse_entire_file( + fn parse_entire_file( &self, relative_path: &Path, language_name: Arc, @@ 
-97,7 +103,7 @@ impl CodeContextRetriever { if capture.index == embedding_config.item_capture_ix { item_range = Some(capture.node.byte_range()); start_col = capture.node.start_position().column; - } else if capture.index == embedding_config.name_capture_ix { + } else if Some(capture.index) == embedding_config.name_capture_ix { name_range = Some(capture.node.byte_range()); } else if Some(capture.index) == embedding_config.context_capture_ix { context_ranges.push(capture.node.byte_range()); @@ -108,16 +114,13 @@ impl CodeContextRetriever { } } - if item_range.is_some() && name_range.is_some() { - let item_range = item_range.unwrap(); - captures.push(CodeContextMatch { - start_col, - item_range, - name_range: name_range.unwrap(), - context_ranges, - collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges), - }); - } + captures.push(CodeContextMatch { + start_col, + item_range, + name_range, + context_ranges, + collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges), + }); } Ok(captures) } @@ -129,7 +132,12 @@ impl CodeContextRetriever { language: Arc, ) -> Result> { let language_name = language.name(); - let mut documents = self.parse_file(relative_path, content, language)?; + + if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) { + return self.parse_entire_file(relative_path, language_name, &content); + } + + let mut documents = self.parse_file(content, language)?; for document in &mut documents { document.content = CODE_CONTEXT_TEMPLATE .replace("", relative_path.to_string_lossy().as_ref()) @@ -139,16 +147,7 @@ impl CodeContextRetriever { Ok(documents) } - pub fn parse_file( - &mut self, - relative_path: &Path, - content: &str, - language: Arc, - ) -> Result> { - if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) { - return self._parse_entire_file(relative_path, language.name(), &content); - } - + pub fn parse_file(&mut self, content: &str, language: Arc) -> Result> { let grammar = language .grammar() .ok_or_else(|| 
anyhow!("no grammar for language"))?; @@ -163,32 +162,49 @@ impl CodeContextRetriever { let mut collapsed_ranges_within = Vec::new(); let mut parsed_name_ranges = HashSet::new(); for (i, context_match) in matches.iter().enumerate() { - if parsed_name_ranges.contains(&context_match.name_range) { + // Items which are collapsible but not embeddable have no item range + let item_range = if let Some(item_range) = context_match.item_range.clone() { + item_range + } else { continue; + }; + + // Checks for deduplication + let name; + if let Some(name_range) = context_match.name_range.clone() { + name = content + .get(name_range.clone()) + .map_or(String::new(), |s| s.to_string()); + if parsed_name_ranges.contains(&name_range) { + continue; + } + parsed_name_ranges.insert(name_range); + } else { + name = String::new(); } collapsed_ranges_within.clear(); - for remaining_match in &matches[(i + 1)..] { - if context_match - .item_range - .contains(&remaining_match.item_range.start) - && context_match - .item_range - .contains(&remaining_match.item_range.end) - { - collapsed_ranges_within.extend(remaining_match.collapse_ranges.iter().cloned()); - } else { - break; + 'outer: for remaining_match in &matches[(i + 1)..] 
{ + for collapsed_range in &remaining_match.collapse_ranges { + if item_range.start <= collapsed_range.start + && item_range.end >= collapsed_range.end + { + collapsed_ranges_within.push(collapsed_range.clone()); + } else { + break 'outer; + } } } + collapsed_ranges_within.sort_by_key(|r| (r.start, Reverse(r.end))); + let mut document_content = String::new(); for context_range in &context_match.context_ranges { document_content.push_str(&content[context_range.clone()]); document_content.push_str("\n"); } - let mut offset = context_match.item_range.start; + let mut offset = item_range.start; for collapsed_range in &collapsed_ranges_within { if collapsed_range.start > offset { add_content_from_range( @@ -197,29 +213,30 @@ impl CodeContextRetriever { offset..collapsed_range.start, context_match.start_col, ); + offset = collapsed_range.start; + } + + if collapsed_range.end > offset { + document_content.push_str(placeholder); + offset = collapsed_range.end; } - document_content.push_str(placeholder); - offset = collapsed_range.end; } - if offset < context_match.item_range.end { + if offset < item_range.end { add_content_from_range( &mut document_content, content, - offset..context_match.item_range.end, + offset..item_range.end, context_match.start_col, ); } - if let Some(name) = content.get(context_match.name_range.clone()) { - parsed_name_ranges.insert(context_match.name_range.clone()); - documents.push(Document { - name: name.to_string(), - content: document_content, - range: context_match.item_range.clone(), - embedding: vec![], - }) - } + documents.push(Document { + name, + content: document_content, + range: item_range.clone(), + embedding: vec![], + }) } return Ok(documents); diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 271fd741a643d1e04d5afe57a50b70b6e391cbf7..6e0477491518a0c4a18ebfa1c24ddaf51eaf1948 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ 
b/crates/semantic_index/src/semantic_index.rs @@ -33,7 +33,7 @@ use util::{ ResultExt, }; -const SEMANTIC_INDEX_VERSION: usize = 4; +const SEMANTIC_INDEX_VERSION: usize = 5; const EMBEDDINGS_BATCH_SIZE: usize = 80; pub fn init( diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index c54d5079d37f3b8ad5ce4dbb788f5eb5f68b02c8..31c96ca207bb3da1ace202bd81df461f27ba229b 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -170,9 +170,7 @@ async fn test_code_context_retrieval_rust() { " .unindent(); - let documents = retriever - .parse_file(Path::new("foo.rs"), &text, language) - .unwrap(); + let documents = retriever.parse_file(&text, language).unwrap(); assert_documents_eq( &documents, @@ -229,6 +227,76 @@ async fn test_code_context_retrieval_rust() { ); } +#[gpui::test] +async fn test_code_context_retrieval_json() { + let language = json_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = r#" + { + "array": [1, 2, 3, 4], + "string": "abcdefg", + "nested_object": { + "array_2": [5, 6, 7, 8], + "string_2": "hijklmnop", + "boolean": true, + "none": null + } + } + "# + .unindent(); + + let documents = retriever.parse_file(&text, language.clone()).unwrap(); + + assert_documents_eq( + &documents, + &[( + r#" + { + "array": [], + "string": "", + "nested_object": { + "array_2": [], + "string_2": "", + "boolean": true, + "none": null + } + }"# + .unindent(), + text.find("{").unwrap(), + )], + ); + + let text = r#" + [ + { + "name": "somebody", + "age": 42 + }, + { + "name": "somebody else", + "age": 43 + } + ] + "# + .unindent(); + + let documents = retriever.parse_file(&text, language.clone()).unwrap(); + + assert_documents_eq( + &documents, + &[( + r#" + [{ + "name": "", + "age": 42 + }]"# + .unindent(), + text.find("[").unwrap(), + )], + ); +} + fn assert_documents_eq( documents: &[Document], 
expected_contents_and_start_offsets: &[(String, usize)], @@ -913,6 +981,35 @@ fn rust_lang() -> Arc { ) } +fn json_lang() -> Arc { + Arc::new( + Language::new( + LanguageConfig { + name: "JSON".into(), + path_suffixes: vec!["json".into()], + ..Default::default() + }, + Some(tree_sitter_json::language()), + ) + .with_embedding_query( + r#" + (document) @item + + (array + "[" @keep + . + (object)? @keep + "]" @keep) @collapse + + (pair value: (string + "\"" @keep + "\"" @keep) @collapse) + "#, + ) + .unwrap(), + ) +} + fn toml_lang() -> Arc { Arc::new(Language::new( LanguageConfig { diff --git a/crates/zed/src/languages/json/embedding.scm b/crates/zed/src/languages/json/embedding.scm new file mode 100644 index 0000000000000000000000000000000000000000..fa286e3880aa67d49f710f991d6839ebbd306104 --- /dev/null +++ b/crates/zed/src/languages/json/embedding.scm @@ -0,0 +1,14 @@ +; Only produce one embedding for the entire file. +(document) @item + +; Collapse arrays, except for the first object. +(array + "[" @keep + . + (object)? @keep + "]" @keep) @collapse + +; Collapse string values (but not keys). 
+(pair value: (string + "\"" @keep + "\"" @keep) @collapse) From e02d6bc0d41fe5006307833f5e4c2cd62ba7add1 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 20 Jul 2023 13:46:27 -0400 Subject: [PATCH 25/34] add glob filtering functionality to semantic search --- Cargo.lock | 1 + crates/search/src/project_search.rs | 60 +++++++++++++++++-- crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/db.rs | 39 ++++++++---- crates/semantic_index/src/semantic_index.rs | 13 +++- .../src/semantic_index_tests.rs | 57 +++++++++++++++++- 6 files changed, 149 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75f66163e3fbf5048b01cbf5079f00f2e9c5ce46..f534a4fe7d68a362fd910f0bd02cbf72b24955fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6477,6 +6477,7 @@ dependencies = [ "editor", "env_logger 0.9.3", "futures 0.3.28", + "globset", "gpui", "isahc", "language", diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 5feb94426eb60c67a756c564982a826699bd20a1..25fc897707af6be8b97b277a2d65b8d4cf1eeb17 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -187,14 +187,26 @@ impl ProjectSearch { cx.notify(); } - fn semantic_search(&mut self, query: String, cx: &mut ModelContext) { + fn semantic_search( + &mut self, + query: String, + include_files: Vec, + exclude_files: Vec, + cx: &mut ModelContext, + ) { let search = SemanticIndex::global(cx).map(|index| { index.update(cx, |semantic_index, cx| { - semantic_index.search_project(self.project.clone(), query.clone(), 10, cx) + semantic_index.search_project( + self.project.clone(), + query.clone(), + 10, + include_files, + exclude_files, + cx, + ) }) }); self.search_id += 1; - // self.active_query = Some(query); self.match_ranges.clear(); self.pending_search = Some(cx.spawn(|this, mut cx| async move { let results = search?.await.log_err()?; @@ -638,8 +650,13 @@ impl ProjectSearchView { } let query = self.query_editor.read(cx).text(cx); 
- self.model - .update(cx, |model, cx| model.semantic_search(query, cx)); + if let Some((included_files, exclude_files)) = + self.get_included_and_excluded_globsets(cx) + { + self.model.update(cx, |model, cx| { + model.semantic_search(query, included_files, exclude_files, cx) + }); + } return; } @@ -648,6 +665,39 @@ impl ProjectSearchView { } } + fn get_included_and_excluded_globsets( + &mut self, + cx: &mut ViewContext, + ) -> Option<(Vec, Vec)> { + let text = self.query_editor.read(cx).text(cx); + let included_files = + match Self::load_glob_set(&self.included_files_editor.read(cx).text(cx)) { + Ok(included_files) => { + self.panels_with_errors.remove(&InputPanel::Include); + included_files + } + Err(_e) => { + self.panels_with_errors.insert(InputPanel::Include); + cx.notify(); + return None; + } + }; + let excluded_files = + match Self::load_glob_set(&self.excluded_files_editor.read(cx).text(cx)) { + Ok(excluded_files) => { + self.panels_with_errors.remove(&InputPanel::Exclude); + excluded_files + } + Err(_e) => { + self.panels_with_errors.insert(InputPanel::Exclude); + cx.notify(); + return None; + } + }; + + Some((included_files, excluded_files)) + } + fn build_search_query(&mut self, cx: &mut ViewContext) -> Option { let text = self.query_editor.read(cx).text(cx); let included_files = diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 35b97245124e8922d5e7a46a369e26c71af7731a..a1f126bfb841ecb8334aeca391ac4959ef9f57b0 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -37,6 +37,7 @@ tiktoken-rs = "0.5.0" parking_lot.workspace = true rand.workspace = true schemars.workspace = true +globset.workspace = true [dev-dependencies] gpui = { path = "../gpui", features = ["test-support"] } diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index fd99594aab578919f80bd8236270b352a8540993..3ba85a275d0a0d6b197bbad22d5ad5bd792a2fbf 100644 --- a/crates/semantic_index/src/db.rs 
+++ b/crates/semantic_index/src/db.rs @@ -1,5 +1,6 @@ use crate::{parsing::Document, SEMANTIC_INDEX_VERSION}; use anyhow::{anyhow, Context, Result}; +use globset::{Glob, GlobMatcher}; use project::Fs; use rpc::proto::Timestamp; use rusqlite::{ @@ -252,18 +253,30 @@ impl VectorDatabase { worktree_ids: &[i64], query_embedding: &Vec, limit: usize, + include_globs: Vec, + exclude_globs: Vec, ) -> Result)>> { let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1); - self.for_each_document(&worktree_ids, |id, embedding| { - let similarity = dot(&embedding, &query_embedding); - let ix = match results - .binary_search_by(|(_, s)| similarity.partial_cmp(&s).unwrap_or(Ordering::Equal)) + self.for_each_document(&worktree_ids, |relative_path, id, embedding| { + if (include_globs.is_empty() + || include_globs + .iter() + .any(|include_glob| include_glob.is_match(relative_path.clone()))) + && (exclude_globs.is_empty() + || !exclude_globs + .iter() + .any(|exclude_glob| exclude_glob.is_match(relative_path.clone()))) { - Ok(ix) => ix, - Err(ix) => ix, - }; - results.insert(ix, (id, similarity)); - results.truncate(limit); + let similarity = dot(&embedding, &query_embedding); + let ix = match results.binary_search_by(|(_, s)| { + similarity.partial_cmp(&s).unwrap_or(Ordering::Equal) + }) { + Ok(ix) => ix, + Err(ix) => ix, + }; + results.insert(ix, (id, similarity)); + results.truncate(limit); + } })?; let ids = results.into_iter().map(|(id, _)| id).collect::>(); @@ -273,12 +286,12 @@ impl VectorDatabase { fn for_each_document( &self, worktree_ids: &[i64], - mut f: impl FnMut(i64, Vec), + mut f: impl FnMut(String, i64, Vec), ) -> Result<()> { let mut query_statement = self.db.prepare( " SELECT - documents.id, documents.embedding + files.relative_path, documents.id, documents.embedding FROM documents, files WHERE @@ -289,10 +302,10 @@ impl VectorDatabase { query_statement .query_map(params![ids_to_sql(worktree_ids)], |row| { - Ok((row.get(0)?, row.get::<_, Embedding>(1)?)) + 
Ok((row.get(0)?, row.get(1)?, row.get::<_, Embedding>(2)?)) })? .filter_map(|row| row.ok()) - .for_each(|(id, embedding)| f(id, embedding.0)); + .for_each(|(relative_path, id, embedding)| f(relative_path, id, embedding.0)); Ok(()) } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 6e0477491518a0c4a18ebfa1c24ddaf51eaf1948..32a11a42ebdcb01205869bcb273784582e291dcf 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -11,6 +11,7 @@ use anyhow::{anyhow, Result}; use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; +use globset::{Glob, GlobMatcher}; use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; use language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; @@ -624,6 +625,8 @@ impl SemanticIndex { project: ModelHandle, phrase: String, limit: usize, + include_globs: Vec, + exclude_globs: Vec, cx: &mut ModelContext, ) -> Task>> { let project_state = if let Some(state) = self.projects.get(&project.downgrade()) { @@ -657,12 +660,16 @@ impl SemanticIndex { .next() .unwrap(); - database.top_k_search(&worktree_db_ids, &phrase_embedding, limit) + database.top_k_search( + &worktree_db_ids, + &phrase_embedding, + limit, + include_globs, + exclude_globs, + ) }) .await?; - dbg!(&documents); - let mut tasks = Vec::new(); let mut ranges = Vec::new(); let weak_project = project.downgrade(); diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 31c96ca207bb3da1ace202bd81df461f27ba229b..366d634ddb68df629832b23c3777d6f5cc775b7c 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -7,6 +7,7 @@ use crate::{ }; use anyhow::Result; use async_trait::async_trait; +use globset::Glob; use gpui::{Task, 
TestAppContext}; use language::{Language, LanguageConfig, LanguageRegistry, ToOffset}; use pretty_assertions::assert_eq; @@ -96,7 +97,7 @@ async fn test_semantic_index(cx: &mut TestAppContext) { let search_results = store .update(cx, |store, cx| { - store.search_project(project.clone(), "aaaa".to_string(), 5, cx) + store.search_project(project.clone(), "aaaa".to_string(), 5, vec![], vec![], cx) }) .await .unwrap(); @@ -109,6 +110,60 @@ async fn test_semantic_index(cx: &mut TestAppContext) { ); }); + // Test Include Files Functonality + let include_files = vec![Glob::new("*.rs").unwrap().compile_matcher()]; + let exclude_files = vec![Glob::new("*.rs").unwrap().compile_matcher()]; + let search_results = store + .update(cx, |store, cx| { + store.search_project( + project.clone(), + "aaaa".to_string(), + 5, + include_files, + vec![], + cx, + ) + }) + .await + .unwrap(); + + for res in &search_results { + res.buffer.read_with(cx, |buffer, _cx| { + assert!(buffer + .file() + .unwrap() + .path() + .to_str() + .unwrap() + .ends_with("rs")); + }); + } + + let search_results = store + .update(cx, |store, cx| { + store.search_project( + project.clone(), + "aaaa".to_string(), + 5, + vec![], + exclude_files, + cx, + ) + }) + .await + .unwrap(); + + for res in &search_results { + res.buffer.read_with(cx, |buffer, _cx| { + assert!(!buffer + .file() + .unwrap() + .path() + .to_str() + .unwrap() + .ends_with("rs")); + }); + } fs.save( "/the-root/src/file2.rs".as_ref(), &" From 81b05f2a083e999751646f53b72212ceedf3b6d9 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 20 Jul 2023 14:23:11 -0700 Subject: [PATCH 26/34] Optimize glob filtering of semantic search Co-authored-by: Kyle --- crates/search/src/project_search.rs | 1 - crates/semantic_index/src/db.rs | 66 +++++++---- crates/semantic_index/src/semantic_index.rs | 2 +- .../src/semantic_index_tests.rs | 103 +++++++++++------- 4 files changed, 109 insertions(+), 63 deletions(-) diff --git a/crates/search/src/project_search.rs 
b/crates/search/src/project_search.rs index 25fc897707af6be8b97b277a2d65b8d4cf1eeb17..28085f59feb16bd9158ee766ba752f4d2cd72340 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -669,7 +669,6 @@ impl ProjectSearchView { &mut self, cx: &mut ViewContext, ) -> Option<(Vec, Vec)> { - let text = self.query_editor.read(cx).text(cx); let included_files = match Self::load_glob_set(&self.included_files_editor.read(cx).text(cx)) { Ok(included_files) => { diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index 3ba85a275d0a0d6b197bbad22d5ad5bd792a2fbf..b1e78b7aff994ca977fbbea41d595f08fb65766a 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -1,6 +1,6 @@ use crate::{parsing::Document, SEMANTIC_INDEX_VERSION}; use anyhow::{anyhow, Context, Result}; -use globset::{Glob, GlobMatcher}; +use globset::GlobMatcher; use project::Fs; use rpc::proto::Timestamp; use rusqlite::{ @@ -257,16 +257,11 @@ impl VectorDatabase { exclude_globs: Vec, ) -> Result)>> { let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1); - self.for_each_document(&worktree_ids, |relative_path, id, embedding| { - if (include_globs.is_empty() - || include_globs - .iter() - .any(|include_glob| include_glob.is_match(relative_path.clone()))) - && (exclude_globs.is_empty() - || !exclude_globs - .iter() - .any(|exclude_glob| exclude_glob.is_match(relative_path.clone()))) - { + self.for_each_document( + &worktree_ids, + include_globs, + exclude_globs, + |id, embedding| { let similarity = dot(&embedding, &query_embedding); let ix = match results.binary_search_by(|(_, s)| { similarity.partial_cmp(&s).unwrap_or(Ordering::Equal) @@ -276,8 +271,8 @@ impl VectorDatabase { }; results.insert(ix, (id, similarity)); results.truncate(limit); - } - })?; + }, + )?; let ids = results.into_iter().map(|(id, _)| id).collect::>(); self.get_documents_by_ids(&ids) @@ -286,26 +281,55 @@ impl VectorDatabase { fn for_each_document( 
&self, worktree_ids: &[i64], - mut f: impl FnMut(String, i64, Vec), + include_globs: Vec, + exclude_globs: Vec, + mut f: impl FnMut(i64, Vec), ) -> Result<()> { + let mut file_query = self.db.prepare( + " + SELECT + id, relative_path + FROM + files + WHERE + worktree_id IN rarray(?) + ", + )?; + + let mut file_ids = Vec::::new(); + let mut rows = file_query.query([ids_to_sql(worktree_ids)])?; + while let Some(row) = rows.next()? { + let file_id = row.get(0)?; + let relative_path = row.get_ref(1)?.as_str()?; + let included = include_globs.is_empty() + || include_globs + .iter() + .any(|glob| glob.is_match(relative_path)); + let excluded = exclude_globs + .iter() + .any(|glob| glob.is_match(relative_path)); + if included && !excluded { + file_ids.push(file_id); + } + } + let mut query_statement = self.db.prepare( " SELECT - files.relative_path, documents.id, documents.embedding + id, embedding FROM - documents, files + documents WHERE - documents.file_id = files.id AND - files.worktree_id IN rarray(?) + file_id IN rarray(?) ", )?; query_statement - .query_map(params![ids_to_sql(worktree_ids)], |row| { - Ok((row.get(0)?, row.get(1)?, row.get::<_, Embedding>(2)?)) + .query_map(params![ids_to_sql(&file_ids)], |row| { + Ok((row.get(0)?, row.get::<_, Embedding>(1)?)) })? 
.filter_map(|row| row.ok()) - .for_each(|(relative_path, id, embedding)| f(relative_path, id, embedding.0)); + .for_each(|(id, embedding)| f(id, embedding.0)); Ok(()) } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 32a11a42ebdcb01205869bcb273784582e291dcf..215ca38a28845fdf8b24d8c5d0a5d1249a03bcec 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -11,7 +11,7 @@ use anyhow::{anyhow, Result}; use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; -use globset::{Glob, GlobMatcher}; +use globset::GlobMatcher; use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; use language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 366d634ddb68df629832b23c3777d6f5cc775b7c..432f6b5b5328ed7bf72c52a34a762de0baacb7de 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -3,7 +3,7 @@ use crate::{ embedding::EmbeddingProvider, parsing::{subtract_ranges, CodeContextRetriever, Document}, semantic_index_settings::SemanticIndexSettings, - SemanticIndex, + SearchResult, SemanticIndex, }; use anyhow::Result; use async_trait::async_trait; @@ -46,21 +46,21 @@ async fn test_semantic_index(cx: &mut TestAppContext) { "src": { "file1.rs": " fn aaa() { - println!(\"aaaa!\"); + println!(\"aaaaaaaaaaaa!\"); } - fn zzzzzzzzz() { + fn zzzzz() { println!(\"SLEEPING\"); } ".unindent(), "file2.rs": " fn bbb() { - println!(\"bbbb!\"); + println!(\"bbbbbbbbbbbbb!\"); } ".unindent(), "file3.toml": " - ZZZZZZZ = 5 - ".unindent(), + ZZZZZZZZZZZZZZZZZZ = 5 + ".unindent(), } }), ) @@ -97,27 +97,37 @@ async fn test_semantic_index(cx: &mut TestAppContext) { let search_results = 
store .update(cx, |store, cx| { - store.search_project(project.clone(), "aaaa".to_string(), 5, vec![], vec![], cx) + store.search_project( + project.clone(), + "aaaaaabbbbzz".to_string(), + 5, + vec![], + vec![], + cx, + ) }) .await .unwrap(); - search_results[0].buffer.read_with(cx, |buffer, _cx| { - assert_eq!(search_results[0].range.start.to_offset(buffer), 0); - assert_eq!( - buffer.file().unwrap().path().as_ref(), - Path::new("src/file1.rs") - ); - }); + assert_search_results( + &search_results, + &[ + (Path::new("src/file1.rs").into(), 0), + (Path::new("src/file2.rs").into(), 0), + (Path::new("src/file3.toml").into(), 0), + (Path::new("src/file1.rs").into(), 45), + ], + cx, + ); // Test Include Files Functonality let include_files = vec![Glob::new("*.rs").unwrap().compile_matcher()]; let exclude_files = vec![Glob::new("*.rs").unwrap().compile_matcher()]; - let search_results = store + let rust_only_search_results = store .update(cx, |store, cx| { store.search_project( project.clone(), - "aaaa".to_string(), + "aaaaaabbbbzz".to_string(), 5, include_files, vec![], @@ -127,23 +137,21 @@ async fn test_semantic_index(cx: &mut TestAppContext) { .await .unwrap(); - for res in &search_results { - res.buffer.read_with(cx, |buffer, _cx| { - assert!(buffer - .file() - .unwrap() - .path() - .to_str() - .unwrap() - .ends_with("rs")); - }); - } + assert_search_results( + &rust_only_search_results, + &[ + (Path::new("src/file1.rs").into(), 0), + (Path::new("src/file2.rs").into(), 0), + (Path::new("src/file1.rs").into(), 45), + ], + cx, + ); - let search_results = store + let no_rust_search_results = store .update(cx, |store, cx| { store.search_project( project.clone(), - "aaaa".to_string(), + "aaaaaabbbbzz".to_string(), 5, vec![], exclude_files, @@ -153,17 +161,12 @@ async fn test_semantic_index(cx: &mut TestAppContext) { .await .unwrap(); - for res in &search_results { - res.buffer.read_with(cx, |buffer, _cx| { - assert!(!buffer - .file() - .unwrap() - .path() - .to_str() - 
.unwrap() - .ends_with("rs")); - }); - } + assert_search_results( + &no_rust_search_results, + &[(Path::new("src/file3.toml").into(), 0)], + cx, + ); + fs.save( "/the-root/src/file2.rs".as_ref(), &" @@ -195,6 +198,26 @@ async fn test_semantic_index(cx: &mut TestAppContext) { ); } +#[track_caller] +fn assert_search_results( + actual: &[SearchResult], + expected: &[(Arc, usize)], + cx: &TestAppContext, +) { + let actual = actual + .iter() + .map(|search_result| { + search_result.buffer.read_with(cx, |buffer, _cx| { + ( + buffer.file().unwrap().path().clone(), + search_result.range.start.to_offset(buffer), + ) + }) + }) + .collect::>(); + assert_eq!(actual, expected); +} + #[gpui::test] async fn test_code_context_retrieval_rust() { let language = rust_lang(); From c86096a886701c96c6dd09fca36c0281bc140111 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 25 Jul 2023 10:38:37 -0400 Subject: [PATCH 27/34] update semantic index tests for javascript --- .../src/semantic_index_tests.rs | 196 +++++++++++------- .../src/languages/javascript/embedding.scm | 92 ++++---- crates/zed/src/languages/tsx/embedding.scm | 106 ++++------ .../src/languages/typescript/embedding.scm | 116 +++++------ 4 files changed, 259 insertions(+), 251 deletions(-) diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 432f6b5b5328ed7bf72c52a34a762de0baacb7de..9bd6efc954ced0f505d8532c3049be9be13466af 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -388,43 +388,103 @@ fn assert_documents_eq( ); } -// #[gpui::test] -// async fn test_code_context_retrieval_javascript() { -// let language = js_lang(); -// let mut retriever = CodeContextRetriever::new(); +#[gpui::test] +async fn test_code_context_retrieval_javascript() { + let language = js_lang(); + let mut retriever = CodeContextRetriever::new(); -// let text = " -// /* globals importScripts, backend */ -// function 
_authorize() {} + let text = " + /* globals importScripts, backend */ + function _authorize() {} + + /** + * Sometimes the frontend build is way faster than backend. + */ + export async function authorizeBank() { + _authorize(pushModal, upgradingAccountId, {}); + } -// /** -// * Sometimes the frontend build is way faster than backend. -// */ -// export async function authorizeBank() { -// _authorize(pushModal, upgradingAccountId, {}); -// } + export class SettingsPage { + /* This is a test setting */ + constructor(page) { + this.page = page; + } + } -// export class SettingsPage { -// /* This is a test setting */ -// constructor(page) { -// this.page = page; -// } -// } + /* This is a test comment */ + class TestClass {} -// /* This is a test comment */ -// class TestClass {} + /* Schema for editor_events in Clickhouse. */ + export interface ClickhouseEditorEvent { + installation_id: string + operation: string + } + " + .unindent(); -// /* Schema for editor_events in Clickhouse. */ -// export interface ClickhouseEditorEvent { -// installation_id: string -// operation: string -// } -// " -// .unindent(); + let documents = retriever.parse_file(&text, language.clone()).unwrap(); -// let parsed_files = retriever -// .parse_file(Path::new("foo.js"), &text, language) -// .unwrap(); + assert_documents_eq( + &documents, + &[ + ( + " + /* globals importScripts, backend */ + function _authorize() {}" + .unindent(), + 37, + ), + ( + " + /** + * Sometimes the frontend build is way faster than backend. 
+ */ + export async function authorizeBank() { + _authorize(pushModal, upgradingAccountId, {}); + }" + .unindent(), + 131, + ), + ( + " + export class SettingsPage { + /* This is a test setting */ + constructor(page) { + this.page = page; + } + }" + .unindent(), + 225, + ), + ( + " + /* This is a test setting */ + constructor(page) { + this.page = page; + }" + .unindent(), + 290, + ), + ( + " + /* This is a test comment */ + class TestClass {}" + .unindent(), + 374, + ), + ( + " + /* Schema for editor_events in Clickhouse. */ + export interface ClickhouseEditorEvent { + installation_id: string + operation: string + }" + .unindent(), + 440, + ), + ], + ) +} // let test_documents = &[ // Document { @@ -924,86 +984,74 @@ fn js_lang() -> Arc { ( (comment)* @context . + [ (export_statement (function_declaration "async"? @name "function" @name - name: (_) @name)) @item - ) - - ( - (comment)* @context - . + name: (_) @name)) (function_declaration "async"? @name "function" @name - name: (_) @name) @item - ) + name: (_) @name) + ] @item + ) ( (comment)* @context . + [ (export_statement (class_declaration "class" @name - name: (_) @name)) @item - ) - - ( - (comment)* @context - . + name: (_) @name)) (class_declaration "class" @name - name: (_) @name) @item - ) - - ( - (comment)* @context - . - (method_definition - [ - "get" - "set" - "async" - "*" - "static" - ]* @name - name: (_) @name) @item - ) + name: (_) @name) + ] @item + ) ( (comment)* @context . + [ (export_statement (interface_declaration "interface" @name - name: (_) @name)) @item - ) - - ( - (comment)* @context - . + name: (_) @name)) (interface_declaration "interface" @name - name: (_) @name) @item - ) + name: (_) @name) + ] @item + ) ( (comment)* @context . + [ (export_statement (enum_declaration "enum" @name - name: (_) @name)) @item - ) + name: (_) @name)) + (enum_declaration + "enum" @name + name: (_) @name) + ] @item + ) ( (comment)* @context . 
- (enum_declaration - "enum" @name + (method_definition + [ + "get" + "set" + "async" + "*" + "static" + ]* @name name: (_) @name) @item - ) + ) "# .unindent(), diff --git a/crates/zed/src/languages/javascript/embedding.scm b/crates/zed/src/languages/javascript/embedding.scm index a2140400318db95a8d29074402ab2d212561a79b..ab1a3b6b063c3bf57adad3c302a156fcd0239448 100644 --- a/crates/zed/src/languages/javascript/embedding.scm +++ b/crates/zed/src/languages/javascript/embedding.scm @@ -1,38 +1,60 @@ ( (comment)* @context . - (export_statement + [ + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) (function_declaration "async"? @name "function" @name - name: (_) @name)) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (function_declaration - "async"? @name - "function" @name - name: (_) @name) @item - ) + [ + (export_statement + (class_declaration + "class" @name + name: (_) @name)) + (class_declaration + "class" @name + name: (_) @name) + ] @item +) ( (comment)* @context . - (export_statement - (class_declaration - "class" @name - name: (_) @name)) @item - ) + [ + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) + (interface_declaration + "interface" @name + name: (_) @name) + ] @item +) ( (comment)* @context . - (class_declaration - "class" @name - name: (_) @name) @item - ) + [ + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) + (enum_declaration + "enum" @name + name: (_) @name) + ] @item +) ( (comment)* @context @@ -46,38 +68,4 @@ "static" ]* @name name: (_) @name) @item - ) - -( - (comment)* @context - . - (export_statement - (interface_declaration - "interface" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (interface_declaration - "interface" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . 
- (export_statement - (enum_declaration - "enum" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (enum_declaration - "enum" @name - name: (_) @name) @item - ) +) diff --git a/crates/zed/src/languages/tsx/embedding.scm b/crates/zed/src/languages/tsx/embedding.scm index 4bb4fea254d0cf86f2fbb9d5c8f657e06238971f..ddcff665841091aa170bd5f9bb60439a2cadb2c5 100644 --- a/crates/zed/src/languages/tsx/embedding.scm +++ b/crates/zed/src/languages/tsx/embedding.scm @@ -1,99 +1,85 @@ ( (comment)* @context . - (export_statement + [ + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) (function_declaration "async"? @name "function" @name - name: (_) @name)) @item + name: (_) @name) + ] @item ) ( (comment)* @context . - (function_declaration - "async"? @name - "function" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . - (export_statement + [ + (export_statement + (class_declaration + "class" @name + name: (_) @name)) (class_declaration "class" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (class_declaration - "class" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . - (method_definition - [ - "get" - "set" - "async" - "*" - "static" - ]* @name - name: (_) @name) @item + name: (_) @name) + ] @item ) ( (comment)* @context . - (export_statement + [ + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) (interface_declaration "interface" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (interface_declaration - "interface" @name - name: (_) @name) @item + name: (_) @name) + ] @item ) ( (comment)* @context . - (export_statement + [ + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) (enum_declaration "enum" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (enum_declaration - "enum" @name - name: (_) @name) @item + name: (_) @name) + ] @item ) ( (comment)* @context . 
- (export_statement + [ + (export_statement + (type_alias_declaration + "type" @name + name: (_) @name)) (type_alias_declaration "type" @name - name: (_) @name)) @item + name: (_) @name) + ] @item ) ( (comment)* @context . - (type_alias_declaration - "type" @name - name: (_) @name) @item) + (method_definition + [ + "get" + "set" + "async" + "*" + "static" + ]* @name + name: (_) @name) @item + ) diff --git a/crates/zed/src/languages/typescript/embedding.scm b/crates/zed/src/languages/typescript/embedding.scm index 4bb4fea254d0cf86f2fbb9d5c8f657e06238971f..3170cb7c957e51e00c175c7eaa2b4b51deda042a 100644 --- a/crates/zed/src/languages/typescript/embedding.scm +++ b/crates/zed/src/languages/typescript/embedding.scm @@ -1,99 +1,85 @@ ( (comment)* @context . - (export_statement + [ + (export_statement + (function_declaration + "async"? @name + "function" @name + name: (_) @name)) (function_declaration "async"? @name "function" @name - name: (_) @name)) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (function_declaration - "async"? @name - "function" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . - (export_statement + [ + (export_statement + (class_declaration + "class" @name + name: (_) @name)) (class_declaration "class" @name - name: (_) @name)) @item - ) - -( - (comment)* @context - . - (class_declaration - "class" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . - (method_definition - [ - "get" - "set" - "async" - "*" - "static" - ]* @name - name: (_) @name) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (export_statement + [ + (export_statement + (interface_declaration + "interface" @name + name: (_) @name)) (interface_declaration "interface" @name - name: (_) @name)) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (interface_declaration - "interface" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . 
- (export_statement + [ + (export_statement + (enum_declaration + "enum" @name + name: (_) @name)) (enum_declaration "enum" @name - name: (_) @name)) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (enum_declaration - "enum" @name - name: (_) @name) @item - ) - -( - (comment)* @context - . - (export_statement + [ + (export_statement + (type_alias_declaration + "type" @name + name: (_) @name)) (type_alias_declaration "type" @name - name: (_) @name)) @item - ) + name: (_) @name) + ] @item +) ( (comment)* @context . - (type_alias_declaration - "type" @name - name: (_) @name) @item) + (method_definition + [ + "get" + "set" + "async" + "*" + "static" + ]* @name + name: (_) @name) @item +) From 97c3d97792ec0feb1e93de302c377fe8df28fcf0 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 25 Jul 2023 13:30:38 -0400 Subject: [PATCH 28/34] update semantic index tests for cpp --- .../src/semantic_index_tests.rs | 418 +++++++----------- 1 file changed, 148 insertions(+), 270 deletions(-) diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 9bd6efc954ced0f505d8532c3049be9be13466af..2ae9a06c0fbbeeb3371ff958612b9d94bc88daef 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -486,105 +486,6 @@ async fn test_code_context_retrieval_javascript() { ) } -// let test_documents = &[ -// Document { -// name: "function _authorize".into(), -// range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1), -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// /* globals importScripts, backend */ -// function _authorize() {} -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "async function authorizeBank".into(), -// range: text.find("export async").unwrap()..223, -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// /** -// 
* Sometimes the frontend build is way faster than backend. -// */ -// export async function authorizeBank() { -// _authorize(pushModal, upgradingAccountId, {}); -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "class SettingsPage".into(), -// range: 225..343, -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// export class SettingsPage { -// /* This is a test setting */ -// constructor(page) { -// this.page = page; -// } -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "constructor".into(), -// range: 290..341, -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// /* This is a test setting */ -// constructor(page) { -// this.page = page; -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "class TestClass".into(), -// range: 374..392, -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// /* This is a test comment */ -// class TestClass {} -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "interface ClickhouseEditorEvent".into(), -// range: 440..532, -// content: " -// The below code snippet is from file 'foo.js' - -// ```javascript -// /* Schema for editor_events in Clickhouse. 
*/ -// export interface ClickhouseEditorEvent { -// installation_id: string -// operation: string -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// ]; - -// for idx in 0..test_documents.len() { -// assert_eq!(test_documents[idx], parsed_files[idx]); -// } -// } - // #[gpui::test] // async fn test_code_context_retrieval_elixir() { // let language = elixir_lang(); @@ -722,180 +623,157 @@ async fn test_code_context_retrieval_javascript() { // } // } -// #[gpui::test] -// async fn test_code_context_retrieval_cpp() { -// let language = cpp_lang(); -// let mut retriever = CodeContextRetriever::new(); - -// let text = " -// /** -// * @brief Main function -// * @returns 0 on exit -// */ -// int main() { return 0; } - -// /** -// * This is a test comment -// */ -// class MyClass { // The class -// public: // Access specifier -// int myNum; // Attribute (int variable) -// string myString; // Attribute (string variable) -// }; - -// // This is a test comment -// enum Color { red, green, blue }; - -// /** This is a preceding block comment -// * This is the second line -// */ -// struct { // Structure declaration -// int myNum; // Member (int variable) -// string myString; // Member (string variable) -// } myStructure; - -// /** -// * @brief Matrix class. -// */ -// template ::value || std::is_floating_point::value, -// bool>::type> -// class Matrix2 { -// std::vector> _mat; - -// public: -// /** -// * @brief Constructor -// * @tparam Integer ensuring integers are being evaluated and not other -// * data types. 
-// * @param size denoting the size of Matrix as size x size -// */ -// template ::value, -// Integer>::type> -// explicit Matrix(const Integer size) { -// for (size_t i = 0; i < size; ++i) { -// _mat.emplace_back(std::vector(size, 0)); -// } -// } -// }" -// .unindent(); +#[gpui::test] +async fn test_code_context_retrieval_cpp() { + let language = cpp_lang(); + let mut retriever = CodeContextRetriever::new(); -// let parsed_files = retriever -// .parse_file(Path::new("foo.cpp"), &text, language) -// .unwrap(); + let text = " + /** + * @brief Main function + * @returns 0 on exit + */ + int main() { return 0; } + + /** + * This is a test comment + */ + class MyClass { // The class + public: // Access specifier + int myNum; // Attribute (int variable) + string myString; // Attribute (string variable) + }; + + // This is a test comment + enum Color { red, green, blue }; + + /** This is a preceding block comment + * This is the second line + */ + struct { // Structure declaration + int myNum; // Member (int variable) + string myString; // Member (string variable) + } myStructure; + + /** + * @brief Matrix class. + */ + template ::value || std::is_floating_point::value, + bool>::type> + class Matrix2 { + std::vector> _mat; + + public: + /** + * @brief Constructor + * @tparam Integer ensuring integers are being evaluated and not other + * data types. 
+ * @param size denoting the size of Matrix as size x size + */ + template ::value, + Integer>::type> + explicit Matrix(const Integer size) { + for (size_t i = 0; i < size; ++i) { + _mat.emplace_back(std::vector(size, 0)); + } + } + }" + .unindent(); -// let test_documents = &[ -// Document { -// name: "int main".into(), -// range: 54..78, -// content: " -// The below code snippet is from file 'foo.cpp' - -// ```cpp -// /** -// * @brief Main function -// * @returns 0 on exit -// */ -// int main() { return 0; } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "class MyClass".into(), -// range: 112..295, -// content: " -// The below code snippet is from file 'foo.cpp' - -// ```cpp -// /** -// * This is a test comment -// */ -// class MyClass { // The class -// public: // Access specifier -// int myNum; // Attribute (int variable) -// string myString; // Attribute (string variable) -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "enum Color".into(), -// range: 324..355, -// content: " -// The below code snippet is from file 'foo.cpp' - -// ```cpp -// // This is a test comment -// enum Color { red, green, blue } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "struct myStructure".into(), -// range: 428..581, -// content: " -// The below code snippet is from file 'foo.cpp' - -// ```cpp -// /** This is a preceding block comment -// * This is the second line -// */ -// struct { // Structure declaration -// int myNum; // Member (int variable) -// string myString; // Member (string variable) -// } myStructure; -// ```" -// .unindent(), -// embedding: vec![], -// }, -// Document { -// name: "class Matrix2".into(), -// range: 613..1342, -// content: " -// The below code snippet is from file 'foo.cpp' - -// ```cpp -// /** -// * @brief Matrix class. 
-// */ -// template ::value || std::is_floating_point::value, -// bool>::type> -// class Matrix2 { -// std::vector> _mat; - -// public: -// /** -// * @brief Constructor -// * @tparam Integer ensuring integers are being evaluated and not other -// * data types. -// * @param size denoting the size of Matrix as size x size -// */ -// template ::value, -// Integer>::type> -// explicit Matrix(const Integer size) { -// for (size_t i = 0; i < size; ++i) { -// _mat.emplace_back(std::vector(size, 0)); -// } -// } -// } -// ```" -// .unindent(), -// embedding: vec![], -// }, -// ]; + let documents = retriever.parse_file(&text, language.clone()).unwrap(); -// for idx in 0..test_documents.len() { -// assert_eq!(test_documents[idx], parsed_files[idx]); -// } -// } + assert_documents_eq( + &documents, + &[ + ( + " + /** + * @brief Main function + * @returns 0 on exit + */ + int main() { return 0; }" + .unindent(), + 54, + ), + ( + " + /** + * This is a test comment + */ + class MyClass { // The class + public: // Access specifier + int myNum; // Attribute (int variable) + string myString; // Attribute (string variable) + }" + .unindent(), + 112, + ), + ( + " + // This is a test comment + enum Color { red, green, blue }" + .unindent(), + 322, + ), + ( + " + /** This is a preceding block comment + * This is the second line + */ + struct { // Structure declaration + int myNum; // Member (int variable) + string myString; // Member (string variable) + } myStructure;" + .unindent(), + 425, + ), + ( + " + /** + * @brief Matrix class. + */ + template ::value || std::is_floating_point::value, + bool>::type> + class Matrix2 { + std::vector> _mat; + + public: + /** + * @brief Constructor + * @tparam Integer ensuring integers are being evaluated and not other + * data types. 
+ * @param size denoting the size of Matrix as size x size + */ + template ::value, + Integer>::type> + explicit Matrix(const Integer size) { + for (size_t i = 0; i < size; ++i) { + _mat.emplace_back(std::vector(size, 0)); + } + } + }" + .unindent(), + 612, + ), + ( + " + explicit Matrix(const Integer size) { + for (size_t i = 0; i < size; ++i) { + _mat.emplace_back(std::vector(size, 0)); + } + }" + .unindent(), + 1226, + ), + ], + ); +} #[gpui::test] fn test_dot_product(mut rng: StdRng) { From cdceddd2cc3d21f3efd504e8b568a67535093626 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 25 Jul 2023 15:20:35 -0400 Subject: [PATCH 29/34] update semantic index tests for elixir --- .../src/semantic_index_tests.rs | 251 ++++++++---------- 1 file changed, 115 insertions(+), 136 deletions(-) diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 2ae9a06c0fbbeeb3371ff958612b9d94bc88daef..acf5a9d72b43a1123102e105da2b4b039fba87c6 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -486,142 +486,121 @@ async fn test_code_context_retrieval_javascript() { ) } -// #[gpui::test] -// async fn test_code_context_retrieval_elixir() { -// let language = elixir_lang(); -// let mut retriever = CodeContextRetriever::new(); - -// let text = r#" -// defmodule File.Stream do -// @moduledoc """ -// Defines a `File.Stream` struct returned by `File.stream!/3`. 
- -// The following fields are public: - -// * `path` - the file path -// * `modes` - the file modes -// * `raw` - a boolean indicating if bin functions should be used -// * `line_or_bytes` - if reading should read lines or a given number of bytes -// * `node` - the node the file belongs to - -// """ - -// defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil - -// @type t :: %__MODULE__{} - -// @doc false -// def __build__(path, modes, line_or_bytes) do -// raw = :lists.keyfind(:encoding, 1, modes) == false - -// modes = -// case raw do -// true -> -// case :lists.keyfind(:read_ahead, 1, modes) do -// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] -// {:read_ahead, _} -> [:raw | modes] -// false -> [:raw, :read_ahead | modes] -// end - -// false -> -// modes -// end - -// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - -// end -// "# -// .unindent(); - -// let parsed_files = retriever -// .parse_file(Path::new("foo.ex"), &text, language) -// .unwrap(); - -// let test_documents = &[ -// Document{ -// name: "defmodule File.Stream".into(), -// range: 0..1132, -// content: r#" -// The below code snippet is from file 'foo.ex' - -// ```elixir -// defmodule File.Stream do -// @moduledoc """ -// Defines a `File.Stream` struct returned by `File.stream!/3`. 
- -// The following fields are public: - -// * `path` - the file path -// * `modes` - the file modes -// * `raw` - a boolean indicating if bin functions should be used -// * `line_or_bytes` - if reading should read lines or a given number of bytes -// * `node` - the node the file belongs to - -// """ - -// defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil - -// @type t :: %__MODULE__{} - -// @doc false -// def __build__(path, modes, line_or_bytes) do -// raw = :lists.keyfind(:encoding, 1, modes) == false - -// modes = -// case raw do -// true -> -// case :lists.keyfind(:read_ahead, 1, modes) do -// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] -// {:read_ahead, _} -> [:raw | modes] -// false -> [:raw, :read_ahead | modes] -// end - -// false -> -// modes -// end - -// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - -// end -// ```"#.unindent(), -// embedding: vec![], -// }, -// Document { -// name: "def __build__".into(), -// range: 574..1132, -// content: r#" -// The below code snippet is from file 'foo.ex' - -// ```elixir -// @doc false -// def __build__(path, modes, line_or_bytes) do -// raw = :lists.keyfind(:encoding, 1, modes) == false - -// modes = -// case raw do -// true -> -// case :lists.keyfind(:read_ahead, 1, modes) do -// {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] -// {:read_ahead, _} -> [:raw | modes] -// false -> [:raw, :read_ahead | modes] -// end - -// false -> -// modes -// end - -// %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} - -// end -// ```"# -// .unindent(), -// embedding: vec![], -// }]; - -// for idx in 0..test_documents.len() { -// assert_eq!(test_documents[idx], parsed_files[idx]); -// } -// } +#[gpui::test] +async fn test_code_context_retrieval_elixir() { + let language = elixir_lang(); + let mut retriever = CodeContextRetriever::new(); + + let text = r#" + 
defmodule File.Stream do + @moduledoc """ + Defines a `File.Stream` struct returned by `File.stream!/3`. + + The following fields are public: + + * `path` - the file path + * `modes` - the file modes + * `raw` - a boolean indicating if bin functions should be used + * `line_or_bytes` - if reading should read lines or a given number of bytes + * `node` - the node the file belongs to + + """ + + defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + + @type t :: %__MODULE__{} + + @doc false + def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end"# + .unindent(); + + let documents = retriever.parse_file(&text, language.clone()).unwrap(); + + assert_documents_eq( + &documents, + &[( + r#" + defmodule File.Stream do + @moduledoc """ + Defines a `File.Stream` struct returned by `File.stream!/3`. 
+ + The following fields are public: + + * `path` - the file path + * `modes` - the file modes + * `raw` - a boolean indicating if bin functions should be used + * `line_or_bytes` - if reading should read lines or a given number of bytes + * `node` - the node the file belongs to + + """ + + defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil + + @type t :: %__MODULE__{} + + @doc false + def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end"# + .unindent(), + 0, + ),(r#" + @doc false + def __build__(path, modes, line_or_bytes) do + raw = :lists.keyfind(:encoding, 1, modes) == false + + modes = + case raw do + true -> + case :lists.keyfind(:read_ahead, 1, modes) do + {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)] + {:read_ahead, _} -> [:raw | modes] + false -> [:raw, :read_ahead | modes] + end + + false -> + modes + end + + %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()} + + end"#.unindent(), 574)], + ); +} #[gpui::test] async fn test_code_context_retrieval_cpp() { From e8210b827d8bb0871dbebf046506e12d4d6a934d Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 25 Jul 2023 15:24:27 -0400 Subject: [PATCH 30/34] move visible text to just start anchor with context lines for semantic search --- crates/search/src/project_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 28085f59feb16bd9158ee766ba752f4d2cd72340..6903337e07160bdb1bec048ad9d3fe4672c91f09 100644 --- 
a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -217,7 +217,7 @@ impl ProjectSearch { let matches = results .into_iter() - .map(|result| (result.buffer, vec![result.range])) + .map(|result| (result.buffer, vec![result.range.start..result.range.start])) .collect(); excerpts.stream_excerpts_with_context_lines(matches, 3, cx) From 75999204adcee661da958dd27fc3acc536d05b67 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 25 Jul 2023 16:26:37 -0400 Subject: [PATCH 31/34] update project search to only show semantic button visible with semantic_index enabled --- crates/search/src/project_search.rs | 24 ++++++++++++++++----- crates/semantic_index/src/semantic_index.rs | 6 +++++- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 6903337e07160bdb1bec048ad9d3fe4672c91f09..ec9108f92cffa392233f22625f40c15da59bf3cc 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -996,6 +996,10 @@ impl ProjectSearchBar { SearchOption::Regex => &mut search_view.regex, }; *value = !*value; + + if value.clone() { + search_view.semantic = None; + } search_view.search(cx); }); cx.notify(); @@ -1012,6 +1016,9 @@ impl ProjectSearchBar { search_view.semantic = None; } else if let Some(semantic_index) = SemanticIndex::global(cx) { // TODO: confirm that it's ok to send this project + search_view.regex = false; + search_view.case_sensitive = false; + search_view.whole_word = false; let project = search_view.model.read(cx).project.clone(); let index_task = semantic_index.update(cx, |semantic_index, cx| { @@ -1266,9 +1273,14 @@ impl View for ProjectSearchBar { .with_child(self.render_nav_button(">", Direction::Next, cx)) .aligned(), ) - .with_child( - Flex::row() - .with_child(self.render_semantic_search_button(cx)) + .with_child({ + let row = if SemanticIndex::enabled(cx) { + 
Flex::row().with_child(self.render_semantic_search_button(cx)) + } else { + Flex::row() + }; + + let row = row .with_child(self.render_option_button( "Case", SearchOption::CaseSensitive, @@ -1286,8 +1298,10 @@ impl View for ProjectSearchBar { )) .contained() .with_style(theme.search.option_button_group) - .aligned(), - ) + .aligned(); + + row + }) .contained() .with_margin_bottom(row_spacing), ) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 215ca38a28845fdf8b24d8c5d0a5d1249a03bcec..7e8d183ba00a9626af53a881a9fa3d272c257a83 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -1,7 +1,7 @@ mod db; mod embedding; mod parsing; -mod semantic_index_settings; +pub mod semantic_index_settings; #[cfg(test)] mod semantic_index_tests; @@ -183,6 +183,10 @@ impl SemanticIndex { } } + pub fn enabled(cx: &AppContext) -> bool { + settings::get::(cx).enabled + } + async fn new( fs: Arc, database_url: PathBuf, From ca6f7d8a804ec486a4608cd627a7e9182f51a3c8 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 26 Jul 2023 09:17:04 -0400 Subject: [PATCH 32/34] add worktree previously indexed functionality to vector db --- crates/semantic_index/src/db.rs | 17 +++++++ crates/semantic_index/src/semantic_index.rs | 53 ++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index b1e78b7aff994ca977fbbea41d595f08fb65766a..4bc97da0f08e3d56dd249da72cd8deaff56f7e0f 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -197,6 +197,23 @@ impl VectorDatabase { Ok(()) } + pub fn worktree_previously_indexed(&self, worktree_root_path: &Path) -> Result { + let mut worktree_query = self + .db + .prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?; + let worktree_id = worktree_query + .query_row(params![worktree_root_path.to_string_lossy()], |row| { + 
Ok(row.get::<_, i64>(0)?) + }) + .map_err(|err| anyhow!(err)); + + if worktree_id.is_ok() { + return Ok(true); + } else { + return Ok(false); + } + } + pub fn find_or_create_worktree(&self, worktree_root_path: &Path) -> Result { // Check that the absolute path doesnt exist let mut worktree_query = self diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 7e8d183ba00a9626af53a881a9fa3d272c257a83..7fee09dcff1d3cdae5b85ec01a11de632da71528 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -34,7 +34,7 @@ use util::{ ResultExt, }; -const SEMANTIC_INDEX_VERSION: usize = 5; +const SEMANTIC_INDEX_VERSION: usize = 6; const EMBEDDINGS_BATCH_SIZE: usize = 80; pub fn init( @@ -161,6 +161,10 @@ enum DbOperation { worktree_id: i64, sender: oneshot::Sender>>, }, + WorktreePreviouslyIndexed { + path: Arc, + sender: oneshot::Sender>, + }, } enum EmbeddingJob { @@ -327,6 +331,10 @@ impl SemanticIndex { let file_mtimes = db.get_file_mtimes(worktree_db_id); sender.send(file_mtimes).ok(); } + DbOperation::WorktreePreviouslyIndexed { path, sender } => { + let worktree_indexed = db.worktree_previously_indexed(path.as_ref()); + sender.send(worktree_indexed).ok(); + } } } @@ -479,6 +487,49 @@ impl SemanticIndex { async move { rx.await? } } + fn worktree_previously_indexed(&self, path: Arc) -> impl Future> { + let (tx, rx) = oneshot::channel(); + self.db_update_tx + .try_send(DbOperation::WorktreePreviouslyIndexed { path, sender: tx }) + .unwrap(); + async move { rx.await? 
} + } + + pub fn project_previously_indexed( + &mut self, + project: ModelHandle, + cx: &mut ModelContext, + ) -> Task> { + let worktree_scans_complete = project + .read(cx) + .worktrees(cx) + .map(|worktree| { + let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete(); + async move { + scan_complete.await; + } + }) + .collect::>(); + + let worktrees_indexed_previously = project + .read(cx) + .worktrees(cx) + .map(|worktree| self.worktree_previously_indexed(worktree.read(cx).abs_path())) + .collect::>(); + + cx.spawn(|this, mut cx| async move { + futures::future::join_all(worktree_scans_complete).await; + + let worktree_indexed_previously = + futures::future::join_all(worktrees_indexed_previously).await; + + Ok(worktree_indexed_previously + .iter() + .filter(|worktree| worktree.is_ok()) + .all(|v| v.as_ref().log_err().is_some_and(|v| v.to_owned()))) + }) + } + pub fn index_project( &mut self, project: ModelHandle, From 394a105639413d83c8486ff3ccac2530f6d7dcf2 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 26 Jul 2023 10:03:30 -0400 Subject: [PATCH 33/34] fix warnings --- crates/semantic_index/src/semantic_index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 7fee09dcff1d3cdae5b85ec01a11de632da71528..396a0a8607dc6d31d3ff13a6eb1c00a54ccf061f 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -517,7 +517,7 @@ impl SemanticIndex { .map(|worktree| self.worktree_previously_indexed(worktree.read(cx).abs_path())) .collect::>(); - cx.spawn(|this, mut cx| async move { + cx.spawn(|_, _cx| async move { futures::future::join_all(worktree_scans_complete).await; let worktree_indexed_previously = From 0b61c93a25c23487c1bb52107d9fa5cec80618cf Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 26 Jul 2023 10:22:33 -0400 Subject: [PATCH 34/34] ensure semantic search is not enabled on stable 
--- crates/semantic_index/src/semantic_index.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 396a0a8607dc6d31d3ff13a6eb1c00a54ccf061f..e4a307573aabc00336863b23a799138c52adc895 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -189,6 +189,7 @@ impl SemanticIndex { pub fn enabled(cx: &AppContext) -> bool { settings::get::(cx).enabled + && *RELEASE_CHANNEL != ReleaseChannel::Stable } async fn new(