More fixes to the semantic index's chunking (#11376)

Created by Max Brunsfeld and Marshall

This fixes a tricky intermittent issue I was seeing, where we failed to
chunk certain files correctly because of the way we reuse Tree-sitter
`Parser` instances across parses.

I've also accounted for leading comments in chunk boundaries, so that
items are grouped with their leading comments whenever possible when
chunking.

Finally, we've changed the `debug project index` action so that it opens
a simple debug view in a pane, instead of printing paths to the console.
This lets you click into a path and see how it was chunked.

Release Notes:

- N/A

---------

Co-authored-by: Marshall <marshall@zed.dev>

Change summary

Cargo.lock                                            |   3 
crates/assistant2/src/assistant2.rs                   |  35 
crates/editor/src/editor.rs                           |  32 
crates/language/src/buffer_tests.rs                   |  25 
crates/language/src/language.rs                       |  37 
crates/language/src/language_registry.rs              |   7 
crates/language/src/syntax_map.rs                     |  12 
crates/semantic_index/Cargo.toml                      |   4 
crates/semantic_index/src/chunking.rs                 | 171 ++++--
crates/semantic_index/src/project_index_debug_view.rs | 300 +++++++++++++
crates/semantic_index/src/semantic_index.rs           |  79 ++-
11 files changed, 533 insertions(+), 172 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -8713,9 +8713,12 @@ dependencies = [
  "sha2 0.10.7",
  "smol",
  "tempfile",
+ "theme",
  "tree-sitter",
+ "ui",
  "unindent",
  "util",
+ "workspace",
  "worktree",
 ]
 

crates/assistant2/src/assistant2.rs 🔗

@@ -21,7 +21,7 @@ use gpui::{
 use language::{language_settings::SoftWrap, LanguageRegistry};
 use open_ai::{FunctionContent, ToolCall, ToolCallContent};
 use rich_text::RichText;
-use semantic_index::{CloudEmbeddingProvider, ProjectIndex, SemanticIndex};
+use semantic_index::{CloudEmbeddingProvider, ProjectIndex, ProjectIndexDebugView, SemanticIndex};
 use serde::Deserialize;
 use settings::Settings;
 use std::sync::Arc;
@@ -83,6 +83,14 @@ pub fn init(client: Arc<Client>, cx: &mut AppContext) {
             workspace.register_action(|workspace, _: &ToggleFocus, cx| {
                 workspace.toggle_panel_focus::<AssistantPanel>(cx);
             });
+            workspace.register_action(|workspace, _: &DebugProjectIndex, cx| {
+                if let Some(panel) = workspace.panel::<AssistantPanel>(cx) {
+                    if let Some(index) = panel.read(cx).chat.read(cx).project_index.clone() {
+                        let view = cx.new_view(|cx| ProjectIndexDebugView::new(index, cx));
+                        workspace.add_item_to_center(Box::new(view), cx);
+                    }
+                }
+            });
         },
     )
     .detach();
@@ -107,8 +115,6 @@ impl AssistantPanel {
                 (workspace.app_state().clone(), workspace.project().clone())
             })?;
 
-            let user_store = app_state.user_store.clone();
-
             cx.new_view(|cx| {
                 let project_index = cx.update_global(|semantic_index: &mut SemanticIndex, cx| {
                     semantic_index.project_index(project.clone(), cx)
@@ -117,7 +123,7 @@ impl AssistantPanel {
                 let mut tool_registry = ToolRegistry::new();
                 tool_registry
                     .register(
-                        ProjectIndexTool::new(project_index.clone(), app_state.fs.clone()),
+                        ProjectIndexTool::new(project_index.clone(), project.read(cx).fs().clone()),
                         cx,
                     )
                     .context("failed to register ProjectIndexTool")
@@ -135,9 +141,9 @@ impl AssistantPanel {
 
                 Self::new(
                     app_state.languages.clone(),
-                    Arc::new(attachment_store),
                     Arc::new(tool_registry),
-                    user_store,
+                    Arc::new(attachment_store),
+                    app_state.user_store.clone(),
                     Some(project_index),
                     workspace,
                     cx,
@@ -148,8 +154,8 @@ impl AssistantPanel {
 
     pub fn new(
         language_registry: Arc<LanguageRegistry>,
-        attachment_store: Arc<UserAttachmentStore>,
         tool_registry: Arc<ToolRegistry>,
+        attachment_store: Arc<UserAttachmentStore>,
         user_store: Model<UserStore>,
         project_index: Option<Model<ProjectIndex>>,
         workspace: WeakView<Workspace>,
@@ -157,9 +163,9 @@ impl AssistantPanel {
     ) -> Self {
         let chat = cx.new_view(|cx| {
             AssistantChat::new(
-                language_registry.clone(),
-                attachment_store.clone(),
+                language_registry,
                 tool_registry.clone(),
+                attachment_store,
                 user_store,
                 project_index,
                 workspace,
@@ -257,8 +263,8 @@ struct EditingMessage {
 impl AssistantChat {
     fn new(
         language_registry: Arc<LanguageRegistry>,
-        attachment_store: Arc<UserAttachmentStore>,
         tool_registry: Arc<ToolRegistry>,
+        attachment_store: Arc<UserAttachmentStore>,
         user_store: Model<UserStore>,
         project_index: Option<Model<ProjectIndex>>,
         workspace: WeakView<Workspace>,
@@ -429,14 +435,6 @@ impl AssistantChat {
         }));
     }
 
-    fn debug_project_index(&mut self, _: &DebugProjectIndex, cx: &mut ViewContext<Self>) {
-        if let Some(index) = &self.project_index {
-            index.update(cx, |project_index, cx| {
-                project_index.debug(cx).detach_and_log_err(cx)
-            });
-        }
-    }
-
     async fn request_completion(
         this: WeakView<Self>,
         mode: SubmitMode,
@@ -846,7 +844,6 @@ impl Render for AssistantChat {
             .key_context("AssistantChat")
             .on_action(cx.listener(Self::submit))
             .on_action(cx.listener(Self::cancel))
-            .on_action(cx.listener(Self::debug_project_index))
             .text_color(Color::Default.color(cx))
             .child(list(self.list_state.clone()).flex_1())
             .child(Composer::new(

crates/editor/src/editor.rs 🔗

@@ -2768,7 +2768,7 @@ impl Editor {
                         indent.len = cmp::min(indent.len, start_point.column);
                         let start = selection.start;
                         let end = selection.end;
-                        let is_cursor = start == end;
+                        let selection_is_empty = start == end;
                         let language_scope = buffer.language_scope_at(start);
                         let (comment_delimiter, insert_extra_newline) = if let Some(language) =
                             &language_scope
@@ -2802,13 +2802,18 @@ impl Editor {
                                             pair_start,
                                         )
                                 });
+
                             // Comment extension on newline is allowed only for cursor selections
-                            let comment_delimiter = language.line_comment_prefixes().filter(|_| {
-                                let is_comment_extension_enabled =
-                                    multi_buffer.settings_at(0, cx).extend_comment_on_newline;
-                                is_cursor && is_comment_extension_enabled
-                            });
-                            let get_comment_delimiter = |delimiters: &[Arc<str>]| {
+                            let comment_delimiter = maybe!({
+                                if !selection_is_empty {
+                                    return None;
+                                }
+
+                                if !multi_buffer.settings_at(0, cx).extend_comment_on_newline {
+                                    return None;
+                                }
+
+                                let delimiters = language.line_comment_prefixes();
                                 let max_len_of_delimiter =
                                     delimiters.iter().map(|delimiter| delimiter.len()).max()?;
                                 let (snapshot, range) =
@@ -2837,12 +2842,7 @@ impl Editor {
                                 } else {
                                     None
                                 }
-                            };
-                            let comment_delimiter = if let Some(delimiters) = comment_delimiter {
-                                get_comment_delimiter(delimiters)
-                            } else {
-                                None
-                            };
+                            });
                             (comment_delimiter, insert_extra_newline)
                         } else {
                             (None, false)
@@ -7181,10 +7181,8 @@ impl Editor {
                 }
 
                 // If the language has line comments, toggle those.
-                if let Some(full_comment_prefixes) = language
-                    .line_comment_prefixes()
-                    .filter(|prefixes| !prefixes.is_empty())
-                {
+                let full_comment_prefixes = language.line_comment_prefixes();
+                if !full_comment_prefixes.is_empty() {
                     let first_prefix = full_comment_prefixes
                         .first()
                         .expect("prefixes is non-empty");

crates/language/src/buffer_tests.rs 🔗

@@ -1818,7 +1818,7 @@ fn test_language_scope_at_with_javascript(cx: &mut AppContext) {
         let snapshot = buffer.snapshot();
 
         let config = snapshot.language_scope_at(0).unwrap();
-        assert_eq!(config.line_comment_prefixes().unwrap(), &[Arc::from("// ")]);
+        assert_eq!(config.line_comment_prefixes(), &[Arc::from("// ")]);
         // Both bracket pairs are enabled
         assert_eq!(
             config.brackets().map(|e| e.1).collect::<Vec<_>>(),
@@ -1828,10 +1828,7 @@ fn test_language_scope_at_with_javascript(cx: &mut AppContext) {
         let string_config = snapshot
             .language_scope_at(text.find("b\"").unwrap())
             .unwrap();
-        assert_eq!(
-            string_config.line_comment_prefixes().unwrap(),
-            &[Arc::from("// ")]
-        );
+        assert_eq!(string_config.line_comment_prefixes(), &[Arc::from("// ")]);
         // Second bracket pair is disabled
         assert_eq!(
             string_config.brackets().map(|e| e.1).collect::<Vec<_>>(),
@@ -1842,7 +1839,7 @@ fn test_language_scope_at_with_javascript(cx: &mut AppContext) {
         let element_config = snapshot
             .language_scope_at(text.find("<F>").unwrap())
             .unwrap();
-        assert_eq!(element_config.line_comment_prefixes(), None);
+        assert_eq!(element_config.line_comment_prefixes(), &[]);
         assert_eq!(
             element_config.block_comment_delimiters(),
             Some((&"{/*".into(), &"*/}".into()))
@@ -1856,10 +1853,7 @@ fn test_language_scope_at_with_javascript(cx: &mut AppContext) {
         let tag_config = snapshot
             .language_scope_at(text.find(" d=").unwrap() + 1)
             .unwrap();
-        assert_eq!(
-            tag_config.line_comment_prefixes().unwrap(),
-            &[Arc::from("// ")]
-        );
+        assert_eq!(tag_config.line_comment_prefixes(), &[Arc::from("// ")]);
         assert_eq!(
             tag_config.brackets().map(|e| e.1).collect::<Vec<_>>(),
             &[true, true]
@@ -1870,9 +1864,7 @@ fn test_language_scope_at_with_javascript(cx: &mut AppContext) {
             .language_scope_at(text.find('{').unwrap() + 1)
             .unwrap();
         assert_eq!(
-            expression_in_element_config
-                .line_comment_prefixes()
-                .unwrap(),
+            expression_in_element_config.line_comment_prefixes(),
             &[Arc::from("// ")]
         );
         assert_eq!(
@@ -1988,17 +1980,14 @@ fn test_language_scope_at_with_combined_injections(cx: &mut AppContext) {
 
         let snapshot = buffer.snapshot();
         let html_config = snapshot.language_scope_at(Point::new(2, 4)).unwrap();
-        assert_eq!(html_config.line_comment_prefixes(), Some(&vec![]));
+        assert_eq!(html_config.line_comment_prefixes(), &[]);
         assert_eq!(
             html_config.block_comment_delimiters(),
             Some((&"<!--".into(), &"-->".into()))
         );
 
         let ruby_config = snapshot.language_scope_at(Point::new(3, 12)).unwrap();
-        assert_eq!(
-            ruby_config.line_comment_prefixes().unwrap(),
-            &[Arc::from("# ")]
-        );
+        assert_eq!(ruby_config.line_comment_prefixes(), &[Arc::from("# ")]);
         assert_eq!(ruby_config.block_comment_delimiters(), None);
 
         buffer

crates/language/src/language.rs 🔗

@@ -20,6 +20,7 @@ mod task_context;
 mod buffer_tests;
 pub mod markdown;
 
+use crate::language_settings::SoftWrap;
 use anyhow::{anyhow, Context, Result};
 use async_trait::async_trait;
 use collections::{HashMap, HashSet};
@@ -41,12 +42,11 @@ use smol::future::FutureExt as _;
 use std::num::NonZeroU32;
 use std::{
     any::Any,
-    cell::RefCell,
     ffi::OsStr,
     fmt::Debug,
     hash::Hash,
     mem,
-    ops::Range,
+    ops::{DerefMut, Range},
     path::{Path, PathBuf},
     pin::Pin,
     str,
@@ -74,8 +74,6 @@ pub use syntax_map::{OwnedSyntaxLayer, SyntaxLayer};
 pub use text::LineEnding;
 pub use tree_sitter::{Node, Parser, Tree, TreeCursor};
 
-use crate::language_settings::SoftWrap;
-
 /// Initializes the `language` crate.
 ///
 /// This should be called before making use of items from the create.
@@ -83,29 +81,30 @@ pub fn init(cx: &mut AppContext) {
     language_settings::init(cx);
 }
 
-thread_local! {
-    static PARSER: RefCell<Parser> = {
-        let mut parser = Parser::new();
-        parser.set_wasm_store(WasmStore::new(WASM_ENGINE.clone()).unwrap()).unwrap();
-        RefCell::new(parser)
-    };
-}
+static QUERY_CURSORS: Mutex<Vec<QueryCursor>> = Mutex::new(vec![]);
+static PARSERS: Mutex<Vec<Parser>> = Mutex::new(vec![]);
 
 pub fn with_parser<F, R>(func: F) -> R
 where
     F: FnOnce(&mut Parser) -> R,
 {
-    PARSER.with(|parser| {
-        let mut parser = parser.borrow_mut();
-        func(&mut parser)
-    })
+    let mut parser = PARSERS.lock().pop().unwrap_or_else(|| {
+        let mut parser = Parser::new();
+        parser
+            .set_wasm_store(WasmStore::new(WASM_ENGINE.clone()).unwrap())
+            .unwrap();
+        parser
+    });
+    parser.set_included_ranges(&[]).unwrap();
+    let result = func(&mut parser);
+    PARSERS.lock().push(parser);
+    result
 }
 
 pub fn with_query_cursor<F, R>(func: F) -> R
 where
     F: FnOnce(&mut QueryCursor) -> R,
 {
-    use std::ops::DerefMut;
     let mut cursor = QueryCursorHandle::new();
     func(cursor.deref_mut())
 }
@@ -1340,11 +1339,12 @@ impl LanguageScope {
 
     /// Returns line prefix that is inserted in e.g. line continuations or
     /// in `toggle comments` action.
-    pub fn line_comment_prefixes(&self) -> Option<&Vec<Arc<str>>> {
+    pub fn line_comment_prefixes(&self) -> &[Arc<str>] {
         Override::as_option(
             self.config_override().map(|o| &o.line_comments),
             Some(&self.language.config.line_comments),
         )
+        .map_or(&[] as &[_], |e| e.as_slice())
     }
 
     pub fn block_comment_delimiters(&self) -> Option<(&Arc<str>, &Arc<str>)> {
@@ -1445,8 +1445,7 @@ impl Grammar {
     }
 
     fn parse_text(&self, text: &Rope, old_tree: Option<Tree>) -> Tree {
-        PARSER.with(|parser| {
-            let mut parser = parser.borrow_mut();
+        with_parser(|parser| {
             parser
                 .set_language(&self.ts_language)
                 .expect("incompatible grammar");

crates/language/src/language_registry.rs 🔗

@@ -3,8 +3,8 @@ use crate::{
         all_language_settings, AllLanguageSettingsContent, LanguageSettingsContent,
     },
     task_context::ContextProvider,
-    CachedLspAdapter, File, Language, LanguageConfig, LanguageId, LanguageMatcher,
-    LanguageServerName, LspAdapter, LspAdapterDelegate, PARSER, PLAIN_TEXT,
+    with_parser, CachedLspAdapter, File, Language, LanguageConfig, LanguageId, LanguageMatcher,
+    LanguageServerName, LspAdapter, LspAdapterDelegate, PLAIN_TEXT,
 };
 use anyhow::{anyhow, Context as _, Result};
 use collections::{hash_map, HashMap};
@@ -668,8 +668,7 @@ impl LanguageRegistry {
                                     .file_stem()
                                     .and_then(OsStr::to_str)
                                     .ok_or_else(|| anyhow!("invalid grammar filename"))?;
-                                anyhow::Ok(PARSER.with(|parser| {
-                                    let mut parser = parser.borrow_mut();
+                                anyhow::Ok(with_parser(|parser| {
                                     let mut store = parser.take_wasm_store().unwrap();
                                     let grammar = store.load_language(&grammar_name, &wasm_bytes);
                                     parser.set_wasm_store(store).unwrap();

crates/language/src/syntax_map.rs 🔗

@@ -1,10 +1,11 @@
 #[cfg(test)]
 mod syntax_map_tests;
 
-use crate::{Grammar, InjectionConfig, Language, LanguageId, LanguageRegistry};
+use crate::{
+    with_parser, Grammar, InjectionConfig, Language, LanguageId, LanguageRegistry, QUERY_CURSORS,
+};
 use collections::HashMap;
 use futures::FutureExt;
-use parking_lot::Mutex;
 use std::{
     borrow::Cow,
     cmp::{self, Ordering, Reverse},
@@ -17,10 +18,6 @@ use sum_tree::{Bias, SeekTarget, SumTree};
 use text::{Anchor, BufferSnapshot, OffsetRangeExt, Point, Rope, ToOffset, ToPoint};
 use tree_sitter::{Node, Query, QueryCapture, QueryCaptures, QueryCursor, QueryMatches, Tree};
 
-use super::PARSER;
-
-static QUERY_CURSORS: Mutex<Vec<QueryCursor>> = Mutex::new(vec![]);
-
 #[derive(Default)]
 pub struct SyntaxMap {
     snapshot: SyntaxSnapshot,
@@ -1177,8 +1174,7 @@ fn parse_text(
     ranges: Vec<tree_sitter::Range>,
     old_tree: Option<Tree>,
 ) -> anyhow::Result<Tree> {
-    PARSER.with(|parser| {
-        let mut parser = parser.borrow_mut();
+    with_parser(|parser| {
         let mut chunks = text.chunks_in_range(start_byte..text.len());
         parser.set_included_ranges(&ranges)?;
         parser.set_language(&grammar.ts_language)?;

crates/semantic_index/Cargo.toml 🔗

@@ -37,9 +37,12 @@ serde.workspace = true
 serde_json.workspace = true
 sha2.workspace = true
 smol.workspace = true
+theme.workspace = true
 tree-sitter.workspace = true
+ui.workspace = true
 util.workspace = true
 unindent.workspace = true
+workspace.workspace = true
 worktree.workspace = true
 
 [dev-dependencies]
@@ -54,3 +57,4 @@ project = { workspace = true, features = ["test-support"] }
 tempfile.workspace = true
 util = { workspace = true, features = ["test-support"] }
 worktree = { workspace = true, features = ["test-support"] }
+workspace = { workspace = true, features = ["test-support"] }

crates/semantic_index/src/chunking.rs 🔗

@@ -1,9 +1,10 @@
-use language::{with_parser, with_query_cursor, Grammar};
+use language::{with_parser, with_query_cursor, Language};
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use std::{
     cmp::{self, Reverse},
     ops::Range,
+    path::Path,
     sync::Arc,
 };
 use tree_sitter::QueryCapture;
@@ -26,52 +27,95 @@ pub struct Chunk {
     pub digest: [u8; 32],
 }
 
-pub fn chunk_text(text: &str, grammar: Option<&Arc<Grammar>>) -> Vec<Chunk> {
-    chunk_text_with_size_range(text, grammar, CHUNK_SIZE_RANGE)
+pub fn chunk_text(text: &str, language: Option<&Arc<Language>>, path: &Path) -> Vec<Chunk> {
+    chunk_text_with_size_range(text, language, path, CHUNK_SIZE_RANGE)
 }
 
 fn chunk_text_with_size_range(
     text: &str,
-    grammar: Option<&Arc<Grammar>>,
+    language: Option<&Arc<Language>>,
+    path: &Path,
     size_config: ChunkSizeRange,
 ) -> Vec<Chunk> {
-    let mut syntactic_ranges = Vec::new();
-
-    if let Some(grammar) = grammar {
-        if let Some(outline) = grammar.outline_config.as_ref() {
-            let tree = with_parser(|parser| {
-                parser.set_language(&grammar.ts_language).log_err()?;
-                parser.parse(&text, None)
-            });
+    let ranges = syntactic_ranges(text, language, path).unwrap_or_default();
+    chunk_text_with_syntactic_ranges(text, &ranges, size_config)
+}
 
-            if let Some(tree) = tree {
-                with_query_cursor(|cursor| {
-                    // Retrieve a list of ranges of outline items (types, functions, etc) in the document.
-                    // Omit single-line outline items (e.g. struct fields, constant declarations), because
-                    // we'll already be attempting to split on lines.
-                    syntactic_ranges = cursor
-                        .matches(&outline.query, tree.root_node(), text.as_bytes())
-                        .filter_map(|mat| {
-                            mat.captures
-                                .iter()
-                                .find_map(|QueryCapture { node, index }| {
-                                    if *index == outline.item_capture_ix {
-                                        if node.end_position().row > node.start_position().row {
-                                            return Some(node.byte_range());
-                                        }
-                                    }
-                                    None
-                                })
-                        })
-                        .collect::<Vec<_>>();
-                    syntactic_ranges
-                        .sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
-                });
-            }
-        }
+fn syntactic_ranges(
+    text: &str,
+    language: Option<&Arc<Language>>,
+    path: &Path,
+) -> Option<Vec<Range<usize>>> {
+    let language = language?;
+    let grammar = language.grammar()?;
+    let outline = grammar.outline_config.as_ref()?;
+    let tree = with_parser(|parser| {
+        parser.set_language(&grammar.ts_language).log_err()?;
+        parser.parse(&text, None)
+    });
+
+    let Some(tree) = tree else {
+        log::error!("failed to parse file {path:?} for chunking");
+        return None;
+    };
+
+    struct RowInfo {
+        offset: usize,
+        is_comment: bool,
     }
 
-    chunk_text_with_syntactic_ranges(text, &syntactic_ranges, size_config)
+    let scope = language.default_scope();
+    let line_comment_prefixes = scope.line_comment_prefixes();
+    let row_infos = text
+        .split('\n')
+        .map({
+            let mut offset = 0;
+            move |line| {
+                let line = line.trim_start();
+                let is_comment = line_comment_prefixes
+                    .iter()
+                    .any(|prefix| line.starts_with(prefix.as_ref()));
+                let result = RowInfo { offset, is_comment };
+                offset += line.len() + 1;
+                result
+            }
+        })
+        .collect::<Vec<_>>();
+
+    // Retrieve a list of ranges of outline items (types, functions, etc) in the document.
+    // Omit single-line outline items (e.g. struct fields, constant declarations), because
+    // we'll already be attempting to split on lines.
+    let mut ranges = with_query_cursor(|cursor| {
+        cursor
+            .matches(&outline.query, tree.root_node(), text.as_bytes())
+            .filter_map(|mat| {
+                mat.captures
+                    .iter()
+                    .find_map(|QueryCapture { node, index }| {
+                        if *index == outline.item_capture_ix {
+                            let mut start_offset = node.start_byte();
+                            let mut start_row = node.start_position().row;
+                            let end_offset = node.end_byte();
+                            let end_row = node.end_position().row;
+
+                            // Expand the range to include any preceding comments.
+                            while start_row > 0 && row_infos[start_row - 1].is_comment {
+                                start_offset = row_infos[start_row - 1].offset;
+                                start_row -= 1;
+                            }
+
+                            if end_row > start_row {
+                                return Some(start_offset..end_offset);
+                            }
+                        }
+                        None
+                    })
+            })
+            .collect::<Vec<_>>()
+    });
+
+    ranges.sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
+    Some(ranges)
 }
 
 fn chunk_text_with_syntactic_ranges(
@@ -148,7 +192,7 @@ fn chunk_text_with_syntactic_ranges(
     if !range.is_empty() {
         chunks.push(Chunk {
             range: range.clone(),
-            digest: Sha256::digest(&text[range.clone()]).into(),
+            digest: Sha256::digest(&text[range]).into(),
         });
     }
 
@@ -177,6 +221,8 @@ mod tests {
                     Self { first_name, last_name, age }
                 }
 
+                /// Returns the first name
+                /// something something something
                 fn first_name(&self) -> &str {
                     &self.first_name
                 }
@@ -185,8 +231,8 @@ mod tests {
                     &self.last_name
                 }
 
-                fn age(&self) -> usize {
-                    self.ages
+                fn age(&self) -> u32 {
+                    self.age
                 }
             }
         "
@@ -194,7 +240,8 @@ mod tests {
 
         let chunks = chunk_text_with_size_range(
             &text,
-            language.grammar(),
+            Some(&language),
+            Path::new("lib.rs"),
             ChunkSizeRange {
                 min: text.find('}').unwrap(),
                 max: text.find("Self {").unwrap(),
@@ -209,8 +256,8 @@ mod tests {
             &[
                 "struct Person {", // ...
                 "impl Person {",
-                "    fn first_name",
-                "    fn age",
+                "    /// Returns the first name",
+                "    fn last_name",
             ],
         );
 
@@ -227,7 +274,8 @@ mod tests {
 
         let chunks = chunk_text_with_size_range(
             &text,
-            language.grammar(),
+            Some(&language),
+            Path::new("lib.rs"),
             ChunkSizeRange {
                 min: text.find('{').unwrap(),
                 max: text.find('V').unwrap(),
@@ -263,7 +311,8 @@ mod tests {
 
         let chunks = chunk_text_with_size_range(
             &text,
-            language.grammar(),
+            Some(&language),
+            Path::new("lib.rs"),
             ChunkSizeRange { min: 32, max: 64 },
         );
 
@@ -331,33 +380,35 @@ mod tests {
     #[test]
     fn test_chunk_text() {
         let text = "a\n".repeat(1000);
-        let chunks = chunk_text(&text, None);
+        let chunks = chunk_text(&text, None, Path::new("lib.rs"));
         assert_eq!(
             chunks.len(),
             ((2000_f64) / (CHUNK_SIZE_RANGE.max as f64)).ceil() as usize
         );
     }
 
-    fn rust_language() -> Language {
-        Language::new(
-            LanguageConfig {
-                name: "Rust".into(),
-                matcher: LanguageMatcher {
-                    path_suffixes: vec!["rs".to_string()],
+    fn rust_language() -> Arc<Language> {
+        Arc::new(
+            Language::new(
+                LanguageConfig {
+                    name: "Rust".into(),
+                    matcher: LanguageMatcher {
+                        path_suffixes: vec!["rs".to_string()],
+                        ..Default::default()
+                    },
                     ..Default::default()
                 },
-                ..Default::default()
-            },
-            Some(tree_sitter_rust::language()),
-        )
-        .with_outline_query(
-            "
+                Some(tree_sitter_rust::language()),
+            )
+            .with_outline_query(
+                "
             (function_item name: (_) @name) @item
             (impl_item type: (_) @name) @item
             (struct_item name: (_) @name) @item
             (field_declaration name: (_) @name) @item
         ",
+            )
+            .unwrap(),
         )
-        .unwrap()
     }
 }

crates/semantic_index/src/project_index_debug_view.rs 🔗

@@ -0,0 +1,300 @@
+use crate::ProjectIndex;
+use gpui::{
+    canvas, div, list, uniform_list, AnyElement, AppContext, CursorStyle, EventEmitter,
+    FocusHandle, FocusableView, IntoElement, ListOffset, ListState, Model, MouseMoveEvent, Render,
+    UniformListScrollHandle, View,
+};
+use project::WorktreeId;
+use settings::Settings;
+use std::{path::Path, sync::Arc};
+use theme::ThemeSettings;
+use ui::prelude::*;
+use workspace::item::{Item, TabContentParams};
+
/// Debug pane that lists every indexed path in the project, and — when a path
/// is clicked — shows how that file was split into embedding chunks.
pub struct ProjectIndexDebugView {
    // The project index being inspected.
    index: Model<ProjectIndex>,
    // Flattened listing: a worktree header row followed by its entry rows.
    rows: Vec<Row>,
    // State for the per-file chunk view; `None` while showing the path list.
    selected_path: Option<PathState>,
    // Index of the row currently under the mouse (set in `render`'s
    // mouse-move handler; not otherwise read in the visible code).
    hovered_row_ix: Option<usize>,
    focus_handle: FocusHandle,
    list_scroll_handle: UniformListScrollHandle,
    // Keeps the subscription to the index alive so `rows` refreshes on change.
    _subscription: gpui::Subscription,
}

/// View state for one selected file: its path, the chunk texts sliced from
/// the file's current on-disk contents, and the list scroll state.
struct PathState {
    path: Arc<Path>,
    chunks: Vec<SharedString>,
    list_state: ListState,
}

/// One row of the path listing.
enum Row {
    /// A worktree header, identified by its absolute root path.
    Worktree(Arc<Path>),
    /// A file entry within the worktree identified by `WorktreeId`.
    Entry(WorktreeId, Arc<Path>),
}
+
+impl ProjectIndexDebugView {
+    pub fn new(index: Model<ProjectIndex>, cx: &mut ViewContext<Self>) -> Self {
+        let mut this = Self {
+            rows: Vec::new(),
+            list_scroll_handle: UniformListScrollHandle::new(),
+            selected_path: None,
+            hovered_row_ix: None,
+            focus_handle: cx.focus_handle(),
+            _subscription: cx.subscribe(&index, |this, _, _, cx| this.update_rows(cx)),
+            index,
+        };
+        this.update_rows(cx);
+        this
+    }
+
+    fn update_rows(&mut self, cx: &mut ViewContext<Self>) {
+        let worktree_indices = self.index.read(cx).worktree_indices(cx);
+        cx.spawn(|this, mut cx| async move {
+            let mut rows = Vec::new();
+
+            for index in worktree_indices {
+                let (root_path, worktree_id, worktree_paths) =
+                    index.read_with(&cx, |index, cx| {
+                        let worktree = index.worktree.read(cx);
+                        (worktree.abs_path(), worktree.id(), index.paths(cx))
+                    })?;
+                rows.push(Row::Worktree(root_path));
+                rows.extend(
+                    worktree_paths
+                        .await?
+                        .into_iter()
+                        .map(|path| Row::Entry(worktree_id, path)),
+                );
+            }
+
+            this.update(&mut cx, |this, cx| {
+                this.rows = rows;
+                cx.notify();
+            })
+        })
+        .detach();
+    }
+
+    fn handle_path_click(
+        &mut self,
+        worktree_id: WorktreeId,
+        file_path: Arc<Path>,
+        cx: &mut ViewContext<Self>,
+    ) -> Option<()> {
+        let project_index = self.index.read(cx);
+        let fs = project_index.fs.clone();
+        let worktree_index = project_index.worktree_index(worktree_id, cx)?.read(cx);
+        let root_path = worktree_index.worktree.read(cx).abs_path();
+        let chunks = worktree_index.chunks_for_path(file_path.clone(), cx);
+
+        cx.spawn(|this, mut cx| async move {
+            let chunks = chunks.await?;
+            let content = fs.load(&root_path.join(&file_path)).await?;
+            let chunks = chunks
+                .into_iter()
+                .map(|chunk| {
+                    let mut start = chunk.chunk.range.start.min(content.len());
+                    let mut end = chunk.chunk.range.end.min(content.len());
+                    while !content.is_char_boundary(start) {
+                        start += 1;
+                    }
+                    while !content.is_char_boundary(end) {
+                        end -= 1;
+                    }
+                    content[start..end].to_string().into()
+                })
+                .collect::<Vec<_>>();
+
+            this.update(&mut cx, |this, cx| {
+                let view = cx.view().downgrade();
+                this.selected_path = Some(PathState {
+                    path: file_path,
+                    list_state: ListState::new(
+                        chunks.len(),
+                        gpui::ListAlignment::Top,
+                        px(100.),
+                        move |ix, cx| {
+                            if let Some(view) = view.upgrade() {
+                                view.update(cx, |view, cx| view.render_chunk(ix, cx))
+                            } else {
+                                div().into_any()
+                            }
+                        },
+                    ),
+                    chunks,
+                });
+                cx.notify();
+            })
+        })
+        .detach();
+        None
+    }
+
+    fn render_chunk(&mut self, ix: usize, cx: &mut ViewContext<Self>) -> AnyElement {
+        let buffer_font = ThemeSettings::get_global(cx).buffer_font.family.clone();
+        let Some(state) = &self.selected_path else {
+            return div().into_any();
+        };
+
+        let colors = cx.theme().colors();
+        let chunk = &state.chunks[ix];
+
+        div()
+            .text_ui(cx)
+            .w_full()
+            .font_family(buffer_font)
+            .child(
+                h_flex()
+                    .justify_between()
+                    .child(format!(
+                        "chunk {} of {}. length: {}",
+                        ix + 1,
+                        state.chunks.len(),
+                        chunk.len(),
+                    ))
+                    .child(
+                        h_flex()
+                            .child(
+                                Button::new(("prev", ix), "prev")
+                                    .disabled(ix == 0)
+                                    .on_click(cx.listener(move |this, _, _| {
+                                        this.scroll_to_chunk(ix.saturating_sub(1))
+                                    })),
+                            )
+                            .child(
+                                Button::new(("next", ix), "next")
+                                    .disabled(ix + 1 == state.chunks.len())
+                                    .on_click(
+                                        cx.listener(move |this, _, _| this.scroll_to_chunk(ix + 1)),
+                                    ),
+                            ),
+                    ),
+            )
+            .child(
+                div()
+                    .bg(colors.editor_background)
+                    .text_xs()
+                    .child(chunk.clone()),
+            )
+            .into_any_element()
+    }
+
+    fn scroll_to_chunk(&mut self, ix: usize) {
+        if let Some(state) = self.selected_path.as_mut() {
+            state.list_state.scroll_to(ListOffset {
+                item_ix: ix,
+                offset_in_item: px(0.),
+            })
+        }
+    }
+}
+
impl Render for ProjectIndexDebugView {
    fn render(&mut self, cx: &mut gpui::ViewContext<'_, Self>) -> impl IntoElement {
        // Two modes: the per-file chunk view when a path is selected, or the
        // flat worktree/path listing otherwise.
        if let Some(selected_path) = self.selected_path.as_ref() {
            v_flex()
                .child(
                    // Header: the selected file's path plus an "x" affordance;
                    // clicking anywhere on the header clears the selection and
                    // returns to the path listing.
                    div()
                        .id("selected-path-name")
                        .child(
                            h_flex()
                                .justify_between()
                                .child(selected_path.path.to_string_lossy().to_string())
                                .child("x"),
                        )
                        .border_b_1()
                        .border_color(cx.theme().colors().border)
                        .cursor(CursorStyle::PointingHand)
                        .on_click(cx.listener(|this, _, cx| {
                            this.selected_path.take();
                            cx.notify();
                        })),
                )
                .child(list(selected_path.list_state.clone()).size_full())
                .size_full()
                .into_any_element()
        } else {
            // Uniform list over `self.rows`: worktree headers at the left
            // margin, file entries indented and clickable.
            let mut list = uniform_list(
                cx.view().clone(),
                "ProjectIndexDebugView",
                self.rows.len(),
                move |this, range, cx| {
                    this.rows[range]
                        .iter()
                        .enumerate()
                        .map(|(ix, row)| match row {
                            Row::Worktree(root_path) => div()
                                .id(ix)
                                .child(Label::new(root_path.to_string_lossy().to_string())),
                            Row::Entry(worktree_id, file_path) => div()
                                .id(ix)
                                .pl_8()
                                .child(Label::new(file_path.to_string_lossy().to_string()))
                                // Track the hovered row (only notify when it
                                // actually changes, to avoid redraw churn).
                                .on_mouse_move(cx.listener(move |this, _: &MouseMoveEvent, cx| {
                                    if this.hovered_row_ix != Some(ix) {
                                        this.hovered_row_ix = Some(ix);
                                        cx.notify();
                                    }
                                }))
                                .cursor(CursorStyle::PointingHand)
                                .on_click(cx.listener({
                                    let worktree_id = *worktree_id;
                                    let file_path = file_path.clone();
                                    move |this, _, cx| {
                                        this.handle_path_click(worktree_id, file_path.clone(), cx);
                                    }
                                })),
                        })
                        .collect()
                },
            )
            .track_scroll(self.list_scroll_handle.clone())
            .size_full()
            .text_bg(cx.theme().colors().background)
            .into_any_element();

            // Wrap the list in a canvas so it is prepainted as a root element
            // with the canvas bounds, then painted in the paint phase.
            canvas(
                move |bounds, cx| {
                    list.prepaint_as_root(bounds.origin, bounds.size.into(), cx);
                    list
                },
                |_, mut list, cx| list.paint(cx),
            )
            .size_full()
            .into_any_element()
        }
    }
}
+
// The view emits no events of its own; `()` satisfies the `Item::Event` bound.
impl EventEmitter<()> for ProjectIndexDebugView {}
+
+impl Item for ProjectIndexDebugView {
+    type Event = ();
+
+    fn tab_content(&self, params: TabContentParams, _: &WindowContext<'_>) -> AnyElement {
+        Label::new("Project Index (Debug)")
+            .color(if params.selected {
+                Color::Default
+            } else {
+                Color::Muted
+            })
+            .into_any_element()
+    }
+
+    fn clone_on_split(
+        &self,
+        _: workspace::WorkspaceId,
+        cx: &mut ViewContext<Self>,
+    ) -> Option<View<Self>>
+    where
+        Self: Sized,
+    {
+        Some(cx.new_view(|cx| Self::new(self.index.clone(), cx)))
+    }
+}
+
impl FocusableView for ProjectIndexDebugView {
    // The view holds a single focus handle created in `new`.
    fn focus_handle(&self, _: &AppContext) -> gpui::FocusHandle {
        self.focus_handle.clone()
    }
}

crates/semantic_index/src/semantic_index.rs 🔗

@@ -1,5 +1,6 @@
 mod chunking;
 mod embedding;
+mod project_index_debug_view;
 
 use anyhow::{anyhow, Context as _, Result};
 use chunking::{chunk_text, Chunk};
@@ -31,6 +32,8 @@ use std::{
 use util::ResultExt;
 use worktree::LocalSnapshot;
 
+pub use project_index_debug_view::ProjectIndexDebugView;
+
 pub struct SemanticIndex {
     embedding_provider: Arc<dyn EmbeddingProvider>,
     db_connection: heed::Env,
@@ -397,26 +400,35 @@ impl ProjectIndex {
         Ok(result)
     }
 
-    pub fn debug(&self, cx: &mut ModelContext<Self>) -> Task<Result<()>> {
-        let indices = self
+    pub(crate) fn worktree_index(
+        &self,
+        worktree_id: WorktreeId,
+        cx: &AppContext,
+    ) -> Option<Model<WorktreeIndex>> {
+        for index in self.worktree_indices.values() {
+            if let WorktreeIndexHandle::Loaded { index, .. } = index {
+                if index.read(cx).worktree.read(cx).id() == worktree_id {
+                    return Some(index.clone());
+                }
+            }
+        }
+        None
+    }
+
+    pub(crate) fn worktree_indices(&self, cx: &AppContext) -> Vec<Model<WorktreeIndex>> {
+        let mut result = self
             .worktree_indices
             .values()
-            .filter_map(|worktree_index| {
-                if let WorktreeIndexHandle::Loaded { index, .. } = worktree_index {
+            .filter_map(|index| {
+                if let WorktreeIndexHandle::Loaded { index, .. } = index {
                     Some(index.clone())
                 } else {
                     None
                 }
             })
             .collect::<Vec<_>>();
-
-        cx.spawn(|_, mut cx| async move {
-            eprintln!("semantic index contents:");
-            for index in indices {
-                index.update(&mut cx, |index, cx| index.debug(cx))?.await?
-            }
-            Ok(())
-        })
+        result.sort_by_key(|index| index.read(cx).worktree.read(cx).id());
+        result
     }
 }
 
@@ -726,10 +738,8 @@ impl WorktreeIndex {
                                     .language_for_file_path(&entry.path)
                                     .await
                                     .ok();
-                                let grammar =
-                                    language.as_ref().and_then(|language| language.grammar());
                                 let chunked_file = ChunkedFile {
-                                    chunks: chunk_text(&text, grammar),
+                                    chunks: chunk_text(&text, language.as_ref(), &entry.path),
                                     handle,
                                     path: entry.path,
                                     mtime: entry.mtime,
@@ -861,7 +871,6 @@ impl WorktreeIndex {
                     db.put(&mut txn, &key, file)?;
                 }
                 txn.commit()?;
-                eprintln!("committed {:?}", embedded_files.len());
 
                 drop(embedded_files);
                 log::debug!("committed");
@@ -871,18 +880,38 @@ impl WorktreeIndex {
         })
     }
 
-    fn debug(&mut self, cx: &mut ModelContext<Self>) -> Task<Result<()>> {
+    fn paths(&self, cx: &AppContext) -> Task<Result<Vec<Arc<Path>>>> {
         let connection = self.db_connection.clone();
         let db = self.db;
         cx.background_executor().spawn(async move {
             let tx = connection
                 .read_txn()
                 .context("failed to create read transaction")?;
-            for record in db.iter(&tx)? {
-                let (key, _) = record?;
-                eprintln!("{}", path_for_db_key(key));
-            }
-            Ok(())
+            let result = db
+                .iter(&tx)?
+                .map(|entry| Ok(entry?.1.path.clone()))
+                .collect::<Result<Vec<Arc<Path>>>>();
+            drop(tx);
+            result
+        })
+    }
+
    /// Loads the embedded chunks stored in the database for `path`, on the
    /// background executor.
    ///
    /// The returned task fails if the read transaction cannot be created or
    /// if the database has no entry for the path ("no such path").
    fn chunks_for_path(
        &self,
        path: Arc<Path>,
        cx: &AppContext,
    ) -> Task<Result<Vec<EmbeddedChunk>>> {
        let connection = self.db_connection.clone();
        let db = self.db;
        cx.background_executor().spawn(async move {
            let tx = connection
                .read_txn()
                .context("failed to create read transaction")?;
            // Clone out of the transaction-borrowed value so the data can
            // outlive `tx`.
            Ok(db
                .get(&tx, &db_key_for_path(&path))?
                .ok_or_else(|| anyhow!("no such path"))?
                .chunks
                .clone())
        })
    }
 
@@ -927,7 +956,7 @@ struct EmbeddedFile {
     chunks: Vec<EmbeddedChunk>,
 }
 
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 struct EmbeddedChunk {
     chunk: Chunk,
     embedding: Embedding,
@@ -981,10 +1010,6 @@ fn db_key_for_path(path: &Arc<Path>) -> String {
     path.to_string_lossy().replace('/', "\0")
 }
 
-fn path_for_db_key(key: &str) -> String {
-    key.replace('\0', "/")
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;