update treesitter parsing to accomodate for collapsed nested functions

KCaverly and maxbrunsfeld created

Co-authored-by: maxbrunsfeld <max@zed.dev>

Change summary

Cargo.lock                                        |    3 
Cargo.toml                                        |    2 
crates/language/src/language.rs                   |   22 
crates/semantic_index/Cargo.toml                  |    1 
crates/semantic_index/src/parsing.rs              |  257 +++
crates/semantic_index/src/semantic_index.rs       |    8 
crates/semantic_index/src/semantic_index_tests.rs | 1004 ++++++++--------
crates/zed/src/languages/rust/config.toml         |    1 
crates/zed/src/languages/rust/embedding.scm       |   64 
9 files changed, 780 insertions(+), 582 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -6486,6 +6486,7 @@ dependencies = [
  "parking_lot 0.11.2",
  "picker",
  "postage",
+ "pretty_assertions",
  "project",
  "rand 0.8.5",
  "rpc",
@@ -7991,7 +7992,7 @@ dependencies = [
 [[package]]
 name = "tree-sitter"
 version = "0.20.10"
-source = "git+https://github.com/tree-sitter/tree-sitter?rev=49226023693107fba9a1191136a4f47f38cdca73#49226023693107fba9a1191136a4f47f38cdca73"
+source = "git+https://github.com/tree-sitter/tree-sitter?rev=1c65ca24bc9a734ab70115188f465e12eecf224e#1c65ca24bc9a734ab70115188f465e12eecf224e"
 dependencies = [
  "cc",
  "regex",

Cargo.toml 🔗

@@ -130,7 +130,7 @@ tree-sitter-yaml = { git = "https://github.com/zed-industries/tree-sitter-yaml",
 tree-sitter-lua = "0.0.14"
 
 [patch.crates-io]
-tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "49226023693107fba9a1191136a4f47f38cdca73" }
+tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "1c65ca24bc9a734ab70115188f465e12eecf224e" }
 async-task = { git = "https://github.com/zed-industries/async-task", rev = "341b57d6de98cdfd7b418567b8de2022ca993a6e" }
 
 # TODO - Remove when a version is released with this PR: https://github.com/servo/core-foundation-rs/pull/457

crates/language/src/language.rs 🔗

@@ -339,6 +339,8 @@ pub struct LanguageConfig {
     #[serde(default)]
     pub line_comment: Option<Arc<str>>,
     #[serde(default)]
+    pub collapsed_placeholder: String,
+    #[serde(default)]
     pub block_comment: Option<(Arc<str>, Arc<str>)>,
     #[serde(default)]
     pub overrides: HashMap<String, LanguageConfigOverride>,
@@ -408,6 +410,7 @@ impl Default for LanguageConfig {
             line_comment: Default::default(),
             block_comment: Default::default(),
             overrides: Default::default(),
+            collapsed_placeholder: Default::default(),
         }
     }
 }
@@ -525,6 +528,8 @@ pub struct EmbeddingConfig {
     pub item_capture_ix: u32,
     pub name_capture_ix: u32,
     pub context_capture_ix: Option<u32>,
+    pub collapse_capture_ix: Option<u32>,
+    pub keep_capture_ix: Option<u32>,
 }
 
 struct InjectionConfig {
@@ -1246,12 +1251,16 @@ impl Language {
         let mut item_capture_ix = None;
         let mut name_capture_ix = None;
         let mut context_capture_ix = None;
+        let mut collapse_capture_ix = None;
+        let mut keep_capture_ix = None;
         get_capture_indices(
             &query,
             &mut [
                 ("item", &mut item_capture_ix),
                 ("name", &mut name_capture_ix),
                 ("context", &mut context_capture_ix),
+                ("keep", &mut keep_capture_ix),
+                ("collapse", &mut collapse_capture_ix),
             ],
         );
         if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) {
@@ -1260,6 +1269,8 @@ impl Language {
                 item_capture_ix,
                 name_capture_ix,
                 context_capture_ix,
+                collapse_capture_ix,
+                keep_capture_ix,
             });
         }
         Ok(self)
@@ -1544,9 +1555,20 @@ impl Language {
     pub fn grammar(&self) -> Option<&Arc<Grammar>> {
         self.grammar.as_ref()
     }
+
+    pub fn default_scope(self: &Arc<Self>) -> LanguageScope {
+        LanguageScope {
+            language: self.clone(),
+            override_id: None,
+        }
+    }
 }
 
 impl LanguageScope {
+    pub fn collapsed_placeholder(&self) -> &str {
+        self.language.config.collapsed_placeholder.as_ref()
+    }
+
     pub fn line_comment_prefix(&self) -> Option<&Arc<str>> {
         Override::as_option(
             self.config_override().map(|o| &o.line_comment),

crates/semantic_index/Cargo.toml 🔗

@@ -46,6 +46,7 @@ rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
 settings = { path = "../settings", features = ["test-support"]}
 
+pretty_assertions.workspace = true
 rand.workspace = true
 unindent.workspace = true
 tempdir.workspace = true

crates/semantic_index/src/parsing.rs 🔗

@@ -1,6 +1,6 @@
 use anyhow::{anyhow, Ok, Result};
-use language::Language;
-use std::{ops::Range, path::Path, sync::Arc};
+use language::{Grammar, Language};
+use std::{cmp, collections::HashSet, ops::Range, path::Path, sync::Arc};
 use tree_sitter::{Parser, QueryCursor};
 
 #[derive(Debug, PartialEq, Clone)]
@@ -22,6 +22,20 @@ pub struct CodeContextRetriever {
     pub cursor: QueryCursor,
 }
 
+// Every match has an item, this represents the fundamental treesitter symbol and anchors the search
+// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
+// If there are preceeding comments, we track this with a context capture
+// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture
+// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture
+#[derive(Debug, Clone)]
+pub struct CodeContextMatch {
+    pub start_col: usize,
+    pub item_range: Range<usize>,
+    pub name_range: Range<usize>,
+    pub context_ranges: Vec<Range<usize>>,
+    pub collapse_ranges: Vec<Range<usize>>,
+}
+
 impl CodeContextRetriever {
     pub fn new() -> Self {
         Self {
@@ -49,24 +63,15 @@ impl CodeContextRetriever {
         }])
     }
 
-    pub fn parse_file(
+    fn get_matches_in_file(
         &mut self,
-        relative_path: &Path,
         content: &str,
-        language: Arc<Language>,
-    ) -> Result<Vec<Document>> {
-        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
-            return self._parse_entire_file(relative_path, language.name(), &content);
-        }
-
-        let grammar = language
-            .grammar()
-            .ok_or_else(|| anyhow!("no grammar for language"))?;
+        grammar: &Arc<Grammar>,
+    ) -> Result<Vec<CodeContextMatch>> {
         let embedding_config = grammar
             .embedding_config
             .as_ref()
             .ok_or_else(|| anyhow!("no embedding queries"))?;
-
         self.parser.set_language(grammar.ts_language).unwrap();
 
         let tree = self
@@ -74,66 +79,204 @@ impl CodeContextRetriever {
             .parse(&content, None)
             .ok_or_else(|| anyhow!("parsing failed"))?;
 
-        let mut documents = Vec::new();
-
-        // Iterate through query matches
-        let mut name_ranges: Vec<Range<usize>> = vec![];
+        let mut captures: Vec<CodeContextMatch> = Vec::new();
+        let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
+        let mut keep_ranges: Vec<Range<usize>> = Vec::new();
         for mat in self.cursor.matches(
             &embedding_config.query,
             tree.root_node(),
             content.as_bytes(),
         ) {
-            let mut name: Vec<&str> = vec![];
-            let mut item: Option<&str> = None;
-            let mut byte_range: Option<Range<usize>> = None;
-            let mut context_spans: Vec<&str> = vec![];
+            let mut start_col = 0;
+            let mut item_range: Option<Range<usize>> = None;
+            let mut name_range: Option<Range<usize>> = None;
+            let mut context_ranges: Vec<Range<usize>> = Vec::new();
+            collapse_ranges.clear();
+            keep_ranges.clear();
             for capture in mat.captures {
                 if capture.index == embedding_config.item_capture_ix {
-                    byte_range = Some(capture.node.byte_range());
-                    item = content.get(capture.node.byte_range());
+                    item_range = Some(capture.node.byte_range());
+                    start_col = capture.node.start_position().column;
                 } else if capture.index == embedding_config.name_capture_ix {
-                    let name_range = capture.node.byte_range();
-                    if name_ranges.contains(&name_range) {
-                        continue;
-                    }
-                    name_ranges.push(name_range.clone());
-                    if let Some(name_content) = content.get(name_range.clone()) {
-                        name.push(name_content);
-                    }
+                    name_range = Some(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.context_capture_ix {
+                    context_ranges.push(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.collapse_capture_ix {
+                    collapse_ranges.push(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.keep_capture_ix {
+                    keep_ranges.push(capture.node.byte_range());
                 }
+            }
 
-                if let Some(context_capture_ix) = embedding_config.context_capture_ix {
-                    if capture.index == context_capture_ix {
-                        if let Some(context) = content.get(capture.node.byte_range()) {
-                            context_spans.push(context);
-                        }
-                    }
+            if item_range.is_some() && name_range.is_some() {
+                let item_range = item_range.unwrap();
+                captures.push(CodeContextMatch {
+                    start_col,
+                    item_range,
+                    name_range: name_range.unwrap(),
+                    context_ranges,
+                    collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
+                });
+            }
+        }
+        Ok(captures)
+    }
+
+    pub fn parse_file_with_template(
+        &mut self,
+        relative_path: &Path,
+        content: &str,
+        language: Arc<Language>,
+    ) -> Result<Vec<Document>> {
+        let language_name = language.name();
+        let mut documents = self.parse_file(relative_path, content, language)?;
+        for document in &mut documents {
+            document.content = CODE_CONTEXT_TEMPLATE
+                .replace("<path>", relative_path.to_string_lossy().as_ref())
+                .replace("<language>", language_name.as_ref())
+                .replace("item", &document.content);
+        }
+        Ok(documents)
+    }
+
+    pub fn parse_file(
+        &mut self,
+        relative_path: &Path,
+        content: &str,
+        language: Arc<Language>,
+    ) -> Result<Vec<Document>> {
+        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
+            return self._parse_entire_file(relative_path, language.name(), &content);
+        }
+
+        let grammar = language
+            .grammar()
+            .ok_or_else(|| anyhow!("no grammar for language"))?;
+
+        // Iterate through query matches
+        let matches = self.get_matches_in_file(content, grammar)?;
+
+        let language_scope = language.default_scope();
+        let placeholder = language_scope.collapsed_placeholder();
+
+        let mut documents = Vec::new();
+        let mut collapsed_ranges_within = Vec::new();
+        let mut parsed_name_ranges = HashSet::new();
+        for (i, context_match) in matches.iter().enumerate() {
+            if parsed_name_ranges.contains(&context_match.name_range) {
+                continue;
+            }
+
+            collapsed_ranges_within.clear();
+            for remaining_match in &matches[(i + 1)..] {
+                if context_match
+                    .item_range
+                    .contains(&remaining_match.item_range.start)
+                    && context_match
+                        .item_range
+                        .contains(&remaining_match.item_range.end)
+                {
+                    collapsed_ranges_within.extend(remaining_match.collapse_ranges.iter().cloned());
+                } else {
+                    break;
                 }
             }
 
-            if let Some((item, byte_range)) = item.zip(byte_range) {
-                if !name.is_empty() {
-                    let item = if context_spans.is_empty() {
-                        item.to_string()
-                    } else {
-                        format!("{}\n{}", context_spans.join("\n"), item)
-                    };
-
-                    let document_text = CODE_CONTEXT_TEMPLATE
-                        .replace("<path>", relative_path.to_str().unwrap())
-                        .replace("<language>", &language.name().to_lowercase())
-                        .replace("<item>", item.as_str());
-
-                    documents.push(Document {
-                        range: byte_range,
-                        content: document_text,
-                        embedding: Vec::new(),
-                        name: name.join(" ").to_string(),
-                    });
+            let mut document_content = String::new();
+            for context_range in &context_match.context_ranges {
+                document_content.push_str(&content[context_range.clone()]);
+                document_content.push_str("\n");
+            }
+
+            let mut offset = context_match.item_range.start;
+            for collapsed_range in &collapsed_ranges_within {
+                if collapsed_range.start > offset {
+                    add_content_from_range(
+                        &mut document_content,
+                        content,
+                        offset..collapsed_range.start,
+                        context_match.start_col,
+                    );
                 }
+                document_content.push_str(placeholder);
+                offset = collapsed_range.end;
+            }
+
+            if offset < context_match.item_range.end {
+                add_content_from_range(
+                    &mut document_content,
+                    content,
+                    offset..context_match.item_range.end,
+                    context_match.start_col,
+                );
+            }
+
+            if let Some(name) = content.get(context_match.name_range.clone()) {
+                parsed_name_ranges.insert(context_match.name_range.clone());
+                documents.push(Document {
+                    name: name.to_string(),
+                    content: document_content,
+                    range: context_match.item_range.clone(),
+                    embedding: vec![],
+                })
             }
         }
 
         return Ok(documents);
     }
 }
+
+pub(crate) fn subtract_ranges(
+    ranges: &[Range<usize>],
+    ranges_to_subtract: &[Range<usize>],
+) -> Vec<Range<usize>> {
+    let mut result = Vec::new();
+
+    let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();
+
+    for range in ranges {
+        let mut offset = range.start;
+
+        while offset < range.end {
+            if let Some(range_to_subtract) = ranges_to_subtract.peek() {
+                if offset < range_to_subtract.start {
+                    let next_offset = cmp::min(range_to_subtract.start, range.end);
+                    result.push(offset..next_offset);
+                    offset = next_offset;
+                } else {
+                    let next_offset = cmp::min(range_to_subtract.end, range.end);
+                    offset = next_offset;
+                }
+
+                if offset >= range_to_subtract.end {
+                    ranges_to_subtract.next();
+                }
+            } else {
+                result.push(offset..range.end);
+                offset = range.end;
+            }
+        }
+    }
+
+    result
+}
+
+fn add_content_from_range(
+    output: &mut String,
+    content: &str,
+    range: Range<usize>,
+    start_col: usize,
+) {
+    for mut line in content.get(range.clone()).unwrap_or("").lines() {
+        for _ in 0..start_col {
+            if line.starts_with(' ') {
+                line = &line[1..];
+            } else {
+                break;
+            }
+        }
+        output.push_str(line);
+        output.push('\n');
+    }
+    output.pop();
+}

crates/semantic_index/src/semantic_index.rs 🔗

@@ -409,7 +409,11 @@ impl SemanticIndex {
     ) {
         if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() {
             if let Some(documents) = retriever
-                .parse_file(&pending_file.relative_path, &content, pending_file.language)
+                .parse_file_with_template(
+                    &pending_file.relative_path,
+                    &content,
+                    pending_file.language,
+                )
                 .log_err()
             {
                 log::trace!(
@@ -657,6 +661,8 @@ impl SemanticIndex {
                 })
                 .await?;
 
+            dbg!(&documents);
+
             let mut tasks = Vec::new();
             let mut ranges = Vec::new();
             let weak_project = project.downgrade();

crates/semantic_index/src/semantic_index_tests.rs 🔗

@@ -1,7 +1,7 @@
 use crate::{
     db::dot,
     embedding::EmbeddingProvider,
-    parsing::{CodeContextRetriever, Document},
+    parsing::{subtract_ranges, CodeContextRetriever, Document},
     semantic_index_settings::SemanticIndexSettings,
     SemanticIndex,
 };
@@ -9,6 +9,7 @@ use anyhow::Result;
 use async_trait::async_trait;
 use gpui::{Task, TestAppContext};
 use language::{Language, LanguageConfig, LanguageRegistry, ToOffset};
+use pretty_assertions::assert_eq;
 use project::{project_settings::ProjectSettings, FakeFs, Fs, Project};
 use rand::{rngs::StdRng, Rng};
 use serde_json::json;
@@ -104,7 +105,7 @@ async fn test_semantic_index(cx: &mut TestAppContext) {
         assert_eq!(search_results[0].range.start.to_offset(buffer), 0);
         assert_eq!(
             buffer.file().unwrap().path().as_ref(),
-            Path::new("file1.rs")
+            Path::new("src/file1.rs")
         );
     });
 
@@ -147,503 +148,548 @@ async fn test_code_context_retrieval_rust() {
     let text = "
         /// A doc comment
         /// that spans multiple lines
+        #[gpui::test]
         fn a() {
             b
         }
 
         impl C for D {
         }
+
+        impl E {
+            // This is also a preceding comment
+            pub fn function_1() -> Option<()> {
+                todo!();
+            }
+
+            // This is a preceding comment
+            fn function_2() -> Result<()> {
+                todo!();
+            }
+        }
     "
     .unindent();
 
-    let parsed_files = retriever
+    let documents = retriever
         .parse_file(Path::new("foo.rs"), &text, language)
         .unwrap();
 
-    assert_eq!(
-        parsed_files,
+    assert_documents_eq(
+        &documents,
         &[
-            Document {
-                name: "a".into(),
-                range: text.find("fn a").unwrap()..(text.find("}").unwrap() + 1),
-                content: "
-                    The below code snippet is from file 'foo.rs'
-
-                    ```rust
-                    /// A doc comment
-                    /// that spans multiple lines
-                    fn a() {
-                        b
-                    }
-                    ```"
+            (
+                "
+                /// A doc comment
+                /// that spans multiple lines
+                #[gpui::test]
+                fn a() {
+                    b
+                }"
                 .unindent(),
-                embedding: vec![],
-            },
-            Document {
-                name: "C for D".into(),
-                range: text.find("impl C").unwrap()..(text.rfind("}").unwrap() + 1),
-                content: "
-                    The below code snippet is from file 'foo.rs'
-
-                    ```rust
-                    impl C for D {
-                    }
-                    ```"
+                text.find("fn a").unwrap(),
+            ),
+            (
+                "
+                impl C for D {
+                }"
                 .unindent(),
-                embedding: vec![],
-            }
-        ]
+                text.find("impl C").unwrap(),
+            ),
+            (
+                "
+                impl E {
+                    // This is also a preceding comment
+                    pub fn function_1() -> Option<()> { /* ... */ }
+
+                    // This is a preceding comment
+                    fn function_2() -> Result<()> { /* ... */ }
+                }"
+                .unindent(),
+                text.find("impl E").unwrap(),
+            ),
+            (
+                "
+                // This is also a preceding comment
+                pub fn function_1() -> Option<()> {
+                    todo!();
+                }"
+                .unindent(),
+                text.find("pub fn function_1").unwrap(),
+            ),
+            (
+                "
+                // This is a preceding comment
+                fn function_2() -> Result<()> {
+                    todo!();
+                }"
+                .unindent(),
+                text.find("fn function_2").unwrap(),
+            ),
+        ],
     );
 }
 
-#[gpui::test]
-async fn test_code_context_retrieval_javascript() {
-    let language = js_lang();
-    let mut retriever = CodeContextRetriever::new();
-
-    let text = "
-        /* globals importScripts, backend */
-        function _authorize() {}
-
-        /**
-         * Sometimes the frontend build is way faster than backend.
-         */
-        export async function authorizeBank() {
-            _authorize(pushModal, upgradingAccountId, {});
-        }
-
-        export class SettingsPage {
-            /* This is a test setting */
-            constructor(page) {
-                this.page = page;
-            }
-        }
-
-        /* This is a test comment */
-        class TestClass {}
-
-        /* Schema for editor_events in Clickhouse. */
-        export interface ClickhouseEditorEvent {
-            installation_id: string
-            operation: string
-        }
-        "
-    .unindent();
-
-    let parsed_files = retriever
-        .parse_file(Path::new("foo.js"), &text, language)
-        .unwrap();
-
-    let test_documents = &[
-        Document {
-            name: "function _authorize".into(),
-            range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1),
-            content: "
-                    The below code snippet is from file 'foo.js'
-
-                    ```javascript
-                    /* globals importScripts, backend */
-                    function _authorize() {}
-                    ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "async function authorizeBank".into(),
-            range: text.find("export async").unwrap()..223,
-            content: "
-                    The below code snippet is from file 'foo.js'
-
-                    ```javascript
-                    /**
-                     * Sometimes the frontend build is way faster than backend.
-                     */
-                    export async function authorizeBank() {
-                        _authorize(pushModal, upgradingAccountId, {});
-                    }
-                    ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "class SettingsPage".into(),
-            range: 225..343,
-            content: "
-                    The below code snippet is from file 'foo.js'
-
-                    ```javascript
-                    export class SettingsPage {
-                        /* This is a test setting */
-                        constructor(page) {
-                            this.page = page;
-                        }
-                    }
-                    ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "constructor".into(),
-            range: 290..341,
-            content: "
-                The below code snippet is from file 'foo.js'
-
-                ```javascript
-                /* This is a test setting */
-                constructor(page) {
-                        this.page = page;
-                    }
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "class TestClass".into(),
-            range: 374..392,
-            content: "
-                    The below code snippet is from file 'foo.js'
-
-                    ```javascript
-                    /* This is a test comment */
-                    class TestClass {}
-                    ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "interface ClickhouseEditorEvent".into(),
-            range: 440..532,
-            content: "
-                    The below code snippet is from file 'foo.js'
-
-                    ```javascript
-                    /* Schema for editor_events in Clickhouse. */
-                    export interface ClickhouseEditorEvent {
-                        installation_id: string
-                        operation: string
-                    }
-                    ```"
-            .unindent(),
-            embedding: vec![],
-        },
-    ];
-
-    for idx in 0..test_documents.len() {
-        assert_eq!(test_documents[idx], parsed_files[idx]);
-    }
-}
-
-#[gpui::test]
-async fn test_code_context_retrieval_elixir() {
-    let language = elixir_lang();
-    let mut retriever = CodeContextRetriever::new();
-
-    let text = r#"
-defmodule File.Stream do
-    @moduledoc """
-    Defines a `File.Stream` struct returned by `File.stream!/3`.
-
-    The following fields are public:
-
-    * `path`          - the file path
-    * `modes`         - the file modes
-    * `raw`           - a boolean indicating if bin functions should be used
-    * `line_or_bytes` - if reading should read lines or a given number of bytes
-    * `node`          - the node the file belongs to
-
-    """
-
-    defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
-
-    @type t :: %__MODULE__{}
-
-    @doc false
-    def __build__(path, modes, line_or_bytes) do
-    raw = :lists.keyfind(:encoding, 1, modes) == false
-
-    modes =
-        case raw do
-        true ->
-            case :lists.keyfind(:read_ahead, 1, modes) do
-            {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
-            {:read_ahead, _} -> [:raw | modes]
-            false -> [:raw, :read_ahead | modes]
-            end
-
-        false ->
-            modes
-        end
-
-    %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
-
-    end
-"#
-    .unindent();
-
-    let parsed_files = retriever
-        .parse_file(Path::new("foo.ex"), &text, language)
-        .unwrap();
-
-    let test_documents = &[
-        Document{
-            name: "defmodule File.Stream".into(),
-            range: 0..1132,
-            content: r#"
-                The below code snippet is from file 'foo.ex'
-
-                ```elixir
-                defmodule File.Stream do
-                    @moduledoc """
-                    Defines a `File.Stream` struct returned by `File.stream!/3`.
-
-                    The following fields are public:
-
-                    * `path`          - the file path
-                    * `modes`         - the file modes
-                    * `raw`           - a boolean indicating if bin functions should be used
-                    * `line_or_bytes` - if reading should read lines or a given number of bytes
-                    * `node`          - the node the file belongs to
-
-                    """
-
-                    defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
-
-                    @type t :: %__MODULE__{}
-
-                    @doc false
-                    def __build__(path, modes, line_or_bytes) do
-                    raw = :lists.keyfind(:encoding, 1, modes) == false
-
-                    modes =
-                        case raw do
-                        true ->
-                            case :lists.keyfind(:read_ahead, 1, modes) do
-                            {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
-                            {:read_ahead, _} -> [:raw | modes]
-                            false -> [:raw, :read_ahead | modes]
-                            end
-
-                        false ->
-                            modes
-                        end
-
-                    %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
-
-                    end
-                ```"#.unindent(),
-            embedding: vec![],
-        },
-        Document {
-        name: "def __build__".into(),
-        range: 574..1132,
-        content: r#"
-The below code snippet is from file 'foo.ex'
-
-```elixir
-@doc false
-def __build__(path, modes, line_or_bytes) do
-    raw = :lists.keyfind(:encoding, 1, modes) == false
-
-    modes =
-        case raw do
-        true ->
-            case :lists.keyfind(:read_ahead, 1, modes) do
-            {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
-            {:read_ahead, _} -> [:raw | modes]
-            false -> [:raw, :read_ahead | modes]
-            end
-
-        false ->
-            modes
-        end
-
-    %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
-
-    end
-```"#
-            .unindent(),
-        embedding: vec![],
-    }];
-
-    for idx in 0..test_documents.len() {
-        assert_eq!(test_documents[idx], parsed_files[idx]);
-    }
+fn assert_documents_eq(
+    documents: &[Document],
+    expected_contents_and_start_offsets: &[(String, usize)],
+) {
+    assert_eq!(
+        documents
+            .iter()
+            .map(|document| (document.content.clone(), document.range.start))
+            .collect::<Vec<_>>(),
+        expected_contents_and_start_offsets
+    );
 }
 
-#[gpui::test]
-async fn test_code_context_retrieval_cpp() {
-    let language = cpp_lang();
-    let mut retriever = CodeContextRetriever::new();
-
-    let text = "
-    /**
-     * @brief Main function
-     * @returns 0 on exit
-     */
-    int main() { return 0; }
-
-    /**
-    * This is a test comment
-    */
-    class MyClass {       // The class
-        public:             // Access specifier
-        int myNum;        // Attribute (int variable)
-        string myString;  // Attribute (string variable)
-    };
-
-    // This is a test comment
-    enum Color { red, green, blue };
-
-    /** This is a preceeding block comment
-     * This is the second line
-     */
-    struct {           // Structure declaration
-        int myNum;       // Member (int variable)
-        string myString; // Member (string variable)
-    } myStructure;
-
-    /**
-    * @brief Matrix class.
-    */
-    template <typename T,
-              typename = typename std::enable_if<
-                std::is_integral<T>::value || std::is_floating_point<T>::value,
-                bool>::type>
-    class Matrix2 {
-        std::vector<std::vector<T>> _mat;
-
-    public:
-        /**
-        * @brief Constructor
-        * @tparam Integer ensuring integers are being evaluated and not other
-        * data types.
-        * @param size denoting the size of Matrix as size x size
-        */
-        template <typename Integer,
-                  typename = typename std::enable_if<std::is_integral<Integer>::value,
-                  Integer>::type>
-        explicit Matrix(const Integer size) {
-            for (size_t i = 0; i < size; ++i) {
-                _mat.emplace_back(std::vector<T>(size, 0));
-            }
-        }
-    }"
-    .unindent();
-
-    let parsed_files = retriever
-        .parse_file(Path::new("foo.cpp"), &text, language)
-        .unwrap();
-
-    let test_documents = &[
-        Document {
-            name: "int main".into(),
-            range: 54..78,
-            content: "
-                The below code snippet is from file 'foo.cpp'
-
-                ```cpp
-                /**
-                 * @brief Main function
-                 * @returns 0 on exit
-                 */
-                int main() { return 0; }
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "class MyClass".into(),
-            range: 112..295,
-            content: "
-                The below code snippet is from file 'foo.cpp'
-
-                ```cpp
-                /**
-                * This is a test comment
-                */
-                class MyClass {       // The class
-                    public:             // Access specifier
-                    int myNum;        // Attribute (int variable)
-                    string myString;  // Attribute (string variable)
-                }
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "enum Color".into(),
-            range: 324..355,
-            content: "
-                The below code snippet is from file 'foo.cpp'
-
-                ```cpp
-                // This is a test comment
-                enum Color { red, green, blue }
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "struct myStructure".into(),
-            range: 428..581,
-            content: "
-                The below code snippet is from file 'foo.cpp'
-
-                ```cpp
-                /** This is a preceeding block comment
-                 * This is the second line
-                 */
-                struct {           // Structure declaration
-                    int myNum;       // Member (int variable)
-                    string myString; // Member (string variable)
-                } myStructure;
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-        Document {
-            name: "class Matrix2".into(),
-            range: 613..1342,
-            content: "
-                The below code snippet is from file 'foo.cpp'
-
-                ```cpp
-                /**
-                * @brief Matrix class.
-                */
-                template <typename T,
-                          typename = typename std::enable_if<
-                            std::is_integral<T>::value || std::is_floating_point<T>::value,
-                            bool>::type>
-                class Matrix2 {
-                    std::vector<std::vector<T>> _mat;
-
-                public:
-                    /**
-                    * @brief Constructor
-                    * @tparam Integer ensuring integers are being evaluated and not other
-                    * data types.
-                    * @param size denoting the size of Matrix as size x size
-                    */
-                    template <typename Integer,
-                              typename = typename std::enable_if<std::is_integral<Integer>::value,
-                              Integer>::type>
-                    explicit Matrix(const Integer size) {
-                        for (size_t i = 0; i < size; ++i) {
-                            _mat.emplace_back(std::vector<T>(size, 0));
-                        }
-                    }
-                }
-                ```"
-            .unindent(),
-            embedding: vec![],
-        },
-    ];
-
-    for idx in 0..test_documents.len() {
-        assert_eq!(test_documents[idx], parsed_files[idx]);
-    }
-}
+// #[gpui::test]
+// async fn test_code_context_retrieval_javascript() {
+//     let language = js_lang();
+//     let mut retriever = CodeContextRetriever::new();
+
+//     let text = "
+//         /* globals importScripts, backend */
+//         function _authorize() {}
+
+//         /**
+//          * Sometimes the frontend build is way faster than backend.
+//          */
+//         export async function authorizeBank() {
+//             _authorize(pushModal, upgradingAccountId, {});
+//         }
+
+//         export class SettingsPage {
+//             /* This is a test setting */
+//             constructor(page) {
+//                 this.page = page;
+//             }
+//         }
+
+//         /* This is a test comment */
+//         class TestClass {}
+
+//         /* Schema for editor_events in Clickhouse. */
+//         export interface ClickhouseEditorEvent {
+//             installation_id: string
+//             operation: string
+//         }
+//         "
+//     .unindent();
+
+//     let parsed_files = retriever
+//         .parse_file(Path::new("foo.js"), &text, language)
+//         .unwrap();
+
+//     let test_documents = &[
+//         Document {
+//             name: "function _authorize".into(),
+//             range: text.find("function _authorize").unwrap()..(text.find("}").unwrap() + 1),
+//             content: "
+//                     The below code snippet is from file 'foo.js'
+
+//                     ```javascript
+//                     /* globals importScripts, backend */
+//                     function _authorize() {}
+//                     ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "async function authorizeBank".into(),
+//             range: text.find("export async").unwrap()..223,
+//             content: "
+//                     The below code snippet is from file 'foo.js'
+
+//                     ```javascript
+//                     /**
+//                      * Sometimes the frontend build is way faster than backend.
+//                      */
+//                     export async function authorizeBank() {
+//                         _authorize(pushModal, upgradingAccountId, {});
+//                     }
+//                     ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "class SettingsPage".into(),
+//             range: 225..343,
+//             content: "
+//                     The below code snippet is from file 'foo.js'
+
+//                     ```javascript
+//                     export class SettingsPage {
+//                         /* This is a test setting */
+//                         constructor(page) {
+//                             this.page = page;
+//                         }
+//                     }
+//                     ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "constructor".into(),
+//             range: 290..341,
+//             content: "
+//                 The below code snippet is from file 'foo.js'
+
+//                 ```javascript
+//                 /* This is a test setting */
+//                 constructor(page) {
+//                         this.page = page;
+//                     }
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "class TestClass".into(),
+//             range: 374..392,
+//             content: "
+//                     The below code snippet is from file 'foo.js'
+
+//                     ```javascript
+//                     /* This is a test comment */
+//                     class TestClass {}
+//                     ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "interface ClickhouseEditorEvent".into(),
+//             range: 440..532,
+//             content: "
+//                     The below code snippet is from file 'foo.js'
+
+//                     ```javascript
+//                     /* Schema for editor_events in Clickhouse. */
+//                     export interface ClickhouseEditorEvent {
+//                         installation_id: string
+//                         operation: string
+//                     }
+//                     ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//     ];
+
+//     for idx in 0..test_documents.len() {
+//         assert_eq!(test_documents[idx], parsed_files[idx]);
+//     }
+// }
+
+// #[gpui::test]
+// async fn test_code_context_retrieval_elixir() {
+//     let language = elixir_lang();
+//     let mut retriever = CodeContextRetriever::new();
+
+//     let text = r#"
+// defmodule File.Stream do
+//     @moduledoc """
+//     Defines a `File.Stream` struct returned by `File.stream!/3`.
+
+//     The following fields are public:
+
+//     * `path`          - the file path
+//     * `modes`         - the file modes
+//     * `raw`           - a boolean indicating if bin functions should be used
+//     * `line_or_bytes` - if reading should read lines or a given number of bytes
+//     * `node`          - the node the file belongs to
+
+//     """
+
+//     defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
+
+//     @type t :: %__MODULE__{}
+
+//     @doc false
+//     def __build__(path, modes, line_or_bytes) do
+//     raw = :lists.keyfind(:encoding, 1, modes) == false
+
+//     modes =
+//         case raw do
+//         true ->
+//             case :lists.keyfind(:read_ahead, 1, modes) do
+//             {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+//             {:read_ahead, _} -> [:raw | modes]
+//             false -> [:raw, :read_ahead | modes]
+//             end
+
+//         false ->
+//             modes
+//         end
+
+//     %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+//     end
+// "#
+//     .unindent();
+
+//     let parsed_files = retriever
+//         .parse_file(Path::new("foo.ex"), &text, language)
+//         .unwrap();
+
+//     let test_documents = &[
+//         Document{
+//             name: "defmodule File.Stream".into(),
+//             range: 0..1132,
+//             content: r#"
+//                 The below code snippet is from file 'foo.ex'
+
+//                 ```elixir
+//                 defmodule File.Stream do
+//                     @moduledoc """
+//                     Defines a `File.Stream` struct returned by `File.stream!/3`.
+
+//                     The following fields are public:
+
+//                     * `path`          - the file path
+//                     * `modes`         - the file modes
+//                     * `raw`           - a boolean indicating if bin functions should be used
+//                     * `line_or_bytes` - if reading should read lines or a given number of bytes
+//                     * `node`          - the node the file belongs to
+
+//                     """
+
+//                     defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
+
+//                     @type t :: %__MODULE__{}
+
+//                     @doc false
+//                     def __build__(path, modes, line_or_bytes) do
+//                     raw = :lists.keyfind(:encoding, 1, modes) == false
+
+//                     modes =
+//                         case raw do
+//                         true ->
+//                             case :lists.keyfind(:read_ahead, 1, modes) do
+//                             {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+//                             {:read_ahead, _} -> [:raw | modes]
+//                             false -> [:raw, :read_ahead | modes]
+//                             end
+
+//                         false ->
+//                             modes
+//                         end
+
+//                     %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+//                     end
+//                 ```"#.unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//         name: "def __build__".into(),
+//         range: 574..1132,
+//         content: r#"
+// The below code snippet is from file 'foo.ex'
+
+// ```elixir
+// @doc false
+// def __build__(path, modes, line_or_bytes) do
+//     raw = :lists.keyfind(:encoding, 1, modes) == false
+
+//     modes =
+//         case raw do
+//         true ->
+//             case :lists.keyfind(:read_ahead, 1, modes) do
+//             {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+//             {:read_ahead, _} -> [:raw | modes]
+//             false -> [:raw, :read_ahead | modes]
+//             end
+
+//         false ->
+//             modes
+//         end
+
+//     %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+//     end
+// ```"#
+//             .unindent(),
+//         embedding: vec![],
+//     }];
+
+//     for idx in 0..test_documents.len() {
+//         assert_eq!(test_documents[idx], parsed_files[idx]);
+//     }
+// }
+
+// #[gpui::test]
+// async fn test_code_context_retrieval_cpp() {
+//     let language = cpp_lang();
+//     let mut retriever = CodeContextRetriever::new();
+
+//     let text = "
+//     /**
+//      * @brief Main function
+//      * @returns 0 on exit
+//      */
+//     int main() { return 0; }
+
+//     /**
+//     * This is a test comment
+//     */
+//     class MyClass {       // The class
+//         public:             // Access specifier
+//         int myNum;        // Attribute (int variable)
+//         string myString;  // Attribute (string variable)
+//     };
+
+//     // This is a test comment
+//     enum Color { red, green, blue };
+
+//     /** This is a preceding block comment
+//      * This is the second line
+//      */
+//     struct {           // Structure declaration
+//         int myNum;       // Member (int variable)
+//         string myString; // Member (string variable)
+//     } myStructure;
+
+//     /**
+//     * @brief Matrix class.
+//     */
+//     template <typename T,
+//               typename = typename std::enable_if<
+//                 std::is_integral<T>::value || std::is_floating_point<T>::value,
+//                 bool>::type>
+//     class Matrix2 {
+//         std::vector<std::vector<T>> _mat;
+
+//     public:
+//         /**
+//         * @brief Constructor
+//         * @tparam Integer ensuring integers are being evaluated and not other
+//         * data types.
+//         * @param size denoting the size of Matrix as size x size
+//         */
+//         template <typename Integer,
+//                   typename = typename std::enable_if<std::is_integral<Integer>::value,
+//                   Integer>::type>
+//         explicit Matrix(const Integer size) {
+//             for (size_t i = 0; i < size; ++i) {
+//                 _mat.emplace_back(std::vector<T>(size, 0));
+//             }
+//         }
+//     }"
+//     .unindent();
+
+//     let parsed_files = retriever
+//         .parse_file(Path::new("foo.cpp"), &text, language)
+//         .unwrap();
+
+//     let test_documents = &[
+//         Document {
+//             name: "int main".into(),
+//             range: 54..78,
+//             content: "
+//                 The below code snippet is from file 'foo.cpp'
+
+//                 ```cpp
+//                 /**
+//                  * @brief Main function
+//                  * @returns 0 on exit
+//                  */
+//                 int main() { return 0; }
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "class MyClass".into(),
+//             range: 112..295,
+//             content: "
+//                 The below code snippet is from file 'foo.cpp'
+
+//                 ```cpp
+//                 /**
+//                 * This is a test comment
+//                 */
+//                 class MyClass {       // The class
+//                     public:             // Access specifier
+//                     int myNum;        // Attribute (int variable)
+//                     string myString;  // Attribute (string variable)
+//                 }
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "enum Color".into(),
+//             range: 324..355,
+//             content: "
+//                 The below code snippet is from file 'foo.cpp'
+
+//                 ```cpp
+//                 // This is a test comment
+//                 enum Color { red, green, blue }
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "struct myStructure".into(),
+//             range: 428..581,
+//             content: "
+//                 The below code snippet is from file 'foo.cpp'
+
+//                 ```cpp
+//                 /** This is a preceding block comment
+//                  * This is the second line
+//                  */
+//                 struct {           // Structure declaration
+//                     int myNum;       // Member (int variable)
+//                     string myString; // Member (string variable)
+//                 } myStructure;
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//         Document {
+//             name: "class Matrix2".into(),
+//             range: 613..1342,
+//             content: "
+//                 The below code snippet is from file 'foo.cpp'
+
+//                 ```cpp
+//                 /**
+//                 * @brief Matrix class.
+//                 */
+//                 template <typename T,
+//                           typename = typename std::enable_if<
+//                             std::is_integral<T>::value || std::is_floating_point<T>::value,
+//                             bool>::type>
+//                 class Matrix2 {
+//                     std::vector<std::vector<T>> _mat;
+
+//                 public:
+//                     /**
+//                     * @brief Constructor
+//                     * @tparam Integer ensuring integers are being evaluated and not other
+//                     * data types.
+//                     * @param size denoting the size of Matrix as size x size
+//                     */
+//                     template <typename Integer,
+//                               typename = typename std::enable_if<std::is_integral<Integer>::value,
+//                               Integer>::type>
+//                     explicit Matrix(const Integer size) {
+//                         for (size_t i = 0; i < size; ++i) {
+//                             _mat.emplace_back(std::vector<T>(size, 0));
+//                         }
+//                     }
+//                 }
+//                 ```"
+//             .unindent(),
+//             embedding: vec![],
+//         },
+//     ];
+
+//     for idx in 0..test_documents.len() {
+//         assert_eq!(test_documents[idx], parsed_files[idx]);
+//     }
+// }
 
 #[gpui::test]
 fn test_dot_product(mut rng: StdRng) {

crates/zed/src/languages/rust/config.toml 🔗

@@ -10,3 +10,4 @@ brackets = [
     { start = "\"", end = "\"", close = true, newline = false, not_in = ["string"] },
     { start = "/*", end = " */", close = true, newline = false, not_in = ["string", "comment"] },
 ]
+collapsed_placeholder = " /* ... */ "

crates/zed/src/languages/rust/embedding.scm 🔗

@@ -1,50 +1,28 @@
 (
-    (line_comment)* @context
+    [(line_comment) (attribute_item)]* @context
     .
-    (enum_item
-        name: (_) @name) @item
-)
+    [
+        (struct_item
+            name: (_) @name)
 
-(
-    (line_comment)* @context
-    .
-    (struct_item
-        name: (_) @name) @item
-)
+        (enum_item
+            name: (_) @name)
 
-(
-    (line_comment)* @context
-    .
-    (impl_item
-        trait: (_)? @name
-        "for"? @name
-        type: (_) @name) @item
-)
+        (impl_item
+            trait: (_)? @name
+            "for"? @name
+            type: (_) @name)
 
-(
-    (line_comment)* @context
-    .
-    (trait_item
-        name: (_) @name) @item
-)
+        (trait_item
+            name: (_) @name)
 
-(
-    (line_comment)* @context
-    .
-    (function_item
-        name: (_) @name) @item
-)
-
-(
-    (line_comment)* @context
-    .
-    (macro_definition
-        name: (_) @name) @item
-)
+        (function_item
+            name: (_) @name
+            body: (block
+                "{" @keep
+                "}" @keep) @collapse)
 
-(
-    (line_comment)* @context
-    .
-    (function_signature_item
-        name: (_) @name) @item
-)
+        (macro_definition
+            name: (_) @name)
+        ] @item
+    )