Cargo.lock 🔗
@@ -8704,6 +8704,8 @@ dependencies = [
"sha2 0.10.7",
"smol",
"tempfile",
+ "tree-sitter",
+ "unindent",
"util",
"worktree",
]
Created by Max Brunsfeld.
This chunking strategy uses the existing `outline` query to chunk files.
We try to find chunk boundaries that are:
* at starts or ends of lines
* nested within as few outline items as possible
Release Notes:
- N/A
Cargo.lock | 2
crates/language/src/language.rs | 13
crates/language/src/syntax_map.rs | 4
crates/semantic_index/Cargo.toml | 2
crates/semantic_index/src/chunking.rs | 634 +++++++++++++---------------
5 files changed, 312 insertions(+), 343 deletions(-)
@@ -8704,6 +8704,8 @@ dependencies = [
"sha2 0.10.7",
"smol",
"tempfile",
+ "tree-sitter",
+ "unindent",
"util",
"worktree",
]
@@ -55,10 +55,10 @@ use std::{
Arc,
},
};
-use syntax_map::SyntaxSnapshot;
+use syntax_map::{QueryCursorHandle, SyntaxSnapshot};
pub use task_context::{BasicContextProvider, ContextProvider, ContextProviderWithTasks};
use theme::SyntaxTheme;
-use tree_sitter::{self, wasmtime, Query, WasmStore};
+use tree_sitter::{self, wasmtime, Query, QueryCursor, WasmStore};
use util::http::HttpClient;
pub use buffer::Operation;
@@ -101,6 +101,15 @@ where
})
}
+pub fn with_query_cursor<F, R>(func: F) -> R
+where
+ F: FnOnce(&mut QueryCursor) -> R,
+{
+ use std::ops::DerefMut;
+ let mut cursor = QueryCursorHandle::new();
+ func(cursor.deref_mut())
+}
+
lazy_static! {
static ref NEXT_LANGUAGE_ID: AtomicUsize = Default::default();
static ref NEXT_GRAMMAR_ID: AtomicUsize = Default::default();
@@ -211,7 +211,7 @@ struct TextProvider<'a>(&'a Rope);
struct ByteChunks<'a>(text::Chunks<'a>);
-struct QueryCursorHandle(Option<QueryCursor>);
+pub(crate) struct QueryCursorHandle(Option<QueryCursor>);
impl SyntaxMap {
pub fn new() -> Self {
@@ -1739,7 +1739,7 @@ impl<'a> Iterator for ByteChunks<'a> {
}
impl QueryCursorHandle {
- pub(crate) fn new() -> Self {
+ pub fn new() -> Self {
let mut cursor = QUERY_CURSORS.lock().pop().unwrap_or_else(QueryCursor::new);
cursor.set_match_limit(64);
QueryCursorHandle(Some(cursor))
@@ -37,7 +37,9 @@ serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
smol.workspace = true
+tree-sitter.workspace = true
util.workspace = true
+unindent.workspace = true
worktree.workspace = true
[dev-dependencies]
@@ -1,9 +1,24 @@
-use language::{with_parser, Grammar, Tree};
+use language::{with_parser, with_query_cursor, Grammar};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
-use std::{cmp, ops::Range, sync::Arc};
+use std::{
+ cmp::{self, Reverse},
+ ops::Range,
+ sync::Arc,
+};
+use tree_sitter::QueryCapture;
+use util::ResultExt as _;
+
+#[derive(Copy, Clone)]
+struct ChunkSizeRange {
+ min: usize,
+ max: usize,
+}
-const CHUNK_THRESHOLD: usize = 1500;
+const CHUNK_SIZE_RANGE: ChunkSizeRange = ChunkSizeRange {
+ min: 1024,
+ max: 8192,
+};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
@@ -12,396 +27,337 @@ pub struct Chunk {
}
pub fn chunk_text(text: &str, grammar: Option<&Arc<Grammar>>) -> Vec<Chunk> {
- if let Some(grammar) = grammar {
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(&text, None).expect("invalid language")
- });
-
- chunk_parse_tree(tree, &text, CHUNK_THRESHOLD)
- } else {
- chunk_lines(&text)
- }
+ chunk_text_with_size_range(text, grammar, CHUNK_SIZE_RANGE)
}
-fn chunk_parse_tree(tree: Tree, text: &str, chunk_threshold: usize) -> Vec<Chunk> {
- let mut chunk_ranges = Vec::new();
- let mut cursor = tree.walk();
-
- let mut range = 0..0;
- loop {
- let node = cursor.node();
-
- // If adding the node to the current chunk exceeds the threshold
- if node.end_byte() - range.start > chunk_threshold {
- // Try to descend into its first child. If we can't, flush the current
- // range and try again.
- if cursor.goto_first_child() {
- continue;
- } else if !range.is_empty() {
- chunk_ranges.push(range.clone());
- range.start = range.end;
- continue;
- }
-
- // If we get here, the node itself has no children but is larger than the threshold.
- // Break its text into arbitrary chunks.
- split_text(text, range.clone(), node.end_byte(), &mut chunk_ranges);
- }
- range.end = node.end_byte();
-
- // If we get here, we consumed the node. Advance to the next child, ascending if there isn't one.
- while !cursor.goto_next_sibling() {
- if !cursor.goto_parent() {
- if !range.is_empty() {
- chunk_ranges.push(range);
- }
+fn chunk_text_with_size_range(
+ text: &str,
+ grammar: Option<&Arc<Grammar>>,
+ size_config: ChunkSizeRange,
+) -> Vec<Chunk> {
+ let mut syntactic_ranges = Vec::new();
- return chunk_ranges
- .into_iter()
- .map(|range| {
- let digest = Sha256::digest(&text[range.clone()]).into();
- Chunk { range, digest }
- })
- .collect();
+ if let Some(grammar) = grammar {
+ if let Some(outline) = grammar.outline_config.as_ref() {
+ let tree = with_parser(|parser| {
+ parser.set_language(&grammar.ts_language).log_err()?;
+ parser.parse(&text, None)
+ });
+
+ if let Some(tree) = tree {
+ with_query_cursor(|cursor| {
+ // Retrieve a list of ranges of outline items (types, functions, etc) in the document.
+ // Omit single-line outline items (e.g. struct fields, constant declarations), because
+ // we'll already be attempting to split on lines.
+ syntactic_ranges = cursor
+ .matches(&outline.query, tree.root_node(), text.as_bytes())
+ .filter_map(|mat| {
+ mat.captures
+ .iter()
+ .find_map(|QueryCapture { node, index }| {
+ if *index == outline.item_capture_ix {
+ if node.end_position().row > node.start_position().row {
+ return Some(node.byte_range());
+ }
+ }
+ None
+ })
+ })
+ .collect::<Vec<_>>();
+ syntactic_ranges
+ .sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
+ });
}
}
}
+
+ chunk_text_with_syntactic_ranges(text, &syntactic_ranges, size_config)
}
-fn chunk_lines(text: &str) -> Vec<Chunk> {
- let mut chunk_ranges = Vec::new();
+fn chunk_text_with_syntactic_ranges(
+ text: &str,
+ mut syntactic_ranges: &[Range<usize>],
+ size_config: ChunkSizeRange,
+) -> Vec<Chunk> {
+ let mut chunks = Vec::new();
let mut range = 0..0;
-
- let mut newlines = text.match_indices('\n').peekable();
- while let Some((newline_ix, _)) = newlines.peek() {
- let newline_ix = newline_ix + 1;
- if newline_ix - range.start <= CHUNK_THRESHOLD {
- range.end = newline_ix;
- newlines.next();
+ let mut range_end_nesting_depth = 0;
+
+ // Try to split the text at line boundaries.
+ let mut line_ixs = text
+ .match_indices('\n')
+ .map(|(ix, _)| ix + 1)
+ .chain(if text.ends_with('\n') {
+ None
} else {
+ Some(text.len())
+ })
+ .peekable();
+
+ while let Some(&line_ix) = line_ixs.peek() {
+ // If the current position is beyond the maximum chunk size, then
+ // start a new chunk.
+ if line_ix - range.start > size_config.max {
if range.is_empty() {
- split_text(text, range, newline_ix, &mut chunk_ranges);
- range = newline_ix..newline_ix;
+ range.end = cmp::min(range.start + size_config.max, line_ix);
+ while !text.is_char_boundary(range.end) {
+ range.end -= 1;
+ }
+ }
+
+ chunks.push(Chunk {
+ range: range.clone(),
+ digest: Sha256::digest(&text[range.clone()]).into(),
+ });
+ range_end_nesting_depth = 0;
+ range.start = range.end;
+ continue;
+ }
+
+ // Discard any syntactic ranges that end before the current position.
+ while let Some(first_item) = syntactic_ranges.first() {
+ if first_item.end < line_ix {
+ syntactic_ranges = &syntactic_ranges[1..];
+ continue;
} else {
- chunk_ranges.push(range.clone());
- range.start = range.end;
+ break;
}
}
- }
- if !range.is_empty() {
- chunk_ranges.push(range);
+ // Count how many syntactic ranges contain the current position.
+ let mut nesting_depth = 0;
+ for range in syntactic_ranges {
+ if range.start > line_ix {
+ break;
+ }
+ if range.start < line_ix && range.end > line_ix {
+ nesting_depth += 1;
+ }
+ }
+
+ // Extend the current range to this position, unless an earlier candidate
+ // end position was less nested syntactically.
+ if range.len() < size_config.min || nesting_depth <= range_end_nesting_depth {
+ range.end = line_ix;
+ range_end_nesting_depth = nesting_depth;
+ }
+
+ line_ixs.next();
}
- chunk_ranges
- .into_iter()
- .map(|range| Chunk {
+ if !range.is_empty() {
+ chunks.push(Chunk {
+ range: range.clone(),
digest: Sha256::digest(&text[range.clone()]).into(),
- range,
- })
- .collect()
-}
-
-fn split_text(
- text: &str,
- mut range: Range<usize>,
- max_end: usize,
- chunk_ranges: &mut Vec<Range<usize>>,
-) {
- while range.start < max_end {
- range.end = cmp::min(range.start + CHUNK_THRESHOLD, max_end);
- while !text.is_char_boundary(range.end) {
- range.end -= 1;
- }
- chunk_ranges.push(range.clone());
- range.start = range.end;
+ });
}
+
+ chunks
}
#[cfg(test)]
mod tests {
use super::*;
use language::{tree_sitter_rust, Language, LanguageConfig, LanguageMatcher};
+ use unindent::Unindent as _;
- // This example comes from crates/gpui/examples/window_positioning.rs which
- // has the property of being CHUNK_THRESHOLD < TEXT.len() < 2*CHUNK_THRESHOLD
- static TEXT: &str = r#"
- use gpui::*;
+ #[test]
+ fn test_chunk_text_with_syntax() {
+ let language = rust_language();
+
+ let text = "
+ struct Person {
+ first_name: String,
+ last_name: String,
+ age: u32,
+ }
- struct WindowContent {
- text: SharedString,
- }
+ impl Person {
+ fn new(first_name: String, last_name: String, age: u32) -> Self {
+ Self { first_name, last_name, age }
+ }
- impl Render for WindowContent {
- fn render(&mut self, _cx: &mut ViewContext<Self>) -> impl IntoElement {
- div()
- .flex()
- .bg(rgb(0x1e2025))
- .size_full()
- .justify_center()
- .items_center()
- .text_xl()
- .text_color(rgb(0xffffff))
- .child(self.text.clone())
- }
- }
+ fn first_name(&self) -> &str {
+ &self.first_name
+ }
- fn main() {
- App::new().run(|cx: &mut AppContext| {
- // Create several new windows, positioned in the top right corner of each screen
-
- for screen in cx.displays() {
- let options = {
- let popup_margin_width = DevicePixels::from(16);
- let popup_margin_height = DevicePixels::from(-0) - DevicePixels::from(48);
-
- let window_size = Size {
- width: px(400.),
- height: px(72.),
- };
-
- let screen_bounds = screen.bounds();
- let size: Size<DevicePixels> = window_size.into();
-
- let bounds = gpui::Bounds::<DevicePixels> {
- origin: screen_bounds.upper_right()
- - point(size.width + popup_margin_width, popup_margin_height),
- size: window_size.into(),
- };
-
- WindowOptions {
- // Set the bounds of the window in screen coordinates
- bounds: Some(bounds),
- // Specify the display_id to ensure the window is created on the correct screen
- display_id: Some(screen.id()),
-
- titlebar: None,
- window_background: WindowBackgroundAppearance::default(),
- focus: false,
- show: true,
- kind: WindowKind::PopUp,
- is_movable: false,
- fullscreen: false,
- app_id: None,
- }
- };
-
- cx.open_window(options, |cx| {
- cx.new_view(|_| WindowContent {
- text: format!("{:?}", screen.id()).into(),
- })
- });
- }
- });
- }"#;
+ fn last_name(&self) -> &str {
+ &self.last_name
+ }
- fn setup_rust_language() -> Language {
- Language::new(
- LanguageConfig {
- name: "Rust".into(),
- matcher: LanguageMatcher {
- path_suffixes: vec!["rs".to_string()],
- ..Default::default()
- },
- ..Default::default()
+ fn age(&self) -> usize {
+ self.ages
+ }
+ }
+ "
+ .unindent();
+
+ let chunks = chunk_text_with_size_range(
+ &text,
+ language.grammar(),
+ ChunkSizeRange {
+ min: text.find('}').unwrap(),
+ max: text.find("Self {").unwrap(),
},
- Some(tree_sitter_rust::language()),
- )
- }
-
- #[test]
- fn test_chunk_text() {
- let text = "a\n".repeat(1000);
- let chunks = chunk_text(&text, None);
- assert_eq!(
- chunks.len(),
- ((2000_f64) / (CHUNK_THRESHOLD as f64)).ceil() as usize
);
- }
-
- #[test]
- fn test_chunk_text_grammar() {
- // Let's set up a big text with some known segments
- // We'll then chunk it and verify that the chunks are correct
-
- let language = setup_rust_language();
-
- let chunks = chunk_text(TEXT, language.grammar());
- assert_eq!(chunks.len(), 2);
-
- assert_eq!(chunks[0].range.start, 0);
- assert_eq!(chunks[0].range.end, 1498);
- // The break between chunks is right before the "Specify the display_id" comment
-
- assert_eq!(chunks[1].range.start, 1498);
- assert_eq!(chunks[1].range.end, 2434);
- }
-
- #[test]
- fn test_chunk_parse_tree() {
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(TEXT, None).expect("invalid language")
- });
-
- let chunks = chunk_parse_tree(tree, TEXT, 250);
- assert_eq!(chunks.len(), 11);
- }
- #[test]
- fn test_chunk_unparsable() {
- // Even if a chunk is unparsable, we should still be able to chunk it
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let text = r#"fn main() {"#;
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(text, None).expect("invalid language")
- });
+ // The entire impl cannot fit in a chunk, so it is split.
+ // Within the impl, two methods can fit in a chunk.
+ assert_chunks(
+ &text,
+ &chunks,
+ &[
+ "struct Person {", // ...
+ "impl Person {",
+ " fn first_name",
+ " fn age",
+ ],
+ );
- let chunks = chunk_parse_tree(tree, text, 250);
- assert_eq!(chunks.len(), 1);
+ let text = "
+ struct T {}
+ struct U {}
+ struct V {}
+ struct W {
+ a: T,
+ b: U,
+ }
+ "
+ .unindent();
+
+ let chunks = chunk_text_with_size_range(
+ &text,
+ language.grammar(),
+ ChunkSizeRange {
+ min: text.find('{').unwrap(),
+ max: text.find('V').unwrap(),
+ },
+ );
- assert_eq!(chunks[0].range.start, 0);
- assert_eq!(chunks[0].range.end, 11);
+ // Two single-line structs can fit in a chunk.
+ // The last struct cannot fit in a chunk together
+ // with the previous single-line struct.
+ assert_chunks(
+ &text,
+ &chunks,
+ &[
+ "struct T", // ...
+ "struct V", // ...
+ "struct W", // ...
+ "}",
+ ],
+ );
}
#[test]
- fn test_empty_text() {
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse("", None).expect("invalid language")
- });
+ fn test_chunk_with_long_lines() {
+ let language = rust_language();
+
+ let text = "
+ struct S { a: u32 }
+ struct T { a: u64 }
+ struct U { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 }
+ struct W { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 }
+ "
+ .unindent();
+
+ let chunks = chunk_text_with_size_range(
+ &text,
+ language.grammar(),
+ ChunkSizeRange { min: 32, max: 64 },
+ );
- let chunks = chunk_parse_tree(tree, "", CHUNK_THRESHOLD);
- assert!(chunks.is_empty(), "Chunks should be empty for empty text");
+ // The line is too long to fit in one chunk
+ assert_chunks(
+ &text,
+ &chunks,
+ &[
+ "struct S {", // ...
+ "struct U",
+ "4, h: u64, i: u64", // ...
+ "struct W",
+ "4, h: u64, i: u64", // ...
+ ],
+ );
}
- #[test]
- fn test_single_large_node() {
- let large_text = "static ".to_owned() + "a".repeat(CHUNK_THRESHOLD - 1).as_str() + " = 2";
-
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(&large_text, None).expect("invalid language")
- });
-
- let chunks = chunk_parse_tree(tree, &large_text, CHUNK_THRESHOLD);
+ #[track_caller]
+ fn assert_chunks(text: &str, chunks: &[Chunk], expected_chunk_text_prefixes: &[&str]) {
+ check_chunk_invariants(text, chunks);
assert_eq!(
chunks.len(),
- 3,
- "Large chunks are broken up according to grammar as best as possible"
+ expected_chunk_text_prefixes.len(),
+ "unexpected number of chunks: {chunks:?}",
);
- // Expect chunks to be static, aaaaaa..., and = 2
- assert_eq!(chunks[0].range.start, 0);
- assert_eq!(chunks[0].range.end, "static".len());
-
- assert_eq!(chunks[1].range.start, "static".len());
- assert_eq!(chunks[1].range.end, "static".len() + CHUNK_THRESHOLD);
-
- assert_eq!(chunks[2].range.start, "static".len() + CHUNK_THRESHOLD);
- assert_eq!(chunks[2].range.end, large_text.len());
+ let mut prev_chunk_end = 0;
+ for (ix, chunk) in chunks.iter().enumerate() {
+ let expected_prefix = expected_chunk_text_prefixes[ix];
+ let chunk_text = &text[chunk.range.clone()];
+ if !chunk_text.starts_with(expected_prefix) {
+ let chunk_prefix_offset = text[prev_chunk_end..].find(expected_prefix);
+ if let Some(chunk_prefix_offset) = chunk_prefix_offset {
+ panic!(
+ "chunk {ix} starts at unexpected offset {}. expected {}",
+ chunk.range.start,
+ chunk_prefix_offset + prev_chunk_end
+ );
+ } else {
+ panic!("invalid expected chunk prefix {ix}: {expected_prefix:?}");
+ }
+ }
+ prev_chunk_end = chunk.range.end;
+ }
}
- #[test]
- fn test_multiple_small_nodes() {
- let small_text = "a b c d e f g h i j k l m n o p q r s t u v w x y z";
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(small_text, None).expect("invalid language")
- });
+ #[track_caller]
+ fn check_chunk_invariants(text: &str, chunks: &[Chunk]) {
+ for (ix, chunk) in chunks.iter().enumerate() {
+ if ix > 0 && chunk.range.start != chunks[ix - 1].range.end {
+ panic!("chunk ranges are not contiguous: {:?}", chunks);
+ }
+ }
- let chunks = chunk_parse_tree(tree, small_text, 5);
- assert!(
- chunks.len() > 1,
- "Should have multiple chunks for multiple small nodes"
- );
+ if text.is_empty() {
+ assert!(chunks.is_empty())
+ } else if chunks.first().unwrap().range.start != 0
+ || chunks.last().unwrap().range.end != text.len()
+ {
+ panic!("chunks don't cover entire text {:?}", chunks);
+ }
}
#[test]
- fn test_node_with_children() {
- let nested_text = "fn main() { let a = 1; let b = 2; }";
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(nested_text, None).expect("invalid language")
- });
-
- let chunks = chunk_parse_tree(tree, nested_text, 10);
- assert!(
- chunks.len() > 1,
- "Should have multiple chunks for a node with children"
+ fn test_chunk_text() {
+ let text = "a\n".repeat(1000);
+ let chunks = chunk_text(&text, None);
+ assert_eq!(
+ chunks.len(),
+ ((2000_f64) / (CHUNK_SIZE_RANGE.max as f64)).ceil() as usize
);
}
- #[test]
- fn test_text_with_unparsable_sections() {
- // This test uses purposefully hit-or-miss sizing of 11 characters per likely chunk
- let mixed_text = "fn main() { let a = 1; let b = 2; } unparsable bits here";
- let language = setup_rust_language();
- let grammar = language.grammar().unwrap();
-
- let tree = with_parser(|parser| {
- parser
- .set_language(&grammar.ts_language)
- .expect("incompatible grammar");
- parser.parse(mixed_text, None).expect("invalid language")
- });
-
- let chunks = chunk_parse_tree(tree, mixed_text, 11);
- assert!(
- chunks.len() > 1,
- "Should handle both parsable and unparsable sections correctly"
- );
-
- let expected_chunks = [
- "fn main() {",
- " let a = 1;",
- " let b = 2;",
- " }",
- " unparsable",
- " bits here",
- ];
-
- for (i, chunk) in chunks.iter().enumerate() {
- assert_eq!(
- &mixed_text[chunk.range.clone()],
- expected_chunks[i],
- "Chunk {} should match",
- i
- );
- }
+ fn rust_language() -> Language {
+ Language::new(
+ LanguageConfig {
+ name: "Rust".into(),
+ matcher: LanguageMatcher {
+ path_suffixes: vec!["rs".to_string()],
+ ..Default::default()
+ },
+ ..Default::default()
+ },
+ Some(tree_sitter_rust::language()),
+ )
+ .with_outline_query(
+ "
+ (function_item name: (_) @name) @item
+ (impl_item type: (_) @name) @item
+ (struct_item name: (_) @name) @item
+ (field_declaration name: (_) @name) @item
+ ",
+ )
+ .unwrap()
}
}