1use language::BufferSnapshot;
2use std::{borrow::Cow, ops::Range, sync::Arc};
3use text::ToOffset;
4use util::RangeExt as _;
5
6use crate::{
7 OccurrenceSource, Occurrences, SlidingWindow,
8 excerpt::{next_line_start, previous_line_start},
9};
10
11// TODO:
12//
13// * Use Tree-sitter nodes to reduce the number of windows considered / make snippet boundaries more
14// sensible.
15//
16// * Also use adjacent_occurrences.
17//
18// * Use multiple window sizes, or somehow otherwise allow finding high quality smaller snippets.
19
20// Potential future enhancements / things to consider:
21//
22// * Use edit history info? Exclude snippets that are part of edit history?
23//
24// * Tokenizer that includes symbols
25//
26// * Accumulate `Occurrences` for the whole buffer, for BM25/TF-IDF, where the buffer is the corpus.
27//
28// * Configurable ratio of forward scan bytes when limit is reached
29
30#[derive(Clone, Debug)]
31pub struct SimilarSnippet {
32 pub score: f32,
33 pub range: Range<usize>,
34 pub text: Arc<str>,
35}
36
37#[derive(Clone, Copy, Debug, PartialEq)]
38pub enum SimilarityMetric {
39 Jaccard,
40 OverlapCoefficient,
41}
42
43#[derive(Clone, Debug, PartialEq)]
44pub struct SimilarSnippetOptions {
45 pub min_bytes: usize,
46 pub max_bytes: usize,
47 pub min_bytes_delta_since_last_window: usize,
48 pub min_similarity: f32,
49 pub max_scan_bytes: usize,
50 pub similarity_metric: SimilarityMetric,
51 pub max_result_count: usize,
52}
53
54impl SimilarSnippetOptions {
55 pub const DEFAULT: Self = Self {
56 min_bytes: 128,
57 max_bytes: 256,
58 min_bytes_delta_since_last_window: 64,
59 min_similarity: 0.05,
60 max_scan_bytes: 512 * 1024,
61 similarity_metric: SimilarityMetric::OverlapCoefficient,
62 max_result_count: 5,
63 };
64}
65
66impl Default for SimilarSnippetOptions {
67 fn default() -> Self {
68 Self::DEFAULT
69 }
70}
71
72pub fn similar_snippets<S: OccurrenceSource>(
73 target: &Occurrences<S>,
74 excerpt_range: Range<usize>,
75 buffer: &BufferSnapshot,
76 options: &SimilarSnippetOptions,
77) -> Vec<SimilarSnippet> {
78 let mut backward_range = 0..excerpt_range.start;
79 let mut forward_range = excerpt_range.end..buffer.len();
80 let full_scan_bytes = backward_range.len() + forward_range.len();
81 if full_scan_bytes > options.max_scan_bytes {
82 let backward_start = next_line_start(
83 excerpt_range
84 .start
85 .saturating_sub(options.max_scan_bytes / 2),
86 buffer,
87 )
88 .to_offset(buffer);
89 backward_range = backward_start..excerpt_range.start;
90
91 let remaining_bytes = options.max_scan_bytes - backward_range.len();
92 let forward_end =
93 previous_line_start(excerpt_range.end + remaining_bytes, buffer).to_offset(buffer);
94 forward_range = excerpt_range.end..forward_end;
95 }
96
97 let mut window = SlidingWindow::with_capacity(target, 16);
98 let mut top_windows: Vec<TopWindow> = Vec::new();
99 add_similar_snippets_in_range(
100 forward_range,
101 buffer,
102 options,
103 &mut window,
104 &mut top_windows,
105 );
106 window.clear();
107 add_similar_snippets_in_range(
108 backward_range,
109 buffer,
110 options,
111 &mut window,
112 &mut top_windows,
113 );
114
115 top_windows
116 .into_iter()
117 .map(|window| SimilarSnippet {
118 score: window.similarity,
119 range: window.range.clone(),
120 text: buffer
121 .text_for_range(window.range)
122 .collect::<Cow<str>>()
123 .into(),
124 })
125 .collect()
126}
127
128fn add_similar_snippets_in_range<S: OccurrenceSource>(
129 range: Range<usize>,
130 buffer: &BufferSnapshot,
131 options: &SimilarSnippetOptions,
132 window: &mut SlidingWindow<usize, &Occurrences<S>, S>,
133 top_windows: &mut Vec<TopWindow>,
134) {
135 let mut bytes = 0;
136 let mut bytes_delta_since_last_window = usize::MAX;
137 let mut start_offset = range.start;
138 // TODO: This could be more efficient if there was a way to iterate the chunks within lines. The
139 // chunk bytes can be iterated and provided to occurrences_in_utf8_bytes.
140 let mut lines = buffer.text_for_range(range).lines();
141 while let Some(line) = lines.next() {
142 let len_with_newline = line.len() + 1;
143 bytes += len_with_newline;
144 if len_with_newline > options.max_bytes {
145 window.clear();
146 bytes_delta_since_last_window = usize::MAX;
147 bytes = 0;
148 continue;
149 }
150 bytes_delta_since_last_window =
151 bytes_delta_since_last_window.saturating_add(len_with_newline);
152 window.push_back(len_with_newline, S::occurrences_in_str(line));
153 while bytes > options.max_bytes {
154 let popped_bytes = window.pop_front();
155 bytes -= popped_bytes;
156 bytes_delta_since_last_window =
157 bytes_delta_since_last_window.saturating_add(popped_bytes);
158 start_offset += popped_bytes;
159 }
160 if bytes >= options.min_bytes
161 && bytes_delta_since_last_window >= options.min_bytes_delta_since_last_window
162 {
163 let similarity = match options.similarity_metric {
164 SimilarityMetric::Jaccard => window.weighted_jaccard_similarity(),
165 SimilarityMetric::OverlapCoefficient => window.weighted_overlap_coefficient(),
166 };
167
168 if similarity > options.min_similarity {
169 insert_into_top_windows(
170 similarity,
171 start_offset..start_offset + bytes,
172 options,
173 top_windows,
174 );
175 }
176 bytes_delta_since_last_window = 0;
177 }
178 }
179}
180
181struct TopWindow {
182 similarity: f32,
183 range: Range<usize>,
184}
185
186/// Maintains the sort order of `top_windows`. If it overlaps with an existing window that has a
187/// lower similarity score, that window is removed.
188fn insert_into_top_windows(
189 similarity: f32,
190 range: Range<usize>,
191 options: &SimilarSnippetOptions,
192 top_windows: &mut Vec<TopWindow>,
193) {
194 if top_windows.len() >= options.max_result_count
195 && let Some(min_top_window) = top_windows.last()
196 && similarity <= min_top_window.similarity
197 {
198 return;
199 }
200
201 let mut ix = 0;
202 let mut inserted = false;
203 while ix < top_windows.len() {
204 if top_windows[ix].similarity < similarity {
205 let new_top_window = TopWindow {
206 similarity,
207 range: range.clone(),
208 };
209 if top_windows[ix].range.overlaps(&range) {
210 top_windows[ix] = new_top_window;
211 return;
212 } else {
213 top_windows.insert(ix, new_top_window);
214 ix += 1;
215 inserted = true;
216 break;
217 }
218 } else {
219 if top_windows[ix].range.overlaps(&range) {
220 return;
221 }
222 }
223 ix += 1;
224 }
225
226 if inserted {
227 for ix in ix..top_windows.len() {
228 if top_windows[ix].range.overlaps(&range) {
229 top_windows.remove(ix);
230 return;
231 }
232 }
233 if top_windows.len() > options.max_result_count {
234 top_windows.pop();
235 }
236 } else if top_windows.len() < options.max_result_count {
237 top_windows.push(TopWindow { similarity, range });
238 }
239}