similar_snippets.rs

  1use language::BufferSnapshot;
  2use std::{borrow::Cow, ops::Range, sync::Arc};
  3use text::ToOffset;
  4use util::RangeExt as _;
  5
  6use crate::{
  7    OccurrenceSource, Occurrences, SlidingWindow,
  8    excerpt::{next_line_start, previous_line_start},
  9};
 10
 11// TODO:
 12//
 13// * Use Tree-sitter nodes to reduce the number of windows considered / make snippet boundaries more
 14// sensible.
 15//
 16// * Also use adjacent_occurrences.
 17//
 18// * Use multiple window sizes, or somehow otherwise allow finding high quality smaller snippets.
 19
 20// Potential future enhancements / things to consider:
 21//
 22// * Use edit history info? Exclude snippets that are part of edit history?
 23//
 24// * Tokenizer that includes symbols
 25//
 26// * Accumulate `Occurrences` for the whole buffer, for BM25/TF-IDF, where the buffer is the corpus.
 27//
 28// * Configurable ratio of forward scan bytes when limit is reached
 29
 30#[derive(Clone, Debug)]
 31pub struct SimilarSnippet {
 32    pub score: f32,
 33    pub range: Range<usize>,
 34    pub text: Arc<str>,
 35}
 36
 37#[derive(Clone, Copy, Debug, PartialEq)]
 38pub enum SimilarityMetric {
 39    Jaccard,
 40    OverlapCoefficient,
 41}
 42
 43#[derive(Clone, Debug, PartialEq)]
 44pub struct SimilarSnippetOptions {
 45    pub min_bytes: usize,
 46    pub max_bytes: usize,
 47    pub min_bytes_delta_since_last_window: usize,
 48    pub min_similarity: f32,
 49    pub max_scan_bytes: usize,
 50    pub similarity_metric: SimilarityMetric,
 51    pub max_result_count: usize,
 52}
 53
 54impl SimilarSnippetOptions {
 55    pub const DEFAULT: Self = Self {
 56        min_bytes: 128,
 57        max_bytes: 256,
 58        min_bytes_delta_since_last_window: 64,
 59        min_similarity: 0.05,
 60        max_scan_bytes: 512 * 1024,
 61        similarity_metric: SimilarityMetric::OverlapCoefficient,
 62        max_result_count: 5,
 63    };
 64}
 65
 66impl Default for SimilarSnippetOptions {
 67    fn default() -> Self {
 68        Self::DEFAULT
 69    }
 70}
 71
 72pub fn similar_snippets<S: OccurrenceSource>(
 73    target: &Occurrences<S>,
 74    excerpt_range: Range<usize>,
 75    buffer: &BufferSnapshot,
 76    options: &SimilarSnippetOptions,
 77) -> Vec<SimilarSnippet> {
 78    let mut backward_range = 0..excerpt_range.start;
 79    let mut forward_range = excerpt_range.end..buffer.len();
 80    let full_scan_bytes = backward_range.len() + forward_range.len();
 81    if full_scan_bytes > options.max_scan_bytes {
 82        let backward_start = next_line_start(
 83            excerpt_range
 84                .start
 85                .saturating_sub(options.max_scan_bytes / 2),
 86            buffer,
 87        )
 88        .to_offset(buffer);
 89        backward_range = backward_start..excerpt_range.start;
 90
 91        let remaining_bytes = options.max_scan_bytes - backward_range.len();
 92        let forward_end =
 93            previous_line_start(excerpt_range.end + remaining_bytes, buffer).to_offset(buffer);
 94        forward_range = excerpt_range.end..forward_end;
 95    }
 96
 97    let mut window = SlidingWindow::with_capacity(target, 16);
 98    let mut top_windows: Vec<TopWindow> = Vec::new();
 99    add_similar_snippets_in_range(
100        forward_range,
101        buffer,
102        options,
103        &mut window,
104        &mut top_windows,
105    );
106    window.clear();
107    add_similar_snippets_in_range(
108        backward_range,
109        buffer,
110        options,
111        &mut window,
112        &mut top_windows,
113    );
114
115    top_windows
116        .into_iter()
117        .map(|window| SimilarSnippet {
118            score: window.similarity,
119            range: window.range.clone(),
120            text: buffer
121                .text_for_range(window.range)
122                .collect::<Cow<str>>()
123                .into(),
124        })
125        .collect()
126}
127
128fn add_similar_snippets_in_range<S: OccurrenceSource>(
129    range: Range<usize>,
130    buffer: &BufferSnapshot,
131    options: &SimilarSnippetOptions,
132    window: &mut SlidingWindow<usize, &Occurrences<S>, S>,
133    top_windows: &mut Vec<TopWindow>,
134) {
135    let mut bytes = 0;
136    let mut bytes_delta_since_last_window = usize::MAX;
137    let mut start_offset = range.start;
138    // TODO: This could be more efficient if there was a way to iterate the chunks within lines. The
139    // chunk bytes can be iterated and provided to occurrences_in_utf8_bytes.
140    let mut lines = buffer.text_for_range(range).lines();
141    while let Some(line) = lines.next() {
142        let len_with_newline = line.len() + 1;
143        bytes += len_with_newline;
144        if len_with_newline > options.max_bytes {
145            window.clear();
146            bytes_delta_since_last_window = usize::MAX;
147            bytes = 0;
148            continue;
149        }
150        bytes_delta_since_last_window =
151            bytes_delta_since_last_window.saturating_add(len_with_newline);
152        window.push_back(len_with_newline, S::occurrences_in_str(line));
153        while bytes > options.max_bytes {
154            let popped_bytes = window.pop_front();
155            bytes -= popped_bytes;
156            bytes_delta_since_last_window =
157                bytes_delta_since_last_window.saturating_add(popped_bytes);
158            start_offset += popped_bytes;
159        }
160        if bytes >= options.min_bytes
161            && bytes_delta_since_last_window >= options.min_bytes_delta_since_last_window
162        {
163            let similarity = match options.similarity_metric {
164                SimilarityMetric::Jaccard => window.weighted_jaccard_similarity(),
165                SimilarityMetric::OverlapCoefficient => window.weighted_overlap_coefficient(),
166            };
167
168            if similarity > options.min_similarity {
169                insert_into_top_windows(
170                    similarity,
171                    start_offset..start_offset + bytes,
172                    options,
173                    top_windows,
174                );
175            }
176            bytes_delta_since_last_window = 0;
177        }
178    }
179}
180
181struct TopWindow {
182    similarity: f32,
183    range: Range<usize>,
184}
185
186/// Maintains the sort order of `top_windows`. If it overlaps with an existing window that has a
187/// lower similarity score, that window is removed.
188fn insert_into_top_windows(
189    similarity: f32,
190    range: Range<usize>,
191    options: &SimilarSnippetOptions,
192    top_windows: &mut Vec<TopWindow>,
193) {
194    if top_windows.len() >= options.max_result_count
195        && let Some(min_top_window) = top_windows.last()
196        && similarity <= min_top_window.similarity
197    {
198        return;
199    }
200
201    let mut ix = 0;
202    let mut inserted = false;
203    while ix < top_windows.len() {
204        if top_windows[ix].similarity < similarity {
205            let new_top_window = TopWindow {
206                similarity,
207                range: range.clone(),
208            };
209            if top_windows[ix].range.overlaps(&range) {
210                top_windows[ix] = new_top_window;
211                return;
212            } else {
213                top_windows.insert(ix, new_top_window);
214                ix += 1;
215                inserted = true;
216                break;
217            }
218        } else {
219            if top_windows[ix].range.overlaps(&range) {
220                return;
221            }
222        }
223        ix += 1;
224    }
225
226    if inserted {
227        for ix in ix..top_windows.len() {
228            if top_windows[ix].range.overlaps(&range) {
229                top_windows.remove(ix);
230                return;
231            }
232        }
233        if top_windows.len() > options.max_result_count {
234            top_windows.pop();
235        }
236    } else if top_windows.len() < options.max_result_count {
237        top_windows.push(TopWindow { similarity, range });
238    }
239}