scored_declaration.rs

  1use itertools::Itertools as _;
  2use serde::Serialize;
  3use std::collections::HashMap;
  4use std::ops::Range;
  5use std::path::Path;
  6use std::sync::Arc;
  7use strum::EnumIter;
  8use tree_sitter::{QueryCursor, StreamingIterator, Tree};
  9
 10use crate::{Declaration, outline::Identifier};
 11
 12#[derive(Clone, Debug)]
 13pub struct ScoredSnippet {
 14    #[allow(dead_code)]
 15    pub identifier: Identifier,
 16    pub definition_file: Arc<Path>,
 17    pub definition: OutlineItem,
 18    pub score_components: ScoreInputs,
 19    pub scores: Scores,
 20}
 21
 22// TODO: Consider having "Concise" style corresponding to `concise_text`
 23#[derive(EnumIter, Clone, Copy, PartialEq, Eq, Hash, Debug)]
 24pub enum SnippetStyle {
 25    Signature,
 26    Definition,
 27}
 28
 29impl ScoredSnippet {
 30    /// Returns the score for this snippet with the specified style.
 31    pub fn score(&self, style: SnippetStyle) -> f32 {
 32        match style {
 33            SnippetStyle::Signature => self.scores.signature,
 34            SnippetStyle::Definition => self.scores.definition,
 35        }
 36    }
 37
 38    /// Returns the byte range for the snippet with the specified style. For `Signature` this is the
 39    /// signature_range expanded to line boundaries. For `Definition` this is the item_range expanded to
 40    /// line boundaries (similar to slice_at_line_boundaries).
 41    pub fn line_range(
 42        &self,
 43        identifier_index: &IdentifierIndex,
 44        style: SnippetStyle,
 45    ) -> Range<usize> {
 46        let source = identifier_index
 47            .path_to_source
 48            .get(&self.definition_file)
 49            .unwrap();
 50
 51        let base_range = match style {
 52            SnippetStyle::Signature => self.definition.signature_range.clone(),
 53            SnippetStyle::Definition => self.definition.item_range.clone(),
 54        };
 55
 56        expand_range_to_line_boundaries(source, base_range)
 57    }
 58
 59    pub fn score_density(&self, identifier_index: &IdentifierIndex, style: SnippetStyle) -> f32 {
 60        self.score(style) / range_size(self.line_range(identifier_index, style)) as f32
 61    }
 62}
 63
 64fn scored_snippets(
 65    language: &Language,
 66    index: &IdentifierIndex,
 67    source: &str,
 68    reference_file: &Path,
 69    references: Vec<Reference>,
 70    cursor_offset: usize,
 71    excerpt_range: Range<usize>,
 72) -> Vec<ScoredSnippet> {
 73    let cursor = point_from_offset(source, cursor_offset);
 74
 75    let containing_range_identifier_occurrences =
 76        IdentifierOccurrences::within_string(&source[excerpt_range.clone()]);
 77
 78    let start_point = Point::new(cursor.row.saturating_sub(2), 0);
 79    let end_point = Point::new(cursor.row + 1, 0);
 80    let adjacent_identifier_occurrences = IdentifierOccurrences::within_string(
 81        &source[offset_from_point(source, start_point)..offset_from_point(source, end_point)],
 82    );
 83
 84    let mut identifier_to_references: HashMap<Identifier, Vec<Reference>> = HashMap::new();
 85    for reference in references {
 86        identifier_to_references
 87            .entry(reference.identifier.clone())
 88            .or_insert_with(Vec::new)
 89            .push(reference);
 90    }
 91
 92    identifier_to_references
 93        .into_iter()
 94        .flat_map(|(identifier, references)| {
 95            let Some(definitions) = index
 96                .identifier_to_definitions
 97                .get(&(identifier.clone(), language.name.clone()))
 98            else {
 99                return Vec::new();
100            };
101            let definition_count = definitions.len();
102            let definition_file_count = definitions.keys().len();
103
104            definitions
105                .iter_all()
106                .flat_map(|(definition_file, file_definitions)| {
107                    let same_file_definition_count = file_definitions.len();
108                    let is_same_file = reference_file == definition_file.as_ref();
109                    file_definitions
110                        .iter()
111                        .filter(|definition| {
112                            !is_same_file
113                                || !range_intersection(&definition.item_range, &excerpt_range)
114                                    .is_some()
115                        })
116                        .filter_map(|definition| {
117                            let definition_line_distance = if is_same_file {
118                                let definition_line =
119                                    point_from_offset(source, definition.item_range.start).row;
120                                (cursor.row as i32 - definition_line as i32).abs() as u32
121                            } else {
122                                0
123                            };
124                            Some((definition_line_distance, definition))
125                        })
126                        .sorted_by_key(|&(distance, _)| distance)
127                        .enumerate()
128                        .map(
129                            |(
130                                definition_line_distance_rank,
131                                (definition_line_distance, definition),
132                            )| {
133                                score_snippet(
134                                    index,
135                                    source,
136                                    &identifier,
137                                    &references,
138                                    definition_file.clone(),
139                                    definition.clone(),
140                                    is_same_file,
141                                    definition_line_distance,
142                                    definition_line_distance_rank,
143                                    same_file_definition_count,
144                                    definition_count,
145                                    definition_file_count,
146                                    &containing_range_identifier_occurrences,
147                                    &adjacent_identifier_occurrences,
148                                    cursor,
149                                )
150                            },
151                        )
152                        .collect::<Vec<_>>()
153                })
154                .collect::<Vec<_>>()
155        })
156        .flatten()
157        .collect::<Vec<_>>()
158}
159
160fn score_snippet(
161    index: &IdentifierIndex,
162    reference_source: &str,
163    identifier: &Identifier,
164    references: &Vec<Reference>,
165    definition_file: Arc<Path>,
166    definition: OutlineItem,
167    is_same_file: bool,
168    definition_line_distance: u32,
169    definition_line_distance_rank: usize,
170    same_file_definition_count: usize,
171    definition_count: usize,
172    definition_file_count: usize,
173    containing_range_identifier_occurrences: &IdentifierOccurrences,
174    adjacent_identifier_occurrences: &IdentifierOccurrences,
175    cursor: Point,
176) -> Option<ScoredSnippet> {
177    let is_referenced_nearby = references
178        .iter()
179        .any(|r| r.reference_region == ReferenceRegion::Nearby);
180    let is_referenced_in_breadcrumb = references
181        .iter()
182        .any(|r| r.reference_region == ReferenceRegion::Breadcrumb);
183    let reference_count = references.len();
184    let reference_line_distance = references
185        .iter()
186        .map(|r| {
187            let reference_line = point_from_offset(reference_source, r.range.start).row as i32;
188            (cursor.row as i32 - reference_line).abs() as u32
189        })
190        .min()
191        .unwrap();
192
193    let definition_source = index.path_to_source.get(&definition_file).unwrap();
194    let item_source_occurrences =
195        IdentifierOccurrences::within_string(definition.item(&definition_source));
196    let item_signature_occurrences =
197        IdentifierOccurrences::within_string(definition.signature(&definition_source));
198    let containing_range_vs_item_jaccard = jaccard_similarity(
199        containing_range_identifier_occurrences,
200        &item_source_occurrences,
201    );
202    let containing_range_vs_signature_jaccard = jaccard_similarity(
203        containing_range_identifier_occurrences,
204        &item_signature_occurrences,
205    );
206    let adjacent_vs_item_jaccard =
207        jaccard_similarity(adjacent_identifier_occurrences, &item_source_occurrences);
208    let adjacent_vs_signature_jaccard =
209        jaccard_similarity(adjacent_identifier_occurrences, &item_signature_occurrences);
210
211    let containing_range_vs_item_weighted_overlap = weighted_overlap_coefficient(
212        containing_range_identifier_occurrences,
213        &item_source_occurrences,
214    );
215    let containing_range_vs_signature_weighted_overlap = weighted_overlap_coefficient(
216        containing_range_identifier_occurrences,
217        &item_signature_occurrences,
218    );
219    let adjacent_vs_item_weighted_overlap =
220        weighted_overlap_coefficient(adjacent_identifier_occurrences, &item_source_occurrences);
221    let adjacent_vs_signature_weighted_overlap =
222        weighted_overlap_coefficient(adjacent_identifier_occurrences, &item_signature_occurrences);
223
224    let score_components = ScoreInputs {
225        is_same_file,
226        is_referenced_nearby,
227        is_referenced_in_breadcrumb,
228        reference_line_distance,
229        definition_line_distance,
230        definition_line_distance_rank,
231        reference_count,
232        same_file_definition_count,
233        definition_count,
234        definition_file_count,
235        containing_range_vs_item_jaccard,
236        containing_range_vs_signature_jaccard,
237        adjacent_vs_item_jaccard,
238        adjacent_vs_signature_jaccard,
239        containing_range_vs_item_weighted_overlap,
240        containing_range_vs_signature_weighted_overlap,
241        adjacent_vs_item_weighted_overlap,
242        adjacent_vs_signature_weighted_overlap,
243    };
244
245    Some(ScoredSnippet {
246        identifier: identifier.clone(),
247        definition_file,
248        definition,
249        scores: score_components.score(),
250        score_components,
251    })
252}
253
254#[derive(Clone, Debug, Serialize)]
255pub struct ScoreInputs {
256    pub is_same_file: bool,
257    pub is_referenced_nearby: bool,
258    pub is_referenced_in_breadcrumb: bool,
259    pub reference_count: usize,
260    pub same_file_definition_count: usize,
261    pub definition_count: usize,
262    pub definition_file_count: usize,
263    pub reference_line_distance: u32,
264    pub definition_line_distance: u32,
265    pub definition_line_distance_rank: usize,
266    pub containing_range_vs_item_jaccard: f32,
267    pub containing_range_vs_signature_jaccard: f32,
268    pub adjacent_vs_item_jaccard: f32,
269    pub adjacent_vs_signature_jaccard: f32,
270    pub containing_range_vs_item_weighted_overlap: f32,
271    pub containing_range_vs_signature_weighted_overlap: f32,
272    pub adjacent_vs_item_weighted_overlap: f32,
273    pub adjacent_vs_signature_weighted_overlap: f32,
274}
275
276#[derive(Clone, Debug, Serialize)]
277pub struct Scores {
278    pub signature: f32,
279    pub definition: f32,
280}
281
282impl ScoreInputs {
283    fn score(&self) -> Scores {
284        // Score related to how likely this is the correct definition, range 0 to 1
285        let accuracy_score = if self.is_same_file {
286            // TODO: use definition_line_distance_rank
287            (0.5 / self.same_file_definition_count as f32)
288                + (0.5 / self.definition_file_count as f32)
289        } else {
290            1.0 / self.definition_count as f32
291        };
292
293        // Score related to the distance between the reference and cursor, range 0 to 1
294        let distance_score = if self.is_referenced_nearby {
295            1.0 / (1.0 + self.reference_line_distance as f32 / 10.0).powf(2.0)
296        } else {
297            // same score as ~14 lines away, rationale is to not overly penalize references from parent signatures
298            0.5
299        };
300
301        // For now instead of linear combination, the scores are just multiplied together.
302        let combined_score = 10.0 * accuracy_score * distance_score;
303
304        Scores {
305            signature: combined_score * self.containing_range_vs_signature_weighted_overlap,
306            // definition score gets boosted both by being multipled by 2 and by there being more
307            // weighted overlap.
308            definition: 2.0 * combined_score * self.containing_range_vs_item_weighted_overlap,
309        }
310    }
311}