1use ::util::rel_path::RelPath;
2use ::util::{RangeExt, ResultExt as _};
3use anyhow::{Context as _, Result};
4use cloud_llm_client::predict_edits_v3::DeclarationScoreComponents;
5use edit_prediction_context::{
6 Declaration, DeclarationStyle, EditPredictionContext, EditPredictionContextOptions, Identifier,
7 Imports, Reference, ReferenceRegion, SyntaxIndex, SyntaxIndexState, references_in_range,
8};
9use futures::StreamExt as _;
10use futures::channel::mpsc;
11use gpui::Entity;
12use gpui::{AppContext, AsyncApp};
13use language::OffsetRangeExt;
14use language::{BufferSnapshot, Point};
15use ordered_float::OrderedFloat;
16use polars::prelude::*;
17use project::{Project, ProjectEntryId, ProjectPath, Worktree};
18use serde::{Deserialize, Serialize};
19use std::fs;
20use std::{
21 cmp::Reverse,
22 collections::{HashMap, HashSet},
23 fs::File,
24 hash::{Hash, Hasher},
25 io::{BufRead, BufReader, BufWriter, Write as _},
26 ops::Range,
27 path::{Path, PathBuf},
28 sync::{
29 Arc,
30 atomic::{self, AtomicUsize},
31 },
32 time::Duration,
33};
34use util::paths::PathStyle;
35use zeta2::ContextMode;
36
37use crate::headless::ZetaCliAppState;
38use crate::source_location::SourceLocation;
39use crate::util::{open_buffer, open_buffer_with_language_server};
40
41pub async fn retrieval_stats(
42 worktree: PathBuf,
43 app_state: Arc<ZetaCliAppState>,
44 only_extension: Option<String>,
45 file_limit: Option<usize>,
46 skip_files: Option<usize>,
47 options: zeta2::ZetaOptions,
48 cx: &mut AsyncApp,
49) -> Result<String> {
50 let ContextMode::Syntax(context_options) = options.context.clone() else {
51 anyhow::bail!("retrieval stats only works in ContextMode::Syntax");
52 };
53
54 let options = Arc::new(options);
55 let worktree_path = worktree.canonicalize()?;
56
57 let project = cx.update(|cx| {
58 Project::local(
59 app_state.client.clone(),
60 app_state.node_runtime.clone(),
61 app_state.user_store.clone(),
62 app_state.languages.clone(),
63 app_state.fs.clone(),
64 None,
65 cx,
66 )
67 })?;
68
69 let worktree = project
70 .update(cx, |project, cx| {
71 project.create_worktree(&worktree_path, true, cx)
72 })?
73 .await?;
74
75 // wait for worktree scan so that wait_for_initial_file_indexing waits for the whole worktree.
76 worktree
77 .read_with(cx, |worktree, _cx| {
78 worktree.as_local().unwrap().scan_complete()
79 })?
80 .await;
81
82 let index = cx.new(|cx| SyntaxIndex::new(&project, options.file_indexing_parallelism, cx))?;
83 index
84 .read_with(cx, |index, cx| index.wait_for_initial_file_indexing(cx))?
85 .await?;
86 let indexed_files = index
87 .read_with(cx, |index, cx| index.indexed_file_paths(cx))?
88 .await;
89 let mut filtered_files = indexed_files
90 .into_iter()
91 .filter(|project_path| {
92 let file_extension = project_path.path.extension();
93 if let Some(only_extension) = only_extension.as_ref() {
94 file_extension.is_some_and(|extension| extension == only_extension)
95 } else {
96 file_extension
97 .is_some_and(|extension| !["md", "json", "sh", "diff"].contains(&extension))
98 }
99 })
100 .collect::<Vec<_>>();
101 filtered_files.sort_by(|a, b| a.path.cmp(&b.path));
102
103 let index_state = index.read_with(cx, |index, _cx| index.state().clone())?;
104 cx.update(|_| {
105 drop(index);
106 })?;
107 let index_state = Arc::new(
108 Arc::into_inner(index_state)
109 .context("Index state had more than 1 reference")?
110 .into_inner(),
111 );
112
113 struct FileSnapshot {
114 project_entry_id: ProjectEntryId,
115 snapshot: BufferSnapshot,
116 hash: u64,
117 parent_abs_path: Arc<Path>,
118 }
119
120 let files: Vec<FileSnapshot> = futures::future::try_join_all({
121 filtered_files
122 .iter()
123 .map(|file| {
124 let buffer_task =
125 open_buffer(project.clone(), worktree.clone(), file.path.clone(), cx);
126 cx.spawn(async move |cx| {
127 let buffer = buffer_task.await?;
128 let (project_entry_id, parent_abs_path, snapshot) =
129 buffer.read_with(cx, |buffer, cx| {
130 let file = project::File::from_dyn(buffer.file()).unwrap();
131 let project_entry_id = file.project_entry_id().unwrap();
132 let mut parent_abs_path = file.worktree.read(cx).absolutize(&file.path);
133 if !parent_abs_path.pop() {
134 panic!("Invalid worktree path");
135 }
136
137 (project_entry_id, parent_abs_path, buffer.snapshot())
138 })?;
139
140 anyhow::Ok(
141 cx.background_spawn(async move {
142 let mut hasher = collections::FxHasher::default();
143 snapshot.text().hash(&mut hasher);
144 FileSnapshot {
145 project_entry_id,
146 snapshot,
147 hash: hasher.finish(),
148 parent_abs_path: parent_abs_path.into(),
149 }
150 })
151 .await,
152 )
153 })
154 })
155 .collect::<Vec<_>>()
156 })
157 .await?;
158
159 let mut file_snapshots = HashMap::default();
160 let mut hasher = collections::FxHasher::default();
161 for FileSnapshot {
162 project_entry_id,
163 snapshot,
164 hash,
165 ..
166 } in &files
167 {
168 file_snapshots.insert(*project_entry_id, snapshot.clone());
169 hash.hash(&mut hasher);
170 }
171 let files_hash = hasher.finish();
172 let file_snapshots = Arc::new(file_snapshots);
173 let target_cli_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../target/zeta_cli");
174 fs::create_dir_all(&target_cli_dir).unwrap();
175 let target_cli_dir = target_cli_dir.canonicalize().unwrap();
176
177 let lsp_cache_dir = target_cli_dir.join("cache");
178 fs::create_dir_all(&lsp_cache_dir).unwrap();
179
180 let lsp_definitions_path = lsp_cache_dir.join(format!(
181 "{}-{:x}.jsonl",
182 worktree_path.file_stem().unwrap_or_default().display(),
183 files_hash
184 ));
185
186 let mut lsp_definitions = HashMap::default();
187 let mut lsp_files = 0;
188
189 if fs::exists(&lsp_definitions_path)? {
190 log::info!(
191 "Using cached LSP definitions from {}",
192 lsp_definitions_path.display()
193 );
194
195 let file = File::options()
196 .read(true)
197 .write(true)
198 .open(&lsp_definitions_path)?;
199 let lines = BufReader::new(&file).lines();
200 let mut valid_len: usize = 0;
201
202 for (line, expected_file) in lines.zip(files.iter()) {
203 let line = line?;
204 let FileLspDefinitions { path, references } = match serde_json::from_str(&line) {
205 Ok(ok) => ok,
206 Err(_) => {
207 log::error!("Found invalid cache line. Truncating to #{lsp_files}.",);
208 file.set_len(valid_len as u64)?;
209 break;
210 }
211 };
212 let expected_path = expected_file.snapshot.file().unwrap().path().as_unix_str();
213 if expected_path != path.as_ref() {
214 log::error!(
215 "Expected file #{} to be {expected_path}, but found {path}. Truncating to #{lsp_files}.",
216 lsp_files + 1
217 );
218 file.set_len(valid_len as u64)?;
219 break;
220 }
221 for (point, ranges) in references {
222 let Ok(path) = RelPath::new(Path::new(path.as_ref()), PathStyle::Posix) else {
223 log::warn!("Invalid path: {}", path);
224 continue;
225 };
226 lsp_definitions.insert(
227 SourceLocation {
228 path: path.into_arc(),
229 point: point.into(),
230 },
231 ranges,
232 );
233 }
234 lsp_files += 1;
235 valid_len += line.len() + 1
236 }
237 }
238
239 if lsp_files < files.len() {
240 if lsp_files == 0 {
241 log::warn!(
242 "No LSP definitions found, populating {}",
243 lsp_definitions_path.display()
244 );
245 } else {
246 log::warn!("{} files missing from LSP cache", files.len() - lsp_files);
247 }
248
249 gather_lsp_definitions(
250 &lsp_definitions_path,
251 lsp_files,
252 &filtered_files,
253 &worktree,
254 &project,
255 &mut lsp_definitions,
256 cx,
257 )
258 .await?;
259 }
260 let files_len = files.len().min(file_limit.unwrap_or(usize::MAX));
261 let done_count = Arc::new(AtomicUsize::new(0));
262
263 let (output_tx, output_rx) = mpsc::unbounded::<ReferenceRetrievalResult>();
264
265 let tasks = files
266 .into_iter()
267 .skip(skip_files.unwrap_or(0))
268 .take(file_limit.unwrap_or(usize::MAX))
269 .map(|project_file| {
270 let index_state = index_state.clone();
271 let lsp_definitions = lsp_definitions.clone();
272 let output_tx = output_tx.clone();
273 let done_count = done_count.clone();
274 let file_snapshots = file_snapshots.clone();
275 let context_options = context_options.clone();
276 cx.background_spawn(async move {
277 let snapshot = project_file.snapshot;
278
279 let full_range = 0..snapshot.len();
280 let references = references_in_range(
281 full_range,
282 &snapshot.text(),
283 ReferenceRegion::Nearby,
284 &snapshot,
285 );
286
287 let imports = if context_options.use_imports {
288 Imports::gather(&snapshot, Some(&project_file.parent_abs_path))
289 } else {
290 Imports::default()
291 };
292
293 let path = snapshot.file().unwrap().path();
294
295 for reference in references {
296 let query_point = snapshot.offset_to_point(reference.range.start);
297 let source_location = SourceLocation {
298 path: path.clone(),
299 point: query_point,
300 };
301 let lsp_definitions = lsp_definitions
302 .get(&source_location)
303 .cloned()
304 .unwrap_or_else(|| {
305 log::warn!(
306 "No definitions found for source location: {:?}",
307 source_location
308 );
309 Vec::new()
310 });
311
312 let retrieve_result = retrieve_definitions(
313 &reference,
314 &imports,
315 query_point,
316 &snapshot,
317 &index_state,
318 &file_snapshots,
319 &context_options,
320 )
321 .await?;
322
323 let result = ReferenceRetrievalResult {
324 cursor_path: path.clone(),
325 identifier: reference.identifier,
326 cursor_point: query_point,
327 lsp_definitions,
328 retrieved_definitions: retrieve_result.definitions,
329 excerpt_range: retrieve_result.excerpt_range,
330 };
331
332 output_tx.unbounded_send(result).ok();
333 }
334
335 println!(
336 "{:02}/{:02} done",
337 done_count.fetch_add(1, atomic::Ordering::Relaxed) + 1,
338 files_len,
339 );
340
341 anyhow::Ok(())
342 })
343 })
344 .collect::<Vec<_>>();
345
346 drop(output_tx);
347
348 let df_task = cx.background_spawn(build_dataframe(output_rx));
349
350 futures::future::try_join_all(tasks).await?;
351 let mut df = df_task.await?;
352
353 let run_id = format!(
354 "{}-{}",
355 worktree_path.file_stem().unwrap_or_default().display(),
356 chrono::Local::now().format("%Y%m%d_%H%M%S")
357 );
358 let run_dir = target_cli_dir.join(run_id);
359 fs::create_dir(&run_dir).unwrap();
360
361 let parquet_path = run_dir.join("stats.parquet");
362 let mut parquet_file = fs::File::create(&parquet_path)?;
363
364 ParquetWriter::new(&mut parquet_file)
365 .finish(&mut df)
366 .unwrap();
367
368 let stats = SummaryStats::from_dataframe(df)?;
369
370 let stats_path = run_dir.join("stats.txt");
371 fs::write(&stats_path, format!("{}", stats))?;
372
373 println!("{}", stats);
374 println!("\nWrote:");
375 println!("- {}", relativize_path(&parquet_path).display());
376 println!("- {}", relativize_path(&stats_path).display());
377 println!("- {}", relativize_path(&lsp_definitions_path).display());
378
379 Ok("".to_string())
380}
381
382async fn build_dataframe(
383 mut output_rx: mpsc::UnboundedReceiver<ReferenceRetrievalResult>,
384) -> Result<DataFrame> {
385 use soa_rs::{Soa, Soars};
386
387 #[derive(Default, Soars)]
388 struct Row {
389 ref_id: u32,
390 cursor_path: String,
391 cursor_row: u32,
392 cursor_column: u32,
393 cursor_identifier: String,
394 gold_in_excerpt: bool,
395 gold_path: String,
396 gold_row: u32,
397 gold_column: u32,
398 gold_is_external: bool,
399 candidate_count: u32,
400 candidate_path: Option<String>,
401 candidate_row: Option<u32>,
402 candidate_column: Option<u32>,
403 candidate_is_gold: Option<bool>,
404 candidate_rank: Option<u32>,
405 candidate_is_same_file: Option<bool>,
406 candidate_is_referenced_nearby: Option<bool>,
407 candidate_is_referenced_in_breadcrumb: Option<bool>,
408 candidate_reference_count: Option<u32>,
409 candidate_same_file_declaration_count: Option<u32>,
410 candidate_declaration_count: Option<u32>,
411 candidate_reference_line_distance: Option<u32>,
412 candidate_declaration_line_distance: Option<u32>,
413 candidate_excerpt_vs_item_jaccard: Option<f32>,
414 candidate_excerpt_vs_signature_jaccard: Option<f32>,
415 candidate_adjacent_vs_item_jaccard: Option<f32>,
416 candidate_adjacent_vs_signature_jaccard: Option<f32>,
417 candidate_excerpt_vs_item_weighted_overlap: Option<f32>,
418 candidate_excerpt_vs_signature_weighted_overlap: Option<f32>,
419 candidate_adjacent_vs_item_weighted_overlap: Option<f32>,
420 candidate_adjacent_vs_signature_weighted_overlap: Option<f32>,
421 candidate_path_import_match_count: Option<u32>,
422 candidate_wildcard_path_import_match_count: Option<u32>,
423 candidate_import_similarity: Option<f32>,
424 candidate_max_import_similarity: Option<f32>,
425 candidate_normalized_import_similarity: Option<f32>,
426 candidate_wildcard_import_similarity: Option<f32>,
427 candidate_normalized_wildcard_import_similarity: Option<f32>,
428 candidate_included_by_others: Option<u32>,
429 candidate_includes_others: Option<u32>,
430 }
431 let mut rows = Soa::<Row>::new();
432 let mut next_ref_id = 0;
433
434 while let Some(result) = output_rx.next().await {
435 let mut gold_is_external = false;
436 let mut gold_in_excerpt = false;
437 let cursor_path = result.cursor_path.as_unix_str();
438 let cursor_row = result.cursor_point.row + 1;
439 let cursor_column = result.cursor_point.column + 1;
440 let cursor_identifier = result.identifier.name.to_string();
441 let ref_id = next_ref_id;
442 next_ref_id += 1;
443
444 for lsp_definition in result.lsp_definitions {
445 let SourceRange {
446 path: gold_path,
447 point_range: gold_point_range,
448 offset_range: gold_offset_range,
449 } = lsp_definition;
450 let lsp_point_range =
451 SerializablePoint::into_language_point_range(gold_point_range.clone());
452
453 gold_is_external = gold_is_external
454 || gold_path.is_absolute()
455 || gold_path
456 .components()
457 .any(|component| component.as_os_str() == "node_modules");
458
459 gold_in_excerpt = gold_in_excerpt
460 || result.excerpt_range.as_ref().is_some_and(|excerpt_range| {
461 excerpt_range.contains_inclusive(&gold_offset_range)
462 });
463
464 let gold_row = gold_point_range.start.row;
465 let gold_column = gold_point_range.start.column;
466 let candidate_count = result.retrieved_definitions.len() as u32;
467
468 for (candidate_rank, retrieved_definition) in
469 result.retrieved_definitions.iter().enumerate()
470 {
471 let candidate_is_gold = gold_path.as_path()
472 == retrieved_definition.path.as_std_path()
473 && retrieved_definition
474 .range
475 .contains_inclusive(&lsp_point_range);
476
477 let candidate_row = retrieved_definition.range.start.row + 1;
478 let candidate_column = retrieved_definition.range.start.column + 1;
479
480 let DeclarationScoreComponents {
481 is_same_file,
482 is_referenced_nearby,
483 is_referenced_in_breadcrumb,
484 reference_count,
485 same_file_declaration_count,
486 declaration_count,
487 reference_line_distance,
488 declaration_line_distance,
489 excerpt_vs_item_jaccard,
490 excerpt_vs_signature_jaccard,
491 adjacent_vs_item_jaccard,
492 adjacent_vs_signature_jaccard,
493 excerpt_vs_item_weighted_overlap,
494 excerpt_vs_signature_weighted_overlap,
495 adjacent_vs_item_weighted_overlap,
496 adjacent_vs_signature_weighted_overlap,
497 path_import_match_count,
498 wildcard_path_import_match_count,
499 import_similarity,
500 max_import_similarity,
501 normalized_import_similarity,
502 wildcard_import_similarity,
503 normalized_wildcard_import_similarity,
504 included_by_others,
505 includes_others,
506 } = retrieved_definition.components;
507
508 rows.push(Row {
509 ref_id,
510 cursor_path: cursor_path.to_string(),
511 cursor_row,
512 cursor_column,
513 cursor_identifier: cursor_identifier.clone(),
514 gold_in_excerpt,
515 gold_path: gold_path.to_string_lossy().to_string(),
516 gold_row,
517 gold_column,
518 gold_is_external,
519 candidate_count,
520 candidate_path: Some(retrieved_definition.path.as_unix_str().to_string()),
521 candidate_row: Some(candidate_row),
522 candidate_column: Some(candidate_column),
523 candidate_is_gold: Some(candidate_is_gold),
524 candidate_rank: Some(candidate_rank as u32),
525 candidate_is_same_file: Some(is_same_file),
526 candidate_is_referenced_nearby: Some(is_referenced_nearby),
527 candidate_is_referenced_in_breadcrumb: Some(is_referenced_in_breadcrumb),
528 candidate_reference_count: Some(reference_count as u32),
529 candidate_same_file_declaration_count: Some(same_file_declaration_count as u32),
530 candidate_declaration_count: Some(declaration_count as u32),
531 candidate_reference_line_distance: Some(reference_line_distance),
532 candidate_declaration_line_distance: Some(declaration_line_distance),
533 candidate_excerpt_vs_item_jaccard: Some(excerpt_vs_item_jaccard),
534 candidate_excerpt_vs_signature_jaccard: Some(excerpt_vs_signature_jaccard),
535 candidate_adjacent_vs_item_jaccard: Some(adjacent_vs_item_jaccard),
536 candidate_adjacent_vs_signature_jaccard: Some(adjacent_vs_signature_jaccard),
537 candidate_excerpt_vs_item_weighted_overlap: Some(
538 excerpt_vs_item_weighted_overlap,
539 ),
540 candidate_excerpt_vs_signature_weighted_overlap: Some(
541 excerpt_vs_signature_weighted_overlap,
542 ),
543 candidate_adjacent_vs_item_weighted_overlap: Some(
544 adjacent_vs_item_weighted_overlap,
545 ),
546 candidate_adjacent_vs_signature_weighted_overlap: Some(
547 adjacent_vs_signature_weighted_overlap,
548 ),
549 candidate_path_import_match_count: Some(path_import_match_count as u32),
550 candidate_wildcard_path_import_match_count: Some(
551 wildcard_path_import_match_count as u32,
552 ),
553 candidate_import_similarity: Some(import_similarity),
554 candidate_max_import_similarity: Some(max_import_similarity),
555 candidate_normalized_import_similarity: Some(normalized_import_similarity),
556 candidate_wildcard_import_similarity: Some(wildcard_import_similarity),
557 candidate_normalized_wildcard_import_similarity: Some(
558 normalized_wildcard_import_similarity,
559 ),
560 candidate_included_by_others: Some(included_by_others as u32),
561 candidate_includes_others: Some(includes_others as u32),
562 });
563 }
564
565 if result.retrieved_definitions.is_empty() {
566 rows.push(Row {
567 ref_id,
568 cursor_path: cursor_path.to_string(),
569 cursor_row,
570 cursor_column,
571 cursor_identifier: cursor_identifier.clone(),
572 gold_in_excerpt,
573 gold_path: gold_path.to_string_lossy().to_string(),
574 gold_row,
575 gold_column,
576 gold_is_external,
577 candidate_count,
578 ..Default::default()
579 });
580 }
581 }
582 }
583 let slices = rows.slices();
584
585 let RowSlices {
586 ref_id,
587 cursor_path,
588 cursor_row,
589 cursor_column,
590 cursor_identifier,
591 gold_in_excerpt,
592 gold_path,
593 gold_row,
594 gold_column,
595 gold_is_external,
596 candidate_path,
597 candidate_row,
598 candidate_column,
599 candidate_is_gold,
600 candidate_rank,
601 candidate_count,
602 candidate_is_same_file,
603 candidate_is_referenced_nearby,
604 candidate_is_referenced_in_breadcrumb,
605 candidate_reference_count,
606 candidate_same_file_declaration_count,
607 candidate_declaration_count,
608 candidate_reference_line_distance,
609 candidate_declaration_line_distance,
610 candidate_excerpt_vs_item_jaccard,
611 candidate_excerpt_vs_signature_jaccard,
612 candidate_adjacent_vs_item_jaccard,
613 candidate_adjacent_vs_signature_jaccard,
614 candidate_excerpt_vs_item_weighted_overlap,
615 candidate_excerpt_vs_signature_weighted_overlap,
616 candidate_adjacent_vs_item_weighted_overlap,
617 candidate_adjacent_vs_signature_weighted_overlap,
618 candidate_path_import_match_count,
619 candidate_wildcard_path_import_match_count,
620 candidate_import_similarity,
621 candidate_max_import_similarity,
622 candidate_normalized_import_similarity,
623 candidate_wildcard_import_similarity,
624 candidate_normalized_wildcard_import_similarity,
625 candidate_included_by_others,
626 candidate_includes_others,
627 } = slices;
628
629 let df = DataFrame::new(vec![
630 Series::new(PlSmallStr::from_str("ref_id"), ref_id).into(),
631 Series::new(PlSmallStr::from_str("cursor_path"), cursor_path).into(),
632 Series::new(PlSmallStr::from_str("cursor_row"), cursor_row).into(),
633 Series::new(PlSmallStr::from_str("cursor_column"), cursor_column).into(),
634 Series::new(PlSmallStr::from_str("cursor_identifier"), cursor_identifier).into(),
635 Series::new(PlSmallStr::from_str("gold_in_excerpt"), gold_in_excerpt).into(),
636 Series::new(PlSmallStr::from_str("gold_path"), gold_path).into(),
637 Series::new(PlSmallStr::from_str("gold_row"), gold_row).into(),
638 Series::new(PlSmallStr::from_str("gold_column"), gold_column).into(),
639 Series::new(PlSmallStr::from_str("gold_is_external"), gold_is_external).into(),
640 Series::new(PlSmallStr::from_str("candidate_count"), candidate_count).into(),
641 Series::new(PlSmallStr::from_str("candidate_path"), candidate_path).into(),
642 Series::new(PlSmallStr::from_str("candidate_row"), candidate_row).into(),
643 Series::new(PlSmallStr::from_str("candidate_column"), candidate_column).into(),
644 Series::new(PlSmallStr::from_str("candidate_is_gold"), candidate_is_gold).into(),
645 Series::new(PlSmallStr::from_str("candidate_rank"), candidate_rank).into(),
646 Series::new(
647 PlSmallStr::from_str("candidate_is_same_file"),
648 candidate_is_same_file,
649 )
650 .into(),
651 Series::new(
652 PlSmallStr::from_str("candidate_is_referenced_nearby"),
653 candidate_is_referenced_nearby,
654 )
655 .into(),
656 Series::new(
657 PlSmallStr::from_str("candidate_is_referenced_in_breadcrumb"),
658 candidate_is_referenced_in_breadcrumb,
659 )
660 .into(),
661 Series::new(
662 PlSmallStr::from_str("candidate_reference_count"),
663 candidate_reference_count,
664 )
665 .into(),
666 Series::new(
667 PlSmallStr::from_str("candidate_same_file_declaration_count"),
668 candidate_same_file_declaration_count,
669 )
670 .into(),
671 Series::new(
672 PlSmallStr::from_str("candidate_declaration_count"),
673 candidate_declaration_count,
674 )
675 .into(),
676 Series::new(
677 PlSmallStr::from_str("candidate_reference_line_distance"),
678 candidate_reference_line_distance,
679 )
680 .into(),
681 Series::new(
682 PlSmallStr::from_str("candidate_declaration_line_distance"),
683 candidate_declaration_line_distance,
684 )
685 .into(),
686 Series::new(
687 PlSmallStr::from_str("candidate_excerpt_vs_item_jaccard"),
688 candidate_excerpt_vs_item_jaccard,
689 )
690 .into(),
691 Series::new(
692 PlSmallStr::from_str("candidate_excerpt_vs_signature_jaccard"),
693 candidate_excerpt_vs_signature_jaccard,
694 )
695 .into(),
696 Series::new(
697 PlSmallStr::from_str("candidate_adjacent_vs_item_jaccard"),
698 candidate_adjacent_vs_item_jaccard,
699 )
700 .into(),
701 Series::new(
702 PlSmallStr::from_str("candidate_adjacent_vs_signature_jaccard"),
703 candidate_adjacent_vs_signature_jaccard,
704 )
705 .into(),
706 Series::new(
707 PlSmallStr::from_str("candidate_excerpt_vs_item_weighted_overlap"),
708 candidate_excerpt_vs_item_weighted_overlap,
709 )
710 .into(),
711 Series::new(
712 PlSmallStr::from_str("candidate_excerpt_vs_signature_weighted_overlap"),
713 candidate_excerpt_vs_signature_weighted_overlap,
714 )
715 .into(),
716 Series::new(
717 PlSmallStr::from_str("candidate_adjacent_vs_item_weighted_overlap"),
718 candidate_adjacent_vs_item_weighted_overlap,
719 )
720 .into(),
721 Series::new(
722 PlSmallStr::from_str("candidate_adjacent_vs_signature_weighted_overlap"),
723 candidate_adjacent_vs_signature_weighted_overlap,
724 )
725 .into(),
726 Series::new(
727 PlSmallStr::from_str("candidate_path_import_match_count"),
728 candidate_path_import_match_count,
729 )
730 .into(),
731 Series::new(
732 PlSmallStr::from_str("candidate_wildcard_path_import_match_count"),
733 candidate_wildcard_path_import_match_count,
734 )
735 .into(),
736 Series::new(
737 PlSmallStr::from_str("candidate_import_similarity"),
738 candidate_import_similarity,
739 )
740 .into(),
741 Series::new(
742 PlSmallStr::from_str("candidate_max_import_similarity"),
743 candidate_max_import_similarity,
744 )
745 .into(),
746 Series::new(
747 PlSmallStr::from_str("candidate_normalized_import_similarity"),
748 candidate_normalized_import_similarity,
749 )
750 .into(),
751 Series::new(
752 PlSmallStr::from_str("candidate_wildcard_import_similarity"),
753 candidate_wildcard_import_similarity,
754 )
755 .into(),
756 Series::new(
757 PlSmallStr::from_str("candidate_normalized_wildcard_import_similarity"),
758 candidate_normalized_wildcard_import_similarity,
759 )
760 .into(),
761 Series::new(
762 PlSmallStr::from_str("candidate_included_by_others"),
763 candidate_included_by_others,
764 )
765 .into(),
766 Series::new(
767 PlSmallStr::from_str("candidate_includes_others"),
768 candidate_includes_others,
769 )
770 .into(),
771 ])?;
772
773 Ok(df)
774}
775
776fn relativize_path(path: &Path) -> &Path {
777 path.strip_prefix(std::env::current_dir().unwrap())
778 .unwrap_or(path)
779}
780
781struct SummaryStats {
782 references_count: u32,
783 retrieved_count: u32,
784 top_match_count: u32,
785 non_top_match_count: u32,
786 ranking_involved_top_match_count: u32,
787 missing_none_retrieved: u32,
788 missing_wrong_retrieval: u32,
789 missing_external: u32,
790 in_excerpt_count: u32,
791}
792
793impl SummaryStats {
794 fn from_dataframe(df: DataFrame) -> Result<Self> {
795 // TODO: use lazy more
796 let unique_refs =
797 df.unique::<(), ()>(Some(&["ref_id".into()]), UniqueKeepStrategy::Any, None)?;
798 let references_count = unique_refs.height() as u32;
799
800 let gold_mask = df.column("candidate_is_gold")?.bool()?;
801 let gold_df = df.filter(&gold_mask)?;
802 let retrieved_count = gold_df.height() as u32;
803
804 let top_match_mask = gold_df.column("candidate_rank")?.u32()?.equal(0);
805 let top_match_df = gold_df.filter(&top_match_mask)?;
806 let top_match_count = top_match_df.height() as u32;
807
808 let ranking_involved_top_match_count = top_match_df
809 .column("candidate_count")?
810 .u32()?
811 .gt(1)
812 .sum()
813 .unwrap_or_default();
814
815 let non_top_match_count = (!top_match_mask).sum().unwrap_or(0);
816
817 let not_retrieved_df = df
818 .lazy()
819 .group_by(&[col("ref_id"), col("candidate_count")])
820 .agg(&[
821 col("candidate_is_gold")
822 .fill_null(false)
823 .sum()
824 .alias("gold_count"),
825 col("gold_in_excerpt").sum().alias("gold_in_excerpt_count"),
826 col("gold_is_external")
827 .sum()
828 .alias("gold_is_external_count"),
829 ])
830 .filter(col("gold_count").eq(lit(0)))
831 .collect()?;
832
833 let in_excerpt_mask = not_retrieved_df
834 .column("gold_in_excerpt_count")?
835 .u32()?
836 .gt(0);
837 let in_excerpt_count = in_excerpt_mask.sum().unwrap_or(0);
838
839 let missing_df = not_retrieved_df.filter(&!in_excerpt_mask)?;
840
841 let missing_none_retrieved_mask = missing_df.column("candidate_count")?.u32()?.equal(0);
842 let missing_none_retrieved = missing_none_retrieved_mask.sum().unwrap_or(0);
843 let external_mask = missing_df.column("gold_is_external_count")?.u32()?.gt(0);
844 let missing_external = (missing_none_retrieved_mask & external_mask)
845 .sum()
846 .unwrap_or(0);
847
848 let missing_wrong_retrieval = missing_df
849 .column("candidate_count")?
850 .u32()?
851 .gt(0)
852 .sum()
853 .unwrap_or(0);
854
855 Ok(SummaryStats {
856 references_count,
857 retrieved_count,
858 top_match_count,
859 non_top_match_count,
860 ranking_involved_top_match_count,
861 missing_none_retrieved,
862 missing_wrong_retrieval,
863 missing_external,
864 in_excerpt_count,
865 })
866 }
867
868 fn count_and_percentage(part: u32, total: u32) -> String {
869 format!("{} ({:.2}%)", part, (part as f64 / total as f64) * 100.0)
870 }
871}
872
873impl std::fmt::Display for SummaryStats {
874 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
875 let included = self.in_excerpt_count + self.retrieved_count;
876 let missing = self.references_count - included;
877 writeln!(f)?;
878 writeln!(f, "╮ references: {}", self.references_count)?;
879 writeln!(
880 f,
881 "├─╮ included: {}",
882 Self::count_and_percentage(included, self.references_count),
883 )?;
884 writeln!(
885 f,
886 "│ ├─╮ retrieved: {}",
887 Self::count_and_percentage(self.retrieved_count, self.references_count)
888 )?;
889 writeln!(
890 f,
891 "│ │ ├─╮ top match : {}",
892 Self::count_and_percentage(self.top_match_count, self.retrieved_count)
893 )?;
894 writeln!(
895 f,
896 "│ │ │ ╰─╴ involving ranking: {}",
897 Self::count_and_percentage(self.ranking_involved_top_match_count, self.top_match_count)
898 )?;
899 writeln!(
900 f,
901 "│ │ ╰─╴ non-top match: {}",
902 Self::count_and_percentage(self.non_top_match_count, self.retrieved_count)
903 )?;
904 writeln!(
905 f,
906 "│ ╰─╴ in excerpt: {}",
907 Self::count_and_percentage(self.in_excerpt_count, included)
908 )?;
909 writeln!(
910 f,
911 "╰─╮ missing: {}",
912 Self::count_and_percentage(missing, self.references_count)
913 )?;
914 writeln!(
915 f,
916 " ├─╮ none retrieved: {}",
917 Self::count_and_percentage(self.missing_none_retrieved, missing)
918 )?;
919 writeln!(
920 f,
921 " │ ╰─╴ external (expected): {}",
922 Self::count_and_percentage(self.missing_external, missing)
923 )?;
924 writeln!(
925 f,
926 " ╰─╴ wrong retrieval: {}",
927 Self::count_and_percentage(self.missing_wrong_retrieval, missing)
928 )?;
929 Ok(())
930 }
931}
932
933#[derive(Debug)]
934struct ReferenceRetrievalResult {
935 cursor_path: Arc<RelPath>,
936 cursor_point: Point,
937 identifier: Identifier,
938 excerpt_range: Option<Range<usize>>,
939 lsp_definitions: Vec<SourceRange>,
940 retrieved_definitions: Vec<RetrievedDefinition>,
941}
942
943#[derive(Debug)]
944struct RetrievedDefinition {
945 path: Arc<RelPath>,
946 range: Range<Point>,
947 score: f32,
948 #[allow(dead_code)]
949 retrieval_score: f32,
950 #[allow(dead_code)]
951 components: DeclarationScoreComponents,
952}
953
954struct RetrieveResult {
955 definitions: Vec<RetrievedDefinition>,
956 excerpt_range: Option<Range<usize>>,
957}
958
959async fn retrieve_definitions(
960 reference: &Reference,
961 imports: &Imports,
962 query_point: Point,
963 snapshot: &BufferSnapshot,
964 index: &Arc<SyntaxIndexState>,
965 file_snapshots: &Arc<HashMap<ProjectEntryId, BufferSnapshot>>,
966 context_options: &EditPredictionContextOptions,
967) -> Result<RetrieveResult> {
968 let mut single_reference_map = HashMap::default();
969 single_reference_map.insert(reference.identifier.clone(), vec![reference.clone()]);
970 let edit_prediction_context = EditPredictionContext::gather_context_with_references_fn(
971 query_point,
972 snapshot,
973 imports,
974 &context_options,
975 Some(&index),
976 |_, _, _| single_reference_map,
977 );
978
979 let Some(edit_prediction_context) = edit_prediction_context else {
980 return Ok(RetrieveResult {
981 definitions: Vec::new(),
982 excerpt_range: None,
983 });
984 };
985
986 let mut retrieved_definitions = Vec::new();
987 for scored_declaration in edit_prediction_context.declarations {
988 match &scored_declaration.declaration {
989 Declaration::File {
990 project_entry_id,
991 declaration,
992 ..
993 } => {
994 let Some(snapshot) = file_snapshots.get(&project_entry_id) else {
995 log::error!("bug: file project entry not found");
996 continue;
997 };
998 let path = snapshot.file().unwrap().path().clone();
999 retrieved_definitions.push(RetrievedDefinition {
1000 path,
1001 range: snapshot.offset_to_point(declaration.item_range.start)
1002 ..snapshot.offset_to_point(declaration.item_range.end),
1003 score: scored_declaration.score(DeclarationStyle::Declaration),
1004 retrieval_score: scored_declaration.retrieval_score(),
1005 components: scored_declaration.components,
1006 });
1007 }
1008 Declaration::Buffer {
1009 project_entry_id,
1010 rope,
1011 declaration,
1012 ..
1013 } => {
1014 let Some(snapshot) = file_snapshots.get(&project_entry_id) else {
1015 // This case happens when dependency buffers have been opened by
1016 // go-to-definition, resulting in single-file worktrees.
1017 continue;
1018 };
1019 let path = snapshot.file().unwrap().path().clone();
1020 retrieved_definitions.push(RetrievedDefinition {
1021 path,
1022 range: rope.offset_to_point(declaration.item_range.start)
1023 ..rope.offset_to_point(declaration.item_range.end),
1024 score: scored_declaration.score(DeclarationStyle::Declaration),
1025 retrieval_score: scored_declaration.retrieval_score(),
1026 components: scored_declaration.components,
1027 });
1028 }
1029 }
1030 }
1031 retrieved_definitions.sort_by_key(|definition| Reverse(OrderedFloat(definition.score)));
1032
1033 Ok(RetrieveResult {
1034 definitions: retrieved_definitions,
1035 excerpt_range: Some(edit_prediction_context.excerpt.range),
1036 })
1037}
1038
1039async fn gather_lsp_definitions(
1040 lsp_definitions_path: &Path,
1041 start_index: usize,
1042 files: &[ProjectPath],
1043 worktree: &Entity<Worktree>,
1044 project: &Entity<Project>,
1045 definitions: &mut HashMap<SourceLocation, Vec<SourceRange>>,
1046 cx: &mut AsyncApp,
1047) -> Result<()> {
1048 let worktree_id = worktree.read_with(cx, |worktree, _cx| worktree.id())?;
1049
1050 let lsp_store = project.read_with(cx, |project, _cx| project.lsp_store())?;
1051 cx.subscribe(&lsp_store, {
1052 move |_, event, _| {
1053 if let project::LspStoreEvent::LanguageServerUpdate {
1054 message:
1055 client::proto::update_language_server::Variant::WorkProgress(
1056 client::proto::LspWorkProgress {
1057 message: Some(message),
1058 ..
1059 },
1060 ),
1061 ..
1062 } = event
1063 {
1064 println!("⟲ {message}")
1065 }
1066 }
1067 })?
1068 .detach();
1069
1070 let (cache_line_tx, mut cache_line_rx) = mpsc::unbounded::<FileLspDefinitions>();
1071
1072 let cache_file = File::options()
1073 .append(true)
1074 .create(true)
1075 .open(lsp_definitions_path)
1076 .unwrap();
1077
1078 let cache_task = cx.background_spawn(async move {
1079 let mut writer = BufWriter::new(cache_file);
1080 while let Some(line) = cache_line_rx.next().await {
1081 serde_json::to_writer(&mut writer, &line).unwrap();
1082 writer.write_all(&[b'\n']).unwrap();
1083 }
1084 writer.flush().unwrap();
1085 });
1086
1087 let mut error_count = 0;
1088 let mut lsp_open_handles = Vec::new();
1089 let mut ready_languages = HashSet::default();
1090 for (file_index, project_path) in files[start_index..].iter().enumerate() {
1091 println!(
1092 "Processing file {} of {}: {}",
1093 start_index + file_index + 1,
1094 files.len(),
1095 project_path.path.display(PathStyle::Posix)
1096 );
1097
1098 let Some((lsp_open_handle, language_server_id, buffer)) = open_buffer_with_language_server(
1099 project.clone(),
1100 worktree.clone(),
1101 project_path.path.clone(),
1102 &mut ready_languages,
1103 cx,
1104 )
1105 .await
1106 .log_err() else {
1107 continue;
1108 };
1109 lsp_open_handles.push(lsp_open_handle);
1110
1111 let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot())?;
1112 let full_range = 0..snapshot.len();
1113 let references = references_in_range(
1114 full_range,
1115 &snapshot.text(),
1116 ReferenceRegion::Nearby,
1117 &snapshot,
1118 );
1119
1120 loop {
1121 let is_ready = lsp_store
1122 .read_with(cx, |lsp_store, _cx| {
1123 lsp_store
1124 .language_server_statuses
1125 .get(&language_server_id)
1126 .is_some_and(|status| status.pending_work.is_empty())
1127 })
1128 .unwrap();
1129 if is_ready {
1130 break;
1131 }
1132 cx.background_executor()
1133 .timer(Duration::from_millis(10))
1134 .await;
1135 }
1136
1137 let mut cache_line_references = Vec::with_capacity(references.len());
1138
1139 for reference in references {
1140 // TODO: Rename declaration to definition in edit_prediction_context?
1141 let lsp_result = project
1142 .update(cx, |project, cx| {
1143 project.definitions(&buffer, reference.range.start, cx)
1144 })?
1145 .await;
1146
1147 match lsp_result {
1148 Ok(lsp_definitions) => {
1149 let mut targets = Vec::new();
1150 for target in lsp_definitions.unwrap_or_default() {
1151 let buffer = target.target.buffer;
1152 let anchor_range = target.target.range;
1153 buffer.read_with(cx, |buffer, cx| {
1154 let Some(file) = project::File::from_dyn(buffer.file()) else {
1155 return;
1156 };
1157 let file_worktree = file.worktree.read(cx);
1158 let file_worktree_id = file_worktree.id();
1159 // Relative paths for worktree files, absolute for all others
1160 let path = if worktree_id != file_worktree_id {
1161 file.worktree.read(cx).absolutize(&file.path)
1162 } else {
1163 file.path.as_std_path().to_path_buf()
1164 };
1165 let offset_range = anchor_range.to_offset(&buffer);
1166 let point_range = SerializablePoint::from_language_point_range(
1167 offset_range.to_point(&buffer),
1168 );
1169 targets.push(SourceRange {
1170 path,
1171 offset_range,
1172 point_range,
1173 });
1174 })?;
1175 }
1176
1177 let point = snapshot.offset_to_point(reference.range.start);
1178
1179 cache_line_references.push((point.into(), targets.clone()));
1180 definitions.insert(
1181 SourceLocation {
1182 path: project_path.path.clone(),
1183 point,
1184 },
1185 targets,
1186 );
1187 }
1188 Err(err) => {
1189 log::error!("Language server error: {err}");
1190 error_count += 1;
1191 }
1192 }
1193 }
1194
1195 cache_line_tx
1196 .unbounded_send(FileLspDefinitions {
1197 path: project_path.path.as_unix_str().into(),
1198 references: cache_line_references,
1199 })
1200 .log_err();
1201 }
1202
1203 drop(cache_line_tx);
1204
1205 if error_count > 0 {
1206 log::error!("Encountered {} language server errors", error_count);
1207 }
1208
1209 cache_task.await;
1210
1211 Ok(())
1212}
1213
1214#[derive(Serialize, Deserialize)]
1215struct FileLspDefinitions {
1216 path: Arc<str>,
1217 references: Vec<(SerializablePoint, Vec<SourceRange>)>,
1218}
1219
1220#[derive(Debug, Clone, Serialize, Deserialize)]
1221struct SourceRange {
1222 path: PathBuf,
1223 point_range: Range<SerializablePoint>,
1224 offset_range: Range<usize>,
1225}
1226
1227/// Serializes to 1-based row and column indices.
1228#[derive(Debug, Clone, Serialize, Deserialize)]
1229pub struct SerializablePoint {
1230 pub row: u32,
1231 pub column: u32,
1232}
1233
1234impl SerializablePoint {
1235 pub fn into_language_point_range(range: Range<Self>) -> Range<Point> {
1236 range.start.into()..range.end.into()
1237 }
1238
1239 pub fn from_language_point_range(range: Range<Point>) -> Range<Self> {
1240 range.start.into()..range.end.into()
1241 }
1242}
1243
1244impl From<Point> for SerializablePoint {
1245 fn from(point: Point) -> Self {
1246 SerializablePoint {
1247 row: point.row + 1,
1248 column: point.column + 1,
1249 }
1250 }
1251}
1252
1253impl From<SerializablePoint> for Point {
1254 fn from(serializable: SerializablePoint) -> Self {
1255 Point {
1256 row: serializable.row.saturating_sub(1),
1257 column: serializable.column.saturating_sub(1),
1258 }
1259 }
1260}