use ::util::rel_path::RelPath;
use ::util::{RangeExt, ResultExt as _};
use anyhow::{Context as _, Result};
use cloud_llm_client::predict_edits_v3::DeclarationScoreComponents;
use edit_prediction_context::{
    Declaration, DeclarationStyle, EditPredictionContext, Identifier, Imports, Reference,
    ReferenceRegion, SyntaxIndex, SyntaxIndexState, references_in_range,
};
use futures::StreamExt as _;
use futures::channel::mpsc;
use gpui::Entity;
use gpui::{AppContext, AsyncApp};
use language::OffsetRangeExt;
use language::{BufferSnapshot, Point};
use ordered_float::OrderedFloat;
use polars::prelude::*;
use project::{Project, ProjectEntryId, ProjectPath, Worktree};
use serde::{Deserialize, Serialize};
use std::fs;
use std::{
    cmp::Reverse,
    collections::{HashMap, HashSet},
    fs::File,
    hash::{Hash, Hasher},
    io::{BufRead, BufReader, BufWriter, Write as _},
    ops::Range,
    path::{Path, PathBuf},
    sync::{
        Arc,
        atomic::{self, AtomicUsize},
    },
    time::Duration,
};
use util::paths::PathStyle;

use crate::headless::ZetaCliAppState;
use crate::source_location::SourceLocation;
use crate::util::{open_buffer, open_buffer_with_language_server};

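/// Measures how well syntax-index retrieval matches LSP go-to-definition results for a worktree:
/// indexes the project, gathers (and caches) LSP definitions for every reference, retrieves
/// candidate declarations for those same references, and writes per-candidate rows to a Parquet
/// file along with a printed summary.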
pub async fn retrieval_stats(
    worktree: PathBuf,
    app_state: Arc<ZetaCliAppState>,
    only_extension: Option<String>,
    file_limit: Option<usize>,
    skip_files: Option<usize>,
    options: zeta2::ZetaOptions,
    cx: &mut AsyncApp,
) -> Result<String> {
    let options = Arc::new(options);
    let worktree_path = worktree.canonicalize()?;

    let project = cx.update(|cx| {
        Project::local(
            app_state.client.clone(),
            app_state.node_runtime.clone(),
            app_state.user_store.clone(),
            app_state.languages.clone(),
            app_state.fs.clone(),
            None,
            cx,
        )
    })?;

    let worktree = project
        .update(cx, |project, cx| {
            project.create_worktree(&worktree_path, true, cx)
        })?
        .await?;

    // wait for worktree scan so that wait_for_initial_file_indexing waits for the whole worktree.
    worktree
        .read_with(cx, |worktree, _cx| {
            worktree.as_local().unwrap().scan_complete()
        })?
        .await;

    let index = cx.new(|cx| SyntaxIndex::new(&project, options.file_indexing_parallelism, cx))?;
    index
        .read_with(cx, |index, cx| index.wait_for_initial_file_indexing(cx))?
        .await?;
    let indexed_files = index
        .read_with(cx, |index, cx| index.indexed_file_paths(cx))?
        .await;
    let mut filtered_files = indexed_files
        .into_iter()
        .filter(|project_path| {
            let file_extension = project_path.path.extension();
            if let Some(only_extension) = only_extension.as_ref() {
                file_extension.is_some_and(|extension| extension == only_extension)
            } else {
                file_extension
                    .is_some_and(|extension| !["md", "json", "sh", "diff"].contains(&extension))
            }
        })
        .collect::<Vec<_>>();
    filtered_files.sort_by(|a, b| a.path.cmp(&b.path));

    let index_state = index.read_with(cx, |index, _cx| index.state().clone())?;
    cx.update(|_| {
        drop(index);
    })?;
    let index_state = Arc::new(
        Arc::into_inner(index_state)
            .context("Index state had more than 1 reference")?
            .into_inner(),
    );

    struct FileSnapshot {
        project_entry_id: ProjectEntryId,
        snapshot: BufferSnapshot,
        hash: u64,
        parent_abs_path: Arc<Path>,
    }

    let files: Vec<FileSnapshot> = futures::future::try_join_all({
        filtered_files
            .iter()
            .map(|file| {
                let buffer_task =
                    open_buffer(project.clone(), worktree.clone(), file.path.clone(), cx);
                cx.spawn(async move |cx| {
                    let buffer = buffer_task.await?;
                    let (project_entry_id, parent_abs_path, snapshot) =
                        buffer.read_with(cx, |buffer, cx| {
                            let file = project::File::from_dyn(buffer.file()).unwrap();
                            let project_entry_id = file.project_entry_id().unwrap();
                            let mut parent_abs_path = file.worktree.read(cx).absolutize(&file.path);
                            if !parent_abs_path.pop() {
                                panic!("Invalid worktree path");
                            }

                            (project_entry_id, parent_abs_path, buffer.snapshot())
                        })?;

                    anyhow::Ok(
                        cx.background_spawn(async move {
                            let mut hasher = collections::FxHasher::default();
                            snapshot.text().hash(&mut hasher);
                            FileSnapshot {
                                project_entry_id,
                                snapshot,
                                hash: hasher.finish(),
                                parent_abs_path: parent_abs_path.into(),
                            }
                        })
                        .await,
                    )
                })
            })
            .collect::<Vec<_>>()
    })
    .await?;

    let mut file_snapshots = HashMap::default();
    let mut hasher = collections::FxHasher::default();
    for FileSnapshot {
        project_entry_id,
        snapshot,
        hash,
        ..
    } in &files
    {
        file_snapshots.insert(*project_entry_id, snapshot.clone());
        hash.hash(&mut hasher);
    }
    let files_hash = hasher.finish();
    let file_snapshots = Arc::new(file_snapshots);
    let target_cli_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../target/zeta_cli");
    fs::create_dir_all(&target_cli_dir).unwrap();
    let target_cli_dir = target_cli_dir.canonicalize().unwrap();

    let lsp_cache_dir = target_cli_dir.join("cache");
    fs::create_dir_all(&lsp_cache_dir).unwrap();

    let lsp_definitions_path = lsp_cache_dir.join(format!(
        "{}-{:x}.jsonl",
        worktree_path.file_stem().unwrap_or_default().display(),
        files_hash
    ));

    let mut lsp_definitions = HashMap::default();
    let mut lsp_files = 0;

    if fs::exists(&lsp_definitions_path)? {
        log::info!(
            "Using cached LSP definitions from {}",
            lsp_definitions_path.display()
        );

        let file = File::options()
            .read(true)
            .write(true)
            .open(&lsp_definitions_path)?;
        let lines = BufReader::new(&file).lines();
        let mut valid_len: usize = 0;

        for (line, expected_file) in lines.zip(files.iter()) {
            let line = line?;
            let FileLspDefinitions { path, references } = match serde_json::from_str(&line) {
                Ok(ok) => ok,
                Err(_) => {
                    log::error!("Found invalid cache line. Truncating to #{lsp_files}.");
                    file.set_len(valid_len as u64)?;
                    break;
                }
            };
            let expected_path = expected_file.snapshot.file().unwrap().path().as_unix_str();
            if expected_path != path.as_ref() {
                log::error!(
                    "Expected file #{} to be {expected_path}, but found {path}. Truncating to #{lsp_files}.",
                    lsp_files + 1
                );
                file.set_len(valid_len as u64)?;
                break;
            }
            for (point, ranges) in references {
                let Ok(path) = RelPath::new(Path::new(path.as_ref()), PathStyle::Posix) else {
                    log::warn!("Invalid path: {}", path);
                    continue;
                };
                lsp_definitions.insert(
                    SourceLocation {
                        path: path.into_arc(),
                        point: point.into(),
                    },
                    ranges,
                );
            }
            lsp_files += 1;
            valid_len += line.len() + 1;
        }
    }

    if lsp_files < files.len() {
        if lsp_files == 0 {
            log::warn!(
                "No LSP definitions found, populating {}",
                lsp_definitions_path.display()
            );
        } else {
            log::warn!("{} files missing from LSP cache", files.len() - lsp_files);
        }

        gather_lsp_definitions(
            &lsp_definitions_path,
            lsp_files,
            &filtered_files,
            &worktree,
            &project,
            &mut lsp_definitions,
            cx,
        )
        .await?;
    }
    let files_len = files.len().min(file_limit.unwrap_or(usize::MAX));
    let done_count = Arc::new(AtomicUsize::new(0));

    let (output_tx, output_rx) = mpsc::unbounded::<ReferenceRetrievalResult>();

    let tasks = files
        .into_iter()
        .skip(skip_files.unwrap_or(0))
        .take(file_limit.unwrap_or(usize::MAX))
        .map(|project_file| {
            let index_state = index_state.clone();
            let lsp_definitions = lsp_definitions.clone();
            let options = options.clone();
            let output_tx = output_tx.clone();
            let done_count = done_count.clone();
            let file_snapshots = file_snapshots.clone();
            cx.background_spawn(async move {
                let snapshot = project_file.snapshot;

                let full_range = 0..snapshot.len();
                let references = references_in_range(
                    full_range,
                    &snapshot.text(),
                    ReferenceRegion::Nearby,
                    &snapshot,
                );

                let imports = if options.context.use_imports {
                    Imports::gather(&snapshot, Some(&project_file.parent_abs_path))
                } else {
                    Imports::default()
                };

                let path = snapshot.file().unwrap().path();

                for reference in references {
                    let query_point = snapshot.offset_to_point(reference.range.start);
                    let source_location = SourceLocation {
                        path: path.clone(),
                        point: query_point,
                    };
                    let lsp_definitions = lsp_definitions
                        .get(&source_location)
                        .cloned()
                        .unwrap_or_else(|| {
                            log::warn!(
                                "No definitions found for source location: {:?}",
                                source_location
                            );
                            Vec::new()
                        });

                    let retrieve_result = retrieve_definitions(
                        &reference,
                        &imports,
                        query_point,
                        &snapshot,
                        &index_state,
                        &file_snapshots,
                        &options,
                    )
                    .await?;

                    let result = ReferenceRetrievalResult {
                        cursor_path: path.clone(),
                        identifier: reference.identifier,
                        cursor_point: query_point,
                        lsp_definitions,
                        retrieved_definitions: retrieve_result.definitions,
                        excerpt_range: retrieve_result.excerpt_range,
                    };

                    output_tx.unbounded_send(result).ok();
                }

                println!(
                    "{:02}/{:02} done",
                    done_count.fetch_add(1, atomic::Ordering::Relaxed) + 1,
                    files_len,
                );

                anyhow::Ok(())
            })
        })
        .collect::<Vec<_>>();

    drop(output_tx);

    let df_task = cx.background_spawn(build_dataframe(output_rx));

    futures::future::try_join_all(tasks).await?;
    let mut df = df_task.await?;

    let run_id = format!(
        "{}-{}",
        worktree_path.file_stem().unwrap_or_default().display(),
        chrono::Local::now().format("%Y%m%d_%H%M%S")
    );
    let run_dir = target_cli_dir.join(run_id);
    fs::create_dir(&run_dir).unwrap();

    let parquet_path = run_dir.join("stats.parquet");
    let mut parquet_file = fs::File::create(&parquet_path)?;

    ParquetWriter::new(&mut parquet_file)
        .finish(&mut df)
        .unwrap();

    let stats = SummaryStats::from_dataframe(df)?;

    let stats_path = run_dir.join("stats.txt");
    fs::write(&stats_path, format!("{}", stats))?;

    println!("{}", stats);
    println!("\nWrote:");
    println!("- {}", relativize_path(&parquet_path).display());
    println!("- {}", relativize_path(&stats_path).display());
    println!("- {}", relativize_path(&lsp_definitions_path).display());

    Ok("".to_string())
}

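/// Flattens the per-reference retrieval results into a dataframe with one row per
/// (LSP definition, retrieved candidate) pair; when nothing was retrieved for a definition, a
/// single row with null candidate columns is emitted instead.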
async fn build_dataframe(
    mut output_rx: mpsc::UnboundedReceiver<ReferenceRetrievalResult>,
) -> Result<DataFrame> {
    use soa_rs::{Soa, Soars};

    #[derive(Default, Soars)]
    struct Row {
        ref_id: u32,
        cursor_path: String,
        cursor_row: u32,
        cursor_column: u32,
        cursor_identifier: String,
        gold_in_excerpt: bool,
        gold_path: String,
        gold_row: u32,
        gold_column: u32,
        gold_is_external: bool,
        candidate_count: u32,
        candidate_path: Option<String>,
        candidate_row: Option<u32>,
        candidate_column: Option<u32>,
        candidate_is_gold: Option<bool>,
        candidate_rank: Option<u32>,
        candidate_is_same_file: Option<bool>,
        candidate_is_referenced_nearby: Option<bool>,
        candidate_is_referenced_in_breadcrumb: Option<bool>,
        candidate_reference_count: Option<u32>,
        candidate_same_file_declaration_count: Option<u32>,
        candidate_declaration_count: Option<u32>,
        candidate_reference_line_distance: Option<u32>,
        candidate_declaration_line_distance: Option<u32>,
        candidate_excerpt_vs_item_jaccard: Option<f32>,
        candidate_excerpt_vs_signature_jaccard: Option<f32>,
        candidate_adjacent_vs_item_jaccard: Option<f32>,
        candidate_adjacent_vs_signature_jaccard: Option<f32>,
        candidate_excerpt_vs_item_weighted_overlap: Option<f32>,
        candidate_excerpt_vs_signature_weighted_overlap: Option<f32>,
        candidate_adjacent_vs_item_weighted_overlap: Option<f32>,
        candidate_adjacent_vs_signature_weighted_overlap: Option<f32>,
        candidate_path_import_match_count: Option<u32>,
        candidate_wildcard_path_import_match_count: Option<u32>,
        candidate_import_similarity: Option<f32>,
        candidate_max_import_similarity: Option<f32>,
        candidate_normalized_import_similarity: Option<f32>,
        candidate_wildcard_import_similarity: Option<f32>,
        candidate_normalized_wildcard_import_similarity: Option<f32>,
        candidate_included_by_others: Option<u32>,
        candidate_includes_others: Option<u32>,
    }
    let mut rows = Soa::<Row>::new();
    let mut next_ref_id = 0;

    while let Some(result) = output_rx.next().await {
        let mut gold_is_external = false;
        let mut gold_in_excerpt = false;
        let cursor_path = result.cursor_path.as_unix_str();
        let cursor_row = result.cursor_point.row + 1;
        let cursor_column = result.cursor_point.column + 1;
        let cursor_identifier = result.identifier.name.to_string();
        let ref_id = next_ref_id;
        next_ref_id += 1;

        for lsp_definition in result.lsp_definitions {
            let SourceRange {
                path: gold_path,
                point_range: gold_point_range,
                offset_range: gold_offset_range,
            } = lsp_definition;
            let lsp_point_range =
                SerializablePoint::into_language_point_range(gold_point_range.clone());

            gold_is_external = gold_is_external
                || gold_path.is_absolute()
                || gold_path
                    .components()
                    .any(|component| component.as_os_str() == "node_modules");

            gold_in_excerpt = gold_in_excerpt
                || result.excerpt_range.as_ref().is_some_and(|excerpt_range| {
                    excerpt_range.contains_inclusive(&gold_offset_range)
                });

            let gold_row = gold_point_range.start.row;
            let gold_column = gold_point_range.start.column;
            let candidate_count = result.retrieved_definitions.len() as u32;

            for (candidate_rank, retrieved_definition) in
                result.retrieved_definitions.iter().enumerate()
            {
                let candidate_is_gold = gold_path.as_path()
                    == retrieved_definition.path.as_std_path()
                    && retrieved_definition
                        .range
                        .contains_inclusive(&lsp_point_range);

                let candidate_row = retrieved_definition.range.start.row + 1;
                let candidate_column = retrieved_definition.range.start.column + 1;

                let DeclarationScoreComponents {
                    is_same_file,
                    is_referenced_nearby,
                    is_referenced_in_breadcrumb,
                    reference_count,
                    same_file_declaration_count,
                    declaration_count,
                    reference_line_distance,
                    declaration_line_distance,
                    excerpt_vs_item_jaccard,
                    excerpt_vs_signature_jaccard,
                    adjacent_vs_item_jaccard,
                    adjacent_vs_signature_jaccard,
                    excerpt_vs_item_weighted_overlap,
                    excerpt_vs_signature_weighted_overlap,
                    adjacent_vs_item_weighted_overlap,
                    adjacent_vs_signature_weighted_overlap,
                    path_import_match_count,
                    wildcard_path_import_match_count,
                    import_similarity,
                    max_import_similarity,
                    normalized_import_similarity,
                    wildcard_import_similarity,
                    normalized_wildcard_import_similarity,
                    included_by_others,
                    includes_others,
                } = retrieved_definition.components;

                rows.push(Row {
                    ref_id,
                    cursor_path: cursor_path.to_string(),
                    cursor_row,
                    cursor_column,
                    cursor_identifier: cursor_identifier.clone(),
                    gold_in_excerpt,
                    gold_path: gold_path.to_string_lossy().to_string(),
                    gold_row,
                    gold_column,
                    gold_is_external,
                    candidate_count,
                    candidate_path: Some(retrieved_definition.path.as_unix_str().to_string()),
                    candidate_row: Some(candidate_row),
                    candidate_column: Some(candidate_column),
                    candidate_is_gold: Some(candidate_is_gold),
                    candidate_rank: Some(candidate_rank as u32),
                    candidate_is_same_file: Some(is_same_file),
                    candidate_is_referenced_nearby: Some(is_referenced_nearby),
                    candidate_is_referenced_in_breadcrumb: Some(is_referenced_in_breadcrumb),
                    candidate_reference_count: Some(reference_count as u32),
                    candidate_same_file_declaration_count: Some(same_file_declaration_count as u32),
                    candidate_declaration_count: Some(declaration_count as u32),
                    candidate_reference_line_distance: Some(reference_line_distance),
                    candidate_declaration_line_distance: Some(declaration_line_distance),
                    candidate_excerpt_vs_item_jaccard: Some(excerpt_vs_item_jaccard),
                    candidate_excerpt_vs_signature_jaccard: Some(excerpt_vs_signature_jaccard),
                    candidate_adjacent_vs_item_jaccard: Some(adjacent_vs_item_jaccard),
                    candidate_adjacent_vs_signature_jaccard: Some(adjacent_vs_signature_jaccard),
                    candidate_excerpt_vs_item_weighted_overlap: Some(
                        excerpt_vs_item_weighted_overlap,
                    ),
                    candidate_excerpt_vs_signature_weighted_overlap: Some(
                        excerpt_vs_signature_weighted_overlap,
                    ),
                    candidate_adjacent_vs_item_weighted_overlap: Some(
                        adjacent_vs_item_weighted_overlap,
                    ),
                    candidate_adjacent_vs_signature_weighted_overlap: Some(
                        adjacent_vs_signature_weighted_overlap,
                    ),
                    candidate_path_import_match_count: Some(path_import_match_count as u32),
                    candidate_wildcard_path_import_match_count: Some(
                        wildcard_path_import_match_count as u32,
                    ),
                    candidate_import_similarity: Some(import_similarity),
                    candidate_max_import_similarity: Some(max_import_similarity),
                    candidate_normalized_import_similarity: Some(normalized_import_similarity),
                    candidate_wildcard_import_similarity: Some(wildcard_import_similarity),
                    candidate_normalized_wildcard_import_similarity: Some(
                        normalized_wildcard_import_similarity,
                    ),
                    candidate_included_by_others: Some(included_by_others as u32),
                    candidate_includes_others: Some(includes_others as u32),
                });
            }

            if result.retrieved_definitions.is_empty() {
                rows.push(Row {
                    ref_id,
                    cursor_path: cursor_path.to_string(),
                    cursor_row,
                    cursor_column,
                    cursor_identifier: cursor_identifier.clone(),
                    gold_in_excerpt,
                    gold_path: gold_path.to_string_lossy().to_string(),
                    gold_row,
                    gold_column,
                    gold_is_external,
                    candidate_count,
                    ..Default::default()
                });
            }
        }
    }
    let slices = rows.slices();

    let RowSlices {
        ref_id,
        cursor_path,
        cursor_row,
        cursor_column,
        cursor_identifier,
        gold_in_excerpt,
        gold_path,
        gold_row,
        gold_column,
        gold_is_external,
        candidate_path,
        candidate_row,
        candidate_column,
        candidate_is_gold,
        candidate_rank,
        candidate_count,
        candidate_is_same_file,
        candidate_is_referenced_nearby,
        candidate_is_referenced_in_breadcrumb,
        candidate_reference_count,
        candidate_same_file_declaration_count,
        candidate_declaration_count,
        candidate_reference_line_distance,
        candidate_declaration_line_distance,
        candidate_excerpt_vs_item_jaccard,
        candidate_excerpt_vs_signature_jaccard,
        candidate_adjacent_vs_item_jaccard,
        candidate_adjacent_vs_signature_jaccard,
        candidate_excerpt_vs_item_weighted_overlap,
        candidate_excerpt_vs_signature_weighted_overlap,
        candidate_adjacent_vs_item_weighted_overlap,
        candidate_adjacent_vs_signature_weighted_overlap,
        candidate_path_import_match_count,
        candidate_wildcard_path_import_match_count,
        candidate_import_similarity,
        candidate_max_import_similarity,
        candidate_normalized_import_similarity,
        candidate_wildcard_import_similarity,
        candidate_normalized_wildcard_import_similarity,
        candidate_included_by_others,
        candidate_includes_others,
    } = slices;

    let df = DataFrame::new(vec![
        Series::new(PlSmallStr::from_str("ref_id"), ref_id).into(),
        Series::new(PlSmallStr::from_str("cursor_path"), cursor_path).into(),
        Series::new(PlSmallStr::from_str("cursor_row"), cursor_row).into(),
        Series::new(PlSmallStr::from_str("cursor_column"), cursor_column).into(),
        Series::new(PlSmallStr::from_str("cursor_identifier"), cursor_identifier).into(),
        Series::new(PlSmallStr::from_str("gold_in_excerpt"), gold_in_excerpt).into(),
        Series::new(PlSmallStr::from_str("gold_path"), gold_path).into(),
        Series::new(PlSmallStr::from_str("gold_row"), gold_row).into(),
        Series::new(PlSmallStr::from_str("gold_column"), gold_column).into(),
        Series::new(PlSmallStr::from_str("gold_is_external"), gold_is_external).into(),
        Series::new(PlSmallStr::from_str("candidate_count"), candidate_count).into(),
        Series::new(PlSmallStr::from_str("candidate_path"), candidate_path).into(),
        Series::new(PlSmallStr::from_str("candidate_row"), candidate_row).into(),
        Series::new(PlSmallStr::from_str("candidate_column"), candidate_column).into(),
        Series::new(PlSmallStr::from_str("candidate_is_gold"), candidate_is_gold).into(),
        Series::new(PlSmallStr::from_str("candidate_rank"), candidate_rank).into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_same_file"),
            candidate_is_same_file,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_referenced_nearby"),
            candidate_is_referenced_nearby,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_referenced_in_breadcrumb"),
            candidate_is_referenced_in_breadcrumb,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_reference_count"),
            candidate_reference_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_same_file_declaration_count"),
            candidate_same_file_declaration_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_declaration_count"),
            candidate_declaration_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_reference_line_distance"),
            candidate_reference_line_distance,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_declaration_line_distance"),
            candidate_declaration_line_distance,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_item_jaccard"),
            candidate_excerpt_vs_item_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_signature_jaccard"),
            candidate_excerpt_vs_signature_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_item_jaccard"),
            candidate_adjacent_vs_item_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_signature_jaccard"),
            candidate_adjacent_vs_signature_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_item_weighted_overlap"),
            candidate_excerpt_vs_item_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_signature_weighted_overlap"),
            candidate_excerpt_vs_signature_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_item_weighted_overlap"),
            candidate_adjacent_vs_item_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_signature_weighted_overlap"),
            candidate_adjacent_vs_signature_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_path_import_match_count"),
            candidate_path_import_match_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_wildcard_path_import_match_count"),
            candidate_wildcard_path_import_match_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_import_similarity"),
            candidate_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_max_import_similarity"),
            candidate_max_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_normalized_import_similarity"),
            candidate_normalized_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_wildcard_import_similarity"),
            candidate_wildcard_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_normalized_wildcard_import_similarity"),
            candidate_normalized_wildcard_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_included_by_others"),
            candidate_included_by_others,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_includes_others"),
            candidate_includes_others,
        )
        .into(),
    ])?;

    Ok(df)
}

fn relativize_path(path: &Path) -> &Path {
    path.strip_prefix(std::env::current_dir().unwrap())
        .unwrap_or(path)
}

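/// Aggregate counts derived from the stats dataframe; the `Display` impl renders them as the
/// summary tree printed at the end of a run.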
struct SummaryStats {
    references_count: u32,
    retrieved_count: u32,
    top_match_count: u32,
    non_top_match_count: u32,
    ranking_involved_top_match_count: u32,
    missing_none_retrieved: u32,
    missing_wrong_retrieval: u32,
    missing_external: u32,
    in_excerpt_count: u32,
}

impl SummaryStats {
    fn from_dataframe(df: DataFrame) -> Result<Self> {
        // TODO: use lazy more
        let unique_refs =
            df.unique::<(), ()>(Some(&["ref_id".into()]), UniqueKeepStrategy::Any, None)?;
        let references_count = unique_refs.height() as u32;

        let gold_mask = df.column("candidate_is_gold")?.bool()?;
        let gold_df = df.filter(&gold_mask)?;
        let retrieved_count = gold_df.height() as u32;

        let top_match_mask = gold_df.column("candidate_rank")?.u32()?.equal(0);
        let top_match_df = gold_df.filter(&top_match_mask)?;
        let top_match_count = top_match_df.height() as u32;

        let ranking_involved_top_match_count = top_match_df
            .column("candidate_count")?
            .u32()?
            .gt(1)
            .sum()
            .unwrap_or_default();

        let non_top_match_count = (!top_match_mask).sum().unwrap_or(0);

        let not_retrieved_df = df
            .lazy()
            .group_by(&[col("ref_id"), col("candidate_count")])
            .agg(&[
                col("candidate_is_gold")
                    .fill_null(false)
                    .sum()
                    .alias("gold_count"),
                col("gold_in_excerpt").sum().alias("gold_in_excerpt_count"),
                col("gold_is_external")
                    .sum()
                    .alias("gold_is_external_count"),
            ])
            .filter(col("gold_count").eq(lit(0)))
            .collect()?;

        let in_excerpt_mask = not_retrieved_df
            .column("gold_in_excerpt_count")?
            .u32()?
            .gt(0);
        let in_excerpt_count = in_excerpt_mask.sum().unwrap_or(0);

        let missing_df = not_retrieved_df.filter(&!in_excerpt_mask)?;

        let missing_none_retrieved_mask = missing_df.column("candidate_count")?.u32()?.equal(0);
        let missing_none_retrieved = missing_none_retrieved_mask.sum().unwrap_or(0);
        let external_mask = missing_df.column("gold_is_external_count")?.u32()?.gt(0);
        let missing_external = (missing_none_retrieved_mask & external_mask)
            .sum()
            .unwrap_or(0);

        let missing_wrong_retrieval = missing_df
            .column("candidate_count")?
            .u32()?
            .gt(0)
            .sum()
            .unwrap_or(0);

        Ok(SummaryStats {
            references_count,
            retrieved_count,
            top_match_count,
            non_top_match_count,
            ranking_involved_top_match_count,
            missing_none_retrieved,
            missing_wrong_retrieval,
            missing_external,
            in_excerpt_count,
        })
    }

    fn count_and_percentage(part: u32, total: u32) -> String {
        format!("{} ({:.2}%)", part, (part as f64 / total as f64) * 100.0)
    }
}

impl std::fmt::Display for SummaryStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let included = self.in_excerpt_count + self.retrieved_count;
        let missing = self.references_count - included;
        writeln!(f)?;
        writeln!(f, "╮ references: {}", self.references_count)?;
        writeln!(
            f,
            "├─╮ included: {}",
            Self::count_and_percentage(included, self.references_count),
        )?;
        writeln!(
            f,
            "│ ├─╮ retrieved: {}",
            Self::count_and_percentage(self.retrieved_count, self.references_count)
        )?;
        writeln!(
            f,
            "│ │ ├─╮ top match: {}",
            Self::count_and_percentage(self.top_match_count, self.retrieved_count)
        )?;
        writeln!(
            f,
            "│ │ │ ╰─╴ involving ranking: {}",
            Self::count_and_percentage(self.ranking_involved_top_match_count, self.top_match_count)
        )?;
        writeln!(
            f,
            "│ │ ╰─╴ non-top match: {}",
            Self::count_and_percentage(self.non_top_match_count, self.retrieved_count)
        )?;
        writeln!(
            f,
            "│ ╰─╴ in excerpt: {}",
            Self::count_and_percentage(self.in_excerpt_count, included)
        )?;
        writeln!(
            f,
            "╰─╮ missing: {}",
            Self::count_and_percentage(missing, self.references_count)
        )?;
        writeln!(
            f,
            "  ├─╮ none retrieved: {}",
            Self::count_and_percentage(self.missing_none_retrieved, missing)
        )?;
        writeln!(
            f,
            "  │ ╰─╴ external (expected): {}",
            Self::count_and_percentage(self.missing_external, missing)
        )?;
        writeln!(
            f,
            "  ╰─╴ wrong retrieval: {}",
            Self::count_and_percentage(self.missing_wrong_retrieval, missing)
        )?;
        Ok(())
    }
}

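/// Retrieval outcome for a single identifier reference, paired with the LSP definitions that are
/// treated as ground truth.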
#[derive(Debug)]
struct ReferenceRetrievalResult {
    cursor_path: Arc<RelPath>,
    cursor_point: Point,
    identifier: Identifier,
    excerpt_range: Option<Range<usize>>,
    lsp_definitions: Vec<SourceRange>,
    retrieved_definitions: Vec<RetrievedDefinition>,
}

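/// A declaration returned by syntax-index retrieval, along with the score and score components
/// used to rank it.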
#[derive(Debug)]
struct RetrievedDefinition {
    path: Arc<RelPath>,
    range: Range<Point>,
    score: f32,
    #[allow(dead_code)]
    retrieval_score: f32,
    #[allow(dead_code)]
    components: DeclarationScoreComponents,
}

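/// Definitions retrieved for one reference, plus the excerpt range used when gathering context.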
struct RetrieveResult {
    definitions: Vec<RetrievedDefinition>,
    excerpt_range: Option<Range<usize>>,
}

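/// Gathers edit-prediction context for a single reference and converts the scored declarations
/// into `RetrievedDefinition`s sorted by descending declaration score.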
async fn retrieve_definitions(
    reference: &Reference,
    imports: &Imports,
    query_point: Point,
    snapshot: &BufferSnapshot,
    index: &Arc<SyntaxIndexState>,
    file_snapshots: &Arc<HashMap<ProjectEntryId, BufferSnapshot>>,
    options: &Arc<zeta2::ZetaOptions>,
) -> Result<RetrieveResult> {
    let mut single_reference_map = HashMap::default();
    single_reference_map.insert(reference.identifier.clone(), vec![reference.clone()]);
    let edit_prediction_context = EditPredictionContext::gather_context_with_references_fn(
        query_point,
        snapshot,
        imports,
        &options.context,
        Some(&index),
        |_, _, _| single_reference_map,
    );

    let Some(edit_prediction_context) = edit_prediction_context else {
        return Ok(RetrieveResult {
            definitions: Vec::new(),
            excerpt_range: None,
        });
    };

    let mut retrieved_definitions = Vec::new();
    for scored_declaration in edit_prediction_context.declarations {
        match &scored_declaration.declaration {
            Declaration::File {
                project_entry_id,
                declaration,
                ..
            } => {
                let Some(snapshot) = file_snapshots.get(project_entry_id) else {
                    log::error!("bug: file project entry not found");
                    continue;
                };
                let path = snapshot.file().unwrap().path().clone();
                retrieved_definitions.push(RetrievedDefinition {
                    path,
                    range: snapshot.offset_to_point(declaration.item_range.start)
                        ..snapshot.offset_to_point(declaration.item_range.end),
                    score: scored_declaration.score(DeclarationStyle::Declaration),
                    retrieval_score: scored_declaration.retrieval_score(),
                    components: scored_declaration.components,
                });
            }
            Declaration::Buffer {
                project_entry_id,
                rope,
                declaration,
                ..
            } => {
                let Some(snapshot) = file_snapshots.get(project_entry_id) else {
                    // This case happens when dependency buffers have been opened by
                    // go-to-definition, resulting in single-file worktrees.
                    continue;
                };
                let path = snapshot.file().unwrap().path().clone();
                retrieved_definitions.push(RetrievedDefinition {
                    path,
                    range: rope.offset_to_point(declaration.item_range.start)
                        ..rope.offset_to_point(declaration.item_range.end),
                    score: scored_declaration.score(DeclarationStyle::Declaration),
                    retrieval_score: scored_declaration.retrieval_score(),
                    components: scored_declaration.components,
                });
            }
        }
    }
    retrieved_definitions.sort_by_key(|definition| Reverse(OrderedFloat(definition.score)));

    Ok(RetrieveResult {
        definitions: retrieved_definitions,
        excerpt_range: Some(edit_prediction_context.excerpt.range),
    })
}

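/// Asks the language server for the definitions of every reference in `files`, starting at
/// `start_index`, appending each file's results to the JSONL cache as it completes and inserting
/// them into `definitions`.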
async fn gather_lsp_definitions(
    lsp_definitions_path: &Path,
    start_index: usize,
    files: &[ProjectPath],
    worktree: &Entity<Worktree>,
    project: &Entity<Project>,
    definitions: &mut HashMap<SourceLocation, Vec<SourceRange>>,
    cx: &mut AsyncApp,
) -> Result<()> {
    let worktree_id = worktree.read_with(cx, |worktree, _cx| worktree.id())?;

    let lsp_store = project.read_with(cx, |project, _cx| project.lsp_store())?;
    cx.subscribe(&lsp_store, {
        move |_, event, _| {
            if let project::LspStoreEvent::LanguageServerUpdate {
                message:
                    client::proto::update_language_server::Variant::WorkProgress(
                        client::proto::LspWorkProgress {
                            message: Some(message),
                            ..
                        },
                    ),
                ..
            } = event
            {
                println!("⟲ {message}")
            }
        }
    })?
    .detach();

    let (cache_line_tx, mut cache_line_rx) = mpsc::unbounded::<FileLspDefinitions>();

    let cache_file = File::options()
        .append(true)
        .create(true)
        .open(lsp_definitions_path)
        .unwrap();

    let cache_task = cx.background_spawn(async move {
        let mut writer = BufWriter::new(cache_file);
        while let Some(line) = cache_line_rx.next().await {
            serde_json::to_writer(&mut writer, &line).unwrap();
            writer.write_all(&[b'\n']).unwrap();
        }
        writer.flush().unwrap();
    });

    let mut error_count = 0;
    let mut lsp_open_handles = Vec::new();
    let mut ready_languages = HashSet::default();
    for (file_index, project_path) in files[start_index..].iter().enumerate() {
        println!(
            "Processing file {} of {}: {}",
            start_index + file_index + 1,
            files.len(),
            project_path.path.display(PathStyle::Posix)
        );

        let Some((lsp_open_handle, language_server_id, buffer)) = open_buffer_with_language_server(
            project.clone(),
            worktree.clone(),
            project_path.path.clone(),
            &mut ready_languages,
            cx,
        )
        .await
        .log_err() else {
            continue;
        };
        lsp_open_handles.push(lsp_open_handle);

        let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot())?;
        let full_range = 0..snapshot.len();
        let references = references_in_range(
            full_range,
            &snapshot.text(),
            ReferenceRegion::Nearby,
            &snapshot,
        );

        loop {
            let is_ready = lsp_store
                .read_with(cx, |lsp_store, _cx| {
                    lsp_store
                        .language_server_statuses
                        .get(&language_server_id)
                        .is_some_and(|status| status.pending_work.is_empty())
                })
                .unwrap();
            if is_ready {
                break;
            }
            cx.background_executor()
                .timer(Duration::from_millis(10))
                .await;
        }

        let mut cache_line_references = Vec::with_capacity(references.len());

        for reference in references {
            // TODO: Rename declaration to definition in edit_prediction_context?
            let lsp_result = project
                .update(cx, |project, cx| {
                    project.definitions(&buffer, reference.range.start, cx)
                })?
                .await;

            match lsp_result {
                Ok(lsp_definitions) => {
                    let mut targets = Vec::new();
                    for target in lsp_definitions.unwrap_or_default() {
                        let buffer = target.target.buffer;
                        let anchor_range = target.target.range;
                        buffer.read_with(cx, |buffer, cx| {
                            let Some(file) = project::File::from_dyn(buffer.file()) else {
                                return;
                            };
                            let file_worktree = file.worktree.read(cx);
                            let file_worktree_id = file_worktree.id();
                            // Relative paths for worktree files, absolute for all others
                            let path = if worktree_id != file_worktree_id {
                                file.worktree.read(cx).absolutize(&file.path)
                            } else {
                                file.path.as_std_path().to_path_buf()
                            };
                            let offset_range = anchor_range.to_offset(&buffer);
                            let point_range = SerializablePoint::from_language_point_range(
                                offset_range.to_point(&buffer),
                            );
                            targets.push(SourceRange {
                                path,
                                offset_range,
                                point_range,
                            });
                        })?;
                    }

                    let point = snapshot.offset_to_point(reference.range.start);

                    cache_line_references.push((point.into(), targets.clone()));
                    definitions.insert(
                        SourceLocation {
                            path: project_path.path.clone(),
                            point,
                        },
                        targets,
                    );
                }
                Err(err) => {
                    log::error!("Language server error: {err}");
                    error_count += 1;
                }
            }
        }

        cache_line_tx
            .unbounded_send(FileLspDefinitions {
                path: project_path.path.as_unix_str().into(),
                references: cache_line_references,
            })
            .log_err();
    }

    drop(cache_line_tx);

    if error_count > 0 {
        log::error!("Encountered {} language server errors", error_count);
    }

    cache_task.await;

    Ok(())
}

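/// One line of the JSONL LSP cache: the definition ranges found for each reference in a single
/// file.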
#[derive(Serialize, Deserialize)]
struct FileLspDefinitions {
    path: Arc<str>,
    references: Vec<(SerializablePoint, Vec<SourceRange>)>,
}

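/// A definition target location, recorded both as a point range and an offset range. The path is
/// worktree-relative for files in the worktree and absolute for everything else.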
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SourceRange {
    path: PathBuf,
    point_range: Range<SerializablePoint>,
    offset_range: Range<usize>,
}

/// Serializes to 1-based row and column indices.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SerializablePoint {
    pub row: u32,
    pub column: u32,
}

impl SerializablePoint {
    pub fn into_language_point_range(range: Range<Self>) -> Range<Point> {
        range.start.into()..range.end.into()
    }

    pub fn from_language_point_range(range: Range<Point>) -> Range<Self> {
        range.start.into()..range.end.into()
    }
}

impl From<Point> for SerializablePoint {
    fn from(point: Point) -> Self {
        SerializablePoint {
            row: point.row + 1,
            column: point.column + 1,
        }
    }
}

impl From<SerializablePoint> for Point {
    fn from(serializable: SerializablePoint) -> Self {
        Point {
            row: serializable.row.saturating_sub(1),
            column: serializable.column.saturating_sub(1),
        }
    }
}