retrieval_stats.rs

   1use ::util::rel_path::RelPath;
   2use ::util::{RangeExt, ResultExt as _};
   3use anyhow::{Context as _, Result};
   4use cloud_llm_client::predict_edits_v3::DeclarationScoreComponents;
   5use edit_prediction_context::{
   6    Declaration, DeclarationStyle, EditPredictionContext, EditPredictionContextOptions, Identifier,
   7    Imports, Reference, ReferenceRegion, SyntaxIndex, SyntaxIndexState, references_in_range,
   8};
   9use futures::StreamExt as _;
  10use futures::channel::mpsc;
  11use gpui::Entity;
  12use gpui::{AppContext, AsyncApp};
  13use language::OffsetRangeExt;
  14use language::{BufferSnapshot, Point};
  15use ordered_float::OrderedFloat;
  16use polars::prelude::*;
  17use project::{Project, ProjectEntryId, ProjectPath, Worktree};
  18use serde::{Deserialize, Serialize};
  19use std::fs;
  20use std::{
  21    cmp::Reverse,
  22    collections::{HashMap, HashSet},
  23    fs::File,
  24    hash::{Hash, Hasher},
  25    io::{BufRead, BufReader, BufWriter, Write as _},
  26    ops::Range,
  27    path::{Path, PathBuf},
  28    sync::{
  29        Arc,
  30        atomic::{self, AtomicUsize},
  31    },
  32    time::Duration,
  33};
  34use util::paths::PathStyle;
  35use zeta2::ContextMode;
  36
  37use crate::headless::ZetaCliAppState;
  38use crate::source_location::SourceLocation;
  39use crate::util::{open_buffer, open_buffer_with_language_server};
  40
  41pub async fn retrieval_stats(
  42    worktree: PathBuf,
  43    app_state: Arc<ZetaCliAppState>,
  44    only_extension: Option<String>,
  45    file_limit: Option<usize>,
  46    skip_files: Option<usize>,
  47    options: zeta2::ZetaOptions,
  48    cx: &mut AsyncApp,
  49) -> Result<String> {
  50    let ContextMode::Syntax(context_options) = options.context.clone() else {
  51        anyhow::bail!("retrieval stats only works in ContextMode::Syntax");
  52    };
  53
  54    let options = Arc::new(options);
  55    let worktree_path = worktree.canonicalize()?;
  56
  57    let project = cx.update(|cx| {
  58        Project::local(
  59            app_state.client.clone(),
  60            app_state.node_runtime.clone(),
  61            app_state.user_store.clone(),
  62            app_state.languages.clone(),
  63            app_state.fs.clone(),
  64            None,
  65            cx,
  66        )
  67    })?;
  68
  69    let worktree = project
  70        .update(cx, |project, cx| {
  71            project.create_worktree(&worktree_path, true, cx)
  72        })?
  73        .await?;
  74
  75    // wait for worktree scan so that wait_for_initial_file_indexing waits for the whole worktree.
  76    worktree
  77        .read_with(cx, |worktree, _cx| {
  78            worktree.as_local().unwrap().scan_complete()
  79        })?
  80        .await;
  81
  82    let index = cx.new(|cx| SyntaxIndex::new(&project, options.file_indexing_parallelism, cx))?;
  83    index
  84        .read_with(cx, |index, cx| index.wait_for_initial_file_indexing(cx))?
  85        .await?;
  86    let indexed_files = index
  87        .read_with(cx, |index, cx| index.indexed_file_paths(cx))?
  88        .await;
  89    let mut filtered_files = indexed_files
  90        .into_iter()
  91        .filter(|project_path| {
  92            let file_extension = project_path.path.extension();
  93            if let Some(only_extension) = only_extension.as_ref() {
  94                file_extension.is_some_and(|extension| extension == only_extension)
  95            } else {
  96                file_extension
  97                    .is_some_and(|extension| !["md", "json", "sh", "diff"].contains(&extension))
  98            }
  99        })
 100        .collect::<Vec<_>>();
 101    filtered_files.sort_by(|a, b| a.path.cmp(&b.path));
 102
 103    let index_state = index.read_with(cx, |index, _cx| index.state().clone())?;
 104    cx.update(|_| {
 105        drop(index);
 106    })?;
 107    let index_state = Arc::new(
 108        Arc::into_inner(index_state)
 109            .context("Index state had more than 1 reference")?
 110            .into_inner(),
 111    );
 112
 113    struct FileSnapshot {
 114        project_entry_id: ProjectEntryId,
 115        snapshot: BufferSnapshot,
 116        hash: u64,
 117        parent_abs_path: Arc<Path>,
 118    }
 119
 120    let files: Vec<FileSnapshot> = futures::future::try_join_all({
 121        filtered_files
 122            .iter()
 123            .map(|file| {
 124                let buffer_task =
 125                    open_buffer(project.clone(), worktree.clone(), file.path.clone(), cx);
 126                cx.spawn(async move |cx| {
 127                    let buffer = buffer_task.await?;
 128                    let (project_entry_id, parent_abs_path, snapshot) =
 129                        buffer.read_with(cx, |buffer, cx| {
 130                            let file = project::File::from_dyn(buffer.file()).unwrap();
 131                            let project_entry_id = file.project_entry_id().unwrap();
 132                            let mut parent_abs_path = file.worktree.read(cx).absolutize(&file.path);
 133                            if !parent_abs_path.pop() {
 134                                panic!("Invalid worktree path");
 135                            }
 136
 137                            (project_entry_id, parent_abs_path, buffer.snapshot())
 138                        })?;
 139
 140                    anyhow::Ok(
 141                        cx.background_spawn(async move {
 142                            let mut hasher = collections::FxHasher::default();
 143                            snapshot.text().hash(&mut hasher);
 144                            FileSnapshot {
 145                                project_entry_id,
 146                                snapshot,
 147                                hash: hasher.finish(),
 148                                parent_abs_path: parent_abs_path.into(),
 149                            }
 150                        })
 151                        .await,
 152                    )
 153                })
 154            })
 155            .collect::<Vec<_>>()
 156    })
 157    .await?;
 158
 159    let mut file_snapshots = HashMap::default();
 160    let mut hasher = collections::FxHasher::default();
 161    for FileSnapshot {
 162        project_entry_id,
 163        snapshot,
 164        hash,
 165        ..
 166    } in &files
 167    {
 168        file_snapshots.insert(*project_entry_id, snapshot.clone());
 169        hash.hash(&mut hasher);
 170    }
 171    let files_hash = hasher.finish();
 172    let file_snapshots = Arc::new(file_snapshots);
 173    let target_cli_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../target/zeta_cli");
 174    fs::create_dir_all(&target_cli_dir).unwrap();
 175    let target_cli_dir = target_cli_dir.canonicalize().unwrap();
 176
 177    let lsp_cache_dir = target_cli_dir.join("cache");
 178    fs::create_dir_all(&lsp_cache_dir).unwrap();
 179
 180    let lsp_definitions_path = lsp_cache_dir.join(format!(
 181        "{}-{:x}.jsonl",
 182        worktree_path.file_stem().unwrap_or_default().display(),
 183        files_hash
 184    ));
 185
 186    let mut lsp_definitions = HashMap::default();
 187    let mut lsp_files = 0;
 188
 189    if fs::exists(&lsp_definitions_path)? {
 190        log::info!(
 191            "Using cached LSP definitions from {}",
 192            lsp_definitions_path.display()
 193        );
 194
 195        let file = File::options()
 196            .read(true)
 197            .write(true)
 198            .open(&lsp_definitions_path)?;
 199        let lines = BufReader::new(&file).lines();
 200        let mut valid_len: usize = 0;
 201
 202        for (line, expected_file) in lines.zip(files.iter()) {
 203            let line = line?;
 204            let FileLspDefinitions { path, references } = match serde_json::from_str(&line) {
 205                Ok(ok) => ok,
 206                Err(_) => {
 207                    log::error!("Found invalid cache line. Truncating to #{lsp_files}.",);
 208                    file.set_len(valid_len as u64)?;
 209                    break;
 210                }
 211            };
 212            let expected_path = expected_file.snapshot.file().unwrap().path().as_unix_str();
 213            if expected_path != path.as_ref() {
 214                log::error!(
 215                    "Expected file #{} to be {expected_path}, but found {path}. Truncating to #{lsp_files}.",
 216                    lsp_files + 1
 217                );
 218                file.set_len(valid_len as u64)?;
 219                break;
 220            }
 221            for (point, ranges) in references {
 222                let Ok(path) = RelPath::new(Path::new(path.as_ref()), PathStyle::Posix) else {
 223                    log::warn!("Invalid path: {}", path);
 224                    continue;
 225                };
 226                lsp_definitions.insert(
 227                    SourceLocation {
 228                        path: path.into_arc(),
 229                        point: point.into(),
 230                    },
 231                    ranges,
 232                );
 233            }
 234            lsp_files += 1;
 235            valid_len += line.len() + 1
 236        }
 237    }
 238
 239    if lsp_files < files.len() {
 240        if lsp_files == 0 {
 241            log::warn!(
 242                "No LSP definitions found, populating {}",
 243                lsp_definitions_path.display()
 244            );
 245        } else {
 246            log::warn!("{} files missing from LSP cache", files.len() - lsp_files);
 247        }
 248
 249        gather_lsp_definitions(
 250            &lsp_definitions_path,
 251            lsp_files,
 252            &filtered_files,
 253            &worktree,
 254            &project,
 255            &mut lsp_definitions,
 256            cx,
 257        )
 258        .await?;
 259    }
 260    let files_len = files.len().min(file_limit.unwrap_or(usize::MAX));
 261    let done_count = Arc::new(AtomicUsize::new(0));
 262
 263    let (output_tx, output_rx) = mpsc::unbounded::<ReferenceRetrievalResult>();
 264
 265    let tasks = files
 266        .into_iter()
 267        .skip(skip_files.unwrap_or(0))
 268        .take(file_limit.unwrap_or(usize::MAX))
 269        .map(|project_file| {
 270            let index_state = index_state.clone();
 271            let lsp_definitions = lsp_definitions.clone();
 272            let output_tx = output_tx.clone();
 273            let done_count = done_count.clone();
 274            let file_snapshots = file_snapshots.clone();
 275            let context_options = context_options.clone();
 276            cx.background_spawn(async move {
 277                let snapshot = project_file.snapshot;
 278
 279                let full_range = 0..snapshot.len();
 280                let references = references_in_range(
 281                    full_range,
 282                    &snapshot.text(),
 283                    ReferenceRegion::Nearby,
 284                    &snapshot,
 285                );
 286
 287                let imports = if context_options.use_imports {
 288                    Imports::gather(&snapshot, Some(&project_file.parent_abs_path))
 289                } else {
 290                    Imports::default()
 291                };
 292
 293                let path = snapshot.file().unwrap().path();
 294
 295                for reference in references {
 296                    let query_point = snapshot.offset_to_point(reference.range.start);
 297                    let source_location = SourceLocation {
 298                        path: path.clone(),
 299                        point: query_point,
 300                    };
 301                    let lsp_definitions = lsp_definitions
 302                        .get(&source_location)
 303                        .cloned()
 304                        .unwrap_or_else(|| {
 305                            log::warn!(
 306                                "No definitions found for source location: {:?}",
 307                                source_location
 308                            );
 309                            Vec::new()
 310                        });
 311
 312                    let retrieve_result = retrieve_definitions(
 313                        &reference,
 314                        &imports,
 315                        query_point,
 316                        &snapshot,
 317                        &index_state,
 318                        &file_snapshots,
 319                        &context_options,
 320                    )
 321                    .await?;
 322
 323                    let result = ReferenceRetrievalResult {
 324                        cursor_path: path.clone(),
 325                        identifier: reference.identifier,
 326                        cursor_point: query_point,
 327                        lsp_definitions,
 328                        retrieved_definitions: retrieve_result.definitions,
 329                        excerpt_range: retrieve_result.excerpt_range,
 330                    };
 331
 332                    output_tx.unbounded_send(result).ok();
 333                }
 334
 335                println!(
 336                    "{:02}/{:02} done",
 337                    done_count.fetch_add(1, atomic::Ordering::Relaxed) + 1,
 338                    files_len,
 339                );
 340
 341                anyhow::Ok(())
 342            })
 343        })
 344        .collect::<Vec<_>>();
 345
 346    drop(output_tx);
 347
 348    let df_task = cx.background_spawn(build_dataframe(output_rx));
 349
 350    futures::future::try_join_all(tasks).await?;
 351    let mut df = df_task.await?;
 352
 353    let run_id = format!(
 354        "{}-{}",
 355        worktree_path.file_stem().unwrap_or_default().display(),
 356        chrono::Local::now().format("%Y%m%d_%H%M%S")
 357    );
 358    let run_dir = target_cli_dir.join(run_id);
 359    fs::create_dir(&run_dir).unwrap();
 360
 361    let parquet_path = run_dir.join("stats.parquet");
 362    let mut parquet_file = fs::File::create(&parquet_path)?;
 363
 364    ParquetWriter::new(&mut parquet_file)
 365        .finish(&mut df)
 366        .unwrap();
 367
 368    let stats = SummaryStats::from_dataframe(df)?;
 369
 370    let stats_path = run_dir.join("stats.txt");
 371    fs::write(&stats_path, format!("{}", stats))?;
 372
 373    println!("{}", stats);
 374    println!("\nWrote:");
 375    println!("- {}", relativize_path(&parquet_path).display());
 376    println!("- {}", relativize_path(&stats_path).display());
 377    println!("- {}", relativize_path(&lsp_definitions_path).display());
 378
 379    Ok("".to_string())
 380}
 381
 382async fn build_dataframe(
 383    mut output_rx: mpsc::UnboundedReceiver<ReferenceRetrievalResult>,
 384) -> Result<DataFrame> {
 385    use soa_rs::{Soa, Soars};
 386
 387    #[derive(Default, Soars)]
 388    struct Row {
 389        ref_id: u32,
 390        cursor_path: String,
 391        cursor_row: u32,
 392        cursor_column: u32,
 393        cursor_identifier: String,
 394        gold_in_excerpt: bool,
 395        gold_path: String,
 396        gold_row: u32,
 397        gold_column: u32,
 398        gold_is_external: bool,
 399        candidate_count: u32,
 400        candidate_path: Option<String>,
 401        candidate_row: Option<u32>,
 402        candidate_column: Option<u32>,
 403        candidate_is_gold: Option<bool>,
 404        candidate_rank: Option<u32>,
 405        candidate_is_same_file: Option<bool>,
 406        candidate_is_referenced_nearby: Option<bool>,
 407        candidate_is_referenced_in_breadcrumb: Option<bool>,
 408        candidate_reference_count: Option<u32>,
 409        candidate_same_file_declaration_count: Option<u32>,
 410        candidate_declaration_count: Option<u32>,
 411        candidate_reference_line_distance: Option<u32>,
 412        candidate_declaration_line_distance: Option<u32>,
 413        candidate_excerpt_vs_item_jaccard: Option<f32>,
 414        candidate_excerpt_vs_signature_jaccard: Option<f32>,
 415        candidate_adjacent_vs_item_jaccard: Option<f32>,
 416        candidate_adjacent_vs_signature_jaccard: Option<f32>,
 417        candidate_excerpt_vs_item_weighted_overlap: Option<f32>,
 418        candidate_excerpt_vs_signature_weighted_overlap: Option<f32>,
 419        candidate_adjacent_vs_item_weighted_overlap: Option<f32>,
 420        candidate_adjacent_vs_signature_weighted_overlap: Option<f32>,
 421        candidate_path_import_match_count: Option<u32>,
 422        candidate_wildcard_path_import_match_count: Option<u32>,
 423        candidate_import_similarity: Option<f32>,
 424        candidate_max_import_similarity: Option<f32>,
 425        candidate_normalized_import_similarity: Option<f32>,
 426        candidate_wildcard_import_similarity: Option<f32>,
 427        candidate_normalized_wildcard_import_similarity: Option<f32>,
 428        candidate_included_by_others: Option<u32>,
 429        candidate_includes_others: Option<u32>,
 430    }
 431    let mut rows = Soa::<Row>::new();
 432    let mut next_ref_id = 0;
 433
 434    while let Some(result) = output_rx.next().await {
 435        let mut gold_is_external = false;
 436        let mut gold_in_excerpt = false;
 437        let cursor_path = result.cursor_path.as_unix_str();
 438        let cursor_row = result.cursor_point.row + 1;
 439        let cursor_column = result.cursor_point.column + 1;
 440        let cursor_identifier = result.identifier.name.to_string();
 441        let ref_id = next_ref_id;
 442        next_ref_id += 1;
 443
 444        for lsp_definition in result.lsp_definitions {
 445            let SourceRange {
 446                path: gold_path,
 447                point_range: gold_point_range,
 448                offset_range: gold_offset_range,
 449            } = lsp_definition;
 450            let lsp_point_range =
 451                SerializablePoint::into_language_point_range(gold_point_range.clone());
 452
 453            gold_is_external = gold_is_external
 454                || gold_path.is_absolute()
 455                || gold_path
 456                    .components()
 457                    .any(|component| component.as_os_str() == "node_modules");
 458
 459            gold_in_excerpt = gold_in_excerpt
 460                || result.excerpt_range.as_ref().is_some_and(|excerpt_range| {
 461                    excerpt_range.contains_inclusive(&gold_offset_range)
 462                });
 463
 464            let gold_row = gold_point_range.start.row;
 465            let gold_column = gold_point_range.start.column;
 466            let candidate_count = result.retrieved_definitions.len() as u32;
 467
 468            for (candidate_rank, retrieved_definition) in
 469                result.retrieved_definitions.iter().enumerate()
 470            {
 471                let candidate_is_gold = gold_path.as_path()
 472                    == retrieved_definition.path.as_std_path()
 473                    && retrieved_definition
 474                        .range
 475                        .contains_inclusive(&lsp_point_range);
 476
 477                let candidate_row = retrieved_definition.range.start.row + 1;
 478                let candidate_column = retrieved_definition.range.start.column + 1;
 479
 480                let DeclarationScoreComponents {
 481                    is_same_file,
 482                    is_referenced_nearby,
 483                    is_referenced_in_breadcrumb,
 484                    reference_count,
 485                    same_file_declaration_count,
 486                    declaration_count,
 487                    reference_line_distance,
 488                    declaration_line_distance,
 489                    excerpt_vs_item_jaccard,
 490                    excerpt_vs_signature_jaccard,
 491                    adjacent_vs_item_jaccard,
 492                    adjacent_vs_signature_jaccard,
 493                    excerpt_vs_item_weighted_overlap,
 494                    excerpt_vs_signature_weighted_overlap,
 495                    adjacent_vs_item_weighted_overlap,
 496                    adjacent_vs_signature_weighted_overlap,
 497                    path_import_match_count,
 498                    wildcard_path_import_match_count,
 499                    import_similarity,
 500                    max_import_similarity,
 501                    normalized_import_similarity,
 502                    wildcard_import_similarity,
 503                    normalized_wildcard_import_similarity,
 504                    included_by_others,
 505                    includes_others,
 506                } = retrieved_definition.components;
 507
 508                rows.push(Row {
 509                    ref_id,
 510                    cursor_path: cursor_path.to_string(),
 511                    cursor_row,
 512                    cursor_column,
 513                    cursor_identifier: cursor_identifier.clone(),
 514                    gold_in_excerpt,
 515                    gold_path: gold_path.to_string_lossy().to_string(),
 516                    gold_row,
 517                    gold_column,
 518                    gold_is_external,
 519                    candidate_count,
 520                    candidate_path: Some(retrieved_definition.path.as_unix_str().to_string()),
 521                    candidate_row: Some(candidate_row),
 522                    candidate_column: Some(candidate_column),
 523                    candidate_is_gold: Some(candidate_is_gold),
 524                    candidate_rank: Some(candidate_rank as u32),
 525                    candidate_is_same_file: Some(is_same_file),
 526                    candidate_is_referenced_nearby: Some(is_referenced_nearby),
 527                    candidate_is_referenced_in_breadcrumb: Some(is_referenced_in_breadcrumb),
 528                    candidate_reference_count: Some(reference_count as u32),
 529                    candidate_same_file_declaration_count: Some(same_file_declaration_count as u32),
 530                    candidate_declaration_count: Some(declaration_count as u32),
 531                    candidate_reference_line_distance: Some(reference_line_distance),
 532                    candidate_declaration_line_distance: Some(declaration_line_distance),
 533                    candidate_excerpt_vs_item_jaccard: Some(excerpt_vs_item_jaccard),
 534                    candidate_excerpt_vs_signature_jaccard: Some(excerpt_vs_signature_jaccard),
 535                    candidate_adjacent_vs_item_jaccard: Some(adjacent_vs_item_jaccard),
 536                    candidate_adjacent_vs_signature_jaccard: Some(adjacent_vs_signature_jaccard),
 537                    candidate_excerpt_vs_item_weighted_overlap: Some(
 538                        excerpt_vs_item_weighted_overlap,
 539                    ),
 540                    candidate_excerpt_vs_signature_weighted_overlap: Some(
 541                        excerpt_vs_signature_weighted_overlap,
 542                    ),
 543                    candidate_adjacent_vs_item_weighted_overlap: Some(
 544                        adjacent_vs_item_weighted_overlap,
 545                    ),
 546                    candidate_adjacent_vs_signature_weighted_overlap: Some(
 547                        adjacent_vs_signature_weighted_overlap,
 548                    ),
 549                    candidate_path_import_match_count: Some(path_import_match_count as u32),
 550                    candidate_wildcard_path_import_match_count: Some(
 551                        wildcard_path_import_match_count as u32,
 552                    ),
 553                    candidate_import_similarity: Some(import_similarity),
 554                    candidate_max_import_similarity: Some(max_import_similarity),
 555                    candidate_normalized_import_similarity: Some(normalized_import_similarity),
 556                    candidate_wildcard_import_similarity: Some(wildcard_import_similarity),
 557                    candidate_normalized_wildcard_import_similarity: Some(
 558                        normalized_wildcard_import_similarity,
 559                    ),
 560                    candidate_included_by_others: Some(included_by_others as u32),
 561                    candidate_includes_others: Some(includes_others as u32),
 562                });
 563            }
 564
 565            if result.retrieved_definitions.is_empty() {
 566                rows.push(Row {
 567                    ref_id,
 568                    cursor_path: cursor_path.to_string(),
 569                    cursor_row,
 570                    cursor_column,
 571                    cursor_identifier: cursor_identifier.clone(),
 572                    gold_in_excerpt,
 573                    gold_path: gold_path.to_string_lossy().to_string(),
 574                    gold_row,
 575                    gold_column,
 576                    gold_is_external,
 577                    candidate_count,
 578                    ..Default::default()
 579                });
 580            }
 581        }
 582    }
 583    let slices = rows.slices();
 584
 585    let RowSlices {
 586        ref_id,
 587        cursor_path,
 588        cursor_row,
 589        cursor_column,
 590        cursor_identifier,
 591        gold_in_excerpt,
 592        gold_path,
 593        gold_row,
 594        gold_column,
 595        gold_is_external,
 596        candidate_path,
 597        candidate_row,
 598        candidate_column,
 599        candidate_is_gold,
 600        candidate_rank,
 601        candidate_count,
 602        candidate_is_same_file,
 603        candidate_is_referenced_nearby,
 604        candidate_is_referenced_in_breadcrumb,
 605        candidate_reference_count,
 606        candidate_same_file_declaration_count,
 607        candidate_declaration_count,
 608        candidate_reference_line_distance,
 609        candidate_declaration_line_distance,
 610        candidate_excerpt_vs_item_jaccard,
 611        candidate_excerpt_vs_signature_jaccard,
 612        candidate_adjacent_vs_item_jaccard,
 613        candidate_adjacent_vs_signature_jaccard,
 614        candidate_excerpt_vs_item_weighted_overlap,
 615        candidate_excerpt_vs_signature_weighted_overlap,
 616        candidate_adjacent_vs_item_weighted_overlap,
 617        candidate_adjacent_vs_signature_weighted_overlap,
 618        candidate_path_import_match_count,
 619        candidate_wildcard_path_import_match_count,
 620        candidate_import_similarity,
 621        candidate_max_import_similarity,
 622        candidate_normalized_import_similarity,
 623        candidate_wildcard_import_similarity,
 624        candidate_normalized_wildcard_import_similarity,
 625        candidate_included_by_others,
 626        candidate_includes_others,
 627    } = slices;
 628
 629    let df = DataFrame::new(vec![
 630        Series::new(PlSmallStr::from_str("ref_id"), ref_id).into(),
 631        Series::new(PlSmallStr::from_str("cursor_path"), cursor_path).into(),
 632        Series::new(PlSmallStr::from_str("cursor_row"), cursor_row).into(),
 633        Series::new(PlSmallStr::from_str("cursor_column"), cursor_column).into(),
 634        Series::new(PlSmallStr::from_str("cursor_identifier"), cursor_identifier).into(),
 635        Series::new(PlSmallStr::from_str("gold_in_excerpt"), gold_in_excerpt).into(),
 636        Series::new(PlSmallStr::from_str("gold_path"), gold_path).into(),
 637        Series::new(PlSmallStr::from_str("gold_row"), gold_row).into(),
 638        Series::new(PlSmallStr::from_str("gold_column"), gold_column).into(),
 639        Series::new(PlSmallStr::from_str("gold_is_external"), gold_is_external).into(),
 640        Series::new(PlSmallStr::from_str("candidate_count"), candidate_count).into(),
 641        Series::new(PlSmallStr::from_str("candidate_path"), candidate_path).into(),
 642        Series::new(PlSmallStr::from_str("candidate_row"), candidate_row).into(),
 643        Series::new(PlSmallStr::from_str("candidate_column"), candidate_column).into(),
 644        Series::new(PlSmallStr::from_str("candidate_is_gold"), candidate_is_gold).into(),
 645        Series::new(PlSmallStr::from_str("candidate_rank"), candidate_rank).into(),
 646        Series::new(
 647            PlSmallStr::from_str("candidate_is_same_file"),
 648            candidate_is_same_file,
 649        )
 650        .into(),
 651        Series::new(
 652            PlSmallStr::from_str("candidate_is_referenced_nearby"),
 653            candidate_is_referenced_nearby,
 654        )
 655        .into(),
 656        Series::new(
 657            PlSmallStr::from_str("candidate_is_referenced_in_breadcrumb"),
 658            candidate_is_referenced_in_breadcrumb,
 659        )
 660        .into(),
 661        Series::new(
 662            PlSmallStr::from_str("candidate_reference_count"),
 663            candidate_reference_count,
 664        )
 665        .into(),
 666        Series::new(
 667            PlSmallStr::from_str("candidate_same_file_declaration_count"),
 668            candidate_same_file_declaration_count,
 669        )
 670        .into(),
 671        Series::new(
 672            PlSmallStr::from_str("candidate_declaration_count"),
 673            candidate_declaration_count,
 674        )
 675        .into(),
 676        Series::new(
 677            PlSmallStr::from_str("candidate_reference_line_distance"),
 678            candidate_reference_line_distance,
 679        )
 680        .into(),
 681        Series::new(
 682            PlSmallStr::from_str("candidate_declaration_line_distance"),
 683            candidate_declaration_line_distance,
 684        )
 685        .into(),
 686        Series::new(
 687            PlSmallStr::from_str("candidate_excerpt_vs_item_jaccard"),
 688            candidate_excerpt_vs_item_jaccard,
 689        )
 690        .into(),
 691        Series::new(
 692            PlSmallStr::from_str("candidate_excerpt_vs_signature_jaccard"),
 693            candidate_excerpt_vs_signature_jaccard,
 694        )
 695        .into(),
 696        Series::new(
 697            PlSmallStr::from_str("candidate_adjacent_vs_item_jaccard"),
 698            candidate_adjacent_vs_item_jaccard,
 699        )
 700        .into(),
 701        Series::new(
 702            PlSmallStr::from_str("candidate_adjacent_vs_signature_jaccard"),
 703            candidate_adjacent_vs_signature_jaccard,
 704        )
 705        .into(),
 706        Series::new(
 707            PlSmallStr::from_str("candidate_excerpt_vs_item_weighted_overlap"),
 708            candidate_excerpt_vs_item_weighted_overlap,
 709        )
 710        .into(),
 711        Series::new(
 712            PlSmallStr::from_str("candidate_excerpt_vs_signature_weighted_overlap"),
 713            candidate_excerpt_vs_signature_weighted_overlap,
 714        )
 715        .into(),
 716        Series::new(
 717            PlSmallStr::from_str("candidate_adjacent_vs_item_weighted_overlap"),
 718            candidate_adjacent_vs_item_weighted_overlap,
 719        )
 720        .into(),
 721        Series::new(
 722            PlSmallStr::from_str("candidate_adjacent_vs_signature_weighted_overlap"),
 723            candidate_adjacent_vs_signature_weighted_overlap,
 724        )
 725        .into(),
 726        Series::new(
 727            PlSmallStr::from_str("candidate_path_import_match_count"),
 728            candidate_path_import_match_count,
 729        )
 730        .into(),
 731        Series::new(
 732            PlSmallStr::from_str("candidate_wildcard_path_import_match_count"),
 733            candidate_wildcard_path_import_match_count,
 734        )
 735        .into(),
 736        Series::new(
 737            PlSmallStr::from_str("candidate_import_similarity"),
 738            candidate_import_similarity,
 739        )
 740        .into(),
 741        Series::new(
 742            PlSmallStr::from_str("candidate_max_import_similarity"),
 743            candidate_max_import_similarity,
 744        )
 745        .into(),
 746        Series::new(
 747            PlSmallStr::from_str("candidate_normalized_import_similarity"),
 748            candidate_normalized_import_similarity,
 749        )
 750        .into(),
 751        Series::new(
 752            PlSmallStr::from_str("candidate_wildcard_import_similarity"),
 753            candidate_wildcard_import_similarity,
 754        )
 755        .into(),
 756        Series::new(
 757            PlSmallStr::from_str("candidate_normalized_wildcard_import_similarity"),
 758            candidate_normalized_wildcard_import_similarity,
 759        )
 760        .into(),
 761        Series::new(
 762            PlSmallStr::from_str("candidate_included_by_others"),
 763            candidate_included_by_others,
 764        )
 765        .into(),
 766        Series::new(
 767            PlSmallStr::from_str("candidate_includes_others"),
 768            candidate_includes_others,
 769        )
 770        .into(),
 771    ])?;
 772
 773    Ok(df)
 774}
 775
 776fn relativize_path(path: &Path) -> &Path {
 777    path.strip_prefix(std::env::current_dir().unwrap())
 778        .unwrap_or(path)
 779}
 780
 781struct SummaryStats {
 782    references_count: u32,
 783    retrieved_count: u32,
 784    top_match_count: u32,
 785    non_top_match_count: u32,
 786    ranking_involved_top_match_count: u32,
 787    missing_none_retrieved: u32,
 788    missing_wrong_retrieval: u32,
 789    missing_external: u32,
 790    in_excerpt_count: u32,
 791}
 792
 793impl SummaryStats {
 794    fn from_dataframe(df: DataFrame) -> Result<Self> {
 795        // TODO: use lazy more
 796        let unique_refs =
 797            df.unique::<(), ()>(Some(&["ref_id".into()]), UniqueKeepStrategy::Any, None)?;
 798        let references_count = unique_refs.height() as u32;
 799
 800        let gold_mask = df.column("candidate_is_gold")?.bool()?;
 801        let gold_df = df.filter(&gold_mask)?;
 802        let retrieved_count = gold_df.height() as u32;
 803
 804        let top_match_mask = gold_df.column("candidate_rank")?.u32()?.equal(0);
 805        let top_match_df = gold_df.filter(&top_match_mask)?;
 806        let top_match_count = top_match_df.height() as u32;
 807
 808        let ranking_involved_top_match_count = top_match_df
 809            .column("candidate_count")?
 810            .u32()?
 811            .gt(1)
 812            .sum()
 813            .unwrap_or_default();
 814
 815        let non_top_match_count = (!top_match_mask).sum().unwrap_or(0);
 816
 817        let not_retrieved_df = df
 818            .lazy()
 819            .group_by(&[col("ref_id"), col("candidate_count")])
 820            .agg(&[
 821                col("candidate_is_gold")
 822                    .fill_null(false)
 823                    .sum()
 824                    .alias("gold_count"),
 825                col("gold_in_excerpt").sum().alias("gold_in_excerpt_count"),
 826                col("gold_is_external")
 827                    .sum()
 828                    .alias("gold_is_external_count"),
 829            ])
 830            .filter(col("gold_count").eq(lit(0)))
 831            .collect()?;
 832
 833        let in_excerpt_mask = not_retrieved_df
 834            .column("gold_in_excerpt_count")?
 835            .u32()?
 836            .gt(0);
 837        let in_excerpt_count = in_excerpt_mask.sum().unwrap_or(0);
 838
 839        let missing_df = not_retrieved_df.filter(&!in_excerpt_mask)?;
 840
 841        let missing_none_retrieved_mask = missing_df.column("candidate_count")?.u32()?.equal(0);
 842        let missing_none_retrieved = missing_none_retrieved_mask.sum().unwrap_or(0);
 843        let external_mask = missing_df.column("gold_is_external_count")?.u32()?.gt(0);
 844        let missing_external = (missing_none_retrieved_mask & external_mask)
 845            .sum()
 846            .unwrap_or(0);
 847
 848        let missing_wrong_retrieval = missing_df
 849            .column("candidate_count")?
 850            .u32()?
 851            .gt(0)
 852            .sum()
 853            .unwrap_or(0);
 854
 855        Ok(SummaryStats {
 856            references_count,
 857            retrieved_count,
 858            top_match_count,
 859            non_top_match_count,
 860            ranking_involved_top_match_count,
 861            missing_none_retrieved,
 862            missing_wrong_retrieval,
 863            missing_external,
 864            in_excerpt_count,
 865        })
 866    }
 867
 868    fn count_and_percentage(part: u32, total: u32) -> String {
 869        format!("{} ({:.2}%)", part, (part as f64 / total as f64) * 100.0)
 870    }
 871}
 872
 873impl std::fmt::Display for SummaryStats {
 874    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 875        let included = self.in_excerpt_count + self.retrieved_count;
 876        let missing = self.references_count - included;
 877        writeln!(f)?;
 878        writeln!(f, "╮ references: {}", self.references_count)?;
 879        writeln!(
 880            f,
 881            "├─╮ included: {}",
 882            Self::count_and_percentage(included, self.references_count),
 883        )?;
 884        writeln!(
 885            f,
 886            "│ ├─╮ retrieved: {}",
 887            Self::count_and_percentage(self.retrieved_count, self.references_count)
 888        )?;
 889        writeln!(
 890            f,
 891            "│ │ ├─╮ top match : {}",
 892            Self::count_and_percentage(self.top_match_count, self.retrieved_count)
 893        )?;
 894        writeln!(
 895            f,
 896            "│ │ │ ╰─╴ involving ranking: {}",
 897            Self::count_and_percentage(self.ranking_involved_top_match_count, self.top_match_count)
 898        )?;
 899        writeln!(
 900            f,
 901            "│ │ ╰─╴ non-top match: {}",
 902            Self::count_and_percentage(self.non_top_match_count, self.retrieved_count)
 903        )?;
 904        writeln!(
 905            f,
 906            "│ ╰─╴ in excerpt: {}",
 907            Self::count_and_percentage(self.in_excerpt_count, included)
 908        )?;
 909        writeln!(
 910            f,
 911            "╰─╮ missing: {}",
 912            Self::count_and_percentage(missing, self.references_count)
 913        )?;
 914        writeln!(
 915            f,
 916            "  ├─╮ none retrieved: {}",
 917            Self::count_and_percentage(self.missing_none_retrieved, missing)
 918        )?;
 919        writeln!(
 920            f,
 921            "  │ ╰─╴ external (expected): {}",
 922            Self::count_and_percentage(self.missing_external, missing)
 923        )?;
 924        writeln!(
 925            f,
 926            "  ╰─╴ wrong retrieval: {}",
 927            Self::count_and_percentage(self.missing_wrong_retrieval, missing)
 928        )?;
 929        Ok(())
 930    }
 931}
 932
 933#[derive(Debug)]
 934struct ReferenceRetrievalResult {
 935    cursor_path: Arc<RelPath>,
 936    cursor_point: Point,
 937    identifier: Identifier,
 938    excerpt_range: Option<Range<usize>>,
 939    lsp_definitions: Vec<SourceRange>,
 940    retrieved_definitions: Vec<RetrievedDefinition>,
 941}
 942
 943#[derive(Debug)]
 944struct RetrievedDefinition {
 945    path: Arc<RelPath>,
 946    range: Range<Point>,
 947    score: f32,
 948    #[allow(dead_code)]
 949    retrieval_score: f32,
 950    #[allow(dead_code)]
 951    components: DeclarationScoreComponents,
 952}
 953
 954struct RetrieveResult {
 955    definitions: Vec<RetrievedDefinition>,
 956    excerpt_range: Option<Range<usize>>,
 957}
 958
 959async fn retrieve_definitions(
 960    reference: &Reference,
 961    imports: &Imports,
 962    query_point: Point,
 963    snapshot: &BufferSnapshot,
 964    index: &Arc<SyntaxIndexState>,
 965    file_snapshots: &Arc<HashMap<ProjectEntryId, BufferSnapshot>>,
 966    context_options: &EditPredictionContextOptions,
 967) -> Result<RetrieveResult> {
 968    let mut single_reference_map = HashMap::default();
 969    single_reference_map.insert(reference.identifier.clone(), vec![reference.clone()]);
 970    let edit_prediction_context = EditPredictionContext::gather_context_with_references_fn(
 971        query_point,
 972        snapshot,
 973        imports,
 974        &context_options,
 975        Some(&index),
 976        |_, _, _| single_reference_map,
 977    );
 978
 979    let Some(edit_prediction_context) = edit_prediction_context else {
 980        return Ok(RetrieveResult {
 981            definitions: Vec::new(),
 982            excerpt_range: None,
 983        });
 984    };
 985
 986    let mut retrieved_definitions = Vec::new();
 987    for scored_declaration in edit_prediction_context.declarations {
 988        match &scored_declaration.declaration {
 989            Declaration::File {
 990                project_entry_id,
 991                declaration,
 992                ..
 993            } => {
 994                let Some(snapshot) = file_snapshots.get(&project_entry_id) else {
 995                    log::error!("bug: file project entry not found");
 996                    continue;
 997                };
 998                let path = snapshot.file().unwrap().path().clone();
 999                retrieved_definitions.push(RetrievedDefinition {
1000                    path,
1001                    range: snapshot.offset_to_point(declaration.item_range.start)
1002                        ..snapshot.offset_to_point(declaration.item_range.end),
1003                    score: scored_declaration.score(DeclarationStyle::Declaration),
1004                    retrieval_score: scored_declaration.retrieval_score(),
1005                    components: scored_declaration.components,
1006                });
1007            }
1008            Declaration::Buffer {
1009                project_entry_id,
1010                rope,
1011                declaration,
1012                ..
1013            } => {
1014                let Some(snapshot) = file_snapshots.get(&project_entry_id) else {
1015                    // This case happens when dependency buffers have been opened by
1016                    // go-to-definition, resulting in single-file worktrees.
1017                    continue;
1018                };
1019                let path = snapshot.file().unwrap().path().clone();
1020                retrieved_definitions.push(RetrievedDefinition {
1021                    path,
1022                    range: rope.offset_to_point(declaration.item_range.start)
1023                        ..rope.offset_to_point(declaration.item_range.end),
1024                    score: scored_declaration.score(DeclarationStyle::Declaration),
1025                    retrieval_score: scored_declaration.retrieval_score(),
1026                    components: scored_declaration.components,
1027                });
1028            }
1029        }
1030    }
1031    retrieved_definitions.sort_by_key(|definition| Reverse(OrderedFloat(definition.score)));
1032
1033    Ok(RetrieveResult {
1034        definitions: retrieved_definitions,
1035        excerpt_range: Some(edit_prediction_context.excerpt.range),
1036    })
1037}
1038
1039async fn gather_lsp_definitions(
1040    lsp_definitions_path: &Path,
1041    start_index: usize,
1042    files: &[ProjectPath],
1043    worktree: &Entity<Worktree>,
1044    project: &Entity<Project>,
1045    definitions: &mut HashMap<SourceLocation, Vec<SourceRange>>,
1046    cx: &mut AsyncApp,
1047) -> Result<()> {
1048    let worktree_id = worktree.read_with(cx, |worktree, _cx| worktree.id())?;
1049
1050    let lsp_store = project.read_with(cx, |project, _cx| project.lsp_store())?;
1051    cx.subscribe(&lsp_store, {
1052        move |_, event, _| {
1053            if let project::LspStoreEvent::LanguageServerUpdate {
1054                message:
1055                    client::proto::update_language_server::Variant::WorkProgress(
1056                        client::proto::LspWorkProgress {
1057                            message: Some(message),
1058                            ..
1059                        },
1060                    ),
1061                ..
1062            } = event
1063            {
1064                println!("⟲ {message}")
1065            }
1066        }
1067    })?
1068    .detach();
1069
1070    let (cache_line_tx, mut cache_line_rx) = mpsc::unbounded::<FileLspDefinitions>();
1071
1072    let cache_file = File::options()
1073        .append(true)
1074        .create(true)
1075        .open(lsp_definitions_path)
1076        .unwrap();
1077
1078    let cache_task = cx.background_spawn(async move {
1079        let mut writer = BufWriter::new(cache_file);
1080        while let Some(line) = cache_line_rx.next().await {
1081            serde_json::to_writer(&mut writer, &line).unwrap();
1082            writer.write_all(&[b'\n']).unwrap();
1083        }
1084        writer.flush().unwrap();
1085    });
1086
1087    let mut error_count = 0;
1088    let mut lsp_open_handles = Vec::new();
1089    let mut ready_languages = HashSet::default();
1090    for (file_index, project_path) in files[start_index..].iter().enumerate() {
1091        println!(
1092            "Processing file {} of {}: {}",
1093            start_index + file_index + 1,
1094            files.len(),
1095            project_path.path.display(PathStyle::Posix)
1096        );
1097
1098        let Some((lsp_open_handle, language_server_id, buffer)) = open_buffer_with_language_server(
1099            project.clone(),
1100            worktree.clone(),
1101            project_path.path.clone(),
1102            &mut ready_languages,
1103            cx,
1104        )
1105        .await
1106        .log_err() else {
1107            continue;
1108        };
1109        lsp_open_handles.push(lsp_open_handle);
1110
1111        let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot())?;
1112        let full_range = 0..snapshot.len();
1113        let references = references_in_range(
1114            full_range,
1115            &snapshot.text(),
1116            ReferenceRegion::Nearby,
1117            &snapshot,
1118        );
1119
1120        loop {
1121            let is_ready = lsp_store
1122                .read_with(cx, |lsp_store, _cx| {
1123                    lsp_store
1124                        .language_server_statuses
1125                        .get(&language_server_id)
1126                        .is_some_and(|status| status.pending_work.is_empty())
1127                })
1128                .unwrap();
1129            if is_ready {
1130                break;
1131            }
1132            cx.background_executor()
1133                .timer(Duration::from_millis(10))
1134                .await;
1135        }
1136
1137        let mut cache_line_references = Vec::with_capacity(references.len());
1138
1139        for reference in references {
1140            // TODO: Rename declaration to definition in edit_prediction_context?
1141            let lsp_result = project
1142                .update(cx, |project, cx| {
1143                    project.definitions(&buffer, reference.range.start, cx)
1144                })?
1145                .await;
1146
1147            match lsp_result {
1148                Ok(lsp_definitions) => {
1149                    let mut targets = Vec::new();
1150                    for target in lsp_definitions.unwrap_or_default() {
1151                        let buffer = target.target.buffer;
1152                        let anchor_range = target.target.range;
1153                        buffer.read_with(cx, |buffer, cx| {
1154                            let Some(file) = project::File::from_dyn(buffer.file()) else {
1155                                return;
1156                            };
1157                            let file_worktree = file.worktree.read(cx);
1158                            let file_worktree_id = file_worktree.id();
1159                            // Relative paths for worktree files, absolute for all others
1160                            let path = if worktree_id != file_worktree_id {
1161                                file.worktree.read(cx).absolutize(&file.path)
1162                            } else {
1163                                file.path.as_std_path().to_path_buf()
1164                            };
1165                            let offset_range = anchor_range.to_offset(&buffer);
1166                            let point_range = SerializablePoint::from_language_point_range(
1167                                offset_range.to_point(&buffer),
1168                            );
1169                            targets.push(SourceRange {
1170                                path,
1171                                offset_range,
1172                                point_range,
1173                            });
1174                        })?;
1175                    }
1176
1177                    let point = snapshot.offset_to_point(reference.range.start);
1178
1179                    cache_line_references.push((point.into(), targets.clone()));
1180                    definitions.insert(
1181                        SourceLocation {
1182                            path: project_path.path.clone(),
1183                            point,
1184                        },
1185                        targets,
1186                    );
1187                }
1188                Err(err) => {
1189                    log::error!("Language server error: {err}");
1190                    error_count += 1;
1191                }
1192            }
1193        }
1194
1195        cache_line_tx
1196            .unbounded_send(FileLspDefinitions {
1197                path: project_path.path.as_unix_str().into(),
1198                references: cache_line_references,
1199            })
1200            .log_err();
1201    }
1202
1203    drop(cache_line_tx);
1204
1205    if error_count > 0 {
1206        log::error!("Encountered {} language server errors", error_count);
1207    }
1208
1209    cache_task.await;
1210
1211    Ok(())
1212}
1213
1214#[derive(Serialize, Deserialize)]
1215struct FileLspDefinitions {
1216    path: Arc<str>,
1217    references: Vec<(SerializablePoint, Vec<SourceRange>)>,
1218}
1219
1220#[derive(Debug, Clone, Serialize, Deserialize)]
1221struct SourceRange {
1222    path: PathBuf,
1223    point_range: Range<SerializablePoint>,
1224    offset_range: Range<usize>,
1225}
1226
1227/// Serializes to 1-based row and column indices.
1228#[derive(Debug, Clone, Serialize, Deserialize)]
1229pub struct SerializablePoint {
1230    pub row: u32,
1231    pub column: u32,
1232}
1233
1234impl SerializablePoint {
1235    pub fn into_language_point_range(range: Range<Self>) -> Range<Point> {
1236        range.start.into()..range.end.into()
1237    }
1238
1239    pub fn from_language_point_range(range: Range<Point>) -> Range<Self> {
1240        range.start.into()..range.end.into()
1241    }
1242}
1243
1244impl From<Point> for SerializablePoint {
1245    fn from(point: Point) -> Self {
1246        SerializablePoint {
1247            row: point.row + 1,
1248            column: point.column + 1,
1249        }
1250    }
1251}
1252
1253impl From<SerializablePoint> for Point {
1254    fn from(serializable: SerializablePoint) -> Self {
1255        Point {
1256            row: serializable.row.saturating_sub(1),
1257            column: serializable.column.saturating_sub(1),
1258        }
1259    }
1260}