use ::util::rel_path::RelPath;
use ::util::{RangeExt, ResultExt as _};
use anyhow::{Context as _, Result};
use cloud_llm_client::predict_edits_v3::DeclarationScoreComponents;
use edit_prediction_context::{
    Declaration, DeclarationStyle, EditPredictionContext, Identifier, Imports, Reference,
    ReferenceRegion, SyntaxIndex, SyntaxIndexState, references_in_range,
};
use futures::StreamExt as _;
use futures::channel::mpsc;
use gpui::Entity;
use gpui::{AppContext, AsyncApp};
use language::OffsetRangeExt;
use language::{BufferSnapshot, Point};
use ordered_float::OrderedFloat;
use polars::prelude::*;
use project::{Project, ProjectEntryId, ProjectPath, Worktree};
use serde::{Deserialize, Serialize};
use std::fs;
use std::{
    cmp::Reverse,
    collections::{HashMap, HashSet},
    fs::File,
    hash::{Hash, Hasher},
    io::{BufRead, BufReader, BufWriter, Write as _},
    ops::Range,
    path::{Path, PathBuf},
    sync::{
        Arc,
        atomic::{self, AtomicUsize},
    },
    time::Duration,
};
use util::paths::PathStyle;

use crate::headless::ZetaCliAppState;
use crate::source_location::SourceLocation;
use crate::util::{open_buffer, open_buffer_with_language_server};

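/// Measures how well syntax-index retrieval matches LSP go-to-definition results for a worktree:
/// indexes the project, gathers (and caches) LSP definitions for every reference, retrieves
/// candidate declarations for those same references, and writes per-candidate rows to a Parquet
/// file along with a printed summary.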
pub async fn retrieval_stats(
    worktree: PathBuf,
    app_state: Arc<ZetaCliAppState>,
    only_extension: Option<String>,
    file_limit: Option<usize>,
    skip_files: Option<usize>,
    options: zeta2::ZetaOptions,
    cx: &mut AsyncApp,
) -> Result<String> {
    let options = Arc::new(options);
    let worktree_path = worktree.canonicalize()?;

    let project = cx.update(|cx| {
        Project::local(
            app_state.client.clone(),
            app_state.node_runtime.clone(),
            app_state.user_store.clone(),
            app_state.languages.clone(),
            app_state.fs.clone(),
            None,
            cx,
        )
    })?;

    let worktree = project
        .update(cx, |project, cx| {
            project.create_worktree(&worktree_path, true, cx)
        })?
        .await?;

    // wait for worktree scan so that wait_for_initial_file_indexing waits for the whole worktree.
    worktree
        .read_with(cx, |worktree, _cx| {
            worktree.as_local().unwrap().scan_complete()
        })?
        .await;

    let index = cx.new(|cx| SyntaxIndex::new(&project, options.file_indexing_parallelism, cx))?;
    index
        .read_with(cx, |index, cx| index.wait_for_initial_file_indexing(cx))?
        .await?;
    let indexed_files = index
        .read_with(cx, |index, cx| index.indexed_file_paths(cx))?
        .await;
    let mut filtered_files = indexed_files
        .into_iter()
        .filter(|project_path| {
            let file_extension = project_path.path.extension();
            if let Some(only_extension) = only_extension.as_ref() {
                file_extension.is_some_and(|extension| extension == only_extension)
            } else {
                file_extension
                    .is_some_and(|extension| !["md", "json", "sh", "diff"].contains(&extension))
            }
        })
        .collect::<Vec<_>>();
    filtered_files.sort_by(|a, b| a.path.cmp(&b.path));

    let index_state = index.read_with(cx, |index, _cx| index.state().clone())?;
    cx.update(|_| {
        drop(index);
    })?;
    let index_state = Arc::new(
        Arc::into_inner(index_state)
            .context("Index state had more than 1 reference")?
            .into_inner(),
    );

    struct FileSnapshot {
        project_entry_id: ProjectEntryId,
        snapshot: BufferSnapshot,
        hash: u64,
        parent_abs_path: Arc<Path>,
    }

    let files: Vec<FileSnapshot> = futures::future::try_join_all({
        filtered_files
            .iter()
            .map(|file| {
                let buffer_task =
                    open_buffer(project.clone(), worktree.clone(), file.path.clone(), cx);
                cx.spawn(async move |cx| {
                    let buffer = buffer_task.await?;
                    let (project_entry_id, parent_abs_path, snapshot) =
                        buffer.read_with(cx, |buffer, cx| {
                            let file = project::File::from_dyn(buffer.file()).unwrap();
                            let project_entry_id = file.project_entry_id().unwrap();
                            let mut parent_abs_path = file.worktree.read(cx).absolutize(&file.path);
                            if !parent_abs_path.pop() {
                                panic!("Invalid worktree path");
                            }

                            (project_entry_id, parent_abs_path, buffer.snapshot())
                        })?;

                    anyhow::Ok(
                        cx.background_spawn(async move {
                            let mut hasher = collections::FxHasher::default();
                            snapshot.text().hash(&mut hasher);
                            FileSnapshot {
                                project_entry_id,
                                snapshot,
                                hash: hasher.finish(),
                                parent_abs_path: parent_abs_path.into(),
                            }
                        })
                        .await,
                    )
                })
            })
            .collect::<Vec<_>>()
    })
    .await?;

    let mut file_snapshots = HashMap::default();
    let mut hasher = collections::FxHasher::default();
    for FileSnapshot {
        project_entry_id,
        snapshot,
        hash,
        ..
    } in &files
    {
        file_snapshots.insert(*project_entry_id, snapshot.clone());
        hash.hash(&mut hasher);
    }
    let files_hash = hasher.finish();
    let file_snapshots = Arc::new(file_snapshots);
    let target_cli_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../target/zeta_cli");
    fs::create_dir_all(&target_cli_dir).unwrap();
    let target_cli_dir = target_cli_dir.canonicalize().unwrap();

    let lsp_cache_dir = target_cli_dir.join("cache");
    fs::create_dir_all(&lsp_cache_dir).unwrap();

    let lsp_definitions_path = lsp_cache_dir.join(format!(
        "{}-{:x}.jsonl",
        worktree_path.file_stem().unwrap_or_default().display(),
        files_hash
    ));

    let mut lsp_definitions = HashMap::default();
    let mut lsp_files = 0;

    if fs::exists(&lsp_definitions_path)? {
        log::info!(
            "Using cached LSP definitions from {}",
            lsp_definitions_path.display()
        );

        let file = File::options()
            .read(true)
            .write(true)
            .open(&lsp_definitions_path)?;
        let lines = BufReader::new(&file).lines();
        let mut valid_len: usize = 0;

        for (line, expected_file) in lines.zip(files.iter()) {
            let line = line?;
            let FileLspDefinitions { path, references } = match serde_json::from_str(&line) {
                Ok(ok) => ok,
                Err(_) => {
                    log::error!("Found invalid cache line. Truncating to #{lsp_files}.");
                    file.set_len(valid_len as u64)?;
                    break;
                }
            };
            let expected_path = expected_file.snapshot.file().unwrap().path().as_unix_str();
            if expected_path != path.as_ref() {
                log::error!(
                    "Expected file #{} to be {expected_path}, but found {path}. Truncating to #{lsp_files}.",
                    lsp_files + 1
                );
                file.set_len(valid_len as u64)?;
                break;
            }
            for (point, ranges) in references {
                let Ok(path) = RelPath::new(Path::new(path.as_ref()), PathStyle::Posix) else {
                    log::warn!("Invalid path: {}", path);
                    continue;
                };
                lsp_definitions.insert(
                    SourceLocation {
                        path: path.into_arc(),
                        point: point.into(),
                    },
                    ranges,
                );
            }
            lsp_files += 1;
            valid_len += line.len() + 1;
        }
    }

    if lsp_files < files.len() {
        if lsp_files == 0 {
            log::warn!(
                "No LSP definitions found, populating {}",
                lsp_definitions_path.display()
            );
        } else {
            log::warn!("{} files missing from LSP cache", files.len() - lsp_files);
        }

        gather_lsp_definitions(
            &lsp_definitions_path,
            lsp_files,
            &filtered_files,
            &worktree,
            &project,
            &mut lsp_definitions,
            cx,
        )
        .await?;
    }
    let files_len = files.len().min(file_limit.unwrap_or(usize::MAX));
    let done_count = Arc::new(AtomicUsize::new(0));

    let (output_tx, output_rx) = mpsc::unbounded::<ReferenceRetrievalResult>();

    let tasks = files
        .into_iter()
        .skip(skip_files.unwrap_or(0))
        .take(file_limit.unwrap_or(usize::MAX))
        .map(|project_file| {
            let index_state = index_state.clone();
            let lsp_definitions = lsp_definitions.clone();
            let options = options.clone();
            let output_tx = output_tx.clone();
            let done_count = done_count.clone();
            let file_snapshots = file_snapshots.clone();
            cx.background_spawn(async move {
                let snapshot = project_file.snapshot;

                let full_range = 0..snapshot.len();
                let references = references_in_range(
                    full_range,
                    &snapshot.text(),
                    ReferenceRegion::Nearby,
                    &snapshot,
                );

                let imports = if options.context.use_imports {
                    Imports::gather(&snapshot, Some(&project_file.parent_abs_path))
                } else {
                    Imports::default()
                };

                let path = snapshot.file().unwrap().path();

                for reference in references {
                    let query_point = snapshot.offset_to_point(reference.range.start);
                    let source_location = SourceLocation {
                        path: path.clone(),
                        point: query_point,
                    };
                    let lsp_definitions = lsp_definitions
                        .get(&source_location)
                        .cloned()
                        .unwrap_or_else(|| {
                            log::warn!(
                                "No definitions found for source location: {:?}",
                                source_location
                            );
                            Vec::new()
                        });

                    let retrieve_result = retrieve_definitions(
                        &reference,
                        &imports,
                        query_point,
                        &snapshot,
                        &index_state,
                        &file_snapshots,
                        &options,
                    )
                    .await?;

                    let result = ReferenceRetrievalResult {
                        cursor_path: path.clone(),
                        identifier: reference.identifier,
                        cursor_point: query_point,
                        lsp_definitions,
                        retrieved_definitions: retrieve_result.definitions,
                        excerpt_range: retrieve_result.excerpt_range,
                    };

                    output_tx.unbounded_send(result).ok();
                }

                println!(
                    "{:02}/{:02} done",
                    done_count.fetch_add(1, atomic::Ordering::Relaxed) + 1,
                    files_len,
                );

                anyhow::Ok(())
            })
        })
        .collect::<Vec<_>>();

    drop(output_tx);

    let df_task = cx.background_spawn(build_dataframe(output_rx));

    futures::future::try_join_all(tasks).await?;
    let mut df = df_task.await?;

    let run_id = format!(
        "{}-{}",
        worktree_path.file_stem().unwrap_or_default().display(),
        chrono::Local::now().format("%Y%m%d_%H%M%S")
    );
    let run_dir = target_cli_dir.join(run_id);
    fs::create_dir(&run_dir).unwrap();

    let parquet_path = run_dir.join("stats.parquet");
    let mut parquet_file = fs::File::create(&parquet_path)?;

    ParquetWriter::new(&mut parquet_file)
        .finish(&mut df)
        .unwrap();

    let stats = SummaryStats::from_dataframe(df)?;

    let stats_path = run_dir.join("stats.txt");
    fs::write(&stats_path, format!("{}", stats))?;

    println!("{}", stats);
    println!("\nWrote:");
    println!("- {}", relativize_path(&parquet_path).display());
    println!("- {}", relativize_path(&stats_path).display());
    println!("- {}", relativize_path(&lsp_definitions_path).display());

    Ok("".to_string())
}

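/// Flattens the per-reference retrieval results into a dataframe with one row per
/// (LSP definition, retrieved candidate) pair; when nothing was retrieved for a definition, a
/// single row with null candidate columns is emitted instead.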
async fn build_dataframe(
    mut output_rx: mpsc::UnboundedReceiver<ReferenceRetrievalResult>,
) -> Result<DataFrame> {
    use soa_rs::{Soa, Soars};

    #[derive(Default, Soars)]
    struct Row {
        ref_id: u32,
        cursor_path: String,
        cursor_row: u32,
        cursor_column: u32,
        cursor_identifier: String,
        gold_in_excerpt: bool,
        gold_path: String,
        gold_row: u32,
        gold_column: u32,
        gold_is_external: bool,
        candidate_count: u32,
        candidate_path: Option<String>,
        candidate_row: Option<u32>,
        candidate_column: Option<u32>,
        candidate_is_gold: Option<bool>,
        candidate_rank: Option<u32>,
        candidate_is_same_file: Option<bool>,
        candidate_is_referenced_nearby: Option<bool>,
        candidate_is_referenced_in_breadcrumb: Option<bool>,
        candidate_reference_count: Option<u32>,
        candidate_same_file_declaration_count: Option<u32>,
        candidate_declaration_count: Option<u32>,
        candidate_reference_line_distance: Option<u32>,
        candidate_declaration_line_distance: Option<u32>,
        candidate_excerpt_vs_item_jaccard: Option<f32>,
        candidate_excerpt_vs_signature_jaccard: Option<f32>,
        candidate_adjacent_vs_item_jaccard: Option<f32>,
        candidate_adjacent_vs_signature_jaccard: Option<f32>,
        candidate_excerpt_vs_item_weighted_overlap: Option<f32>,
        candidate_excerpt_vs_signature_weighted_overlap: Option<f32>,
        candidate_adjacent_vs_item_weighted_overlap: Option<f32>,
        candidate_adjacent_vs_signature_weighted_overlap: Option<f32>,
        candidate_path_import_match_count: Option<u32>,
        candidate_wildcard_path_import_match_count: Option<u32>,
        candidate_import_similarity: Option<f32>,
        candidate_max_import_similarity: Option<f32>,
        candidate_normalized_import_similarity: Option<f32>,
        candidate_wildcard_import_similarity: Option<f32>,
        candidate_normalized_wildcard_import_similarity: Option<f32>,
        candidate_included_by_others: Option<u32>,
        candidate_includes_others: Option<u32>,
    }
    let mut rows = Soa::<Row>::new();
    let mut next_ref_id = 0;

    while let Some(result) = output_rx.next().await {
        let mut gold_is_external = false;
        let mut gold_in_excerpt = false;
        let cursor_path = result.cursor_path.as_unix_str();
        let cursor_row = result.cursor_point.row + 1;
        let cursor_column = result.cursor_point.column + 1;
        let cursor_identifier = result.identifier.name.to_string();
        let ref_id = next_ref_id;
        next_ref_id += 1;

        for lsp_definition in result.lsp_definitions {
            let SourceRange {
                path: gold_path,
                point_range: gold_point_range,
                offset_range: gold_offset_range,
            } = lsp_definition;
            let lsp_point_range =
                SerializablePoint::into_language_point_range(gold_point_range.clone());

            gold_is_external = gold_is_external
                || gold_path.is_absolute()
                || gold_path
                    .components()
                    .any(|component| component.as_os_str() == "node_modules");

            gold_in_excerpt = gold_in_excerpt
                || result.excerpt_range.as_ref().is_some_and(|excerpt_range| {
                    excerpt_range.contains_inclusive(&gold_offset_range)
                });

            let gold_row = gold_point_range.start.row;
            let gold_column = gold_point_range.start.column;
            let candidate_count = result.retrieved_definitions.len() as u32;

            for (candidate_rank, retrieved_definition) in
                result.retrieved_definitions.iter().enumerate()
            {
                let candidate_is_gold = gold_path.as_path()
                    == retrieved_definition.path.as_std_path()
                    && retrieved_definition
                        .range
                        .contains_inclusive(&lsp_point_range);

                let candidate_row = retrieved_definition.range.start.row + 1;
                let candidate_column = retrieved_definition.range.start.column + 1;

                let DeclarationScoreComponents {
                    is_same_file,
                    is_referenced_nearby,
                    is_referenced_in_breadcrumb,
                    reference_count,
                    same_file_declaration_count,
                    declaration_count,
                    reference_line_distance,
                    declaration_line_distance,
                    excerpt_vs_item_jaccard,
                    excerpt_vs_signature_jaccard,
                    adjacent_vs_item_jaccard,
                    adjacent_vs_signature_jaccard,
                    excerpt_vs_item_weighted_overlap,
                    excerpt_vs_signature_weighted_overlap,
                    adjacent_vs_item_weighted_overlap,
                    adjacent_vs_signature_weighted_overlap,
                    path_import_match_count,
                    wildcard_path_import_match_count,
                    import_similarity,
                    max_import_similarity,
                    normalized_import_similarity,
                    wildcard_import_similarity,
                    normalized_wildcard_import_similarity,
                    included_by_others,
                    includes_others,
                } = retrieved_definition.components;

                rows.push(Row {
                    ref_id,
                    cursor_path: cursor_path.to_string(),
                    cursor_row,
                    cursor_column,
                    cursor_identifier: cursor_identifier.clone(),
                    gold_in_excerpt,
                    gold_path: gold_path.to_string_lossy().to_string(),
                    gold_row,
                    gold_column,
                    gold_is_external,
                    candidate_count,
                    candidate_path: Some(retrieved_definition.path.as_unix_str().to_string()),
                    candidate_row: Some(candidate_row),
                    candidate_column: Some(candidate_column),
                    candidate_is_gold: Some(candidate_is_gold),
                    candidate_rank: Some(candidate_rank as u32),
                    candidate_is_same_file: Some(is_same_file),
                    candidate_is_referenced_nearby: Some(is_referenced_nearby),
                    candidate_is_referenced_in_breadcrumb: Some(is_referenced_in_breadcrumb),
                    candidate_reference_count: Some(reference_count as u32),
                    candidate_same_file_declaration_count: Some(same_file_declaration_count as u32),
                    candidate_declaration_count: Some(declaration_count as u32),
                    candidate_reference_line_distance: Some(reference_line_distance),
                    candidate_declaration_line_distance: Some(declaration_line_distance),
                    candidate_excerpt_vs_item_jaccard: Some(excerpt_vs_item_jaccard),
                    candidate_excerpt_vs_signature_jaccard: Some(excerpt_vs_signature_jaccard),
                    candidate_adjacent_vs_item_jaccard: Some(adjacent_vs_item_jaccard),
                    candidate_adjacent_vs_signature_jaccard: Some(adjacent_vs_signature_jaccard),
                    candidate_excerpt_vs_item_weighted_overlap: Some(
                        excerpt_vs_item_weighted_overlap,
                    ),
                    candidate_excerpt_vs_signature_weighted_overlap: Some(
                        excerpt_vs_signature_weighted_overlap,
                    ),
                    candidate_adjacent_vs_item_weighted_overlap: Some(
                        adjacent_vs_item_weighted_overlap,
                    ),
                    candidate_adjacent_vs_signature_weighted_overlap: Some(
                        adjacent_vs_signature_weighted_overlap,
                    ),
                    candidate_path_import_match_count: Some(path_import_match_count as u32),
                    candidate_wildcard_path_import_match_count: Some(
                        wildcard_path_import_match_count as u32,
                    ),
                    candidate_import_similarity: Some(import_similarity),
                    candidate_max_import_similarity: Some(max_import_similarity),
                    candidate_normalized_import_similarity: Some(normalized_import_similarity),
                    candidate_wildcard_import_similarity: Some(wildcard_import_similarity),
                    candidate_normalized_wildcard_import_similarity: Some(
                        normalized_wildcard_import_similarity,
                    ),
                    candidate_included_by_others: Some(included_by_others as u32),
                    candidate_includes_others: Some(includes_others as u32),
                });
            }

            if result.retrieved_definitions.is_empty() {
                rows.push(Row {
                    ref_id,
                    cursor_path: cursor_path.to_string(),
                    cursor_row,
                    cursor_column,
                    cursor_identifier: cursor_identifier.clone(),
                    gold_in_excerpt,
                    gold_path: gold_path.to_string_lossy().to_string(),
                    gold_row,
                    gold_column,
                    gold_is_external,
                    candidate_count,
                    ..Default::default()
                });
            }
        }
    }
    let slices = rows.slices();

    let RowSlices {
        ref_id,
        cursor_path,
        cursor_row,
        cursor_column,
        cursor_identifier,
        gold_in_excerpt,
        gold_path,
        gold_row,
        gold_column,
        gold_is_external,
        candidate_path,
        candidate_row,
        candidate_column,
        candidate_is_gold,
        candidate_rank,
        candidate_count,
        candidate_is_same_file,
        candidate_is_referenced_nearby,
        candidate_is_referenced_in_breadcrumb,
        candidate_reference_count,
        candidate_same_file_declaration_count,
        candidate_declaration_count,
        candidate_reference_line_distance,
        candidate_declaration_line_distance,
        candidate_excerpt_vs_item_jaccard,
        candidate_excerpt_vs_signature_jaccard,
        candidate_adjacent_vs_item_jaccard,
        candidate_adjacent_vs_signature_jaccard,
        candidate_excerpt_vs_item_weighted_overlap,
        candidate_excerpt_vs_signature_weighted_overlap,
        candidate_adjacent_vs_item_weighted_overlap,
        candidate_adjacent_vs_signature_weighted_overlap,
        candidate_path_import_match_count,
        candidate_wildcard_path_import_match_count,
        candidate_import_similarity,
        candidate_max_import_similarity,
        candidate_normalized_import_similarity,
        candidate_wildcard_import_similarity,
        candidate_normalized_wildcard_import_similarity,
        candidate_included_by_others,
        candidate_includes_others,
    } = slices;

    let df = DataFrame::new(vec![
        Series::new(PlSmallStr::from_str("ref_id"), ref_id).into(),
        Series::new(PlSmallStr::from_str("cursor_path"), cursor_path).into(),
        Series::new(PlSmallStr::from_str("cursor_row"), cursor_row).into(),
        Series::new(PlSmallStr::from_str("cursor_column"), cursor_column).into(),
        Series::new(PlSmallStr::from_str("cursor_identifier"), cursor_identifier).into(),
        Series::new(PlSmallStr::from_str("gold_in_excerpt"), gold_in_excerpt).into(),
        Series::new(PlSmallStr::from_str("gold_path"), gold_path).into(),
        Series::new(PlSmallStr::from_str("gold_row"), gold_row).into(),
        Series::new(PlSmallStr::from_str("gold_column"), gold_column).into(),
        Series::new(PlSmallStr::from_str("gold_is_external"), gold_is_external).into(),
        Series::new(PlSmallStr::from_str("candidate_count"), candidate_count).into(),
        Series::new(PlSmallStr::from_str("candidate_path"), candidate_path).into(),
        Series::new(PlSmallStr::from_str("candidate_row"), candidate_row).into(),
        Series::new(PlSmallStr::from_str("candidate_column"), candidate_column).into(),
        Series::new(PlSmallStr::from_str("candidate_is_gold"), candidate_is_gold).into(),
        Series::new(PlSmallStr::from_str("candidate_rank"), candidate_rank).into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_same_file"),
            candidate_is_same_file,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_referenced_nearby"),
            candidate_is_referenced_nearby,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_is_referenced_in_breadcrumb"),
            candidate_is_referenced_in_breadcrumb,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_reference_count"),
            candidate_reference_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_same_file_declaration_count"),
            candidate_same_file_declaration_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_declaration_count"),
            candidate_declaration_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_reference_line_distance"),
            candidate_reference_line_distance,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_declaration_line_distance"),
            candidate_declaration_line_distance,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_item_jaccard"),
            candidate_excerpt_vs_item_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_signature_jaccard"),
            candidate_excerpt_vs_signature_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_item_jaccard"),
            candidate_adjacent_vs_item_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_signature_jaccard"),
            candidate_adjacent_vs_signature_jaccard,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_item_weighted_overlap"),
            candidate_excerpt_vs_item_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_excerpt_vs_signature_weighted_overlap"),
            candidate_excerpt_vs_signature_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_item_weighted_overlap"),
            candidate_adjacent_vs_item_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_adjacent_vs_signature_weighted_overlap"),
            candidate_adjacent_vs_signature_weighted_overlap,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_path_import_match_count"),
            candidate_path_import_match_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_wildcard_path_import_match_count"),
            candidate_wildcard_path_import_match_count,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_import_similarity"),
            candidate_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_max_import_similarity"),
            candidate_max_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_normalized_import_similarity"),
            candidate_normalized_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_wildcard_import_similarity"),
            candidate_wildcard_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_normalized_wildcard_import_similarity"),
            candidate_normalized_wildcard_import_similarity,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_included_by_others"),
            candidate_included_by_others,
        )
        .into(),
        Series::new(
            PlSmallStr::from_str("candidate_includes_others"),
            candidate_includes_others,
        )
        .into(),
    ])?;

    Ok(df)
}

fn relativize_path(path: &Path) -> &Path {
    path.strip_prefix(std::env::current_dir().unwrap())
        .unwrap_or(path)
}

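/// Aggregate counts derived from the stats dataframe; the `Display` impl renders them as the
/// summary tree printed at the end of a run.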
struct SummaryStats {
    references_count: u32,
    retrieved_count: u32,
    top_match_count: u32,
    non_top_match_count: u32,
    ranking_involved_top_match_count: u32,
    missing_none_retrieved: u32,
    missing_wrong_retrieval: u32,
    missing_external: u32,
    in_excerpt_count: u32,
}

impl SummaryStats {
    fn from_dataframe(df: DataFrame) -> Result<Self> {
        // TODO: use lazy more
        let unique_refs =
            df.unique::<(), ()>(Some(&["ref_id".into()]), UniqueKeepStrategy::Any, None)?;
        let references_count = unique_refs.height() as u32;

        let gold_mask = df.column("candidate_is_gold")?.bool()?;
        let gold_df = df.filter(&gold_mask)?;
        let retrieved_count = gold_df.height() as u32;

        let top_match_mask = gold_df.column("candidate_rank")?.u32()?.equal(0);
        let top_match_df = gold_df.filter(&top_match_mask)?;
        let top_match_count = top_match_df.height() as u32;

        let ranking_involved_top_match_count = top_match_df
            .column("candidate_count")?
            .u32()?
            .gt(1)
            .sum()
            .unwrap_or_default();

        let non_top_match_count = (!top_match_mask).sum().unwrap_or(0);

        let not_retrieved_df = df
            .lazy()
            .group_by(&[col("ref_id"), col("candidate_count")])
            .agg(&[
                col("candidate_is_gold")
                    .fill_null(false)
                    .sum()
                    .alias("gold_count"),
                col("gold_in_excerpt").sum().alias("gold_in_excerpt_count"),
                col("gold_is_external")
                    .sum()
                    .alias("gold_is_external_count"),
            ])
            .filter(col("gold_count").eq(lit(0)))
            .collect()?;

        let in_excerpt_mask = not_retrieved_df
            .column("gold_in_excerpt_count")?
            .u32()?
            .gt(0);
        let in_excerpt_count = in_excerpt_mask.sum().unwrap_or(0);

        let missing_df = not_retrieved_df.filter(&!in_excerpt_mask)?;

        let missing_none_retrieved_mask = missing_df.column("candidate_count")?.u32()?.equal(0);
        let missing_none_retrieved = missing_none_retrieved_mask.sum().unwrap_or(0);
        let external_mask = missing_df.column("gold_is_external_count")?.u32()?.gt(0);
        let missing_external = (missing_none_retrieved_mask & external_mask)
            .sum()
            .unwrap_or(0);

        let missing_wrong_retrieval = missing_df
            .column("candidate_count")?
            .u32()?
            .gt(0)
            .sum()
            .unwrap_or(0);

        Ok(SummaryStats {
            references_count,
            retrieved_count,
            top_match_count,
            non_top_match_count,
            ranking_involved_top_match_count,
            missing_none_retrieved,
            missing_wrong_retrieval,
            missing_external,
            in_excerpt_count,
        })
    }

    fn count_and_percentage(part: u32, total: u32) -> String {
        format!("{} ({:.2}%)", part, (part as f64 / total as f64) * 100.0)
    }
}

impl std::fmt::Display for SummaryStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let included = self.in_excerpt_count + self.retrieved_count;
        let missing = self.references_count - included;
        writeln!(f)?;
        writeln!(f, "╮ references: {}", self.references_count)?;
        writeln!(
            f,
            "├─╮ included: {}",
            Self::count_and_percentage(included, self.references_count),
        )?;
        writeln!(
            f,
            "│ ├─╮ retrieved: {}",
            Self::count_and_percentage(self.retrieved_count, self.references_count)
        )?;
        writeln!(
            f,
            "│ │ ├─╮ top match: {}",
            Self::count_and_percentage(self.top_match_count, self.retrieved_count)
        )?;
        writeln!(
            f,
            "│ │ │ ╰─╴ involving ranking: {}",
            Self::count_and_percentage(self.ranking_involved_top_match_count, self.top_match_count)
        )?;
        writeln!(
            f,
            "│ │ ╰─╴ non-top match: {}",
            Self::count_and_percentage(self.non_top_match_count, self.retrieved_count)
        )?;
        writeln!(
            f,
            "│ ╰─╴ in excerpt: {}",
            Self::count_and_percentage(self.in_excerpt_count, included)
        )?;
        writeln!(
            f,
            "╰─╮ missing: {}",
            Self::count_and_percentage(missing, self.references_count)
        )?;
        writeln!(
            f,
            "  ├─╮ none retrieved: {}",
            Self::count_and_percentage(self.missing_none_retrieved, missing)
        )?;
        writeln!(
            f,
            "  │ ╰─╴ external (expected): {}",
            Self::count_and_percentage(self.missing_external, missing)
        )?;
        writeln!(
            f,
            "  ╰─╴ wrong retrieval: {}",
            Self::count_and_percentage(self.missing_wrong_retrieval, missing)
        )?;
        Ok(())
    }
}

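/// Retrieval outcome for a single identifier reference, paired with the LSP definitions that are
/// treated as ground truth.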
#[derive(Debug)]
struct ReferenceRetrievalResult {
    cursor_path: Arc<RelPath>,
    cursor_point: Point,
    identifier: Identifier,
    excerpt_range: Option<Range<usize>>,
    lsp_definitions: Vec<SourceRange>,
    retrieved_definitions: Vec<RetrievedDefinition>,
}

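/// A declaration returned by syntax-index retrieval, along with the score and score components
/// used to rank it.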
#[derive(Debug)]
struct RetrievedDefinition {
    path: Arc<RelPath>,
    range: Range<Point>,
    score: f32,
    #[allow(dead_code)]
    retrieval_score: f32,
    #[allow(dead_code)]
    components: DeclarationScoreComponents,
}

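/// Definitions retrieved for one reference, plus the excerpt range used when gathering context.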
struct RetrieveResult {
    definitions: Vec<RetrievedDefinition>,
    excerpt_range: Option<Range<usize>>,
}

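/// Gathers edit-prediction context for a single reference and converts the scored declarations
/// into `RetrievedDefinition`s sorted by descending declaration score.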
async fn retrieve_definitions(
    reference: &Reference,
    imports: &Imports,
    query_point: Point,
    snapshot: &BufferSnapshot,
    index: &Arc<SyntaxIndexState>,
    file_snapshots: &Arc<HashMap<ProjectEntryId, BufferSnapshot>>,
    options: &Arc<zeta2::ZetaOptions>,
) -> Result<RetrieveResult> {
    let mut single_reference_map = HashMap::default();
    single_reference_map.insert(reference.identifier.clone(), vec![reference.clone()]);
    let edit_prediction_context = EditPredictionContext::gather_context_with_references_fn(
        query_point,
        snapshot,
        imports,
        &options.context,
        Some(&index),
        |_, _, _| single_reference_map,
    );

    let Some(edit_prediction_context) = edit_prediction_context else {
        return Ok(RetrieveResult {
            definitions: Vec::new(),
            excerpt_range: None,
        });
    };

    let mut retrieved_definitions = Vec::new();
    for scored_declaration in edit_prediction_context.declarations {
        match &scored_declaration.declaration {
            Declaration::File {
                project_entry_id,
                declaration,
                ..
            } => {
                let Some(snapshot) = file_snapshots.get(project_entry_id) else {
                    log::error!("bug: file project entry not found");
                    continue;
                };
                let path = snapshot.file().unwrap().path().clone();
                retrieved_definitions.push(RetrievedDefinition {
                    path,
                    range: snapshot.offset_to_point(declaration.item_range.start)
                        ..snapshot.offset_to_point(declaration.item_range.end),
                    score: scored_declaration.score(DeclarationStyle::Declaration),
                    retrieval_score: scored_declaration.retrieval_score(),
                    components: scored_declaration.components,
                });
            }
            Declaration::Buffer {
                project_entry_id,
                rope,
                declaration,
                ..
            } => {
                let Some(snapshot) = file_snapshots.get(project_entry_id) else {
                    // This case happens when dependency buffers have been opened by
                    // go-to-definition, resulting in single-file worktrees.
                    continue;
                };
                let path = snapshot.file().unwrap().path().clone();
                retrieved_definitions.push(RetrievedDefinition {
                    path,
                    range: rope.offset_to_point(declaration.item_range.start)
                        ..rope.offset_to_point(declaration.item_range.end),
                    score: scored_declaration.score(DeclarationStyle::Declaration),
                    retrieval_score: scored_declaration.retrieval_score(),
                    components: scored_declaration.components,
                });
            }
        }
    }
    retrieved_definitions.sort_by_key(|definition| Reverse(OrderedFloat(definition.score)));

    Ok(RetrieveResult {
        definitions: retrieved_definitions,
        excerpt_range: Some(edit_prediction_context.excerpt.range),
    })
}

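/// Asks the language server for the definitions of every reference in `files`, starting at
/// `start_index`, appending each file's results to the JSONL cache as it completes and inserting
/// them into `definitions`.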
async fn gather_lsp_definitions(
    lsp_definitions_path: &Path,
    start_index: usize,
    files: &[ProjectPath],
    worktree: &Entity<Worktree>,
    project: &Entity<Project>,
    definitions: &mut HashMap<SourceLocation, Vec<SourceRange>>,
    cx: &mut AsyncApp,
) -> Result<()> {
    let worktree_id = worktree.read_with(cx, |worktree, _cx| worktree.id())?;

    let lsp_store = project.read_with(cx, |project, _cx| project.lsp_store())?;
    cx.subscribe(&lsp_store, {
        move |_, event, _| {
            if let project::LspStoreEvent::LanguageServerUpdate {
                message:
                    client::proto::update_language_server::Variant::WorkProgress(
                        client::proto::LspWorkProgress {
                            message: Some(message),
                            ..
                        },
                    ),
                ..
            } = event
            {
                println!("⟲ {message}")
            }
        }
    })?
    .detach();

    let (cache_line_tx, mut cache_line_rx) = mpsc::unbounded::<FileLspDefinitions>();

    let cache_file = File::options()
        .append(true)
        .create(true)
        .open(lsp_definitions_path)
        .unwrap();

    let cache_task = cx.background_spawn(async move {
        let mut writer = BufWriter::new(cache_file);
        while let Some(line) = cache_line_rx.next().await {
            serde_json::to_writer(&mut writer, &line).unwrap();
            writer.write_all(&[b'\n']).unwrap();
        }
        writer.flush().unwrap();
    });

    let mut error_count = 0;
    let mut lsp_open_handles = Vec::new();
    let mut ready_languages = HashSet::default();
    for (file_index, project_path) in files[start_index..].iter().enumerate() {
        println!(
            "Processing file {} of {}: {}",
            start_index + file_index + 1,
            files.len(),
            project_path.path.display(PathStyle::Posix)
        );

        let Some((lsp_open_handle, language_server_id, buffer)) = open_buffer_with_language_server(
            project.clone(),
            worktree.clone(),
            project_path.path.clone(),
            &mut ready_languages,
            cx,
        )
        .await
        .log_err() else {
            continue;
        };
        lsp_open_handles.push(lsp_open_handle);

        let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot())?;
        let full_range = 0..snapshot.len();
        let references = references_in_range(
            full_range,
            &snapshot.text(),
            ReferenceRegion::Nearby,
            &snapshot,
        );

        loop {
            let is_ready = lsp_store
                .read_with(cx, |lsp_store, _cx| {
                    lsp_store
                        .language_server_statuses
                        .get(&language_server_id)
                        .is_some_and(|status| status.pending_work.is_empty())
                })
                .unwrap();
            if is_ready {
                break;
            }
            cx.background_executor()
                .timer(Duration::from_millis(10))
                .await;
        }

        let mut cache_line_references = Vec::with_capacity(references.len());

        for reference in references {
            // TODO: Rename declaration to definition in edit_prediction_context?
            let lsp_result = project
                .update(cx, |project, cx| {
                    project.definitions(&buffer, reference.range.start, cx)
                })?
                .await;

            match lsp_result {
                Ok(lsp_definitions) => {
                    let mut targets = Vec::new();
                    for target in lsp_definitions.unwrap_or_default() {
                        let buffer = target.target.buffer;
                        let anchor_range = target.target.range;
                        buffer.read_with(cx, |buffer, cx| {
                            let Some(file) = project::File::from_dyn(buffer.file()) else {
                                return;
                            };
                            let file_worktree = file.worktree.read(cx);
                            let file_worktree_id = file_worktree.id();
                            // Relative paths for worktree files, absolute for all others
                            let path = if worktree_id != file_worktree_id {
                                file.worktree.read(cx).absolutize(&file.path)
                            } else {
                                file.path.as_std_path().to_path_buf()
                            };
                            let offset_range = anchor_range.to_offset(&buffer);
                            let point_range = SerializablePoint::from_language_point_range(
                                offset_range.to_point(&buffer),
                            );
                            targets.push(SourceRange {
                                path,
                                offset_range,
                                point_range,
                            });
                        })?;
                    }

                    let point = snapshot.offset_to_point(reference.range.start);

                    cache_line_references.push((point.into(), targets.clone()));
                    definitions.insert(
                        SourceLocation {
                            path: project_path.path.clone(),
                            point,
                        },
                        targets,
                    );
                }
                Err(err) => {
                    log::error!("Language server error: {err}");
                    error_count += 1;
                }
            }
        }

        cache_line_tx
            .unbounded_send(FileLspDefinitions {
                path: project_path.path.as_unix_str().into(),
                references: cache_line_references,
            })
            .log_err();
    }

    drop(cache_line_tx);

    if error_count > 0 {
        log::error!("Encountered {} language server errors", error_count);
    }

    cache_task.await;

    Ok(())
}

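/// One line of the JSONL LSP cache: the definition ranges found for each reference in a single
/// file.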
#[derive(Serialize, Deserialize)]
struct FileLspDefinitions {
    path: Arc<str>,
    references: Vec<(SerializablePoint, Vec<SourceRange>)>,
}

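/// A definition target location, recorded both as a point range and an offset range. The path is
/// worktree-relative for files in the worktree and absolute for everything else.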
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SourceRange {
    path: PathBuf,
    point_range: Range<SerializablePoint>,
    offset_range: Range<usize>,
}

/// Serializes to 1-based row and column indices.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SerializablePoint {
    pub row: u32,
    pub column: u32,
}

impl SerializablePoint {
    pub fn into_language_point_range(range: Range<Self>) -> Range<Point> {
        range.start.into()..range.end.into()
    }

    pub fn from_language_point_range(range: Range<Point>) -> Range<Self> {
        range.start.into()..range.end.into()
    }
}

impl From<Point> for SerializablePoint {
    fn from(point: Point) -> Self {
        SerializablePoint {
            row: point.row + 1,
            column: point.column + 1,
        }
    }
}

impl From<SerializablePoint> for Point {
    fn from(serializable: SerializablePoint) -> Self {
        Point {
            row: serializable.row.saturating_sub(1),
            column: serializable.column.saturating_sub(1),
        }
    }
}