score.rs

  1use crate::{
  2    PredictArgs, PredictionProvider,
  3    example::{ActualCursor, Example, ExampleScore},
  4    format_prompt::TeacherPrompt,
  5    headless::EpAppState,
  6    metrics,
  7    parse_output::parse_prediction_output,
  8    predict::run_prediction,
  9    progress::{ExampleProgress, Step},
 10    reversal_tracking,
 11};
 12use anyhow::Context as _;
 13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
 14use gpui::AsyncApp;
 15use serde::Serialize;
 16use std::fs::File;
 17use std::io::BufWriter;
 18use std::path::Path;
 19use std::sync::Arc;
 20
 21pub async fn run_scoring(
 22    example: &mut Example,
 23    args: &PredictArgs,
 24    app_state: Arc<EpAppState>,
 25    example_progress: &ExampleProgress,
 26    cx: AsyncApp,
 27) -> anyhow::Result<()> {
 28    run_prediction(example, args, app_state, example_progress, cx).await?;
 29
 30    let progress = example_progress.start(Step::Score);
 31
 32    progress.set_substatus("applying patches");
 33    let prompt_inputs = example
 34        .prompt_inputs
 35        .as_ref()
 36        .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
 37    let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
 38    let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
 39
 40    let expected_texts: Vec<String> = expected_patches_with_cursors
 41        .iter()
 42        .map(|(patch, _)| {
 43            apply_diff_to_string(patch, original_text)
 44                .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
 45        })
 46        .collect::<Result<Vec<_>, _>>()?;
 47
 48    // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
 49    // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
 50    // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
 51    // region to find where the hunk matched, then compute the expected cursor position.
 52    let old_editable_region = if let Some(p) = example.prompt.as_ref() {
 53        if matches!(
 54            p.provider,
 55            PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
 56        ) {
 57            Some(
 58                TeacherPrompt::extract_editable_region(&p.input)?
 59                    .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
 60            )
 61        } else {
 62            None
 63        }
 64    } else {
 65        None
 66    };
 67
 68    let zero_scores = ExampleScore {
 69        delta_chr_f: 0.0,
 70        delta_chr_f_true_positives: 0,
 71        delta_chr_f_false_positives: 0,
 72        delta_chr_f_false_negatives: 0,
 73        delta_chr_f_precision: 0.0,
 74        delta_chr_f_recall: 0.0,
 75        delta_chr_f_beta: metrics::delta_chr_f_beta(),
 76        braces_disbalance: 0,
 77        exact_lines_tp: 0,
 78        exact_lines_fp: 0,
 79        exact_lines_fn: 0,
 80        reversal_ratio: 0.0,
 81        cursor_distance: None,
 82        cursor_exact_match: None,
 83        wrong_editable_region: None,
 84        has_isolated_whitespace_changes: false,
 85        inserted_tokens: 0,
 86        deleted_tokens: 0,
 87        cumulative_logprob: None,
 88        avg_logprob: None,
 89    };
 90
 91    let cursor_path = example.spec.cursor_path.as_ref();
 92
 93    progress.set_substatus("computing metrics");
 94    let mut scores = vec![];
 95    for prediction in &example.predictions {
 96        let actual_patch = prediction.actual_patch.clone().or_else(|| {
 97            parse_prediction_output(example, &prediction.actual_output, prediction.provider)
 98                .ok()
 99                .map(|(patch, _)| patch)
100        });
101
102        let Some(actual_patch) = actual_patch else {
103            scores.push(zero_scores.clone());
104            continue;
105        };
106
107        let token_changes = metrics::count_patch_token_changes(&actual_patch);
108
109        let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
110            Ok(text) => text,
111            Err(_) => {
112                let mut s = zero_scores.clone();
113                s.inserted_tokens = token_changes.inserted_tokens;
114                s.deleted_tokens = token_changes.deleted_tokens;
115                scores.push(s);
116                continue;
117            }
118        };
119
120        let mut best_delta_chr_f_metrics = metrics::DeltaChrFMetrics::default();
121        let mut best_expected_cursor: Option<usize> = None;
122        let mut best_patch_idx: Option<usize> = None;
123
124        for (idx, expected) in expected_texts.iter().enumerate() {
125            let delta_chr_f_metrics = metrics::delta_chr_f(original_text, expected, &actual_text);
126            if delta_chr_f_metrics.score > best_delta_chr_f_metrics.score {
127                best_delta_chr_f_metrics = delta_chr_f_metrics;
128                best_patch_idx = Some(idx);
129            }
130        }
131
132        if let Some(idx) = best_patch_idx {
133            // Get the raw cursor offset from the expected patch (relative to hunk new text)
134            let expected_cursor_in_patch = expected_patches_with_cursors
135                .get(idx)
136                .and_then(|(_, cursor)| *cursor);
137
138            // For Teacher prompts, we need to apply the patch to the editable region
139            // to find where the hunk matched, then compute the actual cursor position
140            if let (Some(editable_region), Some(cursor_in_patch)) =
141                (&old_editable_region, expected_cursor_in_patch)
142            {
143                let (patch, _) = &expected_patches_with_cursors[idx];
144                if let Ok((_, hunk_offset)) =
145                    apply_diff_to_string_with_hunk_offset(patch, editable_region)
146                {
147                    let hunk_start = hunk_offset.unwrap_or(0);
148                    best_expected_cursor = Some(hunk_start + cursor_in_patch);
149                }
150            } else {
151                // For non-Teacher prompts or if we can't compute, use raw offset
152                best_expected_cursor = expected_cursor_in_patch;
153            }
154        }
155
156        let disbalance_before = metrics::braces_disbalance(&original_text);
157        let disbalance_after = metrics::braces_disbalance(&actual_text);
158        let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
159
160        // Compute exact lines match against best matching expected patch
161        let best_exact_lines = expected_patches_with_cursors
162            .iter()
163            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
164            .max_by_key(|m| m.true_positives)
165            .unwrap_or_default();
166
167        // Compute reversal ratio
168        let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
169            prompt_inputs,
170            &actual_text,
171            cursor_path,
172        );
173
174        // Compute cursor position metrics
175        let (cursor_distance, cursor_exact_match) =
176            compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
177
178        // Compute approximation of editable region correctness
179        let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
180
181        // Check for isolated whitespace changes.
182        let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
183            &actual_patch,
184            prediction.actual_cursor.as_ref(),
185        );
186
187        scores.push(ExampleScore {
188            delta_chr_f: best_delta_chr_f_metrics.score as f32,
189            delta_chr_f_true_positives: best_delta_chr_f_metrics.counts.true_positives,
190            delta_chr_f_false_positives: best_delta_chr_f_metrics.counts.false_positives,
191            delta_chr_f_false_negatives: best_delta_chr_f_metrics.counts.false_negatives,
192            delta_chr_f_precision: best_delta_chr_f_metrics.precision,
193            delta_chr_f_recall: best_delta_chr_f_metrics.recall,
194            delta_chr_f_beta: best_delta_chr_f_metrics.beta,
195            braces_disbalance,
196            exact_lines_tp: best_exact_lines.true_positives,
197            exact_lines_fp: best_exact_lines.false_positives,
198            exact_lines_fn: best_exact_lines.false_negatives,
199            reversal_ratio,
200            cursor_distance,
201            cursor_exact_match,
202            wrong_editable_region,
203            has_isolated_whitespace_changes,
204            inserted_tokens: token_changes.inserted_tokens,
205            deleted_tokens: token_changes.deleted_tokens,
206            cumulative_logprob: prediction.cumulative_logprob,
207            avg_logprob: prediction.avg_logprob,
208        });
209    }
210
211    example.score = scores;
212    Ok(())
213}
214
215fn compute_cursor_metrics(
216    expected_cursor_editable_region_offset: Option<usize>,
217    actual_cursor: Option<&ActualCursor>,
218) -> (Option<usize>, Option<bool>) {
219    match (expected_cursor_editable_region_offset, actual_cursor) {
220        (Some(expected), Some(actual)) => {
221            let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
222            let exact_match = distance == 0;
223            (Some(distance), Some(exact_match))
224        }
225        (None, None) => {
226            // Neither has cursor position - skip cursor scoring
227            (None, None)
228        }
229        (Some(_), None) | (None, Some(_)) => {
230            // Only one has cursor position - count as miss
231            (None, Some(false))
232        }
233    }
234}
235
236pub fn print_report(examples: &[Example], verbose: bool) {
237    const MAX_EXAMPLES_DEFAULT: usize = 20;
238    use crate::metrics::ClassificationMetrics;
239
240    const LINE_WIDTH: usize = 101;
241    let separator = "─".repeat(LINE_WIDTH);
242
243    println!("{}", separator);
244    println!(
245        "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
246        "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
247    );
248    println!("{}", separator);
249
250    let mut all_delta_chr_f_scores = Vec::new();
251    let mut all_reversal_ratios = Vec::new();
252    let mut braces_disbalance_sum: usize = 0;
253    let mut total_delta_chr_f = ClassificationMetrics::default();
254    let mut total_delta_chr_f_precision = 0.0;
255    let mut total_delta_chr_f_recall = 0.0;
256    let mut delta_chr_f_beta = 0.0;
257    let mut total_exact_lines = ClassificationMetrics::default();
258    let mut total_scores: usize = 0;
259    let mut qa_reverts_count: usize = 0;
260    let mut qa_reverts_total: usize = 0;
261    let mut qa_confidence_sum: u64 = 0;
262    let mut qa_confidence_count: usize = 0;
263    let mut cursor_exact_matches: usize = 0;
264    let mut cursor_total: usize = 0;
265    let mut cursor_distance_sum: usize = 0;
266    let mut cursor_distance_count: usize = 0;
267    let mut wrong_editable_region_count: usize = 0;
268    let mut wrong_editable_region_total: usize = 0;
269    let mut isolated_whitespace_count: usize = 0;
270    let mut patch_inserted_tokens: Vec<usize> = Vec::new();
271    let mut patch_deleted_tokens: Vec<usize> = Vec::new();
272    let mut predictions_with_patch: usize = 0;
273
274    let mut printed_lines: usize = 0;
275    let mut skipped_lines: usize = 0;
276
277    for example in examples {
278        for (score_idx, score) in example.score.iter().enumerate() {
279            let exact_lines = score.exact_lines_counts();
280
281            // Get QA results for this prediction if available
282            let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
283            let qa_reverts_str = qa_result
284                .and_then(|q| q.reverts_edits)
285                .map(|v| if v { "yes" } else { "no" })
286                .unwrap_or("-");
287            let qa_conf_str = qa_result
288                .and_then(|q| q.confidence)
289                .map(|v| format!("{}", v))
290                .unwrap_or("-".to_string());
291
292            // Format wrong editable region metric
293            let wrong_er_str = match score.wrong_editable_region {
294                Some(true) => "✗",
295                Some(false) => "",
296                None => "",
297            };
298
299            // Format cursor metric
300            let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
301                (Some(true), _) => "✓".to_string(),
302                (Some(false), Some(dist)) => format!("±{}", dist),
303                (Some(false), None) => "✗".to_string(),
304                (None, _) => "-".to_string(),
305            };
306
307            if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
308                println!(
309                    "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
310                    truncate_name(&example.spec.name, 40),
311                    score.delta_chr_f,
312                    score.braces_disbalance,
313                    exact_lines.f1() * 100.0,
314                    score.reversal_ratio * 100.0,
315                    qa_reverts_str,
316                    qa_conf_str,
317                    cursor_str,
318                    wrong_er_str
319                );
320                printed_lines += 1;
321            } else {
322                skipped_lines += 1;
323            }
324
325            all_delta_chr_f_scores.push(score.delta_chr_f);
326            all_reversal_ratios.push(score.reversal_ratio);
327            total_scores += 1;
328            braces_disbalance_sum += score.braces_disbalance;
329            total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
330            total_delta_chr_f_precision += score.delta_chr_f_precision;
331            total_delta_chr_f_recall += score.delta_chr_f_recall;
332            delta_chr_f_beta = score.delta_chr_f_beta;
333            total_exact_lines.accumulate(&score.exact_lines_counts());
334
335            // Accumulate QA metrics
336            if let Some(qa) = qa_result {
337                if let Some(reverts) = qa.reverts_edits {
338                    qa_reverts_total += 1;
339                    if reverts {
340                        qa_reverts_count += 1;
341                    }
342                }
343                if let Some(conf) = qa.confidence {
344                    qa_confidence_sum += conf as u64;
345                    qa_confidence_count += 1;
346                }
347            }
348
349            // Accumulate wrong editable region metrics
350            if let Some(wrong) = score.wrong_editable_region {
351                wrong_editable_region_total += 1;
352                if wrong {
353                    wrong_editable_region_count += 1;
354                }
355            }
356
357            // Accumulate isolated whitespace metrics
358            if score.has_isolated_whitespace_changes {
359                isolated_whitespace_count += 1;
360            }
361
362            // Accumulate token change metrics (only for predictions that produced a patch)
363            let has_patch = example
364                .predictions
365                .get(score_idx)
366                .and_then(|p| p.actual_patch.as_ref())
367                .is_some_and(|p| !p.is_empty());
368            if has_patch {
369                predictions_with_patch += 1;
370                patch_inserted_tokens.push(score.inserted_tokens);
371                patch_deleted_tokens.push(score.deleted_tokens);
372            }
373
374            // Accumulate cursor metrics
375            if let Some(exact_match) = score.cursor_exact_match {
376                cursor_total += 1;
377                if exact_match {
378                    cursor_exact_matches += 1;
379                }
380            }
381            if let Some(dist) = score.cursor_distance {
382                cursor_distance_sum += dist;
383                cursor_distance_count += 1;
384            }
385        }
386    }
387
388    if skipped_lines > 0 {
389        println!(
390            "{:<40} (use --verbose to see all {} examples)",
391            format!("... and {} more", skipped_lines),
392            printed_lines + skipped_lines
393        );
394    }
395    println!("{}", separator);
396
397    if !all_delta_chr_f_scores.is_empty() {
398        let avg_delta_chr_f: f32 =
399            all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
400        let avg_reversal_ratio: f32 =
401            all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
402        let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
403
404        let qa_reverts_str = if qa_reverts_total > 0 {
405            format!(
406                "{:.1}%",
407                qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
408            )
409        } else {
410            "-".to_string()
411        };
412        let qa_conf_str = if qa_confidence_count > 0 {
413            format!(
414                "{:.1}",
415                qa_confidence_sum as f32 / qa_confidence_count as f32
416            )
417        } else {
418            "-".to_string()
419        };
420        let cursor_str = if cursor_total > 0 {
421            format!(
422                "{:.0}%",
423                cursor_exact_matches as f32 / cursor_total as f32 * 100.0
424            )
425        } else {
426            "-".to_string()
427        };
428        let wrong_er_str = if wrong_editable_region_total > 0 {
429            format!(
430                "{:.2}%",
431                wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
432            )
433        } else {
434            "-".to_string()
435        };
436        let isolated_ws_str = if total_scores > 0 {
437            format!(
438                "{}/{} ({:.1}%)",
439                isolated_whitespace_count,
440                total_scores,
441                isolated_whitespace_count as f32 / total_scores as f32 * 100.0
442            )
443        } else {
444            "-".to_string()
445        };
446        let avg_cursor_distance = if cursor_distance_count > 0 {
447            Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
448        } else {
449            None
450        };
451
452        println!(
453            "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
454            "TOTAL / AVERAGE",
455            avg_delta_chr_f,
456            braces_disbalance_avg,
457            total_exact_lines.f1() * 100.0,
458            avg_reversal_ratio * 100.0,
459            qa_reverts_str,
460            qa_conf_str,
461            cursor_str,
462            wrong_er_str
463        );
464        println!("{}", separator);
465        println!(
466            "Delta chrF (β={:.1}): TP={}, FP={}, FN={}, P={:.1}%, R={:.1}%",
467            delta_chr_f_beta,
468            total_delta_chr_f.true_positives,
469            total_delta_chr_f.false_positives,
470            total_delta_chr_f.false_negatives,
471            total_delta_chr_f_precision / total_scores as f64 * 100.0,
472            total_delta_chr_f_recall / total_scores as f64 * 100.0
473        );
474
475        // Print additional cursor metrics if available
476        if let Some(avg_dist) = avg_cursor_distance {
477            println!(
478                "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
479                cursor_exact_matches,
480                cursor_total,
481                cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
482                avg_dist
483            );
484        }
485
486        // Print isolated whitespace metrics
487        if total_scores > 0 {
488            println!("Isolated whitespace changes: {}", isolated_ws_str);
489        }
490
491        // Print token change percentile summary (only for predictions with a patch)
492        if !patch_inserted_tokens.is_empty() {
493            patch_inserted_tokens.sort_unstable();
494            patch_deleted_tokens.sort_unstable();
495            let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
496                .iter()
497                .zip(patch_deleted_tokens.iter())
498                .map(|(i, d)| i + d)
499                .collect();
500            patch_total_tokens.sort_unstable();
501
502            let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
503            println!();
504            println!(
505                "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
506                predictions_with_patch, total_scores, patch_rate
507            );
508            println!(
509                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
510                "", "p25", "p50", "p75", "p90", "p99"
511            );
512            println!("{}", "─".repeat(LINE_WIDTH));
513            println!(
514                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
515                "Inserted tokens",
516                percentile(&patch_inserted_tokens, 25),
517                percentile(&patch_inserted_tokens, 50),
518                percentile(&patch_inserted_tokens, 75),
519                percentile(&patch_inserted_tokens, 90),
520                percentile(&patch_inserted_tokens, 99),
521            );
522            println!(
523                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
524                "Deleted tokens",
525                percentile(&patch_deleted_tokens, 25),
526                percentile(&patch_deleted_tokens, 50),
527                percentile(&patch_deleted_tokens, 75),
528                percentile(&patch_deleted_tokens, 90),
529                percentile(&patch_deleted_tokens, 99),
530            );
531            println!(
532                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
533                "Total tokens",
534                percentile(&patch_total_tokens, 25),
535                percentile(&patch_total_tokens, 50),
536                percentile(&patch_total_tokens, 75),
537                percentile(&patch_total_tokens, 90),
538                percentile(&patch_total_tokens, 99),
539            );
540        }
541    }
542
543    println!("\n");
544}
545
/// Returns the `p`-th percentile of `sorted_values` using nearest-rank selection.
///
/// `sorted_values` must already be in ascending order. The rank is
/// `round(p / 100 * (len - 1))`, clamped to the last element, so `p >= 100` yields the
/// maximum. An empty slice yields `0`.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(&largest) = sorted_values.last() else {
        return 0;
    };

    let fraction = p as f64 / 100.0;
    let rank = (fraction * (sorted_values.len() as f64 - 1.0)).round() as usize;

    // Out-of-range ranks (p > 100) fall back to the last element.
    sorted_values.get(rank).copied().unwrap_or(largest)
}
553
/// Shortens `name` to at most `max_len` bytes, replacing the cut-off tail with `"..."`.
///
/// Names that already fit (`name.len() <= max_len`) are returned unchanged. Otherwise the
/// first `max_len - 3` bytes are kept, backing off to the nearest earlier UTF-8 character
/// boundary so that multi-byte characters are never split (slicing mid-character would
/// panic). For `max_len < 3` the result is just `"..."`.
fn truncate_name(name: &str, max_len: usize) -> String {
    if name.len() <= max_len {
        return name.to_string();
    }

    // `name.len() > max_len` here, so `keep` is always within the string.
    let mut keep = max_len.saturating_sub(3);
    // Index 0 is always a boundary, so this loop terminates.
    while !name.is_char_boundary(keep) {
        keep -= 1;
    }

    format!("{}...", &name[..keep])
}
561
562#[derive(Serialize)]
563pub struct SummaryJson {
564    pub total_examples: usize,
565    pub avg_delta_chr_f: f32,
566    pub delta_chr_f_beta: f64,
567    pub delta_chr_f_true_positives: usize,
568    pub delta_chr_f_false_positives: usize,
569    pub delta_chr_f_false_negatives: usize,
570    pub delta_chr_f_precision: f64,
571    pub delta_chr_f_recall: f64,
572    pub avg_braces_disbalance: f32,
573    pub exact_lines_true_positives: usize,
574    pub exact_lines_false_positives: usize,
575    pub exact_lines_false_negatives: usize,
576    pub exact_lines_precision: f64,
577    pub exact_lines_recall: f64,
578    pub exact_lines_f1: f64,
579    pub avg_reversal_ratio: f32,
580    #[serde(skip_serializing_if = "Option::is_none")]
581    pub qa_avg_reverts_edits: Option<f32>,
582    #[serde(skip_serializing_if = "Option::is_none")]
583    pub qa_avg_confidence: Option<f32>,
584    #[serde(skip_serializing_if = "Option::is_none")]
585    pub cursor_exact_match_rate: Option<f32>,
586    #[serde(skip_serializing_if = "Option::is_none")]
587    pub cursor_avg_distance: Option<f32>,
588    #[serde(skip_serializing_if = "Option::is_none")]
589    pub cursor_total_evaluated: Option<usize>,
590    #[serde(skip_serializing_if = "Option::is_none")]
591    pub wrong_editable_region_rate: Option<f32>,
592    pub isolated_whitespace_rate: Option<f32>,
593}
594
595pub fn compute_summary(examples: &[Example]) -> SummaryJson {
596    use crate::metrics::ClassificationMetrics;
597
598    let mut all_delta_chr_f_scores = Vec::new();
599    let mut all_reversal_ratios = Vec::new();
600    let mut braces_disbalance_sum: usize = 0;
601    let mut total_delta_chr_f = ClassificationMetrics::default();
602    let mut total_delta_chr_f_precision = 0.0;
603    let mut total_delta_chr_f_recall = 0.0;
604    let mut delta_chr_f_beta = 0.0;
605    let mut total_exact_lines = ClassificationMetrics::default();
606    let mut total_scores: usize = 0;
607    let mut qa_reverts_count: usize = 0;
608    let mut qa_reverts_total: usize = 0;
609    let mut qa_confidence_sum: u64 = 0;
610    let mut qa_confidence_count: usize = 0;
611    let mut cursor_exact_matches: usize = 0;
612    let mut cursor_total: usize = 0;
613    let mut cursor_distance_sum: usize = 0;
614    let mut cursor_distance_count: usize = 0;
615    let mut wrong_editable_region_count: usize = 0;
616    let mut wrong_editable_region_total: usize = 0;
617    let mut isolated_whitespace_count: usize = 0;
618
619    for example in examples {
620        for (score_idx, score) in example.score.iter().enumerate() {
621            all_delta_chr_f_scores.push(score.delta_chr_f);
622            all_reversal_ratios.push(score.reversal_ratio);
623            total_scores += 1;
624            braces_disbalance_sum += score.braces_disbalance;
625            total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
626            total_delta_chr_f_precision += score.delta_chr_f_precision;
627            total_delta_chr_f_recall += score.delta_chr_f_recall;
628            delta_chr_f_beta = score.delta_chr_f_beta;
629            total_exact_lines.accumulate(&score.exact_lines_counts());
630
631            // Accumulate QA metrics
632            if let Some(Some(qa)) = example.qa.get(score_idx) {
633                if let Some(reverts) = qa.reverts_edits {
634                    qa_reverts_total += 1;
635                    if reverts {
636                        qa_reverts_count += 1;
637                    }
638                }
639                if let Some(conf) = qa.confidence {
640                    qa_confidence_sum += conf as u64;
641                    qa_confidence_count += 1;
642                }
643            }
644
645            // Accumulate wrong editable region metrics
646            if let Some(wrong) = score.wrong_editable_region {
647                wrong_editable_region_total += 1;
648                if wrong {
649                    wrong_editable_region_count += 1;
650                }
651            }
652
653            // Accumulate isolated whitespace metrics
654            if score.has_isolated_whitespace_changes {
655                isolated_whitespace_count += 1;
656            }
657
658            // Accumulate cursor metrics
659            if let Some(exact_match) = score.cursor_exact_match {
660                cursor_total += 1;
661                if exact_match {
662                    cursor_exact_matches += 1;
663                }
664            }
665            if let Some(dist) = score.cursor_distance {
666                cursor_distance_sum += dist;
667                cursor_distance_count += 1;
668            }
669        }
670    }
671
672    let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
673        0.0
674    } else {
675        all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
676    };
677
678    let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
679        0.0
680    } else {
681        all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
682    };
683
684    let avg_braces_disbalance = if total_scores == 0 {
685        0.0
686    } else {
687        braces_disbalance_sum as f32 / total_scores as f32
688    };
689
690    let qa_avg_reverts_edits = if qa_reverts_total > 0 {
691        Some(qa_reverts_count as f32 / qa_reverts_total as f32)
692    } else {
693        None
694    };
695
696    let qa_avg_confidence = if qa_confidence_count > 0 {
697        Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
698    } else {
699        None
700    };
701
702    let cursor_exact_match_rate = if cursor_total > 0 {
703        Some(cursor_exact_matches as f32 / cursor_total as f32)
704    } else {
705        None
706    };
707
708    let cursor_avg_distance = if cursor_distance_count > 0 {
709        Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
710    } else {
711        None
712    };
713
714    let cursor_total_evaluated = if cursor_total > 0 {
715        Some(cursor_total)
716    } else {
717        None
718    };
719
720    let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
721        Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
722    } else {
723        None
724    };
725
726    let isolated_whitespace_rate = if total_scores > 0 {
727        Some(isolated_whitespace_count as f32 / total_scores as f32)
728    } else {
729        None
730    };
731
732    SummaryJson {
733        total_examples: total_scores,
734        avg_delta_chr_f,
735        delta_chr_f_beta,
736        delta_chr_f_true_positives: total_delta_chr_f.true_positives,
737        delta_chr_f_false_positives: total_delta_chr_f.false_positives,
738        delta_chr_f_false_negatives: total_delta_chr_f.false_negatives,
739        delta_chr_f_precision: if total_scores == 0 {
740            0.0
741        } else {
742            total_delta_chr_f_precision / total_scores as f64
743        },
744        delta_chr_f_recall: if total_scores == 0 {
745            0.0
746        } else {
747            total_delta_chr_f_recall / total_scores as f64
748        },
749        avg_braces_disbalance,
750        exact_lines_true_positives: total_exact_lines.true_positives,
751        exact_lines_false_positives: total_exact_lines.false_positives,
752        exact_lines_false_negatives: total_exact_lines.false_negatives,
753        exact_lines_precision: total_exact_lines.precision(),
754        exact_lines_recall: total_exact_lines.recall(),
755        exact_lines_f1: total_exact_lines.f1(),
756        avg_reversal_ratio,
757        qa_avg_reverts_edits,
758        qa_avg_confidence,
759        cursor_exact_match_rate,
760        cursor_avg_distance,
761        cursor_total_evaluated,
762        wrong_editable_region_rate,
763        isolated_whitespace_rate,
764    }
765}
766
767pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
768    let summary = compute_summary(examples);
769    let file = File::create(path)
770        .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
771    let writer = BufWriter::new(file);
772    serde_json::to_writer_pretty(writer, &summary)
773        .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
774    eprintln!("Wrote summary JSON to: {}", path.display());
775    Ok(())
776}