score.rs

  1use crate::{
  2    PredictArgs, PredictionProvider,
  3    example::{ActualCursor, Example, ExampleScore},
  4    format_prompt::TeacherPrompt,
  5    headless::EpAppState,
  6    metrics,
  7    parse_output::parse_prediction_output,
  8    predict::run_prediction,
  9    progress::{ExampleProgress, Step},
 10    reversal_tracking,
 11};
 12use anyhow::Context as _;
 13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
 14use gpui::AsyncApp;
 15use serde::Serialize;
 16use std::fs::File;
 17use std::io::BufWriter;
 18use std::path::Path;
 19use std::sync::Arc;
 20
 21pub async fn run_scoring(
 22    example: &mut Example,
 23    args: &PredictArgs,
 24    app_state: Arc<EpAppState>,
 25    example_progress: &ExampleProgress,
 26    cx: AsyncApp,
 27) -> anyhow::Result<()> {
 28    run_prediction(example, args, app_state, example_progress, cx).await?;
 29
 30    let progress = example_progress.start(Step::Score);
 31
 32    progress.set_substatus("applying patches");
 33    let prompt_inputs = example
 34        .prompt_inputs
 35        .as_ref()
 36        .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
 37    let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
 38    let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
 39
 40    let expected_texts: Vec<String> = expected_patches_with_cursors
 41        .iter()
 42        .map(|(patch, _)| {
 43            apply_diff_to_string(patch, original_text)
 44                .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
 45        })
 46        .collect::<Result<Vec<_>, _>>()?;
 47
 48    // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
 49    // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
 50    // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
 51    // region to find where the hunk matched, then compute the expected cursor position.
 52    let old_editable_region = if let Some(p) = example.prompt.as_ref() {
 53        if matches!(
 54            p.provider,
 55            PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
 56        ) {
 57            Some(
 58                TeacherPrompt::extract_editable_region(&p.input)?
 59                    .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
 60            )
 61        } else {
 62            None
 63        }
 64    } else {
 65        None
 66    };
 67
 68    let zero_scores = ExampleScore {
 69        delta_chr_f: 0.0,
 70        braces_disbalance: 0,
 71        exact_lines_tp: 0,
 72        exact_lines_fp: 0,
 73        exact_lines_fn: 0,
 74        token_match_tp: 0,
 75        token_match_fp: 0,
 76        token_match_fn: 0,
 77        token_match_precision: 0.0,
 78        token_match_recall: 0.0,
 79        reversal_ratio: 0.0,
 80        cursor_distance: None,
 81        cursor_exact_match: None,
 82        wrong_editable_region: None,
 83        has_isolated_whitespace_changes: false,
 84        inserted_tokens: 0,
 85        deleted_tokens: 0,
 86        cumulative_logprob: None,
 87        avg_logprob: None,
 88    };
 89
 90    let cursor_path = example.spec.cursor_path.as_ref();
 91
 92    progress.set_substatus("computing metrics");
 93    let mut scores = vec![];
 94    for prediction in &example.predictions {
 95        let actual_patch = prediction.actual_patch.clone().or_else(|| {
 96            parse_prediction_output(example, &prediction.actual_output, prediction.provider)
 97                .ok()
 98                .map(|(patch, _)| patch)
 99        });
100
101        let Some(actual_patch) = actual_patch else {
102            scores.push(zero_scores.clone());
103            continue;
104        };
105
106        let token_changes = metrics::count_patch_token_changes(&actual_patch);
107
108        let best_exact_lines = expected_patches_with_cursors
109            .iter()
110            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
111            .max_by_key(|m| m.true_positives)
112            .unwrap_or_default();
113
114        let best_token_match = expected_patches_with_cursors
115            .iter()
116            .map(|(expected_patch, _)| metrics::token_match(expected_patch, &actual_patch))
117            .max_by(metrics::compare_classification_metrics)
118            .unwrap_or_default();
119
120        let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
121            Ok(text) => text,
122            Err(_) => {
123                let mut s = zero_scores.clone();
124                s.exact_lines_tp = best_exact_lines.true_positives;
125                s.exact_lines_fp = best_exact_lines.false_positives;
126                s.exact_lines_fn = best_exact_lines.false_negatives;
127                s.token_match_tp = best_token_match.true_positives;
128                s.token_match_fp = best_token_match.false_positives;
129                s.token_match_fn = best_token_match.false_negatives;
130                s.token_match_precision = best_token_match.precision();
131                s.token_match_recall = best_token_match.recall();
132                s.inserted_tokens = token_changes.inserted_tokens;
133                s.deleted_tokens = token_changes.deleted_tokens;
134                scores.push(s);
135                continue;
136            }
137        };
138
139        let mut best_delta_chr_f = 0.0f32;
140        let mut best_expected_cursor: Option<usize> = None;
141        let mut best_patch_idx: Option<usize> = None;
142
143        for (idx, expected) in expected_texts.iter().enumerate() {
144            let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
145            if delta_chr_f > best_delta_chr_f {
146                best_delta_chr_f = delta_chr_f;
147                best_patch_idx = Some(idx);
148            }
149        }
150
151        if let Some(idx) = best_patch_idx {
152            // Get the raw cursor offset from the expected patch (relative to hunk new text)
153            let expected_cursor_in_patch = expected_patches_with_cursors
154                .get(idx)
155                .and_then(|(_, cursor)| *cursor);
156
157            // For Teacher prompts, we need to apply the patch to the editable region
158            // to find where the hunk matched, then compute the actual cursor position
159            if let (Some(editable_region), Some(cursor_in_patch)) =
160                (&old_editable_region, expected_cursor_in_patch)
161            {
162                let (patch, _) = &expected_patches_with_cursors[idx];
163                if let Ok((_, hunk_offset)) =
164                    apply_diff_to_string_with_hunk_offset(patch, editable_region)
165                {
166                    let hunk_start = hunk_offset.unwrap_or(0);
167                    best_expected_cursor = Some(hunk_start + cursor_in_patch);
168                }
169            } else {
170                // For non-Teacher prompts or if we can't compute, use raw offset
171                best_expected_cursor = expected_cursor_in_patch;
172            }
173        }
174
175        let disbalance_before = metrics::braces_disbalance(&original_text);
176        let disbalance_after = metrics::braces_disbalance(&actual_text);
177        let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
178
179        // Compute reversal ratio
180        let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
181            prompt_inputs,
182            &actual_text,
183            cursor_path,
184        );
185
186        // Compute cursor position metrics
187        let (cursor_distance, cursor_exact_match) =
188            compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
189
190        // Compute approximation of editable region correctness
191        let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
192
193        // Check for isolated whitespace changes.
194        let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
195            &actual_patch,
196            prediction.actual_cursor.as_ref(),
197        );
198
199        scores.push(ExampleScore {
200            delta_chr_f: best_delta_chr_f,
201            braces_disbalance,
202            exact_lines_tp: best_exact_lines.true_positives,
203            exact_lines_fp: best_exact_lines.false_positives,
204            exact_lines_fn: best_exact_lines.false_negatives,
205            token_match_tp: best_token_match.true_positives,
206            token_match_fp: best_token_match.false_positives,
207            token_match_fn: best_token_match.false_negatives,
208            token_match_precision: best_token_match.precision(),
209            token_match_recall: best_token_match.recall(),
210            reversal_ratio,
211            cursor_distance,
212            cursor_exact_match,
213            wrong_editable_region,
214            has_isolated_whitespace_changes,
215            inserted_tokens: token_changes.inserted_tokens,
216            deleted_tokens: token_changes.deleted_tokens,
217            cumulative_logprob: prediction.cumulative_logprob,
218            avg_logprob: prediction.avg_logprob,
219        });
220    }
221
222    example.score = scores;
223    Ok(())
224}
225
226fn compute_cursor_metrics(
227    expected_cursor_editable_region_offset: Option<usize>,
228    actual_cursor: Option<&ActualCursor>,
229) -> (Option<usize>, Option<bool>) {
230    match (expected_cursor_editable_region_offset, actual_cursor) {
231        (Some(expected), Some(actual)) => {
232            let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
233            let exact_match = distance == 0;
234            (Some(distance), Some(exact_match))
235        }
236        (None, None) => {
237            // Neither has cursor position - skip cursor scoring
238            (None, None)
239        }
240        (Some(_), None) | (None, Some(_)) => {
241            // Only one has cursor position - count as miss
242            (None, Some(false))
243        }
244    }
245}
246
247pub fn print_report(examples: &[Example], verbose: bool) {
248    const MAX_EXAMPLES_DEFAULT: usize = 20;
249    use crate::metrics::ClassificationMetrics;
250
251    const LINE_WIDTH: usize = 101;
252    let separator = "".repeat(LINE_WIDTH);
253
254    println!("{}", separator);
255    println!(
256        "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
257        "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
258    );
259    println!("{}", separator);
260
261    let mut all_delta_chr_f_scores = Vec::new();
262    let mut all_reversal_ratios = Vec::new();
263    let mut braces_disbalance_sum: usize = 0;
264    let mut total_exact_lines = ClassificationMetrics::default();
265    let mut total_scores: usize = 0;
266    let mut qa_reverts_count: usize = 0;
267    let mut qa_reverts_total: usize = 0;
268    let mut qa_confidence_sum: u64 = 0;
269    let mut qa_confidence_count: usize = 0;
270    let mut cursor_exact_matches: usize = 0;
271    let mut cursor_total: usize = 0;
272    let mut cursor_distance_sum: usize = 0;
273    let mut cursor_distance_count: usize = 0;
274    let mut wrong_editable_region_count: usize = 0;
275    let mut wrong_editable_region_total: usize = 0;
276    let mut isolated_whitespace_count: usize = 0;
277    let mut patch_inserted_tokens: Vec<usize> = Vec::new();
278    let mut patch_deleted_tokens: Vec<usize> = Vec::new();
279    let mut total_token_match = ClassificationMetrics::default();
280    let mut predictions_with_patch: usize = 0;
281
282    let mut printed_lines: usize = 0;
283    let mut skipped_lines: usize = 0;
284
285    for example in examples {
286        for (score_idx, score) in example.score.iter().enumerate() {
287            let exact_lines = ClassificationMetrics {
288                true_positives: score.exact_lines_tp,
289                false_positives: score.exact_lines_fp,
290                false_negatives: score.exact_lines_fn,
291            };
292
293            // Get QA results for this prediction if available
294            let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
295            let qa_reverts_str = qa_result
296                .and_then(|q| q.reverts_edits)
297                .map(|v| if v { "yes" } else { "no" })
298                .unwrap_or("-");
299            let qa_conf_str = qa_result
300                .and_then(|q| q.confidence)
301                .map(|v| format!("{}", v))
302                .unwrap_or("-".to_string());
303
304            // Format wrong editable region metric
305            let wrong_er_str = match score.wrong_editable_region {
306                Some(true) => "",
307                Some(false) => "",
308                None => "",
309            };
310
311            // Format cursor metric
312            let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
313                (Some(true), _) => "".to_string(),
314                (Some(false), Some(dist)) => format!("±{}", dist),
315                (Some(false), None) => "".to_string(),
316                (None, _) => "-".to_string(),
317            };
318
319            if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
320                println!(
321                    "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
322                    truncate_name(&example.spec.name, 40),
323                    score.delta_chr_f,
324                    score.braces_disbalance,
325                    exact_lines.f1() * 100.0,
326                    score.reversal_ratio * 100.0,
327                    qa_reverts_str,
328                    qa_conf_str,
329                    cursor_str,
330                    wrong_er_str
331                );
332                printed_lines += 1;
333            } else {
334                skipped_lines += 1;
335            }
336
337            all_delta_chr_f_scores.push(score.delta_chr_f);
338            all_reversal_ratios.push(score.reversal_ratio);
339            total_scores += 1;
340            braces_disbalance_sum += score.braces_disbalance;
341            total_exact_lines.true_positives += score.exact_lines_tp;
342            total_exact_lines.false_positives += score.exact_lines_fp;
343            total_exact_lines.false_negatives += score.exact_lines_fn;
344            total_token_match.true_positives += score.token_match_tp;
345            total_token_match.false_positives += score.token_match_fp;
346            total_token_match.false_negatives += score.token_match_fn;
347
348            // Accumulate QA metrics
349            if let Some(qa) = qa_result {
350                if let Some(reverts) = qa.reverts_edits {
351                    qa_reverts_total += 1;
352                    if reverts {
353                        qa_reverts_count += 1;
354                    }
355                }
356                if let Some(conf) = qa.confidence {
357                    qa_confidence_sum += conf as u64;
358                    qa_confidence_count += 1;
359                }
360            }
361
362            // Accumulate wrong editable region metrics
363            if let Some(wrong) = score.wrong_editable_region {
364                wrong_editable_region_total += 1;
365                if wrong {
366                    wrong_editable_region_count += 1;
367                }
368            }
369
370            // Accumulate isolated whitespace metrics
371            if score.has_isolated_whitespace_changes {
372                isolated_whitespace_count += 1;
373            }
374
375            // Accumulate token change metrics (only for predictions that produced a patch)
376            let has_patch = example
377                .predictions
378                .get(score_idx)
379                .and_then(|p| p.actual_patch.as_ref())
380                .is_some_and(|p| !p.is_empty());
381            if has_patch {
382                predictions_with_patch += 1;
383                patch_inserted_tokens.push(score.inserted_tokens);
384                patch_deleted_tokens.push(score.deleted_tokens);
385            }
386
387            // Accumulate cursor metrics
388            if let Some(exact_match) = score.cursor_exact_match {
389                cursor_total += 1;
390                if exact_match {
391                    cursor_exact_matches += 1;
392                }
393            }
394            if let Some(dist) = score.cursor_distance {
395                cursor_distance_sum += dist;
396                cursor_distance_count += 1;
397            }
398        }
399    }
400
401    if skipped_lines > 0 {
402        println!(
403            "{:<40} (use --verbose to see all {} examples)",
404            format!("... and {} more", skipped_lines),
405            printed_lines + skipped_lines
406        );
407    }
408    println!("{}", separator);
409
410    if !all_delta_chr_f_scores.is_empty() {
411        let avg_delta_chr_f: f32 =
412            all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
413        let avg_reversal_ratio: f32 =
414            all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
415        let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
416
417        let qa_reverts_str = if qa_reverts_total > 0 {
418            format!(
419                "{:.1}%",
420                qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
421            )
422        } else {
423            "-".to_string()
424        };
425        let qa_conf_str = if qa_confidence_count > 0 {
426            format!(
427                "{:.1}",
428                qa_confidence_sum as f32 / qa_confidence_count as f32
429            )
430        } else {
431            "-".to_string()
432        };
433        let cursor_str = if cursor_total > 0 {
434            format!(
435                "{:.0}%",
436                cursor_exact_matches as f32 / cursor_total as f32 * 100.0
437            )
438        } else {
439            "-".to_string()
440        };
441        let wrong_er_str = if wrong_editable_region_total > 0 {
442            format!(
443                "{:.2}%",
444                wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
445            )
446        } else {
447            "-".to_string()
448        };
449        let isolated_ws_str = if total_scores > 0 {
450            format!(
451                "{}/{} ({:.1}%)",
452                isolated_whitespace_count,
453                total_scores,
454                isolated_whitespace_count as f32 / total_scores as f32 * 100.0
455            )
456        } else {
457            "-".to_string()
458        };
459        let avg_cursor_distance = if cursor_distance_count > 0 {
460            Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
461        } else {
462            None
463        };
464
465        println!(
466            "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
467            "TOTAL / AVERAGE",
468            avg_delta_chr_f,
469            braces_disbalance_avg,
470            total_exact_lines.f1() * 100.0,
471            avg_reversal_ratio * 100.0,
472            qa_reverts_str,
473            qa_conf_str,
474            cursor_str,
475            wrong_er_str
476        );
477        println!("{}", separator);
478
479        // Print additional cursor metrics if available
480        if let Some(avg_dist) = avg_cursor_distance {
481            println!(
482                "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
483                cursor_exact_matches,
484                cursor_total,
485                cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
486                avg_dist
487            );
488        }
489
490        // Print isolated whitespace metrics
491        if total_scores > 0 {
492            println!("Isolated whitespace changes: {}", isolated_ws_str);
493        }
494
495        println!(
496            "Token match: P={:.1}% R={:.1}% F1={:.1}% (TP={}, FP={}, FN={})",
497            total_token_match.precision() * 100.0,
498            total_token_match.recall() * 100.0,
499            total_token_match.f1() * 100.0,
500            total_token_match.true_positives,
501            total_token_match.false_positives,
502            total_token_match.false_negatives,
503        );
504
505        // Print token change percentile summary (only for predictions with a patch)
506        if !patch_inserted_tokens.is_empty() {
507            patch_inserted_tokens.sort_unstable();
508            patch_deleted_tokens.sort_unstable();
509            let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
510                .iter()
511                .zip(patch_deleted_tokens.iter())
512                .map(|(i, d)| i + d)
513                .collect();
514            patch_total_tokens.sort_unstable();
515
516            let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
517            println!();
518            println!(
519                "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
520                predictions_with_patch, total_scores, patch_rate
521            );
522            println!(
523                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
524                "", "p25", "p50", "p75", "p90", "p99"
525            );
526            println!("{}", "".repeat(LINE_WIDTH));
527            println!(
528                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
529                "Inserted tokens",
530                percentile(&patch_inserted_tokens, 25),
531                percentile(&patch_inserted_tokens, 50),
532                percentile(&patch_inserted_tokens, 75),
533                percentile(&patch_inserted_tokens, 90),
534                percentile(&patch_inserted_tokens, 99),
535            );
536            println!(
537                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
538                "Deleted tokens",
539                percentile(&patch_deleted_tokens, 25),
540                percentile(&patch_deleted_tokens, 50),
541                percentile(&patch_deleted_tokens, 75),
542                percentile(&patch_deleted_tokens, 90),
543                percentile(&patch_deleted_tokens, 99),
544            );
545            println!(
546                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
547                "Total tokens",
548                percentile(&patch_total_tokens, 25),
549                percentile(&patch_total_tokens, 50),
550                percentile(&patch_total_tokens, 75),
551                percentile(&patch_total_tokens, 90),
552                percentile(&patch_total_tokens, 99),
553            );
554        }
555    }
556
557    println!("\n");
558}
559
/// Returns the `p`-th percentile of `sorted_values`, or `0` for an empty slice.
///
/// The slice must already be sorted ascending. The percentile is mapped onto an index by
/// scaling `p / 100` over the index range and rounding to the nearest position; indices
/// past the end (e.g. `p > 100`) are clamped to the last element.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_idx) = sorted_values.len().checked_sub(1) else {
        return 0;
    };

    let fraction = p as f64 / 100.0;
    let rank = (fraction * (sorted_values.len() as f64 - 1.0)).round() as usize;
    sorted_values[rank.min(last_idx)]
}
567
/// Shortens `name` to at most `max_len` characters for display in a fixed-width column.
///
/// Names that already fit are returned unchanged. Longer names keep their first
/// `max_len - 3` characters followed by `"..."`. Length is counted in characters rather
/// than bytes, so the cut never lands inside a multi-byte UTF-8 sequence. When `max_len`
/// is smaller than the ellipsis itself, the result is just `"..."`.
fn truncate_name(name: &str, max_len: usize) -> String {
    const ELLIPSIS_LEN: usize = 3;

    // A character at index `max_len` exists only if the name is longer than `max_len`.
    if name.chars().nth(max_len).is_none() {
        return name.to_string();
    }

    let kept_chars = max_len.saturating_sub(ELLIPSIS_LEN);
    // Byte offset at which the first character to drop starts.
    let cut = name
        .char_indices()
        .nth(kept_chars)
        .map_or(name.len(), |(byte_idx, _)| byte_idx);
    format!("{}...", &name[..cut])
}
575
576#[derive(Serialize)]
577pub struct SummaryJson {
578    pub total_examples: usize,
579    pub avg_delta_chr_f: f32,
580    pub avg_braces_disbalance: f32,
581    pub exact_lines_true_positives: usize,
582    pub exact_lines_false_positives: usize,
583    pub exact_lines_false_negatives: usize,
584    pub exact_lines_precision: f64,
585    pub exact_lines_recall: f64,
586    pub exact_lines_f1: f64,
587    pub token_match_tp: usize,
588    pub token_match_fp: usize,
589    pub token_match_fn: usize,
590    pub token_match_precision: f64,
591    pub token_match_recall: f64,
592    pub token_match_f1: f64,
593    pub avg_reversal_ratio: f32,
594    #[serde(skip_serializing_if = "Option::is_none")]
595    pub qa_avg_reverts_edits: Option<f32>,
596    #[serde(skip_serializing_if = "Option::is_none")]
597    pub qa_avg_confidence: Option<f32>,
598    #[serde(skip_serializing_if = "Option::is_none")]
599    pub cursor_exact_match_rate: Option<f32>,
600    #[serde(skip_serializing_if = "Option::is_none")]
601    pub cursor_avg_distance: Option<f32>,
602    #[serde(skip_serializing_if = "Option::is_none")]
603    pub cursor_total_evaluated: Option<usize>,
604    #[serde(skip_serializing_if = "Option::is_none")]
605    pub wrong_editable_region_rate: Option<f32>,
606    pub isolated_whitespace_rate: Option<f32>,
607}
608
609pub fn compute_summary(examples: &[Example]) -> SummaryJson {
610    use crate::metrics::ClassificationMetrics;
611
612    let mut all_delta_chr_f_scores = Vec::new();
613    let mut all_reversal_ratios = Vec::new();
614    let mut braces_disbalance_sum: usize = 0;
615    let mut total_exact_lines = ClassificationMetrics::default();
616    let mut total_token_match = ClassificationMetrics::default();
617    let mut total_scores: usize = 0;
618    let mut qa_reverts_count: usize = 0;
619    let mut qa_reverts_total: usize = 0;
620    let mut qa_confidence_sum: u64 = 0;
621    let mut qa_confidence_count: usize = 0;
622    let mut cursor_exact_matches: usize = 0;
623    let mut cursor_total: usize = 0;
624    let mut cursor_distance_sum: usize = 0;
625    let mut cursor_distance_count: usize = 0;
626    let mut wrong_editable_region_count: usize = 0;
627    let mut wrong_editable_region_total: usize = 0;
628    let mut isolated_whitespace_count: usize = 0;
629
630    for example in examples {
631        for (score_idx, score) in example.score.iter().enumerate() {
632            all_delta_chr_f_scores.push(score.delta_chr_f);
633            all_reversal_ratios.push(score.reversal_ratio);
634            total_scores += 1;
635            braces_disbalance_sum += score.braces_disbalance;
636            total_exact_lines.true_positives += score.exact_lines_tp;
637            total_exact_lines.false_positives += score.exact_lines_fp;
638            total_exact_lines.false_negatives += score.exact_lines_fn;
639            total_token_match.true_positives += score.token_match_tp;
640            total_token_match.false_positives += score.token_match_fp;
641            total_token_match.false_negatives += score.token_match_fn;
642
643            // Accumulate QA metrics
644            if let Some(Some(qa)) = example.qa.get(score_idx) {
645                if let Some(reverts) = qa.reverts_edits {
646                    qa_reverts_total += 1;
647                    if reverts {
648                        qa_reverts_count += 1;
649                    }
650                }
651                if let Some(conf) = qa.confidence {
652                    qa_confidence_sum += conf as u64;
653                    qa_confidence_count += 1;
654                }
655            }
656
657            // Accumulate wrong editable region metrics
658            if let Some(wrong) = score.wrong_editable_region {
659                wrong_editable_region_total += 1;
660                if wrong {
661                    wrong_editable_region_count += 1;
662                }
663            }
664
665            // Accumulate isolated whitespace metrics
666            if score.has_isolated_whitespace_changes {
667                isolated_whitespace_count += 1;
668            }
669
670            // Accumulate cursor metrics
671            if let Some(exact_match) = score.cursor_exact_match {
672                cursor_total += 1;
673                if exact_match {
674                    cursor_exact_matches += 1;
675                }
676            }
677            if let Some(dist) = score.cursor_distance {
678                cursor_distance_sum += dist;
679                cursor_distance_count += 1;
680            }
681        }
682    }
683
684    let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
685        0.0
686    } else {
687        all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
688    };
689
690    let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
691        0.0
692    } else {
693        all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
694    };
695
696    let avg_braces_disbalance = if total_scores == 0 {
697        0.0
698    } else {
699        braces_disbalance_sum as f32 / total_scores as f32
700    };
701
702    let qa_avg_reverts_edits = if qa_reverts_total > 0 {
703        Some(qa_reverts_count as f32 / qa_reverts_total as f32)
704    } else {
705        None
706    };
707
708    let qa_avg_confidence = if qa_confidence_count > 0 {
709        Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
710    } else {
711        None
712    };
713
714    let cursor_exact_match_rate = if cursor_total > 0 {
715        Some(cursor_exact_matches as f32 / cursor_total as f32)
716    } else {
717        None
718    };
719
720    let cursor_avg_distance = if cursor_distance_count > 0 {
721        Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
722    } else {
723        None
724    };
725
726    let cursor_total_evaluated = if cursor_total > 0 {
727        Some(cursor_total)
728    } else {
729        None
730    };
731
732    let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
733        Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
734    } else {
735        None
736    };
737
738    let isolated_whitespace_rate = if total_scores > 0 {
739        Some(isolated_whitespace_count as f32 / total_scores as f32)
740    } else {
741        None
742    };
743
744    SummaryJson {
745        total_examples: total_scores,
746        avg_delta_chr_f,
747        avg_braces_disbalance,
748        exact_lines_true_positives: total_exact_lines.true_positives,
749        exact_lines_false_positives: total_exact_lines.false_positives,
750        exact_lines_false_negatives: total_exact_lines.false_negatives,
751        exact_lines_precision: total_exact_lines.precision(),
752        exact_lines_recall: total_exact_lines.recall(),
753        exact_lines_f1: total_exact_lines.f1(),
754        token_match_tp: total_token_match.true_positives,
755        token_match_fp: total_token_match.false_positives,
756        token_match_fn: total_token_match.false_negatives,
757        token_match_precision: total_token_match.precision(),
758        token_match_recall: total_token_match.recall(),
759        token_match_f1: total_token_match.f1(),
760        avg_reversal_ratio,
761        qa_avg_reverts_edits,
762        qa_avg_confidence,
763        cursor_exact_match_rate,
764        cursor_avg_distance,
765        cursor_total_evaluated,
766        wrong_editable_region_rate,
767        isolated_whitespace_rate,
768    }
769}
770
771pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
772    let summary = compute_summary(examples);
773    let file = File::create(path)
774        .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
775    let writer = BufWriter::new(file);
776    serde_json::to_writer_pretty(writer, &summary)
777        .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
778    eprintln!("Wrote summary JSON to: {}", path.display());
779    Ok(())
780}