score.rs

  1use crate::{
  2    PredictArgs, PredictionProvider,
  3    example::{ActualCursor, Example, ExampleScore},
  4    format_prompt::TeacherPrompt,
  5    headless::EpAppState,
  6    metrics,
  7    parse_output::parse_prediction_output,
  8    predict::run_prediction,
  9    progress::{ExampleProgress, Step},
 10    reversal_tracking,
 11};
 12use anyhow::Context as _;
 13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
 14use gpui::AsyncApp;
 15use serde::Serialize;
 16use std::fs::File;
 17use std::io::BufWriter;
 18use std::path::Path;
 19use std::sync::Arc;
 20
 21pub async fn run_scoring(
 22    example: &mut Example,
 23    args: &PredictArgs,
 24    app_state: Arc<EpAppState>,
 25    example_progress: &ExampleProgress,
 26    cx: AsyncApp,
 27) -> anyhow::Result<()> {
 28    run_prediction(example, args, app_state, example_progress, cx).await?;
 29
 30    let progress = example_progress.start(Step::Score);
 31
 32    progress.set_substatus("applying patches");
 33    let original_text = &example
 34        .prompt_inputs
 35        .as_ref()
 36        .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?
 37        .content;
 38    let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
 39
 40    let expected_texts: Vec<String> = expected_patches_with_cursors
 41        .iter()
 42        .map(|(patch, _)| {
 43            apply_diff_to_string(patch, original_text)
 44                .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
 45        })
 46        .collect::<Result<Vec<_>, _>>()?;
 47
 48    // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
 49    // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
 50    // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
 51    // region to find where the hunk matched, then compute the expected cursor position.
 52    let old_editable_region = if let Some(p) = example.prompt.as_ref() {
 53        if matches!(
 54            p.provider,
 55            PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
 56        ) {
 57            Some(
 58                TeacherPrompt::extract_editable_region(&p.input)?
 59                    .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
 60            )
 61        } else {
 62            None
 63        }
 64    } else {
 65        None
 66    };
 67
 68    let zero_scores = ExampleScore {
 69        delta_chr_f: 0.0,
 70        braces_disbalance: 0,
 71        exact_lines_tp: 0,
 72        exact_lines_fp: 0,
 73        exact_lines_fn: 0,
 74        reversal_ratio: 0.0,
 75        cursor_distance: None,
 76        cursor_exact_match: None,
 77        wrong_editable_region: None,
 78        has_isolated_whitespace_changes: false,
 79        inserted_tokens: 0,
 80        deleted_tokens: 0,
 81    };
 82
 83    let prompt_inputs = example.prompt_inputs.as_ref().unwrap();
 84    let cursor_path = example.spec.cursor_path.as_ref();
 85
 86    progress.set_substatus("computing metrics");
 87    let mut scores = vec![];
 88    for prediction in &example.predictions {
 89        let actual_patch = prediction.actual_patch.clone().or_else(|| {
 90            parse_prediction_output(example, &prediction.actual_output, prediction.provider)
 91                .ok()
 92                .map(|(patch, _)| patch)
 93        });
 94
 95        let Some(actual_patch) = actual_patch else {
 96            scores.push(zero_scores.clone());
 97            continue;
 98        };
 99
100        let token_changes = metrics::count_patch_token_changes(&actual_patch);
101
102        let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
103            Ok(text) => text,
104            Err(_) => {
105                let mut s = zero_scores.clone();
106                s.inserted_tokens = token_changes.inserted_tokens;
107                s.deleted_tokens = token_changes.deleted_tokens;
108                scores.push(s);
109                continue;
110            }
111        };
112
113        let mut best_delta_chr_f = 0.0f32;
114        let mut best_expected_cursor: Option<usize> = None;
115        let mut best_patch_idx: Option<usize> = None;
116
117        for (idx, expected) in expected_texts.iter().enumerate() {
118            let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
119            if delta_chr_f > best_delta_chr_f {
120                best_delta_chr_f = delta_chr_f;
121                best_patch_idx = Some(idx);
122            }
123        }
124
125        if let Some(idx) = best_patch_idx {
126            // Get the raw cursor offset from the expected patch (relative to hunk new text)
127            let expected_cursor_in_patch = expected_patches_with_cursors
128                .get(idx)
129                .and_then(|(_, cursor)| *cursor);
130
131            // For Teacher prompts, we need to apply the patch to the editable region
132            // to find where the hunk matched, then compute the actual cursor position
133            if let (Some(editable_region), Some(cursor_in_patch)) =
134                (&old_editable_region, expected_cursor_in_patch)
135            {
136                let (patch, _) = &expected_patches_with_cursors[idx];
137                if let Ok((_, hunk_offset)) =
138                    apply_diff_to_string_with_hunk_offset(patch, editable_region)
139                {
140                    let hunk_start = hunk_offset.unwrap_or(0);
141                    best_expected_cursor = Some(hunk_start + cursor_in_patch);
142                }
143            } else {
144                // For non-Teacher prompts or if we can't compute, use raw offset
145                best_expected_cursor = expected_cursor_in_patch;
146            }
147        }
148
149        let disbalance_before = metrics::braces_disbalance(&original_text);
150        let disbalance_after = metrics::braces_disbalance(&actual_text);
151        let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
152
153        // Compute exact lines match against best matching expected patch
154        let best_exact_lines = expected_patches_with_cursors
155            .iter()
156            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
157            .max_by_key(|m| m.true_positives)
158            .unwrap_or_default();
159
160        // Compute reversal ratio
161        let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
162            prompt_inputs,
163            &actual_text,
164            cursor_path,
165        );
166
167        // Compute cursor position metrics
168        let (cursor_distance, cursor_exact_match) =
169            compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
170
171        // Compute approximation of editable region correctness
172        let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
173
174        // Check for isolated whitespace changes.
175        let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
176            &actual_patch,
177            prediction.actual_cursor.as_ref(),
178        );
179
180        scores.push(ExampleScore {
181            delta_chr_f: best_delta_chr_f,
182            braces_disbalance,
183            exact_lines_tp: best_exact_lines.true_positives,
184            exact_lines_fp: best_exact_lines.false_positives,
185            exact_lines_fn: best_exact_lines.false_negatives,
186            reversal_ratio,
187            cursor_distance,
188            cursor_exact_match,
189            wrong_editable_region,
190            has_isolated_whitespace_changes,
191            inserted_tokens: token_changes.inserted_tokens,
192            deleted_tokens: token_changes.deleted_tokens,
193        });
194    }
195
196    example.score = scores;
197    Ok(())
198}
199
200fn compute_cursor_metrics(
201    expected_cursor_editable_region_offset: Option<usize>,
202    actual_cursor: Option<&ActualCursor>,
203) -> (Option<usize>, Option<bool>) {
204    match (expected_cursor_editable_region_offset, actual_cursor) {
205        (Some(expected), Some(actual)) => {
206            let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
207            let exact_match = distance == 0;
208            (Some(distance), Some(exact_match))
209        }
210        (None, None) => {
211            // Neither has cursor position - skip cursor scoring
212            (None, None)
213        }
214        (Some(_), None) | (None, Some(_)) => {
215            // Only one has cursor position - count as miss
216            (None, Some(false))
217        }
218    }
219}
220
221pub fn print_report(examples: &[Example]) {
222    use crate::metrics::ClassificationMetrics;
223
224    const LINE_WIDTH: usize = 101;
225    let separator = "".repeat(LINE_WIDTH);
226
227    println!("{}", separator);
228    println!(
229        "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
230        "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
231    );
232    println!("{}", separator);
233
234    let mut all_delta_chr_f_scores = Vec::new();
235    let mut all_reversal_ratios = Vec::new();
236    let mut braces_disbalance_sum: usize = 0;
237    let mut total_exact_lines = ClassificationMetrics::default();
238    let mut total_scores: usize = 0;
239    let mut qa_reverts_count: usize = 0;
240    let mut qa_reverts_total: usize = 0;
241    let mut qa_confidence_sum: u64 = 0;
242    let mut qa_confidence_count: usize = 0;
243    let mut cursor_exact_matches: usize = 0;
244    let mut cursor_total: usize = 0;
245    let mut cursor_distance_sum: usize = 0;
246    let mut cursor_distance_count: usize = 0;
247    let mut wrong_editable_region_count: usize = 0;
248    let mut wrong_editable_region_total: usize = 0;
249    let mut isolated_whitespace_count: usize = 0;
250    let mut patch_inserted_tokens: Vec<usize> = Vec::new();
251    let mut patch_deleted_tokens: Vec<usize> = Vec::new();
252    let mut predictions_with_patch: usize = 0;
253
254    for example in examples {
255        for (score_idx, score) in example.score.iter().enumerate() {
256            let exact_lines = ClassificationMetrics {
257                true_positives: score.exact_lines_tp,
258                false_positives: score.exact_lines_fp,
259                false_negatives: score.exact_lines_fn,
260            };
261
262            // Get QA results for this prediction if available
263            let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
264            let qa_reverts_str = qa_result
265                .and_then(|q| q.reverts_edits)
266                .map(|v| if v { "yes" } else { "no" })
267                .unwrap_or("-");
268            let qa_conf_str = qa_result
269                .and_then(|q| q.confidence)
270                .map(|v| format!("{}", v))
271                .unwrap_or("-".to_string());
272
273            // Format wrong editable region metric
274            let wrong_er_str = match score.wrong_editable_region {
275                Some(true) => "",
276                Some(false) => "",
277                None => "",
278            };
279
280            // Format cursor metric
281            let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
282                (Some(true), _) => "".to_string(),
283                (Some(false), Some(dist)) => format!("±{}", dist),
284                (Some(false), None) => "".to_string(),
285                (None, _) => "-".to_string(),
286            };
287
288            println!(
289                "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
290                truncate_name(&example.spec.name, 40),
291                score.delta_chr_f,
292                score.braces_disbalance,
293                exact_lines.f1() * 100.0,
294                score.reversal_ratio * 100.0,
295                qa_reverts_str,
296                qa_conf_str,
297                cursor_str,
298                wrong_er_str
299            );
300
301            all_delta_chr_f_scores.push(score.delta_chr_f);
302            all_reversal_ratios.push(score.reversal_ratio);
303            total_scores += 1;
304            braces_disbalance_sum += score.braces_disbalance;
305            total_exact_lines.true_positives += score.exact_lines_tp;
306            total_exact_lines.false_positives += score.exact_lines_fp;
307            total_exact_lines.false_negatives += score.exact_lines_fn;
308
309            // Accumulate QA metrics
310            if let Some(qa) = qa_result {
311                if let Some(reverts) = qa.reverts_edits {
312                    qa_reverts_total += 1;
313                    if reverts {
314                        qa_reverts_count += 1;
315                    }
316                }
317                if let Some(conf) = qa.confidence {
318                    qa_confidence_sum += conf as u64;
319                    qa_confidence_count += 1;
320                }
321            }
322
323            // Accumulate wrong editable region metrics
324            if let Some(wrong) = score.wrong_editable_region {
325                wrong_editable_region_total += 1;
326                if wrong {
327                    wrong_editable_region_count += 1;
328                }
329            }
330
331            // Accumulate isolated whitespace metrics
332            if score.has_isolated_whitespace_changes {
333                isolated_whitespace_count += 1;
334            }
335
336            // Accumulate token change metrics (only for predictions that produced a patch)
337            let has_patch = example
338                .predictions
339                .get(score_idx)
340                .and_then(|p| p.actual_patch.as_ref())
341                .is_some_and(|p| !p.is_empty());
342            if has_patch {
343                predictions_with_patch += 1;
344                patch_inserted_tokens.push(score.inserted_tokens);
345                patch_deleted_tokens.push(score.deleted_tokens);
346            }
347
348            // Accumulate cursor metrics
349            if let Some(exact_match) = score.cursor_exact_match {
350                cursor_total += 1;
351                if exact_match {
352                    cursor_exact_matches += 1;
353                }
354            }
355            if let Some(dist) = score.cursor_distance {
356                cursor_distance_sum += dist;
357                cursor_distance_count += 1;
358            }
359        }
360    }
361
362    println!("{}", separator);
363
364    if !all_delta_chr_f_scores.is_empty() {
365        let avg_delta_chr_f: f32 =
366            all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
367        let avg_reversal_ratio: f32 =
368            all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
369        let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
370
371        let qa_reverts_str = if qa_reverts_total > 0 {
372            format!(
373                "{:.1}%",
374                qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
375            )
376        } else {
377            "-".to_string()
378        };
379        let qa_conf_str = if qa_confidence_count > 0 {
380            format!(
381                "{:.1}",
382                qa_confidence_sum as f32 / qa_confidence_count as f32
383            )
384        } else {
385            "-".to_string()
386        };
387        let cursor_str = if cursor_total > 0 {
388            format!(
389                "{:.0}%",
390                cursor_exact_matches as f32 / cursor_total as f32 * 100.0
391            )
392        } else {
393            "-".to_string()
394        };
395        let wrong_er_str = if wrong_editable_region_total > 0 {
396            format!(
397                "{:.2}%",
398                wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
399            )
400        } else {
401            "-".to_string()
402        };
403        let isolated_ws_str = if total_scores > 0 {
404            format!(
405                "{}/{} ({:.1}%)",
406                isolated_whitespace_count,
407                total_scores,
408                isolated_whitespace_count as f32 / total_scores as f32 * 100.0
409            )
410        } else {
411            "-".to_string()
412        };
413        let avg_cursor_distance = if cursor_distance_count > 0 {
414            Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
415        } else {
416            None
417        };
418
419        println!(
420            "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
421            "TOTAL / AVERAGE",
422            avg_delta_chr_f,
423            braces_disbalance_avg,
424            total_exact_lines.f1() * 100.0,
425            avg_reversal_ratio * 100.0,
426            qa_reverts_str,
427            qa_conf_str,
428            cursor_str,
429            wrong_er_str
430        );
431        println!("{}", separator);
432
433        // Print additional cursor metrics if available
434        if let Some(avg_dist) = avg_cursor_distance {
435            println!(
436                "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
437                cursor_exact_matches,
438                cursor_total,
439                cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
440                avg_dist
441            );
442        }
443
444        // Print isolated whitespace metrics
445        if total_scores > 0 {
446            println!("Isolated whitespace changes: {}", isolated_ws_str);
447        }
448
449        // Print token change percentile summary (only for predictions with a patch)
450        if !patch_inserted_tokens.is_empty() {
451            patch_inserted_tokens.sort_unstable();
452            patch_deleted_tokens.sort_unstable();
453            let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
454                .iter()
455                .zip(patch_deleted_tokens.iter())
456                .map(|(i, d)| i + d)
457                .collect();
458            patch_total_tokens.sort_unstable();
459
460            let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
461            println!();
462            println!(
463                "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
464                predictions_with_patch, total_scores, patch_rate
465            );
466            println!(
467                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
468                "", "p25", "p50", "p75", "p90", "p99"
469            );
470            println!("{}", "".repeat(LINE_WIDTH));
471            println!(
472                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
473                "Inserted tokens",
474                percentile(&patch_inserted_tokens, 25),
475                percentile(&patch_inserted_tokens, 50),
476                percentile(&patch_inserted_tokens, 75),
477                percentile(&patch_inserted_tokens, 90),
478                percentile(&patch_inserted_tokens, 99),
479            );
480            println!(
481                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
482                "Deleted tokens",
483                percentile(&patch_deleted_tokens, 25),
484                percentile(&patch_deleted_tokens, 50),
485                percentile(&patch_deleted_tokens, 75),
486                percentile(&patch_deleted_tokens, 90),
487                percentile(&patch_deleted_tokens, 99),
488            );
489            println!(
490                "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
491                "Total tokens",
492                percentile(&patch_total_tokens, 25),
493                percentile(&patch_total_tokens, 50),
494                percentile(&patch_total_tokens, 75),
495                percentile(&patch_total_tokens, 90),
496                percentile(&patch_total_tokens, 99),
497            );
498        }
499    }
500
501    println!("\n");
502}
503
/// Returns the `p`-th percentile of `sorted_values` using the nearest-rank rule.
///
/// `sorted_values` must already be in ascending order. The rank is `p / 100` of the last index,
/// rounded to the nearest integer and clamped to the slice, so `p > 100` yields the last
/// element. An empty slice yields `0`.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0;
    };
    let fraction = p as f64 / 100.0;
    let rank = (fraction * last_index as f64).round() as usize;
    sorted_values[rank.min(last_index)]
}
511
/// Shortens `name` to at most `max_len` characters for display in a fixed-width column.
///
/// Names that already fit are returned unchanged. Longer names keep their first `max_len - 3`
/// characters followed by `"..."`. When `max_len` is smaller than the ellipsis itself, the name
/// is simply cut to `max_len` characters.
///
/// Lengths are measured in `char`s rather than bytes: this matches how `{:<N}` pads and avoids
/// slicing in the middle of a multi-byte UTF-8 sequence.
fn truncate_name(name: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if name.chars().count() <= max_len {
        return name.to_string();
    }

    match max_len.checked_sub(ELLIPSIS.len()) {
        Some(kept) => {
            let mut truncated: String = name.chars().take(kept).collect();
            truncated.push_str(ELLIPSIS);
            truncated
        }
        // No room for an ellipsis at all.
        None => name.chars().take(max_len).collect(),
    }
}
519
520#[derive(Serialize)]
521pub struct SummaryJson {
522    pub total_examples: usize,
523    pub avg_delta_chr_f: f32,
524    pub avg_braces_disbalance: f32,
525    pub exact_lines_true_positives: usize,
526    pub exact_lines_false_positives: usize,
527    pub exact_lines_false_negatives: usize,
528    pub exact_lines_precision: f64,
529    pub exact_lines_recall: f64,
530    pub exact_lines_f1: f64,
531    pub avg_reversal_ratio: f32,
532    #[serde(skip_serializing_if = "Option::is_none")]
533    pub qa_avg_reverts_edits: Option<f32>,
534    #[serde(skip_serializing_if = "Option::is_none")]
535    pub qa_avg_confidence: Option<f32>,
536    #[serde(skip_serializing_if = "Option::is_none")]
537    pub cursor_exact_match_rate: Option<f32>,
538    #[serde(skip_serializing_if = "Option::is_none")]
539    pub cursor_avg_distance: Option<f32>,
540    #[serde(skip_serializing_if = "Option::is_none")]
541    pub cursor_total_evaluated: Option<usize>,
542    #[serde(skip_serializing_if = "Option::is_none")]
543    pub wrong_editable_region_rate: Option<f32>,
544    pub isolated_whitespace_rate: Option<f32>,
545}
546
547pub fn compute_summary(examples: &[Example]) -> SummaryJson {
548    use crate::metrics::ClassificationMetrics;
549
550    let mut all_delta_chr_f_scores = Vec::new();
551    let mut all_reversal_ratios = Vec::new();
552    let mut braces_disbalance_sum: usize = 0;
553    let mut total_exact_lines = ClassificationMetrics::default();
554    let mut total_scores: usize = 0;
555    let mut qa_reverts_count: usize = 0;
556    let mut qa_reverts_total: usize = 0;
557    let mut qa_confidence_sum: u64 = 0;
558    let mut qa_confidence_count: usize = 0;
559    let mut cursor_exact_matches: usize = 0;
560    let mut cursor_total: usize = 0;
561    let mut cursor_distance_sum: usize = 0;
562    let mut cursor_distance_count: usize = 0;
563    let mut wrong_editable_region_count: usize = 0;
564    let mut wrong_editable_region_total: usize = 0;
565    let mut isolated_whitespace_count: usize = 0;
566
567    for example in examples {
568        for (score_idx, score) in example.score.iter().enumerate() {
569            all_delta_chr_f_scores.push(score.delta_chr_f);
570            all_reversal_ratios.push(score.reversal_ratio);
571            total_scores += 1;
572            braces_disbalance_sum += score.braces_disbalance;
573            total_exact_lines.true_positives += score.exact_lines_tp;
574            total_exact_lines.false_positives += score.exact_lines_fp;
575            total_exact_lines.false_negatives += score.exact_lines_fn;
576
577            // Accumulate QA metrics
578            if let Some(Some(qa)) = example.qa.get(score_idx) {
579                if let Some(reverts) = qa.reverts_edits {
580                    qa_reverts_total += 1;
581                    if reverts {
582                        qa_reverts_count += 1;
583                    }
584                }
585                if let Some(conf) = qa.confidence {
586                    qa_confidence_sum += conf as u64;
587                    qa_confidence_count += 1;
588                }
589            }
590
591            // Accumulate wrong editable region metrics
592            if let Some(wrong) = score.wrong_editable_region {
593                wrong_editable_region_total += 1;
594                if wrong {
595                    wrong_editable_region_count += 1;
596                }
597            }
598
599            // Accumulate isolated whitespace metrics
600            if score.has_isolated_whitespace_changes {
601                isolated_whitespace_count += 1;
602            }
603
604            // Accumulate cursor metrics
605            if let Some(exact_match) = score.cursor_exact_match {
606                cursor_total += 1;
607                if exact_match {
608                    cursor_exact_matches += 1;
609                }
610            }
611            if let Some(dist) = score.cursor_distance {
612                cursor_distance_sum += dist;
613                cursor_distance_count += 1;
614            }
615        }
616    }
617
618    let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
619        0.0
620    } else {
621        all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
622    };
623
624    let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
625        0.0
626    } else {
627        all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
628    };
629
630    let avg_braces_disbalance = if total_scores == 0 {
631        0.0
632    } else {
633        braces_disbalance_sum as f32 / total_scores as f32
634    };
635
636    let qa_avg_reverts_edits = if qa_reverts_total > 0 {
637        Some(qa_reverts_count as f32 / qa_reverts_total as f32)
638    } else {
639        None
640    };
641
642    let qa_avg_confidence = if qa_confidence_count > 0 {
643        Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
644    } else {
645        None
646    };
647
648    let cursor_exact_match_rate = if cursor_total > 0 {
649        Some(cursor_exact_matches as f32 / cursor_total as f32)
650    } else {
651        None
652    };
653
654    let cursor_avg_distance = if cursor_distance_count > 0 {
655        Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
656    } else {
657        None
658    };
659
660    let cursor_total_evaluated = if cursor_total > 0 {
661        Some(cursor_total)
662    } else {
663        None
664    };
665
666    let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
667        Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
668    } else {
669        None
670    };
671
672    let isolated_whitespace_rate = if total_scores > 0 {
673        Some(isolated_whitespace_count as f32 / total_scores as f32)
674    } else {
675        None
676    };
677
678    SummaryJson {
679        total_examples: total_scores,
680        avg_delta_chr_f,
681        avg_braces_disbalance,
682        exact_lines_true_positives: total_exact_lines.true_positives,
683        exact_lines_false_positives: total_exact_lines.false_positives,
684        exact_lines_false_negatives: total_exact_lines.false_negatives,
685        exact_lines_precision: total_exact_lines.precision(),
686        exact_lines_recall: total_exact_lines.recall(),
687        exact_lines_f1: total_exact_lines.f1(),
688        avg_reversal_ratio,
689        qa_avg_reverts_edits,
690        qa_avg_confidence,
691        cursor_exact_match_rate,
692        cursor_avg_distance,
693        cursor_total_evaluated,
694        wrong_editable_region_rate,
695        isolated_whitespace_rate,
696    }
697}
698
699pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
700    let summary = compute_summary(examples);
701    let file = File::create(path)
702        .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
703    let writer = BufWriter::new(file);
704    serde_json::to_writer_pretty(writer, &summary)
705        .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
706    eprintln!("Wrote summary JSON to: {}", path.display());
707    Ok(())
708}