1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
14use gpui::AsyncApp;
15use serde::Serialize;
16use std::fs::File;
17use std::io::BufWriter;
18use std::path::Path;
19use std::sync::Arc;
20
/// Runs prediction for `example` and then scores every prediction it holds.
///
/// One [`ExampleScore`] is produced per entry of `example.predictions`, in the same
/// order, and the resulting list replaces `example.score`.
///
/// Per prediction, the outcome is one of:
/// - no patch available (neither a stored `actual_patch` nor a parseable
///   `actual_output`): an all-zero score is recorded;
/// - the patch does not apply to the cursor excerpt: a zero score carrying only the
///   exact-line, token-match and token-change counts is recorded;
/// - otherwise: the full set of metrics is computed.
///
/// # Errors
///
/// Returns an error if `run_prediction` fails, if `example.prompt_inputs` is `None`,
/// if any expected patch does not apply to the cursor excerpt, or if the editable
/// region cannot be extracted from a Teacher prompt.
pub async fn run_scoring(
    example: &mut Example,
    args: &PredictArgs,
    app_state: Arc<EpAppState>,
    example_progress: &ExampleProgress,
    cx: AsyncApp,
) -> anyhow::Result<()> {
    run_prediction(example, args, app_state, example_progress, cx).await?;

    let progress = example_progress.start(Step::Score);

    progress.set_substatus("applying patches");
    let prompt_inputs = example
        .prompt_inputs
        .as_ref()
        .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
    // All patches (expected and actual) are applied against this text.
    let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
    let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();

    // Resulting text for each expected patch, index-aligned with
    // `expected_patches_with_cursors`. Any expected patch that fails to apply aborts scoring.
    let expected_texts: Vec<String> = expected_patches_with_cursors
        .iter()
        .map(|(patch, _)| {
            apply_diff_to_string(patch, original_text)
                .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
        })
        .collect::<Result<Vec<_>, _>>()?;

    // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
    // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
    // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
    // region to find where the hunk matched, then compute the expected cursor position.
    // The user cursor marker is stripped so the region contains only file text.
    let old_editable_region = if let Some(p) = example.prompt.as_ref() {
        if matches!(
            p.provider,
            PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
        ) {
            Some(
                TeacherPrompt::extract_editable_region(&p.input)?
                    .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
            )
        } else {
            None
        }
    } else {
        None
    };

    // Baseline score used for predictions without a patch, and as the starting point for
    // predictions whose patch does not apply.
    let zero_scores = ExampleScore {
        delta_chr_f: 0.0,
        braces_disbalance: 0,
        exact_lines_tp: 0,
        exact_lines_fp: 0,
        exact_lines_fn: 0,
        token_match_tp: 0,
        token_match_fp: 0,
        token_match_fn: 0,
        token_match_precision: 0.0,
        token_match_recall: 0.0,
        reversal_ratio: 0.0,
        cursor_distance: None,
        cursor_exact_match: None,
        wrong_editable_region: None,
        has_isolated_whitespace_changes: false,
        inserted_tokens: 0,
        deleted_tokens: 0,
        cumulative_logprob: None,
        avg_logprob: None,
    };

    let cursor_path = example.spec.cursor_path.as_ref();

    progress.set_substatus("computing metrics");
    let mut scores = vec![];
    for prediction in &example.predictions {
        // Prefer the stored patch; otherwise try to parse one out of the raw model output.
        // Parse errors are deliberately discarded and treated as "no patch".
        let actual_patch = prediction.actual_patch.clone().or_else(|| {
            parse_prediction_output(example, &prediction.actual_output, prediction.provider)
                .ok()
                .map(|(patch, _)| patch)
        });

        let Some(actual_patch) = actual_patch else {
            scores.push(zero_scores.clone());
            continue;
        };

        let token_changes = metrics::count_patch_token_changes(&actual_patch);

        // Patch-level comparisons: keep the best result across all expected patches.
        // Exact-line matching is ranked by true positives only.
        let best_exact_lines = expected_patches_with_cursors
            .iter()
            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
            .max_by_key(|m| m.true_positives)
            .unwrap_or_default();

        let best_token_match = expected_patches_with_cursors
            .iter()
            .map(|(expected_patch, _)| metrics::token_match(expected_patch, &actual_patch))
            .max_by(metrics::compare_classification_metrics)
            .unwrap_or_default();

        let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
            Ok(text) => text,
            Err(_) => {
                // The predicted patch does not apply, so text-level metrics cannot be computed.
                // Record only the metrics that are derived from the patches themselves.
                let mut s = zero_scores.clone();
                s.exact_lines_tp = best_exact_lines.true_positives;
                s.exact_lines_fp = best_exact_lines.false_positives;
                s.exact_lines_fn = best_exact_lines.false_negatives;
                s.token_match_tp = best_token_match.true_positives;
                s.token_match_fp = best_token_match.false_positives;
                s.token_match_fn = best_token_match.false_negatives;
                s.token_match_precision = best_token_match.precision();
                s.token_match_recall = best_token_match.recall();
                s.inserted_tokens = token_changes.inserted_tokens;
                s.deleted_tokens = token_changes.deleted_tokens;
                scores.push(s);
                continue;
            }
        };

        let mut best_delta_chr_f = 0.0f32;
        let mut best_expected_cursor: Option<usize> = None;
        let mut best_patch_idx: Option<usize> = None;

        // Pick the expected text with the highest delta chrF. The comparison is strict and
        // starts at 0.0, so if no expected text scores above zero, no patch is selected and
        // the expected cursor stays `None`.
        for (idx, expected) in expected_texts.iter().enumerate() {
            let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
            if delta_chr_f > best_delta_chr_f {
                best_delta_chr_f = delta_chr_f;
                best_patch_idx = Some(idx);
            }
        }

        if let Some(idx) = best_patch_idx {
            // Get the raw cursor offset from the expected patch (relative to hunk new text)
            let expected_cursor_in_patch = expected_patches_with_cursors
                .get(idx)
                .and_then(|(_, cursor)| *cursor);

            // For Teacher prompts, we need to apply the patch to the editable region
            // to find where the hunk matched, then compute the actual cursor position
            if let (Some(editable_region), Some(cursor_in_patch)) =
                (&old_editable_region, expected_cursor_in_patch)
            {
                let (patch, _) = &expected_patches_with_cursors[idx];
                // NOTE(review): if the patch does not apply to the editable region, the
                // expected cursor is left as `None` rather than falling back to the raw
                // offset — confirm this is intended.
                if let Ok((_, hunk_offset)) =
                    apply_diff_to_string_with_hunk_offset(patch, editable_region)
                {
                    let hunk_start = hunk_offset.unwrap_or(0);
                    best_expected_cursor = Some(hunk_start + cursor_in_patch);
                }
            } else {
                // No editable region (non-Teacher prompt) or the expected patch carries no
                // cursor: use the raw offset as-is (which is `None` in the latter case).
                best_expected_cursor = expected_cursor_in_patch;
            }
        }

        // Only an increase in brace disbalance relative to the original text is counted;
        // a decrease saturates to zero.
        let disbalance_before = metrics::braces_disbalance(&original_text);
        let disbalance_after = metrics::braces_disbalance(&actual_text);
        let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);

        // Compute reversal ratio
        let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
            prompt_inputs,
            &actual_text,
            cursor_path,
        );

        // Compute cursor position metrics
        let (cursor_distance, cursor_exact_match) =
            compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());

        // Compute approximation of editable region correctness
        let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));

        // Check for isolated whitespace changes.
        let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
            &actual_patch,
            prediction.actual_cursor.as_ref(),
        );

        scores.push(ExampleScore {
            delta_chr_f: best_delta_chr_f,
            braces_disbalance,
            exact_lines_tp: best_exact_lines.true_positives,
            exact_lines_fp: best_exact_lines.false_positives,
            exact_lines_fn: best_exact_lines.false_negatives,
            token_match_tp: best_token_match.true_positives,
            token_match_fp: best_token_match.false_positives,
            token_match_fn: best_token_match.false_negatives,
            token_match_precision: best_token_match.precision(),
            token_match_recall: best_token_match.recall(),
            reversal_ratio,
            cursor_distance,
            cursor_exact_match,
            wrong_editable_region,
            has_isolated_whitespace_changes,
            inserted_tokens: token_changes.inserted_tokens,
            deleted_tokens: token_changes.deleted_tokens,
            cumulative_logprob: prediction.cumulative_logprob,
            avg_logprob: prediction.avg_logprob,
        });
    }

    example.score = scores;
    Ok(())
}
225
226fn compute_cursor_metrics(
227 expected_cursor_editable_region_offset: Option<usize>,
228 actual_cursor: Option<&ActualCursor>,
229) -> (Option<usize>, Option<bool>) {
230 match (expected_cursor_editable_region_offset, actual_cursor) {
231 (Some(expected), Some(actual)) => {
232 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
233 let exact_match = distance == 0;
234 (Some(distance), Some(exact_match))
235 }
236 (None, None) => {
237 // Neither has cursor position - skip cursor scoring
238 (None, None)
239 }
240 (Some(_), None) | (None, Some(_)) => {
241 // Only one has cursor position - count as miss
242 (None, Some(false))
243 }
244 }
245}
246
247pub fn print_report(examples: &[Example], verbose: bool) {
248 const MAX_EXAMPLES_DEFAULT: usize = 20;
249 use crate::metrics::ClassificationMetrics;
250
251 const LINE_WIDTH: usize = 101;
252 let separator = "─".repeat(LINE_WIDTH);
253
254 println!("{}", separator);
255 println!(
256 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
257 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
258 );
259 println!("{}", separator);
260
261 let mut all_delta_chr_f_scores = Vec::new();
262 let mut all_reversal_ratios = Vec::new();
263 let mut braces_disbalance_sum: usize = 0;
264 let mut total_exact_lines = ClassificationMetrics::default();
265 let mut total_scores: usize = 0;
266 let mut qa_reverts_count: usize = 0;
267 let mut qa_reverts_total: usize = 0;
268 let mut qa_confidence_sum: u64 = 0;
269 let mut qa_confidence_count: usize = 0;
270 let mut cursor_exact_matches: usize = 0;
271 let mut cursor_total: usize = 0;
272 let mut cursor_distance_sum: usize = 0;
273 let mut cursor_distance_count: usize = 0;
274 let mut wrong_editable_region_count: usize = 0;
275 let mut wrong_editable_region_total: usize = 0;
276 let mut isolated_whitespace_count: usize = 0;
277 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
278 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
279 let mut total_token_match = ClassificationMetrics::default();
280 let mut predictions_with_patch: usize = 0;
281
282 let mut printed_lines: usize = 0;
283 let mut skipped_lines: usize = 0;
284
285 for example in examples {
286 for (score_idx, score) in example.score.iter().enumerate() {
287 let exact_lines = ClassificationMetrics {
288 true_positives: score.exact_lines_tp,
289 false_positives: score.exact_lines_fp,
290 false_negatives: score.exact_lines_fn,
291 };
292
293 // Get QA results for this prediction if available
294 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
295 let qa_reverts_str = qa_result
296 .and_then(|q| q.reverts_edits)
297 .map(|v| if v { "yes" } else { "no" })
298 .unwrap_or("-");
299 let qa_conf_str = qa_result
300 .and_then(|q| q.confidence)
301 .map(|v| format!("{}", v))
302 .unwrap_or("-".to_string());
303
304 // Format wrong editable region metric
305 let wrong_er_str = match score.wrong_editable_region {
306 Some(true) => "✗",
307 Some(false) => "",
308 None => "",
309 };
310
311 // Format cursor metric
312 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
313 (Some(true), _) => "✓".to_string(),
314 (Some(false), Some(dist)) => format!("±{}", dist),
315 (Some(false), None) => "✗".to_string(),
316 (None, _) => "-".to_string(),
317 };
318
319 if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
320 println!(
321 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
322 truncate_name(&example.spec.name, 40),
323 score.delta_chr_f,
324 score.braces_disbalance,
325 exact_lines.f1() * 100.0,
326 score.reversal_ratio * 100.0,
327 qa_reverts_str,
328 qa_conf_str,
329 cursor_str,
330 wrong_er_str
331 );
332 printed_lines += 1;
333 } else {
334 skipped_lines += 1;
335 }
336
337 all_delta_chr_f_scores.push(score.delta_chr_f);
338 all_reversal_ratios.push(score.reversal_ratio);
339 total_scores += 1;
340 braces_disbalance_sum += score.braces_disbalance;
341 total_exact_lines.true_positives += score.exact_lines_tp;
342 total_exact_lines.false_positives += score.exact_lines_fp;
343 total_exact_lines.false_negatives += score.exact_lines_fn;
344 total_token_match.true_positives += score.token_match_tp;
345 total_token_match.false_positives += score.token_match_fp;
346 total_token_match.false_negatives += score.token_match_fn;
347
348 // Accumulate QA metrics
349 if let Some(qa) = qa_result {
350 if let Some(reverts) = qa.reverts_edits {
351 qa_reverts_total += 1;
352 if reverts {
353 qa_reverts_count += 1;
354 }
355 }
356 if let Some(conf) = qa.confidence {
357 qa_confidence_sum += conf as u64;
358 qa_confidence_count += 1;
359 }
360 }
361
362 // Accumulate wrong editable region metrics
363 if let Some(wrong) = score.wrong_editable_region {
364 wrong_editable_region_total += 1;
365 if wrong {
366 wrong_editable_region_count += 1;
367 }
368 }
369
370 // Accumulate isolated whitespace metrics
371 if score.has_isolated_whitespace_changes {
372 isolated_whitespace_count += 1;
373 }
374
375 // Accumulate token change metrics (only for predictions that produced a patch)
376 let has_patch = example
377 .predictions
378 .get(score_idx)
379 .and_then(|p| p.actual_patch.as_ref())
380 .is_some_and(|p| !p.is_empty());
381 if has_patch {
382 predictions_with_patch += 1;
383 patch_inserted_tokens.push(score.inserted_tokens);
384 patch_deleted_tokens.push(score.deleted_tokens);
385 }
386
387 // Accumulate cursor metrics
388 if let Some(exact_match) = score.cursor_exact_match {
389 cursor_total += 1;
390 if exact_match {
391 cursor_exact_matches += 1;
392 }
393 }
394 if let Some(dist) = score.cursor_distance {
395 cursor_distance_sum += dist;
396 cursor_distance_count += 1;
397 }
398 }
399 }
400
401 if skipped_lines > 0 {
402 println!(
403 "{:<40} (use --verbose to see all {} examples)",
404 format!("... and {} more", skipped_lines),
405 printed_lines + skipped_lines
406 );
407 }
408 println!("{}", separator);
409
410 if !all_delta_chr_f_scores.is_empty() {
411 let avg_delta_chr_f: f32 =
412 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
413 let avg_reversal_ratio: f32 =
414 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
415 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
416
417 let qa_reverts_str = if qa_reverts_total > 0 {
418 format!(
419 "{:.1}%",
420 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
421 )
422 } else {
423 "-".to_string()
424 };
425 let qa_conf_str = if qa_confidence_count > 0 {
426 format!(
427 "{:.1}",
428 qa_confidence_sum as f32 / qa_confidence_count as f32
429 )
430 } else {
431 "-".to_string()
432 };
433 let cursor_str = if cursor_total > 0 {
434 format!(
435 "{:.0}%",
436 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
437 )
438 } else {
439 "-".to_string()
440 };
441 let wrong_er_str = if wrong_editable_region_total > 0 {
442 format!(
443 "{:.2}%",
444 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
445 )
446 } else {
447 "-".to_string()
448 };
449 let isolated_ws_str = if total_scores > 0 {
450 format!(
451 "{}/{} ({:.1}%)",
452 isolated_whitespace_count,
453 total_scores,
454 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
455 )
456 } else {
457 "-".to_string()
458 };
459 let avg_cursor_distance = if cursor_distance_count > 0 {
460 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
461 } else {
462 None
463 };
464
465 println!(
466 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
467 "TOTAL / AVERAGE",
468 avg_delta_chr_f,
469 braces_disbalance_avg,
470 total_exact_lines.f1() * 100.0,
471 avg_reversal_ratio * 100.0,
472 qa_reverts_str,
473 qa_conf_str,
474 cursor_str,
475 wrong_er_str
476 );
477 println!("{}", separator);
478
479 // Print additional cursor metrics if available
480 if let Some(avg_dist) = avg_cursor_distance {
481 println!(
482 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
483 cursor_exact_matches,
484 cursor_total,
485 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
486 avg_dist
487 );
488 }
489
490 // Print isolated whitespace metrics
491 if total_scores > 0 {
492 println!("Isolated whitespace changes: {}", isolated_ws_str);
493 }
494
495 println!(
496 "Token match: P={:.1}% R={:.1}% F1={:.1}% (TP={}, FP={}, FN={})",
497 total_token_match.precision() * 100.0,
498 total_token_match.recall() * 100.0,
499 total_token_match.f1() * 100.0,
500 total_token_match.true_positives,
501 total_token_match.false_positives,
502 total_token_match.false_negatives,
503 );
504
505 // Print token change percentile summary (only for predictions with a patch)
506 if !patch_inserted_tokens.is_empty() {
507 patch_inserted_tokens.sort_unstable();
508 patch_deleted_tokens.sort_unstable();
509 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
510 .iter()
511 .zip(patch_deleted_tokens.iter())
512 .map(|(i, d)| i + d)
513 .collect();
514 patch_total_tokens.sort_unstable();
515
516 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
517 println!();
518 println!(
519 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
520 predictions_with_patch, total_scores, patch_rate
521 );
522 println!(
523 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
524 "", "p25", "p50", "p75", "p90", "p99"
525 );
526 println!("{}", "─".repeat(LINE_WIDTH));
527 println!(
528 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
529 "Inserted tokens",
530 percentile(&patch_inserted_tokens, 25),
531 percentile(&patch_inserted_tokens, 50),
532 percentile(&patch_inserted_tokens, 75),
533 percentile(&patch_inserted_tokens, 90),
534 percentile(&patch_inserted_tokens, 99),
535 );
536 println!(
537 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
538 "Deleted tokens",
539 percentile(&patch_deleted_tokens, 25),
540 percentile(&patch_deleted_tokens, 50),
541 percentile(&patch_deleted_tokens, 75),
542 percentile(&patch_deleted_tokens, 90),
543 percentile(&patch_deleted_tokens, 99),
544 );
545 println!(
546 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
547 "Total tokens",
548 percentile(&patch_total_tokens, 25),
549 percentile(&patch_total_tokens, 50),
550 percentile(&patch_total_tokens, 75),
551 percentile(&patch_total_tokens, 90),
552 percentile(&patch_total_tokens, 99),
553 );
554 }
555 }
556
557 println!("\n");
558}
559
/// Returns the `p`-th percentile of `sorted_values` using the nearest-rank index
/// `round(p / 100 * (len - 1))`, clamped to the last element.
///
/// `sorted_values` must already be sorted ascending. Returns 0 for an empty slice.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0;
    };

    let fraction = p as f64 / 100.0;
    let rank = (fraction * last_index as f64).round() as usize;
    // Values of `p` above 100 resolve to the largest element.
    sorted_values[rank.min(last_index)]
}
567
/// Shortens `name` to at most `max_len` characters, ending it with `...` when truncated.
///
/// Length is measured in characters rather than bytes, so names containing multi-byte
/// UTF-8 characters are cut on a character boundary instead of panicking, and the result
/// lines up with the character-based padding of `format!` width specifiers.
///
/// When `max_len` is smaller than the ellipsis itself, the name is cut to `max_len`
/// characters without an ellipsis.
fn truncate_name(name: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if name.chars().count() <= max_len {
        return name.to_string();
    }

    let Some(keep) = max_len.checked_sub(ELLIPSIS.len()) else {
        // No room for the ellipsis: hard cut.
        return name.chars().take(max_len).collect();
    };

    let kept: String = name.chars().take(keep).collect();
    format!("{}{}", kept, ELLIPSIS)
}
575
/// Aggregate scoring summary serialized to JSON.
///
/// All `*_rate` and `qa_avg_reverts_edits` values are fractions (count / total), not
/// percentages. Optional fields are `None` when nothing was counted for that metric.
#[derive(Serialize)]
pub struct SummaryJson {
    /// Number of scores aggregated (one per scored prediction across all examples).
    pub total_examples: usize,
    /// Mean `delta_chr_f` over all scores; 0.0 when there are none.
    pub avg_delta_chr_f: f32,
    /// Mean `braces_disbalance` over all scores; 0.0 when there are none.
    pub avg_braces_disbalance: f32,
    /// Sum of exact-line true positives over all scores.
    pub exact_lines_true_positives: usize,
    /// Sum of exact-line false positives over all scores.
    pub exact_lines_false_positives: usize,
    /// Sum of exact-line false negatives over all scores.
    pub exact_lines_false_negatives: usize,
    /// Precision computed from the summed exact-line counts.
    pub exact_lines_precision: f64,
    /// Recall computed from the summed exact-line counts.
    pub exact_lines_recall: f64,
    /// F1 computed from the summed exact-line counts.
    pub exact_lines_f1: f64,
    /// Sum of token-match true positives over all scores.
    pub token_match_tp: usize,
    /// Sum of token-match false positives over all scores.
    pub token_match_fp: usize,
    /// Sum of token-match false negatives over all scores.
    pub token_match_fn: usize,
    /// Precision computed from the summed token-match counts.
    pub token_match_precision: f64,
    /// Recall computed from the summed token-match counts.
    pub token_match_recall: f64,
    /// F1 computed from the summed token-match counts.
    pub token_match_f1: f64,
    /// Mean `reversal_ratio` over all scores; 0.0 when there are none.
    pub avg_reversal_ratio: f32,
    /// Fraction of QA results (with a `reverts_edits` verdict) that flagged a revert.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_reverts_edits: Option<f32>,
    /// Mean QA confidence over QA results that reported one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_confidence: Option<f32>,
    /// Fraction of cursor-evaluated scores whose cursor matched exactly.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_exact_match_rate: Option<f32>,
    /// Mean cursor distance over scores that recorded a distance.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_avg_distance: Option<f32>,
    /// Number of scores with a cursor exact-match verdict.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_total_evaluated: Option<usize>,
    /// Fraction of evaluated scores flagged as editing the wrong editable region.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wrong_editable_region_rate: Option<f32>,
    /// Fraction of all scores that have isolated whitespace changes.
    // NOTE(review): unlike the other optional fields this one has no
    // `skip_serializing_if`, so `None` is emitted as `null` — confirm this is intended.
    pub isolated_whitespace_rate: Option<f32>,
}
608
609pub fn compute_summary(examples: &[Example]) -> SummaryJson {
610 use crate::metrics::ClassificationMetrics;
611
612 let mut all_delta_chr_f_scores = Vec::new();
613 let mut all_reversal_ratios = Vec::new();
614 let mut braces_disbalance_sum: usize = 0;
615 let mut total_exact_lines = ClassificationMetrics::default();
616 let mut total_token_match = ClassificationMetrics::default();
617 let mut total_scores: usize = 0;
618 let mut qa_reverts_count: usize = 0;
619 let mut qa_reverts_total: usize = 0;
620 let mut qa_confidence_sum: u64 = 0;
621 let mut qa_confidence_count: usize = 0;
622 let mut cursor_exact_matches: usize = 0;
623 let mut cursor_total: usize = 0;
624 let mut cursor_distance_sum: usize = 0;
625 let mut cursor_distance_count: usize = 0;
626 let mut wrong_editable_region_count: usize = 0;
627 let mut wrong_editable_region_total: usize = 0;
628 let mut isolated_whitespace_count: usize = 0;
629
630 for example in examples {
631 for (score_idx, score) in example.score.iter().enumerate() {
632 all_delta_chr_f_scores.push(score.delta_chr_f);
633 all_reversal_ratios.push(score.reversal_ratio);
634 total_scores += 1;
635 braces_disbalance_sum += score.braces_disbalance;
636 total_exact_lines.true_positives += score.exact_lines_tp;
637 total_exact_lines.false_positives += score.exact_lines_fp;
638 total_exact_lines.false_negatives += score.exact_lines_fn;
639 total_token_match.true_positives += score.token_match_tp;
640 total_token_match.false_positives += score.token_match_fp;
641 total_token_match.false_negatives += score.token_match_fn;
642
643 // Accumulate QA metrics
644 if let Some(Some(qa)) = example.qa.get(score_idx) {
645 if let Some(reverts) = qa.reverts_edits {
646 qa_reverts_total += 1;
647 if reverts {
648 qa_reverts_count += 1;
649 }
650 }
651 if let Some(conf) = qa.confidence {
652 qa_confidence_sum += conf as u64;
653 qa_confidence_count += 1;
654 }
655 }
656
657 // Accumulate wrong editable region metrics
658 if let Some(wrong) = score.wrong_editable_region {
659 wrong_editable_region_total += 1;
660 if wrong {
661 wrong_editable_region_count += 1;
662 }
663 }
664
665 // Accumulate isolated whitespace metrics
666 if score.has_isolated_whitespace_changes {
667 isolated_whitespace_count += 1;
668 }
669
670 // Accumulate cursor metrics
671 if let Some(exact_match) = score.cursor_exact_match {
672 cursor_total += 1;
673 if exact_match {
674 cursor_exact_matches += 1;
675 }
676 }
677 if let Some(dist) = score.cursor_distance {
678 cursor_distance_sum += dist;
679 cursor_distance_count += 1;
680 }
681 }
682 }
683
684 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
685 0.0
686 } else {
687 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
688 };
689
690 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
691 0.0
692 } else {
693 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
694 };
695
696 let avg_braces_disbalance = if total_scores == 0 {
697 0.0
698 } else {
699 braces_disbalance_sum as f32 / total_scores as f32
700 };
701
702 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
703 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
704 } else {
705 None
706 };
707
708 let qa_avg_confidence = if qa_confidence_count > 0 {
709 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
710 } else {
711 None
712 };
713
714 let cursor_exact_match_rate = if cursor_total > 0 {
715 Some(cursor_exact_matches as f32 / cursor_total as f32)
716 } else {
717 None
718 };
719
720 let cursor_avg_distance = if cursor_distance_count > 0 {
721 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
722 } else {
723 None
724 };
725
726 let cursor_total_evaluated = if cursor_total > 0 {
727 Some(cursor_total)
728 } else {
729 None
730 };
731
732 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
733 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
734 } else {
735 None
736 };
737
738 let isolated_whitespace_rate = if total_scores > 0 {
739 Some(isolated_whitespace_count as f32 / total_scores as f32)
740 } else {
741 None
742 };
743
744 SummaryJson {
745 total_examples: total_scores,
746 avg_delta_chr_f,
747 avg_braces_disbalance,
748 exact_lines_true_positives: total_exact_lines.true_positives,
749 exact_lines_false_positives: total_exact_lines.false_positives,
750 exact_lines_false_negatives: total_exact_lines.false_negatives,
751 exact_lines_precision: total_exact_lines.precision(),
752 exact_lines_recall: total_exact_lines.recall(),
753 exact_lines_f1: total_exact_lines.f1(),
754 token_match_tp: total_token_match.true_positives,
755 token_match_fp: total_token_match.false_positives,
756 token_match_fn: total_token_match.false_negatives,
757 token_match_precision: total_token_match.precision(),
758 token_match_recall: total_token_match.recall(),
759 token_match_f1: total_token_match.f1(),
760 avg_reversal_ratio,
761 qa_avg_reverts_edits,
762 qa_avg_confidence,
763 cursor_exact_match_rate,
764 cursor_avg_distance,
765 cursor_total_evaluated,
766 wrong_editable_region_rate,
767 isolated_whitespace_rate,
768 }
769}
770
771pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
772 let summary = compute_summary(examples);
773 let file = File::create(path)
774 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
775 let writer = BufWriter::new(file);
776 serde_json::to_writer_pretty(writer, &summary)
777 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
778 eprintln!("Wrote summary JSON to: {}", path.display());
779 Ok(())
780}