1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
14use gpui::AsyncApp;
15use serde::Serialize;
16use std::fs::File;
17use std::io::BufWriter;
18use std::path::Path;
19use std::sync::Arc;
20
21pub async fn run_scoring(
22 example: &mut Example,
23 args: &PredictArgs,
24 app_state: Arc<EpAppState>,
25 example_progress: &ExampleProgress,
26 cx: AsyncApp,
27) -> anyhow::Result<()> {
28 run_prediction(example, args, app_state, example_progress, cx).await?;
29
30 let progress = example_progress.start(Step::Score);
31
32 progress.set_substatus("applying patches");
33 let prompt_inputs = example
34 .prompt_inputs
35 .as_ref()
36 .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
37 let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
38 let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
39
40 let expected_texts: Vec<String> = expected_patches_with_cursors
41 .iter()
42 .map(|(patch, _)| {
43 apply_diff_to_string(patch, original_text)
44 .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
45 })
46 .collect::<Result<Vec<_>, _>>()?;
47
48 // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
49 // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
50 // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
51 // region to find where the hunk matched, then compute the expected cursor position.
52 let old_editable_region = if let Some(p) = example.prompt.as_ref() {
53 if matches!(
54 p.provider,
55 PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
56 ) {
57 Some(
58 TeacherPrompt::extract_editable_region(&p.input)?
59 .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
60 )
61 } else {
62 None
63 }
64 } else {
65 None
66 };
67
68 let zero_scores = ExampleScore {
69 delta_chr_f: 0.0,
70 braces_disbalance: 0,
71 exact_lines_tp: 0,
72 exact_lines_fp: 0,
73 exact_lines_fn: 0,
74 reversal_ratio: 0.0,
75 cursor_distance: None,
76 cursor_exact_match: None,
77 wrong_editable_region: None,
78 has_isolated_whitespace_changes: false,
79 inserted_tokens: 0,
80 deleted_tokens: 0,
81 };
82
83 let cursor_path = example.spec.cursor_path.as_ref();
84
85 progress.set_substatus("computing metrics");
86 let mut scores = vec![];
87 for prediction in &example.predictions {
88 let actual_patch = prediction.actual_patch.clone().or_else(|| {
89 parse_prediction_output(example, &prediction.actual_output, prediction.provider)
90 .ok()
91 .map(|(patch, _)| patch)
92 });
93
94 let Some(actual_patch) = actual_patch else {
95 scores.push(zero_scores.clone());
96 continue;
97 };
98
99 let token_changes = metrics::count_patch_token_changes(&actual_patch);
100
101 let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
102 Ok(text) => text,
103 Err(_) => {
104 let mut s = zero_scores.clone();
105 s.inserted_tokens = token_changes.inserted_tokens;
106 s.deleted_tokens = token_changes.deleted_tokens;
107 scores.push(s);
108 continue;
109 }
110 };
111
112 let mut best_delta_chr_f = 0.0f32;
113 let mut best_expected_cursor: Option<usize> = None;
114 let mut best_patch_idx: Option<usize> = None;
115
116 for (idx, expected) in expected_texts.iter().enumerate() {
117 let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
118 if delta_chr_f > best_delta_chr_f {
119 best_delta_chr_f = delta_chr_f;
120 best_patch_idx = Some(idx);
121 }
122 }
123
124 if let Some(idx) = best_patch_idx {
125 // Get the raw cursor offset from the expected patch (relative to hunk new text)
126 let expected_cursor_in_patch = expected_patches_with_cursors
127 .get(idx)
128 .and_then(|(_, cursor)| *cursor);
129
130 // For Teacher prompts, we need to apply the patch to the editable region
131 // to find where the hunk matched, then compute the actual cursor position
132 if let (Some(editable_region), Some(cursor_in_patch)) =
133 (&old_editable_region, expected_cursor_in_patch)
134 {
135 let (patch, _) = &expected_patches_with_cursors[idx];
136 if let Ok((_, hunk_offset)) =
137 apply_diff_to_string_with_hunk_offset(patch, editable_region)
138 {
139 let hunk_start = hunk_offset.unwrap_or(0);
140 best_expected_cursor = Some(hunk_start + cursor_in_patch);
141 }
142 } else {
143 // For non-Teacher prompts or if we can't compute, use raw offset
144 best_expected_cursor = expected_cursor_in_patch;
145 }
146 }
147
148 let disbalance_before = metrics::braces_disbalance(&original_text);
149 let disbalance_after = metrics::braces_disbalance(&actual_text);
150 let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
151
152 // Compute exact lines match against best matching expected patch
153 let best_exact_lines = expected_patches_with_cursors
154 .iter()
155 .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
156 .max_by_key(|m| m.true_positives)
157 .unwrap_or_default();
158
159 // Compute reversal ratio
160 let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
161 prompt_inputs,
162 &actual_text,
163 cursor_path,
164 );
165
166 // Compute cursor position metrics
167 let (cursor_distance, cursor_exact_match) =
168 compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
169
170 // Compute approximation of editable region correctness
171 let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
172
173 // Check for isolated whitespace changes.
174 let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
175 &actual_patch,
176 prediction.actual_cursor.as_ref(),
177 );
178
179 scores.push(ExampleScore {
180 delta_chr_f: best_delta_chr_f,
181 braces_disbalance,
182 exact_lines_tp: best_exact_lines.true_positives,
183 exact_lines_fp: best_exact_lines.false_positives,
184 exact_lines_fn: best_exact_lines.false_negatives,
185 reversal_ratio,
186 cursor_distance,
187 cursor_exact_match,
188 wrong_editable_region,
189 has_isolated_whitespace_changes,
190 inserted_tokens: token_changes.inserted_tokens,
191 deleted_tokens: token_changes.deleted_tokens,
192 });
193 }
194
195 example.score = scores;
196 Ok(())
197}
198
199fn compute_cursor_metrics(
200 expected_cursor_editable_region_offset: Option<usize>,
201 actual_cursor: Option<&ActualCursor>,
202) -> (Option<usize>, Option<bool>) {
203 match (expected_cursor_editable_region_offset, actual_cursor) {
204 (Some(expected), Some(actual)) => {
205 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
206 let exact_match = distance == 0;
207 (Some(distance), Some(exact_match))
208 }
209 (None, None) => {
210 // Neither has cursor position - skip cursor scoring
211 (None, None)
212 }
213 (Some(_), None) | (None, Some(_)) => {
214 // Only one has cursor position - count as miss
215 (None, Some(false))
216 }
217 }
218}
219
220pub fn print_report(examples: &[Example], verbose: bool) {
221 const MAX_EXAMPLES_DEFAULT: usize = 20;
222 use crate::metrics::ClassificationMetrics;
223
224 const LINE_WIDTH: usize = 101;
225 let separator = "─".repeat(LINE_WIDTH);
226
227 println!("{}", separator);
228 println!(
229 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
230 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
231 );
232 println!("{}", separator);
233
234 let mut all_delta_chr_f_scores = Vec::new();
235 let mut all_reversal_ratios = Vec::new();
236 let mut braces_disbalance_sum: usize = 0;
237 let mut total_exact_lines = ClassificationMetrics::default();
238 let mut total_scores: usize = 0;
239 let mut qa_reverts_count: usize = 0;
240 let mut qa_reverts_total: usize = 0;
241 let mut qa_confidence_sum: u64 = 0;
242 let mut qa_confidence_count: usize = 0;
243 let mut cursor_exact_matches: usize = 0;
244 let mut cursor_total: usize = 0;
245 let mut cursor_distance_sum: usize = 0;
246 let mut cursor_distance_count: usize = 0;
247 let mut wrong_editable_region_count: usize = 0;
248 let mut wrong_editable_region_total: usize = 0;
249 let mut isolated_whitespace_count: usize = 0;
250 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
251 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
252 let mut predictions_with_patch: usize = 0;
253
254 let mut printed_lines: usize = 0;
255 let mut skipped_lines: usize = 0;
256
257 for example in examples {
258 for (score_idx, score) in example.score.iter().enumerate() {
259 let exact_lines = ClassificationMetrics {
260 true_positives: score.exact_lines_tp,
261 false_positives: score.exact_lines_fp,
262 false_negatives: score.exact_lines_fn,
263 };
264
265 // Get QA results for this prediction if available
266 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
267 let qa_reverts_str = qa_result
268 .and_then(|q| q.reverts_edits)
269 .map(|v| if v { "yes" } else { "no" })
270 .unwrap_or("-");
271 let qa_conf_str = qa_result
272 .and_then(|q| q.confidence)
273 .map(|v| format!("{}", v))
274 .unwrap_or("-".to_string());
275
276 // Format wrong editable region metric
277 let wrong_er_str = match score.wrong_editable_region {
278 Some(true) => "✗",
279 Some(false) => "",
280 None => "",
281 };
282
283 // Format cursor metric
284 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
285 (Some(true), _) => "✓".to_string(),
286 (Some(false), Some(dist)) => format!("±{}", dist),
287 (Some(false), None) => "✗".to_string(),
288 (None, _) => "-".to_string(),
289 };
290
291 if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
292 println!(
293 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
294 truncate_name(&example.spec.name, 40),
295 score.delta_chr_f,
296 score.braces_disbalance,
297 exact_lines.f1() * 100.0,
298 score.reversal_ratio * 100.0,
299 qa_reverts_str,
300 qa_conf_str,
301 cursor_str,
302 wrong_er_str
303 );
304 printed_lines += 1;
305 } else {
306 skipped_lines += 1;
307 }
308
309 all_delta_chr_f_scores.push(score.delta_chr_f);
310 all_reversal_ratios.push(score.reversal_ratio);
311 total_scores += 1;
312 braces_disbalance_sum += score.braces_disbalance;
313 total_exact_lines.true_positives += score.exact_lines_tp;
314 total_exact_lines.false_positives += score.exact_lines_fp;
315 total_exact_lines.false_negatives += score.exact_lines_fn;
316
317 // Accumulate QA metrics
318 if let Some(qa) = qa_result {
319 if let Some(reverts) = qa.reverts_edits {
320 qa_reverts_total += 1;
321 if reverts {
322 qa_reverts_count += 1;
323 }
324 }
325 if let Some(conf) = qa.confidence {
326 qa_confidence_sum += conf as u64;
327 qa_confidence_count += 1;
328 }
329 }
330
331 // Accumulate wrong editable region metrics
332 if let Some(wrong) = score.wrong_editable_region {
333 wrong_editable_region_total += 1;
334 if wrong {
335 wrong_editable_region_count += 1;
336 }
337 }
338
339 // Accumulate isolated whitespace metrics
340 if score.has_isolated_whitespace_changes {
341 isolated_whitespace_count += 1;
342 }
343
344 // Accumulate token change metrics (only for predictions that produced a patch)
345 let has_patch = example
346 .predictions
347 .get(score_idx)
348 .and_then(|p| p.actual_patch.as_ref())
349 .is_some_and(|p| !p.is_empty());
350 if has_patch {
351 predictions_with_patch += 1;
352 patch_inserted_tokens.push(score.inserted_tokens);
353 patch_deleted_tokens.push(score.deleted_tokens);
354 }
355
356 // Accumulate cursor metrics
357 if let Some(exact_match) = score.cursor_exact_match {
358 cursor_total += 1;
359 if exact_match {
360 cursor_exact_matches += 1;
361 }
362 }
363 if let Some(dist) = score.cursor_distance {
364 cursor_distance_sum += dist;
365 cursor_distance_count += 1;
366 }
367 }
368 }
369
370 if skipped_lines > 0 {
371 println!(
372 "{:<40} (use --verbose to see all {} examples)",
373 format!("... and {} more", skipped_lines),
374 printed_lines + skipped_lines
375 );
376 }
377 println!("{}", separator);
378
379 if !all_delta_chr_f_scores.is_empty() {
380 let avg_delta_chr_f: f32 =
381 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
382 let avg_reversal_ratio: f32 =
383 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
384 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
385
386 let qa_reverts_str = if qa_reverts_total > 0 {
387 format!(
388 "{:.1}%",
389 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
390 )
391 } else {
392 "-".to_string()
393 };
394 let qa_conf_str = if qa_confidence_count > 0 {
395 format!(
396 "{:.1}",
397 qa_confidence_sum as f32 / qa_confidence_count as f32
398 )
399 } else {
400 "-".to_string()
401 };
402 let cursor_str = if cursor_total > 0 {
403 format!(
404 "{:.0}%",
405 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
406 )
407 } else {
408 "-".to_string()
409 };
410 let wrong_er_str = if wrong_editable_region_total > 0 {
411 format!(
412 "{:.2}%",
413 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
414 )
415 } else {
416 "-".to_string()
417 };
418 let isolated_ws_str = if total_scores > 0 {
419 format!(
420 "{}/{} ({:.1}%)",
421 isolated_whitespace_count,
422 total_scores,
423 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
424 )
425 } else {
426 "-".to_string()
427 };
428 let avg_cursor_distance = if cursor_distance_count > 0 {
429 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
430 } else {
431 None
432 };
433
434 println!(
435 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
436 "TOTAL / AVERAGE",
437 avg_delta_chr_f,
438 braces_disbalance_avg,
439 total_exact_lines.f1() * 100.0,
440 avg_reversal_ratio * 100.0,
441 qa_reverts_str,
442 qa_conf_str,
443 cursor_str,
444 wrong_er_str
445 );
446 println!("{}", separator);
447
448 // Print additional cursor metrics if available
449 if let Some(avg_dist) = avg_cursor_distance {
450 println!(
451 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
452 cursor_exact_matches,
453 cursor_total,
454 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
455 avg_dist
456 );
457 }
458
459 // Print isolated whitespace metrics
460 if total_scores > 0 {
461 println!("Isolated whitespace changes: {}", isolated_ws_str);
462 }
463
464 // Print token change percentile summary (only for predictions with a patch)
465 if !patch_inserted_tokens.is_empty() {
466 patch_inserted_tokens.sort_unstable();
467 patch_deleted_tokens.sort_unstable();
468 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
469 .iter()
470 .zip(patch_deleted_tokens.iter())
471 .map(|(i, d)| i + d)
472 .collect();
473 patch_total_tokens.sort_unstable();
474
475 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
476 println!();
477 println!(
478 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
479 predictions_with_patch, total_scores, patch_rate
480 );
481 println!(
482 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
483 "", "p25", "p50", "p75", "p90", "p99"
484 );
485 println!("{}", "─".repeat(LINE_WIDTH));
486 println!(
487 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
488 "Inserted tokens",
489 percentile(&patch_inserted_tokens, 25),
490 percentile(&patch_inserted_tokens, 50),
491 percentile(&patch_inserted_tokens, 75),
492 percentile(&patch_inserted_tokens, 90),
493 percentile(&patch_inserted_tokens, 99),
494 );
495 println!(
496 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
497 "Deleted tokens",
498 percentile(&patch_deleted_tokens, 25),
499 percentile(&patch_deleted_tokens, 50),
500 percentile(&patch_deleted_tokens, 75),
501 percentile(&patch_deleted_tokens, 90),
502 percentile(&patch_deleted_tokens, 99),
503 );
504 println!(
505 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
506 "Total tokens",
507 percentile(&patch_total_tokens, 25),
508 percentile(&patch_total_tokens, 50),
509 percentile(&patch_total_tokens, 75),
510 percentile(&patch_total_tokens, 90),
511 percentile(&patch_total_tokens, 99),
512 );
513 }
514 }
515
516 println!("\n");
517}
518
/// Returns the `p`-th percentile of `sorted_values` using the nearest rank.
///
/// The slice must already be sorted in ascending order. The rank is
/// `round(p / 100 * (len - 1))`, clamped to the last index, so `p >= 100` yields the
/// largest value. An empty slice yields 0.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0;
    };
    let fraction = p as f64 / 100.0;
    let rank = (fraction * (sorted_values.len() as f64 - 1.0)).round() as usize;
    sorted_values[rank.min(last_index)]
}
526
/// Shortens `name` to at most `max_len` bytes, ending in `...` when it had to be cut.
///
/// Names that already fit are returned unchanged. Otherwise the name is cut at the last
/// character boundary at or before `max_len - 3` bytes, so multi-byte UTF-8 characters are
/// never split (slicing in the middle of one would panic). When `max_len` is smaller than
/// the ellipsis itself, the result is just `"..."`.
fn truncate_name(name: &str, max_len: usize) -> String {
    if name.len() <= max_len {
        return name.to_string();
    }

    // `cut < name.len()` holds here, and offset 0 is always a boundary, so the loop terminates.
    let mut cut = max_len.saturating_sub(3);
    while !name.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &name[..cut])
}
534
/// Aggregate scoring metrics, serialized to JSON by `write_summary_json`.
///
/// Built by `compute_summary`. Optional fields are `None` when nothing contributed to them.
#[derive(Serialize)]
pub struct SummaryJson {
    /// Number of score entries aggregated (one per scored prediction, summed over all examples).
    pub total_examples: usize,
    /// Mean `delta_chr_f` over all score entries; 0.0 when there are none.
    pub avg_delta_chr_f: f32,
    /// Mean `braces_disbalance` over all score entries; 0.0 when there are none.
    pub avg_braces_disbalance: f32,
    /// Sum of `exact_lines_tp` over all score entries.
    pub exact_lines_true_positives: usize,
    /// Sum of `exact_lines_fp` over all score entries.
    pub exact_lines_false_positives: usize,
    /// Sum of `exact_lines_fn` over all score entries.
    pub exact_lines_false_negatives: usize,
    /// Precision computed from the summed exact-line counts.
    pub exact_lines_precision: f64,
    /// Recall computed from the summed exact-line counts.
    pub exact_lines_recall: f64,
    /// F1 computed from the summed exact-line counts.
    pub exact_lines_f1: f64,
    /// Mean `reversal_ratio` over all score entries; 0.0 when there are none.
    pub avg_reversal_ratio: f32,
    /// Fraction (0..=1) of QA results reporting `reverts_edits == true`, among those that
    /// reported the flag at all.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_reverts_edits: Option<f32>,
    /// Mean QA confidence over the QA results that reported one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_confidence: Option<f32>,
    /// Fraction (0..=1) of cursor-evaluated scores whose cursor matched exactly.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_exact_match_rate: Option<f32>,
    /// Mean cursor distance over the scores that have a distance.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_avg_distance: Option<f32>,
    /// Number of scores with a cursor verdict; omitted when zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_total_evaluated: Option<usize>,
    /// Fraction (0..=1) of scores flagged as editing the wrong editable region, among
    /// those that carry the flag.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wrong_editable_region_rate: Option<f32>,
    /// Fraction (0..=1) of all score entries with isolated whitespace changes.
    ///
    /// NOTE(review): unlike the other optional fields this one has no
    /// `skip_serializing_if`, so it is emitted as `null` when `None` — confirm whether
    /// that asymmetry is intended before changing it, since it affects the JSON shape.
    pub isolated_whitespace_rate: Option<f32>,
}
561
562pub fn compute_summary(examples: &[Example]) -> SummaryJson {
563 use crate::metrics::ClassificationMetrics;
564
565 let mut all_delta_chr_f_scores = Vec::new();
566 let mut all_reversal_ratios = Vec::new();
567 let mut braces_disbalance_sum: usize = 0;
568 let mut total_exact_lines = ClassificationMetrics::default();
569 let mut total_scores: usize = 0;
570 let mut qa_reverts_count: usize = 0;
571 let mut qa_reverts_total: usize = 0;
572 let mut qa_confidence_sum: u64 = 0;
573 let mut qa_confidence_count: usize = 0;
574 let mut cursor_exact_matches: usize = 0;
575 let mut cursor_total: usize = 0;
576 let mut cursor_distance_sum: usize = 0;
577 let mut cursor_distance_count: usize = 0;
578 let mut wrong_editable_region_count: usize = 0;
579 let mut wrong_editable_region_total: usize = 0;
580 let mut isolated_whitespace_count: usize = 0;
581
582 for example in examples {
583 for (score_idx, score) in example.score.iter().enumerate() {
584 all_delta_chr_f_scores.push(score.delta_chr_f);
585 all_reversal_ratios.push(score.reversal_ratio);
586 total_scores += 1;
587 braces_disbalance_sum += score.braces_disbalance;
588 total_exact_lines.true_positives += score.exact_lines_tp;
589 total_exact_lines.false_positives += score.exact_lines_fp;
590 total_exact_lines.false_negatives += score.exact_lines_fn;
591
592 // Accumulate QA metrics
593 if let Some(Some(qa)) = example.qa.get(score_idx) {
594 if let Some(reverts) = qa.reverts_edits {
595 qa_reverts_total += 1;
596 if reverts {
597 qa_reverts_count += 1;
598 }
599 }
600 if let Some(conf) = qa.confidence {
601 qa_confidence_sum += conf as u64;
602 qa_confidence_count += 1;
603 }
604 }
605
606 // Accumulate wrong editable region metrics
607 if let Some(wrong) = score.wrong_editable_region {
608 wrong_editable_region_total += 1;
609 if wrong {
610 wrong_editable_region_count += 1;
611 }
612 }
613
614 // Accumulate isolated whitespace metrics
615 if score.has_isolated_whitespace_changes {
616 isolated_whitespace_count += 1;
617 }
618
619 // Accumulate cursor metrics
620 if let Some(exact_match) = score.cursor_exact_match {
621 cursor_total += 1;
622 if exact_match {
623 cursor_exact_matches += 1;
624 }
625 }
626 if let Some(dist) = score.cursor_distance {
627 cursor_distance_sum += dist;
628 cursor_distance_count += 1;
629 }
630 }
631 }
632
633 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
634 0.0
635 } else {
636 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
637 };
638
639 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
640 0.0
641 } else {
642 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
643 };
644
645 let avg_braces_disbalance = if total_scores == 0 {
646 0.0
647 } else {
648 braces_disbalance_sum as f32 / total_scores as f32
649 };
650
651 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
652 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
653 } else {
654 None
655 };
656
657 let qa_avg_confidence = if qa_confidence_count > 0 {
658 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
659 } else {
660 None
661 };
662
663 let cursor_exact_match_rate = if cursor_total > 0 {
664 Some(cursor_exact_matches as f32 / cursor_total as f32)
665 } else {
666 None
667 };
668
669 let cursor_avg_distance = if cursor_distance_count > 0 {
670 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
671 } else {
672 None
673 };
674
675 let cursor_total_evaluated = if cursor_total > 0 {
676 Some(cursor_total)
677 } else {
678 None
679 };
680
681 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
682 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
683 } else {
684 None
685 };
686
687 let isolated_whitespace_rate = if total_scores > 0 {
688 Some(isolated_whitespace_count as f32 / total_scores as f32)
689 } else {
690 None
691 };
692
693 SummaryJson {
694 total_examples: total_scores,
695 avg_delta_chr_f,
696 avg_braces_disbalance,
697 exact_lines_true_positives: total_exact_lines.true_positives,
698 exact_lines_false_positives: total_exact_lines.false_positives,
699 exact_lines_false_negatives: total_exact_lines.false_negatives,
700 exact_lines_precision: total_exact_lines.precision(),
701 exact_lines_recall: total_exact_lines.recall(),
702 exact_lines_f1: total_exact_lines.f1(),
703 avg_reversal_ratio,
704 qa_avg_reverts_edits,
705 qa_avg_confidence,
706 cursor_exact_match_rate,
707 cursor_avg_distance,
708 cursor_total_evaluated,
709 wrong_editable_region_rate,
710 isolated_whitespace_rate,
711 }
712}
713
714pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
715 let summary = compute_summary(examples);
716 let file = File::create(path)
717 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
718 let writer = BufWriter::new(file);
719 serde_json::to_writer_pretty(writer, &summary)
720 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
721 eprintln!("Wrote summary JSON to: {}", path.display());
722 Ok(())
723}