1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
14use gpui::AsyncApp;
15use serde::Serialize;
16use std::fs::File;
17use std::io::BufWriter;
18use std::path::Path;
19use std::sync::Arc;
20
21pub async fn run_scoring(
22 example: &mut Example,
23 args: &PredictArgs,
24 app_state: Arc<EpAppState>,
25 example_progress: &ExampleProgress,
26 cx: AsyncApp,
27) -> anyhow::Result<()> {
28 run_prediction(example, args, app_state, example_progress, cx).await?;
29
30 let progress = example_progress.start(Step::Score);
31
32 progress.set_substatus("applying patches");
33 let original_text = &example
34 .prompt_inputs
35 .as_ref()
36 .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?
37 .content;
38 let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
39
40 let expected_texts: Vec<String> = expected_patches_with_cursors
41 .iter()
42 .map(|(patch, _)| {
43 apply_diff_to_string(patch, original_text)
44 .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
45 })
46 .collect::<Result<Vec<_>, _>>()?;
47
48 // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
49 // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
50 // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
51 // region to find where the hunk matched, then compute the expected cursor position.
52 let old_editable_region = if let Some(p) = example.prompt.as_ref() {
53 if matches!(
54 p.provider,
55 PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
56 ) {
57 Some(
58 TeacherPrompt::extract_editable_region(&p.input)?
59 .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
60 )
61 } else {
62 None
63 }
64 } else {
65 None
66 };
67
68 let zero_scores = ExampleScore {
69 delta_chr_f: 0.0,
70 braces_disbalance: 0,
71 exact_lines_tp: 0,
72 exact_lines_fp: 0,
73 exact_lines_fn: 0,
74 reversal_ratio: 0.0,
75 cursor_distance: None,
76 cursor_exact_match: None,
77 wrong_editable_region: None,
78 has_isolated_whitespace_changes: false,
79 inserted_tokens: 0,
80 deleted_tokens: 0,
81 };
82
83 let prompt_inputs = example.prompt_inputs.as_ref().unwrap();
84 let cursor_path = example.spec.cursor_path.as_ref();
85
86 progress.set_substatus("computing metrics");
87 let mut scores = vec![];
88 for prediction in &example.predictions {
89 let actual_patch = prediction.actual_patch.clone().or_else(|| {
90 parse_prediction_output(example, &prediction.actual_output, prediction.provider)
91 .ok()
92 .map(|(patch, _)| patch)
93 });
94
95 let Some(actual_patch) = actual_patch else {
96 scores.push(zero_scores.clone());
97 continue;
98 };
99
100 let token_changes = metrics::count_patch_token_changes(&actual_patch);
101
102 let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
103 Ok(text) => text,
104 Err(_) => {
105 let mut s = zero_scores.clone();
106 s.inserted_tokens = token_changes.inserted_tokens;
107 s.deleted_tokens = token_changes.deleted_tokens;
108 scores.push(s);
109 continue;
110 }
111 };
112
113 let mut best_delta_chr_f = 0.0f32;
114 let mut best_expected_cursor: Option<usize> = None;
115 let mut best_patch_idx: Option<usize> = None;
116
117 for (idx, expected) in expected_texts.iter().enumerate() {
118 let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
119 if delta_chr_f > best_delta_chr_f {
120 best_delta_chr_f = delta_chr_f;
121 best_patch_idx = Some(idx);
122 }
123 }
124
125 if let Some(idx) = best_patch_idx {
126 // Get the raw cursor offset from the expected patch (relative to hunk new text)
127 let expected_cursor_in_patch = expected_patches_with_cursors
128 .get(idx)
129 .and_then(|(_, cursor)| *cursor);
130
131 // For Teacher prompts, we need to apply the patch to the editable region
132 // to find where the hunk matched, then compute the actual cursor position
133 if let (Some(editable_region), Some(cursor_in_patch)) =
134 (&old_editable_region, expected_cursor_in_patch)
135 {
136 let (patch, _) = &expected_patches_with_cursors[idx];
137 if let Ok((_, hunk_offset)) =
138 apply_diff_to_string_with_hunk_offset(patch, editable_region)
139 {
140 let hunk_start = hunk_offset.unwrap_or(0);
141 best_expected_cursor = Some(hunk_start + cursor_in_patch);
142 }
143 } else {
144 // For non-Teacher prompts or if we can't compute, use raw offset
145 best_expected_cursor = expected_cursor_in_patch;
146 }
147 }
148
149 let disbalance_before = metrics::braces_disbalance(&original_text);
150 let disbalance_after = metrics::braces_disbalance(&actual_text);
151 let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
152
153 // Compute exact lines match against best matching expected patch
154 let best_exact_lines = expected_patches_with_cursors
155 .iter()
156 .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
157 .max_by_key(|m| m.true_positives)
158 .unwrap_or_default();
159
160 // Compute reversal ratio
161 let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
162 prompt_inputs,
163 &actual_text,
164 cursor_path,
165 );
166
167 // Compute cursor position metrics
168 let (cursor_distance, cursor_exact_match) =
169 compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
170
171 // Compute approximation of editable region correctness
172 let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
173
174 // Check for isolated whitespace changes.
175 let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
176 &actual_patch,
177 prediction.actual_cursor.as_ref(),
178 );
179
180 scores.push(ExampleScore {
181 delta_chr_f: best_delta_chr_f,
182 braces_disbalance,
183 exact_lines_tp: best_exact_lines.true_positives,
184 exact_lines_fp: best_exact_lines.false_positives,
185 exact_lines_fn: best_exact_lines.false_negatives,
186 reversal_ratio,
187 cursor_distance,
188 cursor_exact_match,
189 wrong_editable_region,
190 has_isolated_whitespace_changes,
191 inserted_tokens: token_changes.inserted_tokens,
192 deleted_tokens: token_changes.deleted_tokens,
193 });
194 }
195
196 example.score = scores;
197 Ok(())
198}
199
200fn compute_cursor_metrics(
201 expected_cursor_editable_region_offset: Option<usize>,
202 actual_cursor: Option<&ActualCursor>,
203) -> (Option<usize>, Option<bool>) {
204 match (expected_cursor_editable_region_offset, actual_cursor) {
205 (Some(expected), Some(actual)) => {
206 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
207 let exact_match = distance == 0;
208 (Some(distance), Some(exact_match))
209 }
210 (None, None) => {
211 // Neither has cursor position - skip cursor scoring
212 (None, None)
213 }
214 (Some(_), None) | (None, Some(_)) => {
215 // Only one has cursor position - count as miss
216 (None, Some(false))
217 }
218 }
219}
220
221pub fn print_report(examples: &[Example]) {
222 use crate::metrics::ClassificationMetrics;
223
224 const LINE_WIDTH: usize = 101;
225 let separator = "─".repeat(LINE_WIDTH);
226
227 println!("{}", separator);
228 println!(
229 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
230 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
231 );
232 println!("{}", separator);
233
234 let mut all_delta_chr_f_scores = Vec::new();
235 let mut all_reversal_ratios = Vec::new();
236 let mut braces_disbalance_sum: usize = 0;
237 let mut total_exact_lines = ClassificationMetrics::default();
238 let mut total_scores: usize = 0;
239 let mut qa_reverts_count: usize = 0;
240 let mut qa_reverts_total: usize = 0;
241 let mut qa_confidence_sum: u64 = 0;
242 let mut qa_confidence_count: usize = 0;
243 let mut cursor_exact_matches: usize = 0;
244 let mut cursor_total: usize = 0;
245 let mut cursor_distance_sum: usize = 0;
246 let mut cursor_distance_count: usize = 0;
247 let mut wrong_editable_region_count: usize = 0;
248 let mut wrong_editable_region_total: usize = 0;
249 let mut isolated_whitespace_count: usize = 0;
250 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
251 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
252 let mut predictions_with_patch: usize = 0;
253
254 for example in examples {
255 for (score_idx, score) in example.score.iter().enumerate() {
256 let exact_lines = ClassificationMetrics {
257 true_positives: score.exact_lines_tp,
258 false_positives: score.exact_lines_fp,
259 false_negatives: score.exact_lines_fn,
260 };
261
262 // Get QA results for this prediction if available
263 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
264 let qa_reverts_str = qa_result
265 .and_then(|q| q.reverts_edits)
266 .map(|v| if v { "yes" } else { "no" })
267 .unwrap_or("-");
268 let qa_conf_str = qa_result
269 .and_then(|q| q.confidence)
270 .map(|v| format!("{}", v))
271 .unwrap_or("-".to_string());
272
273 // Format wrong editable region metric
274 let wrong_er_str = match score.wrong_editable_region {
275 Some(true) => "✗",
276 Some(false) => "",
277 None => "",
278 };
279
280 // Format cursor metric
281 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
282 (Some(true), _) => "✓".to_string(),
283 (Some(false), Some(dist)) => format!("±{}", dist),
284 (Some(false), None) => "✗".to_string(),
285 (None, _) => "-".to_string(),
286 };
287
288 println!(
289 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
290 truncate_name(&example.spec.name, 40),
291 score.delta_chr_f,
292 score.braces_disbalance,
293 exact_lines.f1() * 100.0,
294 score.reversal_ratio * 100.0,
295 qa_reverts_str,
296 qa_conf_str,
297 cursor_str,
298 wrong_er_str
299 );
300
301 all_delta_chr_f_scores.push(score.delta_chr_f);
302 all_reversal_ratios.push(score.reversal_ratio);
303 total_scores += 1;
304 braces_disbalance_sum += score.braces_disbalance;
305 total_exact_lines.true_positives += score.exact_lines_tp;
306 total_exact_lines.false_positives += score.exact_lines_fp;
307 total_exact_lines.false_negatives += score.exact_lines_fn;
308
309 // Accumulate QA metrics
310 if let Some(qa) = qa_result {
311 if let Some(reverts) = qa.reverts_edits {
312 qa_reverts_total += 1;
313 if reverts {
314 qa_reverts_count += 1;
315 }
316 }
317 if let Some(conf) = qa.confidence {
318 qa_confidence_sum += conf as u64;
319 qa_confidence_count += 1;
320 }
321 }
322
323 // Accumulate wrong editable region metrics
324 if let Some(wrong) = score.wrong_editable_region {
325 wrong_editable_region_total += 1;
326 if wrong {
327 wrong_editable_region_count += 1;
328 }
329 }
330
331 // Accumulate isolated whitespace metrics
332 if score.has_isolated_whitespace_changes {
333 isolated_whitespace_count += 1;
334 }
335
336 // Accumulate token change metrics (only for predictions that produced a patch)
337 let has_patch = example
338 .predictions
339 .get(score_idx)
340 .and_then(|p| p.actual_patch.as_ref())
341 .is_some_and(|p| !p.is_empty());
342 if has_patch {
343 predictions_with_patch += 1;
344 patch_inserted_tokens.push(score.inserted_tokens);
345 patch_deleted_tokens.push(score.deleted_tokens);
346 }
347
348 // Accumulate cursor metrics
349 if let Some(exact_match) = score.cursor_exact_match {
350 cursor_total += 1;
351 if exact_match {
352 cursor_exact_matches += 1;
353 }
354 }
355 if let Some(dist) = score.cursor_distance {
356 cursor_distance_sum += dist;
357 cursor_distance_count += 1;
358 }
359 }
360 }
361
362 println!("{}", separator);
363
364 if !all_delta_chr_f_scores.is_empty() {
365 let avg_delta_chr_f: f32 =
366 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
367 let avg_reversal_ratio: f32 =
368 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
369 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
370
371 let qa_reverts_str = if qa_reverts_total > 0 {
372 format!(
373 "{:.1}%",
374 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
375 )
376 } else {
377 "-".to_string()
378 };
379 let qa_conf_str = if qa_confidence_count > 0 {
380 format!(
381 "{:.1}",
382 qa_confidence_sum as f32 / qa_confidence_count as f32
383 )
384 } else {
385 "-".to_string()
386 };
387 let cursor_str = if cursor_total > 0 {
388 format!(
389 "{:.0}%",
390 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
391 )
392 } else {
393 "-".to_string()
394 };
395 let wrong_er_str = if wrong_editable_region_total > 0 {
396 format!(
397 "{:.2}%",
398 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
399 )
400 } else {
401 "-".to_string()
402 };
403 let isolated_ws_str = if total_scores > 0 {
404 format!(
405 "{}/{} ({:.1}%)",
406 isolated_whitespace_count,
407 total_scores,
408 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
409 )
410 } else {
411 "-".to_string()
412 };
413 let avg_cursor_distance = if cursor_distance_count > 0 {
414 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
415 } else {
416 None
417 };
418
419 println!(
420 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
421 "TOTAL / AVERAGE",
422 avg_delta_chr_f,
423 braces_disbalance_avg,
424 total_exact_lines.f1() * 100.0,
425 avg_reversal_ratio * 100.0,
426 qa_reverts_str,
427 qa_conf_str,
428 cursor_str,
429 wrong_er_str
430 );
431 println!("{}", separator);
432
433 // Print additional cursor metrics if available
434 if let Some(avg_dist) = avg_cursor_distance {
435 println!(
436 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
437 cursor_exact_matches,
438 cursor_total,
439 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
440 avg_dist
441 );
442 }
443
444 // Print isolated whitespace metrics
445 if total_scores > 0 {
446 println!("Isolated whitespace changes: {}", isolated_ws_str);
447 }
448
449 // Print token change percentile summary (only for predictions with a patch)
450 if !patch_inserted_tokens.is_empty() {
451 patch_inserted_tokens.sort_unstable();
452 patch_deleted_tokens.sort_unstable();
453 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
454 .iter()
455 .zip(patch_deleted_tokens.iter())
456 .map(|(i, d)| i + d)
457 .collect();
458 patch_total_tokens.sort_unstable();
459
460 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
461 println!();
462 println!(
463 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
464 predictions_with_patch, total_scores, patch_rate
465 );
466 println!(
467 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
468 "", "p25", "p50", "p75", "p90", "p99"
469 );
470 println!("{}", "─".repeat(LINE_WIDTH));
471 println!(
472 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
473 "Inserted tokens",
474 percentile(&patch_inserted_tokens, 25),
475 percentile(&patch_inserted_tokens, 50),
476 percentile(&patch_inserted_tokens, 75),
477 percentile(&patch_inserted_tokens, 90),
478 percentile(&patch_inserted_tokens, 99),
479 );
480 println!(
481 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
482 "Deleted tokens",
483 percentile(&patch_deleted_tokens, 25),
484 percentile(&patch_deleted_tokens, 50),
485 percentile(&patch_deleted_tokens, 75),
486 percentile(&patch_deleted_tokens, 90),
487 percentile(&patch_deleted_tokens, 99),
488 );
489 println!(
490 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
491 "Total tokens",
492 percentile(&patch_total_tokens, 25),
493 percentile(&patch_total_tokens, 50),
494 percentile(&patch_total_tokens, 75),
495 percentile(&patch_total_tokens, 90),
496 percentile(&patch_total_tokens, 99),
497 );
498 }
499 }
500
501 println!("\n");
502}
503
/// Returns the `p`-th percentile of `sorted_values`, picking the element whose
/// index is `p / 100 * (len - 1)` rounded to the nearest integer.
///
/// The slice must already be sorted ascending. An empty slice yields `0`, and
/// an index past the end (e.g. `p > 100`) is clamped to the last element.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(&largest) = sorted_values.last() else {
        return 0;
    };

    let max_index = sorted_values.len() as f64 - 1.0;
    let rank = (p as f64 / 100.0 * max_index).round() as usize;

    sorted_values.get(rank).copied().unwrap_or(largest)
}
511
/// Shortens `name` to at most `max_len` characters for table display.
///
/// Names that already fit are returned unchanged. Longer names are cut to
/// `max_len - 3` characters followed by `"..."`. When `max_len` is too small
/// to hold the ellipsis, the name is simply cut to `max_len` characters.
///
/// Lengths are counted in `char`s rather than bytes, so multi-byte UTF-8 names
/// are never sliced in the middle of a character (which would panic) and the
/// result lines up with the `char`-based padding of `{:<N}` format specs.
fn truncate_name(name: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if name.chars().count() <= max_len {
        return name.to_string();
    }

    // Not enough room for the ellipsis itself: hard-cut instead of underflowing.
    if max_len < ELLIPSIS.len() {
        return name.chars().take(max_len).collect();
    }

    let kept: String = name.chars().take(max_len - ELLIPSIS.len()).collect();
    format!("{}{}", kept, ELLIPSIS)
}
519
/// Aggregate scoring metrics written out as the summary JSON.
///
/// Built by `compute_summary` from the score entries of a set of examples.
/// `Option` fields marked `skip_serializing_if = "Option::is_none"` are left
/// out of the output when there was no data to compute them from.
#[derive(Serialize)]
pub struct SummaryJson {
    /// Number of score entries aggregated. `compute_summary` counts one per
    /// entry of each example's `score` list, not one per example.
    pub total_examples: usize,
    /// Mean of the per-score `delta_chr_f` values; `0.0` when there are no scores.
    pub avg_delta_chr_f: f32,
    /// Mean of the per-score `braces_disbalance` values; `0.0` when there are no scores.
    pub avg_braces_disbalance: f32,
    /// Sum of `exact_lines_tp` over all scores.
    pub exact_lines_true_positives: usize,
    /// Sum of `exact_lines_fp` over all scores.
    pub exact_lines_false_positives: usize,
    /// Sum of `exact_lines_fn` over all scores.
    pub exact_lines_false_negatives: usize,
    /// Precision computed by `ClassificationMetrics` from the summed counts above.
    pub exact_lines_precision: f64,
    /// Recall computed by `ClassificationMetrics` from the summed counts above.
    pub exact_lines_recall: f64,
    /// F1 computed by `ClassificationMetrics` from the summed counts above.
    pub exact_lines_f1: f64,
    /// Mean of the per-score `reversal_ratio` values; `0.0` when there are no scores.
    pub avg_reversal_ratio: f32,
    /// Fraction of QA results with `reverts_edits == true`, among QA results
    /// that reported `reverts_edits` at all.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_reverts_edits: Option<f32>,
    /// Mean QA `confidence`, among QA results that reported one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qa_avg_confidence: Option<f32>,
    /// Fraction of cursor-evaluated scores whose cursor matched exactly.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_exact_match_rate: Option<f32>,
    /// Mean `cursor_distance`, among scores that have a distance.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_avg_distance: Option<f32>,
    /// Number of scores with a `cursor_exact_match` value; absent when zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cursor_total_evaluated: Option<usize>,
    /// Fraction of scores flagged `wrong_editable_region`, among scores that
    /// carry that flag.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wrong_editable_region_rate: Option<f32>,
    /// Fraction of all scores with `has_isolated_whitespace_changes` set.
    // NOTE(review): unlike the other `Option` fields this one has no
    // `skip_serializing_if`, so a `None` is still written out rather than
    // omitted — confirm whether that is intentional before changing it, since
    // it affects the emitted JSON.
    pub isolated_whitespace_rate: Option<f32>,
}
546
547pub fn compute_summary(examples: &[Example]) -> SummaryJson {
548 use crate::metrics::ClassificationMetrics;
549
550 let mut all_delta_chr_f_scores = Vec::new();
551 let mut all_reversal_ratios = Vec::new();
552 let mut braces_disbalance_sum: usize = 0;
553 let mut total_exact_lines = ClassificationMetrics::default();
554 let mut total_scores: usize = 0;
555 let mut qa_reverts_count: usize = 0;
556 let mut qa_reverts_total: usize = 0;
557 let mut qa_confidence_sum: u64 = 0;
558 let mut qa_confidence_count: usize = 0;
559 let mut cursor_exact_matches: usize = 0;
560 let mut cursor_total: usize = 0;
561 let mut cursor_distance_sum: usize = 0;
562 let mut cursor_distance_count: usize = 0;
563 let mut wrong_editable_region_count: usize = 0;
564 let mut wrong_editable_region_total: usize = 0;
565 let mut isolated_whitespace_count: usize = 0;
566
567 for example in examples {
568 for (score_idx, score) in example.score.iter().enumerate() {
569 all_delta_chr_f_scores.push(score.delta_chr_f);
570 all_reversal_ratios.push(score.reversal_ratio);
571 total_scores += 1;
572 braces_disbalance_sum += score.braces_disbalance;
573 total_exact_lines.true_positives += score.exact_lines_tp;
574 total_exact_lines.false_positives += score.exact_lines_fp;
575 total_exact_lines.false_negatives += score.exact_lines_fn;
576
577 // Accumulate QA metrics
578 if let Some(Some(qa)) = example.qa.get(score_idx) {
579 if let Some(reverts) = qa.reverts_edits {
580 qa_reverts_total += 1;
581 if reverts {
582 qa_reverts_count += 1;
583 }
584 }
585 if let Some(conf) = qa.confidence {
586 qa_confidence_sum += conf as u64;
587 qa_confidence_count += 1;
588 }
589 }
590
591 // Accumulate wrong editable region metrics
592 if let Some(wrong) = score.wrong_editable_region {
593 wrong_editable_region_total += 1;
594 if wrong {
595 wrong_editable_region_count += 1;
596 }
597 }
598
599 // Accumulate isolated whitespace metrics
600 if score.has_isolated_whitespace_changes {
601 isolated_whitespace_count += 1;
602 }
603
604 // Accumulate cursor metrics
605 if let Some(exact_match) = score.cursor_exact_match {
606 cursor_total += 1;
607 if exact_match {
608 cursor_exact_matches += 1;
609 }
610 }
611 if let Some(dist) = score.cursor_distance {
612 cursor_distance_sum += dist;
613 cursor_distance_count += 1;
614 }
615 }
616 }
617
618 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
619 0.0
620 } else {
621 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
622 };
623
624 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
625 0.0
626 } else {
627 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
628 };
629
630 let avg_braces_disbalance = if total_scores == 0 {
631 0.0
632 } else {
633 braces_disbalance_sum as f32 / total_scores as f32
634 };
635
636 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
637 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
638 } else {
639 None
640 };
641
642 let qa_avg_confidence = if qa_confidence_count > 0 {
643 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
644 } else {
645 None
646 };
647
648 let cursor_exact_match_rate = if cursor_total > 0 {
649 Some(cursor_exact_matches as f32 / cursor_total as f32)
650 } else {
651 None
652 };
653
654 let cursor_avg_distance = if cursor_distance_count > 0 {
655 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
656 } else {
657 None
658 };
659
660 let cursor_total_evaluated = if cursor_total > 0 {
661 Some(cursor_total)
662 } else {
663 None
664 };
665
666 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
667 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
668 } else {
669 None
670 };
671
672 let isolated_whitespace_rate = if total_scores > 0 {
673 Some(isolated_whitespace_count as f32 / total_scores as f32)
674 } else {
675 None
676 };
677
678 SummaryJson {
679 total_examples: total_scores,
680 avg_delta_chr_f,
681 avg_braces_disbalance,
682 exact_lines_true_positives: total_exact_lines.true_positives,
683 exact_lines_false_positives: total_exact_lines.false_positives,
684 exact_lines_false_negatives: total_exact_lines.false_negatives,
685 exact_lines_precision: total_exact_lines.precision(),
686 exact_lines_recall: total_exact_lines.recall(),
687 exact_lines_f1: total_exact_lines.f1(),
688 avg_reversal_ratio,
689 qa_avg_reverts_edits,
690 qa_avg_confidence,
691 cursor_exact_match_rate,
692 cursor_avg_distance,
693 cursor_total_evaluated,
694 wrong_editable_region_rate,
695 isolated_whitespace_rate,
696 }
697}
698
699pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
700 let summary = compute_summary(examples);
701 let file = File::create(path)
702 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
703 let writer = BufWriter::new(file);
704 serde_json::to_writer_pretty(writer, &summary)
705 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
706 eprintln!("Wrote summary JSON to: {}", path.display());
707 Ok(())
708}