1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
14use gpui::AsyncApp;
15use serde::Serialize;
16use std::fs::File;
17use std::io::BufWriter;
18use std::path::Path;
19use std::sync::Arc;
20
21pub async fn run_scoring(
22 example: &mut Example,
23 args: &PredictArgs,
24 app_state: Arc<EpAppState>,
25 example_progress: &ExampleProgress,
26 cx: AsyncApp,
27) -> anyhow::Result<()> {
28 run_prediction(example, args, app_state, example_progress, cx).await?;
29
30 let progress = example_progress.start(Step::Score);
31
32 progress.set_substatus("applying patches");
33 let prompt_inputs = example
34 .prompt_inputs
35 .as_ref()
36 .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
37 let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
38 let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
39
40 let expected_texts: Vec<String> = expected_patches_with_cursors
41 .iter()
42 .map(|(patch, _)| {
43 apply_diff_to_string(patch, original_text)
44 .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
45 })
46 .collect::<Result<Vec<_>, _>>()?;
47
48 // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
49 // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
50 // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
51 // region to find where the hunk matched, then compute the expected cursor position.
52 let old_editable_region = if let Some(p) = example.prompt.as_ref() {
53 if matches!(
54 p.provider,
55 PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
56 ) {
57 Some(
58 TeacherPrompt::extract_editable_region(&p.input)?
59 .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
60 )
61 } else {
62 None
63 }
64 } else {
65 None
66 };
67
68 let zero_scores = ExampleScore {
69 delta_chr_f: 0.0,
70 braces_disbalance: 0,
71 exact_lines_tp: 0,
72 exact_lines_fp: 0,
73 exact_lines_fn: 0,
74 reversal_ratio: 0.0,
75 cursor_distance: None,
76 cursor_exact_match: None,
77 wrong_editable_region: None,
78 has_isolated_whitespace_changes: false,
79 inserted_tokens: 0,
80 deleted_tokens: 0,
81 cumulative_logprob: None,
82 avg_logprob: None,
83 };
84
85 let cursor_path = example.spec.cursor_path.as_ref();
86
87 progress.set_substatus("computing metrics");
88 let mut scores = vec![];
89 for prediction in &example.predictions {
90 let actual_patch = prediction.actual_patch.clone().or_else(|| {
91 parse_prediction_output(example, &prediction.actual_output, prediction.provider)
92 .ok()
93 .map(|(patch, _)| patch)
94 });
95
96 let Some(actual_patch) = actual_patch else {
97 scores.push(zero_scores.clone());
98 continue;
99 };
100
101 let token_changes = metrics::count_patch_token_changes(&actual_patch);
102
103 let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
104 Ok(text) => text,
105 Err(_) => {
106 let mut s = zero_scores.clone();
107 s.inserted_tokens = token_changes.inserted_tokens;
108 s.deleted_tokens = token_changes.deleted_tokens;
109 scores.push(s);
110 continue;
111 }
112 };
113
114 let mut best_delta_chr_f = 0.0f32;
115 let mut best_expected_cursor: Option<usize> = None;
116 let mut best_patch_idx: Option<usize> = None;
117
118 for (idx, expected) in expected_texts.iter().enumerate() {
119 let delta_chr_f = metrics::delta_chr_f(original_text, expected, &actual_text) as f32;
120 if delta_chr_f > best_delta_chr_f {
121 best_delta_chr_f = delta_chr_f;
122 best_patch_idx = Some(idx);
123 }
124 }
125
126 if let Some(idx) = best_patch_idx {
127 // Get the raw cursor offset from the expected patch (relative to hunk new text)
128 let expected_cursor_in_patch = expected_patches_with_cursors
129 .get(idx)
130 .and_then(|(_, cursor)| *cursor);
131
132 // For Teacher prompts, we need to apply the patch to the editable region
133 // to find where the hunk matched, then compute the actual cursor position
134 if let (Some(editable_region), Some(cursor_in_patch)) =
135 (&old_editable_region, expected_cursor_in_patch)
136 {
137 let (patch, _) = &expected_patches_with_cursors[idx];
138 if let Ok((_, hunk_offset)) =
139 apply_diff_to_string_with_hunk_offset(patch, editable_region)
140 {
141 let hunk_start = hunk_offset.unwrap_or(0);
142 best_expected_cursor = Some(hunk_start + cursor_in_patch);
143 }
144 } else {
145 // For non-Teacher prompts or if we can't compute, use raw offset
146 best_expected_cursor = expected_cursor_in_patch;
147 }
148 }
149
150 let disbalance_before = metrics::braces_disbalance(&original_text);
151 let disbalance_after = metrics::braces_disbalance(&actual_text);
152 let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
153
154 // Compute exact lines match against best matching expected patch
155 let best_exact_lines = expected_patches_with_cursors
156 .iter()
157 .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
158 .max_by_key(|m| m.true_positives)
159 .unwrap_or_default();
160
161 // Compute reversal ratio
162 let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
163 prompt_inputs,
164 &actual_text,
165 cursor_path,
166 );
167
168 // Compute cursor position metrics
169 let (cursor_distance, cursor_exact_match) =
170 compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
171
172 // Compute approximation of editable region correctness
173 let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
174
175 // Check for isolated whitespace changes.
176 let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
177 &actual_patch,
178 prediction.actual_cursor.as_ref(),
179 );
180
181 scores.push(ExampleScore {
182 delta_chr_f: best_delta_chr_f,
183 braces_disbalance,
184 exact_lines_tp: best_exact_lines.true_positives,
185 exact_lines_fp: best_exact_lines.false_positives,
186 exact_lines_fn: best_exact_lines.false_negatives,
187 reversal_ratio,
188 cursor_distance,
189 cursor_exact_match,
190 wrong_editable_region,
191 has_isolated_whitespace_changes,
192 inserted_tokens: token_changes.inserted_tokens,
193 deleted_tokens: token_changes.deleted_tokens,
194 cumulative_logprob: prediction.cumulative_logprob,
195 avg_logprob: prediction.avg_logprob,
196 });
197 }
198
199 example.score = scores;
200 Ok(())
201}
202
203fn compute_cursor_metrics(
204 expected_cursor_editable_region_offset: Option<usize>,
205 actual_cursor: Option<&ActualCursor>,
206) -> (Option<usize>, Option<bool>) {
207 match (expected_cursor_editable_region_offset, actual_cursor) {
208 (Some(expected), Some(actual)) => {
209 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
210 let exact_match = distance == 0;
211 (Some(distance), Some(exact_match))
212 }
213 (None, None) => {
214 // Neither has cursor position - skip cursor scoring
215 (None, None)
216 }
217 (Some(_), None) | (None, Some(_)) => {
218 // Only one has cursor position - count as miss
219 (None, Some(false))
220 }
221 }
222}
223
224pub fn print_report(examples: &[Example], verbose: bool) {
225 const MAX_EXAMPLES_DEFAULT: usize = 20;
226 use crate::metrics::ClassificationMetrics;
227
228 const LINE_WIDTH: usize = 101;
229 let separator = "─".repeat(LINE_WIDTH);
230
231 println!("{}", separator);
232 println!(
233 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
234 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
235 );
236 println!("{}", separator);
237
238 let mut all_delta_chr_f_scores = Vec::new();
239 let mut all_reversal_ratios = Vec::new();
240 let mut braces_disbalance_sum: usize = 0;
241 let mut total_exact_lines = ClassificationMetrics::default();
242 let mut total_scores: usize = 0;
243 let mut qa_reverts_count: usize = 0;
244 let mut qa_reverts_total: usize = 0;
245 let mut qa_confidence_sum: u64 = 0;
246 let mut qa_confidence_count: usize = 0;
247 let mut cursor_exact_matches: usize = 0;
248 let mut cursor_total: usize = 0;
249 let mut cursor_distance_sum: usize = 0;
250 let mut cursor_distance_count: usize = 0;
251 let mut wrong_editable_region_count: usize = 0;
252 let mut wrong_editable_region_total: usize = 0;
253 let mut isolated_whitespace_count: usize = 0;
254 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
255 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
256 let mut predictions_with_patch: usize = 0;
257
258 let mut printed_lines: usize = 0;
259 let mut skipped_lines: usize = 0;
260
261 for example in examples {
262 for (score_idx, score) in example.score.iter().enumerate() {
263 let exact_lines = ClassificationMetrics {
264 true_positives: score.exact_lines_tp,
265 false_positives: score.exact_lines_fp,
266 false_negatives: score.exact_lines_fn,
267 };
268
269 // Get QA results for this prediction if available
270 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
271 let qa_reverts_str = qa_result
272 .and_then(|q| q.reverts_edits)
273 .map(|v| if v { "yes" } else { "no" })
274 .unwrap_or("-");
275 let qa_conf_str = qa_result
276 .and_then(|q| q.confidence)
277 .map(|v| format!("{}", v))
278 .unwrap_or("-".to_string());
279
280 // Format wrong editable region metric
281 let wrong_er_str = match score.wrong_editable_region {
282 Some(true) => "✗",
283 Some(false) => "",
284 None => "",
285 };
286
287 // Format cursor metric
288 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
289 (Some(true), _) => "✓".to_string(),
290 (Some(false), Some(dist)) => format!("±{}", dist),
291 (Some(false), None) => "✗".to_string(),
292 (None, _) => "-".to_string(),
293 };
294
295 if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
296 println!(
297 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
298 truncate_name(&example.spec.name, 40),
299 score.delta_chr_f,
300 score.braces_disbalance,
301 exact_lines.f1() * 100.0,
302 score.reversal_ratio * 100.0,
303 qa_reverts_str,
304 qa_conf_str,
305 cursor_str,
306 wrong_er_str
307 );
308 printed_lines += 1;
309 } else {
310 skipped_lines += 1;
311 }
312
313 all_delta_chr_f_scores.push(score.delta_chr_f);
314 all_reversal_ratios.push(score.reversal_ratio);
315 total_scores += 1;
316 braces_disbalance_sum += score.braces_disbalance;
317 total_exact_lines.true_positives += score.exact_lines_tp;
318 total_exact_lines.false_positives += score.exact_lines_fp;
319 total_exact_lines.false_negatives += score.exact_lines_fn;
320
321 // Accumulate QA metrics
322 if let Some(qa) = qa_result {
323 if let Some(reverts) = qa.reverts_edits {
324 qa_reverts_total += 1;
325 if reverts {
326 qa_reverts_count += 1;
327 }
328 }
329 if let Some(conf) = qa.confidence {
330 qa_confidence_sum += conf as u64;
331 qa_confidence_count += 1;
332 }
333 }
334
335 // Accumulate wrong editable region metrics
336 if let Some(wrong) = score.wrong_editable_region {
337 wrong_editable_region_total += 1;
338 if wrong {
339 wrong_editable_region_count += 1;
340 }
341 }
342
343 // Accumulate isolated whitespace metrics
344 if score.has_isolated_whitespace_changes {
345 isolated_whitespace_count += 1;
346 }
347
348 // Accumulate token change metrics (only for predictions that produced a patch)
349 let has_patch = example
350 .predictions
351 .get(score_idx)
352 .and_then(|p| p.actual_patch.as_ref())
353 .is_some_and(|p| !p.is_empty());
354 if has_patch {
355 predictions_with_patch += 1;
356 patch_inserted_tokens.push(score.inserted_tokens);
357 patch_deleted_tokens.push(score.deleted_tokens);
358 }
359
360 // Accumulate cursor metrics
361 if let Some(exact_match) = score.cursor_exact_match {
362 cursor_total += 1;
363 if exact_match {
364 cursor_exact_matches += 1;
365 }
366 }
367 if let Some(dist) = score.cursor_distance {
368 cursor_distance_sum += dist;
369 cursor_distance_count += 1;
370 }
371 }
372 }
373
374 if skipped_lines > 0 {
375 println!(
376 "{:<40} (use --verbose to see all {} examples)",
377 format!("... and {} more", skipped_lines),
378 printed_lines + skipped_lines
379 );
380 }
381 println!("{}", separator);
382
383 if !all_delta_chr_f_scores.is_empty() {
384 let avg_delta_chr_f: f32 =
385 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
386 let avg_reversal_ratio: f32 =
387 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
388 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
389
390 let qa_reverts_str = if qa_reverts_total > 0 {
391 format!(
392 "{:.1}%",
393 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
394 )
395 } else {
396 "-".to_string()
397 };
398 let qa_conf_str = if qa_confidence_count > 0 {
399 format!(
400 "{:.1}",
401 qa_confidence_sum as f32 / qa_confidence_count as f32
402 )
403 } else {
404 "-".to_string()
405 };
406 let cursor_str = if cursor_total > 0 {
407 format!(
408 "{:.0}%",
409 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
410 )
411 } else {
412 "-".to_string()
413 };
414 let wrong_er_str = if wrong_editable_region_total > 0 {
415 format!(
416 "{:.2}%",
417 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
418 )
419 } else {
420 "-".to_string()
421 };
422 let isolated_ws_str = if total_scores > 0 {
423 format!(
424 "{}/{} ({:.1}%)",
425 isolated_whitespace_count,
426 total_scores,
427 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
428 )
429 } else {
430 "-".to_string()
431 };
432 let avg_cursor_distance = if cursor_distance_count > 0 {
433 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
434 } else {
435 None
436 };
437
438 println!(
439 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
440 "TOTAL / AVERAGE",
441 avg_delta_chr_f,
442 braces_disbalance_avg,
443 total_exact_lines.f1() * 100.0,
444 avg_reversal_ratio * 100.0,
445 qa_reverts_str,
446 qa_conf_str,
447 cursor_str,
448 wrong_er_str
449 );
450 println!("{}", separator);
451
452 // Print additional cursor metrics if available
453 if let Some(avg_dist) = avg_cursor_distance {
454 println!(
455 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
456 cursor_exact_matches,
457 cursor_total,
458 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
459 avg_dist
460 );
461 }
462
463 // Print isolated whitespace metrics
464 if total_scores > 0 {
465 println!("Isolated whitespace changes: {}", isolated_ws_str);
466 }
467
468 // Print token change percentile summary (only for predictions with a patch)
469 if !patch_inserted_tokens.is_empty() {
470 patch_inserted_tokens.sort_unstable();
471 patch_deleted_tokens.sort_unstable();
472 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
473 .iter()
474 .zip(patch_deleted_tokens.iter())
475 .map(|(i, d)| i + d)
476 .collect();
477 patch_total_tokens.sort_unstable();
478
479 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
480 println!();
481 println!(
482 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
483 predictions_with_patch, total_scores, patch_rate
484 );
485 println!(
486 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
487 "", "p25", "p50", "p75", "p90", "p99"
488 );
489 println!("{}", "─".repeat(LINE_WIDTH));
490 println!(
491 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
492 "Inserted tokens",
493 percentile(&patch_inserted_tokens, 25),
494 percentile(&patch_inserted_tokens, 50),
495 percentile(&patch_inserted_tokens, 75),
496 percentile(&patch_inserted_tokens, 90),
497 percentile(&patch_inserted_tokens, 99),
498 );
499 println!(
500 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
501 "Deleted tokens",
502 percentile(&patch_deleted_tokens, 25),
503 percentile(&patch_deleted_tokens, 50),
504 percentile(&patch_deleted_tokens, 75),
505 percentile(&patch_deleted_tokens, 90),
506 percentile(&patch_deleted_tokens, 99),
507 );
508 println!(
509 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
510 "Total tokens",
511 percentile(&patch_total_tokens, 25),
512 percentile(&patch_total_tokens, 50),
513 percentile(&patch_total_tokens, 75),
514 percentile(&patch_total_tokens, 90),
515 percentile(&patch_total_tokens, 99),
516 );
517 }
518 }
519
520 println!("\n");
521}
522
/// Returns the element of `sorted_values` at percentile `p` (0-100).
///
/// The rank is `p / 100` of the last index, rounded to the nearest index and clamped to the
/// slice, so values of `p` above 100 yield the last element. The slice is expected to be
/// sorted in ascending order. An empty slice yields `0`.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0;
    };
    let fraction = p as f64 / 100.0;
    let rank = (fraction * (sorted_values.len() as f64 - 1.0)).round() as usize;
    sorted_values[rank.min(last_index)]
}
530
/// Shortens `name` to at most `max_len` characters for display in a fixed-width column.
///
/// Names that already fit are returned unchanged. Longer names are cut and suffixed with
/// `"..."` so that the result is exactly `max_len` characters long. When `max_len` is smaller
/// than the ellipsis itself, the name is simply cut to `max_len` characters.
///
/// Lengths are counted in characters rather than bytes, so names containing multi-byte UTF-8
/// characters are never sliced in the middle of a character.
fn truncate_name(name: &str, max_len: usize) -> String {
    // `nth(max_len)` is `None` exactly when the name has at most `max_len` characters.
    if name.chars().nth(max_len).is_none() {
        return name.to_string();
    }

    let (keep, suffix) = if max_len < 3 {
        (max_len, "")
    } else {
        (max_len - 3, "...")
    };
    // Byte offset of the first character that is dropped; always a valid char boundary.
    let cut = name
        .char_indices()
        .nth(keep)
        .map_or(name.len(), |(offset, _)| offset);
    format!("{}{}", &name[..cut], suffix)
}
538
539#[derive(Serialize)]
540pub struct SummaryJson {
541 pub total_examples: usize,
542 pub avg_delta_chr_f: f32,
543 pub avg_braces_disbalance: f32,
544 pub exact_lines_true_positives: usize,
545 pub exact_lines_false_positives: usize,
546 pub exact_lines_false_negatives: usize,
547 pub exact_lines_precision: f64,
548 pub exact_lines_recall: f64,
549 pub exact_lines_f1: f64,
550 pub avg_reversal_ratio: f32,
551 #[serde(skip_serializing_if = "Option::is_none")]
552 pub qa_avg_reverts_edits: Option<f32>,
553 #[serde(skip_serializing_if = "Option::is_none")]
554 pub qa_avg_confidence: Option<f32>,
555 #[serde(skip_serializing_if = "Option::is_none")]
556 pub cursor_exact_match_rate: Option<f32>,
557 #[serde(skip_serializing_if = "Option::is_none")]
558 pub cursor_avg_distance: Option<f32>,
559 #[serde(skip_serializing_if = "Option::is_none")]
560 pub cursor_total_evaluated: Option<usize>,
561 #[serde(skip_serializing_if = "Option::is_none")]
562 pub wrong_editable_region_rate: Option<f32>,
563 pub isolated_whitespace_rate: Option<f32>,
564}
565
566pub fn compute_summary(examples: &[Example]) -> SummaryJson {
567 use crate::metrics::ClassificationMetrics;
568
569 let mut all_delta_chr_f_scores = Vec::new();
570 let mut all_reversal_ratios = Vec::new();
571 let mut braces_disbalance_sum: usize = 0;
572 let mut total_exact_lines = ClassificationMetrics::default();
573 let mut total_scores: usize = 0;
574 let mut qa_reverts_count: usize = 0;
575 let mut qa_reverts_total: usize = 0;
576 let mut qa_confidence_sum: u64 = 0;
577 let mut qa_confidence_count: usize = 0;
578 let mut cursor_exact_matches: usize = 0;
579 let mut cursor_total: usize = 0;
580 let mut cursor_distance_sum: usize = 0;
581 let mut cursor_distance_count: usize = 0;
582 let mut wrong_editable_region_count: usize = 0;
583 let mut wrong_editable_region_total: usize = 0;
584 let mut isolated_whitespace_count: usize = 0;
585
586 for example in examples {
587 for (score_idx, score) in example.score.iter().enumerate() {
588 all_delta_chr_f_scores.push(score.delta_chr_f);
589 all_reversal_ratios.push(score.reversal_ratio);
590 total_scores += 1;
591 braces_disbalance_sum += score.braces_disbalance;
592 total_exact_lines.true_positives += score.exact_lines_tp;
593 total_exact_lines.false_positives += score.exact_lines_fp;
594 total_exact_lines.false_negatives += score.exact_lines_fn;
595
596 // Accumulate QA metrics
597 if let Some(Some(qa)) = example.qa.get(score_idx) {
598 if let Some(reverts) = qa.reverts_edits {
599 qa_reverts_total += 1;
600 if reverts {
601 qa_reverts_count += 1;
602 }
603 }
604 if let Some(conf) = qa.confidence {
605 qa_confidence_sum += conf as u64;
606 qa_confidence_count += 1;
607 }
608 }
609
610 // Accumulate wrong editable region metrics
611 if let Some(wrong) = score.wrong_editable_region {
612 wrong_editable_region_total += 1;
613 if wrong {
614 wrong_editable_region_count += 1;
615 }
616 }
617
618 // Accumulate isolated whitespace metrics
619 if score.has_isolated_whitespace_changes {
620 isolated_whitespace_count += 1;
621 }
622
623 // Accumulate cursor metrics
624 if let Some(exact_match) = score.cursor_exact_match {
625 cursor_total += 1;
626 if exact_match {
627 cursor_exact_matches += 1;
628 }
629 }
630 if let Some(dist) = score.cursor_distance {
631 cursor_distance_sum += dist;
632 cursor_distance_count += 1;
633 }
634 }
635 }
636
637 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
638 0.0
639 } else {
640 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
641 };
642
643 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
644 0.0
645 } else {
646 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
647 };
648
649 let avg_braces_disbalance = if total_scores == 0 {
650 0.0
651 } else {
652 braces_disbalance_sum as f32 / total_scores as f32
653 };
654
655 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
656 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
657 } else {
658 None
659 };
660
661 let qa_avg_confidence = if qa_confidence_count > 0 {
662 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
663 } else {
664 None
665 };
666
667 let cursor_exact_match_rate = if cursor_total > 0 {
668 Some(cursor_exact_matches as f32 / cursor_total as f32)
669 } else {
670 None
671 };
672
673 let cursor_avg_distance = if cursor_distance_count > 0 {
674 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
675 } else {
676 None
677 };
678
679 let cursor_total_evaluated = if cursor_total > 0 {
680 Some(cursor_total)
681 } else {
682 None
683 };
684
685 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
686 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
687 } else {
688 None
689 };
690
691 let isolated_whitespace_rate = if total_scores > 0 {
692 Some(isolated_whitespace_count as f32 / total_scores as f32)
693 } else {
694 None
695 };
696
697 SummaryJson {
698 total_examples: total_scores,
699 avg_delta_chr_f,
700 avg_braces_disbalance,
701 exact_lines_true_positives: total_exact_lines.true_positives,
702 exact_lines_false_positives: total_exact_lines.false_positives,
703 exact_lines_false_negatives: total_exact_lines.false_negatives,
704 exact_lines_precision: total_exact_lines.precision(),
705 exact_lines_recall: total_exact_lines.recall(),
706 exact_lines_f1: total_exact_lines.f1(),
707 avg_reversal_ratio,
708 qa_avg_reverts_edits,
709 qa_avg_confidence,
710 cursor_exact_match_rate,
711 cursor_avg_distance,
712 cursor_total_evaluated,
713 wrong_editable_region_rate,
714 isolated_whitespace_rate,
715 }
716}
717
718pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
719 let summary = compute_summary(examples);
720 let file = File::create(path)
721 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
722 let writer = BufWriter::new(file);
723 serde_json::to_writer_pretty(writer, &summary)
724 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
725 eprintln!("Wrote summary JSON to: {}", path.display());
726 Ok(())
727}