1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
14use gpui::AsyncApp;
15use serde::Serialize;
16use std::fs::File;
17use std::io::BufWriter;
18use std::path::Path;
19use std::sync::Arc;
20
21pub async fn run_scoring(
22 example: &mut Example,
23 args: &PredictArgs,
24 app_state: Arc<EpAppState>,
25 example_progress: &ExampleProgress,
26 cx: AsyncApp,
27) -> anyhow::Result<()> {
28 run_prediction(example, args, app_state, example_progress, cx).await?;
29
30 let progress = example_progress.start(Step::Score);
31
32 progress.set_substatus("applying patches");
33 let prompt_inputs = example
34 .prompt_inputs
35 .as_ref()
36 .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
37 let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
38 let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
39
40 let expected_texts: Vec<String> = expected_patches_with_cursors
41 .iter()
42 .map(|(patch, _)| {
43 apply_diff_to_string(patch, original_text)
44 .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
45 })
46 .collect::<Result<Vec<_>, _>>()?;
47
48 // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
49 // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
50 // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
51 // region to find where the hunk matched, then compute the expected cursor position.
52 let old_editable_region = if let Some(p) = example.prompt.as_ref() {
53 if matches!(
54 p.provider,
55 PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
56 ) {
57 Some(
58 TeacherPrompt::extract_editable_region(&p.input)?
59 .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
60 )
61 } else {
62 None
63 }
64 } else {
65 None
66 };
67
68 let zero_scores = ExampleScore {
69 delta_chr_f: 0.0,
70 delta_chr_f_true_positives: 0,
71 delta_chr_f_false_positives: 0,
72 delta_chr_f_false_negatives: 0,
73 delta_chr_f_precision: 0.0,
74 delta_chr_f_recall: 0.0,
75 delta_chr_f_beta: metrics::delta_chr_f_beta(),
76 braces_disbalance: 0,
77 exact_lines_tp: 0,
78 exact_lines_fp: 0,
79 exact_lines_fn: 0,
80 reversal_ratio: 0.0,
81 cursor_distance: None,
82 cursor_exact_match: None,
83 wrong_editable_region: None,
84 has_isolated_whitespace_changes: false,
85 inserted_tokens: 0,
86 deleted_tokens: 0,
87 cumulative_logprob: None,
88 avg_logprob: None,
89 };
90
91 let cursor_path = example.spec.cursor_path.as_ref();
92
93 progress.set_substatus("computing metrics");
94 let mut scores = vec![];
95 for prediction in &example.predictions {
96 let actual_patch = prediction.actual_patch.clone().or_else(|| {
97 parse_prediction_output(example, &prediction.actual_output, prediction.provider)
98 .ok()
99 .map(|(patch, _)| patch)
100 });
101
102 let Some(actual_patch) = actual_patch else {
103 scores.push(zero_scores.clone());
104 continue;
105 };
106
107 let token_changes = metrics::count_patch_token_changes(&actual_patch);
108
109 let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
110 Ok(text) => text,
111 Err(_) => {
112 let mut s = zero_scores.clone();
113 s.inserted_tokens = token_changes.inserted_tokens;
114 s.deleted_tokens = token_changes.deleted_tokens;
115 scores.push(s);
116 continue;
117 }
118 };
119
120 let mut best_delta_chr_f_metrics = metrics::DeltaChrFMetrics::default();
121 let mut best_expected_cursor: Option<usize> = None;
122 let mut best_patch_idx: Option<usize> = None;
123
124 for (idx, expected) in expected_texts.iter().enumerate() {
125 let delta_chr_f_metrics = metrics::delta_chr_f(original_text, expected, &actual_text);
126 if delta_chr_f_metrics.score > best_delta_chr_f_metrics.score {
127 best_delta_chr_f_metrics = delta_chr_f_metrics;
128 best_patch_idx = Some(idx);
129 }
130 }
131
132 if let Some(idx) = best_patch_idx {
133 // Get the raw cursor offset from the expected patch (relative to hunk new text)
134 let expected_cursor_in_patch = expected_patches_with_cursors
135 .get(idx)
136 .and_then(|(_, cursor)| *cursor);
137
138 // For Teacher prompts, we need to apply the patch to the editable region
139 // to find where the hunk matched, then compute the actual cursor position
140 if let (Some(editable_region), Some(cursor_in_patch)) =
141 (&old_editable_region, expected_cursor_in_patch)
142 {
143 let (patch, _) = &expected_patches_with_cursors[idx];
144 if let Ok((_, hunk_offset)) =
145 apply_diff_to_string_with_hunk_offset(patch, editable_region)
146 {
147 let hunk_start = hunk_offset.unwrap_or(0);
148 best_expected_cursor = Some(hunk_start + cursor_in_patch);
149 }
150 } else {
151 // For non-Teacher prompts or if we can't compute, use raw offset
152 best_expected_cursor = expected_cursor_in_patch;
153 }
154 }
155
156 let disbalance_before = metrics::braces_disbalance(&original_text);
157 let disbalance_after = metrics::braces_disbalance(&actual_text);
158 let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
159
160 // Compute exact lines match against best matching expected patch
161 let best_exact_lines = expected_patches_with_cursors
162 .iter()
163 .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
164 .max_by_key(|m| m.true_positives)
165 .unwrap_or_default();
166
167 // Compute reversal ratio
168 let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
169 prompt_inputs,
170 &actual_text,
171 cursor_path,
172 );
173
174 // Compute cursor position metrics
175 let (cursor_distance, cursor_exact_match) =
176 compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
177
178 // Compute approximation of editable region correctness
179 let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
180
181 // Check for isolated whitespace changes.
182 let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
183 &actual_patch,
184 prediction.actual_cursor.as_ref(),
185 );
186
187 scores.push(ExampleScore {
188 delta_chr_f: best_delta_chr_f_metrics.score as f32,
189 delta_chr_f_true_positives: best_delta_chr_f_metrics.counts.true_positives,
190 delta_chr_f_false_positives: best_delta_chr_f_metrics.counts.false_positives,
191 delta_chr_f_false_negatives: best_delta_chr_f_metrics.counts.false_negatives,
192 delta_chr_f_precision: best_delta_chr_f_metrics.precision,
193 delta_chr_f_recall: best_delta_chr_f_metrics.recall,
194 delta_chr_f_beta: best_delta_chr_f_metrics.beta,
195 braces_disbalance,
196 exact_lines_tp: best_exact_lines.true_positives,
197 exact_lines_fp: best_exact_lines.false_positives,
198 exact_lines_fn: best_exact_lines.false_negatives,
199 reversal_ratio,
200 cursor_distance,
201 cursor_exact_match,
202 wrong_editable_region,
203 has_isolated_whitespace_changes,
204 inserted_tokens: token_changes.inserted_tokens,
205 deleted_tokens: token_changes.deleted_tokens,
206 cumulative_logprob: prediction.cumulative_logprob,
207 avg_logprob: prediction.avg_logprob,
208 });
209 }
210
211 example.score = scores;
212 Ok(())
213}
214
215fn compute_cursor_metrics(
216 expected_cursor_editable_region_offset: Option<usize>,
217 actual_cursor: Option<&ActualCursor>,
218) -> (Option<usize>, Option<bool>) {
219 match (expected_cursor_editable_region_offset, actual_cursor) {
220 (Some(expected), Some(actual)) => {
221 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
222 let exact_match = distance == 0;
223 (Some(distance), Some(exact_match))
224 }
225 (None, None) => {
226 // Neither has cursor position - skip cursor scoring
227 (None, None)
228 }
229 (Some(_), None) | (None, Some(_)) => {
230 // Only one has cursor position - count as miss
231 (None, Some(false))
232 }
233 }
234}
235
236pub fn print_report(examples: &[Example], verbose: bool) {
237 const MAX_EXAMPLES_DEFAULT: usize = 20;
238 use crate::metrics::ClassificationMetrics;
239
240 const LINE_WIDTH: usize = 101;
241 let separator = "─".repeat(LINE_WIDTH);
242
243 println!("{}", separator);
244 println!(
245 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
246 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
247 );
248 println!("{}", separator);
249
250 let mut all_delta_chr_f_scores = Vec::new();
251 let mut all_reversal_ratios = Vec::new();
252 let mut braces_disbalance_sum: usize = 0;
253 let mut total_delta_chr_f = ClassificationMetrics::default();
254 let mut total_delta_chr_f_precision = 0.0;
255 let mut total_delta_chr_f_recall = 0.0;
256 let mut delta_chr_f_beta = 0.0;
257 let mut total_exact_lines = ClassificationMetrics::default();
258 let mut total_scores: usize = 0;
259 let mut qa_reverts_count: usize = 0;
260 let mut qa_reverts_total: usize = 0;
261 let mut qa_confidence_sum: u64 = 0;
262 let mut qa_confidence_count: usize = 0;
263 let mut cursor_exact_matches: usize = 0;
264 let mut cursor_total: usize = 0;
265 let mut cursor_distance_sum: usize = 0;
266 let mut cursor_distance_count: usize = 0;
267 let mut wrong_editable_region_count: usize = 0;
268 let mut wrong_editable_region_total: usize = 0;
269 let mut isolated_whitespace_count: usize = 0;
270 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
271 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
272 let mut predictions_with_patch: usize = 0;
273
274 let mut printed_lines: usize = 0;
275 let mut skipped_lines: usize = 0;
276
277 for example in examples {
278 for (score_idx, score) in example.score.iter().enumerate() {
279 let exact_lines = score.exact_lines_counts();
280
281 // Get QA results for this prediction if available
282 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
283 let qa_reverts_str = qa_result
284 .and_then(|q| q.reverts_edits)
285 .map(|v| if v { "yes" } else { "no" })
286 .unwrap_or("-");
287 let qa_conf_str = qa_result
288 .and_then(|q| q.confidence)
289 .map(|v| format!("{}", v))
290 .unwrap_or("-".to_string());
291
292 // Format wrong editable region metric
293 let wrong_er_str = match score.wrong_editable_region {
294 Some(true) => "✗",
295 Some(false) => "",
296 None => "",
297 };
298
299 // Format cursor metric
300 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
301 (Some(true), _) => "✓".to_string(),
302 (Some(false), Some(dist)) => format!("±{}", dist),
303 (Some(false), None) => "✗".to_string(),
304 (None, _) => "-".to_string(),
305 };
306
307 if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
308 println!(
309 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
310 truncate_name(&example.spec.name, 40),
311 score.delta_chr_f,
312 score.braces_disbalance,
313 exact_lines.f1() * 100.0,
314 score.reversal_ratio * 100.0,
315 qa_reverts_str,
316 qa_conf_str,
317 cursor_str,
318 wrong_er_str
319 );
320 printed_lines += 1;
321 } else {
322 skipped_lines += 1;
323 }
324
325 all_delta_chr_f_scores.push(score.delta_chr_f);
326 all_reversal_ratios.push(score.reversal_ratio);
327 total_scores += 1;
328 braces_disbalance_sum += score.braces_disbalance;
329 total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
330 total_delta_chr_f_precision += score.delta_chr_f_precision;
331 total_delta_chr_f_recall += score.delta_chr_f_recall;
332 delta_chr_f_beta = score.delta_chr_f_beta;
333 total_exact_lines.accumulate(&score.exact_lines_counts());
334
335 // Accumulate QA metrics
336 if let Some(qa) = qa_result {
337 if let Some(reverts) = qa.reverts_edits {
338 qa_reverts_total += 1;
339 if reverts {
340 qa_reverts_count += 1;
341 }
342 }
343 if let Some(conf) = qa.confidence {
344 qa_confidence_sum += conf as u64;
345 qa_confidence_count += 1;
346 }
347 }
348
349 // Accumulate wrong editable region metrics
350 if let Some(wrong) = score.wrong_editable_region {
351 wrong_editable_region_total += 1;
352 if wrong {
353 wrong_editable_region_count += 1;
354 }
355 }
356
357 // Accumulate isolated whitespace metrics
358 if score.has_isolated_whitespace_changes {
359 isolated_whitespace_count += 1;
360 }
361
362 // Accumulate token change metrics (only for predictions that produced a patch)
363 let has_patch = example
364 .predictions
365 .get(score_idx)
366 .and_then(|p| p.actual_patch.as_ref())
367 .is_some_and(|p| !p.is_empty());
368 if has_patch {
369 predictions_with_patch += 1;
370 patch_inserted_tokens.push(score.inserted_tokens);
371 patch_deleted_tokens.push(score.deleted_tokens);
372 }
373
374 // Accumulate cursor metrics
375 if let Some(exact_match) = score.cursor_exact_match {
376 cursor_total += 1;
377 if exact_match {
378 cursor_exact_matches += 1;
379 }
380 }
381 if let Some(dist) = score.cursor_distance {
382 cursor_distance_sum += dist;
383 cursor_distance_count += 1;
384 }
385 }
386 }
387
388 if skipped_lines > 0 {
389 println!(
390 "{:<40} (use --verbose to see all {} examples)",
391 format!("... and {} more", skipped_lines),
392 printed_lines + skipped_lines
393 );
394 }
395 println!("{}", separator);
396
397 if !all_delta_chr_f_scores.is_empty() {
398 let avg_delta_chr_f: f32 =
399 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
400 let avg_reversal_ratio: f32 =
401 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
402 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
403
404 let qa_reverts_str = if qa_reverts_total > 0 {
405 format!(
406 "{:.1}%",
407 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
408 )
409 } else {
410 "-".to_string()
411 };
412 let qa_conf_str = if qa_confidence_count > 0 {
413 format!(
414 "{:.1}",
415 qa_confidence_sum as f32 / qa_confidence_count as f32
416 )
417 } else {
418 "-".to_string()
419 };
420 let cursor_str = if cursor_total > 0 {
421 format!(
422 "{:.0}%",
423 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
424 )
425 } else {
426 "-".to_string()
427 };
428 let wrong_er_str = if wrong_editable_region_total > 0 {
429 format!(
430 "{:.2}%",
431 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
432 )
433 } else {
434 "-".to_string()
435 };
436 let isolated_ws_str = if total_scores > 0 {
437 format!(
438 "{}/{} ({:.1}%)",
439 isolated_whitespace_count,
440 total_scores,
441 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
442 )
443 } else {
444 "-".to_string()
445 };
446 let avg_cursor_distance = if cursor_distance_count > 0 {
447 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
448 } else {
449 None
450 };
451
452 println!(
453 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
454 "TOTAL / AVERAGE",
455 avg_delta_chr_f,
456 braces_disbalance_avg,
457 total_exact_lines.f1() * 100.0,
458 avg_reversal_ratio * 100.0,
459 qa_reverts_str,
460 qa_conf_str,
461 cursor_str,
462 wrong_er_str
463 );
464 println!("{}", separator);
465 println!(
466 "Delta chrF (β={:.1}): TP={}, FP={}, FN={}, P={:.1}%, R={:.1}%",
467 delta_chr_f_beta,
468 total_delta_chr_f.true_positives,
469 total_delta_chr_f.false_positives,
470 total_delta_chr_f.false_negatives,
471 total_delta_chr_f_precision / total_scores as f64 * 100.0,
472 total_delta_chr_f_recall / total_scores as f64 * 100.0
473 );
474
475 // Print additional cursor metrics if available
476 if let Some(avg_dist) = avg_cursor_distance {
477 println!(
478 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
479 cursor_exact_matches,
480 cursor_total,
481 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
482 avg_dist
483 );
484 }
485
486 // Print isolated whitespace metrics
487 if total_scores > 0 {
488 println!("Isolated whitespace changes: {}", isolated_ws_str);
489 }
490
491 // Print token change percentile summary (only for predictions with a patch)
492 if !patch_inserted_tokens.is_empty() {
493 patch_inserted_tokens.sort_unstable();
494 patch_deleted_tokens.sort_unstable();
495 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
496 .iter()
497 .zip(patch_deleted_tokens.iter())
498 .map(|(i, d)| i + d)
499 .collect();
500 patch_total_tokens.sort_unstable();
501
502 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
503 println!();
504 println!(
505 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
506 predictions_with_patch, total_scores, patch_rate
507 );
508 println!(
509 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
510 "", "p25", "p50", "p75", "p90", "p99"
511 );
512 println!("{}", "─".repeat(LINE_WIDTH));
513 println!(
514 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
515 "Inserted tokens",
516 percentile(&patch_inserted_tokens, 25),
517 percentile(&patch_inserted_tokens, 50),
518 percentile(&patch_inserted_tokens, 75),
519 percentile(&patch_inserted_tokens, 90),
520 percentile(&patch_inserted_tokens, 99),
521 );
522 println!(
523 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
524 "Deleted tokens",
525 percentile(&patch_deleted_tokens, 25),
526 percentile(&patch_deleted_tokens, 50),
527 percentile(&patch_deleted_tokens, 75),
528 percentile(&patch_deleted_tokens, 90),
529 percentile(&patch_deleted_tokens, 99),
530 );
531 println!(
532 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
533 "Total tokens",
534 percentile(&patch_total_tokens, 25),
535 percentile(&patch_total_tokens, 50),
536 percentile(&patch_total_tokens, 75),
537 percentile(&patch_total_tokens, 90),
538 percentile(&patch_total_tokens, 99),
539 );
540 }
541 }
542
543 println!("\n");
544}
545
/// Returns the `p`-th percentile of `sorted_values` using the nearest-rank position
/// `round(p / 100 * (len - 1))`, clamped to the last element.
///
/// The slice must already be sorted ascending. An empty slice yields `0`.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_idx) = sorted_values.len().checked_sub(1) else {
        return 0;
    };

    let fraction = p as f64 / 100.0;
    let span = sorted_values.len() as f64 - 1.0;
    let rank = (fraction * span).round() as usize;

    // Values of `p` above 100 would point past the end; clamp to the maximum.
    sorted_values[rank.min(last_idx)]
}
553
/// Shortens `name` to at most `max_len` bytes, replacing the cut-off tail with `...`.
///
/// Names that already fit are returned unchanged. Otherwise the first `max_len - 3`
/// bytes are kept, backing off to the previous UTF-8 character boundary when the cut
/// would fall inside a multi-byte character, so non-ASCII names never cause a panic.
///
/// When `max_len` is smaller than 3 there is no room for any prefix and the result is
/// just `...` (the only case where the result is longer than `max_len`).
fn truncate_name(name: &str, max_len: usize) -> String {
    if name.len() <= max_len {
        return name.to_string();
    }

    // `name.len() > max_len >= cut`, so `cut` is always within the string; index 0 is a
    // character boundary, which guarantees the loop terminates.
    let mut cut = max_len.saturating_sub(3);
    while !name.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &name[..cut])
}
561
562#[derive(Serialize)]
563pub struct SummaryJson {
564 pub total_examples: usize,
565 pub avg_delta_chr_f: f32,
566 pub delta_chr_f_beta: f64,
567 pub delta_chr_f_true_positives: usize,
568 pub delta_chr_f_false_positives: usize,
569 pub delta_chr_f_false_negatives: usize,
570 pub delta_chr_f_precision: f64,
571 pub delta_chr_f_recall: f64,
572 pub avg_braces_disbalance: f32,
573 pub exact_lines_true_positives: usize,
574 pub exact_lines_false_positives: usize,
575 pub exact_lines_false_negatives: usize,
576 pub exact_lines_precision: f64,
577 pub exact_lines_recall: f64,
578 pub exact_lines_f1: f64,
579 pub avg_reversal_ratio: f32,
580 #[serde(skip_serializing_if = "Option::is_none")]
581 pub qa_avg_reverts_edits: Option<f32>,
582 #[serde(skip_serializing_if = "Option::is_none")]
583 pub qa_avg_confidence: Option<f32>,
584 #[serde(skip_serializing_if = "Option::is_none")]
585 pub cursor_exact_match_rate: Option<f32>,
586 #[serde(skip_serializing_if = "Option::is_none")]
587 pub cursor_avg_distance: Option<f32>,
588 #[serde(skip_serializing_if = "Option::is_none")]
589 pub cursor_total_evaluated: Option<usize>,
590 #[serde(skip_serializing_if = "Option::is_none")]
591 pub wrong_editable_region_rate: Option<f32>,
592 pub isolated_whitespace_rate: Option<f32>,
593}
594
595pub fn compute_summary(examples: &[Example]) -> SummaryJson {
596 use crate::metrics::ClassificationMetrics;
597
598 let mut all_delta_chr_f_scores = Vec::new();
599 let mut all_reversal_ratios = Vec::new();
600 let mut braces_disbalance_sum: usize = 0;
601 let mut total_delta_chr_f = ClassificationMetrics::default();
602 let mut total_delta_chr_f_precision = 0.0;
603 let mut total_delta_chr_f_recall = 0.0;
604 let mut delta_chr_f_beta = 0.0;
605 let mut total_exact_lines = ClassificationMetrics::default();
606 let mut total_scores: usize = 0;
607 let mut qa_reverts_count: usize = 0;
608 let mut qa_reverts_total: usize = 0;
609 let mut qa_confidence_sum: u64 = 0;
610 let mut qa_confidence_count: usize = 0;
611 let mut cursor_exact_matches: usize = 0;
612 let mut cursor_total: usize = 0;
613 let mut cursor_distance_sum: usize = 0;
614 let mut cursor_distance_count: usize = 0;
615 let mut wrong_editable_region_count: usize = 0;
616 let mut wrong_editable_region_total: usize = 0;
617 let mut isolated_whitespace_count: usize = 0;
618
619 for example in examples {
620 for (score_idx, score) in example.score.iter().enumerate() {
621 all_delta_chr_f_scores.push(score.delta_chr_f);
622 all_reversal_ratios.push(score.reversal_ratio);
623 total_scores += 1;
624 braces_disbalance_sum += score.braces_disbalance;
625 total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
626 total_delta_chr_f_precision += score.delta_chr_f_precision;
627 total_delta_chr_f_recall += score.delta_chr_f_recall;
628 delta_chr_f_beta = score.delta_chr_f_beta;
629 total_exact_lines.accumulate(&score.exact_lines_counts());
630
631 // Accumulate QA metrics
632 if let Some(Some(qa)) = example.qa.get(score_idx) {
633 if let Some(reverts) = qa.reverts_edits {
634 qa_reverts_total += 1;
635 if reverts {
636 qa_reverts_count += 1;
637 }
638 }
639 if let Some(conf) = qa.confidence {
640 qa_confidence_sum += conf as u64;
641 qa_confidence_count += 1;
642 }
643 }
644
645 // Accumulate wrong editable region metrics
646 if let Some(wrong) = score.wrong_editable_region {
647 wrong_editable_region_total += 1;
648 if wrong {
649 wrong_editable_region_count += 1;
650 }
651 }
652
653 // Accumulate isolated whitespace metrics
654 if score.has_isolated_whitespace_changes {
655 isolated_whitespace_count += 1;
656 }
657
658 // Accumulate cursor metrics
659 if let Some(exact_match) = score.cursor_exact_match {
660 cursor_total += 1;
661 if exact_match {
662 cursor_exact_matches += 1;
663 }
664 }
665 if let Some(dist) = score.cursor_distance {
666 cursor_distance_sum += dist;
667 cursor_distance_count += 1;
668 }
669 }
670 }
671
672 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
673 0.0
674 } else {
675 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
676 };
677
678 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
679 0.0
680 } else {
681 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
682 };
683
684 let avg_braces_disbalance = if total_scores == 0 {
685 0.0
686 } else {
687 braces_disbalance_sum as f32 / total_scores as f32
688 };
689
690 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
691 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
692 } else {
693 None
694 };
695
696 let qa_avg_confidence = if qa_confidence_count > 0 {
697 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
698 } else {
699 None
700 };
701
702 let cursor_exact_match_rate = if cursor_total > 0 {
703 Some(cursor_exact_matches as f32 / cursor_total as f32)
704 } else {
705 None
706 };
707
708 let cursor_avg_distance = if cursor_distance_count > 0 {
709 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
710 } else {
711 None
712 };
713
714 let cursor_total_evaluated = if cursor_total > 0 {
715 Some(cursor_total)
716 } else {
717 None
718 };
719
720 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
721 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
722 } else {
723 None
724 };
725
726 let isolated_whitespace_rate = if total_scores > 0 {
727 Some(isolated_whitespace_count as f32 / total_scores as f32)
728 } else {
729 None
730 };
731
732 SummaryJson {
733 total_examples: total_scores,
734 avg_delta_chr_f,
735 delta_chr_f_beta,
736 delta_chr_f_true_positives: total_delta_chr_f.true_positives,
737 delta_chr_f_false_positives: total_delta_chr_f.false_positives,
738 delta_chr_f_false_negatives: total_delta_chr_f.false_negatives,
739 delta_chr_f_precision: if total_scores == 0 {
740 0.0
741 } else {
742 total_delta_chr_f_precision / total_scores as f64
743 },
744 delta_chr_f_recall: if total_scores == 0 {
745 0.0
746 } else {
747 total_delta_chr_f_recall / total_scores as f64
748 },
749 avg_braces_disbalance,
750 exact_lines_true_positives: total_exact_lines.true_positives,
751 exact_lines_false_positives: total_exact_lines.false_positives,
752 exact_lines_false_negatives: total_exact_lines.false_negatives,
753 exact_lines_precision: total_exact_lines.precision(),
754 exact_lines_recall: total_exact_lines.recall(),
755 exact_lines_f1: total_exact_lines.f1(),
756 avg_reversal_ratio,
757 qa_avg_reverts_edits,
758 qa_avg_confidence,
759 cursor_exact_match_rate,
760 cursor_avg_distance,
761 cursor_total_evaluated,
762 wrong_editable_region_rate,
763 isolated_whitespace_rate,
764 }
765}
766
767pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
768 let summary = compute_summary(examples);
769 let file = File::create(path)
770 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
771 let writer = BufWriter::new(file);
772 serde_json::to_writer_pretty(writer, &summary)
773 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
774 eprintln!("Wrote summary JSON to: {}", path.display());
775 Ok(())
776}