1use crate::{
2 PredictArgs, PredictionProvider,
3 example::{ActualCursor, Example, ExampleScore},
4 format_prompt::TeacherPrompt,
5 headless::EpAppState,
6 metrics,
7 parse_output::parse_prediction_output,
8 predict::run_prediction,
9 progress::{ExampleProgress, Step},
10 reversal_tracking,
11};
12use anyhow::Context as _;
13use gpui::AsyncApp;
14use serde::Serialize;
15use std::fs::File;
16use std::io::BufWriter;
17use std::path::Path;
18use std::sync::Arc;
19use zeta_prompt::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
20
21pub async fn run_scoring(
22 example: &mut Example,
23 args: &PredictArgs,
24 app_state: Arc<EpAppState>,
25 example_progress: &ExampleProgress,
26 cx: AsyncApp,
27) -> anyhow::Result<()> {
28 run_prediction(example, args, app_state, example_progress, cx).await?;
29
30 let progress = example_progress.start(Step::Score);
31
32 progress.set_substatus("applying patches");
33 let prompt_inputs = example
34 .prompt_inputs
35 .as_ref()
36 .context("prompt_inputs is required for scoring - run prediction first or ensure JSON includes prompt_inputs")?;
37 let original_text: &str = prompt_inputs.cursor_excerpt.as_ref();
38 let expected_patches_with_cursors = example.spec.expected_patches_with_cursor_positions();
39
40 let expected_texts: Vec<String> = expected_patches_with_cursors
41 .iter()
42 .map(|(patch, _)| {
43 apply_diff_to_string(patch, original_text)
44 .with_context(|| format!("Expected patch did not apply for {}", example.spec.name))
45 })
46 .collect::<Result<Vec<_>, _>>()?;
47
48 // For Teacher prompts, we need to extract the editable region to properly compute cursor offsets.
49 // The actual_cursor_offset from Teacher is relative to the editable region, while the expected
50 // cursor from the patch is relative to the hunk. We need to apply the patch to the editable
51 // region to find where the hunk matched, then compute the expected cursor position.
52 let old_editable_region = if let Some(p) = example.prompt.as_ref() {
53 if matches!(
54 p.provider,
55 PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_)
56 ) {
57 Some(
58 TeacherPrompt::extract_editable_region(&p.input)?
59 .replace(TeacherPrompt::USER_CURSOR_MARKER, ""),
60 )
61 } else {
62 None
63 }
64 } else {
65 None
66 };
67
68 let zero_scores = ExampleScore {
69 delta_chr_f: 0.0,
70 delta_chr_f_true_positives: 0,
71 delta_chr_f_false_positives: 0,
72 delta_chr_f_false_negatives: 0,
73 delta_chr_f_precision: 0.0,
74 delta_chr_f_recall: 0.0,
75 delta_chr_f_beta: metrics::delta_chr_f_beta(),
76 braces_disbalance: 0,
77 exact_lines_tp: 0,
78 exact_lines_fp: 0,
79 exact_lines_fn: 0,
80 reversal_ratio: 0.0,
81 cursor_distance: None,
82 cursor_exact_match: None,
83 wrong_editable_region: None,
84 has_isolated_whitespace_changes: false,
85 inserted_tokens: 0,
86 deleted_tokens: 0,
87 kept_rate: None,
88 cumulative_logprob: None,
89 avg_logprob: None,
90 };
91
92 let cursor_path = example.spec.cursor_path.as_ref();
93
94 progress.set_substatus("computing metrics");
95 let mut scores = vec![];
96 for prediction in &example.predictions {
97 let actual_patch = prediction.actual_patch.clone().or_else(|| {
98 parse_prediction_output(example, &prediction.actual_output, prediction.provider)
99 .ok()
100 .map(|(patch, _)| patch)
101 });
102
103 let Some(actual_patch) = actual_patch else {
104 scores.push(zero_scores.clone());
105 continue;
106 };
107
108 let token_changes = metrics::count_patch_token_changes(&actual_patch);
109
110 let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
111 Ok(text) => text,
112 Err(_) => {
113 let mut s = zero_scores.clone();
114 s.inserted_tokens = token_changes.inserted_tokens;
115 s.deleted_tokens = token_changes.deleted_tokens;
116 scores.push(s);
117 continue;
118 }
119 };
120
121 let mut best_delta_chr_f_metrics = metrics::DeltaChrFMetrics::default();
122 let mut best_expected_cursor: Option<usize> = None;
123 let mut best_patch_idx: Option<usize> = None;
124 let mut best_expected_text: Option<&str> = None;
125
126 for (idx, expected) in expected_texts.iter().enumerate() {
127 let delta_chr_f_metrics = metrics::delta_chr_f(original_text, expected, &actual_text);
128 if delta_chr_f_metrics.score > best_delta_chr_f_metrics.score {
129 best_delta_chr_f_metrics = delta_chr_f_metrics;
130 best_patch_idx = Some(idx);
131 best_expected_text = Some(expected);
132 }
133 }
134
135 if let Some(idx) = best_patch_idx {
136 // Get the raw cursor offset from the expected patch (relative to hunk new text)
137 let expected_cursor_in_patch = expected_patches_with_cursors
138 .get(idx)
139 .and_then(|(_, cursor)| *cursor);
140
141 // For Teacher prompts, we need to apply the patch to the editable region
142 // to find where the hunk matched, then compute the actual cursor position
143 if let (Some(editable_region), Some(cursor_in_patch)) =
144 (&old_editable_region, expected_cursor_in_patch)
145 {
146 let (patch, _) = &expected_patches_with_cursors[idx];
147 if let Ok((_, hunk_offset)) =
148 apply_diff_to_string_with_hunk_offset(patch, editable_region)
149 {
150 let hunk_start = hunk_offset.unwrap_or(0);
151 best_expected_cursor = Some(hunk_start + cursor_in_patch);
152 }
153 } else {
154 // For non-Teacher prompts or if we can't compute, use raw offset
155 best_expected_cursor = expected_cursor_in_patch;
156 }
157 }
158
159 let disbalance_before = metrics::braces_disbalance(&original_text);
160 let disbalance_after = metrics::braces_disbalance(&actual_text);
161 let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
162
163 // Compute exact lines match against best matching expected patch
164 let best_exact_lines = expected_patches_with_cursors
165 .iter()
166 .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
167 .max_by_key(|m| m.true_positives)
168 .unwrap_or_default();
169
170 // Compute reversal ratio
171 let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
172 prompt_inputs,
173 &actual_text,
174 cursor_path,
175 );
176
177 // Compute cursor position metrics
178 let (cursor_distance, cursor_exact_match) =
179 compute_cursor_metrics(best_expected_cursor, prediction.actual_cursor.as_ref());
180
181 // Compute approximation of editable region correctness
182 let wrong_editable_region = Some(!metrics::is_editable_region_correct(&actual_patch));
183
184 // Check for isolated whitespace changes.
185 let has_isolated_whitespace_changes = metrics::has_isolated_whitespace_changes(
186 &actual_patch,
187 prediction.actual_cursor.as_ref(),
188 );
189
190 let kept_rate = best_expected_text.map(|final_text| {
191 metrics::compute_kept_rate(original_text, &actual_text, final_text).kept_rate
192 });
193
194 scores.push(ExampleScore {
195 delta_chr_f: best_delta_chr_f_metrics.score as f32,
196 delta_chr_f_true_positives: best_delta_chr_f_metrics.counts.true_positives,
197 delta_chr_f_false_positives: best_delta_chr_f_metrics.counts.false_positives,
198 delta_chr_f_false_negatives: best_delta_chr_f_metrics.counts.false_negatives,
199 delta_chr_f_precision: best_delta_chr_f_metrics.precision,
200 delta_chr_f_recall: best_delta_chr_f_metrics.recall,
201 delta_chr_f_beta: best_delta_chr_f_metrics.beta,
202 braces_disbalance,
203 exact_lines_tp: best_exact_lines.true_positives,
204 exact_lines_fp: best_exact_lines.false_positives,
205 exact_lines_fn: best_exact_lines.false_negatives,
206 reversal_ratio,
207 cursor_distance,
208 cursor_exact_match,
209 wrong_editable_region,
210 has_isolated_whitespace_changes,
211 inserted_tokens: token_changes.inserted_tokens,
212 deleted_tokens: token_changes.deleted_tokens,
213 kept_rate,
214 cumulative_logprob: prediction.cumulative_logprob,
215 avg_logprob: prediction.avg_logprob,
216 });
217 }
218
219 example.score = scores;
220 Ok(())
221}
222
223fn compute_cursor_metrics(
224 expected_cursor_editable_region_offset: Option<usize>,
225 actual_cursor: Option<&ActualCursor>,
226) -> (Option<usize>, Option<bool>) {
227 match (expected_cursor_editable_region_offset, actual_cursor) {
228 (Some(expected), Some(actual)) => {
229 let distance = expected.abs_diff(actual.editable_region_offset.unwrap_or_default());
230 let exact_match = distance == 0;
231 (Some(distance), Some(exact_match))
232 }
233 (None, None) => {
234 // Neither has cursor position - skip cursor scoring
235 (None, None)
236 }
237 (Some(_), None) | (None, Some(_)) => {
238 // Only one has cursor position - count as miss
239 (None, Some(false))
240 }
241 }
242}
243
244pub fn print_report(examples: &[Example], verbose: bool) {
245 const MAX_EXAMPLES_DEFAULT: usize = 20;
246 use crate::metrics::ClassificationMetrics;
247
248 const LINE_WIDTH: usize = 101;
249 let separator = "─".repeat(LINE_WIDTH);
250
251 println!("{}", separator);
252 println!(
253 "{:<40} {:>8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>6} {:>5}",
254 "Example", "DeltaChrF", "Brace", "F1", "Revert", "QaRev", "QaConf", "Cursor", "WrgER"
255 );
256 println!("{}", separator);
257
258 let mut all_delta_chr_f_scores = Vec::new();
259 let mut all_reversal_ratios = Vec::new();
260 let mut braces_disbalance_sum: usize = 0;
261 let mut total_delta_chr_f = ClassificationMetrics::default();
262 let mut total_delta_chr_f_precision = 0.0;
263 let mut total_delta_chr_f_recall = 0.0;
264 let mut delta_chr_f_beta = 0.0;
265 let mut total_exact_lines = ClassificationMetrics::default();
266 let mut total_scores: usize = 0;
267 let mut qa_reverts_count: usize = 0;
268 let mut qa_reverts_total: usize = 0;
269 let mut qa_confidence_sum: u64 = 0;
270 let mut qa_confidence_count: usize = 0;
271 let mut cursor_exact_matches: usize = 0;
272 let mut cursor_total: usize = 0;
273 let mut cursor_distance_sum: usize = 0;
274 let mut cursor_distance_count: usize = 0;
275 let mut wrong_editable_region_count: usize = 0;
276 let mut wrong_editable_region_total: usize = 0;
277 let mut isolated_whitespace_count: usize = 0;
278 let mut kept_rate_sum: f64 = 0.0;
279 let mut kept_rate_count: usize = 0;
280 let mut patch_inserted_tokens: Vec<usize> = Vec::new();
281 let mut patch_deleted_tokens: Vec<usize> = Vec::new();
282 let mut predictions_with_patch: usize = 0;
283
284 let mut printed_lines: usize = 0;
285 let mut skipped_lines: usize = 0;
286
287 for example in examples {
288 for (score_idx, score) in example.score.iter().enumerate() {
289 let exact_lines = score.exact_lines_counts();
290
291 // Get QA results for this prediction if available
292 let qa_result = example.qa.get(score_idx).and_then(|q| q.as_ref());
293 let qa_reverts_str = qa_result
294 .and_then(|q| q.reverts_edits)
295 .map(|v| if v { "yes" } else { "no" })
296 .unwrap_or("-");
297 let qa_conf_str = qa_result
298 .and_then(|q| q.confidence)
299 .map(|v| format!("{}", v))
300 .unwrap_or("-".to_string());
301
302 // Format wrong editable region metric
303 let wrong_er_str = match score.wrong_editable_region {
304 Some(true) => "✗",
305 Some(false) => "",
306 None => "",
307 };
308
309 // Format cursor metric
310 let cursor_str = match (score.cursor_exact_match, score.cursor_distance) {
311 (Some(true), _) => "✓".to_string(),
312 (Some(false), Some(dist)) => format!("±{}", dist),
313 (Some(false), None) => "✗".to_string(),
314 (None, _) => "-".to_string(),
315 };
316
317 if verbose || printed_lines < MAX_EXAMPLES_DEFAULT {
318 println!(
319 "{:<40} {:>8.2} {:>5} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
320 truncate_name(&example.spec.name, 40),
321 score.delta_chr_f,
322 score.braces_disbalance,
323 exact_lines.f1() * 100.0,
324 score.reversal_ratio * 100.0,
325 qa_reverts_str,
326 qa_conf_str,
327 cursor_str,
328 wrong_er_str
329 );
330 printed_lines += 1;
331 } else {
332 skipped_lines += 1;
333 }
334
335 all_delta_chr_f_scores.push(score.delta_chr_f);
336 all_reversal_ratios.push(score.reversal_ratio);
337 total_scores += 1;
338 braces_disbalance_sum += score.braces_disbalance;
339 total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
340 total_delta_chr_f_precision += score.delta_chr_f_precision;
341 total_delta_chr_f_recall += score.delta_chr_f_recall;
342 delta_chr_f_beta = score.delta_chr_f_beta;
343 total_exact_lines.accumulate(&score.exact_lines_counts());
344
345 // Accumulate QA metrics
346 if let Some(qa) = qa_result {
347 if let Some(reverts) = qa.reverts_edits {
348 qa_reverts_total += 1;
349 if reverts {
350 qa_reverts_count += 1;
351 }
352 }
353 if let Some(conf) = qa.confidence {
354 qa_confidence_sum += conf as u64;
355 qa_confidence_count += 1;
356 }
357 }
358
359 // Accumulate wrong editable region metrics
360 if let Some(wrong) = score.wrong_editable_region {
361 wrong_editable_region_total += 1;
362 if wrong {
363 wrong_editable_region_count += 1;
364 }
365 }
366
367 // Accumulate isolated whitespace metrics
368 if score.has_isolated_whitespace_changes {
369 isolated_whitespace_count += 1;
370 }
371
372 // Accumulate kept rate metrics
373 if let Some(kr) = score.kept_rate {
374 kept_rate_sum += kr;
375 kept_rate_count += 1;
376 }
377
378 // Accumulate token change metrics (only for predictions that produced a patch)
379 let has_patch = example
380 .predictions
381 .get(score_idx)
382 .and_then(|p| p.actual_patch.as_ref())
383 .is_some_and(|p| !p.is_empty());
384 if has_patch {
385 predictions_with_patch += 1;
386 patch_inserted_tokens.push(score.inserted_tokens);
387 patch_deleted_tokens.push(score.deleted_tokens);
388 }
389
390 // Accumulate cursor metrics
391 if let Some(exact_match) = score.cursor_exact_match {
392 cursor_total += 1;
393 if exact_match {
394 cursor_exact_matches += 1;
395 }
396 }
397 if let Some(dist) = score.cursor_distance {
398 cursor_distance_sum += dist;
399 cursor_distance_count += 1;
400 }
401 }
402 }
403
404 if skipped_lines > 0 {
405 println!(
406 "{:<40} (use --verbose to see all {} examples)",
407 format!("... and {} more", skipped_lines),
408 printed_lines + skipped_lines
409 );
410 }
411 println!("{}", separator);
412
413 if !all_delta_chr_f_scores.is_empty() {
414 let avg_delta_chr_f: f32 =
415 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32;
416 let avg_reversal_ratio: f32 =
417 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32;
418 let braces_disbalance_avg: f32 = braces_disbalance_sum as f32 / total_scores as f32;
419
420 let qa_reverts_str = if qa_reverts_total > 0 {
421 format!(
422 "{:.1}%",
423 qa_reverts_count as f32 / qa_reverts_total as f32 * 100.0
424 )
425 } else {
426 "-".to_string()
427 };
428 let qa_conf_str = if qa_confidence_count > 0 {
429 format!(
430 "{:.1}",
431 qa_confidence_sum as f32 / qa_confidence_count as f32
432 )
433 } else {
434 "-".to_string()
435 };
436 let cursor_str = if cursor_total > 0 {
437 format!(
438 "{:.0}%",
439 cursor_exact_matches as f32 / cursor_total as f32 * 100.0
440 )
441 } else {
442 "-".to_string()
443 };
444 let wrong_er_str = if wrong_editable_region_total > 0 {
445 format!(
446 "{:.2}%",
447 wrong_editable_region_count as f32 / wrong_editable_region_total as f32 * 100.0
448 )
449 } else {
450 "-".to_string()
451 };
452 let isolated_ws_str = if total_scores > 0 {
453 format!(
454 "{}/{} ({:.1}%)",
455 isolated_whitespace_count,
456 total_scores,
457 isolated_whitespace_count as f32 / total_scores as f32 * 100.0
458 )
459 } else {
460 "-".to_string()
461 };
462 let avg_cursor_distance = if cursor_distance_count > 0 {
463 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
464 } else {
465 None
466 };
467
468 println!(
469 "{:<40} {:>8.2} {:>5.1} {:>6.1}% {:>6.1}% {:>7} {:>7} {:>6} {:>5}",
470 "TOTAL / AVERAGE",
471 avg_delta_chr_f,
472 braces_disbalance_avg,
473 total_exact_lines.f1() * 100.0,
474 avg_reversal_ratio * 100.0,
475 qa_reverts_str,
476 qa_conf_str,
477 cursor_str,
478 wrong_er_str
479 );
480 println!("{}", separator);
481 println!(
482 "Delta chrF (β={:.1}): TP={}, FP={}, FN={}, P={:.1}%, R={:.1}%",
483 delta_chr_f_beta,
484 total_delta_chr_f.true_positives,
485 total_delta_chr_f.false_positives,
486 total_delta_chr_f.false_negatives,
487 total_delta_chr_f_precision / total_scores as f64 * 100.0,
488 total_delta_chr_f_recall / total_scores as f64 * 100.0
489 );
490
491 // Print additional cursor metrics if available
492 if let Some(avg_dist) = avg_cursor_distance {
493 println!(
494 "Cursor: {}/{} exact matches ({:.0}%), avg distance: {:.1} bytes",
495 cursor_exact_matches,
496 cursor_total,
497 cursor_exact_matches as f32 / cursor_total as f32 * 100.0,
498 avg_dist
499 );
500 }
501
502 // Print isolated whitespace metrics
503 if total_scores > 0 {
504 println!("Isolated whitespace changes: {}", isolated_ws_str);
505 }
506
507 // Print kept rate metrics
508 if kept_rate_count > 0 {
509 let avg_kept_rate = kept_rate_sum / kept_rate_count as f64;
510 println!(
511 "Kept rate: {:.1}% avg ({} evaluated)",
512 avg_kept_rate * 100.0,
513 kept_rate_count
514 );
515 }
516
517 // Print token change percentile summary (only for predictions with a patch)
518 if !patch_inserted_tokens.is_empty() {
519 patch_inserted_tokens.sort_unstable();
520 patch_deleted_tokens.sort_unstable();
521 let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
522 .iter()
523 .zip(patch_deleted_tokens.iter())
524 .map(|(i, d)| i + d)
525 .collect();
526 patch_total_tokens.sort_unstable();
527
528 let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
529 println!();
530 println!(
531 "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
532 predictions_with_patch, total_scores, patch_rate
533 );
534 println!(
535 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
536 "", "p25", "p50", "p75", "p90", "p99"
537 );
538 println!("{}", "─".repeat(LINE_WIDTH));
539 println!(
540 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
541 "Inserted tokens",
542 percentile(&patch_inserted_tokens, 25),
543 percentile(&patch_inserted_tokens, 50),
544 percentile(&patch_inserted_tokens, 75),
545 percentile(&patch_inserted_tokens, 90),
546 percentile(&patch_inserted_tokens, 99),
547 );
548 println!(
549 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
550 "Deleted tokens",
551 percentile(&patch_deleted_tokens, 25),
552 percentile(&patch_deleted_tokens, 50),
553 percentile(&patch_deleted_tokens, 75),
554 percentile(&patch_deleted_tokens, 90),
555 percentile(&patch_deleted_tokens, 99),
556 );
557 println!(
558 "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
559 "Total tokens",
560 percentile(&patch_total_tokens, 25),
561 percentile(&patch_total_tokens, 50),
562 percentile(&patch_total_tokens, 75),
563 percentile(&patch_total_tokens, 90),
564 percentile(&patch_total_tokens, 99),
565 );
566 }
567 }
568
569 println!("\n");
570}
571
/// Returns the `p`-th percentile of an ascending-sorted slice using nearest-rank rounding.
///
/// The rank is `p / 100 * (len - 1)` rounded to the nearest index and clamped to the last
/// element, so `p` above 100 yields the maximum. An empty slice yields 0.
fn percentile(sorted_values: &[usize], p: usize) -> usize {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0;
    };
    let fraction = p as f64 / 100.0;
    let rank = (fraction * (sorted_values.len() as f64 - 1.0)).round() as usize;
    sorted_values[rank.min(last_index)]
}
579
/// Shortens `name` to at most `max_len` characters, ending with `...` when it was cut.
///
/// Length is measured in `char`s rather than bytes, so names containing multi-byte UTF-8
/// characters are never sliced in the middle of a character and the result matches the
/// character-based padding applied by `{:<N}` formatting. Names that already fit are returned
/// unchanged. For `max_len` below 3 the result is just `...`.
fn truncate_name(name: &str, max_len: usize) -> String {
    if name.chars().count() <= max_len {
        return name.to_string();
    }
    // Reserve room for the ellipsis; saturate so tiny limits cannot underflow.
    let kept_chars = max_len.saturating_sub(3);
    let prefix: String = name.chars().take(kept_chars).collect();
    format!("{}...", prefix)
}
587
588#[derive(Serialize)]
589pub struct SummaryJson {
590 pub total_examples: usize,
591 pub avg_delta_chr_f: f32,
592 pub delta_chr_f_beta: f64,
593 pub delta_chr_f_true_positives: usize,
594 pub delta_chr_f_false_positives: usize,
595 pub delta_chr_f_false_negatives: usize,
596 pub delta_chr_f_precision: f64,
597 pub delta_chr_f_recall: f64,
598 pub avg_braces_disbalance: f32,
599 pub exact_lines_true_positives: usize,
600 pub exact_lines_false_positives: usize,
601 pub exact_lines_false_negatives: usize,
602 pub exact_lines_precision: f64,
603 pub exact_lines_recall: f64,
604 pub exact_lines_f1: f64,
605 pub avg_reversal_ratio: f32,
606 #[serde(skip_serializing_if = "Option::is_none")]
607 pub qa_avg_reverts_edits: Option<f32>,
608 #[serde(skip_serializing_if = "Option::is_none")]
609 pub qa_avg_confidence: Option<f32>,
610 #[serde(skip_serializing_if = "Option::is_none")]
611 pub cursor_exact_match_rate: Option<f32>,
612 #[serde(skip_serializing_if = "Option::is_none")]
613 pub cursor_avg_distance: Option<f32>,
614 #[serde(skip_serializing_if = "Option::is_none")]
615 pub cursor_total_evaluated: Option<usize>,
616 #[serde(skip_serializing_if = "Option::is_none")]
617 pub wrong_editable_region_rate: Option<f32>,
618 pub isolated_whitespace_rate: Option<f32>,
619 #[serde(skip_serializing_if = "Option::is_none")]
620 pub avg_kept_rate: Option<f64>,
621}
622
623pub fn compute_summary(examples: &[Example]) -> SummaryJson {
624 use crate::metrics::ClassificationMetrics;
625
626 let mut all_delta_chr_f_scores = Vec::new();
627 let mut all_reversal_ratios = Vec::new();
628 let mut braces_disbalance_sum: usize = 0;
629 let mut total_delta_chr_f = ClassificationMetrics::default();
630 let mut total_delta_chr_f_precision = 0.0;
631 let mut total_delta_chr_f_recall = 0.0;
632 let mut delta_chr_f_beta = 0.0;
633 let mut total_exact_lines = ClassificationMetrics::default();
634 let mut total_scores: usize = 0;
635 let mut qa_reverts_count: usize = 0;
636 let mut qa_reverts_total: usize = 0;
637 let mut qa_confidence_sum: u64 = 0;
638 let mut qa_confidence_count: usize = 0;
639 let mut cursor_exact_matches: usize = 0;
640 let mut cursor_total: usize = 0;
641 let mut cursor_distance_sum: usize = 0;
642 let mut cursor_distance_count: usize = 0;
643 let mut wrong_editable_region_count: usize = 0;
644 let mut wrong_editable_region_total: usize = 0;
645 let mut isolated_whitespace_count: usize = 0;
646 let mut kept_rate_sum: f64 = 0.0;
647 let mut kept_rate_count: usize = 0;
648
649 for example in examples {
650 for (score_idx, score) in example.score.iter().enumerate() {
651 all_delta_chr_f_scores.push(score.delta_chr_f);
652 all_reversal_ratios.push(score.reversal_ratio);
653 total_scores += 1;
654 braces_disbalance_sum += score.braces_disbalance;
655 total_delta_chr_f.accumulate(&score.delta_chr_f_counts());
656 total_delta_chr_f_precision += score.delta_chr_f_precision;
657 total_delta_chr_f_recall += score.delta_chr_f_recall;
658 delta_chr_f_beta = score.delta_chr_f_beta;
659 total_exact_lines.accumulate(&score.exact_lines_counts());
660
661 // Accumulate QA metrics
662 if let Some(Some(qa)) = example.qa.get(score_idx) {
663 if let Some(reverts) = qa.reverts_edits {
664 qa_reverts_total += 1;
665 if reverts {
666 qa_reverts_count += 1;
667 }
668 }
669 if let Some(conf) = qa.confidence {
670 qa_confidence_sum += conf as u64;
671 qa_confidence_count += 1;
672 }
673 }
674
675 // Accumulate wrong editable region metrics
676 if let Some(wrong) = score.wrong_editable_region {
677 wrong_editable_region_total += 1;
678 if wrong {
679 wrong_editable_region_count += 1;
680 }
681 }
682
683 // Accumulate isolated whitespace metrics
684 if score.has_isolated_whitespace_changes {
685 isolated_whitespace_count += 1;
686 }
687
688 // Accumulate kept rate metrics
689 if let Some(kr) = score.kept_rate {
690 kept_rate_sum += kr;
691 kept_rate_count += 1;
692 }
693
694 // Accumulate cursor metrics
695 if let Some(exact_match) = score.cursor_exact_match {
696 cursor_total += 1;
697 if exact_match {
698 cursor_exact_matches += 1;
699 }
700 }
701 if let Some(dist) = score.cursor_distance {
702 cursor_distance_sum += dist;
703 cursor_distance_count += 1;
704 }
705 }
706 }
707
708 let avg_delta_chr_f = if all_delta_chr_f_scores.is_empty() {
709 0.0
710 } else {
711 all_delta_chr_f_scores.iter().sum::<f32>() / all_delta_chr_f_scores.len() as f32
712 };
713
714 let avg_reversal_ratio = if all_reversal_ratios.is_empty() {
715 0.0
716 } else {
717 all_reversal_ratios.iter().sum::<f32>() / all_reversal_ratios.len() as f32
718 };
719
720 let avg_braces_disbalance = if total_scores == 0 {
721 0.0
722 } else {
723 braces_disbalance_sum as f32 / total_scores as f32
724 };
725
726 let qa_avg_reverts_edits = if qa_reverts_total > 0 {
727 Some(qa_reverts_count as f32 / qa_reverts_total as f32)
728 } else {
729 None
730 };
731
732 let qa_avg_confidence = if qa_confidence_count > 0 {
733 Some(qa_confidence_sum as f32 / qa_confidence_count as f32)
734 } else {
735 None
736 };
737
738 let cursor_exact_match_rate = if cursor_total > 0 {
739 Some(cursor_exact_matches as f32 / cursor_total as f32)
740 } else {
741 None
742 };
743
744 let cursor_avg_distance = if cursor_distance_count > 0 {
745 Some(cursor_distance_sum as f32 / cursor_distance_count as f32)
746 } else {
747 None
748 };
749
750 let cursor_total_evaluated = if cursor_total > 0 {
751 Some(cursor_total)
752 } else {
753 None
754 };
755
756 let wrong_editable_region_rate = if wrong_editable_region_total > 0 {
757 Some(wrong_editable_region_count as f32 / wrong_editable_region_total as f32)
758 } else {
759 None
760 };
761
762 let isolated_whitespace_rate = if total_scores > 0 {
763 Some(isolated_whitespace_count as f32 / total_scores as f32)
764 } else {
765 None
766 };
767
768 let avg_kept_rate = if kept_rate_count > 0 {
769 Some(kept_rate_sum / kept_rate_count as f64)
770 } else {
771 None
772 };
773
774 SummaryJson {
775 total_examples: total_scores,
776 avg_delta_chr_f,
777 delta_chr_f_beta,
778 delta_chr_f_true_positives: total_delta_chr_f.true_positives,
779 delta_chr_f_false_positives: total_delta_chr_f.false_positives,
780 delta_chr_f_false_negatives: total_delta_chr_f.false_negatives,
781 delta_chr_f_precision: if total_scores == 0 {
782 0.0
783 } else {
784 total_delta_chr_f_precision / total_scores as f64
785 },
786 delta_chr_f_recall: if total_scores == 0 {
787 0.0
788 } else {
789 total_delta_chr_f_recall / total_scores as f64
790 },
791 avg_braces_disbalance,
792 exact_lines_true_positives: total_exact_lines.true_positives,
793 exact_lines_false_positives: total_exact_lines.false_positives,
794 exact_lines_false_negatives: total_exact_lines.false_negatives,
795 exact_lines_precision: total_exact_lines.precision(),
796 exact_lines_recall: total_exact_lines.recall(),
797 exact_lines_f1: total_exact_lines.f1(),
798 avg_reversal_ratio,
799 qa_avg_reverts_edits,
800 qa_avg_confidence,
801 cursor_exact_match_rate,
802 cursor_avg_distance,
803 cursor_total_evaluated,
804 wrong_editable_region_rate,
805 isolated_whitespace_rate,
806 avg_kept_rate,
807 }
808}
809
810pub fn write_summary_json(examples: &[Example], path: &Path) -> anyhow::Result<()> {
811 let summary = compute_summary(examples);
812 let file = File::create(path)
813 .with_context(|| format!("Failed to create summary JSON file: {}", path.display()))?;
814 let writer = BufWriter::new(file);
815 serde_json::to_writer_pretty(writer, &summary)
816 .with_context(|| format!("Failed to write summary JSON to: {}", path.display()))?;
817 eprintln!("Wrote summary JSON to: {}", path.display());
818 Ok(())
819}