From 99d7b2fa1de8c9d5b6d33db0d4c90ba1571e2f8e Mon Sep 17 00:00:00 2001 From: Oleksiy Syvokon Date: Thu, 27 Nov 2025 11:10:35 +0200 Subject: [PATCH] zeta2: Compute diff-aware chrF metric (#43485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zeta evals now include a character n-gram metric adapted for multi-edit diffs (“delta chrF”). It works as follows: 1. Reconstruct the original, golden (expected), and actual texts from unified diffs. - "original": the text before any edits - "golden": the text after applying the expected edits - "actual": the text after applying the actual edits 2. Compute n-gram count deltas between original→golden and original→actual. - n-grams are computed as in chrF (max n=6, whitespace ignored). 3. Compare these deltas to assess how well the actual edits match the expected edits. - As in standard chrF, classify n-grams as true positives, false positives, and false negatives, and report the F-beta score with beta=2. Release Notes: - N/A --- crates/language/src/buffer.rs | 4 +- crates/zeta_cli/src/evaluate.rs | 203 +++++++---------- crates/zeta_cli/src/main.rs | 1 + crates/zeta_cli/src/metrics.rs | 380 ++++++++++++++++++++++++++++++++ 4 files changed, 464 insertions(+), 124 deletions(-) create mode 100644 crates/zeta_cli/src/metrics.rs diff --git a/crates/language/src/buffer.rs b/crates/language/src/buffer.rs index 66967f9a3357e13485b8228b06874804a8768fac..7d713d515b2ae9584bc922d08d5811155f83d3a8 100644 --- a/crates/language/src/buffer.rs +++ b/crates/language/src/buffer.rs @@ -758,8 +758,8 @@ impl EditPreview { .to_point(&self.applied_edits_snapshot); let start = Point::new(start.row.saturating_sub(3), 0); - let old_end = Point::new(old_end.row + 3, 0).min(self.old_snapshot.max_point()); - let new_end = Point::new(new_end.row + 3, 0).min(self.applied_edits_snapshot.max_point()); + let old_end = Point::new(old_end.row + 4, 0).min(self.old_snapshot.max_point()); + let new_end = Point::new(new_end.row 
+ 4, 0).min(self.applied_edits_snapshot.max_point()); Some(unified_diff( &self diff --git a/crates/zeta_cli/src/evaluate.rs b/crates/zeta_cli/src/evaluate.rs index 6726dcb3aafdeff7fe41cbbbc49850c1e7465cf4..043844768557ad46f61d5fd0d809e1e85c62574f 100644 --- a/crates/zeta_cli/src/evaluate.rs +++ b/crates/zeta_cli/src/evaluate.rs @@ -1,3 +1,4 @@ +use crate::metrics::{self, Scores}; use std::{ collections::HashMap, io::{IsTerminal, Write}, @@ -5,7 +6,6 @@ use std::{ }; use anyhow::Result; -use collections::HashSet; use gpui::{AsyncApp, Entity}; use project::Project; use util::ResultExt as _; @@ -119,13 +119,14 @@ fn write_aggregated_scores( } if successful.len() > 1 { - let mut edit_predictions = successful + let edit_scores = successful .iter() - .filter_map(|r| r.edit_prediction.as_ref()) - .peekable(); - let has_edit_predictions = edit_predictions.peek().is_some(); + .filter_map(|r| r.edit_scores.clone()) + .collect::<Vec<_>>(); + let has_edit_predictions = edit_scores.len() > 0; let aggregated_result = EvaluationResult { - edit_prediction: has_edit_predictions.then(|| Scores::aggregate(edit_predictions)), + context_scores: Scores::aggregate(successful.iter().map(|r| &r.context_scores)), + edit_scores: has_edit_predictions.then(|| EditScores::aggregate(&edit_scores)), prompt_len: successful.iter().map(|r| r.prompt_len).sum::<usize>() / successful.len(), generated_len: successful.iter().map(|r| r.generated_len).sum::<usize>() / successful.len(), @@ -247,94 +248,27 @@ fn write_eval_result( anyhow::Ok(()) } -#[derive(Debug, Default)] -pub struct EvaluationResult { - pub edit_prediction: Option<Scores>, - pub prompt_len: usize, - pub generated_len: usize, -} - -#[derive(Default, Debug)] -pub struct Scores { - pub true_positives: usize, - pub false_positives: usize, - pub false_negatives: usize, +#[derive(Debug, Default, Clone)] +pub struct EditScores { + pub line_match: Scores, + pub chr_f: f64, } -impl Scores { - pub fn new(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores { - let true_positives = 
expected.intersection(actual).count(); - let false_positives = actual.difference(expected).count(); - let false_negatives = expected.difference(actual).count(); - - Scores { - true_positives, - false_positives, - false_negatives, - } - } - - pub fn to_markdown(&self) -> String { - format!( - " -Precision : {:.4} -Recall : {:.4} -F1 Score : {:.4} -True Positives : {} -False Positives : {} -False Negatives : {}", - self.precision(), - self.recall(), - self.f1_score(), - self.true_positives, - self.false_positives, - self.false_negatives - ) - } - - pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores { - let mut true_positives = 0; - let mut false_positives = 0; - let mut false_negatives = 0; - - for score in scores { - true_positives += score.true_positives; - false_positives += score.false_positives; - false_negatives += score.false_negatives; - } - - Scores { - true_positives, - false_positives, - false_negatives, - } - } +impl EditScores { + pub fn aggregate(scores: &[EditScores]) -> EditScores { + let line_match = Scores::aggregate(scores.iter().map(|s| &s.line_match)); + let chr_f = scores.iter().map(|s| s.chr_f).sum::<f64>() / scores.len() as f64; - pub fn precision(&self) -> f64 { - if self.true_positives + self.false_positives == 0 { - 0.0 - } else { - self.true_positives as f64 / (self.true_positives + self.false_positives) as f64 - } - } - - pub fn recall(&self) -> f64 { - if self.true_positives + self.false_negatives == 0 { - 0.0 - } else { - self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64 - } + EditScores { line_match, chr_f } } +} - pub fn f1_score(&self) -> f64 { - let recall = self.recall(); - let precision = self.precision(); - if precision + recall == 0.0 { - 0.0 - } else { - 2.0 * precision * recall / (precision + recall) - } - } +#[derive(Debug, Default)] +pub struct EvaluationResult { + pub edit_scores: Option<EditScores>, + pub context_scores: Scores, + pub prompt_len: usize, + pub generated_len: usize, } impl std::fmt::Display 
for EvaluationResult { @@ -349,40 +283,74 @@ impl std::fmt::Display for EvaluationResult { impl EvaluationResult { fn fmt_markdown(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if let Some(prediction) = &self.edit_prediction { + write!( + f, + r#" +### Context Scores +{} +"#, + self.context_scores.to_markdown(), + )?; + if let Some(scores) = &self.edit_scores { write!( f, r#" ### Edit Prediction Scores {}"#, - prediction.to_markdown() + scores.line_match.to_markdown() )?; } Ok(()) } fn fmt_table(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "### Scores\n")?; + writeln!(f, "#### Prompt Statistics")?; + writeln!(f, "─────────────────────────")?; + writeln!(f, "Prompt_len Generated_len")?; + writeln!(f, "─────────────────────────")?; + writeln!(f, "{:<11} {:<14}", self.prompt_len, self.generated_len,)?; + writeln!(f)?; + writeln!(f)?; + writeln!(f, "#### Performance Scores")?; writeln!( f, - " Prompt Generated TP FP FN Precision Recall F1" + "──────────────────────────────────────────────────────────────────" )?; writeln!( f, - "───────────────────────────────────────────────────────────────────────────────────────────────" + " TP FP FN Precision Recall F1" )?; - if let Some(edit_prediction) = &self.edit_prediction { + writeln!( + f, + "──────────────────────────────────────────────────────────────────" + )?; + writeln!( + f, + "Context Retrieval {:<6} {:<6} {:<6} {:>8.2} {:>7.2} {:>6.2}", + self.context_scores.true_positives, + self.context_scores.false_positives, + self.context_scores.false_negatives, + self.context_scores.precision() * 100.0, + self.context_scores.recall() * 100.0, + self.context_scores.f1_score() * 100.0 + )?; + if let Some(edit_scores) = &self.edit_scores { + let line_match = &edit_scores.line_match; + writeln!(f, "Edit Prediction")?; writeln!( f, - "Edit Prediction {:<7} {:<9} {:<6} {:<6} {:<6} {:>9.2} {:>8.2} {:>7.2}", - self.prompt_len, - self.generated_len, - edit_prediction.true_positives, - 
edit_prediction.false_positives, - edit_prediction.false_negatives, - edit_prediction.precision() * 100.0, - edit_prediction.recall() * 100.0, - edit_prediction.f1_score() * 100.0 + " ├─ exact lines {:<6} {:<6} {:<6} {:>8.2} {:>7.2} {:>6.2}", + line_match.true_positives, + line_match.false_positives, + line_match.false_negatives, + line_match.precision() * 100.0, + line_match.recall() * 100.0, + line_match.f1_score() * 100.0 + )?; + writeln!( + f, + " └─ diff chrF {:<6} {:<6} {:<6} {:>8} {:>8} {:>6.2}", + "-", "-", "-", "-", "-", edit_scores.chr_f )?; } Ok(()) @@ -403,21 +371,12 @@ fn evaluate(example: &Example, preds: &PredictionDetails, predict: bool) -> Eval .lines() .map(DiffLine::parse) .collect::<Vec<_>>(); - let expected_patch_lines = expected_patch - .iter() - .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_))) - .map(|line| line.to_string()) - .collect(); + let actual_patch = preds.diff.lines().map(DiffLine::parse).collect::<Vec<_>>(); - let actual_patch_lines = preds - .diff - .lines() - .map(DiffLine::parse) - .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_))) - .map(|line| line.to_string()) - .collect(); + let line_match = metrics::line_match_score(&expected_patch, &actual_patch); + let chr_f = metrics::delta_chr_f(&expected_patch, &actual_patch); - eval_result.edit_prediction = Some(Scores::new(&expected_patch_lines, &actual_patch_lines)); + eval_result.edit_scores = Some(EditScores { line_match, chr_f }); } eval_result @@ -500,12 +459,12 @@ fn write_bucketed_analysis( diff: execution_data.diff.clone(), is_correct: { evaluation_result - .edit_prediction + .edit_scores .as_ref() - .map_or(false, |edit_prediction| { - edit_prediction.false_positives == 0 - && edit_prediction.false_negatives == 0 - && edit_prediction.true_positives > 0 + .map_or(false, |edit_scores| { + edit_scores.line_match.false_positives == 0 + && edit_scores.line_match.false_negatives == 0 + && edit_scores.line_match.true_positives > 0 }) }, 
execution_indices: vec![execution_data.execution_id.clone()], diff --git a/crates/zeta_cli/src/main.rs b/crates/zeta_cli/src/main.rs index 2d5a23e31f463455871494d123a4988b41b5bd66..d72a0f5cf7cf00166a2bbaa60c1700d1007fc8af 100644 --- a/crates/zeta_cli/src/main.rs +++ b/crates/zeta_cli/src/main.rs @@ -1,6 +1,7 @@ mod evaluate; mod example; mod headless; +mod metrics; mod paths; mod predict; mod source_location; diff --git a/crates/zeta_cli/src/metrics.rs b/crates/zeta_cli/src/metrics.rs new file mode 100644 index 0000000000000000000000000000000000000000..dd08459678eef6d04a6b656d19a4572d51a5b5c1 --- /dev/null +++ b/crates/zeta_cli/src/metrics.rs @@ -0,0 +1,380 @@ +use collections::{HashMap, HashSet}; +use zeta::udiff::DiffLine; + +type Counts = HashMap<String, usize>; +type CountsDelta = HashMap<String, isize>; + +#[derive(Default, Debug, Clone)] +pub struct Scores { + pub true_positives: usize, + pub false_positives: usize, + pub false_negatives: usize, +} + +impl Scores { + pub fn from_sets(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores { + let true_positives = expected.intersection(actual).count(); + let false_positives = actual.difference(expected).count(); + let false_negatives = expected.difference(actual).count(); + + Scores { + true_positives, + false_positives, + false_negatives, + } + } + + pub fn from_counts(expected: &Counts, actual: &Counts) -> Scores { + let mut true_positives = 0; + let mut false_positives = 0; + let mut false_negatives = 0; + + for (ngram, &expected_count) in expected { + let actual_count = *actual.get(ngram).unwrap_or(&0); + if actual_count > expected_count { + false_positives += actual_count - expected_count; + } else { + false_negatives += expected_count - actual_count; + } + true_positives += expected_count.min(actual_count); + } + + for (ngram, &actual_count) in actual { + if !expected.contains_key(ngram) { + false_positives += actual_count; + } + } + + Scores { + true_positives, + false_positives, + false_negatives, + } + } + + pub fn to_markdown(&self) -> 
String { + format!( + " +Precision : {:.4} +Recall : {:.4} +F1 Score : {:.4} +True Positives : {} +False Positives : {} +False Negatives : {}", + self.precision(), + self.recall(), + self.f1_score(), + self.true_positives, + self.false_positives, + self.false_negatives + ) + } + + pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores { + let mut true_positives = 0; + let mut false_positives = 0; + let mut false_negatives = 0; + + for score in scores { + true_positives += score.true_positives; + false_positives += score.false_positives; + false_negatives += score.false_negatives; + } + + Scores { + true_positives, + false_positives, + false_negatives, + } + } + + pub fn precision(&self) -> f64 { + if self.true_positives + self.false_positives == 0 { + 0.0 + } else { + self.true_positives as f64 / (self.true_positives + self.false_positives) as f64 + } + } + + pub fn recall(&self) -> f64 { + if self.true_positives + self.false_negatives == 0 { + 0.0 + } else { + self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64 + } + } + + pub fn f1_score(&self) -> f64 { + let recall = self.recall(); + let precision = self.precision(); + if precision + recall == 0.0 { + 0.0 + } else { + 2.0 * precision * recall / (precision + recall) + } + } +} + +pub fn line_match_score(expected_patch: &[DiffLine], actual_patch: &[DiffLine]) -> Scores { + let expected_change_lines = expected_patch + .iter() + .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_))) + .map(|line| line.to_string()) + .collect(); + + let actual_change_lines = actual_patch + .iter() + .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_))) + .map(|line| line.to_string()) + .collect(); + + Scores::from_sets(&expected_change_lines, &actual_change_lines) +} + +enum ChrfWhitespace { + #[allow(unused)] + Unchanged, + Ignore, +} + +const CHR_F_CHAR_ORDER: usize = 6; +const CHR_F_BETA: f64 = 2.0; +const CHR_F_WHITESPACE: ChrfWhitespace = 
ChrfWhitespace::Ignore; + +/// Computes a delta-chrF score that compares two sets of edits. +/// +/// This metric works by: +/// 1. Reconstructing original, golden (expected result), and actual texts from diffs +/// 2. Computing n-gram count differences (deltas) between original→golden and original→actual +/// 3. Comparing these deltas to measure how well actual edits match expected edits +pub fn delta_chr_f(expected: &[DiffLine], actual: &[DiffLine]) -> f64 { + // Reconstruct texts from diffs + let mut original_text = String::new(); // state of the text before any edits + let mut golden_text = String::new(); // text after applying golden edits + let mut actual_text = String::new(); // text after applying actual edits + + for line in expected { + match line { + DiffLine::Context(s) => { + original_text.push_str(s); + golden_text.push_str(s); + } + DiffLine::Deletion(s) => { + original_text.push_str(s); + } + DiffLine::Addition(s) => { + golden_text.push_str(s); + } + _ => {} + } + } + + for line in actual { + match line { + DiffLine::Context(s) | DiffLine::Addition(s) => { + actual_text.push_str(s); + } + _ => {} + } + } + + // Edge case + if original_text == golden_text && golden_text == actual_text { + return 100.0; + } + + // Compute the metric + let original_ngrams = chr_f_ngram_counts(&original_text); + let golden_ngrams = chr_f_ngram_counts(&golden_text); + let actual_ngrams = chr_f_ngram_counts(&actual_text); + + let mut total_precision = 0.0; + let mut total_recall = 0.0; + + for order in 0..CHR_F_CHAR_ORDER { + let expected_delta = compute_ngram_delta(&golden_ngrams[order], &original_ngrams[order]); + let actual_delta = compute_ngram_delta(&actual_ngrams[order], &original_ngrams[order]); + + if expected_delta.is_empty() && actual_delta.is_empty() { + total_precision += 1.0; + total_recall += 1.0; + continue; + } + + let expected_counts = ngram_delta_to_counts(&expected_delta); + let actual_counts = ngram_delta_to_counts(&actual_delta); + + let score = 
Scores::from_counts(&expected_counts, &actual_counts); + total_precision += score.precision(); + total_recall += score.recall(); + } + + let prec = total_precision / CHR_F_CHAR_ORDER as f64; + let recall = total_recall / CHR_F_CHAR_ORDER as f64; + let f_score = if prec + recall == 0.0 { + 0.0 + } else { + (1.0 + CHR_F_BETA * CHR_F_BETA) * prec * recall / (CHR_F_BETA * CHR_F_BETA * prec + recall) + }; + + f_score * 100.0 +} + +fn chr_f_ngram_counts(text: &str) -> Vec<Counts> { + // Ignore whitespace. The original chrF implementation skips all + // whitespace. We should consider compressing multiple consecutive + // spaces into one -- this may reflect our task more closely. + let text = match CHR_F_WHITESPACE { + ChrfWhitespace::Unchanged => text.to_string(), + ChrfWhitespace::Ignore => text + .chars() + .filter(|c| !c.is_whitespace()) + .collect::<String>(), + }; + + (1..=CHR_F_CHAR_ORDER) + .map(|order| count_ngrams(&text, order)) + .collect() +} + +fn compute_ngram_delta(after: &Counts, before: &Counts) -> CountsDelta { + let mut delta = CountsDelta::default(); + + for (ngram, &before_count) in before { + let after_count = *after.get(ngram).unwrap_or(&0); + delta.insert(ngram.clone(), after_count as isize - before_count as isize); + } + + for (ngram, &after_count) in after { + if !before.contains_key(ngram) { + delta.insert(ngram.clone(), after_count as isize); + } + } + + delta +} + +/// Convert negative counts to special deletion tokens. +/// For example, if expected delta is {"foo": -1} and actual delta is {"bar": -1}, +/// we convert it to {"¬foo": +1} and {"¬bar": +1}. This way _not_ deleting "foo" +/// will result in a false negative, and mistakenly deleting "bar" will result in a false positive. 
+fn ngram_delta_to_counts(delta: &CountsDelta) -> Counts { + let mut counts = Counts::default(); + + for (ngram, &delta) in delta { + if delta > 0 { + counts.insert(ngram.clone(), delta as usize); + } else { + counts.insert(format!("¬{ngram}"), delta.unsigned_abs()); + } + } + + counts +} + +fn count_ngrams(text: &str, n: usize) -> Counts { + let chars: Vec<char> = text.chars().collect(); + let mut counts = Counts::default(); + + for window in chars.windows(n) { + let ngram: String = window.iter().collect(); + *counts.entry(ngram).or_insert(0) += 1; + } + + counts +} + +#[cfg(test)] +mod test { + use super::*; + use zeta::udiff::DiffLine; + + #[test] + fn test_delta_chr_f_perfect_match() { + let diff = vec![ + DiffLine::Context("fn main() {"), + DiffLine::Deletion(" println!(\"Hello\");"), + DiffLine::Addition(" println!(\"Hello, World!\");"), + DiffLine::Context("}"), + ]; + + let score = delta_chr_f(&diff, &diff); + assert!((score - 100.0).abs() < 1e-2); + } + + #[test] + fn test_delta_chr_f_wrong_edit() { + // When the edit is wrong + let expected = vec![ + DiffLine::Context("one "), + DiffLine::Deletion("two "), + DiffLine::Context("three"), + ]; + + let actual = vec![ + DiffLine::Context("one "), + DiffLine::Context("two "), + DiffLine::Deletion("three"), + DiffLine::Addition("four"), + ]; + + // Then the score should be low + let score = delta_chr_f(&expected, &actual); + assert!(score > 20.0 && score < 40.0); + } + + #[test] + fn test_delta_chr_f_partial_match() { + let expected = vec![ + DiffLine::Deletion("let x = 42;"), + DiffLine::Addition("let x = 100;"), + ]; + + let actual = vec![ + DiffLine::Deletion("let x = 42;"), + DiffLine::Addition("let x = 99;"), + ]; + + // We got the edit location right, but the replacement text is wrong. + // Deleted ngrams will match, bringing the score somewhere in the middle. 
+ let score = delta_chr_f(&expected, &actual); + assert!(score > 40.0 && score < 60.0); + } + + #[test] + fn test_delta_chr_f_missed_edit() { + // When the prediction makes no changes + let expected = vec![ + DiffLine::Context("prefix "), + DiffLine::Deletion("old"), + DiffLine::Addition("new"), + DiffLine::Context(" suffix"), + ]; + + let actual = vec![ + DiffLine::Context("prefix "), + DiffLine::Context("old"), + DiffLine::Context(" suffix"), + ]; + + // Then the score should be low (all expected changes are false negatives) + let score = delta_chr_f(&expected, &actual); + assert!(score < 20.0); + } + + #[test] + fn test_delta_chr_f_extra_edit() { + // When adding unexpected content + let expected = vec![DiffLine::Context("hello"), DiffLine::Context("world")]; + + let actual = vec![ + DiffLine::Context("hello"), + DiffLine::Addition("extra"), + DiffLine::Context("world"), + ]; + + // Then the score should be low (all actual changes are false positives) + let score = delta_chr_f(&expected, &actual); + assert!(score < 20.0); + } +}