zeta2: Compute diff-aware chrF metric (#43485)

Oleksiy Syvokon created

Zeta evals now include a character n-gram metric adapted for multi-edit diffs (“delta chrF”). It works as follows:

1. Reconstruct the original, golden (expected), and actual texts from unified diffs.
   - "original": the text before any edits
   - "golden": the text after applying the expected edits
   - "actual": the text after applying the actual edits

2. Compute n-gram count deltas between original→golden and original→actual.
   - n-grams are computed as in chrF (max n=6, whitespace ignored).

3. Compare these deltas to assess how well the actual edits match the expected edits.
   - As in standard chrF, classify n-grams as true positives, false positives, and false negatives, and report the F-beta score with beta=2.

Release Notes:

- N/A

Change summary

crates/language/src/buffer.rs   |   4 
crates/zeta_cli/src/evaluate.rs | 203 +++++++-----------
crates/zeta_cli/src/main.rs     |   1 
crates/zeta_cli/src/metrics.rs  | 380 +++++++++++++++++++++++++++++++++++
4 files changed, 464 insertions(+), 124 deletions(-)

Detailed changes

crates/language/src/buffer.rs 🔗

@@ -758,8 +758,8 @@ impl EditPreview {
             .to_point(&self.applied_edits_snapshot);
 
         let start = Point::new(start.row.saturating_sub(3), 0);
-        let old_end = Point::new(old_end.row + 3, 0).min(self.old_snapshot.max_point());
-        let new_end = Point::new(new_end.row + 3, 0).min(self.applied_edits_snapshot.max_point());
+        let old_end = Point::new(old_end.row + 4, 0).min(self.old_snapshot.max_point());
+        let new_end = Point::new(new_end.row + 4, 0).min(self.applied_edits_snapshot.max_point());
 
         Some(unified_diff(
             &self

crates/zeta_cli/src/evaluate.rs 🔗

@@ -1,3 +1,4 @@
+use crate::metrics::{self, Scores};
 use std::{
     collections::HashMap,
     io::{IsTerminal, Write},
@@ -5,7 +6,6 @@ use std::{
 };
 
 use anyhow::Result;
-use collections::HashSet;
 use gpui::{AsyncApp, Entity};
 use project::Project;
 use util::ResultExt as _;
@@ -119,13 +119,14 @@ fn write_aggregated_scores(
     }
 
     if successful.len() > 1 {
-        let mut edit_predictions = successful
+        let edit_scores = successful
             .iter()
-            .filter_map(|r| r.edit_prediction.as_ref())
-            .peekable();
-        let has_edit_predictions = edit_predictions.peek().is_some();
+            .filter_map(|r| r.edit_scores.clone())
+            .collect::<Vec<_>>();
+        let has_edit_predictions = edit_scores.len() > 0;
         let aggregated_result = EvaluationResult {
-            edit_prediction: has_edit_predictions.then(|| Scores::aggregate(edit_predictions)),
+            context_scores: Scores::aggregate(successful.iter().map(|r| &r.context_scores)),
+            edit_scores: has_edit_predictions.then(|| EditScores::aggregate(&edit_scores)),
             prompt_len: successful.iter().map(|r| r.prompt_len).sum::<usize>() / successful.len(),
             generated_len: successful.iter().map(|r| r.generated_len).sum::<usize>()
                 / successful.len(),
@@ -247,94 +248,27 @@ fn write_eval_result(
     anyhow::Ok(())
 }
 
-#[derive(Debug, Default)]
-pub struct EvaluationResult {
-    pub edit_prediction: Option<Scores>,
-    pub prompt_len: usize,
-    pub generated_len: usize,
-}
-
-#[derive(Default, Debug)]
-pub struct Scores {
-    pub true_positives: usize,
-    pub false_positives: usize,
-    pub false_negatives: usize,
+#[derive(Debug, Default, Clone)]
+pub struct EditScores {
+    pub line_match: Scores,
+    pub chr_f: f64,
 }
 
-impl Scores {
-    pub fn new(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores {
-        let true_positives = expected.intersection(actual).count();
-        let false_positives = actual.difference(expected).count();
-        let false_negatives = expected.difference(actual).count();
-
-        Scores {
-            true_positives,
-            false_positives,
-            false_negatives,
-        }
-    }
-
-    pub fn to_markdown(&self) -> String {
-        format!(
-            "
-Precision       : {:.4}
-Recall          : {:.4}
-F1 Score        : {:.4}
-True Positives  : {}
-False Positives : {}
-False Negatives : {}",
-            self.precision(),
-            self.recall(),
-            self.f1_score(),
-            self.true_positives,
-            self.false_positives,
-            self.false_negatives
-        )
-    }
-
-    pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores {
-        let mut true_positives = 0;
-        let mut false_positives = 0;
-        let mut false_negatives = 0;
-
-        for score in scores {
-            true_positives += score.true_positives;
-            false_positives += score.false_positives;
-            false_negatives += score.false_negatives;
-        }
-
-        Scores {
-            true_positives,
-            false_positives,
-            false_negatives,
-        }
-    }
+impl EditScores {
+    pub fn aggregate(scores: &[EditScores]) -> EditScores {
+        let line_match = Scores::aggregate(scores.iter().map(|s| &s.line_match));
+        let chr_f = scores.iter().map(|s| s.chr_f).sum::<f64>() / scores.len() as f64;
 
-    pub fn precision(&self) -> f64 {
-        if self.true_positives + self.false_positives == 0 {
-            0.0
-        } else {
-            self.true_positives as f64 / (self.true_positives + self.false_positives) as f64
-        }
-    }
-
-    pub fn recall(&self) -> f64 {
-        if self.true_positives + self.false_negatives == 0 {
-            0.0
-        } else {
-            self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64
-        }
+        EditScores { line_match, chr_f }
     }
+}
 
-    pub fn f1_score(&self) -> f64 {
-        let recall = self.recall();
-        let precision = self.precision();
-        if precision + recall == 0.0 {
-            0.0
-        } else {
-            2.0 * precision * recall / (precision + recall)
-        }
-    }
+#[derive(Debug, Default)]
+pub struct EvaluationResult {
+    pub edit_scores: Option<EditScores>,
+    pub context_scores: Scores,
+    pub prompt_len: usize,
+    pub generated_len: usize,
 }
 
 impl std::fmt::Display for EvaluationResult {
@@ -349,40 +283,74 @@ impl std::fmt::Display for EvaluationResult {
 
 impl EvaluationResult {
     fn fmt_markdown(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if let Some(prediction) = &self.edit_prediction {
+        write!(
+            f,
+            r#"
+### Context Scores
+{}
+"#,
+            self.context_scores.to_markdown(),
+        )?;
+        if let Some(scores) = &self.edit_scores {
             write!(
                 f,
                 r#"
                 ### Edit Prediction Scores
                 {}"#,
-                prediction.to_markdown()
+                scores.line_match.to_markdown()
             )?;
         }
         Ok(())
     }
 
     fn fmt_table(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        writeln!(f, "### Scores\n")?;
+        writeln!(f, "#### Prompt Statistics")?;
+        writeln!(f, "─────────────────────────")?;
+        writeln!(f, "Prompt_len  Generated_len")?;
+        writeln!(f, "─────────────────────────")?;
+        writeln!(f, "{:<11} {:<14}", self.prompt_len, self.generated_len,)?;
+        writeln!(f)?;
+        writeln!(f)?;
+        writeln!(f, "#### Performance Scores")?;
         writeln!(
             f,
-            "                   Prompt  Generated  TP     FP     FN     Precision   Recall      F1"
+            "──────────────────────────────────────────────────────────────────"
         )?;
         writeln!(
             f,
-            "───────────────────────────────────────────────────────────────────────────────────────────────"
+            "                   TP     FP     FN     Precision   Recall     F1"
         )?;
-        if let Some(edit_prediction) = &self.edit_prediction {
+        writeln!(
+            f,
+            "──────────────────────────────────────────────────────────────────"
+        )?;
+        writeln!(
+            f,
+            "Context Retrieval  {:<6} {:<6} {:<6} {:>8.2}  {:>7.2}  {:>6.2}",
+            self.context_scores.true_positives,
+            self.context_scores.false_positives,
+            self.context_scores.false_negatives,
+            self.context_scores.precision() * 100.0,
+            self.context_scores.recall() * 100.0,
+            self.context_scores.f1_score() * 100.0
+        )?;
+        if let Some(edit_scores) = &self.edit_scores {
+            let line_match = &edit_scores.line_match;
+            writeln!(f, "Edit Prediction")?;
             writeln!(
                 f,
-                "Edit Prediction    {:<7} {:<9}  {:<6} {:<6} {:<6} {:>9.2} {:>8.2} {:>7.2}",
-                self.prompt_len,
-                self.generated_len,
-                edit_prediction.true_positives,
-                edit_prediction.false_positives,
-                edit_prediction.false_negatives,
-                edit_prediction.precision() * 100.0,
-                edit_prediction.recall() * 100.0,
-                edit_prediction.f1_score() * 100.0
+                "  ├─ exact lines   {:<6} {:<6} {:<6} {:>8.2}  {:>7.2}  {:>6.2}",
+                line_match.true_positives,
+                line_match.false_positives,
+                line_match.false_negatives,
+                line_match.precision() * 100.0,
+                line_match.recall() * 100.0,
+                line_match.f1_score() * 100.0
+            )?;
+            writeln!(
+                f,
+                "  └─ diff chrF     {:<6} {:<6} {:<6} {:>8} {:>8}  {:>6.2}",
+                "-", "-", "-", "-", "-", edit_scores.chr_f
             )?;
         }
         Ok(())
@@ -403,21 +371,12 @@ fn evaluate(example: &Example, preds: &PredictionDetails, predict: bool) -> Eval
             .lines()
             .map(DiffLine::parse)
             .collect::<Vec<_>>();
-        let expected_patch_lines = expected_patch
-            .iter()
-            .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
-            .map(|line| line.to_string())
-            .collect();
+        let actual_patch = preds.diff.lines().map(DiffLine::parse).collect::<Vec<_>>();
 
-        let actual_patch_lines = preds
-            .diff
-            .lines()
-            .map(DiffLine::parse)
-            .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
-            .map(|line| line.to_string())
-            .collect();
+        let line_match = metrics::line_match_score(&expected_patch, &actual_patch);
+        let chr_f = metrics::delta_chr_f(&expected_patch, &actual_patch);
 
-        eval_result.edit_prediction = Some(Scores::new(&expected_patch_lines, &actual_patch_lines));
+        eval_result.edit_scores = Some(EditScores { line_match, chr_f });
     }
 
     eval_result
@@ -500,12 +459,12 @@ fn write_bucketed_analysis(
                 diff: execution_data.diff.clone(),
                 is_correct: {
                     evaluation_result
-                        .edit_prediction
+                        .edit_scores
                         .as_ref()
-                        .map_or(false, |edit_prediction| {
-                            edit_prediction.false_positives == 0
-                                && edit_prediction.false_negatives == 0
-                                && edit_prediction.true_positives > 0
+                        .map_or(false, |edit_scores| {
+                            edit_scores.line_match.false_positives == 0
+                                && edit_scores.line_match.false_negatives == 0
+                                && edit_scores.line_match.true_positives > 0
                         })
                 },
                 execution_indices: vec![execution_data.execution_id.clone()],

crates/zeta_cli/src/metrics.rs 🔗

@@ -0,0 +1,380 @@
+use collections::{HashMap, HashSet};
+use zeta::udiff::DiffLine;
+
+type Counts = HashMap<String, usize>;
+type CountsDelta = HashMap<String, isize>;
+
+#[derive(Default, Debug, Clone)]
+pub struct Scores {
+    pub true_positives: usize,
+    pub false_positives: usize,
+    pub false_negatives: usize,
+}
+
+impl Scores {
+    pub fn from_sets(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores {
+        let true_positives = expected.intersection(actual).count();
+        let false_positives = actual.difference(expected).count();
+        let false_negatives = expected.difference(actual).count();
+
+        Scores {
+            true_positives,
+            false_positives,
+            false_negatives,
+        }
+    }
+
+    pub fn from_counts(expected: &Counts, actual: &Counts) -> Scores {
+        let mut true_positives = 0;
+        let mut false_positives = 0;
+        let mut false_negatives = 0;
+
+        for (ngram, &expected_count) in expected {
+            let actual_count = *actual.get(ngram).unwrap_or(&0);
+            if actual_count > expected_count {
+                false_positives += actual_count - expected_count;
+            } else {
+                false_negatives += expected_count - actual_count;
+            }
+            true_positives += expected_count.min(actual_count);
+        }
+
+        for (ngram, &actual_count) in actual {
+            if !expected.contains_key(ngram) {
+                false_positives += actual_count;
+            }
+        }
+
+        Scores {
+            true_positives,
+            false_positives,
+            false_negatives,
+        }
+    }
+
+    pub fn to_markdown(&self) -> String {
+        format!(
+            "
+Precision       : {:.4}
+Recall          : {:.4}
+F1 Score        : {:.4}
+True Positives  : {}
+False Positives : {}
+False Negatives : {}",
+            self.precision(),
+            self.recall(),
+            self.f1_score(),
+            self.true_positives,
+            self.false_positives,
+            self.false_negatives
+        )
+    }
+
+    pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores {
+        let mut true_positives = 0;
+        let mut false_positives = 0;
+        let mut false_negatives = 0;
+
+        for score in scores {
+            true_positives += score.true_positives;
+            false_positives += score.false_positives;
+            false_negatives += score.false_negatives;
+        }
+
+        Scores {
+            true_positives,
+            false_positives,
+            false_negatives,
+        }
+    }
+
+    pub fn precision(&self) -> f64 {
+        if self.true_positives + self.false_positives == 0 {
+            0.0
+        } else {
+            self.true_positives as f64 / (self.true_positives + self.false_positives) as f64
+        }
+    }
+
+    pub fn recall(&self) -> f64 {
+        if self.true_positives + self.false_negatives == 0 {
+            0.0
+        } else {
+            self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64
+        }
+    }
+
+    pub fn f1_score(&self) -> f64 {
+        let recall = self.recall();
+        let precision = self.precision();
+        if precision + recall == 0.0 {
+            0.0
+        } else {
+            2.0 * precision * recall / (precision + recall)
+        }
+    }
+}
+
+pub fn line_match_score(expected_patch: &[DiffLine], actual_patch: &[DiffLine]) -> Scores {
+    let expected_change_lines = expected_patch
+        .iter()
+        .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
+        .map(|line| line.to_string())
+        .collect();
+
+    let actual_change_lines = actual_patch
+        .iter()
+        .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
+        .map(|line| line.to_string())
+        .collect();
+
+    Scores::from_sets(&expected_change_lines, &actual_change_lines)
+}
+
+enum ChrfWhitespace {
+    #[allow(unused)]
+    Unchanged,
+    Ignore,
+}
+
+const CHR_F_CHAR_ORDER: usize = 6;
+const CHR_F_BETA: f64 = 2.0;
+const CHR_F_WHITESPACE: ChrfWhitespace = ChrfWhitespace::Ignore;
+
+/// Computes a delta-chrF score that compares two sets of edits.
+///
+/// This metric works by:
+/// 1. Reconstructing original, golden (expected result), and actual texts from diffs
+/// 2. Computing n-gram count differences (deltas) between original→golden and original→actual
+/// 3. Comparing these deltas to measure how well actual edits match expected edits
+pub fn delta_chr_f(expected: &[DiffLine], actual: &[DiffLine]) -> f64 {
+    // Reconstruct texts from diffs
+    let mut original_text = String::new(); // state of the text before any edits
+    let mut golden_text = String::new(); // text after applying golden edits
+    let mut actual_text = String::new(); // text after applying actual edits
+
+    for line in expected {
+        match line {
+            DiffLine::Context(s) => {
+                original_text.push_str(s);
+                golden_text.push_str(s);
+            }
+            DiffLine::Deletion(s) => {
+                original_text.push_str(s);
+            }
+            DiffLine::Addition(s) => {
+                golden_text.push_str(s);
+            }
+            _ => {}
+        }
+    }
+
+    for line in actual {
+        match line {
+            DiffLine::Context(s) | DiffLine::Addition(s) => {
+                actual_text.push_str(s);
+            }
+            _ => {}
+        }
+    }
+
+    // Edge case
+    if original_text == golden_text && golden_text == actual_text {
+        return 100.0;
+    }
+
+    // Compute the metric
+    let original_ngrams = chr_f_ngram_counts(&original_text);
+    let golden_ngrams = chr_f_ngram_counts(&golden_text);
+    let actual_ngrams = chr_f_ngram_counts(&actual_text);
+
+    let mut total_precision = 0.0;
+    let mut total_recall = 0.0;
+
+    for order in 0..CHR_F_CHAR_ORDER {
+        let expected_delta = compute_ngram_delta(&golden_ngrams[order], &original_ngrams[order]);
+        let actual_delta = compute_ngram_delta(&actual_ngrams[order], &original_ngrams[order]);
+
+        if expected_delta.is_empty() && actual_delta.is_empty() {
+            total_precision += 1.0;
+            total_recall += 1.0;
+            continue;
+        }
+
+        let expected_counts = ngram_delta_to_counts(&expected_delta);
+        let actual_counts = ngram_delta_to_counts(&actual_delta);
+
+        let score = Scores::from_counts(&expected_counts, &actual_counts);
+        total_precision += score.precision();
+        total_recall += score.recall();
+    }
+
+    let prec = total_precision / CHR_F_CHAR_ORDER as f64;
+    let recall = total_recall / CHR_F_CHAR_ORDER as f64;
+    let f_score = if prec + recall == 0.0 {
+        0.0
+    } else {
+        (1.0 + CHR_F_BETA * CHR_F_BETA) * prec * recall / (CHR_F_BETA * CHR_F_BETA * prec + recall)
+    };
+
+    f_score * 100.0
+}
+
+fn chr_f_ngram_counts(text: &str) -> Vec<Counts> {
+    // Ignore whitespace. The original chrF implementation skips all
+    // whitespace. We should consider compressing multiple consecutive
+    // spaces into one -- this may reflect our task more closely.
+    let text = match CHR_F_WHITESPACE {
+        ChrfWhitespace::Unchanged => text.to_string(),
+        ChrfWhitespace::Ignore => text
+            .chars()
+            .filter(|c| !c.is_whitespace())
+            .collect::<String>(),
+    };
+
+    (1..=CHR_F_CHAR_ORDER)
+        .map(|order| count_ngrams(&text, order))
+        .collect()
+}
+
+fn compute_ngram_delta(after: &Counts, before: &Counts) -> CountsDelta {
+    let mut delta = CountsDelta::default();
+
+    for (ngram, &before_count) in before {
+        let after_count = *after.get(ngram).unwrap_or(&0);
+        delta.insert(ngram.clone(), after_count as isize - before_count as isize);
+    }
+
+    for (ngram, &after_count) in after {
+        if !before.contains_key(ngram) {
+            delta.insert(ngram.clone(), after_count as isize);
+        }
+    }
+
+    delta
+}
+
+/// Convert negative counts to special deletion tokens.
+/// For example, if expected delta is {"foo": -1} and actual delta is {"bar": -1},
+/// we convert it to {"¬foo": +1} and {"¬bar": +1}. This way _not_ deleting "foo"
+/// will result in a false negative, and mistakenly deleting "bar" will result in a false positive.
+fn ngram_delta_to_counts(delta: &CountsDelta) -> Counts {
+    let mut counts = Counts::default();
+
+    for (ngram, &delta) in delta {
+        if delta > 0 {
+            counts.insert(ngram.clone(), delta as usize);
+        } else {
+            counts.insert(format!("¬{ngram}"), delta.unsigned_abs());
+        }
+    }
+
+    counts
+}
+
+fn count_ngrams(text: &str, n: usize) -> Counts {
+    let chars: Vec<char> = text.chars().collect();
+    let mut counts = Counts::default();
+
+    for window in chars.windows(n) {
+        let ngram: String = window.iter().collect();
+        *counts.entry(ngram).or_insert(0) += 1;
+    }
+
+    counts
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use zeta::udiff::DiffLine;
+
+    #[test]
+    fn test_delta_chr_f_perfect_match() {
+        let diff = vec![
+            DiffLine::Context("fn main() {"),
+            DiffLine::Deletion("    println!(\"Hello\");"),
+            DiffLine::Addition("    println!(\"Hello, World!\");"),
+            DiffLine::Context("}"),
+        ];
+
+        let score = delta_chr_f(&diff, &diff);
+        assert!((score - 100.0).abs() < 1e-2);
+    }
+
+    #[test]
+    fn test_delta_chr_f_wrong_edit() {
+        // When the edit is wrong
+        let expected = vec![
+            DiffLine::Context("one "),
+            DiffLine::Deletion("two "),
+            DiffLine::Context("three"),
+        ];
+
+        let actual = vec![
+            DiffLine::Context("one "),
+            DiffLine::Context("two "),
+            DiffLine::Deletion("three"),
+            DiffLine::Addition("four"),
+        ];
+
+        // Then the score should be low
+        let score = delta_chr_f(&expected, &actual);
+        assert!(score > 20.0 && score < 40.0);
+    }
+
+    #[test]
+    fn test_delta_chr_f_partial_match() {
+        let expected = vec![
+            DiffLine::Deletion("let x = 42;"),
+            DiffLine::Addition("let x = 100;"),
+        ];
+
+        let actual = vec![
+            DiffLine::Deletion("let x = 42;"),
+            DiffLine::Addition("let x = 99;"),
+        ];
+
+        // We got the edit location right, but the replacement text is wrong.
+        // Deleted ngrams will match, bringing the score somewhere in the middle.
+        let score = delta_chr_f(&expected, &actual);
+        assert!(score > 40.0 && score < 60.0);
+    }
+
+    #[test]
+    fn test_delta_chr_f_missed_edit() {
+        // When predictions makes no changes
+        let expected = vec![
+            DiffLine::Context("prefix "),
+            DiffLine::Deletion("old"),
+            DiffLine::Addition("new"),
+            DiffLine::Context(" suffix"),
+        ];
+
+        let actual = vec![
+            DiffLine::Context("prefix "),
+            DiffLine::Context("old"),
+            DiffLine::Context(" suffix"),
+        ];
+
+        // Then the score should be low (all expected changes are false negatives)
+        let score = delta_chr_f(&expected, &actual);
+        assert!(score < 20.0);
+    }
+
+    #[test]
+    fn test_delta_chr_f_extra_edit() {
+        // When adding unexpected content
+        let expected = vec![DiffLine::Context("hello"), DiffLine::Context("world")];
+
+        let actual = vec![
+            DiffLine::Context("hello"),
+            DiffLine::Addition("extra"),
+            DiffLine::Context("world"),
+        ];
+
+        // Then the score should be low (all actual changes are false positives)
+        let score = delta_chr_f(&expected, &actual);
+        assert!(score < 20.0);
+    }
+}