@@ -1,3 +1,4 @@
+use crate::metrics::{self, Scores};
use std::{
collections::HashMap,
io::{IsTerminal, Write},
@@ -5,7 +6,6 @@ use std::{
};
use anyhow::Result;
-use collections::HashSet;
use gpui::{AsyncApp, Entity};
use project::Project;
use util::ResultExt as _;
@@ -119,13 +119,14 @@ fn write_aggregated_scores(
}
if successful.len() > 1 {
- let mut edit_predictions = successful
+ let edit_scores = successful
.iter()
- .filter_map(|r| r.edit_prediction.as_ref())
- .peekable();
- let has_edit_predictions = edit_predictions.peek().is_some();
+ .filter_map(|r| r.edit_scores.clone())
+ .collect::<Vec<_>>();
+        let has_edit_predictions = !edit_scores.is_empty();
let aggregated_result = EvaluationResult {
- edit_prediction: has_edit_predictions.then(|| Scores::aggregate(edit_predictions)),
+ context_scores: Scores::aggregate(successful.iter().map(|r| &r.context_scores)),
+ edit_scores: has_edit_predictions.then(|| EditScores::aggregate(&edit_scores)),
prompt_len: successful.iter().map(|r| r.prompt_len).sum::<usize>() / successful.len(),
generated_len: successful.iter().map(|r| r.generated_len).sum::<usize>()
/ successful.len(),
@@ -247,94 +248,27 @@ fn write_eval_result(
anyhow::Ok(())
}
-#[derive(Debug, Default)]
-pub struct EvaluationResult {
- pub edit_prediction: Option<Scores>,
- pub prompt_len: usize,
- pub generated_len: usize,
-}
-
-#[derive(Default, Debug)]
-pub struct Scores {
- pub true_positives: usize,
- pub false_positives: usize,
- pub false_negatives: usize,
+#[derive(Debug, Default, Clone)]
+pub struct EditScores {
+ pub line_match: Scores,
+ pub chr_f: f64,
}
-impl Scores {
- pub fn new(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores {
- let true_positives = expected.intersection(actual).count();
- let false_positives = actual.difference(expected).count();
- let false_negatives = expected.difference(actual).count();
-
- Scores {
- true_positives,
- false_positives,
- false_negatives,
- }
- }
-
- pub fn to_markdown(&self) -> String {
- format!(
- "
-Precision : {:.4}
-Recall : {:.4}
-F1 Score : {:.4}
-True Positives : {}
-False Positives : {}
-False Negatives : {}",
- self.precision(),
- self.recall(),
- self.f1_score(),
- self.true_positives,
- self.false_positives,
- self.false_negatives
- )
- }
-
- pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores {
- let mut true_positives = 0;
- let mut false_positives = 0;
- let mut false_negatives = 0;
-
- for score in scores {
- true_positives += score.true_positives;
- false_positives += score.false_positives;
- false_negatives += score.false_negatives;
- }
-
- Scores {
- true_positives,
- false_positives,
- false_negatives,
- }
- }
+impl EditScores {
+ pub fn aggregate(scores: &[EditScores]) -> EditScores {
+ let line_match = Scores::aggregate(scores.iter().map(|s| &s.line_match));
+ let chr_f = scores.iter().map(|s| s.chr_f).sum::<f64>() / scores.len() as f64;
- pub fn precision(&self) -> f64 {
- if self.true_positives + self.false_positives == 0 {
- 0.0
- } else {
- self.true_positives as f64 / (self.true_positives + self.false_positives) as f64
- }
- }
-
- pub fn recall(&self) -> f64 {
- if self.true_positives + self.false_negatives == 0 {
- 0.0
- } else {
- self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64
- }
+ EditScores { line_match, chr_f }
}
+}
- pub fn f1_score(&self) -> f64 {
- let recall = self.recall();
- let precision = self.precision();
- if precision + recall == 0.0 {
- 0.0
- } else {
- 2.0 * precision * recall / (precision + recall)
- }
- }
+#[derive(Debug, Default)]
+pub struct EvaluationResult {
+ pub edit_scores: Option<EditScores>,
+ pub context_scores: Scores,
+ pub prompt_len: usize,
+ pub generated_len: usize,
}
impl std::fmt::Display for EvaluationResult {
@@ -349,40 +283,74 @@ impl std::fmt::Display for EvaluationResult {
impl EvaluationResult {
fn fmt_markdown(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- if let Some(prediction) = &self.edit_prediction {
+ write!(
+ f,
+ r#"
+### Context Scores
+{}
+"#,
+ self.context_scores.to_markdown(),
+ )?;
+ if let Some(scores) = &self.edit_scores {
write!(
f,
r#"
### Edit Prediction Scores
{}"#,
- prediction.to_markdown()
+ scores.line_match.to_markdown()
)?;
}
Ok(())
}
fn fmt_table(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- writeln!(f, "### Scores\n")?;
+ writeln!(f, "#### Prompt Statistics")?;
+        writeln!(f, "──────────────────────────")?;
+        writeln!(f, "Prompt len  Generated len")?;
+        writeln!(f, "──────────────────────────")?;
+ writeln!(f, "{:<11} {:<14}", self.prompt_len, self.generated_len,)?;
+ writeln!(f)?;
+ writeln!(f)?;
+ writeln!(f, "#### Performance Scores")?;
writeln!(
f,
- " Prompt Generated TP FP FN Precision Recall F1"
+            "──────────────────────────────────────────────────────────────────"
)?;
writeln!(
f,
-            "───────────────────────────────────────────────────────────────────────────────────────────────"
+ " TP FP FN Precision Recall F1"
)?;
- if let Some(edit_prediction) = &self.edit_prediction {
+ writeln!(
+ f,
+            "──────────────────────────────────────────────────────────────────"
+ )?;
+ writeln!(
+ f,
+ "Context Retrieval {:<6} {:<6} {:<6} {:>8.2} {:>7.2} {:>6.2}",
+ self.context_scores.true_positives,
+ self.context_scores.false_positives,
+ self.context_scores.false_negatives,
+ self.context_scores.precision() * 100.0,
+ self.context_scores.recall() * 100.0,
+ self.context_scores.f1_score() * 100.0
+ )?;
+ if let Some(edit_scores) = &self.edit_scores {
+ let line_match = &edit_scores.line_match;
+ writeln!(f, "Edit Prediction")?;
writeln!(
f,
- "Edit Prediction {:<7} {:<9} {:<6} {:<6} {:<6} {:>9.2} {:>8.2} {:>7.2}",
- self.prompt_len,
- self.generated_len,
- edit_prediction.true_positives,
- edit_prediction.false_positives,
- edit_prediction.false_negatives,
- edit_prediction.precision() * 100.0,
- edit_prediction.recall() * 100.0,
- edit_prediction.f1_score() * 100.0
+            " ├─ exact lines      {:<6} {:<6} {:<6} {:>8.2} {:>7.2} {:>6.2}",
+ line_match.true_positives,
+ line_match.false_positives,
+ line_match.false_negatives,
+ line_match.precision() * 100.0,
+ line_match.recall() * 100.0,
+ line_match.f1_score() * 100.0
+ )?;
+ writeln!(
+ f,
+            " └─ diff chrF        {:<6} {:<6} {:<6} {:>8} {:>7} {:>6.2}",
+ "-", "-", "-", "-", "-", edit_scores.chr_f
)?;
}
Ok(())
@@ -403,21 +371,12 @@ fn evaluate(example: &Example, preds: &PredictionDetails, predict: bool) -> Eval
.lines()
.map(DiffLine::parse)
.collect::<Vec<_>>();
- let expected_patch_lines = expected_patch
- .iter()
- .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
- .map(|line| line.to_string())
- .collect();
+ let actual_patch = preds.diff.lines().map(DiffLine::parse).collect::<Vec<_>>();
- let actual_patch_lines = preds
- .diff
- .lines()
- .map(DiffLine::parse)
- .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
- .map(|line| line.to_string())
- .collect();
+ let line_match = metrics::line_match_score(&expected_patch, &actual_patch);
+ let chr_f = metrics::delta_chr_f(&expected_patch, &actual_patch);
- eval_result.edit_prediction = Some(Scores::new(&expected_patch_lines, &actual_patch_lines));
+ eval_result.edit_scores = Some(EditScores { line_match, chr_f });
}
eval_result
@@ -500,12 +459,13 @@ fn write_bucketed_analysis(
diff: execution_data.diff.clone(),
is_correct: {
evaluation_result
- .edit_prediction
+ .edit_scores
.as_ref()
- .map_or(false, |edit_prediction| {
- edit_prediction.false_positives == 0
- && edit_prediction.false_negatives == 0
- && edit_prediction.true_positives > 0
+ .map_or(false, |edit_scores| {
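+                        // Exact match: every expected changed line predicted, none extra.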
+ edit_scores.line_match.false_positives == 0
+ && edit_scores.line_match.false_negatives == 0
+ && edit_scores.line_match.true_positives > 0
})
},
execution_indices: vec![execution_data.execution_id.clone()],
@@ -0,0 +1,412 @@
+use collections::{HashMap, HashSet};
+use zeta::udiff::DiffLine;
+
+type Counts = HashMap<String, usize>;
+type CountsDelta = HashMap<String, isize>;
+
+#[derive(Default, Debug, Clone)]
+pub struct Scores {
+ pub true_positives: usize,
+ pub false_positives: usize,
+ pub false_negatives: usize,
+}
+
+impl Scores {
+ pub fn from_sets(expected: &HashSet<String>, actual: &HashSet<String>) -> Scores {
+ let true_positives = expected.intersection(actual).count();
+ let false_positives = actual.difference(expected).count();
+ let false_negatives = expected.difference(actual).count();
+
+ Scores {
+ true_positives,
+ false_positives,
+ false_negatives,
+ }
+ }
+
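+    /// Multiset ("clipped") counting, as used for chrF n-grams: each n-gram is
+    /// credited at most `min(expected, actual)` times. For example, expected
+    /// {"ab": 2} against actual {"ab": 1, "cd": 1} gives TP = 1, FN = 1, FP = 1.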
+ pub fn from_counts(expected: &Counts, actual: &Counts) -> Scores {
+ let mut true_positives = 0;
+ let mut false_positives = 0;
+ let mut false_negatives = 0;
+
+ for (ngram, &expected_count) in expected {
+ let actual_count = *actual.get(ngram).unwrap_or(&0);
+ if actual_count > expected_count {
+ false_positives += actual_count - expected_count;
+ } else {
+ false_negatives += expected_count - actual_count;
+ }
+ true_positives += expected_count.min(actual_count);
+ }
+
+ for (ngram, &actual_count) in actual {
+ if !expected.contains_key(ngram) {
+ false_positives += actual_count;
+ }
+ }
+
+ Scores {
+ true_positives,
+ false_positives,
+ false_negatives,
+ }
+ }
+
+ pub fn to_markdown(&self) -> String {
+ format!(
+ "
+Precision : {:.4}
+Recall : {:.4}
+F1 Score : {:.4}
+True Positives : {}
+False Positives : {}
+False Negatives : {}",
+ self.precision(),
+ self.recall(),
+ self.f1_score(),
+ self.true_positives,
+ self.false_positives,
+ self.false_negatives
+ )
+ }
+
+ pub fn aggregate<'a>(scores: impl Iterator<Item = &'a Scores>) -> Scores {
+ let mut true_positives = 0;
+ let mut false_positives = 0;
+ let mut false_negatives = 0;
+
+ for score in scores {
+ true_positives += score.true_positives;
+ false_positives += score.false_positives;
+ false_negatives += score.false_negatives;
+ }
+
+ Scores {
+ true_positives,
+ false_positives,
+ false_negatives,
+ }
+ }
+
+ pub fn precision(&self) -> f64 {
+ if self.true_positives + self.false_positives == 0 {
+ 0.0
+ } else {
+ self.true_positives as f64 / (self.true_positives + self.false_positives) as f64
+ }
+ }
+
+ pub fn recall(&self) -> f64 {
+ if self.true_positives + self.false_negatives == 0 {
+ 0.0
+ } else {
+ self.true_positives as f64 / (self.true_positives + self.false_negatives) as f64
+ }
+ }
+
+ pub fn f1_score(&self) -> f64 {
+ let recall = self.recall();
+ let precision = self.precision();
+ if precision + recall == 0.0 {
+ 0.0
+ } else {
+ 2.0 * precision * recall / (precision + recall)
+ }
+ }
+}
+
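+/// Exact-line matching: every added or deleted line of each diff is treated as
+/// a set element, and the expected and actual sets are scored against each other.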
+pub fn line_match_score(expected_patch: &[DiffLine], actual_patch: &[DiffLine]) -> Scores {
+ let expected_change_lines = expected_patch
+ .iter()
+ .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
+ .map(|line| line.to_string())
+ .collect();
+
+ let actual_change_lines = actual_patch
+ .iter()
+ .filter(|line| matches!(line, DiffLine::Addition(_) | DiffLine::Deletion(_)))
+ .map(|line| line.to_string())
+ .collect();
+
+ Scores::from_sets(&expected_change_lines, &actual_change_lines)
+}
+
+enum ChrfWhitespace {
+ #[allow(unused)]
+ Unchanged,
+ Ignore,
+}
+
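+// The usual chrF defaults: character n-grams up to order 6 and beta = 2, which
+// weights recall more heavily than precision (i.e. chrF2).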
+const CHR_F_CHAR_ORDER: usize = 6;
+const CHR_F_BETA: f64 = 2.0;
+const CHR_F_WHITESPACE: ChrfWhitespace = ChrfWhitespace::Ignore;
+
+/// Computes a delta-chrF score that compares two sets of edits.
+///
+/// This metric works by:
+/// 1. Reconstructing original, golden (expected result), and actual texts from diffs
+/// 2. Computing n-gram count differences (deltas) between originalβgolden and originalβactual
+/// 3. Comparing these deltas to measure how well actual edits match expected edits
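+///
+/// A 1-gram illustration (whitespace is ignored): original "ab", golden "ac",
+/// actual "ad". The expected delta is {b: -1, c: +1} and the actual delta is
+/// {b: -1, d: +1}. After rewriting deletions as "¬" tokens, the shared
+/// deletion of "b" earns partial credit even though the inserted text is wrong.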
+pub fn delta_chr_f(expected: &[DiffLine], actual: &[DiffLine]) -> f64 {
+ // Reconstruct texts from diffs
+ let mut original_text = String::new(); // state of the text before any edits
+ let mut golden_text = String::new(); // text after applying golden edits
+ let mut actual_text = String::new(); // text after applying actual edits
+
+ for line in expected {
+ match line {
+ DiffLine::Context(s) => {
+ original_text.push_str(s);
+ golden_text.push_str(s);
+ }
+ DiffLine::Deletion(s) => {
+ original_text.push_str(s);
+ }
+ DiffLine::Addition(s) => {
+ golden_text.push_str(s);
+ }
+ _ => {}
+ }
+ }
+
+ for line in actual {
+ match line {
+ DiffLine::Context(s) | DiffLine::Addition(s) => {
+ actual_text.push_str(s);
+ }
+ _ => {}
+ }
+ }
+
+    // Edge case: nothing was supposed to change and nothing was changed.
+ if original_text == golden_text && golden_text == actual_text {
+ return 100.0;
+ }
+
+ // Compute the metric
+ let original_ngrams = chr_f_ngram_counts(&original_text);
+ let golden_ngrams = chr_f_ngram_counts(&golden_text);
+ let actual_ngrams = chr_f_ngram_counts(&actual_text);
+
+ let mut total_precision = 0.0;
+ let mut total_recall = 0.0;
+
+ for order in 0..CHR_F_CHAR_ORDER {
+ let expected_delta = compute_ngram_delta(&golden_ngrams[order], &original_ngrams[order]);
+ let actual_delta = compute_ngram_delta(&actual_ngrams[order], &original_ngrams[order]);
+
+ if expected_delta.is_empty() && actual_delta.is_empty() {
+ total_precision += 1.0;
+ total_recall += 1.0;
+ continue;
+ }
+
+ let expected_counts = ngram_delta_to_counts(&expected_delta);
+ let actual_counts = ngram_delta_to_counts(&actual_delta);
+
+ let score = Scores::from_counts(&expected_counts, &actual_counts);
+ total_precision += score.precision();
+ total_recall += score.recall();
+ }
+
+ let prec = total_precision / CHR_F_CHAR_ORDER as f64;
+ let recall = total_recall / CHR_F_CHAR_ORDER as f64;
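+    // F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta = 2 favours recall.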
+ let f_score = if prec + recall == 0.0 {
+ 0.0
+ } else {
+ (1.0 + CHR_F_BETA * CHR_F_BETA) * prec * recall / (CHR_F_BETA * CHR_F_BETA * prec + recall)
+ };
+
+ f_score * 100.0
+}
+
+fn chr_f_ngram_counts(text: &str) -> Vec<Counts> {
+ // Ignore whitespace. The original chrF implementation skips all
+ // whitespace. We should consider compressing multiple consecutive
+ // spaces into one -- this may reflect our task more closely.
+ let text = match CHR_F_WHITESPACE {
+ ChrfWhitespace::Unchanged => text.to_string(),
+ ChrfWhitespace::Ignore => text
+ .chars()
+ .filter(|c| !c.is_whitespace())
+ .collect::<String>(),
+ };
+
+ (1..=CHR_F_CHAR_ORDER)
+ .map(|order| count_ngrams(&text, order))
+ .collect()
+}
+
+fn compute_ngram_delta(after: &Counts, before: &Counts) -> CountsDelta {
+ let mut delta = CountsDelta::default();
+
+    for (ngram, &before_count) in before {
+        let after_count = *after.get(ngram).unwrap_or(&0);
+        let diff = after_count as isize - before_count as isize;
+        // Skip unchanged n-grams so that an empty delta really means "no
+        // change at this order"; `delta_chr_f` relies on this for its shortcut.
+        if diff != 0 {
+            delta.insert(ngram.clone(), diff);
+        }
+    }
+
+ for (ngram, &after_count) in after {
+ if !before.contains_key(ngram) {
+ delta.insert(ngram.clone(), after_count as isize);
+ }
+ }
+
+ delta
+}
+
+/// Convert negative counts to special deletion tokens.
+/// For example, if expected delta is {"foo": -1} and actual delta is {"bar": -1},
+/// we convert it to {"¬foo": +1} and {"¬bar": +1}. This way _not_ deleting "foo"
+/// will result in a false negative, and mistakenly deleting "bar" will result in a false positive.
+fn ngram_delta_to_counts(delta: &CountsDelta) -> Counts {
+ let mut counts = Counts::default();
+
+ for (ngram, &delta) in delta {
+ if delta > 0 {
+ counts.insert(ngram.clone(), delta as usize);
+ } else {
+            counts.insert(format!("¬{ngram}"), delta.unsigned_abs());
+ }
+ }
+
+ counts
+}
+
+fn count_ngrams(text: &str, n: usize) -> Counts {
+ let chars: Vec<char> = text.chars().collect();
+ let mut counts = Counts::default();
+
+ for window in chars.windows(n) {
+ let ngram: String = window.iter().collect();
+ *counts.entry(ngram).or_insert(0) += 1;
+ }
+
+ counts
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use zeta::udiff::DiffLine;
+
+ #[test]
+ fn test_delta_chr_f_perfect_match() {
+ let diff = vec![
+ DiffLine::Context("fn main() {"),
+ DiffLine::Deletion(" println!(\"Hello\");"),
+ DiffLine::Addition(" println!(\"Hello, World!\");"),
+ DiffLine::Context("}"),
+ ];
+
+ let score = delta_chr_f(&diff, &diff);
+ assert!((score - 100.0).abs() < 1e-2);
+ }
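+
+    // Illustrative sanity check (not part of the original suite) of the
+    // multiset clipping in `Scores::from_counts`: one matched "ab", one
+    // missing "ab", and one spurious "cd".
+    #[test]
+    fn test_from_counts_clipping() {
+        let expected = Counts::from_iter([("ab".to_string(), 2)]);
+        let actual = Counts::from_iter([("ab".to_string(), 1), ("cd".to_string(), 1)]);
+
+        let scores = Scores::from_counts(&expected, &actual);
+        assert_eq!(scores.true_positives, 1);
+        assert_eq!(scores.false_negatives, 1);
+        assert_eq!(scores.false_positives, 1);
+    }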
+
+ #[test]
+ fn test_delta_chr_f_wrong_edit() {
+ // When the edit is wrong
+ let expected = vec![
+ DiffLine::Context("one "),
+ DiffLine::Deletion("two "),
+ DiffLine::Context("three"),
+ ];
+
+ let actual = vec![
+ DiffLine::Context("one "),
+ DiffLine::Context("two "),
+ DiffLine::Deletion("three"),
+ DiffLine::Addition("four"),
+ ];
+
+ // Then the score should be low
+ let score = delta_chr_f(&expected, &actual);
+ assert!(score > 20.0 && score < 40.0);
+ }
+
+ #[test]
+ fn test_delta_chr_f_partial_match() {
+ let expected = vec![
+ DiffLine::Deletion("let x = 42;"),
+ DiffLine::Addition("let x = 100;"),
+ ];
+
+ let actual = vec![
+ DiffLine::Deletion("let x = 42;"),
+ DiffLine::Addition("let x = 99;"),
+ ];
+
+ // We got the edit location right, but the replacement text is wrong.
+ // Deleted ngrams will match, bringing the score somewhere in the middle.
+ let score = delta_chr_f(&expected, &actual);
+ assert!(score > 40.0 && score < 60.0);
+ }
+
+ #[test]
+ fn test_delta_chr_f_missed_edit() {
+        // When the prediction makes no changes
+ let expected = vec![
+ DiffLine::Context("prefix "),
+ DiffLine::Deletion("old"),
+ DiffLine::Addition("new"),
+ DiffLine::Context(" suffix"),
+ ];
+
+ let actual = vec![
+ DiffLine::Context("prefix "),
+ DiffLine::Context("old"),
+ DiffLine::Context(" suffix"),
+ ];
+
+ // Then the score should be low (all expected changes are false negatives)
+ let score = delta_chr_f(&expected, &actual);
+ assert!(score < 20.0);
+ }
+
+ #[test]
+ fn test_delta_chr_f_extra_edit() {
+ // When adding unexpected content
+ let expected = vec![DiffLine::Context("hello"), DiffLine::Context("world")];
+
+ let actual = vec![
+ DiffLine::Context("hello"),
+ DiffLine::Addition("extra"),
+ DiffLine::Context("world"),
+ ];
+
+ // Then the score should be low (all actual changes are false positives)
+ let score = delta_chr_f(&expected, &actual);
+ assert!(score < 20.0);
+ }
+}