Add token-match-debug command

Oleksiy Syvokon created

It writes HTML with an explanation of how metric values are derived

Change summary

crates/edit_prediction_cli/src/example.rs           |  10 
crates/edit_prediction_cli/src/main.rs              |  13 
crates/edit_prediction_cli/src/metrics.rs           | 476 +++++++++++++-
crates/edit_prediction_cli/src/score.rs             |  67 +
crates/edit_prediction_cli/src/token_match_debug.rs | 404 ++++++++++++
crates/edit_prediction_cli/src/word_diff.rs         |  20 
6 files changed, 923 insertions(+), 67 deletions(-)

Detailed changes

crates/edit_prediction_cli/src/example.rs 🔗

@@ -158,6 +158,16 @@ pub struct ExampleScore {
     #[serde(default)]
     pub exact_lines_fn: usize,
     #[serde(default)]
+    pub token_match_tp: usize,
+    #[serde(default)]
+    pub token_match_fp: usize,
+    #[serde(default)]
+    pub token_match_fn: usize,
+    #[serde(default)]
+    pub token_match_precision: f64,
+    #[serde(default)]
+    pub token_match_recall: f64,
+    #[serde(default)]
     pub reversal_ratio: f32,
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub cursor_distance: Option<usize>,

crates/edit_prediction_cli/src/main.rs 🔗

@@ -22,6 +22,7 @@ mod reversal_tracking;
 mod score;
 mod split_commit;
 mod split_dataset;
+mod token_match_debug;
 
 mod synthesize;
 mod truncate_expected_patch;
@@ -61,6 +62,7 @@ use crate::score::run_scoring;
 use crate::split_commit::SplitCommitArgs;
 use crate::split_dataset::SplitArgs;
 use crate::synthesize::{SynthesizeConfig, run_synthesize};
+use crate::token_match_debug::{TokenMatchDebugArgs, run_token_match_debug};
 use crate::truncate_expected_patch::TruncatePatchArgs;
 
 #[derive(Parser, Debug)]
@@ -214,6 +216,8 @@ enum Command {
     SplitCommit(SplitCommitArgs),
     /// Truncate expected patch by the given criteria
     TruncatePatch(TruncatePatchArgs),
+    /// Generate token-match debug HTML for expected vs predicted patches
+    TokenMatchDebug(TokenMatchDebugArgs),
     /// Split a JSONL dataset into multiple files (stratified by repository_url if present)
     Split(SplitArgs),
     /// Filter a JSONL dataset by programming language (based on cursor_path extension)
@@ -257,6 +261,7 @@ impl Display for Command {
             Command::Clean => write!(f, "clean"),
             Command::SplitCommit(_) => write!(f, "split-commit"),
             Command::TruncatePatch(_) => write!(f, "truncate-patch"),
+            Command::TokenMatchDebug(_) => write!(f, "token-match-debug"),
             Command::Split(_) => write!(f, "split"),
             Command::FilterLanguages(_) => write!(f, "filter-languages"),
             Command::ImportBatch(args) => {
@@ -1056,6 +1061,13 @@ fn main() {
             }
             return;
         }
+        Command::TokenMatchDebug(debug_args) => {
+            if let Err(error) = run_token_match_debug(debug_args, &args.inputs) {
+                eprintln!("{error:#}");
+                std::process::exit(1);
+            }
+            return;
+        }
         Command::Split(split_args) => {
             if let Err(error) = split_dataset::run_split(split_args, &args.inputs) {
                 eprintln!("{error:#}");
@@ -1249,6 +1261,7 @@ fn main() {
                                         | Command::SplitCommit(_)
                                         | Command::Split(_)
                                         | Command::TruncatePatch(_)
+                                        | Command::TokenMatchDebug(_)
                                         | Command::FilterLanguages(_)
                                         | Command::ImportBatch(_)
                                         | Command::PrintZetaFormats => {

crates/edit_prediction_cli/src/metrics.rs 🔗

@@ -12,13 +12,32 @@ type CountsDelta = HashMap<String, isize>;
 /// Context characters needed on each side of a change to capture all affected n-grams
 const CONTEXT_CHARS: usize = CHR_F_CHAR_ORDER - 1;
 
-#[derive(Default, Debug, Clone)]
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
 pub struct ClassificationMetrics {
     pub true_positives: usize,
     pub false_positives: usize,
     pub false_negatives: usize,
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenClass {
+    TruePositive,
+    FalsePositive,
+    FalseNegative,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ClassifiedToken {
+    pub token: String,
+    pub class: TokenClass,
+}
+
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
+pub struct TokenClassificationDetail {
+    pub expected_tokens: Vec<ClassifiedToken>,
+    pub actual_tokens: Vec<ClassifiedToken>,
+}
+
 impl ClassificationMetrics {
     pub fn from_counts(expected: &Counts, actual: &Counts) -> ClassificationMetrics {
         let mut true_positives = 0;
@@ -48,6 +67,12 @@ impl ClassificationMetrics {
         }
     }
 
+    pub fn accumulate(&mut self, other: &ClassificationMetrics) {
+        self.true_positives += other.true_positives;
+        self.false_positives += other.false_positives;
+        self.false_negatives += other.false_negatives;
+    }
+
     pub fn precision(&self) -> f64 {
         if self.true_positives + self.false_positives == 0 {
             0.0
@@ -75,6 +100,19 @@ impl ClassificationMetrics {
     }
 }
 
+pub fn compare_classification_metrics(
+    left: &ClassificationMetrics,
+    right: &ClassificationMetrics,
+) -> std::cmp::Ordering {
+    left.f1()
+        .total_cmp(&right.f1())
+        .then_with(|| left.precision().total_cmp(&right.precision()))
+        .then_with(|| left.recall().total_cmp(&right.recall()))
+        .then_with(|| left.true_positives.cmp(&right.true_positives))
+        .then_with(|| right.false_positives.cmp(&left.false_positives))
+        .then_with(|| right.false_negatives.cmp(&left.false_negatives))
+}
+
 enum ChrfWhitespace {
     /// Preserve whitespace as-is
     #[allow(unused)]
@@ -525,63 +563,137 @@ pub struct TokenChangeCounts {
     pub deleted_tokens: usize,
 }
 
-/// Counts the number of inserted and deleted tokens in a unified diff patch.
-///
-/// Tokens are words and whitespace sequences (as defined by `word_diff::tokenize`).
-/// Within each hunk, the old (`-`) and new (`+`) lines are compared at the token level
-/// using an LCS-based diff, so modified lines only count the actually changed tokens
-/// rather than the entire line.
-pub fn count_patch_token_changes(patch: &str) -> TokenChangeCounts {
-    let mut counts = TokenChangeCounts::default();
-    let mut old_lines: Vec<&str> = Vec::new();
-    let mut new_lines: Vec<&str> = Vec::new();
+fn classify_token_diff_ops(
+    expected_tokens: &[&str],
+    actual_tokens: &[&str],
+) -> ClassificationMetrics {
+    classify_token_diff_ops_detailed(expected_tokens, actual_tokens).0
+}
 
-    let flush =
-        |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, counts: &mut TokenChangeCounts| {
-            if old_lines.is_empty() && new_lines.is_empty() {
-                return;
+fn classify_token_diff_ops_detailed(
+    expected_tokens: &[&str],
+    actual_tokens: &[&str],
+) -> (ClassificationMetrics, TokenClassificationDetail) {
+    let mut metrics = ClassificationMetrics::default();
+    let mut detail = TokenClassificationDetail::default();
+
+    for operation in diff_tokens(expected_tokens, actual_tokens) {
+        match operation {
+            DiffOp::Equal {
+                old_start,
+                old_end,
+                new_start,
+                new_end,
+            } => {
+                metrics.true_positives += old_end - old_start;
+                for token in &expected_tokens[old_start..old_end] {
+                    detail.expected_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::TruePositive,
+                    });
+                }
+                for token in &actual_tokens[new_start..new_end] {
+                    detail.actual_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::TruePositive,
+                    });
+                }
             }
-
-            let old_text: String = old_lines
-                .iter()
-                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
-                .collect::<Vec<_>>()
-                .join("\n");
-
-            let new_text: String = new_lines
-                .iter()
-                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
-                .collect::<Vec<_>>()
-                .join("\n");
-
-            let old_tokens = tokenize(&old_text);
-            let new_tokens = tokenize(&new_text);
-            let ops = diff_tokens(&old_tokens, &new_tokens);
-
-            for op in ops {
-                match op {
-                    DiffOp::Equal(..) => {}
-                    DiffOp::Delete(start, end) => {
-                        counts.deleted_tokens += end - start;
-                    }
-                    DiffOp::Insert(start, end) => {
-                        counts.inserted_tokens += end - start;
-                    }
-                    DiffOp::Replace {
-                        old_start,
-                        old_end,
-                        new_start,
-                        new_end,
-                    } => {
-                        counts.deleted_tokens += old_end - old_start;
-                        counts.inserted_tokens += new_end - new_start;
-                    }
+            DiffOp::Delete(start, end) => {
+                metrics.false_negatives += end - start;
+                for token in &expected_tokens[start..end] {
+                    detail.expected_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::FalseNegative,
+                    });
+                }
+            }
+            DiffOp::Insert(start, end) => {
+                metrics.false_positives += end - start;
+                for token in &actual_tokens[start..end] {
+                    detail.actual_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::FalsePositive,
+                    });
+                }
+            }
+            DiffOp::Replace {
+                old_start,
+                old_end,
+                new_start,
+                new_end,
+            } => {
+                metrics.false_negatives += old_end - old_start;
+                metrics.false_positives += new_end - new_start;
+
+                for token in &expected_tokens[old_start..old_end] {
+                    detail.expected_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::FalseNegative,
+                    });
+                }
+                for token in &actual_tokens[new_start..new_end] {
+                    detail.actual_tokens.push(ClassifiedToken {
+                        token: (*token).to_string(),
+                        class: TokenClass::FalsePositive,
+                    });
                 }
             }
+        }
+    }
+
+    (metrics, detail)
+}
+
+fn classify_token_texts(expected_text: &str, actual_text: &str) -> ClassificationMetrics {
+    let expected_tokens = tokenize(expected_text);
+    let actual_tokens = tokenize(actual_text);
+    classify_token_diff_ops(&expected_tokens, &actual_tokens)
+}
+
+fn classify_token_texts_detailed(
+    expected_text: &str,
+    actual_text: &str,
+) -> (ClassificationMetrics, TokenClassificationDetail) {
+    let expected_tokens = tokenize(expected_text);
+    let actual_tokens = tokenize(actual_text);
+    classify_token_diff_ops_detailed(&expected_tokens, &actual_tokens)
+}
+
+fn strip_patch_line_prefix(line: &str) -> &str {
+    line.strip_prefix('-')
+        .or_else(|| line.strip_prefix('+'))
+        .unwrap_or(line)
+}
 
-            old_lines.clear();
-            new_lines.clear();
-        };
+fn extract_patch_change_blocks(patch: &str) -> Vec<(String, String)> {
+    let mut blocks = Vec::new();
+    let mut old_lines: Vec<&str> = Vec::new();
+    let mut new_lines: Vec<&str> = Vec::new();
+
+    let flush = |old_lines: &mut Vec<&str>,
+                 new_lines: &mut Vec<&str>,
+                 blocks: &mut Vec<(String, String)>| {
+        if old_lines.is_empty() && new_lines.is_empty() {
+            return;
+        }
+
+        let old_text = old_lines
+            .iter()
+            .map(|line| strip_patch_line_prefix(line))
+            .collect::<Vec<_>>()
+            .join("\n");
+
+        let new_text = new_lines
+            .iter()
+            .map(|line| strip_patch_line_prefix(line))
+            .collect::<Vec<_>>()
+            .join("\n");
+
+        blocks.push((old_text, new_text));
+        old_lines.clear();
+        new_lines.clear();
+    };
 
     for line in patch.lines() {
         if line.starts_with("---")
@@ -590,17 +702,102 @@ pub fn count_patch_token_changes(patch: &str) -> TokenChangeCounts {
             || line.starts_with("diff ")
             || line.starts_with("index ")
         {
-            flush(&mut old_lines, &mut new_lines, &mut counts);
+            flush(&mut old_lines, &mut new_lines, &mut blocks);
         } else if line.starts_with('-') {
             old_lines.push(line);
         } else if line.starts_with('+') {
             new_lines.push(line);
         } else {
-            flush(&mut old_lines, &mut new_lines, &mut counts);
+            flush(&mut old_lines, &mut new_lines, &mut blocks);
         }
     }
 
-    flush(&mut old_lines, &mut new_lines, &mut counts);
+    flush(&mut old_lines, &mut new_lines, &mut blocks);
+    blocks
+}
+
+fn collect_patch_side_text<F>(patch: &str, mut select_side: F) -> String
+where
+    F: FnMut(&(String, String)) -> &str,
+{
+    let mut text = String::new();
+
+    for block in extract_patch_change_blocks(patch) {
+        let block_text = select_side(&block);
+        if block_text.is_empty() {
+            continue;
+        }
+
+        if !text.is_empty() {
+            text.push('\n');
+        }
+        text.push_str(block_text);
+    }
+
+    text
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TokenMatchDebugReport {
+    pub expected_deleted_text: String,
+    pub actual_deleted_text: String,
+    pub expected_inserted_text: String,
+    pub actual_inserted_text: String,
+    pub deleted: TokenClassificationDetail,
+    pub inserted: TokenClassificationDetail,
+    pub metrics: ClassificationMetrics,
+}
+
+/// Computes token-match precision/recall counts between expected and actual patches.
+///
+/// Deletions and insertions are aligned independently, then their counts are summed.
+/// Tokenization uses `word_diff::tokenize`, so identifiers, whitespace runs, and punctuation
+/// are compared using the same token boundaries as the word-diff view.
+pub fn token_match(expected_patch: &str, actual_patch: &str) -> ClassificationMetrics {
+    token_match_debug_report(expected_patch, actual_patch).metrics
+}
+
+pub fn token_match_debug_report(expected_patch: &str, actual_patch: &str) -> TokenMatchDebugReport {
+    let expected_deleted =
+        collect_patch_side_text(expected_patch, |(old_text, _)| old_text.as_str());
+    let actual_deleted = collect_patch_side_text(actual_patch, |(old_text, _)| old_text.as_str());
+    let expected_inserted =
+        collect_patch_side_text(expected_patch, |(_, new_text)| new_text.as_str());
+    let actual_inserted = collect_patch_side_text(actual_patch, |(_, new_text)| new_text.as_str());
+
+    let (mut metrics, deleted_detail) =
+        classify_token_texts_detailed(&expected_deleted, &actual_deleted);
+    let (inserted_metrics, inserted_detail) =
+        classify_token_texts_detailed(&expected_inserted, &actual_inserted);
+    metrics.accumulate(&inserted_metrics);
+
+    TokenMatchDebugReport {
+        expected_deleted_text: expected_deleted,
+        actual_deleted_text: actual_deleted,
+        expected_inserted_text: expected_inserted,
+        actual_inserted_text: actual_inserted,
+        deleted: deleted_detail,
+        inserted: inserted_detail,
+        metrics,
+    }
+}
+
+/// Counts the number of inserted and deleted tokens in a unified diff patch.
+///
+/// Tokens are words and whitespace sequences (as defined by `word_diff::tokenize`).
+/// Within each hunk, the old (`-`) and new (`+`) lines are compared at the token level
+/// using an LCS-based diff, so modified lines only count the actually changed tokens
+/// rather than the entire line.
+
+pub fn count_patch_token_changes(patch: &str) -> TokenChangeCounts {
+    let mut counts = TokenChangeCounts::default();
+
+    for (old_text, new_text) in extract_patch_change_blocks(patch) {
+        let metrics = classify_token_texts(&old_text, &new_text);
+        counts.deleted_tokens += metrics.false_negatives;
+        counts.inserted_tokens += metrics.false_positives;
+    }
+
     counts
 }
 
@@ -961,6 +1158,173 @@ index abc123..def456 100644
         assert_eq!(metrics.false_negatives, 0);
     }
 
+    #[test]
+    fn test_token_match_perfect() {
+        let expected = indoc! {"
+            @@ -1,2 +1,4 @@
+            -str
+            +struct LanguageEntry {
+            +    path: PathBuf,
+            +}
+        "};
+
+        let actual = indoc! {"
+            @@ -1,2 +1,4 @@
+            -str
+            +struct LanguageEntry {
+            +    path: PathBuf,
+            +}
+        "};
+
+        let metrics = token_match(expected, actual);
+        assert_eq!(metrics.false_positives, 0);
+        assert_eq!(metrics.false_negatives, 0);
+        assert!(metrics.true_positives > 0);
+        assert!((metrics.precision() - 1.0).abs() < 1e-6);
+        assert!((metrics.recall() - 1.0).abs() < 1e-6);
+        assert!((metrics.f1() - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn test_token_match_partial_subset_keeps_high_precision() {
+        let expected = indoc! {"
+            @@ -1,2 +1,6 @@
+            -str
+            +struct LanguageEntry {
+            +    path: PathBuf,
+            +    language: OnceCell<Language>,
+            +    external_files: Option<Vec<PathBuf>>,
+            +}
+        "};
+
+        let actual = indoc! {"
+            @@ -1,2 +1,3 @@
+            -str
+            +struct LanguageEntry {
+            +}
+        "};
+
+        let metrics = token_match(expected, actual);
+        assert!(metrics.true_positives > 0);
+        assert_eq!(metrics.false_positives, 0);
+        assert!(metrics.false_negatives > 0);
+        assert!((metrics.precision() - 1.0).abs() < 1e-6);
+        assert!(metrics.recall() < 1.0);
+    }
+
+    #[test]
+    fn test_token_match_counts_wrong_tokens_as_fp_and_fn() {
+        let expected = indoc! {"
+            @@ -1,1 +1,1 @@
+            -old_name
+            +new_name
+        "};
+
+        let actual = indoc! {"
+            @@ -1,1 +1,1 @@
+            -different_old
+            +different_new
+        "};
+
+        let metrics = token_match(expected, actual);
+        assert_eq!(metrics.true_positives, 0);
+        assert!(metrics.false_positives > 0);
+        assert!(metrics.false_negatives > 0);
+    }
+
+    #[test]
+    fn test_token_match_debug_report_metrics_match_token_match() {
+        let expected = indoc! {"
+            @@ -1,2 +1,3 @@
+            -str
+            +struct LanguageEntry {
+            +}
+        "};
+
+        let actual = indoc! {"
+            @@ -1,2 +1,4 @@
+            -str
+            +struct LanguageEntry {
+            +    path: PathBuf,
+            +}
+        "};
+
+        let metrics = token_match(expected, actual);
+        let report = token_match_debug_report(expected, actual);
+
+        assert_eq!(report.metrics, metrics);
+
+        let expected_tp = report
+            .deleted
+            .expected_tokens
+            .iter()
+            .chain(report.inserted.expected_tokens.iter())
+            .filter(|token| token.class == TokenClass::TruePositive)
+            .count();
+        let expected_fn = report
+            .deleted
+            .expected_tokens
+            .iter()
+            .chain(report.inserted.expected_tokens.iter())
+            .filter(|token| token.class == TokenClass::FalseNegative)
+            .count();
+        let actual_tp = report
+            .deleted
+            .actual_tokens
+            .iter()
+            .chain(report.inserted.actual_tokens.iter())
+            .filter(|token| token.class == TokenClass::TruePositive)
+            .count();
+        let actual_fp = report
+            .deleted
+            .actual_tokens
+            .iter()
+            .chain(report.inserted.actual_tokens.iter())
+            .filter(|token| token.class == TokenClass::FalsePositive)
+            .count();
+
+        assert_eq!(expected_tp, report.metrics.true_positives);
+        assert_eq!(actual_tp, report.metrics.true_positives);
+        assert_eq!(expected_fn, report.metrics.false_negatives);
+        assert_eq!(actual_fp, report.metrics.false_positives);
+    }
+
+    #[test]
+    fn test_token_match_debug_report_marks_inserted_extra_tokens_as_fp() {
+        let expected = indoc! {"
+            @@ -1,1 +1,1 @@
+            -a
+            +value
+        "};
+
+        let actual = indoc! {"
+            @@ -1,1 +1,1 @@
+            -a
+            +value_extra
+        "};
+
+        let report = token_match_debug_report(expected, actual);
+
+        assert_eq!(report.metrics.false_positives, 1);
+        assert_eq!(report.metrics.false_negatives, 1);
+
+        assert!(
+            report
+                .inserted
+                .actual_tokens
+                .iter()
+                .any(|token| token.token == "value_extra"
+                    && token.class == TokenClass::FalsePositive)
+        );
+        assert!(
+            report
+                .inserted
+                .expected_tokens
+                .iter()
+                .any(|token| token.token == "value" && token.class == TokenClass::FalseNegative)
+        );
+    }
+
     #[test]
     fn test_is_editable_region_correct() {
         let patch = indoc! {"

crates/edit_prediction_cli/src/score.rs 🔗

@@ -71,6 +71,11 @@ pub async fn run_scoring(
         exact_lines_tp: 0,
         exact_lines_fp: 0,
         exact_lines_fn: 0,
+        token_match_tp: 0,
+        token_match_fp: 0,
+        token_match_fn: 0,
+        token_match_precision: 0.0,
+        token_match_recall: 0.0,
         reversal_ratio: 0.0,
         cursor_distance: None,
         cursor_exact_match: None,
@@ -100,10 +105,30 @@ pub async fn run_scoring(
 
         let token_changes = metrics::count_patch_token_changes(&actual_patch);
 
+        let best_exact_lines = expected_patches_with_cursors
+            .iter()
+            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
+            .max_by_key(|m| m.true_positives)
+            .unwrap_or_default();
+
+        let best_token_match = expected_patches_with_cursors
+            .iter()
+            .map(|(expected_patch, _)| metrics::token_match(expected_patch, &actual_patch))
+            .max_by(metrics::compare_classification_metrics)
+            .unwrap_or_default();
+
         let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
             Ok(text) => text,
             Err(_) => {
                 let mut s = zero_scores.clone();
+                s.exact_lines_tp = best_exact_lines.true_positives;
+                s.exact_lines_fp = best_exact_lines.false_positives;
+                s.exact_lines_fn = best_exact_lines.false_negatives;
+                s.token_match_tp = best_token_match.true_positives;
+                s.token_match_fp = best_token_match.false_positives;
+                s.token_match_fn = best_token_match.false_negatives;
+                s.token_match_precision = best_token_match.precision();
+                s.token_match_recall = best_token_match.recall();
                 s.inserted_tokens = token_changes.inserted_tokens;
                 s.deleted_tokens = token_changes.deleted_tokens;
                 scores.push(s);
@@ -151,13 +176,6 @@ pub async fn run_scoring(
         let disbalance_after = metrics::braces_disbalance(&actual_text);
         let braces_disbalance = disbalance_after.saturating_sub(disbalance_before);
 
-        // Compute exact lines match against best matching expected patch
-        let best_exact_lines = expected_patches_with_cursors
-            .iter()
-            .map(|(expected_patch, _)| metrics::exact_lines_match(expected_patch, &actual_patch))
-            .max_by_key(|m| m.true_positives)
-            .unwrap_or_default();
-
         // Compute reversal ratio
         let reversal_ratio = reversal_tracking::compute_prediction_reversal_ratio(
             prompt_inputs,
@@ -184,6 +202,11 @@ pub async fn run_scoring(
             exact_lines_tp: best_exact_lines.true_positives,
             exact_lines_fp: best_exact_lines.false_positives,
             exact_lines_fn: best_exact_lines.false_negatives,
+            token_match_tp: best_token_match.true_positives,
+            token_match_fp: best_token_match.false_positives,
+            token_match_fn: best_token_match.false_negatives,
+            token_match_precision: best_token_match.precision(),
+            token_match_recall: best_token_match.recall(),
             reversal_ratio,
             cursor_distance,
             cursor_exact_match,
@@ -253,6 +276,7 @@ pub fn print_report(examples: &[Example], verbose: bool) {
     let mut isolated_whitespace_count: usize = 0;
     let mut patch_inserted_tokens: Vec<usize> = Vec::new();
     let mut patch_deleted_tokens: Vec<usize> = Vec::new();
+    let mut total_token_match = ClassificationMetrics::default();
     let mut predictions_with_patch: usize = 0;
 
     let mut printed_lines: usize = 0;
@@ -317,6 +341,9 @@ pub fn print_report(examples: &[Example], verbose: bool) {
             total_exact_lines.true_positives += score.exact_lines_tp;
             total_exact_lines.false_positives += score.exact_lines_fp;
             total_exact_lines.false_negatives += score.exact_lines_fn;
+            total_token_match.true_positives += score.token_match_tp;
+            total_token_match.false_positives += score.token_match_fp;
+            total_token_match.false_negatives += score.token_match_fn;
 
             // Accumulate QA metrics
             if let Some(qa) = qa_result {
@@ -465,6 +492,16 @@ pub fn print_report(examples: &[Example], verbose: bool) {
             println!("Isolated whitespace changes: {}", isolated_ws_str);
         }
 
+        println!(
+            "Token match: P={:.1}% R={:.1}% F1={:.1}% (TP={}, FP={}, FN={})",
+            total_token_match.precision() * 100.0,
+            total_token_match.recall() * 100.0,
+            total_token_match.f1() * 100.0,
+            total_token_match.true_positives,
+            total_token_match.false_positives,
+            total_token_match.false_negatives,
+        );
+
         // Print token change percentile summary (only for predictions with a patch)
         if !patch_inserted_tokens.is_empty() {
             patch_inserted_tokens.sort_unstable();
@@ -547,6 +584,12 @@ pub struct SummaryJson {
     pub exact_lines_precision: f64,
     pub exact_lines_recall: f64,
     pub exact_lines_f1: f64,
+    pub token_match_tp: usize,
+    pub token_match_fp: usize,
+    pub token_match_fn: usize,
+    pub token_match_precision: f64,
+    pub token_match_recall: f64,
+    pub token_match_f1: f64,
     pub avg_reversal_ratio: f32,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub qa_avg_reverts_edits: Option<f32>,
@@ -570,6 +613,7 @@ pub fn compute_summary(examples: &[Example]) -> SummaryJson {
     let mut all_reversal_ratios = Vec::new();
     let mut braces_disbalance_sum: usize = 0;
     let mut total_exact_lines = ClassificationMetrics::default();
+    let mut total_token_match = ClassificationMetrics::default();
     let mut total_scores: usize = 0;
     let mut qa_reverts_count: usize = 0;
     let mut qa_reverts_total: usize = 0;
@@ -592,6 +636,9 @@ pub fn compute_summary(examples: &[Example]) -> SummaryJson {
             total_exact_lines.true_positives += score.exact_lines_tp;
             total_exact_lines.false_positives += score.exact_lines_fp;
             total_exact_lines.false_negatives += score.exact_lines_fn;
+            total_token_match.true_positives += score.token_match_tp;
+            total_token_match.false_positives += score.token_match_fp;
+            total_token_match.false_negatives += score.token_match_fn;
 
             // Accumulate QA metrics
             if let Some(Some(qa)) = example.qa.get(score_idx) {
@@ -704,6 +751,12 @@ pub fn compute_summary(examples: &[Example]) -> SummaryJson {
         exact_lines_precision: total_exact_lines.precision(),
         exact_lines_recall: total_exact_lines.recall(),
         exact_lines_f1: total_exact_lines.f1(),
+        token_match_tp: total_token_match.true_positives,
+        token_match_fp: total_token_match.false_positives,
+        token_match_fn: total_token_match.false_negatives,
+        token_match_precision: total_token_match.precision(),
+        token_match_recall: total_token_match.recall(),
+        token_match_f1: total_token_match.f1(),
         avg_reversal_ratio,
         qa_avg_reverts_edits,
         qa_avg_confidence,

crates/edit_prediction_cli/src/token_match_debug.rs 🔗

@@ -0,0 +1,404 @@
+use crate::{example::read_example_files, metrics};
+use anyhow::Context as _;
+use clap::Args;
+use std::fmt::Write as _;
+use std::path::PathBuf;
+
+#[derive(Args, Debug, Clone)]
+#[command(
+    about = "Generate token-match debug HTML for expected vs predicted patches",
+    after_help = r#"EXAMPLES:
+  # Debug all examples from a jsonl dataset
+  ep token-match-debug examples.jsonl
+
+  # Write HTML files to a specific directory
+  ep token-match-debug examples.jsonl --output-dir out/token-debug
+
+  # Keep only the best expected patch per prediction
+  ep token-match-debug examples.jsonl --best-only
+
+  # Limit generated files
+  ep token-match-debug examples.jsonl --limit 50
+"#
+)]
+pub struct TokenMatchDebugArgs {
+    /// Directory where HTML reports are written.
+    #[arg(long, default_value = "token-match-debug")]
+    pub output_dir: PathBuf,
+
+    /// Only emit one report per prediction (best matching expected patch).
+    #[arg(long, default_value_t = false)]
+    pub best_only: bool,
+
+    /// Maximum number of reports to write.
+    #[arg(long)]
+    pub limit: Option<usize>,
+}
+
+pub fn run_token_match_debug(args: &TokenMatchDebugArgs, inputs: &[PathBuf]) -> anyhow::Result<()> {
+    let stdin_path = PathBuf::from("-");
+    let inputs = if inputs.is_empty() {
+        std::slice::from_ref(&stdin_path)
+    } else {
+        inputs
+    };
+
+    let examples = read_example_files(inputs);
+    std::fs::create_dir_all(&args.output_dir).with_context(|| {
+        format!(
+            "failed to create output directory '{}'",
+            args.output_dir.display()
+        )
+    })?;
+
+    let mut written = 0usize;
+    for example in &examples {
+        let expected_patches = example.spec.expected_patches_with_cursor_positions();
+        if expected_patches.is_empty() || example.predictions.is_empty() {
+            continue;
+        }
+
+        for (prediction_index, prediction) in example.predictions.iter().enumerate() {
+            let Some(actual_patch) = prediction.actual_patch.as_deref() else {
+                continue;
+            };
+            if actual_patch.trim().is_empty() {
+                continue;
+            }
+
+            if args.best_only {
+                if let Some((expected_index, report)) =
+                    best_expected_patch_report(&expected_patches, actual_patch)
+                {
+                    let html = render_report_html(
+                        &example.spec.name,
+                        prediction_index,
+                        expected_index,
+                        &expected_patches[expected_index].0,
+                        actual_patch,
+                        &report,
+                    );
+
+                    let path = args.output_dir.join(report_filename(
+                        &example.spec.filename(),
+                        prediction_index,
+                        expected_index,
+                    ));
+                    std::fs::write(&path, html)
+                        .with_context(|| format!("failed to write report '{}'", path.display()))?;
+                    written += 1;
+                    if args.limit.is_some_and(|limit| written >= limit) {
+                        eprintln!(
+                            "Wrote {} report(s) to {}",
+                            written,
+                            args.output_dir.display()
+                        );
+                        return Ok(());
+                    }
+                }
+                continue;
+            }
+
+            for (expected_index, (expected_patch, _)) in expected_patches.iter().enumerate() {
+                let report = metrics::token_match_debug_report(expected_patch, actual_patch);
+                let html = render_report_html(
+                    &example.spec.name,
+                    prediction_index,
+                    expected_index,
+                    expected_patch,
+                    actual_patch,
+                    &report,
+                );
+                let path = args.output_dir.join(report_filename(
+                    &example.spec.filename(),
+                    prediction_index,
+                    expected_index,
+                ));
+
+                std::fs::write(&path, html)
+                    .with_context(|| format!("failed to write report '{}'", path.display()))?;
+                written += 1;
+
+                if args.limit.is_some_and(|limit| written >= limit) {
+                    eprintln!(
+                        "Wrote {} report(s) to {}",
+                        written,
+                        args.output_dir.display()
+                    );
+                    return Ok(());
+                }
+            }
+        }
+    }
+
+    eprintln!(
+        "Wrote {} report(s) to {}",
+        written,
+        args.output_dir.display()
+    );
+    Ok(())
+}
+
+fn best_expected_patch_report(
+    expected_patches: &[(String, Option<usize>)],
+    actual_patch: &str,
+) -> Option<(usize, metrics::TokenMatchDebugReport)> {
+    let mut best: Option<(usize, metrics::TokenMatchDebugReport)> = None;
+    for (index, (expected_patch, _)) in expected_patches.iter().enumerate() {
+        let report = metrics::token_match_debug_report(expected_patch, actual_patch);
+        match &best {
+            Some((_, current)) => {
+                if metrics::compare_classification_metrics(&report.metrics, &current.metrics)
+                    .is_gt()
+                {
+                    best = Some((index, report));
+                }
+            }
+            None => best = Some((index, report)),
+        }
+    }
+    best
+}
+
+fn report_filename(example_name: &str, prediction_index: usize, expected_index: usize) -> String {
+    format!(
+        "{}__prediction-{}__expected-{}.html",
+        example_name, prediction_index, expected_index
+    )
+}
+
+fn render_report_html(
+    example_name: &str,
+    prediction_index: usize,
+    expected_index: usize,
+    expected_patch: &str,
+    actual_patch: &str,
+    report: &metrics::TokenMatchDebugReport,
+) -> String {
+    let mut html = String::new();
+
+    let precision = report.metrics.precision() * 100.0;
+    let recall = report.metrics.recall() * 100.0;
+    let f1 = report.metrics.f1() * 100.0;
+
+    let _ = write!(
+        html,
+        r#"<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Token Match Debug</title>
+<style>
+:root {{
+  color-scheme: light dark;
+  --bg: #0f1115;
+  --panel: #161a22;
+  --muted: #9ca3af;
+  --text: #e5e7eb;
+  --tp: #22c55e33;
+  --fp: #ef444433;
+  --fn: #f59e0b33;
+  --border: #2a3140;
+}}
+* {{ box-sizing: border-box; }}
+body {{
+  margin: 0;
+  font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+  background: var(--bg);
+  color: var(--text);
+}}
+main {{
+  max-width: 1400px;
+  margin: 0 auto;
+  padding: 20px;
+}}
+h1, h2, h3 {{
+  margin: 0 0 10px;
+}}
+.meta {{
+  color: var(--muted);
+  margin-bottom: 16px;
+}}
+.grid {{
+  display: grid;
+  gap: 16px;
+}}
+.grid.two {{
+  grid-template-columns: repeat(2, minmax(0, 1fr));
+}}
+.panel {{
+  border: 1px solid var(--border);
+  background: var(--panel);
+  border-radius: 10px;
+  padding: 12px;
+}}
+.metrics {{
+  display: flex;
+  gap: 18px;
+  flex-wrap: wrap;
+}}
+.metric {{
+  min-width: 160px;
+}}
+.metric .label {{
+  color: var(--muted);
+  font-size: 12px;
+}}
+.metric .value {{
+  font-weight: 700;
+  font-size: 20px;
+}}
+pre {{
+  white-space: pre-wrap;
+  word-break: break-word;
+  margin: 0;
+  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
+  font-size: 12px;
+  line-height: 1.5;
+}}
+.legend {{
+  display: flex;
+  gap: 12px;
+  font-size: 12px;
+  color: var(--muted);
+  flex-wrap: wrap;
+}}
+.swatch {{
+  display: inline-block;
+  width: 12px;
+  height: 12px;
+  border-radius: 2px;
+  margin-right: 4px;
+  vertical-align: middle;
+}}
+.tp {{ background: var(--tp); }}
+.fp {{ background: var(--fp); }}
+.fn {{ background: var(--fn); }}
+.token.tp {{ background: var(--tp); }}
+.token.fp {{ background: var(--fp); }}
+.token.fn {{ background: var(--fn); }}
+.token {{
+  border-radius: 3px;
+}}
+.section-title {{
+  margin-bottom: 8px;
+  color: var(--muted);
+  font-size: 12px;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+}}
+</style>
+</head>
+<body>
+<main>
+  <h1>Token Match Debug</h1>
+  <p class="meta">Example: {example_name} ยท Prediction #{prediction_index} ยท Expected Patch #{expected_index}</p>
+
+  <section class="panel">
+    <div class="metrics">
+      <div class="metric"><div class="label">Precision</div><div class="value">{precision:.1}%</div></div>
+      <div class="metric"><div class="label">Recall</div><div class="value">{recall:.1}%</div></div>
+      <div class="metric"><div class="label">F1</div><div class="value">{f1:.1}%</div></div>
+      <div class="metric"><div class="label">TP</div><div class="value">{tp}</div></div>
+      <div class="metric"><div class="label">FP</div><div class="value">{fp}</div></div>
+      <div class="metric"><div class="label">FN</div><div class="value">{false_negatives}</div></div>
+    </div>
+    <div class="legend" style="margin-top: 10px;">
+      <span><span class="swatch tp"></span>True Positive</span>
+      <span><span class="swatch fp"></span>False Positive</span>
+      <span><span class="swatch fn"></span>False Negative</span>
+    </div>
+  </section>
+
+  <div class="grid two" style="margin-top: 16px;">
+    <section class="panel">
+      <div class="section-title">Expected patch</div>
+      <pre>{expected_patch}</pre>
+    </section>
+    <section class="panel">
+      <div class="section-title">Actual patch</div>
+      <pre>{actual_patch}</pre>
+    </section>
+  </div>
+
+  <div class="grid two" style="margin-top: 16px;">
+    <section class="panel">
+      <h3>Deleted-side token alignment</h3>
+      <div class="section-title">Expected deleted text</div>
+      <pre>{expected_deleted_text}</pre>
+      <div class="section-title" style="margin-top: 10px;">Actual deleted text</div>
+      <pre>{actual_deleted_text}</pre>
+      <div class="section-title" style="margin-top: 10px;">Expected deleted tokens (FN highlighted)</div>
+      <pre>{deleted_expected_tokens}</pre>
+      <div class="section-title" style="margin-top: 10px;">Actual deleted tokens (FP highlighted)</div>
+      <pre>{deleted_actual_tokens}</pre>
+    </section>
+
+    <section class="panel">
+      <h3>Inserted-side token alignment</h3>
+      <div class="section-title">Expected inserted text</div>
+      <pre>{expected_inserted_text}</pre>
+      <div class="section-title" style="margin-top: 10px;">Actual inserted text</div>
+      <pre>{actual_inserted_text}</pre>
+      <div class="section-title" style="margin-top: 10px;">Expected inserted tokens (FN highlighted)</div>
+      <pre>{inserted_expected_tokens}</pre>
+      <div class="section-title" style="margin-top: 10px;">Actual inserted tokens (FP highlighted)</div>
+      <pre>{inserted_actual_tokens}</pre>
+    </section>
+  </div>
+</main>
+</body>
+</html>"#,
+        example_name = escape_html(example_name),
+        prediction_index = prediction_index,
+        expected_index = expected_index,
+        precision = precision,
+        recall = recall,
+        f1 = f1,
+        tp = report.metrics.true_positives,
+        fp = report.metrics.false_positives,
+        false_negatives = report.metrics.false_negatives,
+        expected_patch = escape_html(expected_patch),
+        actual_patch = escape_html(actual_patch),
+        expected_deleted_text = escape_html(&report.expected_deleted_text),
+        actual_deleted_text = escape_html(&report.actual_deleted_text),
+        expected_inserted_text = escape_html(&report.expected_inserted_text),
+        actual_inserted_text = escape_html(&report.actual_inserted_text),
+        deleted_expected_tokens = render_classified_tokens(&report.deleted.expected_tokens),
+        deleted_actual_tokens = render_classified_tokens(&report.deleted.actual_tokens),
+        inserted_expected_tokens = render_classified_tokens(&report.inserted.expected_tokens),
+        inserted_actual_tokens = render_classified_tokens(&report.inserted.actual_tokens),
+    );
+
+    html
+}
+
+fn render_classified_tokens(tokens: &[metrics::ClassifiedToken]) -> String {
+    let mut result = String::new();
+    for token in tokens {
+        let class = match token.class {
+            metrics::TokenClass::TruePositive => "tp",
+            metrics::TokenClass::FalsePositive => "fp",
+            metrics::TokenClass::FalseNegative => "fn",
+        };
+        let escaped = escape_html(&token.token);
+        let _ = write!(result, r#"<span class="token {class}">{escaped}</span>"#);
+    }
+    result
+}
+
+fn escape_html(input: &str) -> String {
+    let mut result = String::with_capacity(input.len());
+    for character in input.chars() {
+        match character {
+            '&' => result.push_str("&amp;"),
+            '<' => result.push_str("&lt;"),
+            '>' => result.push_str("&gt;"),
+            '"' => result.push_str("&quot;"),
+            '\'' => result.push_str("&#39;"),
+            _ => result.push(character),
+        }
+    }
+    result
+}

crates/edit_prediction_cli/src/word_diff.rs ๐Ÿ”—

@@ -85,8 +85,10 @@ fn compute_word_diff(old_text: &str, new_text: &str) -> String {
 
     for op in ops {
         match op {
-            DiffOp::Equal(start, end) => {
-                for token in &old_words[start..end] {
+            DiffOp::Equal {
+                old_start, old_end, ..
+            } => {
+                for token in &old_words[old_start..old_end] {
                     result.push_str(token);
                 }
             }
@@ -178,7 +180,12 @@ pub(crate) fn tokenize(text: &str) -> Vec<&str> {
 
 #[derive(Debug)]
 pub(crate) enum DiffOp {
-    Equal(usize, usize),
+    Equal {
+        old_start: usize,
+        old_end: usize,
+        new_start: usize,
+        new_end: usize,
+    },
     Delete(usize, usize),
     Insert(usize, usize),
     Replace {
@@ -199,7 +206,12 @@ pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
             let old_range = op.old_range();
             let new_range = op.new_range();
             match tag {
-                DiffTag::Equal => DiffOp::Equal(old_range.start, old_range.end),
+                DiffTag::Equal => DiffOp::Equal {
+                    old_start: old_range.start,
+                    old_end: old_range.end,
+                    new_start: new_range.start,
+                    new_end: new_range.end,
+                },
                 DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
                 DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
                 DiffTag::Replace => DiffOp::Replace {