From a53017cc36d3f00d7abb7044e7e34acf7875024a Mon Sep 17 00:00:00 2001
From: Oleksiy Syvokon
Date: Wed, 21 Jan 2026 13:15:25 +0200
Subject: [PATCH] ep: Don't compute n-grams for regions that match exactly
 (#47281)

This makes delta-chrF computation ~50x faster.

Release Notes:

- N/A
---
 crates/edit_prediction_cli/src/metrics.rs | 282 ++++++++++++++++++++++
 1 file changed, 282 insertions(+)

diff --git a/crates/edit_prediction_cli/src/metrics.rs b/crates/edit_prediction_cli/src/metrics.rs
index 2dc767d683fe3ebf15abd462b3f6ecc0f986742f..148c1ec7cf4ad54180aa521269ea2e243875c96a 100644
--- a/crates/edit_prediction_cli/src/metrics.rs
+++ b/crates/edit_prediction_cli/src/metrics.rs
@@ -3,6 +3,9 @@ use collections::HashMap;
 type Counts = HashMap<String, usize>;
 type CountsDelta = HashMap<String, i64>;
 
+/// Context characters needed on each side of a change to capture all affected n-grams
+const CONTEXT_CHARS: usize = CHR_F_CHAR_ORDER - 1;
+
 #[derive(Default, Debug, Clone)]
 struct ClassificationMetrics {
     true_positives: usize,
@@ -80,6 +83,62 @@ pub fn delta_chr_f(original: &str, expected: &str, actual: &str) -> f64 {
         return 100.0;
     }
 
+    // Pre-filter whitespace once for all texts
+    let orig_chars: Vec<char> = filter_whitespace_chars(original);
+    let exp_chars: Vec<char> = filter_whitespace_chars(expected);
+    let act_chars: Vec<char> = filter_whitespace_chars(actual);
+
+    // Find the changed regions between original→expected and original→actual
+    // We only need to compute n-grams on these regions (plus context for boundary n-grams)
+    let (orig_for_exp, exp_region) = extract_changed_regions(&orig_chars, &exp_chars);
+    let (orig_for_act, act_region) = extract_changed_regions(&orig_chars, &act_chars);
+
+    let mut total_precision = 0.0;
+    let mut total_recall = 0.0;
+
+    for order in 1..=CHR_F_CHAR_ORDER {
+        // Compute n-grams only on the affected regions
+        let orig_ngrams_for_exp = count_ngrams_from_chars(&orig_for_exp, order);
+        let exp_ngrams = count_ngrams_from_chars(&exp_region, order);
+        let expected_delta = compute_ngram_delta(&exp_ngrams, &orig_ngrams_for_exp);
+
+        let orig_ngrams_for_act = count_ngrams_from_chars(&orig_for_act, order);
+        let act_ngrams = count_ngrams_from_chars(&act_region, order);
+        let actual_delta = compute_ngram_delta(&act_ngrams, &orig_ngrams_for_act);
+
+        if expected_delta.is_empty() && actual_delta.is_empty() {
+            total_precision += 1.0;
+            total_recall += 1.0;
+            continue;
+        }
+
+        let expected_counts = ngram_delta_to_counts(&expected_delta);
+        let actual_counts = ngram_delta_to_counts(&actual_delta);
+
+        let score = ClassificationMetrics::from_counts(&expected_counts, &actual_counts);
+        total_precision += score.precision();
+        total_recall += score.recall();
+    }
+
+    let prec = total_precision / CHR_F_CHAR_ORDER as f64;
+    let recall = total_recall / CHR_F_CHAR_ORDER as f64;
+    let f_score = if prec + recall == 0.0 {
+        0.0
+    } else {
+        (1.0 + CHR_F_BETA * CHR_F_BETA) * prec * recall / (CHR_F_BETA * CHR_F_BETA * prec + recall)
+    };
+
+    f_score * 100.0
+}
+
+/// Reference implementation of delta_chr_f (original, non-optimized version).
+/// Used for testing that the optimized version produces identical results.
+#[cfg(test)]
+fn delta_chr_f_reference(original: &str, expected: &str, actual: &str) -> f64 {
+    if original == expected && expected == actual {
+        return 100.0;
+    }
+
     let original_ngrams = chr_f_ngram_counts(original);
     let expected_ngrams = chr_f_ngram_counts(expected);
     let actual_ngrams = chr_f_ngram_counts(actual);
@@ -116,6 +175,83 @@ pub fn delta_chr_f(original: &str, expected: &str, actual: &str) -> f64 {
     f_score * 100.0
 }
 
+/// Filter whitespace from a string and return as Vec<char>
+fn filter_whitespace_chars(text: &str) -> Vec<char> {
+    match CHR_F_WHITESPACE {
+        ChrfWhitespace::Unchanged => text.chars().collect(),
+        ChrfWhitespace::Ignore => text.chars().filter(|c| !c.is_whitespace()).collect(),
+    }
+}
+
+/// Extract only the changed regions between two texts, with context for n-gram boundaries.
+///
+/// Returns (original_affected_region, modified_affected_region) as Vec<char>.
+///
+/// The key insight: when computing n-gram delta between two nearly-identical texts,
+/// n-grams from unchanged regions cancel out. We only need to process:
+/// 1. The changed content itself
+/// 2. CONTEXT_CHARS (n-1) characters before and after, to capture boundary-crossing n-grams
+fn extract_changed_regions(original: &[char], modified: &[char]) -> (Vec<char>, Vec<char>) {
+    // Find longest common prefix
+    let prefix_len = original
+        .iter()
+        .zip(modified.iter())
+        .take_while(|(a, b)| a == b)
+        .count();
+
+    // Find longest common suffix (that doesn't overlap with prefix)
+    let orig_remaining = original.len().saturating_sub(prefix_len);
+    let mod_remaining = modified.len().saturating_sub(prefix_len);
+    let max_suffix = orig_remaining.min(mod_remaining);
+
+    let suffix_len = original
+        .iter()
+        .rev()
+        .zip(modified.iter().rev())
+        .take(max_suffix)
+        .take_while(|(a, b)| a == b)
+        .count();
+
+    // Calculate the changed region boundaries
+    let orig_change_start = prefix_len;
+    let orig_change_end = original.len().saturating_sub(suffix_len);
+    let mod_change_start = prefix_len;
+    let mod_change_end = modified.len().saturating_sub(suffix_len);
+
+    // If there's no actual change, return empty regions
+    if orig_change_start >= orig_change_end && mod_change_start >= mod_change_end {
+        return (Vec::new(), Vec::new());
+    }
+
+    // Expand to include context for n-gram boundaries
+    let orig_context_start = orig_change_start.saturating_sub(CONTEXT_CHARS);
+    let orig_context_end = (orig_change_end + CONTEXT_CHARS).min(original.len());
+    let mod_context_start = mod_change_start.saturating_sub(CONTEXT_CHARS);
+    let mod_context_end = (mod_change_end + CONTEXT_CHARS).min(modified.len());
+
+    let orig_region: Vec<char> = original[orig_context_start..orig_context_end].to_vec();
+    let mod_region: Vec<char> = modified[mod_context_start..mod_context_end].to_vec();
+
+    (orig_region, mod_region)
+}
+
+/// Count n-grams directly from a char slice (avoids String allocation for the full text)
+fn count_ngrams_from_chars(chars: &[char], n: usize) -> Counts {
+    let mut counts = Counts::default();
+
+    if chars.len() < n {
+        return counts;
+    }
+
+    for window in chars.windows(n) {
+        let ngram: String = window.iter().collect();
+        *counts.entry(ngram).or_insert(0) += 1;
+    }
+
+    counts
+}
+
+#[allow(dead_code)]
 fn chr_f_ngram_counts(text: &str) -> Vec<Counts> {
     // Ignore whitespace. The original chrF implementation skips all
     // whitespace. We should consider compressing multiple consecutive
@@ -168,6 +304,7 @@ fn ngram_delta_to_counts(delta: &CountsDelta) -> Counts {
     counts
 }
 
+#[allow(dead_code)]
 fn count_ngrams(text: &str, n: usize) -> Counts {
     let chars: Vec<char> = text.chars().collect();
     let mut counts = Counts::default();
@@ -180,6 +317,151 @@
     counts
 }
 
+#[cfg(test)]
+mod test_optimization {
+    use super::*;
+
+    #[test]
+    fn test_extract_changed_regions_simple() {
+        let original: Vec<char> = "hello world".chars().collect();
+        let modified: Vec<char> = "hello there".chars().collect();
+
+        let (orig_region, mod_region) = extract_changed_regions(&original, &modified);
+
+        // "world" vs "there" - with 5 chars context, we get "ello world" vs "ello there"
+        // (or less if not enough chars available)
+        assert!(orig_region.len() < original.len());
+        assert!(mod_region.len() < modified.len());
+    }
+
+    #[test]
+    fn test_extract_changed_regions_insertion() {
+        let original: Vec<char> = "abcdef".chars().collect();
+        let modified: Vec<char> = "abcXYZdef".chars().collect();
+
+        let (orig_region, mod_region) = extract_changed_regions(&original, &modified);
+
+        // The insertion is between c and d, so we need context around that point
+        assert!(orig_region.len() <= original.len());
+        assert!(mod_region.iter().collect::<String>().contains("XYZ"));
+    }
+
+    #[test]
+    fn test_extract_changed_regions_identical() {
+        let text: Vec<char> = "identical text".chars().collect();
+
+        let (orig_region, mod_region) =
extract_changed_regions(&text, &text); + + // When texts are identical, regions should be empty + assert!(orig_region.is_empty()); + assert!(mod_region.is_empty()); + } + + #[test] + fn test_optimized_matches_original_score() { + // Test that our optimized version produces the same results + let test_cases = vec![ + ("hello world", "hello there", "hello world"), + ( + "fn main() {}", + "fn main() { println!(); }", + "fn main() { print!(); }", + ), + ("abcdefghij", "abcXXXghij", "abcYYghij"), + ("unchanged", "unchanged", "unchanged"), + ( + "prefix middle suffix", + "prefix CHANGED suffix", + "prefix middle suffix", + ), + ]; + + for (original, expected, actual) in test_cases { + let score = delta_chr_f(original, expected, actual); + // Just verify it produces a reasonable score (0-100) + assert!( + score >= 0.0 && score <= 100.0, + "Score {} out of range for ({}, {}, {})", + score, + original, + expected, + actual + ); + } + } + + #[test] + fn test_optimized_equals_reference() { + // Comprehensive test that optimized version matches reference implementation exactly + let test_cases = vec![ + // Basic cases + ("hello world", "hello there", "hello world"), + ("hello world", "hello there", "hello there"), + ("unchanged", "unchanged", "unchanged"), + // Code-like cases + ( + "fn main() { println!(\"Hello\"); }", + "fn main() { println!(\"Hello, World!\"); }", + "fn main() { println!(\"Hello, World!\"); }", + ), + ( + "fn main() { println!(\"Hello\"); }", + "fn main() { println!(\"Hello, World!\"); }", + "fn main() { println!(\"Goodbye\"); }", + ), + // Insertion + ("abcdef", "abcXYZdef", "abcdef"), + ("abcdef", "abcXYZdef", "abcXYZdef"), + ("abcdef", "abcXYZdef", "abcABCdef"), + // Deletion + ("abcXYZdef", "abcdef", "abcXYZdef"), + ("abcXYZdef", "abcdef", "abcdef"), + // Multiple changes (simulated by different expected/actual) + ("one two three four", "one THREE four", "one two FOUR"), + // Edge cases + ("a", "b", "c"), + ("", "abc", ""), + ("abc", "", "abc"), + // 
Longer text with small change + ( + "This is a longer piece of text that contains many words and characters to process", + "This is a longer piece of TEXT that contains many words and characters to process", + "This is a longer piece of text that contains many words and characters to process", + ), + // Change at the beginning + ( + "ORIGINAL start of text", + "NEW start of text", + "DIFFERENT start of text", + ), + // Change at the end + ( + "text ending ORIGINAL", + "text ending NEW", + "text ending DIFFERENT", + ), + // Whitespace (should be ignored) + ("hello world", "hello there", "hello world"), + ("a b c d", "a X c d", "a Y c d"), + ]; + + for (original, expected, actual) in test_cases { + let optimized_score = delta_chr_f(original, expected, actual); + let reference_score = delta_chr_f_reference(original, expected, actual); + + assert!( + (optimized_score - reference_score).abs() < 1e-10, + "Mismatch for ({:?}, {:?}, {:?}):\n optimized: {}\n reference: {}", + original, + expected, + actual, + optimized_score, + reference_score + ); + } + } +} + #[cfg(test)] mod test { use super::*;