// Reconstructed post-patch code from a flattened diff of
// `crates/edit_prediction_cli/src/metrics.rs` (hunks `@@ -76,14 +76,21 @@` and
// `@@ -196,9 +203,34 @@`). Generic arguments that had been stripped from the text
// (`Vec<char>`, `impl Iterator<Item = char>`) are restored.

/// Whitespace policy selected by [`CHR_F_WHITESPACE`] and applied by
/// [`filter_whitespace_chars`].
enum ChrfWhitespace {
    /// Preserve whitespace as-is.
    #[allow(unused)]
    Unchanged,

    /// Ignore all whitespace differences (every whitespace character is dropped).
    #[allow(unused)]
    Ignore,

    /// Collapse each run of whitespace into a single character; see
    /// [`collapse_whitespace`] for the exact rules.
    Collapse,
}

const CHR_F_CHAR_ORDER: usize = 6;
const CHR_F_BETA: f64 = 2.0;
/// Active whitespace policy. The patch switches this from `ChrfWhitespace::Ignore`
/// to `ChrfWhitespace::Collapse`.
const CHR_F_WHITESPACE: ChrfWhitespace = ChrfWhitespace::Collapse;

/// Returns the characters of `text` after applying the [`CHR_F_WHITESPACE`] policy.
fn filter_whitespace_chars(text: &str) -> Vec<char> {
    match CHR_F_WHITESPACE {
        ChrfWhitespace::Unchanged => text.chars().collect(),
        ChrfWhitespace::Ignore => text.chars().filter(|c| !c.is_whitespace()).collect(),
        ChrfWhitespace::Collapse => collapse_whitespace(text.chars()),
    }
}

/// Collapses runs of whitespace, treating newlines separately from other whitespace.
///
/// - Each run of consecutive `'\n'` becomes a single `'\n'`.
/// - Each run of any other whitespace (as defined by [`char::is_whitespace`], so tabs
///   and `'\r'` are included) becomes a single `' '`.
///
/// Adjacent runs of different kinds are both kept, so `" \n "` yields
/// `[' ', '\n', ' ']`. Leading and trailing whitespace is collapsed but not trimmed.
/// Non-whitespace characters are passed through unchanged.
fn collapse_whitespace(chars: impl Iterator<Item = char>) -> Vec<char> {
    let mut result = Vec::new();
    for c in chars {
        // Map every whitespace character onto its run representative.
        let normalized = match c {
            '\n' => '\n',
            other if other.is_whitespace() => ' ',
            other => other,
        };
        // `' '` and `'\n'` can only enter `result` as run representatives, so seeing
        // the same one at the tail means the current run has already been emitted.
        let is_whitespace = normalized == ' ' || normalized == '\n';
        if is_whitespace && result.last() == Some(&normalized) {
            continue;
        }
        result.push(normalized);
    }
    result
}
// NOTE(review): this span was a flattened diff (hunks `@@ -269,15 +301,15 @@` and
// `@@ -1175,4 +1207,14 @@`). The first hunk edits `chr_f_ngram_counts`, whose body
// continues past the visible context (it stops at `(1..=CHR_F_CHAR_ORDER)`), so that
// function is not reproduced here. The hunk removes the old "Ignore whitespace ..."
// comment and adds this arm to the `match CHR_F_WHITESPACE`:
//     ChrfWhitespace::Collapse => collapse_whitespace(text.chars())
//         .into_iter()
//         .collect::<String>(),
// The `<String>` turbofish is restored from context: the sibling `Unchanged` arm
// yields `text.to_string()`.

/// Checks that `collapse_whitespace` folds space runs and newline runs independently.
#[test]
fn test_whitespace_collapse() {
    let text = "abc \n\n\n 123";
    let collapsed = collapse_whitespace(text.chars());
    assert_eq!(
        collapsed,
        vec!['a', 'b', 'c', ' ', '\n', ' ', '1', '2', '3']
    );

    // Runs of several spaces, tabs and carriage returns fold into one space.
    assert_eq!(
        collapse_whitespace("a  \t\r b".chars()),
        vec!['a', ' ', 'b']
    );

    // Alternating space/newline runs are each kept, not merged together.
    assert_eq!(
        collapse_whitespace(" \n \n".chars()),
        vec![' ', '\n', ' ', '\n']
    );

    // Empty input yields an empty result.
    assert!(collapse_whitespace("".chars()).is_empty());
}