/// What [`MarkdownEscaper::next`] decided to emit in place of one input byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EscapeAction {
    /// Emit the input unchanged.
    PassThrough,
    /// Emit this many U+00A0 (no-break space) characters instead of the input.
    Nbsp(usize),
    /// Emit two newlines instead of one.
    DoubleNewline,
    /// Emit a backslash followed by the input.
    PrefixBackslash,
}

impl EscapeAction {
    const NBSP: char = '\u{00A0}';

    /// Number of UTF-8 bytes this action produces for a single input byte.
    ///
    /// For `PassThrough` this is the input byte itself, so the figure only
    /// matches [`Self::write_to`] when that is called with an ASCII `char`.
    fn output_len(&self) -> usize {
        match self {
            Self::PassThrough => 1,
            Self::Nbsp(count) => count * Self::NBSP.len_utf8(),
            Self::DoubleNewline => 2,
            Self::PrefixBackslash => 2,
        }
    }

    /// Appends the replacement for `c` to `output`.
    fn write_to(&self, c: char, output: &mut String) {
        match self {
            Self::PassThrough => output.push(c),
            Self::Nbsp(count) => output.extend((0..*count).map(|_| Self::NBSP)),
            Self::DoubleNewline => output.push_str("\n\n"),
            Self::PrefixBackslash => {
                // '\\' is a single backslash in Rust, e.g. '|' -> '\|'
                output.push('\\');
                output.push(c);
            }
        }
    }
}

/// Byte-driven state machine that decides how each byte of a string is escaped.
///
/// Valid to operate on raw bytes since multi-byte UTF-8 sequences never
/// contain ASCII-range bytes: every byte that triggers a rewrite is a complete
/// ASCII character, and every other byte is passed through untouched.
struct MarkdownEscaper {
    /// True at the start of the input and after a newline, until the first
    /// byte that is neither a space nor a tab.
    in_leading_whitespace: bool,
}

impl MarkdownEscaper {
    /// Number of no-break spaces a leading tab expands to.
    const TAB_SIZE: usize = 4;

    fn new() -> Self {
        Self {
            in_leading_whitespace: true,
        }
    }

    /// Classifies `byte` and advances the leading-whitespace state.
    ///
    /// Must be fed the UTF-8 bytes of the input in order; feeding anything
    /// else (for example a `char` truncated to `u8`) desynchronises the state.
    fn next(&mut self, byte: u8) -> EscapeAction {
        let action = match byte {
            b'\t' if self.in_leading_whitespace => EscapeAction::Nbsp(Self::TAB_SIZE),
            b' ' if self.in_leading_whitespace => EscapeAction::Nbsp(1),
            b'\n' => EscapeAction::DoubleNewline,
            _ if byte.is_ascii_punctuation() => EscapeAction::PrefixBackslash,
            _ => EscapeAction::PassThrough,
        };

        self.in_leading_whitespace = match byte {
            b'\n' => true,
            b' ' | b'\t' => self.in_leading_whitespace,
            _ => false,
        };
        action
    }

    /// Escapes `s`, borrowing it when nothing needs to change.
    ///
    /// Runs the state machine twice over the same bytes: once to size the
    /// output, once to build it. Runs of pass-through bytes are copied as
    /// slices, so non-ASCII characters are never decoded or reinterpreted.
    fn escape(s: &str) -> Cow<'_, str> {
        let output_len: usize = {
            let mut escaper = Self::new();
            s.bytes().map(|byte| escaper.next(byte).output_len()).sum()
        };

        // Every rewriting action emits more bytes than it consumes, so an
        // unchanged length means every byte passed through.
        if output_len == s.len() {
            return Cow::Borrowed(s);
        }

        let mut escaper = Self::new();
        let mut output = String::with_capacity(output_len);
        let mut copied_up_to = 0;
        for (index, byte) in s.bytes().enumerate() {
            let action = escaper.next(byte);
            if matches!(action, EscapeAction::PassThrough) {
                continue;
            }
            // Rewritten bytes are always ASCII, so `index` and `index + 1`
            // are char boundaries and these slices cannot panic.
            output.push_str(&s[copied_up_to..index]);
            action.write_to(char::from(byte), &mut output);
            copied_up_to = index + 1;
        }
        output.push_str(&s[copied_up_to..]);
        debug_assert_eq!(output.len(), output_len);
        Cow::Owned(output)
    }
}

// Method of `impl Markdown`; the enclosing `impl` header is outside this hunk.
    /// Escapes `s` so Markdown renders it as literal text.
    ///
    /// ASCII punctuation is backslash-escaped, each newline is doubled, and
    /// leading spaces and tabs on every line become no-break spaces (four per
    /// tab). Returns the input borrowed when no byte needs rewriting.
    pub fn escape(s: &str) -> Cow<'_, str> {
        MarkdownEscaper::escape(s)
    }

// Test functions of `mod tests`; the enclosing module header is outside this hunk.
    fn nbsp(n: usize) -> String {
        "\u{00A0}".repeat(n)
    }

    /// Reports whether `markdown` begins with at least four columns of
    /// space/tab indentation, which is what opens a CommonMark indented code
    /// block. Checks the indentation rule directly rather than doing a full parse.
    fn opens_indented_code_block(markdown: &str) -> bool {
        let mut columns = 0;
        for byte in markdown.bytes() {
            match byte {
                b' ' => columns += 1,
                b'\t' => columns += 4 - columns % 4,
                _ => break,
            }
            if columns >= 4 {
                return true;
            }
        }
        false
    }

    #[test]
    fn test_escape_plain_text() {
        assert_eq!(MarkdownEscaper::escape("hello world"), "hello world");
        assert_eq!(MarkdownEscaper::escape(""), "");
        assert_eq!(MarkdownEscaper::escape("café ☕ naïve"), "café ☕ naïve");
    }

    #[test]
    fn test_escape_punctuation() {
        assert_eq!(
            MarkdownEscaper::escape("hello `world`"),
            r"hello \`world\`"
        );
        assert_eq!(MarkdownEscaper::escape("a|b"), r"a\|b");
    }

    #[test]
    fn test_escape_leading_spaces() {
        assert_eq!(
            MarkdownEscaper::escape("    hello"),
            [nbsp(4).as_str(), "hello"].concat()
        );
        assert_eq!(
            MarkdownEscaper::escape("    | { a: string }"),
            [nbsp(4).as_str(), r"\| \{ a\: string \}"].concat()
        );
        assert_eq!(
            MarkdownEscaper::escape("  first\n  second"),
            [nbsp(2).as_str(), "first\n\n", nbsp(2).as_str(), "second"].concat()
        );
        assert_eq!(MarkdownEscaper::escape("hello world"), "hello world");
    }

    #[test]
    fn test_escape_leading_tabs() {
        assert_eq!(
            MarkdownEscaper::escape("\thello"),
            [nbsp(4).as_str(), "hello"].concat()
        );
        assert_eq!(
            MarkdownEscaper::escape("hello\n\t\tindented"),
            ["hello\n\n", nbsp(8).as_str(), "indented"].concat()
        );
        assert_eq!(
            MarkdownEscaper::escape(" \t hello"),
            [nbsp(1 + 4 + 1).as_str(), "hello"].concat()
        );
        assert_eq!(MarkdownEscaper::escape("hello\tworld"), "hello\tworld");
    }

    #[test]
    fn test_escape_newlines() {
        assert_eq!(MarkdownEscaper::escape("a\nb"), "a\n\nb");
        assert_eq!(MarkdownEscaper::escape("a\n\nb"), "a\n\n\n\nb");
        assert_eq!(MarkdownEscaper::escape("\nhello"), "\n\nhello");
    }

    #[test]
    fn test_escape_multiline_diagnostic() {
        assert_eq!(
            MarkdownEscaper::escape("    | { a: string }\n    | { b: number }"),
            [
                nbsp(4).as_str(),
                r"\| \{ a\: string \}",
                "\n\n",
                nbsp(4).as_str(),
                r"\| \{ b\: number \}",
            ]
            .concat()
        );
    }

    #[test]
    fn test_escape_non_ascii_is_never_reinterpreted() {
        // Each of these characters has a code point whose low byte is an
        // ASCII byte the escaper acts on ('&', '"', ' ', '\n', '\t').
        assert_eq!(MarkdownEscaper::escape("wait… done|"), r"wait… done\|");
        assert_eq!(MarkdownEscaper::escape("• item\n"), "• item\n\n");
        assert_eq!(
            MarkdownEscaper::escape("\u{0120}a|b"),
            "\u{0120}a\\|b"
        );
        assert_eq!(
            MarkdownEscaper::escape("a\u{010A}b\n"),
            "a\u{010A}b\n\n"
        );
        assert_eq!(
            MarkdownEscaper::escape("\u{2009}x\n"),
            "\u{2009}x\n\n"
        );
    }

    #[test]
    fn test_escape_output_len_matches_precomputed() {
        let cases = [
            "",
            "hello world",
            "hello `world`",
            "    hello",
            "    | { a: string }",
            "\thello",
            "hello\n\t\tindented",
            " \t hello",
            "hello\tworld",
            "a\nb",
            "a\n\nb",
            "\nhello",
            "    | { a: string }\n    | { b: number }",
            "café ☕ naïve",
            "wait… done|",
            "• item\n",
            "\u{0120}a|b",
            "a\u{010A}b\n",
            "\u{2009}x\n",
        ];
        for input in cases {
            let mut escaper = MarkdownEscaper::new();
            let precomputed: usize = input.bytes().map(|b| escaper.next(b).output_len()).sum();

            assert_eq!(
                precomputed,
                MarkdownEscaper::escape(input).len(),
                "length mismatch for {:?}",
                input
            );
        }
    }

    #[test]
    fn test_escape_prevents_code_block() {
        let diagnostic = "    | { a: string }";
        assert!(opens_indented_code_block(diagnostic));
        assert!(!opens_indented_code_block(&MarkdownEscaper::escape(
            diagnostic
        )));
    }