Fix markdown escaping

Conrad Irwin created

Closes #29255

Release Notes:

- Improved handling of markdown escape sequences

Change summary

crates/markdown/examples/markdown.rs |   1 
crates/markdown/src/markdown.rs      |  14 +
crates/markdown/src/parser.rs        | 297 ++++++++++++++++++-----------
3 files changed, 200 insertions(+), 112 deletions(-)

Detailed changes

crates/markdown/examples/markdown.rs 🔗

@@ -21,7 +21,6 @@ function a(b: T) {
 }
 ```
 
-
 Remember, markdown processors may have slight differences and extensions, so always refer to the specific documentation or guides relevant to your platform or editor for the best practices and additional features.
 "#;
 

crates/markdown/src/markdown.rs 🔗

@@ -221,7 +221,12 @@ impl Markdown {
             .count();
         if count > 0 {
             let mut output = String::with_capacity(s.len() + count);
+            let mut is_newline = false;
             for c in s.chars() {
+                if is_newline && c == ' ' {
+                    continue;
+                }
+                is_newline = c == '\n';
                 if c == '\n' {
                     output.push('\n')
                 } else if c.is_ascii_punctuation() {
@@ -1722,6 +1727,15 @@ mod tests {
         rendered.text
     }
 
+    #[test]
+    fn test_escape() {
+        assert_eq!(Markdown::escape("hello `world`"), "hello \\`world\\`");
+        assert_eq!(
+            Markdown::escape("hello\n    cool world"),
+            "hello\n\ncool world"
+        );
+    }
+
     #[track_caller]
     fn assert_mappings(rendered: &RenderedText, expected: Vec<Vec<(usize, usize)>>) {
         assert_eq!(rendered.lines.len(), expected.len(), "line count mismatch");

crates/markdown/src/parser.rs 🔗

@@ -2,14 +2,9 @@ use gpui::SharedString;
 use linkify::LinkFinder;
 pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
 use pulldown_cmark::{
-    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
-};
-use std::{
-    collections::HashSet,
-    ops::{Deref, Range},
-    path::Path,
-    sync::Arc,
+    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
 };
+use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
 
 use crate::path_range::PathWithRange;
 
@@ -35,7 +30,10 @@ pub fn parse_markdown(
     let mut language_paths = HashSet::new();
     let mut within_link = false;
     let mut within_metadata = false;
-    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
+    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
+        .into_offset_iter()
+        .peekable();
+    while let Some((pulldown_event, mut range)) = parser.next() {
         if within_metadata {
             if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                 pulldown_event
@@ -175,47 +173,125 @@ pub fn parse_markdown(
                 events.push((range, MarkdownEvent::End(tag)));
             }
             pulldown_cmark::Event::Text(parsed) => {
-                // `parsed` will share bytes with the input unless a substitution like handling of
-                // HTML entities or smart punctuation has occurred. When these substitutions occur,
-                // `parsed` only consists of the result of a single substitution.
-                if !cow_str_points_inside(&parsed, text) {
-                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
-                } else {
-                    // Automatically detect links in text if not already within a markdown link.
-                    if !within_link {
-                        let mut finder = LinkFinder::new();
-                        finder.kinds(&[linkify::LinkKind::Url]);
-                        let text_range = range.clone();
-                        for link in finder.links(&text[text_range.clone()]) {
-                            let link_range =
-                                text_range.start + link.start()..text_range.start + link.end();
-
-                            if link_range.start > range.start {
-                                events.push((range.start..link_range.start, MarkdownEvent::Text));
-                            }
+                fn event_for(
+                    text: &str,
+                    range: Range<usize>,
+                    str: &str,
+                ) -> (Range<usize>, MarkdownEvent) {
+                    if str == &text[range.clone()] {
+                        (range, MarkdownEvent::Text)
+                    } else {
+                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
+                    }
+                }
+                struct TextRange<'a> {
+                    source_range: Range<usize>,
+                    merged_range: Range<usize>,
+                    parsed: CowStr<'a>,
+                }
 
-                            events.push((
-                                link_range.clone(),
-                                MarkdownEvent::Start(MarkdownTag::Link {
-                                    link_type: LinkType::Autolink,
-                                    dest_url: SharedString::from(link.as_str().to_string()),
-                                    title: SharedString::default(),
-                                    id: SharedString::default(),
-                                }),
-                            ));
+                let mut last_len = parsed.len();
+                let mut ranges = vec![TextRange {
+                    source_range: range.clone(),
+                    merged_range: 0..last_len,
+                    parsed,
+                }];
+
+                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
+                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
+                    else {
+                        unreachable!()
+                    };
+                    let next_len = last_len + next_event.len();
+                    ranges.push(TextRange {
+                        source_range: next_range.clone(),
+                        merged_range: last_len..next_len,
+                        parsed: next_event,
+                    });
+                    last_len = next_len;
+                }
+
+                let mut merged_text =
+                    String::with_capacity(ranges.last().unwrap().merged_range.end);
+                for range in &ranges {
+                    merged_text.push_str(&range.parsed);
+                }
+
+                let mut ranges = ranges.into_iter().peekable();
+
+                if !within_link {
+                    let mut finder = LinkFinder::new();
+                    finder.kinds(&[linkify::LinkKind::Url]);
+
+                    // Find links in the merged text
+                    for link in finder.links(&merged_text) {
+                        let link_start_in_merged = link.start();
+                        let link_end_in_merged = link.end();
+
+                        while ranges
+                            .peek()
+                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
+                        {
+                            let range = ranges.next().unwrap();
+                            events.push(event_for(text, range.source_range, &range.parsed));
+                        }
 
-                            events.push((link_range.clone(), MarkdownEvent::Text));
-                            events.push((
-                                link_range.clone(),
-                                MarkdownEvent::End(MarkdownTagEnd::Link),
+                        let range = ranges.peek_mut().unwrap();
+                        let prefix_len = link_start_in_merged - range.merged_range.start;
+                        if prefix_len > 0 {
+                            let (head, tail) = range.parsed.split_at(prefix_len);
+                            events.push(event_for(
+                                text,
+                                range.source_range.start..range.source_range.start + prefix_len,
+                                &head,
                             ));
+                            range.parsed = CowStr::Boxed(tail.into());
+                            range.merged_range.start += prefix_len;
+                            range.source_range.start += prefix_len;
+                        }
+
+                        let link_start_in_source = range.source_range.start;
+                        let mut link_events = Vec::new();
 
-                            range.start = link_range.end;
+                        while ranges
+                            .peek()
+                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
+                        {
+                            let range = ranges.next().unwrap();
+                            link_events.push(event_for(text, range.source_range, &range.parsed));
                         }
+
+                        let range = ranges.peek_mut().unwrap();
+                        let prefix_len = link_end_in_merged - range.merged_range.start;
+                        if prefix_len > 0 {
+                            let (head, tail) = range.parsed.split_at(prefix_len);
+                            link_events.push(event_for(
+                                text,
+                                range.source_range.start..range.source_range.start + prefix_len,
+                                head,
+                            ));
+                            range.parsed = CowStr::Boxed(tail.into());
+                            range.merged_range.start += prefix_len;
+                            range.source_range.start += prefix_len;
+                        }
+                        let link_range = link_start_in_source..range.source_range.start;
+
+                        events.push((
+                            link_range.clone(),
+                            MarkdownEvent::Start(MarkdownTag::Link {
+                                link_type: LinkType::Autolink,
+                                dest_url: SharedString::from(link.as_str().to_string()),
+                                title: SharedString::default(),
+                                id: SharedString::default(),
+                            }),
+                        ));
+                        events.extend(link_events);
+                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
                     }
-                    if range.start < range.end {
-                        events.push((range, MarkdownEvent::Text));
-                    }
+                }
+
+                for range in ranges {
+                    events.push(event_for(text, range.source_range, &range.parsed));
                 }
             }
             pulldown_cmark::Event::Code(_) => {
@@ -291,7 +367,7 @@ pub enum MarkdownEvent {
     Text,
     /// Text that differs from the markdown source - typically due to substitution of HTML entities
     /// and smart punctuation.
-    SubstitutedText(CompactStr),
+    SubstitutedText(String),
     /// An inline code node.
     Code,
     /// An HTML node.
@@ -429,73 +505,6 @@ pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
     range
 }
 
-/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
-/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
-///
-/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
-#[derive(Clone)]
-pub enum CompactStr {
-    Boxed(Box<str>),
-    Inlined(InlineStr),
-}
-
-impl std::fmt::Debug for CompactStr {
-    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
-        self.deref().fmt(formatter)
-    }
-}
-
-impl Deref for CompactStr {
-    type Target = str;
-
-    fn deref(&self) -> &str {
-        match self {
-            CompactStr::Boxed(b) => b,
-            CompactStr::Inlined(i) => i,
-        }
-    }
-}
-
-impl From<&str> for CompactStr {
-    fn from(s: &str) -> Self {
-        if let Ok(inlined) = s.try_into() {
-            CompactStr::Inlined(inlined)
-        } else {
-            CompactStr::Boxed(s.into())
-        }
-    }
-}
-
-impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
-    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
-        match cow_str {
-            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
-            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
-            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
-        }
-    }
-}
-
-impl PartialEq for CompactStr {
-    fn eq(&self, other: &Self) -> bool {
-        self.deref() == other.deref()
-    }
-}
-
-fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
-    match substring {
-        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
-        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
-        pulldown_cmark::CowStr::Inlined(_) => false,
-    }
-}
-
-fn str_points_inside(substring: &str, container: &str) -> bool {
-    let substring_ptr = substring.as_ptr();
-    let container_ptr = container.as_ptr();
-    unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) }
-}
-
 #[cfg(test)]
 mod tests {
     use super::MarkdownEvent::*;
@@ -621,4 +630,70 @@ mod tests {
         let input = "```python\nprint('hello')\nprint('world')\n```";
         assert_eq!(extract_code_block_content_range(input), 10..40);
     }
+
+    #[test]
+    fn test_links_split_across_fragments() {
+        // This test verifies that links split across multiple text fragments due to escaping or other issues
+        // are correctly detected and processed
+        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
+        // We're verifying our parser can handle this correctly
+        assert_eq!(
+            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
+            vec![
+                (0..62, Start(Paragraph)),
+                (
+                    0..20,
+                    Start(Link {
+                        link_type: LinkType::Autolink,
+                        dest_url: "https://example.com".into(),
+                        title: "".into(),
+                        id: "".into()
+                    })
+                ),
+                (0..7, Text),
+                (8..20, Text),
+                (0..20, End(MarkdownTagEnd::Link)),
+                (20..38, Text),
+                (
+                    38..61,
+                    Start(Link {
+                        link_type: LinkType::Autolink,
+                        dest_url: "https://example.com".into(),
+                        title: "".into(),
+                        id: "".into()
+                    })
+                ),
+                (38..53, Text),
+                (53..58, SubstitutedText(".".into())),
+                (58..61, Text),
+                (38..61, End(MarkdownTagEnd::Link)),
+                (61..62, Text),
+                (0..62, End(MarkdownTagEnd::Paragraph))
+            ],
+        );
+
+        assert_eq!(
+            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
+            [
+                (0..55, Start(Paragraph)),
+                (0..6, Text),
+                (
+                    6..43,
+                    Start(Link {
+                        link_type: LinkType::Autolink,
+                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
+                        title: "".into(),
+                        id: "".into()
+                    })
+                ),
+                (6..29, Text),
+                (30..33, Text),
+                (33..40, SubstitutedText("\u{200d}".into())),
+                (40..43, Text),
+                (6..43, End(MarkdownTagEnd::Link)),
+                (43..55, Text),
+                (0..55, End(MarkdownTagEnd::Paragraph))
+            ]
+        );
+    }
 }