parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
  8
  9use crate::path_range::PathWithRange;
 10
 11const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 12    .union(Options::ENABLE_FOOTNOTES)
 13    .union(Options::ENABLE_STRIKETHROUGH)
 14    .union(Options::ENABLE_TASKLISTS)
 15    .union(Options::ENABLE_SMART_PUNCTUATION)
 16    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 17    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 18    .union(Options::ENABLE_OLD_FOOTNOTES)
 19    .union(Options::ENABLE_GFM);
 20
 21pub fn parse_markdown(
 22    text: &str,
 23) -> (
 24    Vec<(Range<usize>, MarkdownEvent)>,
 25    HashSet<SharedString>,
 26    HashSet<Arc<Path>>,
 27) {
 28    let mut events = Vec::new();
 29    let mut language_names = HashSet::new();
 30    let mut language_paths = HashSet::new();
 31    let mut within_link = false;
 32    let mut within_metadata = false;
 33    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
 34        .into_offset_iter()
 35        .peekable();
 36    while let Some((pulldown_event, mut range)) = parser.next() {
 37        if within_metadata {
 38            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 39                pulldown_event
 40            {
 41                within_metadata = false;
 42            }
 43            continue;
 44        }
 45        match pulldown_event {
 46            pulldown_cmark::Event::Start(tag) => {
 47                let tag = match tag {
 48                    pulldown_cmark::Tag::Link {
 49                        link_type,
 50                        dest_url,
 51                        title,
 52                        id,
 53                    } => {
 54                        within_link = true;
 55                        MarkdownTag::Link {
 56                            link_type,
 57                            dest_url: SharedString::from(dest_url.into_string()),
 58                            title: SharedString::from(title.into_string()),
 59                            id: SharedString::from(id.into_string()),
 60                        }
 61                    }
 62                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 63                        within_metadata = true;
 64                        MarkdownTag::MetadataBlock(kind)
 65                    }
 66                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 67                        MarkdownTag::CodeBlock {
 68                            kind: CodeBlockKind::Indented,
 69                            metadata: CodeBlockMetadata {
 70                                content_range: range.start + 1..range.end + 1,
 71                                line_count: 1,
 72                            },
 73                        }
 74                    }
 75                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 76                        ref info,
 77                    )) => {
 78                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 79                        let content_range =
 80                            content_range.start + range.start..content_range.end + range.start;
 81
 82                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
 83                        let line_count = text[content_range.clone()]
 84                            .bytes()
 85                            .filter(|c| *c == b'\n')
 86                            .count();
 87                        let metadata = CodeBlockMetadata {
 88                            content_range,
 89                            line_count,
 90                        };
 91
 92                        let info = info.trim();
 93                        let kind = if info.is_empty() {
 94                            CodeBlockKind::Fenced
 95                            // Languages should never contain a slash, and PathRanges always should.
 96                            // (Models are told to specify them relative to a workspace root.)
 97                        } else if info.contains('/') {
 98                            let path_range = PathWithRange::new(info);
 99                            language_paths.insert(path_range.path.clone());
100                            CodeBlockKind::FencedSrc(path_range)
101                        } else {
102                            let language = SharedString::from(info.to_string());
103                            language_names.insert(language.clone());
104                            CodeBlockKind::FencedLang(language)
105                        };
106
107                        MarkdownTag::CodeBlock { kind, metadata }
108                    }
109                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
110                    pulldown_cmark::Tag::Heading {
111                        level,
112                        id,
113                        classes,
114                        attrs,
115                    } => {
116                        let id = id.map(|id| SharedString::from(id.into_string()));
117                        let classes = classes
118                            .into_iter()
119                            .map(|c| SharedString::from(c.into_string()))
120                            .collect();
121                        let attrs = attrs
122                            .into_iter()
123                            .map(|(key, value)| {
124                                (
125                                    SharedString::from(key.into_string()),
126                                    value.map(|v| SharedString::from(v.into_string())),
127                                )
128                            })
129                            .collect();
130                        MarkdownTag::Heading {
131                            level,
132                            id,
133                            classes,
134                            attrs,
135                        }
136                    }
137                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
138                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
139                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
140                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
141                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
142                    }
143                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
144                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
145                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
146                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
147                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
148                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
149                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
150                    pulldown_cmark::Tag::Image {
151                        link_type,
152                        dest_url,
153                        title,
154                        id,
155                    } => MarkdownTag::Image {
156                        link_type,
157                        dest_url: SharedString::from(dest_url.into_string()),
158                        title: SharedString::from(title.into_string()),
159                        id: SharedString::from(id.into_string()),
160                    },
161                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
162                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
163                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
164                    pulldown_cmark::Tag::DefinitionListDefinition => {
165                        MarkdownTag::DefinitionListDefinition
166                    }
167                };
168                events.push((range, MarkdownEvent::Start(tag)))
169            }
170            pulldown_cmark::Event::End(tag) => {
171                if let pulldown_cmark::TagEnd::Link = tag {
172                    within_link = false;
173                }
174                events.push((range, MarkdownEvent::End(tag)));
175            }
176            pulldown_cmark::Event::Text(parsed) => {
177                fn event_for(
178                    text: &str,
179                    range: Range<usize>,
180                    str: &str,
181                ) -> (Range<usize>, MarkdownEvent) {
182                    if str == &text[range.clone()] {
183                        (range, MarkdownEvent::Text)
184                    } else {
185                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
186                    }
187                }
188                #[derive(Debug)]
189                struct TextRange<'a> {
190                    source_range: Range<usize>,
191                    merged_range: Range<usize>,
192                    parsed: CowStr<'a>,
193                }
194
195                let mut last_len = parsed.len();
196                let mut ranges = vec![TextRange {
197                    source_range: range.clone(),
198                    merged_range: 0..last_len,
199                    parsed,
200                }];
201
202                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
203                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
204                    else {
205                        unreachable!()
206                    };
207                    let next_len = last_len + next_event.len();
208                    ranges.push(TextRange {
209                        source_range: next_range.clone(),
210                        merged_range: last_len..next_len,
211                        parsed: next_event,
212                    });
213                    last_len = next_len;
214                }
215
216                let mut merged_text =
217                    String::with_capacity(ranges.last().unwrap().merged_range.end);
218                for range in &ranges {
219                    merged_text.push_str(&range.parsed);
220                }
221
222                let mut ranges = ranges.into_iter().peekable();
223
224                if !within_link {
225                    let mut finder = LinkFinder::new();
226                    finder.kinds(&[linkify::LinkKind::Url]);
227
228                    // Find links in the merged text
229                    for link in finder.links(&merged_text) {
230                        let link_start_in_merged = link.start();
231                        let link_end_in_merged = link.end();
232
233                        while ranges
234                            .peek()
235                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
236                        {
237                            let range = ranges.next().unwrap();
238                            events.push(event_for(text, range.source_range, &range.parsed));
239                        }
240
241                        let Some(range) = ranges.peek_mut() else {
242                            continue;
243                        };
244                        let prefix_len = link_start_in_merged - range.merged_range.start;
245                        if prefix_len > 0 {
246                            let (head, tail) = range.parsed.split_at(prefix_len);
247                            events.push(event_for(
248                                text,
249                                range.source_range.start..range.source_range.start + prefix_len,
250                                &head,
251                            ));
252                            range.parsed = CowStr::Boxed(tail.into());
253                            range.merged_range.start += prefix_len;
254                            range.source_range.start += prefix_len;
255                        }
256
257                        let link_start_in_source = range.source_range.start;
258                        let mut link_end_in_source = range.source_range.end;
259                        let mut link_events = Vec::new();
260
261                        while ranges
262                            .peek()
263                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
264                        {
265                            let range = ranges.next().unwrap();
266                            link_end_in_source = range.source_range.end;
267                            link_events.push(event_for(text, range.source_range, &range.parsed));
268                        }
269
270                        if let Some(range) = ranges.peek_mut() {
271                            let prefix_len = link_end_in_merged - range.merged_range.start;
272                            if prefix_len > 0 {
273                                let (head, tail) = range.parsed.split_at(prefix_len);
274                                link_events.push(event_for(
275                                    text,
276                                    range.source_range.start..range.source_range.start + prefix_len,
277                                    head,
278                                ));
279                                range.parsed = CowStr::Boxed(tail.into());
280                                range.merged_range.start += prefix_len;
281                                range.source_range.start += prefix_len;
282                                link_end_in_source = range.source_range.start;
283                            }
284                        }
285                        let link_range = link_start_in_source..link_end_in_source;
286
287                        events.push((
288                            link_range.clone(),
289                            MarkdownEvent::Start(MarkdownTag::Link {
290                                link_type: LinkType::Autolink,
291                                dest_url: SharedString::from(link.as_str().to_string()),
292                                title: SharedString::default(),
293                                id: SharedString::default(),
294                            }),
295                        ));
296                        events.extend(link_events);
297                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
298                    }
299                }
300
301                for range in ranges {
302                    events.push(event_for(text, range.source_range, &range.parsed));
303                }
304            }
305            pulldown_cmark::Event::Code(_) => {
306                range.start += 1;
307                range.end -= 1;
308                events.push((range, MarkdownEvent::Code))
309            }
310            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
311            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
312            pulldown_cmark::Event::FootnoteReference(_) => {
313                events.push((range, MarkdownEvent::FootnoteReference))
314            }
315            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
316            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
317            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
318            pulldown_cmark::Event::TaskListMarker(checked) => {
319                events.push((range, MarkdownEvent::TaskListMarker(checked)))
320            }
321            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
322        }
323    }
324    (events, language_names, language_paths)
325}
326
327pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
328    let mut events = Vec::new();
329    let mut finder = LinkFinder::new();
330    finder.kinds(&[linkify::LinkKind::Url]);
331    let mut text_range = Range {
332        start: 0,
333        end: text.len(),
334    };
335    for link in finder.links(text) {
336        let link_range = link.start()..link.end();
337
338        if link_range.start > text_range.start {
339            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
340        }
341
342        events.push((
343            link_range.clone(),
344            MarkdownEvent::Start(MarkdownTag::Link {
345                link_type: LinkType::Autolink,
346                dest_url: SharedString::from(link.as_str().to_string()),
347                title: SharedString::default(),
348                id: SharedString::default(),
349            }),
350        ));
351        events.push((link_range.clone(), MarkdownEvent::Text));
352        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
353
354        text_range.start = link_range.end;
355    }
356
357    if text_range.end > text_range.start {
358        events.push((text_range, MarkdownEvent::Text));
359    }
360
361    events
362}
363
364/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
365/// parse result for rendering without resorting to unsafe lifetime coercion.
366#[derive(Clone, Debug, PartialEq)]
367pub enum MarkdownEvent {
368    /// Start of a tagged element. Events that are yielded after this event
369    /// and before its corresponding `End` event are inside this element.
370    /// Start and end events are guaranteed to be balanced.
371    Start(MarkdownTag),
372    /// End of a tagged element.
373    End(MarkdownTagEnd),
374    /// Text that uses the associated range from the markdown source.
375    Text,
376    /// Text that differs from the markdown source - typically due to substitution of HTML entities
377    /// and smart punctuation.
378    SubstitutedText(String),
379    /// An inline code node.
380    Code,
381    /// An HTML node.
382    Html,
383    /// An inline HTML node.
384    InlineHtml,
385    /// A reference to a footnote with given label, which may or may not be defined
386    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
387    /// occur in any order.
388    FootnoteReference,
389    /// A soft line break.
390    SoftBreak,
391    /// A hard line break.
392    HardBreak,
393    /// A horizontal ruler.
394    Rule,
395    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
396    TaskListMarker(bool),
397}
398
399/// Tags for elements that can contain other elements.
400#[derive(Clone, Debug, PartialEq)]
401pub enum MarkdownTag {
402    /// A paragraph of text and other inline elements.
403    Paragraph,
404
405    /// A heading, with optional identifier, classes and custom attributes.
406    /// The identifier is prefixed with `#` and the last one in the attributes
407    /// list is chosen, classes are prefixed with `.` and custom attributes
408    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
409    Heading {
410        level: HeadingLevel,
411        id: Option<SharedString>,
412        classes: Vec<SharedString>,
413        /// The first item of the tuple is the attr and second one the value.
414        attrs: Vec<(SharedString, Option<SharedString>)>,
415    },
416
417    BlockQuote,
418
419    /// A code block.
420    CodeBlock {
421        kind: CodeBlockKind,
422        metadata: CodeBlockMetadata,
423    },
424
425    /// A HTML block.
426    HtmlBlock,
427
428    /// A list. If the list is ordered the field indicates the number of the first item.
429    /// Contains only list items.
430    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
431
432    /// A list item.
433    Item,
434
435    /// A footnote definition. The value contained is the footnote's label by which it can
436    /// be referred to.
437    FootnoteDefinition(SharedString),
438
439    /// A table. Contains a vector describing the text-alignment for each of its columns.
440    Table(Vec<Alignment>),
441
442    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
443    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
444    TableHead,
445
446    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
447    TableRow,
448    TableCell,
449
450    // span-level tags
451    Emphasis,
452    Strong,
453    Strikethrough,
454
455    /// A link.
456    Link {
457        link_type: LinkType,
458        dest_url: SharedString,
459        title: SharedString,
460        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
461        id: SharedString,
462    },
463
464    /// An image. The first field is the link type, the second the destination URL and the third is a title,
465    /// the fourth is the link identifier.
466    Image {
467        link_type: LinkType,
468        dest_url: SharedString,
469        title: SharedString,
470        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
471        id: SharedString,
472    },
473
474    /// A metadata block.
475    MetadataBlock(MetadataBlockKind),
476
477    DefinitionList,
478    DefinitionListTitle,
479    DefinitionListDefinition,
480}
481
482#[derive(Clone, Debug, PartialEq)]
483pub enum CodeBlockKind {
484    Indented,
485    /// "Fenced" means "surrounded by triple backticks."
486    /// There can optionally be either a language after the backticks (like in traditional Markdown)
487    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
488    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
489    Fenced,
490    FencedLang(SharedString),
491    FencedSrc(PathWithRange),
492}
493
/// Information about the content of a code block, computed while parsing.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the Markdown source. For fenced
    /// blocks this excludes the fence lines.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`; indented
    /// blocks are always recorded with a count of 1.
    pub line_count: usize,
}
499
/// Given the full source of a code block (fences included), returns the byte
/// range of its content, relative to the start of `text`.
///
/// - If `text` starts with "```", the opening fence and the remainder of its line
///   (the info string), up to and including the first newline, are skipped. When
///   there is no newline, only the three backticks are skipped.
/// - If what remains is not empty and `text` ends with "```", that closing fence
///   is excluded as well.
/// - Text that does not start with a fence is kept from its first byte.
///
/// The returned range always satisfies `start <= end`, so it can be used to slice
/// `text` without panicking.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const FENCE: &str = "```";

    let mut range = 0..text.len();
    if let Some(after_fence) = text.strip_prefix(FENCE) {
        range.start += FENCE.len();

        if let Some(newline_ix) = after_fence.find('\n') {
            range.start += newline_ix + 1;
        }
    }

    if !range.is_empty() && text.ends_with(FENCE) {
        range.end -= FENCE.len();
    }

    // With an opening fence longer than three backticks and nothing after it
    // (e.g. "````"), the trailing "```" overlaps the opening fence and the
    // subtraction above moves `end` in front of `start`. Report empty content
    // instead of an inverted range, which would panic when used for slicing.
    if range.end < range.start {
        range.end = range.start;
    }
    range
}
515
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that exist in `pulldown_cmark` but are deliberately not enabled.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // Expected substitutions are written as escapes so they survive any
        // re-encoding of this file:
        //   --  -> U+2013 (en dash)      ---   -> U+2014 (em dash)
        //   ... -> U+2026 (ellipsis)     "..." -> U+201C / U+201D
        //   '...' -> U+2018 / U+2019
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())),
                    (42..43, Text),
                    // Ten hyphens become five en dashes.
                    (43..53, SubstitutedText("\u{2013}".repeat(5))),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}