parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
  8
  9use crate::path_range::PathWithRange;
 10
 11const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 12    .union(Options::ENABLE_FOOTNOTES)
 13    .union(Options::ENABLE_STRIKETHROUGH)
 14    .union(Options::ENABLE_TASKLISTS)
 15    .union(Options::ENABLE_SMART_PUNCTUATION)
 16    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 17    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 18    .union(Options::ENABLE_OLD_FOOTNOTES)
 19    .union(Options::ENABLE_GFM);
 20
 21pub fn parse_markdown(
 22    text: &str,
 23) -> (
 24    Vec<(Range<usize>, MarkdownEvent)>,
 25    HashSet<SharedString>,
 26    HashSet<Arc<Path>>,
 27) {
 28    let mut events = Vec::new();
 29    let mut language_names = HashSet::new();
 30    let mut language_paths = HashSet::new();
 31    let mut within_link = false;
 32    let mut within_metadata = false;
 33    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
 34        .into_offset_iter()
 35        .peekable();
 36    while let Some((pulldown_event, mut range)) = parser.next() {
 37        if within_metadata {
 38            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 39                pulldown_event
 40            {
 41                within_metadata = false;
 42            }
 43            continue;
 44        }
 45        match pulldown_event {
 46            pulldown_cmark::Event::Start(tag) => {
 47                let tag = match tag {
 48                    pulldown_cmark::Tag::Link {
 49                        link_type,
 50                        dest_url,
 51                        title,
 52                        id,
 53                    } => {
 54                        within_link = true;
 55                        MarkdownTag::Link {
 56                            link_type,
 57                            dest_url: SharedString::from(dest_url.into_string()),
 58                            title: SharedString::from(title.into_string()),
 59                            id: SharedString::from(id.into_string()),
 60                        }
 61                    }
 62                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 63                        within_metadata = true;
 64                        MarkdownTag::MetadataBlock(kind)
 65                    }
 66                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 67                        MarkdownTag::CodeBlock {
 68                            kind: CodeBlockKind::Indented,
 69                            metadata: CodeBlockMetadata {
 70                                content_range: range.start + 1..range.end + 1,
 71                                line_count: 1,
 72                            },
 73                        }
 74                    }
 75                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 76                        ref info,
 77                    )) => {
 78                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 79                        let content_range =
 80                            content_range.start + range.start..content_range.end + range.start;
 81
 82                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
 83                        let line_count = text[content_range.clone()]
 84                            .bytes()
 85                            .filter(|c| *c == b'\n')
 86                            .count();
 87                        let metadata = CodeBlockMetadata {
 88                            content_range,
 89                            line_count,
 90                        };
 91
 92                        let info = info.trim();
 93                        let kind = if info.is_empty() {
 94                            CodeBlockKind::Fenced
 95                            // Languages should never contain a slash, and PathRanges always should.
 96                            // (Models are told to specify them relative to a workspace root.)
 97                        } else if info.contains('/') {
 98                            let path_range = PathWithRange::new(info);
 99                            language_paths.insert(path_range.path.clone());
100                            CodeBlockKind::FencedSrc(path_range)
101                        } else {
102                            let language = SharedString::from(info.to_string());
103                            language_names.insert(language.clone());
104                            CodeBlockKind::FencedLang(language)
105                        };
106
107                        MarkdownTag::CodeBlock { kind, metadata }
108                    }
109                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
110                    pulldown_cmark::Tag::Heading {
111                        level,
112                        id,
113                        classes,
114                        attrs,
115                    } => {
116                        let id = id.map(|id| SharedString::from(id.into_string()));
117                        let classes = classes
118                            .into_iter()
119                            .map(|c| SharedString::from(c.into_string()))
120                            .collect();
121                        let attrs = attrs
122                            .into_iter()
123                            .map(|(key, value)| {
124                                (
125                                    SharedString::from(key.into_string()),
126                                    value.map(|v| SharedString::from(v.into_string())),
127                                )
128                            })
129                            .collect();
130                        MarkdownTag::Heading {
131                            level,
132                            id,
133                            classes,
134                            attrs,
135                        }
136                    }
137                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
138                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
139                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
140                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
141                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
142                    }
143                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
144                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
145                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
146                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
147                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
148                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
149                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
150                    pulldown_cmark::Tag::Image {
151                        link_type,
152                        dest_url,
153                        title,
154                        id,
155                    } => MarkdownTag::Image {
156                        link_type,
157                        dest_url: SharedString::from(dest_url.into_string()),
158                        title: SharedString::from(title.into_string()),
159                        id: SharedString::from(id.into_string()),
160                    },
161                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
162                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
163                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
164                    pulldown_cmark::Tag::DefinitionListDefinition => {
165                        MarkdownTag::DefinitionListDefinition
166                    }
167                };
168                events.push((range, MarkdownEvent::Start(tag)))
169            }
170            pulldown_cmark::Event::End(tag) => {
171                if let pulldown_cmark::TagEnd::Link = tag {
172                    within_link = false;
173                }
174                events.push((range, MarkdownEvent::End(tag)));
175            }
176            pulldown_cmark::Event::Text(parsed) => {
177                fn event_for(
178                    text: &str,
179                    range: Range<usize>,
180                    str: &str,
181                ) -> (Range<usize>, MarkdownEvent) {
182                    if str == &text[range.clone()] {
183                        (range, MarkdownEvent::Text)
184                    } else {
185                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
186                    }
187                }
188                #[derive(Debug)]
189                struct TextRange<'a> {
190                    source_range: Range<usize>,
191                    merged_range: Range<usize>,
192                    parsed: CowStr<'a>,
193                }
194
195                let mut last_len = parsed.len();
196                let mut ranges = vec![TextRange {
197                    source_range: range.clone(),
198                    merged_range: 0..last_len,
199                    parsed,
200                }];
201
202                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
203                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
204                    else {
205                        unreachable!()
206                    };
207                    let next_len = last_len + next_event.len();
208                    ranges.push(TextRange {
209                        source_range: next_range.clone(),
210                        merged_range: last_len..next_len,
211                        parsed: next_event,
212                    });
213                    last_len = next_len;
214                }
215
216                let mut merged_text =
217                    String::with_capacity(ranges.last().unwrap().merged_range.end);
218                for range in &ranges {
219                    merged_text.push_str(&range.parsed);
220                }
221
222                let mut ranges = ranges.into_iter().peekable();
223
224                if !within_link {
225                    let mut finder = LinkFinder::new();
226                    finder.kinds(&[linkify::LinkKind::Url]);
227
228                    // Find links in the merged text
229                    for link in finder.links(&merged_text) {
230                        let link_start_in_merged = link.start();
231                        let link_end_in_merged = link.end();
232
233                        while ranges
234                            .peek()
235                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
236                        {
237                            let range = ranges.next().unwrap();
238                            events.push(event_for(text, range.source_range, &range.parsed));
239                        }
240
241                        let Some(range) = ranges.peek_mut() else {
242                            continue;
243                        };
244                        let prefix_len = link_start_in_merged - range.merged_range.start;
245                        if prefix_len > 0 {
246                            let (head, tail) = range.parsed.split_at(prefix_len);
247                            events.push(event_for(
248                                text,
249                                range.source_range.start..range.source_range.start + prefix_len,
250                                &head,
251                            ));
252                            range.parsed = CowStr::Boxed(tail.into());
253                            range.merged_range.start += prefix_len;
254                            range.source_range.start += prefix_len;
255                        }
256
257                        let link_start_in_source = range.source_range.start;
258                        let mut link_end_in_source = range.source_range.end;
259                        let mut link_events = Vec::new();
260
261                        while ranges
262                            .peek()
263                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
264                        {
265                            let range = ranges.next().unwrap();
266                            link_end_in_source = range.source_range.end;
267                            link_events.push(event_for(text, range.source_range, &range.parsed));
268                        }
269
270                        if let Some(range) = ranges.peek_mut() {
271                            let prefix_len = link_end_in_merged - range.merged_range.start;
272                            if prefix_len > 0 {
273                                let (head, tail) = range.parsed.split_at(prefix_len);
274                                link_events.push(event_for(
275                                    text,
276                                    range.source_range.start..range.source_range.start + prefix_len,
277                                    head,
278                                ));
279                                range.parsed = CowStr::Boxed(tail.into());
280                                range.merged_range.start += prefix_len;
281                                range.source_range.start += prefix_len;
282                                link_end_in_source = range.source_range.start;
283                            }
284                        }
285                        let link_range = link_start_in_source..link_end_in_source;
286
287                        events.push((
288                            link_range.clone(),
289                            MarkdownEvent::Start(MarkdownTag::Link {
290                                link_type: LinkType::Autolink,
291                                dest_url: SharedString::from(link.as_str().to_string()),
292                                title: SharedString::default(),
293                                id: SharedString::default(),
294                            }),
295                        ));
296                        events.extend(link_events);
297                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
298                    }
299                }
300
301                for range in ranges {
302                    events.push(event_for(text, range.source_range, &range.parsed));
303                }
304            }
305            pulldown_cmark::Event::Code(_) => {
306                range.start += 1;
307                range.end -= 1;
308                events.push((range, MarkdownEvent::Code))
309            }
310            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
311            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
312            pulldown_cmark::Event::FootnoteReference(_) => {
313                events.push((range, MarkdownEvent::FootnoteReference))
314            }
315            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
316            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
317            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
318            pulldown_cmark::Event::TaskListMarker(checked) => {
319                events.push((range, MarkdownEvent::TaskListMarker(checked)))
320            }
321            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
322        }
323    }
324    (events, language_names, language_paths)
325}
326
327pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
328    let mut events = Vec::new();
329    let mut finder = LinkFinder::new();
330    finder.kinds(&[linkify::LinkKind::Url]);
331    let mut text_range = Range {
332        start: 0,
333        end: text.len(),
334    };
335    for link in finder.links(text) {
336        let link_range = link.start()..link.end();
337
338        if link_range.start > text_range.start {
339            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
340        }
341
342        events.push((
343            link_range.clone(),
344            MarkdownEvent::Start(MarkdownTag::Link {
345                link_type: LinkType::Autolink,
346                dest_url: SharedString::from(link.as_str().to_string()),
347                title: SharedString::default(),
348                id: SharedString::default(),
349            }),
350        ));
351        events.push((link_range.clone(), MarkdownEvent::Text));
352        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
353
354        text_range.start = link_range.end;
355    }
356
357    if text_range.end > text_range.start {
358        events.push((text_range, MarkdownEvent::Text));
359    }
360
361    events
362}
363
364/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
365/// parse result for rendering without resorting to unsafe lifetime coercion.
366#[derive(Clone, Debug, PartialEq)]
367pub enum MarkdownEvent {
368    /// Start of a tagged element. Events that are yielded after this event
369    /// and before its corresponding `End` event are inside this element.
370    /// Start and end events are guaranteed to be balanced.
371    Start(MarkdownTag),
372    /// End of a tagged element.
373    End(MarkdownTagEnd),
374    /// Text that uses the associated range from the markdown source.
375    Text,
376    /// Text that differs from the markdown source - typically due to substitution of HTML entities
377    /// and smart punctuation.
378    SubstitutedText(String),
379    /// An inline code node.
380    Code,
381    /// An HTML node.
382    Html,
383    /// An inline HTML node.
384    InlineHtml,
385    /// A reference to a footnote with given label, which may or may not be defined
386    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
387    /// occur in any order.
388    FootnoteReference,
389    /// A soft line break.
390    SoftBreak,
391    /// A hard line break.
392    HardBreak,
393    /// A horizontal ruler.
394    Rule,
395    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
396    TaskListMarker(bool),
397}
398
399/// Tags for elements that can contain other elements.
400#[derive(Clone, Debug, PartialEq)]
401pub enum MarkdownTag {
402    /// A paragraph of text and other inline elements.
403    Paragraph,
404
405    /// A heading, with optional identifier, classes and custom attributes.
406    /// The identifier is prefixed with `#` and the last one in the attributes
407    /// list is chosen, classes are prefixed with `.` and custom attributes
408    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
409    Heading {
410        level: HeadingLevel,
411        id: Option<SharedString>,
412        classes: Vec<SharedString>,
413        /// The first item of the tuple is the attr and second one the value.
414        attrs: Vec<(SharedString, Option<SharedString>)>,
415    },
416
417    BlockQuote,
418
419    /// A code block.
420    CodeBlock {
421        kind: CodeBlockKind,
422        metadata: CodeBlockMetadata,
423    },
424
425    /// A HTML block.
426    HtmlBlock,
427
428    /// A list. If the list is ordered the field indicates the number of the first item.
429    /// Contains only list items.
430    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
431
432    /// A list item.
433    Item,
434
435    /// A footnote definition. The value contained is the footnote's label by which it can
436    /// be referred to.
437    FootnoteDefinition(SharedString),
438
439    /// A table. Contains a vector describing the text-alignment for each of its columns.
440    Table(Vec<Alignment>),
441
442    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
443    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
444    TableHead,
445
446    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
447    TableRow,
448    TableCell,
449
450    // span-level tags
451    Emphasis,
452    Strong,
453    Strikethrough,
454
455    /// A link.
456    Link {
457        link_type: LinkType,
458        dest_url: SharedString,
459        title: SharedString,
460        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
461        id: SharedString,
462    },
463
464    /// An image. The first field is the link type, the second the destination URL and the third is a title,
465    /// the fourth is the link identifier.
466    Image {
467        link_type: LinkType,
468        dest_url: SharedString,
469        title: SharedString,
470        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
471        id: SharedString,
472    },
473
474    /// A metadata block.
475    MetadataBlock(MetadataBlockKind),
476
477    DefinitionList,
478    DefinitionListTitle,
479    DefinitionListDefinition,
480}
481
482#[derive(Clone, Debug, PartialEq)]
483pub enum CodeBlockKind {
484    Indented,
485    /// "Fenced" means "surrounded by triple backticks."
486    /// There can optionally be either a language after the backticks (like in traditional Markdown)
487    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
488    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
489    Fenced,
490    FencedLang(SharedString),
491    FencedSrc(PathWithRange),
492}
493
/// Extra information recorded for a code block at parse time.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range into the markdown source that is reported as the block's content.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`; indented blocks are
    /// always recorded with a count of 1.
    pub line_count: usize,
}
499
/// Returns the byte range, relative to `text`, of the content of a fenced code block.
///
/// `text` is expected to be the source of the whole block, starting at its opening fence. The
/// returned range excludes:
/// - the opening fence line (fence characters plus info string, up to and including the first
///   newline), when `text` starts with at least three backticks or three tildes;
/// - the closing fence, i.e. a trailing run of the fence character that is at least as long as
///   the opening fence, together with any spaces or tabs after it.
///
/// When `text` does not start with a fence, only a trailing run of three or more backticks is
/// removed. The returned range is never inverted: if stripping would move the end before the
/// start, an empty range at the start is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    // Shortest run of fence characters that forms a code fence.
    const MIN_FENCE_LEN: usize = 3;

    let bytes = text.as_bytes();
    let mut range = 0..text.len();

    // Identify the fence character and the length of the opening fence, if there is one.
    let opening_fence = [b'`', b'~'].iter().find_map(|&fence_byte| {
        let run_len = bytes.iter().take_while(|&&byte| byte == fence_byte).count();
        (run_len >= MIN_FENCE_LEN).then_some((fence_byte, run_len))
    });

    if opening_fence.is_some() {
        // Skipping to the end of the first line also skips any fence characters beyond the
        // third, as well as the info string.
        range.start += MIN_FENCE_LEN;
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    let (fence_byte, opening_len) = opening_fence.unwrap_or((b'`', MIN_FENCE_LEN));
    if !range.is_empty() {
        // A closing fence may be followed by spaces or tabs.
        let trimmed_len = text
            .trim_end_matches(|c: char| c == ' ' || c == '\t')
            .len();
        let closing_len = bytes[..trimmed_len]
            .iter()
            .rev()
            .take_while(|&&byte| byte == fence_byte)
            .count();
        // A closing fence must be at least as long as the opening one; a shorter run is
        // ordinary content of a block that has not been closed.
        if closing_len >= opening_len {
            range.end = trimmed_len - closing_len;
        }
    }

    // The trailing run can overlap the opening fence (e.g. "`````"); never return an
    // inverted range.
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
518
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is the numeric entity for U+25BA.
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // The single-character substitutions are spelled as escapes so that they cannot be
        // lost or confused with ASCII look-alikes.
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // right single quote
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}