parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
  8
  9use crate::path_range::PathWithRange;
 10
 11const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 12    .union(Options::ENABLE_FOOTNOTES)
 13    .union(Options::ENABLE_STRIKETHROUGH)
 14    .union(Options::ENABLE_TASKLISTS)
 15    .union(Options::ENABLE_SMART_PUNCTUATION)
 16    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 17    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 18    .union(Options::ENABLE_OLD_FOOTNOTES)
 19    .union(Options::ENABLE_GFM);
 20
 21pub fn parse_markdown(
 22    text: &str,
 23) -> (
 24    Vec<(Range<usize>, MarkdownEvent)>,
 25    HashSet<SharedString>,
 26    HashSet<Arc<Path>>,
 27) {
 28    let mut events = Vec::new();
 29    let mut language_names = HashSet::new();
 30    let mut language_paths = HashSet::new();
 31    let mut within_link = false;
 32    let mut within_metadata = false;
 33    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
 34        .into_offset_iter()
 35        .peekable();
 36    while let Some((pulldown_event, mut range)) = parser.next() {
 37        if within_metadata {
 38            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 39                pulldown_event
 40            {
 41                within_metadata = false;
 42            }
 43            continue;
 44        }
 45        match pulldown_event {
 46            pulldown_cmark::Event::Start(tag) => {
 47                let tag = match tag {
 48                    pulldown_cmark::Tag::Link {
 49                        link_type,
 50                        dest_url,
 51                        title,
 52                        id,
 53                    } => {
 54                        within_link = true;
 55                        MarkdownTag::Link {
 56                            link_type,
 57                            dest_url: SharedString::from(dest_url.into_string()),
 58                            title: SharedString::from(title.into_string()),
 59                            id: SharedString::from(id.into_string()),
 60                        }
 61                    }
 62                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 63                        within_metadata = true;
 64                        MarkdownTag::MetadataBlock(kind)
 65                    }
 66                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 67                        MarkdownTag::CodeBlock {
 68                            kind: CodeBlockKind::Indented,
 69                            metadata: CodeBlockMetadata {
 70                                content_range: range.start + 1..range.end + 1,
 71                                line_count: 1,
 72                            },
 73                        }
 74                    }
 75                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 76                        ref info,
 77                    )) => {
 78                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 79                        let content_range =
 80                            content_range.start + range.start..content_range.end + range.start;
 81
 82                        let line_count = text[content_range.clone()]
 83                            .bytes()
 84                            .filter(|c| *c == b'\n')
 85                            .count();
 86                        let metadata = CodeBlockMetadata {
 87                            content_range,
 88                            line_count,
 89                        };
 90
 91                        let info = info.trim();
 92                        let kind = if info.is_empty() {
 93                            CodeBlockKind::Fenced
 94                            // Languages should never contain a slash, and PathRanges always should.
 95                            // (Models are told to specify them relative to a workspace root.)
 96                        } else if info.contains('/') {
 97                            let path_range = PathWithRange::new(info);
 98                            language_paths.insert(path_range.path.clone());
 99                            CodeBlockKind::FencedSrc(path_range)
100                        } else {
101                            let language = SharedString::from(info.to_string());
102                            language_names.insert(language.clone());
103                            CodeBlockKind::FencedLang(language)
104                        };
105
106                        MarkdownTag::CodeBlock { kind, metadata }
107                    }
108                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
109                    pulldown_cmark::Tag::Heading {
110                        level,
111                        id,
112                        classes,
113                        attrs,
114                    } => {
115                        let id = id.map(|id| SharedString::from(id.into_string()));
116                        let classes = classes
117                            .into_iter()
118                            .map(|c| SharedString::from(c.into_string()))
119                            .collect();
120                        let attrs = attrs
121                            .into_iter()
122                            .map(|(key, value)| {
123                                (
124                                    SharedString::from(key.into_string()),
125                                    value.map(|v| SharedString::from(v.into_string())),
126                                )
127                            })
128                            .collect();
129                        MarkdownTag::Heading {
130                            level,
131                            id,
132                            classes,
133                            attrs,
134                        }
135                    }
136                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
137                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
138                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
139                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
140                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
141                    }
142                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
143                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
144                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
145                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
146                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
147                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
148                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
149                    pulldown_cmark::Tag::Image {
150                        link_type,
151                        dest_url,
152                        title,
153                        id,
154                    } => MarkdownTag::Image {
155                        link_type,
156                        dest_url: SharedString::from(dest_url.into_string()),
157                        title: SharedString::from(title.into_string()),
158                        id: SharedString::from(id.into_string()),
159                    },
160                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
161                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
162                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
163                    pulldown_cmark::Tag::DefinitionListDefinition => {
164                        MarkdownTag::DefinitionListDefinition
165                    }
166                };
167                events.push((range, MarkdownEvent::Start(tag)))
168            }
169            pulldown_cmark::Event::End(tag) => {
170                if let pulldown_cmark::TagEnd::Link = tag {
171                    within_link = false;
172                }
173                events.push((range, MarkdownEvent::End(tag)));
174            }
175            pulldown_cmark::Event::Text(parsed) => {
176                fn event_for(
177                    text: &str,
178                    range: Range<usize>,
179                    str: &str,
180                ) -> (Range<usize>, MarkdownEvent) {
181                    if str == &text[range.clone()] {
182                        (range, MarkdownEvent::Text)
183                    } else {
184                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
185                    }
186                }
187                #[derive(Debug)]
188                struct TextRange<'a> {
189                    source_range: Range<usize>,
190                    merged_range: Range<usize>,
191                    parsed: CowStr<'a>,
192                }
193
194                let mut last_len = parsed.len();
195                let mut ranges = vec![TextRange {
196                    source_range: range.clone(),
197                    merged_range: 0..last_len,
198                    parsed,
199                }];
200
201                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
202                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
203                    else {
204                        unreachable!()
205                    };
206                    let next_len = last_len + next_event.len();
207                    ranges.push(TextRange {
208                        source_range: next_range.clone(),
209                        merged_range: last_len..next_len,
210                        parsed: next_event,
211                    });
212                    last_len = next_len;
213                }
214
215                let mut merged_text =
216                    String::with_capacity(ranges.last().unwrap().merged_range.end);
217                for range in &ranges {
218                    merged_text.push_str(&range.parsed);
219                }
220
221                let mut ranges = ranges.into_iter().peekable();
222
223                if !within_link {
224                    let mut finder = LinkFinder::new();
225                    finder.kinds(&[linkify::LinkKind::Url]);
226
227                    // Find links in the merged text
228                    for link in finder.links(&merged_text) {
229                        let link_start_in_merged = link.start();
230                        let link_end_in_merged = link.end();
231
232                        while ranges
233                            .peek()
234                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
235                        {
236                            let range = ranges.next().unwrap();
237                            events.push(event_for(text, range.source_range, &range.parsed));
238                        }
239
240                        let Some(range) = ranges.peek_mut() else {
241                            continue;
242                        };
243                        let prefix_len = link_start_in_merged - range.merged_range.start;
244                        if prefix_len > 0 {
245                            let (head, tail) = range.parsed.split_at(prefix_len);
246                            events.push(event_for(
247                                text,
248                                range.source_range.start..range.source_range.start + prefix_len,
249                                &head,
250                            ));
251                            range.parsed = CowStr::Boxed(tail.into());
252                            range.merged_range.start += prefix_len;
253                            range.source_range.start += prefix_len;
254                        }
255
256                        let link_start_in_source = range.source_range.start;
257                        let mut link_end_in_source = range.source_range.end;
258                        let mut link_events = Vec::new();
259
260                        while ranges
261                            .peek()
262                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
263                        {
264                            let range = ranges.next().unwrap();
265                            link_end_in_source = range.source_range.end;
266                            link_events.push(event_for(text, range.source_range, &range.parsed));
267                        }
268
269                        if let Some(range) = ranges.peek_mut() {
270                            let prefix_len = link_end_in_merged - range.merged_range.start;
271                            if prefix_len > 0 {
272                                let (head, tail) = range.parsed.split_at(prefix_len);
273                                link_events.push(event_for(
274                                    text,
275                                    range.source_range.start..range.source_range.start + prefix_len,
276                                    head,
277                                ));
278                                range.parsed = CowStr::Boxed(tail.into());
279                                range.merged_range.start += prefix_len;
280                                range.source_range.start += prefix_len;
281                                link_end_in_source = range.source_range.start;
282                            }
283                        }
284                        let link_range = link_start_in_source..link_end_in_source;
285
286                        events.push((
287                            link_range.clone(),
288                            MarkdownEvent::Start(MarkdownTag::Link {
289                                link_type: LinkType::Autolink,
290                                dest_url: SharedString::from(link.as_str().to_string()),
291                                title: SharedString::default(),
292                                id: SharedString::default(),
293                            }),
294                        ));
295                        events.extend(link_events);
296                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
297                    }
298                }
299
300                for range in ranges {
301                    events.push(event_for(text, range.source_range, &range.parsed));
302                }
303            }
304            pulldown_cmark::Event::Code(_) => {
305                range.start += 1;
306                range.end -= 1;
307                events.push((range, MarkdownEvent::Code))
308            }
309            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
310            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
311            pulldown_cmark::Event::FootnoteReference(_) => {
312                events.push((range, MarkdownEvent::FootnoteReference))
313            }
314            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
315            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
316            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
317            pulldown_cmark::Event::TaskListMarker(checked) => {
318                events.push((range, MarkdownEvent::TaskListMarker(checked)))
319            }
320            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
321        }
322    }
323    (events, language_names, language_paths)
324}
325
326pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
327    let mut events = Vec::new();
328    let mut finder = LinkFinder::new();
329    finder.kinds(&[linkify::LinkKind::Url]);
330    let mut text_range = Range {
331        start: 0,
332        end: text.len(),
333    };
334    for link in finder.links(text) {
335        let link_range = link.start()..link.end();
336
337        if link_range.start > text_range.start {
338            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
339        }
340
341        events.push((
342            link_range.clone(),
343            MarkdownEvent::Start(MarkdownTag::Link {
344                link_type: LinkType::Autolink,
345                dest_url: SharedString::from(link.as_str().to_string()),
346                title: SharedString::default(),
347                id: SharedString::default(),
348            }),
349        ));
350        events.push((link_range.clone(), MarkdownEvent::Text));
351        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
352
353        text_range.start = link_range.end;
354    }
355
356    if text_range.end > text_range.start {
357        events.push((text_range, MarkdownEvent::Text));
358    }
359
360    events
361}
362
363/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
364/// parse result for rendering without resorting to unsafe lifetime coercion.
365#[derive(Clone, Debug, PartialEq)]
366pub enum MarkdownEvent {
367    /// Start of a tagged element. Events that are yielded after this event
368    /// and before its corresponding `End` event are inside this element.
369    /// Start and end events are guaranteed to be balanced.
370    Start(MarkdownTag),
371    /// End of a tagged element.
372    End(MarkdownTagEnd),
373    /// Text that uses the associated range from the markdown source.
374    Text,
375    /// Text that differs from the markdown source - typically due to substitution of HTML entities
376    /// and smart punctuation.
377    SubstitutedText(String),
378    /// An inline code node.
379    Code,
380    /// An HTML node.
381    Html,
382    /// An inline HTML node.
383    InlineHtml,
384    /// A reference to a footnote with given label, which may or may not be defined
385    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
386    /// occur in any order.
387    FootnoteReference,
388    /// A soft line break.
389    SoftBreak,
390    /// A hard line break.
391    HardBreak,
392    /// A horizontal ruler.
393    Rule,
394    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
395    TaskListMarker(bool),
396}
397
398/// Tags for elements that can contain other elements.
399#[derive(Clone, Debug, PartialEq)]
400pub enum MarkdownTag {
401    /// A paragraph of text and other inline elements.
402    Paragraph,
403
404    /// A heading, with optional identifier, classes and custom attributes.
405    /// The identifier is prefixed with `#` and the last one in the attributes
406    /// list is chosen, classes are prefixed with `.` and custom attributes
407    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
408    Heading {
409        level: HeadingLevel,
410        id: Option<SharedString>,
411        classes: Vec<SharedString>,
412        /// The first item of the tuple is the attr and second one the value.
413        attrs: Vec<(SharedString, Option<SharedString>)>,
414    },
415
416    BlockQuote,
417
418    /// A code block.
419    CodeBlock {
420        kind: CodeBlockKind,
421        metadata: CodeBlockMetadata,
422    },
423
424    /// A HTML block.
425    HtmlBlock,
426
427    /// A list. If the list is ordered the field indicates the number of the first item.
428    /// Contains only list items.
429    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
430
431    /// A list item.
432    Item,
433
434    /// A footnote definition. The value contained is the footnote's label by which it can
435    /// be referred to.
436    FootnoteDefinition(SharedString),
437
438    /// A table. Contains a vector describing the text-alignment for each of its columns.
439    Table(Vec<Alignment>),
440
441    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
442    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
443    TableHead,
444
445    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
446    TableRow,
447    TableCell,
448
449    // span-level tags
450    Emphasis,
451    Strong,
452    Strikethrough,
453
454    /// A link.
455    Link {
456        link_type: LinkType,
457        dest_url: SharedString,
458        title: SharedString,
459        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
460        id: SharedString,
461    },
462
463    /// An image. The first field is the link type, the second the destination URL and the third is a title,
464    /// the fourth is the link identifier.
465    Image {
466        link_type: LinkType,
467        dest_url: SharedString,
468        title: SharedString,
469        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
470        id: SharedString,
471    },
472
473    /// A metadata block.
474    MetadataBlock(MetadataBlockKind),
475
476    DefinitionList,
477    DefinitionListTitle,
478    DefinitionListDefinition,
479}
480
481#[derive(Clone, Debug, PartialEq)]
482pub enum CodeBlockKind {
483    Indented,
484    /// "Fenced" means "surrounded by triple backticks."
485    /// There can optionally be either a language after the backticks (like in traditional Markdown)
486    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
487    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
488    Fenced,
489    FencedLang(SharedString),
490    FencedSrc(PathWithRange),
491}
492
/// Facts about a code block that `parse_markdown` attaches to its `Start` event.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CodeBlockMetadata {
    /// The byte range of the block's content within the parsed source.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of newline bytes inside `content_range`;
    /// `parse_markdown` sets it to 1 for indented blocks.
    pub line_count: usize,
}
498
/// Returns the byte range, relative to `text`, of the code inside a fenced
/// code block.
///
/// `text` is expected to be the source of a single block, starting at its
/// opening fence. Both fence styles are recognized: backticks (```` ``` ````)
/// and tildes (`~~~`). The returned range
/// - starts after the line holding the opening fence (or right after the
///   first three fence characters when that line is never terminated), and
/// - ends before the closing fence, which may be longer than three
///   characters, when `text` ends with one.
///
/// Text that does not start with a fence is returned whole, apart from a
/// trailing run of three or more backticks. The result is never inverted: if
/// stripping the closing fence would move the end before the start, an empty
/// range at the start is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let opening_fence = if text.starts_with("```") {
        Some('`')
    } else if text.starts_with("~~~") {
        Some('~')
    } else {
        None
    };

    let mut range = 0..text.len();
    if opening_fence.is_some() {
        range.start += 3;

        // Skip the rest of the opening line: surplus fence characters and the
        // info string.
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    if !range.is_empty() {
        // A closing fence consists of the same character as the opening one
        // and is at least three characters long; strip all of it.
        let fence = opening_fence.unwrap_or('`');
        let content_end = text.trim_end_matches(fence).len();
        if text.len() - content_end >= 3 {
            range.end = content_end;
        }
    }
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
517
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that were evaluated and deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA, a black right-pointing pointer.
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // The expected substitutions are spelled as escapes so that they
        // survive tools that mangle non-ASCII characters:
        // U+2013 en dash, U+2014 em dash, U+2026 ellipsis,
        // U+201C/U+201D double quotes, U+2018/U+2019 single quotes.
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())),
                    (42..43, Text),
                    (
                        43..53,
                        SubstitutedText("\u{2013}\u{2013}\u{2013}\u{2013}\u{2013}".into())
                    ),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        // The link runs up to byte 43, so it includes the
                        // trailing hot-beverage emoji (U+2615).
                        dest_url: "https://example.com/cat/é\u{200d}\u{2615}".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}