parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
  8
  9use crate::path_range::PathWithRange;
 10
 11const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 12    .union(Options::ENABLE_FOOTNOTES)
 13    .union(Options::ENABLE_STRIKETHROUGH)
 14    .union(Options::ENABLE_TASKLISTS)
 15    .union(Options::ENABLE_SMART_PUNCTUATION)
 16    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 17    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 18    .union(Options::ENABLE_OLD_FOOTNOTES)
 19    .union(Options::ENABLE_GFM);
 20
 21pub fn parse_markdown(
 22    text: &str,
 23) -> (
 24    Vec<(Range<usize>, MarkdownEvent)>,
 25    HashSet<SharedString>,
 26    HashSet<Arc<Path>>,
 27) {
 28    let mut events = Vec::new();
 29    let mut language_names = HashSet::new();
 30    let mut language_paths = HashSet::new();
 31    let mut within_link = false;
 32    let mut within_metadata = false;
 33    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
 34        .into_offset_iter()
 35        .peekable();
 36    while let Some((pulldown_event, range)) = parser.next() {
 37        if within_metadata {
 38            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 39                pulldown_event
 40            {
 41                within_metadata = false;
 42            }
 43            continue;
 44        }
 45        match pulldown_event {
 46            pulldown_cmark::Event::Start(tag) => {
 47                let tag = match tag {
 48                    pulldown_cmark::Tag::Link {
 49                        link_type,
 50                        dest_url,
 51                        title,
 52                        id,
 53                    } => {
 54                        within_link = true;
 55                        MarkdownTag::Link {
 56                            link_type,
 57                            dest_url: SharedString::from(dest_url.into_string()),
 58                            title: SharedString::from(title.into_string()),
 59                            id: SharedString::from(id.into_string()),
 60                        }
 61                    }
 62                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 63                        within_metadata = true;
 64                        MarkdownTag::MetadataBlock(kind)
 65                    }
 66                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 67                        MarkdownTag::CodeBlock {
 68                            kind: CodeBlockKind::Indented,
 69                            metadata: CodeBlockMetadata {
 70                                content_range: range.start + 1..range.end + 1,
 71                                line_count: 1,
 72                            },
 73                        }
 74                    }
 75                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 76                        ref info,
 77                    )) => {
 78                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 79                        let content_range =
 80                            content_range.start + range.start..content_range.end + range.start;
 81
 82                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
 83                        let line_count = text[content_range.clone()]
 84                            .bytes()
 85                            .filter(|c| *c == b'\n')
 86                            .count();
 87                        let metadata = CodeBlockMetadata {
 88                            content_range,
 89                            line_count,
 90                        };
 91
 92                        let info = info.trim();
 93                        let kind = if info.is_empty() {
 94                            CodeBlockKind::Fenced
 95                            // Languages should never contain a slash, and PathRanges always should.
 96                            // (Models are told to specify them relative to a workspace root.)
 97                        } else if info.contains('/') {
 98                            let path_range = PathWithRange::new(info);
 99                            language_paths.insert(path_range.path.clone());
100                            CodeBlockKind::FencedSrc(path_range)
101                        } else {
102                            let language = SharedString::from(info.to_string());
103                            language_names.insert(language.clone());
104                            CodeBlockKind::FencedLang(language)
105                        };
106
107                        MarkdownTag::CodeBlock { kind, metadata }
108                    }
109                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
110                    pulldown_cmark::Tag::Heading {
111                        level,
112                        id,
113                        classes,
114                        attrs,
115                    } => {
116                        let id = id.map(|id| SharedString::from(id.into_string()));
117                        let classes = classes
118                            .into_iter()
119                            .map(|c| SharedString::from(c.into_string()))
120                            .collect();
121                        let attrs = attrs
122                            .into_iter()
123                            .map(|(key, value)| {
124                                (
125                                    SharedString::from(key.into_string()),
126                                    value.map(|v| SharedString::from(v.into_string())),
127                                )
128                            })
129                            .collect();
130                        MarkdownTag::Heading {
131                            level,
132                            id,
133                            classes,
134                            attrs,
135                        }
136                    }
137                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
138                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
139                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
140                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
141                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
142                    }
143                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
144                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
145                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
146                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
147                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
148                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
149                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
150                    pulldown_cmark::Tag::Image {
151                        link_type,
152                        dest_url,
153                        title,
154                        id,
155                    } => MarkdownTag::Image {
156                        link_type,
157                        dest_url: SharedString::from(dest_url.into_string()),
158                        title: SharedString::from(title.into_string()),
159                        id: SharedString::from(id.into_string()),
160                    },
161                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
162                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
163                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
164                    pulldown_cmark::Tag::DefinitionListDefinition => {
165                        MarkdownTag::DefinitionListDefinition
166                    }
167                };
168                events.push((range, MarkdownEvent::Start(tag)))
169            }
170            pulldown_cmark::Event::End(tag) => {
171                if let pulldown_cmark::TagEnd::Link = tag {
172                    within_link = false;
173                }
174                events.push((range, MarkdownEvent::End(tag)));
175            }
176            pulldown_cmark::Event::Text(parsed) => {
177                fn event_for(
178                    text: &str,
179                    range: Range<usize>,
180                    str: &str,
181                ) -> (Range<usize>, MarkdownEvent) {
182                    if str == &text[range.clone()] {
183                        (range, MarkdownEvent::Text)
184                    } else {
185                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
186                    }
187                }
188                #[derive(Debug)]
189                struct TextRange<'a> {
190                    source_range: Range<usize>,
191                    merged_range: Range<usize>,
192                    parsed: CowStr<'a>,
193                }
194
195                let mut last_len = parsed.len();
196                let mut ranges = vec![TextRange {
197                    source_range: range.clone(),
198                    merged_range: 0..last_len,
199                    parsed,
200                }];
201
202                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
203                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
204                    else {
205                        unreachable!()
206                    };
207                    let next_len = last_len + next_event.len();
208                    ranges.push(TextRange {
209                        source_range: next_range.clone(),
210                        merged_range: last_len..next_len,
211                        parsed: next_event,
212                    });
213                    last_len = next_len;
214                }
215
216                let mut merged_text =
217                    String::with_capacity(ranges.last().unwrap().merged_range.end);
218                for range in &ranges {
219                    merged_text.push_str(&range.parsed);
220                }
221
222                let mut ranges = ranges.into_iter().peekable();
223
224                if !within_link {
225                    let mut finder = LinkFinder::new();
226                    finder.kinds(&[linkify::LinkKind::Url]);
227
228                    // Find links in the merged text
229                    for link in finder.links(&merged_text) {
230                        let link_start_in_merged = link.start();
231                        let link_end_in_merged = link.end();
232
233                        while ranges
234                            .peek()
235                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
236                        {
237                            let range = ranges.next().unwrap();
238                            events.push(event_for(text, range.source_range, &range.parsed));
239                        }
240
241                        let Some(range) = ranges.peek_mut() else {
242                            continue;
243                        };
244                        let prefix_len = link_start_in_merged - range.merged_range.start;
245                        if prefix_len > 0 {
246                            let (head, tail) = range.parsed.split_at(prefix_len);
247                            events.push(event_for(
248                                text,
249                                range.source_range.start..range.source_range.start + prefix_len,
250                                head,
251                            ));
252                            range.parsed = CowStr::Boxed(tail.into());
253                            range.merged_range.start += prefix_len;
254                            range.source_range.start += prefix_len;
255                        }
256
257                        let link_start_in_source = range.source_range.start;
258                        let mut link_end_in_source = range.source_range.end;
259                        let mut link_events = Vec::new();
260
261                        while ranges
262                            .peek()
263                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
264                        {
265                            let range = ranges.next().unwrap();
266                            link_end_in_source = range.source_range.end;
267                            link_events.push(event_for(text, range.source_range, &range.parsed));
268                        }
269
270                        if let Some(range) = ranges.peek_mut() {
271                            let prefix_len = link_end_in_merged - range.merged_range.start;
272                            if prefix_len > 0 {
273                                let (head, tail) = range.parsed.split_at(prefix_len);
274                                link_events.push(event_for(
275                                    text,
276                                    range.source_range.start..range.source_range.start + prefix_len,
277                                    head,
278                                ));
279                                range.parsed = CowStr::Boxed(tail.into());
280                                range.merged_range.start += prefix_len;
281                                range.source_range.start += prefix_len;
282                                link_end_in_source = range.source_range.start;
283                            }
284                        }
285                        let link_range = link_start_in_source..link_end_in_source;
286
287                        events.push((
288                            link_range.clone(),
289                            MarkdownEvent::Start(MarkdownTag::Link {
290                                link_type: LinkType::Autolink,
291                                dest_url: SharedString::from(link.as_str().to_string()),
292                                title: SharedString::default(),
293                                id: SharedString::default(),
294                            }),
295                        ));
296                        events.extend(link_events);
297                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
298                    }
299                }
300
301                for range in ranges {
302                    events.push(event_for(text, range.source_range, &range.parsed));
303                }
304            }
305            pulldown_cmark::Event::Code(_) => {
306                let content_range = extract_code_content_range(&text[range.clone()]);
307                let content_range =
308                    content_range.start + range.start..content_range.end + range.start;
309                events.push((content_range, MarkdownEvent::Code))
310            }
311            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
312            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
313            pulldown_cmark::Event::FootnoteReference(_) => {
314                events.push((range, MarkdownEvent::FootnoteReference))
315            }
316            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
317            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
318            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
319            pulldown_cmark::Event::TaskListMarker(checked) => {
320                events.push((range, MarkdownEvent::TaskListMarker(checked)))
321            }
322            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
323        }
324    }
325    (events, language_names, language_paths)
326}
327
328pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
329    let mut events = Vec::new();
330    let mut finder = LinkFinder::new();
331    finder.kinds(&[linkify::LinkKind::Url]);
332    let mut text_range = Range {
333        start: 0,
334        end: text.len(),
335    };
336    for link in finder.links(text) {
337        let link_range = link.start()..link.end();
338
339        if link_range.start > text_range.start {
340            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
341        }
342
343        events.push((
344            link_range.clone(),
345            MarkdownEvent::Start(MarkdownTag::Link {
346                link_type: LinkType::Autolink,
347                dest_url: SharedString::from(link.as_str().to_string()),
348                title: SharedString::default(),
349                id: SharedString::default(),
350            }),
351        ));
352        events.push((link_range.clone(), MarkdownEvent::Text));
353        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
354
355        text_range.start = link_range.end;
356    }
357
358    if text_range.end > text_range.start {
359        events.push((text_range, MarkdownEvent::Text));
360    }
361
362    events
363}
364
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// The parse functions in this file return each event paired with a byte range into
/// the markdown source, so most variants carry no text of their own.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(String),
    /// An inline code node. The associated range covers the code without its surrounding
    /// backticks when the backtick runs on both sides match.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote, which may or may not be defined by an event with a
    /// `MarkdownTag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order. The label is not stored in the event.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
}
399
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: strings are stored as
/// `SharedString` so that the value does not borrow from the parsed text.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The block quote kind reported by the parser is not retained.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows and for body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row or table header.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list. Counterpart of `pulldown_cmark::Tag::DefinitionList`.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// A definition belonging to a definition list title.
    DefinitionListDefinition,
}
482
/// The flavor of a code block, and for fenced blocks what their info string named.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// An indented (non-fenced) code block.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is a fenced block whose info string is empty after trimming.
    Fenced,
    /// A fenced block whose trimmed info string contains no `/`; the string is taken
    /// to be a language name.
    FencedLang(SharedString),
    /// A fenced block whose trimmed info string contains a `/`; the string is parsed
    /// as a path with an optional range.
    FencedSrc(PathWithRange),
}
494
/// Extra information recorded for a code block when it is parsed.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range into the markdown source. For fenced blocks this is the block's
    /// content without the opening fence line and the closing fence; for indented
    /// blocks it is the block's own range shifted forward by one byte.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes within `content_range`; always 1
    /// for indented blocks.
    pub line_count: usize,
}
500
/// Returns the byte range, relative to `text`, of the content of an inline code span.
///
/// `text` is expected to be the whole span including its backtick delimiters, e.g.
/// `` `foo` ``. The delimiters are removed when the span starts and ends with backtick
/// runs of equal length. Following CommonMark, when the remaining content both starts
/// and ends with a space (or line ending) and does not consist solely of them, one
/// such character is removed from each side, so that ``` `` `a` `` ``` yields `` `a` ``.
///
/// When `text` has no leading backticks, the runs differ in length, or `text` consists
/// only of backticks, the full range `0..text.len()` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    let text_len = text.len();

    // Backticks are ASCII, so counting bytes yields valid byte offsets.
    let start_ticks = text.bytes().take_while(|&b| b == b'`').count();
    let end_ticks = text.bytes().rev().take_while(|&b| b == b'`').count();

    if start_ticks == 0 || end_ticks != start_ticks || text_len < start_ticks + end_ticks {
        return 0..text_len;
    }

    let mut range = start_ticks..text_len - end_ticks;

    // Line endings inside a code span are treated like spaces by CommonMark.
    let is_padding = |b: u8| b == b' ' || b == b'\n';
    let content = &text.as_bytes()[range.clone()];
    if let (Some(&first), Some(&last)) = (content.first(), content.last()) {
        let only_padding = content.iter().all(|&b| is_padding(b));
        if is_padding(first) && is_padding(last) && !only_padding {
            range.start += 1;
            range.end -= 1;
        }
    }

    range
}
521
/// Returns the byte range, relative to `text`, of the content of a fenced code block.
///
/// `text` is expected to be the whole block including its fences. When it starts with
/// a fence of at least three backticks or tildes, the content begins after the first
/// line (the fence plus the info string). When it ends with a run of at least three
/// fence characters, that whole run is excluded, so fences longer than three characters
/// (as used to nest code blocks) are removed completely. The newline preceding the
/// closing fence stays part of the content.
///
/// Text without an opening fence is returned unchanged apart from a trailing backtick
/// fence. The result is always a valid (possibly empty) range with `start <= end`.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let mut range = 0..text.len();

    let fence_char = if text.starts_with("```") {
        Some(b'`')
    } else if text.starts_with("~~~") {
        Some(b'~')
    } else {
        None
    };

    if fence_char.is_some() {
        range.start += 3;

        // Skip the rest of the opening line: extra fence characters and the info string.
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    // A closing fence consists of the same character as the opening one and may be
    // longer than three characters.
    let closing_char = fence_char.unwrap_or(b'`');
    let closing_len = text
        .bytes()
        .rev()
        .take_while(|&b| b == closing_char)
        .count();
    if !range.is_empty() && closing_len >= 3 {
        range.end -= closing_len;
    }

    // The opening and closing fence may overlap in malformed input such as "`````".
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
540
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())), // `&#9658;` is U+25BA
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())), // right single quote
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_content_range() {
        let input = "```let x = 5;```";
        assert_eq!(extract_code_content_range(input), 3..13);

        let input = "``let x = 5;``";
        assert_eq!(extract_code_content_range(input), 2..12);

        let input = "`let x = 5;`";
        assert_eq!(extract_code_content_range(input), 1..11);

        let input = "plain text";
        assert_eq!(extract_code_content_range(input), 0..10);

        // Mismatched backtick runs: the whole input is returned.
        let input = "``let x = 5;`";
        assert_eq!(extract_code_content_range(input), 0..13);
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        // The link spans 6..43, so the URL includes the trailing "☕" (40..43).
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}