parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{ops::Range, sync::Arc};
  8
  9use collections::HashSet;
 10
 11use crate::path_range::PathWithRange;
 12
/// The pulldown_cmark extensions enabled for every parse performed by
/// `parse_markdown`.
///
/// The options that are deliberately left out are listed in the test module's
/// `UNWANTED_OPTIONS`; a test there asserts that the two sets together cover
/// `Options::all()`.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
 24
/// Parses `text` as markdown (using `PARSE_OPTIONS`) and flattens the result
/// into a list of owned events.
///
/// Returns a tuple of:
/// - the events, each paired with the byte range of `text` it refers to;
/// - the info strings of fenced code blocks that do not contain a `/`
///   (treated as language names);
/// - the paths taken from fenced code block info strings that do contain a `/`.
///
/// In addition to what pulldown_cmark reports, plain URLs found in text that
/// is not already inside a link are wrapped in synthetic
/// `LinkType::Autolink` start/end events.
///
/// All events between the start and end of a metadata block are dropped.
/// NOTE(review): the `Start(MetadataBlock)` event itself is pushed, but the
/// matching `End` is consumed by the skip branch and never pushed — confirm
/// that consumers do not rely on a balanced pair here.
pub fn parse_markdown(
    text: &str,
) -> (
    Vec<(Range<usize>, MarkdownEvent)>,
    HashSet<SharedString>,
    HashSet<Arc<str>>,
) {
    let mut events = Vec::new();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    // True while between the start and end events of a link reported by
    // pulldown_cmark; URL detection is skipped for text in that state.
    let mut within_link = false;
    // True while inside a metadata block; every event is skipped in that state.
    let mut within_metadata = false;
    // Peekable so that runs of consecutive text events can be merged below.
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        if within_metadata {
            // Drop everything up to and including the end of the metadata block.
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                // Convert the borrowed pulldown_cmark tag into its owned equivalent.
                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(kind) => {
                        within_metadata = true;
                        MarkdownTag::MetadataBlock(kind)
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        // Indented blocks reuse the event range as the content range
                        // and always report a line count of 1.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        // The helper returns a range relative to the block's own text;
                        // shift it so that it indexes into `text`.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    // The block quote kind reported by pulldown_cmark is discarded.
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                events.push((range, MarkdownEvent::Start(tag)))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                }
                events.push((range, MarkdownEvent::End(tag)));
            }
            pulldown_cmark::Event::Text(parsed) => {
                /// Builds the event for one piece of text: `Text` when the parsed
                /// string equals the source slice at `range`, otherwise
                /// `SubstitutedText` carrying the parsed string.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }
                /// One text event from pulldown_cmark, tracked both by its range in
                /// the markdown source and by its range in the merged text built below.
                #[derive(Debug)]
                struct TextRange<'a> {
                    source_range: Range<usize>,
                    merged_range: Range<usize>,
                    parsed: CowStr<'a>,
                }

                // Collect this text event and all text events that directly follow
                // it, so that URL detection can run over their concatenation.
                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
                    else {
                        // The peek above already established that the next event is text.
                        unreachable!()
                    };
                    let next_len = last_len + next_event.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_event,
                    });
                    last_len = next_len;
                }

                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit the text ranges that end at or before the link start unchanged.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // If the link starts inside this range, emit the part before the
                        // link and keep only the remainder in the pending range.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            events.push(event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            ));
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        // Text events that belong inside the link; they are pushed
                        // after the link's start event further down.
                        let mut link_events = Vec::new();

                        // Take the ranges that end at or before the link end.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // If the link ends inside the next range, move the part that
                        // belongs to the link into `link_events` and keep the rest pending.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        events.push((
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        ));
                        events.extend(link_events);
                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
                    }
                }

                // Emit whatever text is left after the last link (or all of it when
                // no link detection took place).
                for range in ranges {
                    events.push(event_for(text, range.source_range, &range.parsed));
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Report the range of the code content rather than of the whole
                // span including its backtick delimiters.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                events.push((content_range, MarkdownEvent::Code))
            }
            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
            pulldown_cmark::Event::FootnoteReference(_) => {
                events.push((range, MarkdownEvent::FootnoteReference))
            }
            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                events.push((range, MarkdownEvent::TaskListMarker(checked)))
            }
            // Math events are ignored; `PARSE_OPTIONS` does not enable math.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }
    (events, language_names, language_paths)
}
333
334pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
335    let mut events = Vec::new();
336    let mut finder = LinkFinder::new();
337    finder.kinds(&[linkify::LinkKind::Url]);
338    let mut text_range = Range {
339        start: 0,
340        end: text.len(),
341    };
342    for link in finder.links(text) {
343        let link_range = link.start()..link.end();
344
345        if link_range.start > text_range.start {
346            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
347        }
348
349        events.push((
350            link_range.clone(),
351            MarkdownEvent::Start(MarkdownTag::Link {
352                link_type: LinkType::Autolink,
353                dest_url: SharedString::from(link.as_str().to_string()),
354                title: SharedString::default(),
355                id: SharedString::default(),
356            }),
357        ));
358        events.push((link_range.clone(), MarkdownEvent::Text));
359        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
360
361        text_range.start = link_range.end;
362    }
363
364    if text_range.end > text_range.start {
365        events.push((text_range, MarkdownEvent::Text));
366    }
367
368    events
369}
370
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events do not borrow from the markdown source. The parse functions in this
/// module return each event paired with a byte range into the source, and only
/// text that differs from the source is stored inline (`SubstitutedText`).
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    ///
    /// NOTE(review): `parse_markdown` pushes `Start(MetadataBlock)` but skips
    /// the matching end event, so the balance guarantee does not appear to
    /// hold for metadata blocks — confirm.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. The string is the text to use instead of the source range.
    SubstitutedText(String),
    /// An inline code node. `parse_markdown` pairs it with the range of the code
    /// content, excluding the surrounding backticks.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote, which may or may not be defined
    /// by an event with a `MarkdownTag::FootnoteDefinition` tag. Definitions and references to
    /// them may occur in any order. The label is not stored in the event.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
}
405
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings
/// are replaced by `SharedString`s.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The block quote kind reported by pulldown_cmark is not kept.
    BlockQuote,

    /// A code block.
    CodeBlock {
        /// How the block was written (indented or fenced) and what its info string held.
        kind: CodeBlockKind,
        /// Content range and line count of the block.
        metadata: CodeBlockMetadata,
    },

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A table cell.
    TableCell,

    // span-level tags
    /// Emphasized content.
    Emphasis,
    /// Strongly emphasized content.
    Strong,
    /// Struck-through content.
    Strikethrough,
    /// Superscript content.
    Superscript,
    /// Subscript content.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image, described by its link type, destination URL, title and
    /// link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list. Note that `PARSE_OPTIONS` does not enable definition lists.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// The definition of a definition list entry.
    DefinitionListDefinition,
}
490
/// How a code block was written in the markdown source, and what its info
/// string (the text after the opening fence) contained.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block created by indentation rather than by fences.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is used when the info string is empty after trimming.
    Fenced,
    /// A fenced block whose trimmed info string is non-empty and contains no `/`;
    /// the info string is stored as the language name.
    FencedLang(SharedString),
    /// A fenced block whose trimmed info string contains a `/`; the info string
    /// is parsed into a `PathWithRange`.
    FencedSrc(PathWithRange),
}
502
/// Extra information about a code block, computed by `parse_markdown`.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the markdown source. For fenced
    /// blocks this excludes the fence lines; for indented blocks it is the range
    /// of the block's start event.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`.
    /// For indented blocks this is always 1.
    pub line_count: usize,
}
508
/// Returns the byte range of the content of an inline code span, i.e. `text`
/// without its backtick delimiters.
///
/// The delimiters are only removed when `text` starts and ends with runs of
/// backticks of equal length that do not overlap. In every other case (no
/// leading backtick, mismatched runs, or a string consisting of backticks
/// only) the full range `0..text.len()` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    let whole = 0..text.len();
    // A backtick is a single ASCII byte and never occurs inside a multi-byte
    // UTF-8 sequence, so the runs can be measured on the raw bytes.
    let bytes = text.as_bytes();

    let leading = bytes.iter().take_while(|&&b| b == b'`').count();
    if leading == 0 {
        return whole;
    }

    let trailing = bytes.iter().rev().take_while(|&&b| b == b'`').count();
    let runs_overlap = text.len() < leading + trailing;
    if trailing != leading || runs_overlap {
        return whole;
    }

    leading..text.len() - trailing
}
529
/// Returns the byte range of the content of a fenced code block, relative to
/// `text`, where `text` is the source of the whole block including its fences.
///
/// - If `text` starts with a backtick fence (three or more backticks), the
///   range starts after the first newline, skipping the fence and its info
///   string. Without a newline it starts right after the first three backticks.
/// - If `text` ends with a run of at least three backticks that is at least as
///   long as the opening fence, optionally followed by spaces or tabs, that
///   run and the trailing whitespace are excluded. A shorter run is kept as
///   content: it cannot close the block, which happens for example while a
///   block that nests a shorter fence is still unterminated.
/// - Text without fences is returned as the full range.
///
/// The returned range is never inverted; when the fences overlap it is empty.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let mut range = 0..text.len();

    // Backticks are ASCII, so the fence runs can be measured on the raw bytes.
    let opening_len = text.bytes().take_while(|&b| b == b'`').count();
    if opening_len >= 3 {
        range.start += 3;

        // Any additional fence backticks and the info string sit before the
        // first newline, so skipping to it handles fences of every length.
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    // A closing fence may be followed by spaces or tabs, and it must be at
    // least as long as the opening fence.
    let trimmed = text.trim_end_matches(|c| c == ' ' || c == '\t');
    let closing_len = trimmed.bytes().rev().take_while(|&b| b == b'`').count();
    if closing_len >= opening_len.max(3) {
        range.end = trimmed.len() - closing_len;
    }

    // The opening and closing fence can be the same run (e.g. "`````").
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
548
/// Tests for the markdown parser: option coverage, event streams produced by
/// `parse_markdown`, and the content-range helpers.
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are intentionally not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST)
        .union(Options::ENABLE_WIKILINKS);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is the numeric character reference for U+25BA.
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())), // opening double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())), // closing double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())), // opening single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())), // closing single quote
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                {
                    let mut h = HashSet::default();
                    h.insert("rust".into());
                    h
                },
                HashSet::default()
            )
        );
        assert_eq!(
            parse_markdown("    fn main() {}"),
            (
                vec![
                    (
                        4..16,
                        Start(CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: 4..16,
                                line_count: 1
                            }
                        })
                    ),
                    (4..16, Text),
                    (4..16, End(MarkdownTagEnd::CodeBlock))
                ],
                HashSet::default(),
                HashSet::default()
            )
        );
    }

    #[test]
    fn test_extract_code_content_range() {
        let input = "```let x = 5;```";
        assert_eq!(extract_code_content_range(input), 3..13);

        let input = "``let x = 5;``";
        assert_eq!(extract_code_content_range(input), 2..12);

        let input = "`let x = 5;`";
        assert_eq!(extract_code_content_range(input), 1..11);

        let input = "plain text";
        assert_eq!(extract_code_content_range(input), 0..10);

        // Mismatched delimiter runs: the whole input is returned.
        let input = "``let x = 5;`";
        assert_eq!(extract_code_content_range(input), 0..13);
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        // The link range 6..43 includes the trailing `☕` (40..43),
                        // so the URL does as well.
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}