// parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{ops::Range, path::Path, sync::Arc};
  8
  9use collections::HashSet;
 10
 11use crate::path_range::PathWithRange;
 12
 13const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 14    .union(Options::ENABLE_FOOTNOTES)
 15    .union(Options::ENABLE_STRIKETHROUGH)
 16    .union(Options::ENABLE_TASKLISTS)
 17    .union(Options::ENABLE_SMART_PUNCTUATION)
 18    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 19    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 20    .union(Options::ENABLE_OLD_FOOTNOTES)
 21    .union(Options::ENABLE_GFM);
 22
 23pub fn parse_markdown(
 24    text: &str,
 25) -> (
 26    Vec<(Range<usize>, MarkdownEvent)>,
 27    HashSet<SharedString>,
 28    HashSet<Arc<Path>>,
 29) {
 30    let mut events = Vec::new();
 31    let mut language_names = HashSet::default();
 32    let mut language_paths = HashSet::default();
 33    let mut within_link = false;
 34    let mut within_metadata = false;
 35    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
 36        .into_offset_iter()
 37        .peekable();
 38    while let Some((pulldown_event, range)) = parser.next() {
 39        if within_metadata {
 40            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 41                pulldown_event
 42            {
 43                within_metadata = false;
 44            }
 45            continue;
 46        }
 47        match pulldown_event {
 48            pulldown_cmark::Event::Start(tag) => {
 49                let tag = match tag {
 50                    pulldown_cmark::Tag::Link {
 51                        link_type,
 52                        dest_url,
 53                        title,
 54                        id,
 55                    } => {
 56                        within_link = true;
 57                        MarkdownTag::Link {
 58                            link_type,
 59                            dest_url: SharedString::from(dest_url.into_string()),
 60                            title: SharedString::from(title.into_string()),
 61                            id: SharedString::from(id.into_string()),
 62                        }
 63                    }
 64                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 65                        within_metadata = true;
 66                        MarkdownTag::MetadataBlock(kind)
 67                    }
 68                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 69                        MarkdownTag::CodeBlock {
 70                            kind: CodeBlockKind::Indented,
 71                            metadata: CodeBlockMetadata {
 72                                content_range: range.clone(),
 73                                line_count: 1,
 74                            },
 75                        }
 76                    }
 77                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 78                        ref info,
 79                    )) => {
 80                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 81                        let content_range =
 82                            content_range.start + range.start..content_range.end + range.start;
 83
 84                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
 85                        let line_count = text[content_range.clone()]
 86                            .bytes()
 87                            .filter(|c| *c == b'\n')
 88                            .count();
 89                        let metadata = CodeBlockMetadata {
 90                            content_range,
 91                            line_count,
 92                        };
 93
 94                        let info = info.trim();
 95                        let kind = if info.is_empty() {
 96                            CodeBlockKind::Fenced
 97                            // Languages should never contain a slash, and PathRanges always should.
 98                            // (Models are told to specify them relative to a workspace root.)
 99                        } else if info.contains('/') {
100                            let path_range = PathWithRange::new(info);
101                            language_paths.insert(path_range.path.clone());
102                            CodeBlockKind::FencedSrc(path_range)
103                        } else {
104                            let language = SharedString::from(info.to_string());
105                            language_names.insert(language.clone());
106                            CodeBlockKind::FencedLang(language)
107                        };
108
109                        MarkdownTag::CodeBlock { kind, metadata }
110                    }
111                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
112                    pulldown_cmark::Tag::Heading {
113                        level,
114                        id,
115                        classes,
116                        attrs,
117                    } => {
118                        let id = id.map(|id| SharedString::from(id.into_string()));
119                        let classes = classes
120                            .into_iter()
121                            .map(|c| SharedString::from(c.into_string()))
122                            .collect();
123                        let attrs = attrs
124                            .into_iter()
125                            .map(|(key, value)| {
126                                (
127                                    SharedString::from(key.into_string()),
128                                    value.map(|v| SharedString::from(v.into_string())),
129                                )
130                            })
131                            .collect();
132                        MarkdownTag::Heading {
133                            level,
134                            id,
135                            classes,
136                            attrs,
137                        }
138                    }
139                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
140                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
141                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
142                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
143                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
144                    }
145                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
146                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
147                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
148                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
149                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
150                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
151                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
152                    pulldown_cmark::Tag::Image {
153                        link_type,
154                        dest_url,
155                        title,
156                        id,
157                    } => MarkdownTag::Image {
158                        link_type,
159                        dest_url: SharedString::from(dest_url.into_string()),
160                        title: SharedString::from(title.into_string()),
161                        id: SharedString::from(id.into_string()),
162                    },
163                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
164                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
165                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
166                    pulldown_cmark::Tag::DefinitionListDefinition => {
167                        MarkdownTag::DefinitionListDefinition
168                    }
169                };
170                events.push((range, MarkdownEvent::Start(tag)))
171            }
172            pulldown_cmark::Event::End(tag) => {
173                if let pulldown_cmark::TagEnd::Link = tag {
174                    within_link = false;
175                }
176                events.push((range, MarkdownEvent::End(tag)));
177            }
178            pulldown_cmark::Event::Text(parsed) => {
179                fn event_for(
180                    text: &str,
181                    range: Range<usize>,
182                    str: &str,
183                ) -> (Range<usize>, MarkdownEvent) {
184                    if str == &text[range.clone()] {
185                        (range, MarkdownEvent::Text)
186                    } else {
187                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
188                    }
189                }
190                #[derive(Debug)]
191                struct TextRange<'a> {
192                    source_range: Range<usize>,
193                    merged_range: Range<usize>,
194                    parsed: CowStr<'a>,
195                }
196
197                let mut last_len = parsed.len();
198                let mut ranges = vec![TextRange {
199                    source_range: range.clone(),
200                    merged_range: 0..last_len,
201                    parsed,
202                }];
203
204                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
205                    let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
206                    else {
207                        unreachable!()
208                    };
209                    let next_len = last_len + next_event.len();
210                    ranges.push(TextRange {
211                        source_range: next_range.clone(),
212                        merged_range: last_len..next_len,
213                        parsed: next_event,
214                    });
215                    last_len = next_len;
216                }
217
218                let mut merged_text =
219                    String::with_capacity(ranges.last().unwrap().merged_range.end);
220                for range in &ranges {
221                    merged_text.push_str(&range.parsed);
222                }
223
224                let mut ranges = ranges.into_iter().peekable();
225
226                if !within_link {
227                    let mut finder = LinkFinder::new();
228                    finder.kinds(&[linkify::LinkKind::Url]);
229
230                    // Find links in the merged text
231                    for link in finder.links(&merged_text) {
232                        let link_start_in_merged = link.start();
233                        let link_end_in_merged = link.end();
234
235                        while ranges
236                            .peek()
237                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
238                        {
239                            let range = ranges.next().unwrap();
240                            events.push(event_for(text, range.source_range, &range.parsed));
241                        }
242
243                        let Some(range) = ranges.peek_mut() else {
244                            continue;
245                        };
246                        let prefix_len = link_start_in_merged - range.merged_range.start;
247                        if prefix_len > 0 {
248                            let (head, tail) = range.parsed.split_at(prefix_len);
249                            events.push(event_for(
250                                text,
251                                range.source_range.start..range.source_range.start + prefix_len,
252                                head,
253                            ));
254                            range.parsed = CowStr::Boxed(tail.into());
255                            range.merged_range.start += prefix_len;
256                            range.source_range.start += prefix_len;
257                        }
258
259                        let link_start_in_source = range.source_range.start;
260                        let mut link_end_in_source = range.source_range.end;
261                        let mut link_events = Vec::new();
262
263                        while ranges
264                            .peek()
265                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
266                        {
267                            let range = ranges.next().unwrap();
268                            link_end_in_source = range.source_range.end;
269                            link_events.push(event_for(text, range.source_range, &range.parsed));
270                        }
271
272                        if let Some(range) = ranges.peek_mut() {
273                            let prefix_len = link_end_in_merged - range.merged_range.start;
274                            if prefix_len > 0 {
275                                let (head, tail) = range.parsed.split_at(prefix_len);
276                                link_events.push(event_for(
277                                    text,
278                                    range.source_range.start..range.source_range.start + prefix_len,
279                                    head,
280                                ));
281                                range.parsed = CowStr::Boxed(tail.into());
282                                range.merged_range.start += prefix_len;
283                                range.source_range.start += prefix_len;
284                                link_end_in_source = range.source_range.start;
285                            }
286                        }
287                        let link_range = link_start_in_source..link_end_in_source;
288
289                        events.push((
290                            link_range.clone(),
291                            MarkdownEvent::Start(MarkdownTag::Link {
292                                link_type: LinkType::Autolink,
293                                dest_url: SharedString::from(link.as_str().to_string()),
294                                title: SharedString::default(),
295                                id: SharedString::default(),
296                            }),
297                        ));
298                        events.extend(link_events);
299                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
300                    }
301                }
302
303                for range in ranges {
304                    events.push(event_for(text, range.source_range, &range.parsed));
305                }
306            }
307            pulldown_cmark::Event::Code(_) => {
308                let content_range = extract_code_content_range(&text[range.clone()]);
309                let content_range =
310                    content_range.start + range.start..content_range.end + range.start;
311                events.push((content_range, MarkdownEvent::Code))
312            }
313            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
314            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
315            pulldown_cmark::Event::FootnoteReference(_) => {
316                events.push((range, MarkdownEvent::FootnoteReference))
317            }
318            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
319            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
320            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
321            pulldown_cmark::Event::TaskListMarker(checked) => {
322                events.push((range, MarkdownEvent::TaskListMarker(checked)))
323            }
324            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
325        }
326    }
327    (events, language_names, language_paths)
328}
329
330pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
331    let mut events = Vec::new();
332    let mut finder = LinkFinder::new();
333    finder.kinds(&[linkify::LinkKind::Url]);
334    let mut text_range = Range {
335        start: 0,
336        end: text.len(),
337    };
338    for link in finder.links(text) {
339        let link_range = link.start()..link.end();
340
341        if link_range.start > text_range.start {
342            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
343        }
344
345        events.push((
346            link_range.clone(),
347            MarkdownEvent::Start(MarkdownTag::Link {
348                link_type: LinkType::Autolink,
349                dest_url: SharedString::from(link.as_str().to_string()),
350                title: SharedString::default(),
351                id: SharedString::default(),
352            }),
353        ));
354        events.push((link_range.clone(), MarkdownEvent::Text));
355        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
356
357        text_range.start = link_range.end;
358    }
359
360    if text_range.end > text_range.start {
361        events.push((text_range, MarkdownEvent::Text));
362    }
363
364    events
365}
366
367/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
368/// parse result for rendering without resorting to unsafe lifetime coercion.
369#[derive(Clone, Debug, PartialEq)]
370pub enum MarkdownEvent {
371    /// Start of a tagged element. Events that are yielded after this event
372    /// and before its corresponding `End` event are inside this element.
373    /// Start and end events are guaranteed to be balanced.
374    Start(MarkdownTag),
375    /// End of a tagged element.
376    End(MarkdownTagEnd),
377    /// Text that uses the associated range from the markdown source.
378    Text,
379    /// Text that differs from the markdown source - typically due to substitution of HTML entities
380    /// and smart punctuation.
381    SubstitutedText(String),
382    /// An inline code node.
383    Code,
384    /// An HTML node.
385    Html,
386    /// An inline HTML node.
387    InlineHtml,
388    /// A reference to a footnote with given label, which may or may not be defined
389    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
390    /// occur in any order.
391    FootnoteReference,
392    /// A soft line break.
393    SoftBreak,
394    /// A hard line break.
395    HardBreak,
396    /// A horizontal ruler.
397    Rule,
398    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
399    TaskListMarker(bool),
400}
401
402/// Tags for elements that can contain other elements.
403#[derive(Clone, Debug, PartialEq)]
404pub enum MarkdownTag {
405    /// A paragraph of text and other inline elements.
406    Paragraph,
407
408    /// A heading, with optional identifier, classes and custom attributes.
409    /// The identifier is prefixed with `#` and the last one in the attributes
410    /// list is chosen, classes are prefixed with `.` and custom attributes
411    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
412    Heading {
413        level: HeadingLevel,
414        id: Option<SharedString>,
415        classes: Vec<SharedString>,
416        /// The first item of the tuple is the attr and second one the value.
417        attrs: Vec<(SharedString, Option<SharedString>)>,
418    },
419
420    BlockQuote,
421
422    /// A code block.
423    CodeBlock {
424        kind: CodeBlockKind,
425        metadata: CodeBlockMetadata,
426    },
427
428    /// A HTML block.
429    HtmlBlock,
430
431    /// A list. If the list is ordered the field indicates the number of the first item.
432    /// Contains only list items.
433    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
434
435    /// A list item.
436    Item,
437
438    /// A footnote definition. The value contained is the footnote's label by which it can
439    /// be referred to.
440    FootnoteDefinition(SharedString),
441
442    /// A table. Contains a vector describing the text-alignment for each of its columns.
443    Table(Vec<Alignment>),
444
445    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
446    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
447    TableHead,
448
449    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
450    TableRow,
451    TableCell,
452
453    // span-level tags
454    Emphasis,
455    Strong,
456    Strikethrough,
457
458    /// A link.
459    Link {
460        link_type: LinkType,
461        dest_url: SharedString,
462        title: SharedString,
463        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
464        id: SharedString,
465    },
466
467    /// An image. The first field is the link type, the second the destination URL and the third is a title,
468    /// the fourth is the link identifier.
469    Image {
470        link_type: LinkType,
471        dest_url: SharedString,
472        title: SharedString,
473        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
474        id: SharedString,
475    },
476
477    /// A metadata block.
478    MetadataBlock(MetadataBlockKind),
479
480    DefinitionList,
481    DefinitionListTitle,
482    DefinitionListDefinition,
483}
484
485#[derive(Clone, Debug, PartialEq)]
486pub enum CodeBlockKind {
487    Indented,
488    /// "Fenced" means "surrounded by triple backticks."
489    /// There can optionally be either a language after the backticks (like in traditional Markdown)
490    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
491    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
492    Fenced,
493    FencedLang(SharedString),
494    FencedSrc(PathWithRange),
495}
496
/// Details about the content of a code block, computed while parsing.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the Markdown source. For fenced
    /// blocks this excludes the opening fence line and the closing fence; for
    /// indented blocks it is the range of the block's start event.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of newline bytes within `content_range`;
    /// for indented blocks, always 1.
    pub line_count: usize,
}
502
/// Returns the byte range of the content of an inline code span, relative to `text`.
///
/// `text` is expected to be the whole span including its delimiters. When it
/// starts and ends with backtick runs of the same non-zero length, and is long
/// enough to hold both runs, the range between the runs is returned. In every
/// other case (no leading backtick, mismatched runs, or a string made only of
/// backticks) the full range `0..text.len()` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    let text_len = text.len();

    // A backtick is a single ASCII byte and never appears inside a multi-byte
    // UTF-8 sequence, so byte counts are also valid byte offsets.
    let start_ticks = text.bytes().take_while(|&b| b == b'`').count();
    let end_ticks = text.bytes().rev().take_while(|&b| b == b'`').count();

    let has_matching_delimiters = start_ticks > 0
        && end_ticks == start_ticks
        // Guards against the two runs overlapping, e.g. in "``".
        && text_len >= start_ticks + end_ticks;
    if !has_matching_delimiters {
        return 0..text_len;
    }

    start_ticks..text_len - end_ticks
}
523
/// Returns the byte range of the content of a fenced code block, relative to `text`.
///
/// `text` is expected to be the whole block including its fences. If it starts
/// with three backticks, the content starts after the first newline that follows
/// them (or directly after the backticks when there is no newline). If the
/// remaining range is non-empty and `text` ends with three backticks, those are
/// excluded as well. The result is never inverted: when the two fences overlap,
/// an empty range at the content start is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const FENCE: &str = "```";

    // Skip the opening fence and the rest of its line (the info string).
    let start = match text.strip_prefix(FENCE) {
        Some(after_fence) => match after_fence.find('\n') {
            Some(newline_ix) => FENCE.len() + newline_ix + 1,
            None => FENCE.len(),
        },
        None => 0,
    };

    // Drop the closing fence, but only when there is content left to trim.
    let mut end = text.len();
    if start < end && text.ends_with(FENCE) {
        end -= FENCE.len();
    }

    // Opening and closing fence may overlap in malformed input such as "`````".
    start..end.max(start)
}
542
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown("  <!--\nrdoc-file=string.c\n-->\nReturns"),
            (
                vec![
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        )
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` decodes to U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        );
    }

    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown("You can use the [GitHub Search API](https://docs.github.com/en").0,
            vec![
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // Expected strings are written as escapes so the exact characters are unambiguous.
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // opening double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // closing double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // opening single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // closing single quote
                    (42..43, Text),
                    // Ten hyphens become five en dashes.
                    (
                        43..53,
                        SubstitutedText("\u{2013}\u{2013}\u{2013}\u{2013}\u{2013}".into())
                    ),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::default(),
                HashSet::default()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                {
                    let mut h = HashSet::default();
                    h.insert("rust".into());
                    h
                },
                HashSet::default()
            )
        );
        assert_eq!(
            parse_markdown("    fn main() {}"),
            (
                vec![
                    (
                        4..16,
                        Start(CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: 4..16,
                                line_count: 1
                            }
                        })
                    ),
                    (4..16, Text),
                    (4..16, End(MarkdownTagEnd::CodeBlock))
                ],
                HashSet::default(),
                HashSet::default()
            )
        );
    }

    #[test]
    fn test_extract_code_content_range() {
        let input = "```let x = 5;```";
        assert_eq!(extract_code_content_range(input), 3..13);

        let input = "``let x = 5;``";
        assert_eq!(extract_code_content_range(input), 2..12);

        let input = "`let x = 5;`";
        assert_eq!(extract_code_content_range(input), 1..11);

        let input = "plain text";
        assert_eq!(extract_code_content_range(input), 0..10);

        let input = "``let x = 5;`";
        assert_eq!(extract_code_content_range(input), 0..13);
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        // The link range 6..43 includes the trailing "☕" (bytes 40..43),
                        // so the destination must include it as well.
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}
829    }
830}