parser.rs

   1use gpui::SharedString;
   2use linkify::LinkFinder;
   3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
   4use pulldown_cmark::{
   5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
   6};
   7use std::{collections::BTreeMap, ops::Range, sync::Arc};
   8
   9use collections::HashSet;
  10
  11use crate::{html, path_range::PathWithRange};
  12
  13pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
  14    .union(Options::ENABLE_FOOTNOTES)
  15    .union(Options::ENABLE_STRIKETHROUGH)
  16    .union(Options::ENABLE_TASKLISTS)
  17    .union(Options::ENABLE_SMART_PUNCTUATION)
  18    .union(Options::ENABLE_HEADING_ATTRIBUTES)
  19    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
  20    .union(Options::ENABLE_OLD_FOOTNOTES)
  21    .union(Options::ENABLE_GFM)
  22    .union(Options::ENABLE_SUPERSCRIPT)
  23    .union(Options::ENABLE_SUBSCRIPT);
  24
  25#[derive(Default)]
  26struct ParseState {
  27    events: Vec<(Range<usize>, MarkdownEvent)>,
  28    root_block_starts: Vec<usize>,
  29    depth: usize,
  30}
  31
  32#[derive(Debug, Default)]
  33#[cfg_attr(test, derive(PartialEq))]
  34pub(crate) struct ParsedMarkdownData {
  35    pub events: Vec<(Range<usize>, MarkdownEvent)>,
  36    pub language_names: HashSet<SharedString>,
  37    pub language_paths: HashSet<Arc<str>>,
  38    pub root_block_starts: Vec<usize>,
  39    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
  40}
  41
  42impl ParseState {
  43    fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
  44        match &event {
  45            MarkdownEvent::Start(_) => {
  46                if self.depth == 0 {
  47                    self.root_block_starts.push(range.start);
  48                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  49                }
  50                self.depth += 1;
  51                self.events.push((range, event));
  52            }
  53            MarkdownEvent::End(_) => {
  54                self.events.push((range.clone(), event));
  55                if self.depth > 0 {
  56                    self.depth -= 1;
  57                    if self.depth == 0 {
  58                        let root_block_index = self.root_block_starts.len() - 1;
  59                        self.events
  60                            .push((range, MarkdownEvent::RootEnd(root_block_index)));
  61                    }
  62                }
  63            }
  64            MarkdownEvent::Rule => {
  65                if self.depth == 0 && !range.is_empty() {
  66                    self.root_block_starts.push(range.start);
  67                    let root_block_index = self.root_block_starts.len() - 1;
  68                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  69                    self.events.push((range.clone(), event));
  70                    self.events
  71                        .push((range, MarkdownEvent::RootEnd(root_block_index)));
  72                } else {
  73                    self.events.push((range, event));
  74                }
  75            }
  76            _ => {
  77                self.events.push((range, event));
  78            }
  79        }
  80    }
  81}
  82
  83pub(crate) fn parse_markdown_with_options(text: &str, parse_html: bool) -> ParsedMarkdownData {
  84    let mut state = ParseState::default();
  85    let mut language_names = HashSet::default();
  86    let mut language_paths = HashSet::default();
  87    let mut html_blocks = BTreeMap::default();
  88    let mut within_link = false;
  89    let mut within_code_block = false;
  90    let mut within_metadata = false;
  91    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
  92        .into_offset_iter()
  93        .peekable();
  94    while let Some((pulldown_event, range)) = parser.next() {
  95        if within_metadata {
  96            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
  97                pulldown_event
  98            {
  99                within_metadata = false;
 100            }
 101            continue;
 102        }
 103        match pulldown_event {
 104            pulldown_cmark::Event::Start(tag) => {
 105                if let pulldown_cmark::Tag::HtmlBlock = &tag {
 106                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));
 107
 108                    if parse_html {
 109                        if let Some(block) =
 110                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
 111                        {
 112                            html_blocks.insert(range.start, block);
 113
 114                            while let Some((event, end_range)) = parser.next() {
 115                                if let pulldown_cmark::Event::End(
 116                                    pulldown_cmark::TagEnd::HtmlBlock,
 117                                ) = event
 118                                {
 119                                    state.push_event(
 120                                        end_range,
 121                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
 122                                    );
 123                                    break;
 124                                }
 125                            }
 126                        }
 127                    }
 128                    continue;
 129                }
 130
 131                let tag = match tag {
 132                    pulldown_cmark::Tag::Link {
 133                        link_type,
 134                        dest_url,
 135                        title,
 136                        id,
 137                    } => {
 138                        within_link = true;
 139                        MarkdownTag::Link {
 140                            link_type,
 141                            dest_url: SharedString::from(dest_url.into_string()),
 142                            title: SharedString::from(title.into_string()),
 143                            id: SharedString::from(id.into_string()),
 144                        }
 145                    }
 146                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
 147                        within_metadata = true;
 148                        continue;
 149                    }
 150                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 151                        within_code_block = true;
 152                        MarkdownTag::CodeBlock {
 153                            kind: CodeBlockKind::Indented,
 154                            metadata: CodeBlockMetadata {
 155                                content_range: range.clone(),
 156                                line_count: 1,
 157                            },
 158                        }
 159                    }
 160                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 161                        ref info,
 162                    )) => {
 163                        within_code_block = true;
 164                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 165                        let content_range =
 166                            content_range.start + range.start..content_range.end + range.start;
 167
 168                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
 169                        let line_count = text[content_range.clone()]
 170                            .bytes()
 171                            .filter(|c| *c == b'\n')
 172                            .count();
 173                        let metadata = CodeBlockMetadata {
 174                            content_range,
 175                            line_count,
 176                        };
 177
 178                        let info = info.trim();
 179                        let kind = if info.is_empty() {
 180                            CodeBlockKind::Fenced
 181                            // Languages should never contain a slash, and PathRanges always should.
 182                            // (Models are told to specify them relative to a workspace root.)
 183                        } else if info.contains('/') {
 184                            let path_range = PathWithRange::new(info);
 185                            language_paths.insert(path_range.path.clone());
 186                            CodeBlockKind::FencedSrc(path_range)
 187                        } else {
 188                            let language = SharedString::from(info.to_string());
 189                            language_names.insert(language.clone());
 190                            CodeBlockKind::FencedLang(language)
 191                        };
 192
 193                        MarkdownTag::CodeBlock { kind, metadata }
 194                    }
 195                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
 196                    pulldown_cmark::Tag::Heading {
 197                        level,
 198                        id,
 199                        classes,
 200                        attrs,
 201                    } => {
 202                        let id = id.map(|id| SharedString::from(id.into_string()));
 203                        let classes = classes
 204                            .into_iter()
 205                            .map(|c| SharedString::from(c.into_string()))
 206                            .collect();
 207                        let attrs = attrs
 208                            .into_iter()
 209                            .map(|(key, value)| {
 210                                (
 211                                    SharedString::from(key.into_string()),
 212                                    value.map(|v| SharedString::from(v.into_string())),
 213                                )
 214                            })
 215                            .collect();
 216                        MarkdownTag::Heading {
 217                            level,
 218                            id,
 219                            classes,
 220                            attrs,
 221                        }
 222                    }
 223                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
 224                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
 225                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
 226                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
 227                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
 228                    }
 229                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
 230                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
 231                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
 232                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
 233                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
 234                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
 235                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
 236                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
 237                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
 238                    pulldown_cmark::Tag::Image {
 239                        link_type,
 240                        dest_url,
 241                        title,
 242                        id,
 243                    } => MarkdownTag::Image {
 244                        link_type,
 245                        dest_url: SharedString::from(dest_url.into_string()),
 246                        title: SharedString::from(title.into_string()),
 247                        id: SharedString::from(id.into_string()),
 248                    },
 249                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
 250                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
 251                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
 252                    pulldown_cmark::Tag::DefinitionListDefinition => {
 253                        MarkdownTag::DefinitionListDefinition
 254                    }
 255                };
 256                state.push_event(range, MarkdownEvent::Start(tag))
 257            }
 258            pulldown_cmark::Event::End(tag) => {
 259                if let pulldown_cmark::TagEnd::Link = tag {
 260                    within_link = false;
 261                } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
 262                    within_code_block = false;
 263                }
 264                state.push_event(range, MarkdownEvent::End(tag));
 265            }
 266            pulldown_cmark::Event::Text(parsed) => {
 267                fn event_for(
 268                    text: &str,
 269                    range: Range<usize>,
 270                    str: &str,
 271                ) -> (Range<usize>, MarkdownEvent) {
 272                    if str == &text[range.clone()] {
 273                        (range, MarkdownEvent::Text)
 274                    } else {
 275                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
 276                    }
 277                }
 278
 279                if within_code_block {
 280                    let (range, event) = event_for(text, range, &parsed);
 281                    state.push_event(range, event);
 282                    continue;
 283                }
 284
 285                #[derive(Debug)]
 286                struct TextRange<'a> {
 287                    source_range: Range<usize>,
 288                    merged_range: Range<usize>,
 289                    parsed: CowStr<'a>,
 290                }
 291
 292                let mut last_len = parsed.len();
 293                let mut ranges = vec![TextRange {
 294                    source_range: range.clone(),
 295                    merged_range: 0..last_len,
 296                    parsed,
 297                }];
 298
 299                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
 300                    || (parse_html
 301                        && matches!(
 302                            parser.peek(),
 303                            Some((pulldown_cmark::Event::InlineHtml(_), _))
 304                        ))
 305                {
 306                    let Some((next_event, next_range)) = parser.next() else {
 307                        unreachable!()
 308                    };
 309                    let next_text = match next_event {
 310                        pulldown_cmark::Event::Text(next_event) => next_event,
 311                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
 312                        _ => unreachable!(),
 313                    };
 314                    let next_len = last_len + next_text.len();
 315                    ranges.push(TextRange {
 316                        source_range: next_range.clone(),
 317                        merged_range: last_len..next_len,
 318                        parsed: next_text,
 319                    });
 320                    last_len = next_len;
 321                }
 322
 323                let mut merged_text =
 324                    String::with_capacity(ranges.last().unwrap().merged_range.end);
 325                for range in &ranges {
 326                    merged_text.push_str(&range.parsed);
 327                }
 328
 329                let mut ranges = ranges.into_iter().peekable();
 330
 331                if !within_link && !within_code_block {
 332                    let mut finder = LinkFinder::new();
 333                    finder.kinds(&[linkify::LinkKind::Url]);
 334
 335                    // Find links in the merged text
 336                    for link in finder.links(&merged_text) {
 337                        let link_start_in_merged = link.start();
 338                        let link_end_in_merged = link.end();
 339
 340                        while ranges
 341                            .peek()
 342                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
 343                        {
 344                            let range = ranges.next().unwrap();
 345                            let (range, event) = event_for(text, range.source_range, &range.parsed);
 346                            state.push_event(range, event);
 347                        }
 348
 349                        let Some(range) = ranges.peek_mut() else {
 350                            continue;
 351                        };
 352                        let prefix_len = link_start_in_merged - range.merged_range.start;
 353                        if prefix_len > 0 {
 354                            let (head, tail) = range.parsed.split_at(prefix_len);
 355                            let (event_range, event) = event_for(
 356                                text,
 357                                range.source_range.start..range.source_range.start + prefix_len,
 358                                head,
 359                            );
 360                            state.push_event(event_range, event);
 361                            range.parsed = CowStr::Boxed(tail.into());
 362                            range.merged_range.start += prefix_len;
 363                            range.source_range.start += prefix_len;
 364                        }
 365
 366                        let link_start_in_source = range.source_range.start;
 367                        let mut link_end_in_source = range.source_range.end;
 368                        let mut link_events = Vec::new();
 369
 370                        while ranges
 371                            .peek()
 372                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
 373                        {
 374                            let range = ranges.next().unwrap();
 375                            link_end_in_source = range.source_range.end;
 376                            link_events.push(event_for(text, range.source_range, &range.parsed));
 377                        }
 378
 379                        if let Some(range) = ranges.peek_mut() {
 380                            let prefix_len = link_end_in_merged - range.merged_range.start;
 381                            if prefix_len > 0 {
 382                                let (head, tail) = range.parsed.split_at(prefix_len);
 383                                link_events.push(event_for(
 384                                    text,
 385                                    range.source_range.start..range.source_range.start + prefix_len,
 386                                    head,
 387                                ));
 388                                range.parsed = CowStr::Boxed(tail.into());
 389                                range.merged_range.start += prefix_len;
 390                                range.source_range.start += prefix_len;
 391                                link_end_in_source = range.source_range.start;
 392                            }
 393                        }
 394                        let link_range = link_start_in_source..link_end_in_source;
 395
 396                        state.push_event(
 397                            link_range.clone(),
 398                            MarkdownEvent::Start(MarkdownTag::Link {
 399                                link_type: LinkType::Autolink,
 400                                dest_url: SharedString::from(link.as_str().to_string()),
 401                                title: SharedString::default(),
 402                                id: SharedString::default(),
 403                            }),
 404                        );
 405                        for (range, event) in link_events {
 406                            state.push_event(range, event);
 407                        }
 408                        state.push_event(
 409                            link_range.clone(),
 410                            MarkdownEvent::End(MarkdownTagEnd::Link),
 411                        );
 412                    }
 413                }
 414
 415                for range in ranges {
 416                    let (range, event) = event_for(text, range.source_range, &range.parsed);
 417                    state.push_event(range, event);
 418                }
 419            }
 420            pulldown_cmark::Event::Code(_) => {
 421                let content_range = extract_code_content_range(&text[range.clone()]);
 422                let content_range =
 423                    content_range.start + range.start..content_range.end + range.start;
 424                state.push_event(content_range, MarkdownEvent::Code)
 425            }
 426            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
 427            pulldown_cmark::Event::InlineHtml(_) => {
 428                state.push_event(range, MarkdownEvent::InlineHtml)
 429            }
 430            pulldown_cmark::Event::FootnoteReference(_) => {
 431                state.push_event(range, MarkdownEvent::FootnoteReference)
 432            }
 433            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
 434            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
 435            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
 436            pulldown_cmark::Event::TaskListMarker(checked) => {
 437                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
 438            }
 439            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
 440        }
 441    }
 442
 443    ParsedMarkdownData {
 444        events: state.events,
 445        language_names,
 446        language_paths,
 447        root_block_starts: state.root_block_starts,
 448        html_blocks,
 449    }
 450}
 451
 452pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
 453    let mut events = Vec::new();
 454    let mut finder = LinkFinder::new();
 455    finder.kinds(&[linkify::LinkKind::Url]);
 456    let mut text_range = Range {
 457        start: 0,
 458        end: text.len(),
 459    };
 460    for link in finder.links(text) {
 461        let link_range = link.start()..link.end();
 462
 463        if link_range.start > text_range.start {
 464            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
 465        }
 466
 467        events.push((
 468            link_range.clone(),
 469            MarkdownEvent::Start(MarkdownTag::Link {
 470                link_type: LinkType::Autolink,
 471                dest_url: SharedString::from(link.as_str().to_string()),
 472                title: SharedString::default(),
 473                id: SharedString::default(),
 474            }),
 475        ));
 476        events.push((link_range.clone(), MarkdownEvent::Text));
 477        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
 478
 479        text_range.start = link_range.end;
 480    }
 481
 482    if text_range.end > text_range.start {
 483        events.push((text_range, MarkdownEvent::Text));
 484    }
 485
 486    events
 487}
 488
 489/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
 490/// parse result for rendering without resorting to unsafe lifetime coercion.
 491#[derive(Clone, Debug, PartialEq)]
 492pub enum MarkdownEvent {
 493    /// Start of a tagged element. Events that are yielded after this event
 494    /// and before its corresponding `End` event are inside this element.
 495    /// Start and end events are guaranteed to be balanced.
 496    Start(MarkdownTag),
 497    /// End of a tagged element.
 498    End(MarkdownTagEnd),
 499    /// Text that uses the associated range from the markdown source.
 500    Text,
 501    /// Text that differs from the markdown source - typically due to substitution of HTML entities
 502    /// and smart punctuation.
 503    SubstitutedText(String),
 504    /// An inline code node.
 505    Code,
 506    /// An HTML node.
 507    Html,
 508    /// An inline HTML node.
 509    InlineHtml,
 510    /// A reference to a footnote with given label, which may or may not be defined
 511    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
 512    /// occur in any order.
 513    FootnoteReference,
 514    /// A soft line break.
 515    SoftBreak,
 516    /// A hard line break.
 517    HardBreak,
 518    /// A horizontal ruler.
 519    Rule,
 520    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
 521    TaskListMarker(bool),
 522    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
 523    RootStart,
 524    /// End of a root-level block. Contains the root block index.
 525    RootEnd(usize),
 526}
 527
 528/// Tags for elements that can contain other elements.
 529#[derive(Clone, Debug, PartialEq)]
 530pub enum MarkdownTag {
 531    /// A paragraph of text and other inline elements.
 532    Paragraph,
 533
 534    /// A heading, with optional identifier, classes and custom attributes.
 535    /// The identifier is prefixed with `#` and the last one in the attributes
 536    /// list is chosen, classes are prefixed with `.` and custom attributes
 537    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
 538    Heading {
 539        level: HeadingLevel,
 540        id: Option<SharedString>,
 541        classes: Vec<SharedString>,
 542        /// The first item of the tuple is the attr and second one the value.
 543        attrs: Vec<(SharedString, Option<SharedString>)>,
 544    },
 545
 546    BlockQuote,
 547
 548    /// A code block.
 549    CodeBlock {
 550        kind: CodeBlockKind,
 551        metadata: CodeBlockMetadata,
 552    },
 553
 554    /// A HTML block.
 555    HtmlBlock,
 556
 557    /// A list. If the list is ordered the field indicates the number of the first item.
 558    /// Contains only list items.
 559    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
 560
 561    /// A list item.
 562    Item,
 563
 564    /// A footnote definition. The value contained is the footnote's label by which it can
 565    /// be referred to.
 566    FootnoteDefinition(SharedString),
 567
 568    /// A table. Contains a vector describing the text-alignment for each of its columns.
 569    Table(Vec<Alignment>),
 570
 571    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
 572    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
 573    TableHead,
 574
 575    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
 576    TableRow,
 577    TableCell,
 578
 579    // span-level tags
 580    Emphasis,
 581    Strong,
 582    Strikethrough,
 583    Superscript,
 584    Subscript,
 585
 586    /// A link.
 587    Link {
 588        link_type: LinkType,
 589        dest_url: SharedString,
 590        title: SharedString,
 591        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
 592        id: SharedString,
 593    },
 594
 595    /// An image. The first field is the link type, the second the destination URL and the third is a title,
 596    /// the fourth is the link identifier.
 597    Image {
 598        link_type: LinkType,
 599        dest_url: SharedString,
 600        title: SharedString,
 601        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
 602        id: SharedString,
 603    },
 604
 605    /// A metadata block.
 606    MetadataBlock(MetadataBlockKind),
 607
 608    DefinitionList,
 609    DefinitionListTitle,
 610    DefinitionListDefinition,
 611}
 612
 613#[derive(Clone, Debug, PartialEq)]
 614pub enum CodeBlockKind {
 615    Indented,
 616    /// "Fenced" means "surrounded by triple backticks."
 617    /// There can optionally be either a language after the backticks (like in traditional Markdown)
 618    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
 619    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
 620    Fenced,
 621    FencedLang(SharedString),
 622    FencedSrc(PathWithRange),
 623}
 624
 625#[derive(Default, Clone, Debug, PartialEq)]
 626pub struct CodeBlockMetadata {
 627    pub content_range: Range<usize>,
 628    pub line_count: usize,
 629}
 630
 631fn extract_code_content_range(text: &str) -> Range<usize> {
 632    let text_len = text.len();
 633    if text_len == 0 {
 634        return 0..0;
 635    }
 636
 637    let start_ticks = text.chars().take_while(|&c| c == '`').count();
 638
 639    if start_ticks == 0 || start_ticks > text_len {
 640        return 0..text_len;
 641    }
 642
 643    let end_ticks = text.chars().rev().take_while(|&c| c == '`').count();
 644
 645    if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
 646        return 0..text_len;
 647    }
 648
 649    start_ticks..text_len - end_ticks
 650}
 651
 652pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
 653    let mut range = 0..text.len();
 654    if text.starts_with("```") {
 655        range.start += 3;
 656
 657        if let Some(newline_ix) = text[range.clone()].find('\n') {
 658            range.start += newline_ix + 1;
 659        }
 660    }
 661
 662    if !range.is_empty() && text.ends_with("```") {
 663        range.end -= 3;
 664    }
 665    if range.start > range.end {
 666        range.end = range.start;
 667    }
 668    range
 669}
 670
 671#[cfg(test)]
 672mod tests {
 673    use super::MarkdownEvent::*;
 674    use super::MarkdownTag::*;
 675    use super::*;
 676
 677    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
 678        .union(Options::ENABLE_MATH)
 679        .union(Options::ENABLE_DEFINITION_LIST)
 680        .union(Options::ENABLE_WIKILINKS);
 681
 682    #[test]
 683    fn all_options_considered() {
 684        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
 685        // can be evaluated for inclusion.
 686        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
 687    }
 688
 689    #[test]
 690    fn wanted_and_unwanted_options_disjoint() {
 691        assert_eq!(
 692            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
 693            Options::empty()
 694        );
 695    }
 696
 697    #[test]
 698    fn test_html_comments() {
 699        assert_eq!(
 700            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false),
 701            ParsedMarkdownData {
 702                events: vec![
 703                    (2..30, RootStart),
 704                    (2..30, Start(HtmlBlock)),
 705                    (2..2, SubstitutedText("  ".into())),
 706                    (2..7, Html),
 707                    (7..26, Html),
 708                    (26..30, Html),
 709                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
 710                    (2..30, RootEnd(0)),
 711                    (30..37, RootStart),
 712                    (30..37, Start(Paragraph)),
 713                    (30..37, Text),
 714                    (30..37, End(MarkdownTagEnd::Paragraph)),
 715                    (30..37, RootEnd(1)),
 716                ],
 717                root_block_starts: vec![2, 30],
 718                ..Default::default()
 719            }
 720        )
 721    }
 722
 723    #[test]
 724    fn test_plain_urls_and_escaped_text() {
 725        assert_eq!(
 726            parse_markdown_with_options(
 727                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
 728                false
 729            ),
 730            ParsedMarkdownData {
 731                events: vec![
 732                    (0..51, RootStart),
 733                    (0..51, Start(Paragraph)),
 734                    (0..6, SubstitutedText("\u{a0}".into())),
 735                    (6..12, SubstitutedText("\u{a0}".into())),
 736                    (12..13, Text),
 737                    (
 738                        13..29,
 739                        Start(Link {
 740                            link_type: LinkType::Autolink,
 741                            dest_url: "https://some.url".into(),
 742                            title: "".into(),
 743                            id: "".into(),
 744                        })
 745                    ),
 746                    (13..29, Text),
 747                    (13..29, End(MarkdownTagEnd::Link)),
 748                    (29..35, Text),
 749                    (36..37, Text), // Escaped backtick
 750                    (37..44, SubstitutedText("".into())),
 751                    (45..46, Text), // Escaped backtick
 752                    (46..51, Text),
 753                    (0..51, End(MarkdownTagEnd::Paragraph)),
 754                    (0..51, RootEnd(0)),
 755                ],
 756                root_block_starts: vec![0],
 757                ..Default::default()
 758            }
 759        );
 760    }
 761
 762    #[test]
 763    fn test_incomplete_link() {
 764        assert_eq!(
 765            parse_markdown_with_options(
 766                "You can use the [GitHub Search API](https://docs.github.com/en",
 767                false
 768            )
 769            .events,
 770            vec![
 771                (0..62, RootStart),
 772                (0..62, Start(Paragraph)),
 773                (0..16, Text),
 774                (16..17, Text),
 775                (17..34, Text),
 776                (34..35, Text),
 777                (35..36, Text),
 778                (
 779                    36..62,
 780                    Start(Link {
 781                        link_type: LinkType::Autolink,
 782                        dest_url: "https://docs.github.com/en".into(),
 783                        title: "".into(),
 784                        id: "".into()
 785                    })
 786                ),
 787                (36..62, Text),
 788                (36..62, End(MarkdownTagEnd::Link)),
 789                (0..62, End(MarkdownTagEnd::Paragraph)),
 790                (0..62, RootEnd(0)),
 791            ],
 792        );
 793    }
 794
 795    #[test]
 796    fn test_smart_punctuation() {
 797        assert_eq!(
 798            parse_markdown_with_options(
 799                "-- --- ... \"double quoted\" 'single quoted' ----------",
 800                false
 801            ),
 802            ParsedMarkdownData {
 803                events: vec![
 804                    (0..53, RootStart),
 805                    (0..53, Start(Paragraph)),
 806                    (0..2, SubstitutedText("".into())),
 807                    (2..3, Text),
 808                    (3..6, SubstitutedText("".into())),
 809                    (6..7, Text),
 810                    (7..10, SubstitutedText("".into())),
 811                    (10..11, Text),
 812                    (11..12, SubstitutedText("\u{201c}".into())),
 813                    (12..25, Text),
 814                    (25..26, SubstitutedText("\u{201d}".into())),
 815                    (26..27, Text),
 816                    (27..28, SubstitutedText("\u{2018}".into())),
 817                    (28..41, Text),
 818                    (41..42, SubstitutedText("\u{2019}".into())),
 819                    (42..43, Text),
 820                    (43..53, SubstitutedText("–––––".into())),
 821                    (0..53, End(MarkdownTagEnd::Paragraph)),
 822                    (0..53, RootEnd(0)),
 823                ],
 824                root_block_starts: vec![0],
 825                ..Default::default()
 826            }
 827        )
 828    }
 829
 830    #[test]
 831    fn test_code_block_metadata() {
 832        assert_eq!(
 833            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false),
 834            ParsedMarkdownData {
 835                events: vec![
 836                    (0..37, RootStart),
 837                    (
 838                        0..37,
 839                        Start(CodeBlock {
 840                            kind: CodeBlockKind::FencedLang("rust".into()),
 841                            metadata: CodeBlockMetadata {
 842                                content_range: 8..34,
 843                                line_count: 3
 844                            }
 845                        })
 846                    ),
 847                    (8..34, Text),
 848                    (0..37, End(MarkdownTagEnd::CodeBlock)),
 849                    (0..37, RootEnd(0)),
 850                ],
 851                language_names: {
 852                    let mut h = HashSet::default();
 853                    h.insert("rust".into());
 854                    h
 855                },
 856                root_block_starts: vec![0],
 857                ..Default::default()
 858            }
 859        );
 860        assert_eq!(
 861            parse_markdown_with_options("    fn main() {}", false),
 862            ParsedMarkdownData {
 863                events: vec![
 864                    (4..16, RootStart),
 865                    (
 866                        4..16,
 867                        Start(CodeBlock {
 868                            kind: CodeBlockKind::Indented,
 869                            metadata: CodeBlockMetadata {
 870                                content_range: 4..16,
 871                                line_count: 1
 872                            }
 873                        })
 874                    ),
 875                    (4..16, Text),
 876                    (4..16, End(MarkdownTagEnd::CodeBlock)),
 877                    (4..16, RootEnd(0)),
 878                ],
 879                root_block_starts: vec![4],
 880                ..Default::default()
 881            }
 882        );
 883    }
 884
 885    fn assert_code_block_does_not_emit_links(markdown: &str) {
 886        let parsed = parse_markdown_with_options(markdown, false);
 887        let mut code_block_depth = 0;
 888        let mut code_block_count = 0;
 889        let mut saw_text_inside_code_block = false;
 890
 891        for (_, event) in &parsed.events {
 892            match event {
 893                Start(CodeBlock { .. }) => {
 894                    code_block_depth += 1;
 895                    code_block_count += 1;
 896                }
 897                End(MarkdownTagEnd::CodeBlock) => {
 898                    assert!(
 899                        code_block_depth > 0,
 900                        "encountered a code block end without a matching start"
 901                    );
 902                    code_block_depth -= 1;
 903                }
 904                Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
 905                    assert_eq!(
 906                        code_block_depth, 0,
 907                        "code blocks should not emit link events"
 908                    );
 909                }
 910                Text | SubstitutedText(_) if code_block_depth > 0 => {
 911                    saw_text_inside_code_block = true;
 912                }
 913                _ => {}
 914            }
 915        }
 916
 917        assert_eq!(code_block_count, 1, "expected exactly one code block");
 918        assert_eq!(code_block_depth, 0, "unterminated code block");
 919        assert!(
 920            saw_text_inside_code_block,
 921            "expected text inside the code block"
 922        );
 923    }
 924
 925    #[test]
 926    fn test_code_blocks_do_not_autolink_urls() {
 927        assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
 928        assert_code_block_does_not_emit_links("    https://example.com");
 929        assert_code_block_does_not_emit_links(
 930            "```txt\r\nhttps:/\\/example.com\r\nhttps://example&#46;com\r\n```",
 931        );
 932        assert_code_block_does_not_emit_links(
 933            "    https:/\\/example.com\r\n    https://example&#46;com",
 934        );
 935    }
 936
 937    #[test]
 938    fn test_metadata_blocks_do_not_affect_root_blocks() {
 939        assert_eq!(
 940            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false),
 941            ParsedMarkdownData {
 942                events: vec![
 943                    (27..36, RootStart),
 944                    (27..36, Start(Paragraph)),
 945                    (27..36, Text),
 946                    (27..36, End(MarkdownTagEnd::Paragraph)),
 947                    (27..36, RootEnd(0)),
 948                ],
 949                root_block_starts: vec![27],
 950                ..Default::default()
 951            }
 952        );
 953    }
 954
 955    #[test]
 956    fn test_table_checkboxes_remain_text_in_cells() {
 957        let markdown = "\
 958| Done | Task    |
 959|------|---------|
 960| [x]  | Fix bug |
 961| [ ]  | Add feature |";
 962        let parsed = parse_markdown_with_options(markdown, false);
 963
 964        let mut in_table = false;
 965        let mut saw_task_list_marker = false;
 966        let mut cell_texts = Vec::new();
 967        let mut current_cell = String::new();
 968
 969        for (range, event) in &parsed.events {
 970            match event {
 971                Start(Table(_)) => in_table = true,
 972                End(MarkdownTagEnd::Table) => in_table = false,
 973                Start(TableCell) => current_cell.clear(),
 974                End(MarkdownTagEnd::TableCell) => {
 975                    if in_table {
 976                        cell_texts.push(current_cell.clone());
 977                    }
 978                }
 979                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
 980                TaskListMarker(_) if in_table => saw_task_list_marker = true,
 981                _ => {}
 982            }
 983        }
 984
 985        let checkbox_cells: Vec<&str> = cell_texts
 986            .iter()
 987            .map(|cell| cell.trim())
 988            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
 989            .collect();
 990
 991        assert!(
 992            !saw_task_list_marker,
 993            "Table checkboxes should remain text, not task-list markers"
 994        );
 995        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
 996    }
 997
 998    #[test]
 999    fn test_extract_code_content_range() {
1000        let input = "```let x = 5;```";
1001        assert_eq!(extract_code_content_range(input), 3..13);
1002
1003        let input = "``let x = 5;``";
1004        assert_eq!(extract_code_content_range(input), 2..12);
1005
1006        let input = "`let x = 5;`";
1007        assert_eq!(extract_code_content_range(input), 1..11);
1008
1009        let input = "plain text";
1010        assert_eq!(extract_code_content_range(input), 0..10);
1011
1012        let input = "``let x = 5;`";
1013        assert_eq!(extract_code_content_range(input), 0..13);
1014    }
1015
1016    #[test]
1017    fn test_extract_code_block_content_range() {
1018        let input = "```rust\nlet x = 5;\n```";
1019        assert_eq!(extract_code_block_content_range(input), 8..19);
1020
1021        let input = "plain text";
1022        assert_eq!(extract_code_block_content_range(input), 0..10);
1023
1024        let input = "```python\nprint('hello')\nprint('world')\n```";
1025        assert_eq!(extract_code_block_content_range(input), 10..40);
1026
1027        // Malformed input
1028        let input = "`````";
1029        assert_eq!(extract_code_block_content_range(input), 3..3);
1030    }
1031
1032    #[test]
1033    fn test_links_split_across_fragments() {
1034        // This test verifies that links split across multiple text fragments due to escaping or other issues
1035        // are correctly detected and processed
1036        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
1037        // We're verifying our parser can handle this correctly
1038        assert_eq!(
1039            parse_markdown_with_options(
1040                "https:/\\/example.com is equivalent to https://example&#46;com!",
1041                false
1042            )
1043            .events,
1044            vec![
1045                (0..62, RootStart),
1046                (0..62, Start(Paragraph)),
1047                (
1048                    0..20,
1049                    Start(Link {
1050                        link_type: LinkType::Autolink,
1051                        dest_url: "https://example.com".into(),
1052                        title: "".into(),
1053                        id: "".into()
1054                    })
1055                ),
1056                (0..7, Text),
1057                (8..20, Text),
1058                (0..20, End(MarkdownTagEnd::Link)),
1059                (20..38, Text),
1060                (
1061                    38..61,
1062                    Start(Link {
1063                        link_type: LinkType::Autolink,
1064                        dest_url: "https://example.com".into(),
1065                        title: "".into(),
1066                        id: "".into()
1067                    })
1068                ),
1069                (38..53, Text),
1070                (53..58, SubstitutedText(".".into())),
1071                (58..61, Text),
1072                (38..61, End(MarkdownTagEnd::Link)),
1073                (61..62, Text),
1074                (0..62, End(MarkdownTagEnd::Paragraph)),
1075                (0..62, RootEnd(0)),
1076            ],
1077        );
1078
1079        assert_eq!(
1080            parse_markdown_with_options(
1081                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
1082                false
1083            )
1084            .events,
1085            [
1086                (0..55, RootStart),
1087                (0..55, Start(Paragraph)),
1088                (0..6, Text),
1089                (
1090                    6..43,
1091                    Start(Link {
1092                        link_type: LinkType::Autolink,
1093                        dest_url: "https://example.com/cat/é\u{200d}".into(),
1094                        title: "".into(),
1095                        id: "".into()
1096                    })
1097                ),
1098                (6..29, Text),
1099                (30..33, Text),
1100                (33..40, SubstitutedText("\u{200d}".into())),
1101                (40..43, Text),
1102                (6..43, End(MarkdownTagEnd::Link)),
1103                (43..55, Text),
1104                (0..55, End(MarkdownTagEnd::Paragraph)),
1105                (0..55, RootEnd(0)),
1106            ]
1107        );
1108    }
1109}