parser.rs

   1use gpui::SharedString;
   2use linkify::LinkFinder;
   3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
   4use pulldown_cmark::{
   5    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
   6};
   7use std::{collections::BTreeMap, ops::Range, sync::Arc};
   8
   9use collections::HashSet;
  10
  11use crate::{html, path_range::PathWithRange};
  12
  13pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
  14    .union(Options::ENABLE_FOOTNOTES)
  15    .union(Options::ENABLE_STRIKETHROUGH)
  16    .union(Options::ENABLE_TASKLISTS)
  17    .union(Options::ENABLE_SMART_PUNCTUATION)
  18    .union(Options::ENABLE_HEADING_ATTRIBUTES)
  19    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
  20    .union(Options::ENABLE_OLD_FOOTNOTES)
  21    .union(Options::ENABLE_GFM)
  22    .union(Options::ENABLE_SUPERSCRIPT)
  23    .union(Options::ENABLE_SUBSCRIPT);
  24
  25#[derive(Default)]
  26struct ParseState {
  27    events: Vec<(Range<usize>, MarkdownEvent)>,
  28    root_block_starts: Vec<usize>,
  29    depth: usize,
  30}
  31
  32#[derive(Debug, Default)]
  33#[cfg_attr(test, derive(PartialEq))]
  34pub(crate) struct ParsedMarkdownData {
  35    pub events: Vec<(Range<usize>, MarkdownEvent)>,
  36    pub language_names: HashSet<SharedString>,
  37    pub language_paths: HashSet<Arc<str>>,
  38    pub root_block_starts: Vec<usize>,
  39    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
  40}
  41
  42impl ParseState {
  43    fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
  44        match &event {
  45            MarkdownEvent::Start(_) => {
  46                if self.depth == 0 {
  47                    self.root_block_starts.push(range.start);
  48                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  49                }
  50                self.depth += 1;
  51                self.events.push((range, event));
  52            }
  53            MarkdownEvent::End(_) => {
  54                self.events.push((range.clone(), event));
  55                if self.depth > 0 {
  56                    self.depth -= 1;
  57                    if self.depth == 0 {
  58                        let root_block_index = self.root_block_starts.len() - 1;
  59                        self.events
  60                            .push((range, MarkdownEvent::RootEnd(root_block_index)));
  61                    }
  62                }
  63            }
  64            MarkdownEvent::Rule => {
  65                if self.depth == 0 && !range.is_empty() {
  66                    self.root_block_starts.push(range.start);
  67                    let root_block_index = self.root_block_starts.len() - 1;
  68                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  69                    self.events.push((range.clone(), event));
  70                    self.events
  71                        .push((range, MarkdownEvent::RootEnd(root_block_index)));
  72                } else {
  73                    self.events.push((range, event));
  74                }
  75            }
  76            _ => {
  77                self.events.push((range, event));
  78            }
  79        }
  80    }
  81}
  82
/// Parses `text` as markdown and converts pulldown_cmark's event stream into
/// owned [`MarkdownEvent`]s paired with ranges into `text`.
///
/// While doing so it:
/// - routes every event through `ParseState::push_event`, which adds the
///   `RootStart` / `RootEnd` markers and records root block start offsets;
/// - skips metadata blocks entirely (their start, content and end);
/// - sorts fenced code block info strings into `language_paths` (when they
///   contain a `/`) or `language_names` (otherwise);
/// - merges runs of adjacent text events and, outside of links, wraps bare
///   URLs found by `linkify` in synthetic `Autolink` link events;
/// - drops math events.
///
/// When `parse_html` is true, each HTML block is handed to
/// `html::html_parser::parse_html_block`. If that returns a block, it is stored
/// in `html_blocks` under the block's start offset and the block's inner events
/// are discarded; otherwise the inner events are emitted as usual. With
/// `parse_html` set, inline HTML that directly follows text is also absorbed
/// into the text run as an empty piece.
pub(crate) fn parse_markdown_with_options(text: &str, parse_html: bool) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    // True between a link's Start and End so its text is not linkified again.
    let mut within_link = false;
    // True while the events of a metadata block are being skipped.
    let mut within_metadata = false;
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Swallow everything inside a metadata block, including its End event.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    // The Start event is always emitted, whether or not the block gets parsed.
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // Discard the block's inner events and emit only its End event.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    continue;
                }

                // Convert the borrowed pulldown tag into its owned counterpart.
                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        // No Start event is emitted; the block is skipped as a whole.
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        // Indented blocks record the whole block range and a fixed line count.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        // Shift the block-relative range so it indexes into `text`.
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                // Yields `Text` when the parsed string equals the source slice at `range`,
                // otherwise `SubstitutedText` carrying the parsed string.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }
                // One piece of a run of adjacent text events.
                #[derive(Debug)]
                struct TextRange<'a> {
                    // Where the piece sits in the markdown source.
                    source_range: Range<usize>,
                    // Where the piece sits in the concatenation of all parsed pieces.
                    merged_range: Range<usize>,
                    // The piece's text as reported by the parser.
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Gather the text events that immediately follow. With `parse_html` set,
                // inline HTML events are gathered too, but contribute an empty string.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                // Concatenate the pieces so a URL spanning several of them can be found.
                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit the pieces that end at or before the start of the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // Split off and emit the part of the current piece that precedes the link.
                        // NOTE(review): merged-text offsets are applied directly to source offsets
                        // here and below, which presumably assumes the parsed text and its source
                        // have the same byte length; confirm for substituted text.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        // Text events that belong inside the link, buffered until its source
                        // range is known.
                        let mut link_events = Vec::new();

                        // Pieces that lie entirely within the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // The link ends inside the next piece: its head goes into the link and
                        // its tail stays queued for later.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        // Emit the synthetic link: Start, the buffered text, then End.
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Emit whatever pieces remain after the last link (or all of them when
                // linkification was skipped).
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Narrow the range to what lies between the backtick delimiters, then make
                // it relative to `text`.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(_) => {
                state.push_event(range, MarkdownEvent::FootnoteReference)
            }
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events are dropped; `PARSE_OPTIONS` does not enable math.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
    }
}
 439
 440pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
 441    let mut events = Vec::new();
 442    let mut finder = LinkFinder::new();
 443    finder.kinds(&[linkify::LinkKind::Url]);
 444    let mut text_range = Range {
 445        start: 0,
 446        end: text.len(),
 447    };
 448    for link in finder.links(text) {
 449        let link_range = link.start()..link.end();
 450
 451        if link_range.start > text_range.start {
 452            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
 453        }
 454
 455        events.push((
 456            link_range.clone(),
 457            MarkdownEvent::Start(MarkdownTag::Link {
 458                link_type: LinkType::Autolink,
 459                dest_url: SharedString::from(link.as_str().to_string()),
 460                title: SharedString::default(),
 461                id: SharedString::default(),
 462            }),
 463        ));
 464        events.push((link_range.clone(), MarkdownEvent::Text));
 465        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
 466
 467        text_range.start = link_range.end;
 468    }
 469
 470    if text_range.end > text_range.start {
 471        events.push((text_range, MarkdownEvent::Text));
 472    }
 473
 474    events
 475}
 476
 477/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
 478/// parse result for rendering without resorting to unsafe lifetime coercion.
 479#[derive(Clone, Debug, PartialEq)]
 480pub enum MarkdownEvent {
 481    /// Start of a tagged element. Events that are yielded after this event
 482    /// and before its corresponding `End` event are inside this element.
 483    /// Start and end events are guaranteed to be balanced.
 484    Start(MarkdownTag),
 485    /// End of a tagged element.
 486    End(MarkdownTagEnd),
 487    /// Text that uses the associated range from the markdown source.
 488    Text,
 489    /// Text that differs from the markdown source - typically due to substitution of HTML entities
 490    /// and smart punctuation.
 491    SubstitutedText(String),
 492    /// An inline code node.
 493    Code,
 494    /// An HTML node.
 495    Html,
 496    /// An inline HTML node.
 497    InlineHtml,
 498    /// A reference to a footnote with given label, which may or may not be defined
 499    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
 500    /// occur in any order.
 501    FootnoteReference,
 502    /// A soft line break.
 503    SoftBreak,
 504    /// A hard line break.
 505    HardBreak,
 506    /// A horizontal ruler.
 507    Rule,
 508    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
 509    TaskListMarker(bool),
 510    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
 511    RootStart,
 512    /// End of a root-level block. Contains the root block index.
 513    RootEnd(usize),
 514}
 515
 516/// Tags for elements that can contain other elements.
 517#[derive(Clone, Debug, PartialEq)]
 518pub enum MarkdownTag {
 519    /// A paragraph of text and other inline elements.
 520    Paragraph,
 521
 522    /// A heading, with optional identifier, classes and custom attributes.
 523    /// The identifier is prefixed with `#` and the last one in the attributes
 524    /// list is chosen, classes are prefixed with `.` and custom attributes
 525    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
 526    Heading {
 527        level: HeadingLevel,
 528        id: Option<SharedString>,
 529        classes: Vec<SharedString>,
 530        /// The first item of the tuple is the attr and second one the value.
 531        attrs: Vec<(SharedString, Option<SharedString>)>,
 532    },
 533
 534    BlockQuote,
 535
 536    /// A code block.
 537    CodeBlock {
 538        kind: CodeBlockKind,
 539        metadata: CodeBlockMetadata,
 540    },
 541
 542    /// A HTML block.
 543    HtmlBlock,
 544
 545    /// A list. If the list is ordered the field indicates the number of the first item.
 546    /// Contains only list items.
 547    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
 548
 549    /// A list item.
 550    Item,
 551
 552    /// A footnote definition. The value contained is the footnote's label by which it can
 553    /// be referred to.
 554    FootnoteDefinition(SharedString),
 555
 556    /// A table. Contains a vector describing the text-alignment for each of its columns.
 557    Table(Vec<Alignment>),
 558
 559    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
 560    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
 561    TableHead,
 562
 563    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
 564    TableRow,
 565    TableCell,
 566
 567    // span-level tags
 568    Emphasis,
 569    Strong,
 570    Strikethrough,
 571    Superscript,
 572    Subscript,
 573
 574    /// A link.
 575    Link {
 576        link_type: LinkType,
 577        dest_url: SharedString,
 578        title: SharedString,
 579        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
 580        id: SharedString,
 581    },
 582
 583    /// An image. The first field is the link type, the second the destination URL and the third is a title,
 584    /// the fourth is the link identifier.
 585    Image {
 586        link_type: LinkType,
 587        dest_url: SharedString,
 588        title: SharedString,
 589        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
 590        id: SharedString,
 591    },
 592
 593    /// A metadata block.
 594    MetadataBlock(MetadataBlockKind),
 595
 596    DefinitionList,
 597    DefinitionListTitle,
 598    DefinitionListDefinition,
 599}
 600
 601#[derive(Clone, Debug, PartialEq)]
 602pub enum CodeBlockKind {
 603    Indented,
 604    /// "Fenced" means "surrounded by triple backticks."
 605    /// There can optionally be either a language after the backticks (like in traditional Markdown)
 606    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
 607    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
 608    Fenced,
 609    FencedLang(SharedString),
 610    FencedSrc(PathWithRange),
 611}
 612
 613#[derive(Default, Clone, Debug, PartialEq)]
 614pub struct CodeBlockMetadata {
 615    pub content_range: Range<usize>,
 616    pub line_count: usize,
 617}
 618
 619fn extract_code_content_range(text: &str) -> Range<usize> {
 620    let text_len = text.len();
 621    if text_len == 0 {
 622        return 0..0;
 623    }
 624
 625    let start_ticks = text.chars().take_while(|&c| c == '`').count();
 626
 627    if start_ticks == 0 || start_ticks > text_len {
 628        return 0..text_len;
 629    }
 630
 631    let end_ticks = text.chars().rev().take_while(|&c| c == '`').count();
 632
 633    if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
 634        return 0..text_len;
 635    }
 636
 637    start_ticks..text_len - end_ticks
 638}
 639
 640pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
 641    let mut range = 0..text.len();
 642    if text.starts_with("```") {
 643        range.start += 3;
 644
 645        if let Some(newline_ix) = text[range.clone()].find('\n') {
 646            range.start += newline_ix + 1;
 647        }
 648    }
 649
 650    if !range.is_empty() && text.ends_with("```") {
 651        range.end -= 3;
 652    }
 653    if range.start > range.end {
 654        range.end = range.start;
 655    }
 656    range
 657}
 658
 659#[cfg(test)]
 660mod tests {
 661    use super::MarkdownEvent::*;
 662    use super::MarkdownTag::*;
 663    use super::*;
 664
 665    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
 666        .union(Options::ENABLE_MATH)
 667        .union(Options::ENABLE_DEFINITION_LIST)
 668        .union(Options::ENABLE_WIKILINKS);
 669
 670    #[test]
 671    fn all_options_considered() {
 672        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
 673        // can be evaluated for inclusion.
 674        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
 675    }
 676
 677    #[test]
 678    fn wanted_and_unwanted_options_disjoint() {
 679        assert_eq!(
 680            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
 681            Options::empty()
 682        );
 683    }
 684
 685    #[test]
 686    fn test_html_comments() {
 687        assert_eq!(
 688            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false),
 689            ParsedMarkdownData {
 690                events: vec![
 691                    (2..30, RootStart),
 692                    (2..30, Start(HtmlBlock)),
 693                    (2..2, SubstitutedText("  ".into())),
 694                    (2..7, Html),
 695                    (7..26, Html),
 696                    (26..30, Html),
 697                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
 698                    (2..30, RootEnd(0)),
 699                    (30..37, RootStart),
 700                    (30..37, Start(Paragraph)),
 701                    (30..37, Text),
 702                    (30..37, End(MarkdownTagEnd::Paragraph)),
 703                    (30..37, RootEnd(1)),
 704                ],
 705                root_block_starts: vec![2, 30],
 706                ..Default::default()
 707            }
 708        )
 709    }
 710
 711    #[test]
 712    fn test_plain_urls_and_escaped_text() {
 713        assert_eq!(
 714            parse_markdown_with_options(
 715                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
 716                false
 717            ),
 718            ParsedMarkdownData {
 719                events: vec![
 720                    (0..51, RootStart),
 721                    (0..51, Start(Paragraph)),
 722                    (0..6, SubstitutedText("\u{a0}".into())),
 723                    (6..12, SubstitutedText("\u{a0}".into())),
 724                    (12..13, Text),
 725                    (
 726                        13..29,
 727                        Start(Link {
 728                            link_type: LinkType::Autolink,
 729                            dest_url: "https://some.url".into(),
 730                            title: "".into(),
 731                            id: "".into(),
 732                        })
 733                    ),
 734                    (13..29, Text),
 735                    (13..29, End(MarkdownTagEnd::Link)),
 736                    (29..35, Text),
 737                    (36..37, Text), // Escaped backtick
 738                    (37..44, SubstitutedText("".into())),
 739                    (45..46, Text), // Escaped backtick
 740                    (46..51, Text),
 741                    (0..51, End(MarkdownTagEnd::Paragraph)),
 742                    (0..51, RootEnd(0)),
 743                ],
 744                root_block_starts: vec![0],
 745                ..Default::default()
 746            }
 747        );
 748    }
 749
 750    #[test]
 751    fn test_incomplete_link() {
 752        assert_eq!(
 753            parse_markdown_with_options(
 754                "You can use the [GitHub Search API](https://docs.github.com/en",
 755                false
 756            )
 757            .events,
 758            vec![
 759                (0..62, RootStart),
 760                (0..62, Start(Paragraph)),
 761                (0..16, Text),
 762                (16..17, Text),
 763                (17..34, Text),
 764                (34..35, Text),
 765                (35..36, Text),
 766                (
 767                    36..62,
 768                    Start(Link {
 769                        link_type: LinkType::Autolink,
 770                        dest_url: "https://docs.github.com/en".into(),
 771                        title: "".into(),
 772                        id: "".into()
 773                    })
 774                ),
 775                (36..62, Text),
 776                (36..62, End(MarkdownTagEnd::Link)),
 777                (0..62, End(MarkdownTagEnd::Paragraph)),
 778                (0..62, RootEnd(0)),
 779            ],
 780        );
 781    }
 782
 783    #[test]
 784    fn test_smart_punctuation() {
 785        assert_eq!(
 786            parse_markdown_with_options(
 787                "-- --- ... \"double quoted\" 'single quoted' ----------",
 788                false
 789            ),
 790            ParsedMarkdownData {
 791                events: vec![
 792                    (0..53, RootStart),
 793                    (0..53, Start(Paragraph)),
 794                    (0..2, SubstitutedText("".into())),
 795                    (2..3, Text),
 796                    (3..6, SubstitutedText("".into())),
 797                    (6..7, Text),
 798                    (7..10, SubstitutedText("".into())),
 799                    (10..11, Text),
 800                    (11..12, SubstitutedText("\u{201c}".into())),
 801                    (12..25, Text),
 802                    (25..26, SubstitutedText("\u{201d}".into())),
 803                    (26..27, Text),
 804                    (27..28, SubstitutedText("\u{2018}".into())),
 805                    (28..41, Text),
 806                    (41..42, SubstitutedText("\u{2019}".into())),
 807                    (42..43, Text),
 808                    (43..53, SubstitutedText("–––––".into())),
 809                    (0..53, End(MarkdownTagEnd::Paragraph)),
 810                    (0..53, RootEnd(0)),
 811                ],
 812                root_block_starts: vec![0],
 813                ..Default::default()
 814            }
 815        )
 816    }
 817
 818    #[test]
 819    fn test_code_block_metadata() {
 820        assert_eq!(
 821            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false),
 822            ParsedMarkdownData {
 823                events: vec![
 824                    (0..37, RootStart),
 825                    (
 826                        0..37,
 827                        Start(CodeBlock {
 828                            kind: CodeBlockKind::FencedLang("rust".into()),
 829                            metadata: CodeBlockMetadata {
 830                                content_range: 8..34,
 831                                line_count: 3
 832                            }
 833                        })
 834                    ),
 835                    (8..34, Text),
 836                    (0..37, End(MarkdownTagEnd::CodeBlock)),
 837                    (0..37, RootEnd(0)),
 838                ],
 839                language_names: {
 840                    let mut h = HashSet::default();
 841                    h.insert("rust".into());
 842                    h
 843                },
 844                root_block_starts: vec![0],
 845                ..Default::default()
 846            }
 847        );
 848        assert_eq!(
 849            parse_markdown_with_options("    fn main() {}", false),
 850            ParsedMarkdownData {
 851                events: vec![
 852                    (4..16, RootStart),
 853                    (
 854                        4..16,
 855                        Start(CodeBlock {
 856                            kind: CodeBlockKind::Indented,
 857                            metadata: CodeBlockMetadata {
 858                                content_range: 4..16,
 859                                line_count: 1
 860                            }
 861                        })
 862                    ),
 863                    (4..16, Text),
 864                    (4..16, End(MarkdownTagEnd::CodeBlock)),
 865                    (4..16, RootEnd(0)),
 866                ],
 867                root_block_starts: vec![4],
 868                ..Default::default()
 869            }
 870        );
 871    }
 872
 873    #[test]
 874    fn test_metadata_blocks_do_not_affect_root_blocks() {
 875        assert_eq!(
 876            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false),
 877            ParsedMarkdownData {
 878                events: vec![
 879                    (27..36, RootStart),
 880                    (27..36, Start(Paragraph)),
 881                    (27..36, Text),
 882                    (27..36, End(MarkdownTagEnd::Paragraph)),
 883                    (27..36, RootEnd(0)),
 884                ],
 885                root_block_starts: vec![27],
 886                ..Default::default()
 887            }
 888        );
 889    }
 890
 891    #[test]
 892    fn test_table_checkboxes_remain_text_in_cells() {
 893        let markdown = "\
 894| Done | Task    |
 895|------|---------|
 896| [x]  | Fix bug |
 897| [ ]  | Add feature |";
 898        let parsed = parse_markdown_with_options(markdown, false);
 899
 900        let mut in_table = false;
 901        let mut saw_task_list_marker = false;
 902        let mut cell_texts = Vec::new();
 903        let mut current_cell = String::new();
 904
 905        for (range, event) in &parsed.events {
 906            match event {
 907                Start(Table(_)) => in_table = true,
 908                End(MarkdownTagEnd::Table) => in_table = false,
 909                Start(TableCell) => current_cell.clear(),
 910                End(MarkdownTagEnd::TableCell) => {
 911                    if in_table {
 912                        cell_texts.push(current_cell.clone());
 913                    }
 914                }
 915                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
 916                TaskListMarker(_) if in_table => saw_task_list_marker = true,
 917                _ => {}
 918            }
 919        }
 920
 921        let checkbox_cells: Vec<&str> = cell_texts
 922            .iter()
 923            .map(|cell| cell.trim())
 924            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
 925            .collect();
 926
 927        assert!(
 928            !saw_task_list_marker,
 929            "Table checkboxes should remain text, not task-list markers"
 930        );
 931        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
 932    }
 933
 934    #[test]
 935    fn test_extract_code_content_range() {
 936        let input = "```let x = 5;```";
 937        assert_eq!(extract_code_content_range(input), 3..13);
 938
 939        let input = "``let x = 5;``";
 940        assert_eq!(extract_code_content_range(input), 2..12);
 941
 942        let input = "`let x = 5;`";
 943        assert_eq!(extract_code_content_range(input), 1..11);
 944
 945        let input = "plain text";
 946        assert_eq!(extract_code_content_range(input), 0..10);
 947
 948        let input = "``let x = 5;`";
 949        assert_eq!(extract_code_content_range(input), 0..13);
 950    }
 951
 952    #[test]
 953    fn test_extract_code_block_content_range() {
 954        let input = "```rust\nlet x = 5;\n```";
 955        assert_eq!(extract_code_block_content_range(input), 8..19);
 956
 957        let input = "plain text";
 958        assert_eq!(extract_code_block_content_range(input), 0..10);
 959
 960        let input = "```python\nprint('hello')\nprint('world')\n```";
 961        assert_eq!(extract_code_block_content_range(input), 10..40);
 962
 963        // Malformed input
 964        let input = "`````";
 965        assert_eq!(extract_code_block_content_range(input), 3..3);
 966    }
 967
 968    #[test]
 969    fn test_links_split_across_fragments() {
 970        // This test verifies that links split across multiple text fragments due to escaping or other issues
 971        // are correctly detected and processed
 972        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
 973        // We're verifying our parser can handle this correctly
 974        assert_eq!(
 975            parse_markdown_with_options(
 976                "https:/\\/example.com is equivalent to https://example&#46;com!",
 977                false
 978            )
 979            .events,
 980            vec![
 981                (0..62, RootStart),
 982                (0..62, Start(Paragraph)),
 983                (
 984                    0..20,
 985                    Start(Link {
 986                        link_type: LinkType::Autolink,
 987                        dest_url: "https://example.com".into(),
 988                        title: "".into(),
 989                        id: "".into()
 990                    })
 991                ),
 992                (0..7, Text),
 993                (8..20, Text),
 994                (0..20, End(MarkdownTagEnd::Link)),
 995                (20..38, Text),
 996                (
 997                    38..61,
 998                    Start(Link {
 999                        link_type: LinkType::Autolink,
1000                        dest_url: "https://example.com".into(),
1001                        title: "".into(),
1002                        id: "".into()
1003                    })
1004                ),
1005                (38..53, Text),
1006                (53..58, SubstitutedText(".".into())),
1007                (58..61, Text),
1008                (38..61, End(MarkdownTagEnd::Link)),
1009                (61..62, Text),
1010                (0..62, End(MarkdownTagEnd::Paragraph)),
1011                (0..62, RootEnd(0)),
1012            ],
1013        );
1014
1015        assert_eq!(
1016            parse_markdown_with_options(
1017                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
1018                false
1019            )
1020            .events,
1021            [
1022                (0..55, RootStart),
1023                (0..55, Start(Paragraph)),
1024                (0..6, Text),
1025                (
1026                    6..43,
1027                    Start(Link {
1028                        link_type: LinkType::Autolink,
1029                        dest_url: "https://example.com/cat/é\u{200d}".into(),
1030                        title: "".into(),
1031                        id: "".into()
1032                    })
1033                ),
1034                (6..29, Text),
1035                (30..33, Text),
1036                (33..40, SubstitutedText("\u{200d}".into())),
1037                (40..43, Text),
1038                (6..43, End(MarkdownTagEnd::Link)),
1039                (43..55, Text),
1040                (0..55, End(MarkdownTagEnd::Paragraph)),
1041                (0..55, RootEnd(0)),
1042            ]
1043        );
1044    }
1045}