parser.rs

   1use collections::{BTreeMap, HashMap, HashSet};
   2use gpui::SharedString;
   3use linkify::LinkFinder;
   4pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
   5use pulldown_cmark::{
   6    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
   7};
   8use std::{ops::Range, sync::Arc};
   9use util::markdown::generate_heading_slug;
  10
  11use crate::{html, path_range::PathWithRange};
  12
/// The set of `pulldown_cmark` extensions enabled when this module parses Markdown.
///
/// `parse_markdown_with_options` passes this value to `Parser::new_ext`.
pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
  24
/// Mutable accumulator used while converting parser output into `MarkdownEvent`s.
#[derive(Default)]
struct ParseState {
    /// Every event emitted so far, paired with its byte range in the source text.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of each root-level block, in the order the blocks were opened.
    /// `MarkdownEvent::RootEnd` carries an index into this list.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events that have not yet been matched by an `End` event.
    depth: usize,
}
  31
/// The result of parsing a Markdown document with `parse_markdown_with_options`.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// The flat event stream, each event paired with its byte range in the source text.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that do not contain a `/` (treated as language names).
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contain a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of each root-level block, indexed by `MarkdownEvent::RootEnd`.
    pub root_block_starts: Vec<usize>,
    /// Successfully parsed HTML blocks, keyed by the block's start offset in the source.
    /// Only populated when HTML parsing was requested.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
    /// Maps each heading slug to a source offset inside that heading.
    /// Only populated when heading slugs were requested.
    pub heading_slugs: HashMap<SharedString, usize>,
}
  42
  43impl ParseState {
  44    fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
  45        match &event {
  46            MarkdownEvent::Start(_) => {
  47                if self.depth == 0 {
  48                    self.root_block_starts.push(range.start);
  49                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  50                }
  51                self.depth += 1;
  52                self.events.push((range, event));
  53            }
  54            MarkdownEvent::End(_) => {
  55                self.events.push((range.clone(), event));
  56                if self.depth > 0 {
  57                    self.depth -= 1;
  58                    if self.depth == 0 {
  59                        let root_block_index = self.root_block_starts.len() - 1;
  60                        self.events
  61                            .push((range, MarkdownEvent::RootEnd(root_block_index)));
  62                    }
  63                }
  64            }
  65            MarkdownEvent::Rule => {
  66                if self.depth == 0 && !range.is_empty() {
  67                    self.root_block_starts.push(range.start);
  68                    let root_block_index = self.root_block_starts.len() - 1;
  69                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  70                    self.events.push((range.clone(), event));
  71                    self.events
  72                        .push((range, MarkdownEvent::RootEnd(root_block_index)));
  73                } else {
  74                    self.events.push((range, event));
  75                }
  76            }
  77            _ => {
  78                self.events.push((range, event));
  79            }
  80        }
  81    }
  82}
  83
/// Upper bound on the per-slug duplicate counter used by `build_heading_slugs`: once a slug's
/// counter reaches this value, a heading whose candidate slug still collides gets no slug.
const MAX_DUPLICATE_HEADING_SLUGS: usize = 128;
  85
  86fn build_heading_slugs(
  87    source: &str,
  88    events: &[(Range<usize>, MarkdownEvent)],
  89) -> HashMap<SharedString, usize> {
  90    let mut slugs = HashMap::default();
  91    let mut slug_counts: HashMap<String, usize> = HashMap::default();
  92    let mut inside_heading = false;
  93    let mut heading_text = String::new();
  94    let mut heading_source_start: Option<usize> = None;
  95
  96    for (range, event) in events {
  97        match event {
  98            MarkdownEvent::Start(MarkdownTag::Heading { .. }) => {
  99                inside_heading = true;
 100                heading_text.clear();
 101                heading_source_start = None;
 102            }
 103            MarkdownEvent::End(MarkdownTagEnd::Heading(_)) => {
 104                if inside_heading {
 105                    let source_offset = heading_source_start.unwrap_or(range.start);
 106                    let base_slug = generate_heading_slug(&heading_text);
 107                    let count = slug_counts.entry(base_slug.clone()).or_insert(0);
 108                    let mut slug = if *count == 0 {
 109                        base_slug.clone()
 110                    } else {
 111                        format!("{base_slug}-{count}")
 112                    };
 113                    *count += 1;
 114                    while slugs.contains_key(slug.as_str()) {
 115                        let Some(count) = slug_counts.get_mut(&base_slug) else {
 116                            slug.clear();
 117                            break;
 118                        };
 119                        if *count >= MAX_DUPLICATE_HEADING_SLUGS {
 120                            slug.clear();
 121                            break;
 122                        }
 123                        slug = format!("{base_slug}-{count}");
 124                        *count += 1;
 125                    }
 126                    if !slug.is_empty() {
 127                        slugs.insert(SharedString::from(slug), source_offset);
 128                    }
 129                    inside_heading = false;
 130                }
 131            }
 132            MarkdownEvent::Text | MarkdownEvent::Code if inside_heading => {
 133                if heading_source_start.is_none() {
 134                    heading_source_start = Some(range.start);
 135                }
 136                heading_text.push_str(&source[range.clone()]);
 137            }
 138            MarkdownEvent::SubstitutedText(substituted) if inside_heading => {
 139                if heading_source_start.is_none() {
 140                    heading_source_start = Some(range.start);
 141                }
 142                heading_text.push_str(substituted);
 143            }
 144            _ => {}
 145        }
 146    }
 147
 148    slugs
 149}
 150
/// Parses `text` as Markdown into a flat list of `MarkdownEvent`s paired with source byte ranges.
///
/// The text is run through `pulldown_cmark` with `PARSE_OPTIONS` and every parser event is
/// converted into its owned counterpart. Along the way this function:
/// - skips everything inside metadata blocks,
/// - brackets each top-level block with `RootStart` / `RootEnd` (via `ParseState::push_event`),
/// - collects the language names and paths found in fenced code block info strings,
/// - merges runs of adjacent text events and wraps bare URLs found in them in synthetic
///   `Autolink` link events, unless the text is already inside a link or a code block,
/// - ignores math events.
///
/// * `parse_html` - when true, each HTML block is also handed to
///   `html::html_parser::parse_html_block`. A block that parses successfully is stored in
///   `html_blocks` under its start offset and only its `Start`/`End` events are emitted.
///   In addition, inline HTML that directly follows text is absorbed into that text run and
///   contributes no text of its own.
/// * `parse_heading_slugs` - when true, `heading_slugs` is filled in by `build_heading_slugs`;
///   otherwise it is left empty.
pub(crate) fn parse_markdown_with_options(
    text: &str,
    parse_html: bool,
    parse_heading_slugs: bool,
) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    let mut within_link = false;
    let mut within_code_block = false;
    let mut within_metadata = false;
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Drop every event that belongs to a metadata block, including its end tag.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    // The Start event is always emitted, whether or not the block gets parsed.
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // The block was parsed as a whole: discard its inner events and
                            // emit only the matching End.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    // When the block was not consumed above, its remaining events are handled
                    // by the regular match arms on the following iterations.
                    continue;
                }

                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        // Suppresses URL auto-detection for text inside this link.
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        // No event is emitted for metadata blocks; skip until the block ends.
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        within_code_block = true;
                        // Indented blocks report the whole block range and a fixed line count.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        within_code_block = true;
                        // The helper returns a range relative to the block; shift it so it
                        // indexes into `text`.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        // Convert the borrowed heading attributes into owned strings.
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
                    within_code_block = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                /// Produces `Text` when `str` equals the source slice at `range`, and
                /// `SubstitutedText` carrying `str` otherwise.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }

                // Text inside a code block is emitted directly: no merging, no link detection.
                if within_code_block {
                    let (range, event) = event_for(text, range, &parsed);
                    state.push_event(range, event);
                    continue;
                }

                /// One text event of the current run: where it sits in the source, where it
                /// sits in the merged string, and its parsed text.
                #[derive(Debug)]
                struct TextRange<'a> {
                    source_range: Range<usize>,
                    merged_range: Range<usize>,
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Absorb the following Text events (and InlineHtml events when `parse_html`
                // is set) into the same run so that links can be detected across event
                // boundaries. InlineHtml contributes an empty string to the merged text.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link && !within_code_block {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit every piece that ends at or before the link start unchanged.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // If the link starts in the middle of this piece, emit the part before
                        // the link and keep only the remainder in the piece.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        let mut link_events = Vec::new();

                        // Collect every piece that lies entirely inside the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // If the link ends in the middle of the next piece, the part up to the
                        // link end belongs to the link and the rest stays for later.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        // Wrap the collected text events in a synthetic autolink.
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Emit whatever text is left after the last link (or all of it if none matched).
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Report only the code content, shifted to an absolute offset into `text`.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(_) => {
                state.push_event(range, MarkdownEvent::FootnoteReference)
            }
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events are ignored.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    let heading_slugs = if parse_heading_slugs {
        build_heading_slugs(text, &state.events)
    } else {
        HashMap::default()
    };

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
        heading_slugs,
    }
}
 530
 531pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
 532    let mut events = Vec::new();
 533    let mut finder = LinkFinder::new();
 534    finder.kinds(&[linkify::LinkKind::Url]);
 535    let mut text_range = Range {
 536        start: 0,
 537        end: text.len(),
 538    };
 539    for link in finder.links(text) {
 540        let link_range = link.start()..link.end();
 541
 542        if link_range.start > text_range.start {
 543            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
 544        }
 545
 546        events.push((
 547            link_range.clone(),
 548            MarkdownEvent::Start(MarkdownTag::Link {
 549                link_type: LinkType::Autolink,
 550                dest_url: SharedString::from(link.as_str().to_string()),
 551                title: SharedString::default(),
 552                id: SharedString::default(),
 553            }),
 554        ));
 555        events.push((link_range.clone(), MarkdownEvent::Text));
 556        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
 557
 558        text_range.start = link_range.end;
 559    }
 560
 561    if text_range.end > text_range.start {
 562        events.push((text_range, MarkdownEvent::Text));
 563    }
 564
 565    events
 566}
 567
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are stored next to the byte range of the markdown source they refer to; most
/// variants carry no text of their own and rely on that range instead.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(String),
    /// An inline code node. The parser in this file associates it with the range of the code
    /// content, i.e. without the delimiting backticks.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    RootStart,
    /// End of a root-level block. Contains the root block index, i.e. the position of the
    /// block's start offset in the parse result's `root_block_starts`.
    RootEnd(usize),
}
 606
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag` carried by `MarkdownEvent::Start`.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row.
    TableCell,

    // span-level tags
    Emphasis,
    Strong,
    Strikethrough,
    Superscript,
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block. The parser in this file skips metadata blocks instead of emitting
    /// this tag.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The term being defined in a definition list.
    DefinitionListTitle,
    /// A definition belonging to a definition list term.
    DefinitionListDefinition,
}
 691
/// How a code block was written in the source, and what its info string (if any) names.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block introduced by indentation rather than by a fence.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is used when the info string is empty after trimming.
    Fenced,
    /// A fenced block whose info string contains no `/`; the string is treated as a language name.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/`; the string is parsed as a path with an
    /// optional range.
    FencedSrc(PathWithRange),
}
 703
 704#[derive(Default, Clone, Debug, PartialEq)]
 705pub struct CodeBlockMetadata {
 706    pub content_range: Range<usize>,
 707    pub line_count: usize,
 708}
 709
 710fn extract_code_content_range(text: &str) -> Range<usize> {
 711    let text_len = text.len();
 712    if text_len == 0 {
 713        return 0..0;
 714    }
 715
 716    let start_ticks = text.chars().take_while(|&c| c == '`').count();
 717
 718    if start_ticks == 0 || start_ticks > text_len {
 719        return 0..text_len;
 720    }
 721
 722    let end_ticks = text.chars().rev().take_while(|&c| c == '`').count();
 723
 724    if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
 725        return 0..text_len;
 726    }
 727
 728    start_ticks..text_len - end_ticks
 729}
 730
 731pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
 732    let mut range = 0..text.len();
 733    if text.starts_with("```") {
 734        range.start += 3;
 735
 736        if let Some(newline_ix) = text[range.clone()].find('\n') {
 737            range.start += newline_ix + 1;
 738        }
 739    }
 740
 741    if !range.is_empty() && text.ends_with("```") {
 742        range.end -= 3;
 743    }
 744    if range.start > range.end {
 745        range.end = range.start;
 746    }
 747    range
 748}
 749
 750#[cfg(test)]
 751mod tests {
 752    use super::MarkdownEvent::*;
 753    use super::MarkdownTag::*;
 754    use super::*;
 755
 756    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
 757        .union(Options::ENABLE_MATH)
 758        .union(Options::ENABLE_DEFINITION_LIST)
 759        .union(Options::ENABLE_WIKILINKS);
 760
 761    #[test]
 762    fn all_options_considered() {
 763        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
 764        // can be evaluated for inclusion.
 765        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
 766    }
 767
 768    #[test]
 769    fn wanted_and_unwanted_options_disjoint() {
 770        assert_eq!(
 771            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
 772            Options::empty()
 773        );
 774    }
 775
 776    #[test]
 777    fn test_html_comments() {
 778        assert_eq!(
 779            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false, false),
 780            ParsedMarkdownData {
 781                events: vec![
 782                    (2..30, RootStart),
 783                    (2..30, Start(HtmlBlock)),
 784                    (2..2, SubstitutedText("  ".into())),
 785                    (2..7, Html),
 786                    (7..26, Html),
 787                    (26..30, Html),
 788                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
 789                    (2..30, RootEnd(0)),
 790                    (30..37, RootStart),
 791                    (30..37, Start(Paragraph)),
 792                    (30..37, Text),
 793                    (30..37, End(MarkdownTagEnd::Paragraph)),
 794                    (30..37, RootEnd(1)),
 795                ],
 796                root_block_starts: vec![2, 30],
 797                ..Default::default()
 798            }
 799        )
 800    }
 801
 802    #[test]
 803    fn test_plain_urls_and_escaped_text() {
 804        assert_eq!(
 805            parse_markdown_with_options(
 806                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
 807                false,
 808                false,
 809            ),
 810            ParsedMarkdownData {
 811                events: vec![
 812                    (0..51, RootStart),
 813                    (0..51, Start(Paragraph)),
 814                    (0..6, SubstitutedText("\u{a0}".into())),
 815                    (6..12, SubstitutedText("\u{a0}".into())),
 816                    (12..13, Text),
 817                    (
 818                        13..29,
 819                        Start(Link {
 820                            link_type: LinkType::Autolink,
 821                            dest_url: "https://some.url".into(),
 822                            title: "".into(),
 823                            id: "".into(),
 824                        })
 825                    ),
 826                    (13..29, Text),
 827                    (13..29, End(MarkdownTagEnd::Link)),
 828                    (29..35, Text),
 829                    (36..37, Text), // Escaped backtick
 830                    (37..44, SubstitutedText("".into())),
 831                    (45..46, Text), // Escaped backtick
 832                    (46..51, Text),
 833                    (0..51, End(MarkdownTagEnd::Paragraph)),
 834                    (0..51, RootEnd(0)),
 835                ],
 836                root_block_starts: vec![0],
 837                ..Default::default()
 838            }
 839        );
 840    }
 841
 842    #[test]
 843    fn test_incomplete_link() {
 844        assert_eq!(
 845            parse_markdown_with_options(
 846                "You can use the [GitHub Search API](https://docs.github.com/en",
 847                false,
 848                false,
 849            )
 850            .events,
 851            vec![
 852                (0..62, RootStart),
 853                (0..62, Start(Paragraph)),
 854                (0..16, Text),
 855                (16..17, Text),
 856                (17..34, Text),
 857                (34..35, Text),
 858                (35..36, Text),
 859                (
 860                    36..62,
 861                    Start(Link {
 862                        link_type: LinkType::Autolink,
 863                        dest_url: "https://docs.github.com/en".into(),
 864                        title: "".into(),
 865                        id: "".into()
 866                    })
 867                ),
 868                (36..62, Text),
 869                (36..62, End(MarkdownTagEnd::Link)),
 870                (0..62, End(MarkdownTagEnd::Paragraph)),
 871                (0..62, RootEnd(0)),
 872            ],
 873        );
 874    }
 875
 876    #[test]
 877    fn test_smart_punctuation() {
 878        assert_eq!(
 879            parse_markdown_with_options(
 880                "-- --- ... \"double quoted\" 'single quoted' ----------",
 881                false,
 882                false,
 883            ),
 884            ParsedMarkdownData {
 885                events: vec![
 886                    (0..53, RootStart),
 887                    (0..53, Start(Paragraph)),
 888                    (0..2, SubstitutedText("".into())),
 889                    (2..3, Text),
 890                    (3..6, SubstitutedText("".into())),
 891                    (6..7, Text),
 892                    (7..10, SubstitutedText("".into())),
 893                    (10..11, Text),
 894                    (11..12, SubstitutedText("\u{201c}".into())),
 895                    (12..25, Text),
 896                    (25..26, SubstitutedText("\u{201d}".into())),
 897                    (26..27, Text),
 898                    (27..28, SubstitutedText("\u{2018}".into())),
 899                    (28..41, Text),
 900                    (41..42, SubstitutedText("\u{2019}".into())),
 901                    (42..43, Text),
 902                    (43..53, SubstitutedText("–––––".into())),
 903                    (0..53, End(MarkdownTagEnd::Paragraph)),
 904                    (0..53, RootEnd(0)),
 905                ],
 906                root_block_starts: vec![0],
 907                ..Default::default()
 908            }
 909        )
 910    }
 911
 912    #[test]
 913    fn test_code_block_metadata() {
 914        assert_eq!(
 915            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false, false),
 916            ParsedMarkdownData {
 917                events: vec![
 918                    (0..37, RootStart),
 919                    (
 920                        0..37,
 921                        Start(CodeBlock {
 922                            kind: CodeBlockKind::FencedLang("rust".into()),
 923                            metadata: CodeBlockMetadata {
 924                                content_range: 8..34,
 925                                line_count: 3
 926                            }
 927                        })
 928                    ),
 929                    (8..34, Text),
 930                    (0..37, End(MarkdownTagEnd::CodeBlock)),
 931                    (0..37, RootEnd(0)),
 932                ],
 933                language_names: {
 934                    let mut h = HashSet::default();
 935                    h.insert("rust".into());
 936                    h
 937                },
 938                root_block_starts: vec![0],
 939                ..Default::default()
 940            }
 941        );
 942        assert_eq!(
 943            parse_markdown_with_options("    fn main() {}", false, false),
 944            ParsedMarkdownData {
 945                events: vec![
 946                    (4..16, RootStart),
 947                    (
 948                        4..16,
 949                        Start(CodeBlock {
 950                            kind: CodeBlockKind::Indented,
 951                            metadata: CodeBlockMetadata {
 952                                content_range: 4..16,
 953                                line_count: 1
 954                            }
 955                        })
 956                    ),
 957                    (4..16, Text),
 958                    (4..16, End(MarkdownTagEnd::CodeBlock)),
 959                    (4..16, RootEnd(0)),
 960                ],
 961                root_block_starts: vec![4],
 962                ..Default::default()
 963            }
 964        );
 965    }
 966
 967    fn assert_code_block_does_not_emit_links(markdown: &str) {
 968        let parsed = parse_markdown_with_options(markdown, false, false);
 969        let mut code_block_depth = 0;
 970        let mut code_block_count = 0;
 971        let mut saw_text_inside_code_block = false;
 972
 973        for (_, event) in &parsed.events {
 974            match event {
 975                Start(CodeBlock { .. }) => {
 976                    code_block_depth += 1;
 977                    code_block_count += 1;
 978                }
 979                End(MarkdownTagEnd::CodeBlock) => {
 980                    assert!(
 981                        code_block_depth > 0,
 982                        "encountered a code block end without a matching start"
 983                    );
 984                    code_block_depth -= 1;
 985                }
 986                Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
 987                    assert_eq!(
 988                        code_block_depth, 0,
 989                        "code blocks should not emit link events"
 990                    );
 991                }
 992                Text | SubstitutedText(_) if code_block_depth > 0 => {
 993                    saw_text_inside_code_block = true;
 994                }
 995                _ => {}
 996            }
 997        }
 998
 999        assert_eq!(code_block_count, 1, "expected exactly one code block");
1000        assert_eq!(code_block_depth, 0, "unterminated code block");
1001        assert!(
1002            saw_text_inside_code_block,
1003            "expected text inside the code block"
1004        );
1005    }
1006
1007    #[test]
1008    fn test_code_blocks_do_not_autolink_urls() {
1009        assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
1010        assert_code_block_does_not_emit_links("    https://example.com");
1011        assert_code_block_does_not_emit_links(
1012            "```txt\r\nhttps:/\\/example.com\r\nhttps://example&#46;com\r\n```",
1013        );
1014        assert_code_block_does_not_emit_links(
1015            "    https:/\\/example.com\r\n    https://example&#46;com",
1016        );
1017    }
1018
1019    #[test]
1020    fn test_metadata_blocks_do_not_affect_root_blocks() {
1021        assert_eq!(
1022            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false, false),
1023            ParsedMarkdownData {
1024                events: vec![
1025                    (27..36, RootStart),
1026                    (27..36, Start(Paragraph)),
1027                    (27..36, Text),
1028                    (27..36, End(MarkdownTagEnd::Paragraph)),
1029                    (27..36, RootEnd(0)),
1030                ],
1031                root_block_starts: vec![27],
1032                ..Default::default()
1033            }
1034        );
1035    }
1036
1037    #[test]
1038    fn test_table_checkboxes_remain_text_in_cells() {
1039        let markdown = "\
1040| Done | Task    |
1041|------|---------|
1042| [x]  | Fix bug |
1043| [ ]  | Add feature |";
1044        let parsed = parse_markdown_with_options(markdown, false, false);
1045
1046        let mut in_table = false;
1047        let mut saw_task_list_marker = false;
1048        let mut cell_texts = Vec::new();
1049        let mut current_cell = String::new();
1050
1051        for (range, event) in &parsed.events {
1052            match event {
1053                Start(Table(_)) => in_table = true,
1054                End(MarkdownTagEnd::Table) => in_table = false,
1055                Start(TableCell) => current_cell.clear(),
1056                End(MarkdownTagEnd::TableCell) => {
1057                    if in_table {
1058                        cell_texts.push(current_cell.clone());
1059                    }
1060                }
1061                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
1062                TaskListMarker(_) if in_table => saw_task_list_marker = true,
1063                _ => {}
1064            }
1065        }
1066
1067        let checkbox_cells: Vec<&str> = cell_texts
1068            .iter()
1069            .map(|cell| cell.trim())
1070            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
1071            .collect();
1072
1073        assert!(
1074            !saw_task_list_marker,
1075            "Table checkboxes should remain text, not task-list markers"
1076        );
1077        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
1078    }
1079
1080    #[test]
1081    fn test_extract_code_content_range() {
1082        let input = "```let x = 5;```";
1083        assert_eq!(extract_code_content_range(input), 3..13);
1084
1085        let input = "``let x = 5;``";
1086        assert_eq!(extract_code_content_range(input), 2..12);
1087
1088        let input = "`let x = 5;`";
1089        assert_eq!(extract_code_content_range(input), 1..11);
1090
1091        let input = "plain text";
1092        assert_eq!(extract_code_content_range(input), 0..10);
1093
1094        let input = "``let x = 5;`";
1095        assert_eq!(extract_code_content_range(input), 0..13);
1096    }
1097
1098    #[test]
1099    fn test_extract_code_block_content_range() {
1100        let input = "```rust\nlet x = 5;\n```";
1101        assert_eq!(extract_code_block_content_range(input), 8..19);
1102
1103        let input = "plain text";
1104        assert_eq!(extract_code_block_content_range(input), 0..10);
1105
1106        let input = "```python\nprint('hello')\nprint('world')\n```";
1107        assert_eq!(extract_code_block_content_range(input), 10..40);
1108
1109        // Malformed input
1110        let input = "`````";
1111        assert_eq!(extract_code_block_content_range(input), 3..3);
1112    }
1113
1114    #[test]
1115    fn test_links_split_across_fragments() {
1116        // This test verifies that links split across multiple text fragments due to escaping or other issues
1117        // are correctly detected and processed
1118        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
1119        // We're verifying our parser can handle this correctly
1120        assert_eq!(
1121            parse_markdown_with_options(
1122                "https:/\\/example.com is equivalent to https://example&#46;com!",
1123                false,
1124                false,
1125            )
1126            .events,
1127            vec![
1128                (0..62, RootStart),
1129                (0..62, Start(Paragraph)),
1130                (
1131                    0..20,
1132                    Start(Link {
1133                        link_type: LinkType::Autolink,
1134                        dest_url: "https://example.com".into(),
1135                        title: "".into(),
1136                        id: "".into()
1137                    })
1138                ),
1139                (0..7, Text),
1140                (8..20, Text),
1141                (0..20, End(MarkdownTagEnd::Link)),
1142                (20..38, Text),
1143                (
1144                    38..61,
1145                    Start(Link {
1146                        link_type: LinkType::Autolink,
1147                        dest_url: "https://example.com".into(),
1148                        title: "".into(),
1149                        id: "".into()
1150                    })
1151                ),
1152                (38..53, Text),
1153                (53..58, SubstitutedText(".".into())),
1154                (58..61, Text),
1155                (38..61, End(MarkdownTagEnd::Link)),
1156                (61..62, Text),
1157                (0..62, End(MarkdownTagEnd::Paragraph)),
1158                (0..62, RootEnd(0)),
1159            ],
1160        );
1161
1162        assert_eq!(
1163            parse_markdown_with_options(
1164                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
1165                false,
1166                false,
1167            )
1168            .events,
1169            [
1170                (0..55, RootStart),
1171                (0..55, Start(Paragraph)),
1172                (0..6, Text),
1173                (
1174                    6..43,
1175                    Start(Link {
1176                        link_type: LinkType::Autolink,
1177                        dest_url: "https://example.com/cat/é\u{200d}".into(),
1178                        title: "".into(),
1179                        id: "".into()
1180                    })
1181                ),
1182                (6..29, Text),
1183                (30..33, Text),
1184                (33..40, SubstitutedText("\u{200d}".into())),
1185                (40..43, Text),
1186                (6..43, End(MarkdownTagEnd::Link)),
1187                (43..55, Text),
1188                (0..55, End(MarkdownTagEnd::Paragraph)),
1189                (0..55, RootEnd(0)),
1190            ]
1191        );
1192    }
1193
1194    #[test]
1195    fn test_heading_slugs() {
1196        let parsed = parse_markdown_with_options(
1197            "# Hello World\n\n## Code `block`\n\n### Third Level\n\n#### Fourth Level\n\n## Hello World",
1198            false,
1199            true,
1200        );
1201        assert_eq!(parsed.heading_slugs.len(), 5);
1202        assert!(parsed.heading_slugs.contains_key("hello-world"));
1203        assert!(parsed.heading_slugs.contains_key("code-block"));
1204        assert!(parsed.heading_slugs.contains_key("third-level"));
1205        assert!(parsed.heading_slugs.contains_key("fourth-level"));
1206        assert!(parsed.heading_slugs.contains_key("hello-world-1"));
1207    }
1208
1209    #[test]
1210    fn test_heading_source_index_for_slug() {
1211        let parsed = parse_markdown_with_options(
1212            "# Duplicate\n\nText\n\n## Duplicate\n\nMore text",
1213            false,
1214            true,
1215        );
1216        let first = parsed.heading_slugs.get("duplicate").copied();
1217        let second = parsed.heading_slugs.get("duplicate-1").copied();
1218        assert!(first.is_some());
1219        assert!(second.is_some());
1220        assert!(first.expect("first slug missing") < second.expect("second slug missing"));
1221    }
1222
1223    #[test]
1224    fn test_heading_slug_collision_with_dedup_suffix() {
1225        let parsed = parse_markdown_with_options("# Foo\n\n## Foo\n\n## Foo 1", false, true);
1226        assert_eq!(parsed.heading_slugs.len(), 3);
1227        assert!(parsed.heading_slugs.contains_key("foo"));
1228        assert!(parsed.heading_slugs.contains_key("foo-1"));
1229        assert!(parsed.heading_slugs.contains_key("foo-1-1"));
1230    }
1231}