parser.rs

   1use collections::{BTreeMap, HashMap, HashSet};
   2use gpui::SharedString;
   3use linkify::LinkFinder;
   4pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
   5use pulldown_cmark::{
   6    Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
   7};
   8use std::{ops::Range, sync::Arc};
   9use util::markdown::generate_heading_slug;
  10
  11use crate::{html, path_range::PathWithRange};
  12
  13pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
  14    .union(Options::ENABLE_FOOTNOTES)
  15    .union(Options::ENABLE_STRIKETHROUGH)
  16    .union(Options::ENABLE_TASKLISTS)
  17    .union(Options::ENABLE_SMART_PUNCTUATION)
  18    .union(Options::ENABLE_HEADING_ATTRIBUTES)
  19    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
  20    .union(Options::ENABLE_OLD_FOOTNOTES)
  21    .union(Options::ENABLE_GFM)
  22    .union(Options::ENABLE_SUPERSCRIPT)
  23    .union(Options::ENABLE_SUBSCRIPT);
  24
/// Mutable accumulator used while converting pulldown-cmark events into
/// [`MarkdownEvent`]s; see [`ParseState::push_event`].
#[derive(Default)]
struct ParseState {
    /// Every event emitted so far, paired with its byte range in the source.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of each root-level block, in the order the blocks were opened.
    /// `MarkdownEvent::RootEnd` carries an index into this list.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events that have not yet been matched by an `End`.
    depth: usize,
}
  31
/// The owned result of parsing a markdown document with
/// `parse_markdown_with_options`.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// All emitted events, each paired with its byte range in the source text.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that do not contain a `/`.
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contain a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of each root-level block; indexed by `MarkdownEvent::RootEnd`.
    pub root_block_starts: Vec<usize>,
    /// Successfully parsed HTML blocks keyed by the block's start offset.
    /// Only populated when HTML parsing was requested.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
    /// Heading slug mapped to a source offset inside that heading.
    /// Empty when heading slugs were not requested.
    pub heading_slugs: HashMap<SharedString, usize>,
    /// Footnote label mapped to the start offset of the first plain `Text`
    /// event inside its definition.
    pub footnote_definitions: HashMap<SharedString, usize>,
}
  43
  44impl ParseState {
  45    fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
  46        match &event {
  47            MarkdownEvent::Start(_) => {
  48                if self.depth == 0 {
  49                    self.root_block_starts.push(range.start);
  50                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  51                }
  52                self.depth += 1;
  53                self.events.push((range, event));
  54            }
  55            MarkdownEvent::End(_) => {
  56                self.events.push((range.clone(), event));
  57                if self.depth > 0 {
  58                    self.depth -= 1;
  59                    if self.depth == 0 {
  60                        let root_block_index = self.root_block_starts.len() - 1;
  61                        self.events
  62                            .push((range, MarkdownEvent::RootEnd(root_block_index)));
  63                    }
  64                }
  65            }
  66            MarkdownEvent::Rule => {
  67                if self.depth == 0 && !range.is_empty() {
  68                    self.root_block_starts.push(range.start);
  69                    let root_block_index = self.root_block_starts.len() - 1;
  70                    self.events.push((range.clone(), MarkdownEvent::RootStart));
  71                    self.events.push((range.clone(), event));
  72                    self.events
  73                        .push((range, MarkdownEvent::RootEnd(root_block_index)));
  74                } else {
  75                    self.events.push((range, event));
  76                }
  77            }
  78            _ => {
  79                self.events.push((range, event));
  80            }
  81        }
  82    }
  83}
  84
  85const MAX_DUPLICATE_HEADING_SLUGS: usize = 128;
  86
  87fn build_heading_slugs(
  88    source: &str,
  89    events: &[(Range<usize>, MarkdownEvent)],
  90) -> HashMap<SharedString, usize> {
  91    let mut slugs = HashMap::default();
  92    let mut slug_counts: HashMap<String, usize> = HashMap::default();
  93    let mut inside_heading = false;
  94    let mut heading_text = String::new();
  95    let mut heading_source_start: Option<usize> = None;
  96
  97    for (range, event) in events {
  98        match event {
  99            MarkdownEvent::Start(MarkdownTag::Heading { .. }) => {
 100                inside_heading = true;
 101                heading_text.clear();
 102                heading_source_start = None;
 103            }
 104            MarkdownEvent::End(MarkdownTagEnd::Heading(_)) => {
 105                if inside_heading {
 106                    let source_offset = heading_source_start.unwrap_or(range.start);
 107                    let base_slug = generate_heading_slug(&heading_text);
 108                    let count = slug_counts.entry(base_slug.clone()).or_insert(0);
 109                    let mut slug = if *count == 0 {
 110                        base_slug.clone()
 111                    } else {
 112                        format!("{base_slug}-{count}")
 113                    };
 114                    *count += 1;
 115                    while slugs.contains_key(slug.as_str()) {
 116                        let Some(count) = slug_counts.get_mut(&base_slug) else {
 117                            slug.clear();
 118                            break;
 119                        };
 120                        if *count >= MAX_DUPLICATE_HEADING_SLUGS {
 121                            slug.clear();
 122                            break;
 123                        }
 124                        slug = format!("{base_slug}-{count}");
 125                        *count += 1;
 126                    }
 127                    if !slug.is_empty() {
 128                        slugs.insert(SharedString::from(slug), source_offset);
 129                    }
 130                    inside_heading = false;
 131                }
 132            }
 133            MarkdownEvent::Text | MarkdownEvent::Code if inside_heading => {
 134                if heading_source_start.is_none() {
 135                    heading_source_start = Some(range.start);
 136                }
 137                heading_text.push_str(&source[range.clone()]);
 138            }
 139            MarkdownEvent::SubstitutedText(substituted) if inside_heading => {
 140                if heading_source_start.is_none() {
 141                    heading_source_start = Some(range.start);
 142                }
 143                heading_text.push_str(substituted);
 144            }
 145            _ => {}
 146        }
 147    }
 148
 149    slugs
 150}
 151
/// Parses `text` with pulldown-cmark (using [`PARSE_OPTIONS`]) and converts the
/// result into owned [`MarkdownEvent`]s plus the lookup tables stored in
/// [`ParsedMarkdownData`].
///
/// * `parse_html` - when `true`, each HTML block is handed to
///   `html::html_parser::parse_html_block`; if that succeeds the parsed block
///   is stored in `html_blocks` (keyed by its start offset) and the events
///   inside the block are skipped. Inline HTML that directly follows a text
///   event is also absorbed into the text run, contributing empty text.
/// * `parse_heading_slugs` - when `true`, `heading_slugs` is filled by
///   `build_heading_slugs`; otherwise it is left empty.
///
/// Metadata blocks are dropped entirely, math events are ignored, and bare
/// URLs found in text outside of links and code blocks are wrapped in
/// synthetic autolink `Link` events.
pub(crate) fn parse_markdown_with_options(
    text: &str,
    parse_html: bool,
    parse_heading_slugs: bool,
) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    let mut within_link = false;
    let mut within_code_block = false;
    let mut within_metadata = false;
    // Peekable so that consecutive text events can be merged below.
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Swallow everything up to and including the end of a metadata block.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // The block was parsed as a whole: skip its inner events and
                            // emit only the matching end tag.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    // Otherwise the block's inner events and its end tag are handled by
                    // the regular arms on later iterations.
                    continue;
                }

                // Convert the borrowed pulldown-cmark tag into the owned `MarkdownTag`.
                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        // No event is emitted for metadata blocks.
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        within_code_block = true;
                        // Indented blocks use the whole block range as content range and a
                        // fixed line count of 1.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        within_code_block = true;
                        // The helper returns a range relative to the block; shift it so it
                        // is relative to the whole document.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
                    within_code_block = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                /// Yields `Text` when `str` is exactly the source slice at `range`,
                /// and `SubstitutedText` carrying `str` otherwise.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }

                // Text inside code blocks is emitted as-is: no merging, no link detection.
                if within_code_block {
                    let (range, event) = event_for(text, range, &parsed);
                    state.push_event(range, event);
                    continue;
                }

                /// One text event of a merged run.
                #[derive(Debug)]
                struct TextRange<'a> {
                    /// Byte range of this piece in the markdown source.
                    source_range: Range<usize>,
                    /// Byte range of this piece within the merged text.
                    merged_range: Range<usize>,
                    /// Text of this piece as reported by the parser.
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Absorb directly following text events (and, when HTML parsing is on,
                // inline HTML events) into the same run.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    // Inline HTML keeps its source range but contributes no text.
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                // Concatenate the run so that links spanning several events can be found.
                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link && !within_code_block {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit every piece that ends at or before the link start unchanged.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        // No piece left to host the link.
                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // Emit the part of the current piece that precedes the link and
                        // trim it off the piece.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        let mut link_events = Vec::new();

                        // Collect every piece that lies entirely inside the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // The link may end in the middle of the next piece: take its head
                        // into the link and leave the tail for later.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        // Wrap the collected text events in a synthetic autolink.
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Emit whatever is left of the run after the last link.
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // The emitted range is the one returned by the helper, shifted to be
                // relative to the whole document.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(label) => state.push_event(
                range,
                MarkdownEvent::FootnoteReference(SharedString::from(label.to_string())),
            ),
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events produce no output.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    let heading_slugs = if parse_heading_slugs {
        build_heading_slugs(text, &state.events)
    } else {
        HashMap::default()
    };
    let footnote_definitions = build_footnote_definitions(&state.events);

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
        heading_slugs,
        footnote_definitions,
    }
}
 534
 535fn build_footnote_definitions(
 536    events: &[(Range<usize>, MarkdownEvent)],
 537) -> HashMap<SharedString, usize> {
 538    let mut definitions = HashMap::default();
 539    let mut current_label: Option<SharedString> = None;
 540
 541    for (range, event) in events {
 542        match event {
 543            MarkdownEvent::Start(MarkdownTag::FootnoteDefinition(label)) => {
 544                current_label = Some(label.clone());
 545            }
 546            MarkdownEvent::End(MarkdownTagEnd::FootnoteDefinition) => {
 547                current_label = None;
 548            }
 549            MarkdownEvent::Text if current_label.is_some() => {
 550                if let Some(label) = current_label.take() {
 551                    definitions.entry(label).or_insert(range.start);
 552                }
 553            }
 554            _ => {}
 555        }
 556    }
 557
 558    definitions
 559}
 560
 561pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
 562    let mut events = Vec::new();
 563    let mut finder = LinkFinder::new();
 564    finder.kinds(&[linkify::LinkKind::Url]);
 565    let mut text_range = Range {
 566        start: 0,
 567        end: text.len(),
 568    };
 569    for link in finder.links(text) {
 570        let link_range = link.start()..link.end();
 571
 572        if link_range.start > text_range.start {
 573            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
 574        }
 575
 576        events.push((
 577            link_range.clone(),
 578            MarkdownEvent::Start(MarkdownTag::Link {
 579                link_type: LinkType::Autolink,
 580                dest_url: SharedString::from(link.as_str().to_string()),
 581                title: SharedString::default(),
 582                id: SharedString::default(),
 583            }),
 584        ));
 585        events.push((link_range.clone(), MarkdownEvent::Text));
 586        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
 587
 588        text_range.start = link_range.end;
 589    }
 590
 591    if text_range.end > text_range.start {
 592        events.push((text_range, MarkdownEvent::Text));
 593    }
 594
 595    events
 596}
 597
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are stored alongside a byte range into the markdown source; variants
/// without a payload rely on that range to locate their content.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(String),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with the given label, which may or may not be defined
    /// by an event with a `MarkdownTag::FootnoteDefinition` tag. Definitions and references
    /// to them may occur in any order.
    FootnoteReference(SharedString),
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    RootStart,
    /// End of a root-level block. Contains the root block index, i.e. the position of the
    /// block's start offset in the parser's `root_block_starts` list.
    RootEnd(usize),
}
 636
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of pulldown-cmark's `Tag`: string payloads
/// are stored as `SharedString` instead of borrowing from the source.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The pulldown-cmark block quote kind is not retained.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A single table cell.
    TableCell,

    // span-level tags
    /// Converted from pulldown-cmark's `Tag::Emphasis`.
    Emphasis,
    /// Converted from pulldown-cmark's `Tag::Strong`.
    Strong,
    /// Converted from pulldown-cmark's `Tag::Strikethrough`.
    Strikethrough,
    /// Converted from pulldown-cmark's `Tag::Superscript`.
    Superscript,
    /// Converted from pulldown-cmark's `Tag::Subscript`.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block. Note that `parse_markdown_with_options` skips metadata
    /// blocks instead of emitting this tag.
    MetadataBlock(MetadataBlockKind),

    /// Converted from pulldown-cmark's `Tag::DefinitionList`.
    DefinitionList,
    /// Converted from pulldown-cmark's `Tag::DefinitionListTitle`.
    DefinitionListTitle,
    /// Converted from pulldown-cmark's `Tag::DefinitionListDefinition`.
    DefinitionListDefinition,
}
 721
/// How a code block was written in the source and, for fenced blocks, what the fence named.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// An indented code block; the tests produce this for a line indented by four spaces.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// NOTE(review): this variant itself presumably means a fence with neither a language nor a
    /// path after the backticks — confirm against the parser.
    Fenced,
    /// A fenced block whose fence names a language, e.g. `rust`.
    FencedLang(SharedString),
    /// A fenced block whose fence names a source path with a range (see the note on `Fenced`).
    FencedSrc(PathWithRange),
}
 733
/// Position details recorded for a code block alongside its `CodeBlockKind`.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the parsed markdown. In this file's tests it
    /// equals the range of the block's `Text` event (`8..34` for the fenced example, `4..16`
    /// for the indented one).
    pub content_range: Range<usize>,
    /// Number of content lines: 3 for the three-line fenced body in the tests, 1 for the
    /// single indented line.
    pub line_count: usize,
}
 739
 740fn extract_code_content_range(text: &str) -> Range<usize> {
 741    let text_len = text.len();
 742    if text_len == 0 {
 743        return 0..0;
 744    }
 745
 746    let start_ticks = text.chars().take_while(|&c| c == '`').count();
 747
 748    if start_ticks == 0 || start_ticks > text_len {
 749        return 0..text_len;
 750    }
 751
 752    let end_ticks = text.chars().rev().take_while(|&c| c == '`').count();
 753
 754    if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
 755        return 0..text_len;
 756    }
 757
 758    start_ticks..text_len - end_ticks
 759}
 760
 761pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
 762    let mut range = 0..text.len();
 763    if text.starts_with("```") {
 764        range.start += 3;
 765
 766        if let Some(newline_ix) = text[range.clone()].find('\n') {
 767            range.start += newline_ix + 1;
 768        }
 769    }
 770
 771    if !range.is_empty() && text.ends_with("```") {
 772        range.end -= 3;
 773    }
 774    if range.start > range.end {
 775        range.end = range.start;
 776    }
 777    range
 778}
 779
 780#[cfg(test)]
 781mod tests {
 782    use super::MarkdownEvent::*;
 783    use super::MarkdownTag::*;
 784    use super::*;
 785
    // Parser options that `PARSE_OPTIONS` leaves out. The two option tests below check that
    // this set and `PARSE_OPTIONS` together cover `Options::all()` and have no flag in common.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST)
        .union(Options::ENABLE_WIKILINKS);
 790
 791    #[test]
 792    fn all_options_considered() {
 793        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
 794        // can be evaluated for inclusion.
 795        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
 796    }
 797
 798    #[test]
 799    fn wanted_and_unwanted_options_disjoint() {
 800        assert_eq!(
 801            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
 802            Options::empty()
 803        );
 804    }
 805
 806    #[test]
 807    fn test_html_comments() {
 808        assert_eq!(
 809            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false, false),
 810            ParsedMarkdownData {
 811                events: vec![
 812                    (2..30, RootStart),
 813                    (2..30, Start(HtmlBlock)),
 814                    (2..2, SubstitutedText("  ".into())),
 815                    (2..7, Html),
 816                    (7..26, Html),
 817                    (26..30, Html),
 818                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
 819                    (2..30, RootEnd(0)),
 820                    (30..37, RootStart),
 821                    (30..37, Start(Paragraph)),
 822                    (30..37, Text),
 823                    (30..37, End(MarkdownTagEnd::Paragraph)),
 824                    (30..37, RootEnd(1)),
 825                ],
 826                root_block_starts: vec![2, 30],
 827                ..Default::default()
 828            }
 829        )
 830    }
 831
 832    #[test]
 833    fn test_plain_urls_and_escaped_text() {
 834        assert_eq!(
 835            parse_markdown_with_options(
 836                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
 837                false,
 838                false,
 839            ),
 840            ParsedMarkdownData {
 841                events: vec![
 842                    (0..51, RootStart),
 843                    (0..51, Start(Paragraph)),
 844                    (0..6, SubstitutedText("\u{a0}".into())),
 845                    (6..12, SubstitutedText("\u{a0}".into())),
 846                    (12..13, Text),
 847                    (
 848                        13..29,
 849                        Start(Link {
 850                            link_type: LinkType::Autolink,
 851                            dest_url: "https://some.url".into(),
 852                            title: "".into(),
 853                            id: "".into(),
 854                        })
 855                    ),
 856                    (13..29, Text),
 857                    (13..29, End(MarkdownTagEnd::Link)),
 858                    (29..35, Text),
 859                    (36..37, Text), // Escaped backtick
 860                    (37..44, SubstitutedText("".into())),
 861                    (45..46, Text), // Escaped backtick
 862                    (46..51, Text),
 863                    (0..51, End(MarkdownTagEnd::Paragraph)),
 864                    (0..51, RootEnd(0)),
 865                ],
 866                root_block_starts: vec![0],
 867                ..Default::default()
 868            }
 869        );
 870    }
 871
 872    #[test]
 873    fn test_incomplete_link() {
 874        assert_eq!(
 875            parse_markdown_with_options(
 876                "You can use the [GitHub Search API](https://docs.github.com/en",
 877                false,
 878                false,
 879            )
 880            .events,
 881            vec![
 882                (0..62, RootStart),
 883                (0..62, Start(Paragraph)),
 884                (0..16, Text),
 885                (16..17, Text),
 886                (17..34, Text),
 887                (34..35, Text),
 888                (35..36, Text),
 889                (
 890                    36..62,
 891                    Start(Link {
 892                        link_type: LinkType::Autolink,
 893                        dest_url: "https://docs.github.com/en".into(),
 894                        title: "".into(),
 895                        id: "".into()
 896                    })
 897                ),
 898                (36..62, Text),
 899                (36..62, End(MarkdownTagEnd::Link)),
 900                (0..62, End(MarkdownTagEnd::Paragraph)),
 901                (0..62, RootEnd(0)),
 902            ],
 903        );
 904    }
 905
 906    #[test]
 907    fn test_smart_punctuation() {
 908        assert_eq!(
 909            parse_markdown_with_options(
 910                "-- --- ... \"double quoted\" 'single quoted' ----------",
 911                false,
 912                false,
 913            ),
 914            ParsedMarkdownData {
 915                events: vec![
 916                    (0..53, RootStart),
 917                    (0..53, Start(Paragraph)),
 918                    (0..2, SubstitutedText("".into())),
 919                    (2..3, Text),
 920                    (3..6, SubstitutedText("".into())),
 921                    (6..7, Text),
 922                    (7..10, SubstitutedText("".into())),
 923                    (10..11, Text),
 924                    (11..12, SubstitutedText("\u{201c}".into())),
 925                    (12..25, Text),
 926                    (25..26, SubstitutedText("\u{201d}".into())),
 927                    (26..27, Text),
 928                    (27..28, SubstitutedText("\u{2018}".into())),
 929                    (28..41, Text),
 930                    (41..42, SubstitutedText("\u{2019}".into())),
 931                    (42..43, Text),
 932                    (43..53, SubstitutedText("–––––".into())),
 933                    (0..53, End(MarkdownTagEnd::Paragraph)),
 934                    (0..53, RootEnd(0)),
 935                ],
 936                root_block_starts: vec![0],
 937                ..Default::default()
 938            }
 939        )
 940    }
 941
 942    #[test]
 943    fn test_code_block_metadata() {
 944        assert_eq!(
 945            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false, false),
 946            ParsedMarkdownData {
 947                events: vec![
 948                    (0..37, RootStart),
 949                    (
 950                        0..37,
 951                        Start(CodeBlock {
 952                            kind: CodeBlockKind::FencedLang("rust".into()),
 953                            metadata: CodeBlockMetadata {
 954                                content_range: 8..34,
 955                                line_count: 3
 956                            }
 957                        })
 958                    ),
 959                    (8..34, Text),
 960                    (0..37, End(MarkdownTagEnd::CodeBlock)),
 961                    (0..37, RootEnd(0)),
 962                ],
 963                language_names: {
 964                    let mut h = HashSet::default();
 965                    h.insert("rust".into());
 966                    h
 967                },
 968                root_block_starts: vec![0],
 969                ..Default::default()
 970            }
 971        );
 972        assert_eq!(
 973            parse_markdown_with_options("    fn main() {}", false, false),
 974            ParsedMarkdownData {
 975                events: vec![
 976                    (4..16, RootStart),
 977                    (
 978                        4..16,
 979                        Start(CodeBlock {
 980                            kind: CodeBlockKind::Indented,
 981                            metadata: CodeBlockMetadata {
 982                                content_range: 4..16,
 983                                line_count: 1
 984                            }
 985                        })
 986                    ),
 987                    (4..16, Text),
 988                    (4..16, End(MarkdownTagEnd::CodeBlock)),
 989                    (4..16, RootEnd(0)),
 990                ],
 991                root_block_starts: vec![4],
 992                ..Default::default()
 993            }
 994        );
 995    }
 996
 997    fn assert_code_block_does_not_emit_links(markdown: &str) {
 998        let parsed = parse_markdown_with_options(markdown, false, false);
 999        let mut code_block_depth = 0;
1000        let mut code_block_count = 0;
1001        let mut saw_text_inside_code_block = false;
1002
1003        for (_, event) in &parsed.events {
1004            match event {
1005                Start(CodeBlock { .. }) => {
1006                    code_block_depth += 1;
1007                    code_block_count += 1;
1008                }
1009                End(MarkdownTagEnd::CodeBlock) => {
1010                    assert!(
1011                        code_block_depth > 0,
1012                        "encountered a code block end without a matching start"
1013                    );
1014                    code_block_depth -= 1;
1015                }
1016                Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
1017                    assert_eq!(
1018                        code_block_depth, 0,
1019                        "code blocks should not emit link events"
1020                    );
1021                }
1022                Text | SubstitutedText(_) if code_block_depth > 0 => {
1023                    saw_text_inside_code_block = true;
1024                }
1025                _ => {}
1026            }
1027        }
1028
1029        assert_eq!(code_block_count, 1, "expected exactly one code block");
1030        assert_eq!(code_block_depth, 0, "unterminated code block");
1031        assert!(
1032            saw_text_inside_code_block,
1033            "expected text inside the code block"
1034        );
1035    }
1036
1037    #[test]
1038    fn test_code_blocks_do_not_autolink_urls() {
1039        assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
1040        assert_code_block_does_not_emit_links("    https://example.com");
1041        assert_code_block_does_not_emit_links(
1042            "```txt\r\nhttps:/\\/example.com\r\nhttps://example&#46;com\r\n```",
1043        );
1044        assert_code_block_does_not_emit_links(
1045            "    https:/\\/example.com\r\n    https://example&#46;com",
1046        );
1047    }
1048
1049    #[test]
1050    fn test_metadata_blocks_do_not_affect_root_blocks() {
1051        assert_eq!(
1052            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false, false),
1053            ParsedMarkdownData {
1054                events: vec![
1055                    (27..36, RootStart),
1056                    (27..36, Start(Paragraph)),
1057                    (27..36, Text),
1058                    (27..36, End(MarkdownTagEnd::Paragraph)),
1059                    (27..36, RootEnd(0)),
1060                ],
1061                root_block_starts: vec![27],
1062                ..Default::default()
1063            }
1064        );
1065    }
1066
1067    #[test]
1068    fn test_table_checkboxes_remain_text_in_cells() {
1069        let markdown = "\
1070| Done | Task    |
1071|------|---------|
1072| [x]  | Fix bug |
1073| [ ]  | Add feature |";
1074        let parsed = parse_markdown_with_options(markdown, false, false);
1075
1076        let mut in_table = false;
1077        let mut saw_task_list_marker = false;
1078        let mut cell_texts = Vec::new();
1079        let mut current_cell = String::new();
1080
1081        for (range, event) in &parsed.events {
1082            match event {
1083                Start(Table(_)) => in_table = true,
1084                End(MarkdownTagEnd::Table) => in_table = false,
1085                Start(TableCell) => current_cell.clear(),
1086                End(MarkdownTagEnd::TableCell) => {
1087                    if in_table {
1088                        cell_texts.push(current_cell.clone());
1089                    }
1090                }
1091                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
1092                TaskListMarker(_) if in_table => saw_task_list_marker = true,
1093                _ => {}
1094            }
1095        }
1096
1097        let checkbox_cells: Vec<&str> = cell_texts
1098            .iter()
1099            .map(|cell| cell.trim())
1100            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
1101            .collect();
1102
1103        assert!(
1104            !saw_task_list_marker,
1105            "Table checkboxes should remain text, not task-list markers"
1106        );
1107        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
1108    }
1109
1110    #[test]
1111    fn test_extract_code_content_range() {
1112        let input = "```let x = 5;```";
1113        assert_eq!(extract_code_content_range(input), 3..13);
1114
1115        let input = "``let x = 5;``";
1116        assert_eq!(extract_code_content_range(input), 2..12);
1117
1118        let input = "`let x = 5;`";
1119        assert_eq!(extract_code_content_range(input), 1..11);
1120
1121        let input = "plain text";
1122        assert_eq!(extract_code_content_range(input), 0..10);
1123
1124        let input = "``let x = 5;`";
1125        assert_eq!(extract_code_content_range(input), 0..13);
1126    }
1127
1128    #[test]
1129    fn test_extract_code_block_content_range() {
1130        let input = "```rust\nlet x = 5;\n```";
1131        assert_eq!(extract_code_block_content_range(input), 8..19);
1132
1133        let input = "plain text";
1134        assert_eq!(extract_code_block_content_range(input), 0..10);
1135
1136        let input = "```python\nprint('hello')\nprint('world')\n```";
1137        assert_eq!(extract_code_block_content_range(input), 10..40);
1138
1139        // Malformed input
1140        let input = "`````";
1141        assert_eq!(extract_code_block_content_range(input), 3..3);
1142    }
1143
1144    #[test]
1145    fn test_footnotes() {
1146        let parsed = parse_markdown_with_options(
1147            "Text with a footnote[^1] and some more text.\n\n[^1]: This is the footnote content.",
1148            false,
1149            false,
1150        );
1151        assert_eq!(
1152            parsed.events,
1153            vec![
1154                (0..45, RootStart),
1155                (0..45, Start(Paragraph)),
1156                (0..20, Text),
1157                (20..24, FootnoteReference("1".into())),
1158                (24..44, Text),
1159                (0..45, End(MarkdownTagEnd::Paragraph)),
1160                (0..45, RootEnd(0)),
1161                (46..81, RootStart),
1162                (46..81, Start(FootnoteDefinition("1".into()))),
1163                (52..81, Start(Paragraph)),
1164                (52..81, Text),
1165                (52..81, End(MarkdownTagEnd::Paragraph)),
1166                (46..81, End(MarkdownTagEnd::FootnoteDefinition)),
1167                (46..81, RootEnd(1)),
1168            ]
1169        );
1170        assert_eq!(parsed.footnote_definitions.len(), 1);
1171        assert_eq!(parsed.footnote_definitions.get("1").copied(), Some(52));
1172    }
1173
1174    #[test]
1175    fn test_footnote_definitions_multiple() {
1176        let parsed = parse_markdown_with_options(
1177            "Text[^a] and[^b].\n\n[^a]: First.\n\n[^b]: Second.",
1178            false,
1179            false,
1180        );
1181        assert_eq!(parsed.footnote_definitions.len(), 2);
1182        assert!(parsed.footnote_definitions.contains_key("a"));
1183        assert!(parsed.footnote_definitions.contains_key("b"));
1184    }
1185
1186    #[test]
1187    fn test_links_split_across_fragments() {
1188        // This test verifies that links split across multiple text fragments due to escaping or other issues
1189        // are correctly detected and processed
1190        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
1191        // We're verifying our parser can handle this correctly
1192        assert_eq!(
1193            parse_markdown_with_options(
1194                "https:/\\/example.com is equivalent to https://example&#46;com!",
1195                false,
1196                false,
1197            )
1198            .events,
1199            vec![
1200                (0..62, RootStart),
1201                (0..62, Start(Paragraph)),
1202                (
1203                    0..20,
1204                    Start(Link {
1205                        link_type: LinkType::Autolink,
1206                        dest_url: "https://example.com".into(),
1207                        title: "".into(),
1208                        id: "".into()
1209                    })
1210                ),
1211                (0..7, Text),
1212                (8..20, Text),
1213                (0..20, End(MarkdownTagEnd::Link)),
1214                (20..38, Text),
1215                (
1216                    38..61,
1217                    Start(Link {
1218                        link_type: LinkType::Autolink,
1219                        dest_url: "https://example.com".into(),
1220                        title: "".into(),
1221                        id: "".into()
1222                    })
1223                ),
1224                (38..53, Text),
1225                (53..58, SubstitutedText(".".into())),
1226                (58..61, Text),
1227                (38..61, End(MarkdownTagEnd::Link)),
1228                (61..62, Text),
1229                (0..62, End(MarkdownTagEnd::Paragraph)),
1230                (0..62, RootEnd(0)),
1231            ],
1232        );
1233
1234        assert_eq!(
1235            parse_markdown_with_options(
1236                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
1237                false,
1238                false,
1239            )
1240            .events,
1241            [
1242                (0..55, RootStart),
1243                (0..55, Start(Paragraph)),
1244                (0..6, Text),
1245                (
1246                    6..43,
1247                    Start(Link {
1248                        link_type: LinkType::Autolink,
1249                        dest_url: "https://example.com/cat/é\u{200d}".into(),
1250                        title: "".into(),
1251                        id: "".into()
1252                    })
1253                ),
1254                (6..29, Text),
1255                (30..33, Text),
1256                (33..40, SubstitutedText("\u{200d}".into())),
1257                (40..43, Text),
1258                (6..43, End(MarkdownTagEnd::Link)),
1259                (43..55, Text),
1260                (0..55, End(MarkdownTagEnd::Paragraph)),
1261                (0..55, RootEnd(0)),
1262            ]
1263        );
1264    }
1265
1266    #[test]
1267    fn test_heading_slugs() {
1268        let parsed = parse_markdown_with_options(
1269            "# Hello World\n\n## Code `block`\n\n### Third Level\n\n#### Fourth Level\n\n## Hello World",
1270            false,
1271            true,
1272        );
1273        assert_eq!(parsed.heading_slugs.len(), 5);
1274        assert!(parsed.heading_slugs.contains_key("hello-world"));
1275        assert!(parsed.heading_slugs.contains_key("code-block"));
1276        assert!(parsed.heading_slugs.contains_key("third-level"));
1277        assert!(parsed.heading_slugs.contains_key("fourth-level"));
1278        assert!(parsed.heading_slugs.contains_key("hello-world-1"));
1279    }
1280
1281    #[test]
1282    fn test_heading_source_index_for_slug() {
1283        let parsed = parse_markdown_with_options(
1284            "# Duplicate\n\nText\n\n## Duplicate\n\nMore text",
1285            false,
1286            true,
1287        );
1288        let first = parsed.heading_slugs.get("duplicate").copied();
1289        let second = parsed.heading_slugs.get("duplicate-1").copied();
1290        assert!(first.is_some());
1291        assert!(second.is_some());
1292        assert!(first.expect("first slug missing") < second.expect("second slug missing"));
1293    }
1294
1295    #[test]
1296    fn test_heading_slug_collision_with_dedup_suffix() {
1297        let parsed = parse_markdown_with_options("# Foo\n\n## Foo\n\n## Foo 1", false, true);
1298        assert_eq!(parsed.heading_slugs.len(), 3);
1299        assert!(parsed.heading_slugs.contains_key("foo"));
1300        assert!(parsed.heading_slugs.contains_key("foo-1"));
1301        assert!(parsed.heading_slugs.contains_key("foo-1-1"));
1302    }
1303}