1use collections::{BTreeMap, HashMap, HashSet};
2use gpui::SharedString;
3use linkify::LinkFinder;
4pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
5use pulldown_cmark::{
6 Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
7};
8use std::{ops::Range, sync::Arc};
9use util::markdown::generate_heading_slug;
10
11use crate::{html, path_range::PathWithRange};
12
/// The pulldown-cmark extensions enabled for every parse performed by
/// `parse_markdown_with_options`.
///
/// The test module pairs this set with a list of deliberately disabled options and asserts
/// that the two are disjoint and together cover `Options::all()`, so adding a new
/// pulldown-cmark option forces a decision about whether it belongs here.
pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
24
/// Accumulator used while converting pulldown-cmark events into `MarkdownEvent`s.
///
/// All events go through `push_event`, which tracks nesting depth so that root-level
/// blocks can be wrapped in `RootStart` / `RootEnd` markers.
#[derive(Default)]
struct ParseState {
    /// Events emitted so far, each paired with a byte range into the source text.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of every root-level block, in the order the blocks were opened.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events pushed that have not yet been matched by an `End`.
    depth: usize,
}
31
/// The result of `parse_markdown_with_options`: the event stream plus lookup tables
/// derived while parsing.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// The parsed events, each paired with a byte range into the source text.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that do not contain a `/`.
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contain a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of each root-level block; `MarkdownEvent::RootEnd` carries an index
    /// into this list.
    pub root_block_starts: Vec<usize>,
    /// Successfully parsed HTML blocks keyed by the block's start offset. Only filled
    /// when HTML parsing was requested.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
    /// Heading slug to source offset. Empty unless heading slugs were requested.
    pub heading_slugs: HashMap<SharedString, usize>,
    /// Footnote label to the start offset of the first `Text` event inside its definition.
    pub footnote_definitions: HashMap<SharedString, usize>,
}
43
44impl ParseState {
45 fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
46 match &event {
47 MarkdownEvent::Start(_) => {
48 if self.depth == 0 {
49 self.root_block_starts.push(range.start);
50 self.events.push((range.clone(), MarkdownEvent::RootStart));
51 }
52 self.depth += 1;
53 self.events.push((range, event));
54 }
55 MarkdownEvent::End(_) => {
56 self.events.push((range.clone(), event));
57 if self.depth > 0 {
58 self.depth -= 1;
59 if self.depth == 0 {
60 let root_block_index = self.root_block_starts.len() - 1;
61 self.events
62 .push((range, MarkdownEvent::RootEnd(root_block_index)));
63 }
64 }
65 }
66 MarkdownEvent::Rule => {
67 if self.depth == 0 && !range.is_empty() {
68 self.root_block_starts.push(range.start);
69 let root_block_index = self.root_block_starts.len() - 1;
70 self.events.push((range.clone(), MarkdownEvent::RootStart));
71 self.events.push((range.clone(), event));
72 self.events
73 .push((range, MarkdownEvent::RootEnd(root_block_index)));
74 } else {
75 self.events.push((range, event));
76 }
77 }
78 _ => {
79 self.events.push((range, event));
80 }
81 }
82 }
83}
84
/// Limit on the numeric suffixes `build_heading_slugs` will try when a heading's slug
/// collides with one already registered: once the per-slug duplicate counter reaches this
/// value, the colliding heading is left without a slug.
const MAX_DUPLICATE_HEADING_SLUGS: usize = 128;
86
87fn build_heading_slugs(
88 source: &str,
89 events: &[(Range<usize>, MarkdownEvent)],
90) -> HashMap<SharedString, usize> {
91 let mut slugs = HashMap::default();
92 let mut slug_counts: HashMap<String, usize> = HashMap::default();
93 let mut inside_heading = false;
94 let mut heading_text = String::new();
95 let mut heading_source_start: Option<usize> = None;
96
97 for (range, event) in events {
98 match event {
99 MarkdownEvent::Start(MarkdownTag::Heading { .. }) => {
100 inside_heading = true;
101 heading_text.clear();
102 heading_source_start = None;
103 }
104 MarkdownEvent::End(MarkdownTagEnd::Heading(_)) => {
105 if inside_heading {
106 let source_offset = heading_source_start.unwrap_or(range.start);
107 let base_slug = generate_heading_slug(&heading_text);
108 let count = slug_counts.entry(base_slug.clone()).or_insert(0);
109 let mut slug = if *count == 0 {
110 base_slug.clone()
111 } else {
112 format!("{base_slug}-{count}")
113 };
114 *count += 1;
115 while slugs.contains_key(slug.as_str()) {
116 let Some(count) = slug_counts.get_mut(&base_slug) else {
117 slug.clear();
118 break;
119 };
120 if *count >= MAX_DUPLICATE_HEADING_SLUGS {
121 slug.clear();
122 break;
123 }
124 slug = format!("{base_slug}-{count}");
125 *count += 1;
126 }
127 if !slug.is_empty() {
128 slugs.insert(SharedString::from(slug), source_offset);
129 }
130 inside_heading = false;
131 }
132 }
133 MarkdownEvent::Text | MarkdownEvent::Code if inside_heading => {
134 if heading_source_start.is_none() {
135 heading_source_start = Some(range.start);
136 }
137 heading_text.push_str(&source[range.clone()]);
138 }
139 MarkdownEvent::SubstitutedText(substituted) if inside_heading => {
140 if heading_source_start.is_none() {
141 heading_source_start = Some(range.start);
142 }
143 heading_text.push_str(substituted);
144 }
145 _ => {}
146 }
147 }
148
149 slugs
150}
151
/// Parses `text` with pulldown-cmark (using `PARSE_OPTIONS`) and converts the result into
/// a `ParsedMarkdownData`.
///
/// Beyond a one-to-one translation of pulldown-cmark events, this function:
/// - drops metadata blocks entirely;
/// - records the language names and paths named by fenced code block info strings;
/// - merges runs of adjacent text events and wraps plain URLs found in them in autolink
///   `Link` events (not inside links or code blocks);
/// - narrows inline code ranges using `extract_code_content_range`;
/// - ignores math events.
///
/// When `parse_html` is true, each HTML block is passed to
/// `html::html_parser::parse_html_block`; on success the result is stored in
/// `html_blocks` and the block's inner events are skipped. Inline HTML that directly
/// follows text is then folded into the text run, contributing no characters to it.
///
/// When `parse_heading_slugs` is true, `heading_slugs` is filled by `build_heading_slugs`;
/// otherwise it is left empty.
pub(crate) fn parse_markdown_with_options(
    text: &str,
    parse_html: bool,
    parse_heading_slugs: bool,
) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    let mut within_link = false;
    let mut within_code_block = false;
    let mut within_metadata = false;
    // Peekable so that runs of adjacent text events can be merged below.
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Everything inside a metadata block, including its end tag, is discarded.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // The block was parsed as a whole, so skip its inner events and
                            // emit only the matching end tag.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    // Otherwise the inner events and the end tag flow through the main loop.
                    continue;
                }

                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        // Text inside an explicit link must not be autolinked again.
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        // No start event is emitted; the rest of the block is skipped above.
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        within_code_block = true;
                        // Indented blocks report the whole block range as content and a
                        // fixed line count of 1.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        within_code_block = true;
                        // The helper returns a range relative to the block; shift it so it
                        // indexes into `text`.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                        // Languages should never contain a slash, and PathRanges always should.
                        // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        // Convert the borrowed heading attributes into owned strings.
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
                    within_code_block = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                // Produces `Text` when the parsed string equals the source slice, and
                // `SubstitutedText` carrying the parsed string otherwise.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }

                // Code block text is emitted as is: no merging and no link detection.
                if within_code_block {
                    let (range, event) = event_for(text, range, &parsed);
                    state.push_event(range, event);
                    continue;
                }

                // One piece of a merged text run: where it sits in the source, where it
                // sits in the merged string, and its parsed content.
                #[derive(Debug)]
                struct TextRange<'a> {
                    source_range: Range<usize>,
                    merged_range: Range<usize>,
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Absorb the following text events (and, when parsing HTML, inline HTML
                // events) so that a URL split across several events can still be found.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        // Inline HTML keeps its source range but adds nothing to the merged text.
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                // `within_code_block` is always false here because of the early `continue`
                // above.
                if !within_link && !within_code_block {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit the pieces that end before the link starts.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // If the link starts partway through this piece, emit the part before
                        // it and keep the remainder for the link.
                        // NOTE(review): the same byte count is applied to the parsed text and
                        // to the source range, which assumes the two agree up to this point -
                        // confirm for pieces whose parsed text differs from the source.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        let mut link_events = Vec::new();

                        // Collect the pieces that lie entirely within the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // If the link ends partway through the next piece, take its leading
                        // part into the link and leave the rest for later.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        // Wrap the collected text events in a synthetic autolink.
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Emit whatever remains after the last link (or everything, if link
                // detection was skipped).
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Narrow the range to the code content, then shift it back into `text`
                // coordinates.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(label) => state.push_event(
                range,
                MarkdownEvent::FootnoteReference(SharedString::from(label.to_string())),
            ),
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events are ignored; no math option is part of `PARSE_OPTIONS`.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    let heading_slugs = if parse_heading_slugs {
        build_heading_slugs(text, &state.events)
    } else {
        HashMap::default()
    };
    let footnote_definitions = build_footnote_definitions(&state.events);

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
        heading_slugs,
        footnote_definitions,
    }
}
534
535fn build_footnote_definitions(
536 events: &[(Range<usize>, MarkdownEvent)],
537) -> HashMap<SharedString, usize> {
538 let mut definitions = HashMap::default();
539 let mut current_label: Option<SharedString> = None;
540
541 for (range, event) in events {
542 match event {
543 MarkdownEvent::Start(MarkdownTag::FootnoteDefinition(label)) => {
544 current_label = Some(label.clone());
545 }
546 MarkdownEvent::End(MarkdownTagEnd::FootnoteDefinition) => {
547 current_label = None;
548 }
549 MarkdownEvent::Text if current_label.is_some() => {
550 if let Some(label) = current_label.take() {
551 definitions.entry(label).or_insert(range.start);
552 }
553 }
554 _ => {}
555 }
556 }
557
558 definitions
559}
560
561pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
562 let mut events = Vec::new();
563 let mut finder = LinkFinder::new();
564 finder.kinds(&[linkify::LinkKind::Url]);
565 let mut text_range = Range {
566 start: 0,
567 end: text.len(),
568 };
569 for link in finder.links(text) {
570 let link_range = link.start()..link.end();
571
572 if link_range.start > text_range.start {
573 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
574 }
575
576 events.push((
577 link_range.clone(),
578 MarkdownEvent::Start(MarkdownTag::Link {
579 link_type: LinkType::Autolink,
580 dest_url: SharedString::from(link.as_str().to_string()),
581 title: SharedString::default(),
582 id: SharedString::default(),
583 }),
584 ));
585 events.push((link_range.clone(), MarkdownEvent::Text));
586 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
587
588 text_range.start = link_range.end;
589 }
590
591 if text_range.end > text_range.start {
592 events.push((text_range, MarkdownEvent::Text));
593 }
594
595 events
596}
597
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are stored alongside a byte range into the markdown source; variants without a
/// payload rely on that range to locate their content.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. Carries the text to display instead of the source range.
    SubstitutedText(String),
    /// An inline code node. When produced by the markdown parser in this file, the
    /// associated range covers the code content without its backtick delimiters.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference(SharedString),
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    /// Emitted immediately before the block's own first event.
    RootStart,
    /// End of a root-level block. Contains the root block index, i.e. the position of the
    /// block's start offset in the parser's list of root block starts.
    RootEnd(usize),
}
636
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Used for both header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell within a table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,
    /// Superscript text.
    Superscript,
    /// Subscript text.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    /// NOTE(review): the markdown parser in this file skips metadata blocks instead of
    /// emitting this tag - confirm whether anything else constructs it.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// The definition belonging to a definition list title.
    DefinitionListDefinition,
}
721
/// How a code block was written in the markdown source, along with what its info string
/// named, if anything.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block created by indentation rather than fences.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is the fenced block whose info string is empty.
    Fenced,
    /// A fenced block whose (trimmed) info string contains no `/`; holds that string as
    /// the language name.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/`; holds the path (and optional range)
    /// parsed from it.
    FencedSrc(PathWithRange),
}
733
/// Position and size information attached to a code block's start tag.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the markdown source. For fenced blocks the
    /// parser derives this with `extract_code_block_content_range`; for indented blocks it
    /// is the range of the whole block.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`; the parser
    /// always reports 1 for indented blocks.
    pub line_count: usize,
}
739
/// Returns the byte range of the content of an inline code span, i.e. `text` without its
/// surrounding backtick delimiters.
///
/// The delimiters are stripped only when `text` starts and ends with backtick runs of the
/// same length that do not overlap. In every other case (no leading backticks, unequal
/// runs, or a string made only of backticks) the whole of `text` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    let whole = 0..text.len();
    let bytes = text.as_bytes();

    // A backtick is a single ASCII byte, so counting bytes matches counting characters.
    let opening = bytes.iter().take_while(|&&byte| byte == b'`').count();
    if opening == 0 {
        // Also covers the empty string, for which `whole` is `0..0`.
        return whole;
    }

    let closing = bytes.iter().rev().take_while(|&&byte| byte == b'`').count();
    let delimiters_match = closing == opening && opening + closing <= text.len();

    if delimiters_match {
        opening..text.len() - closing
    } else {
        whole
    }
}
760
/// Returns the byte range of the content of a fenced code block, relative to `text`.
///
/// If `text` starts with three backticks, the content starts after the first newline that
/// follows them, or directly after the backticks when there is no newline. If any content
/// remains and `text` ends with three backticks, those are excluded as well. The returned
/// range is never inverted: when the fences overlap, an empty range at the content start
/// is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const FENCE: &str = "```";

    let start = match text.strip_prefix(FENCE) {
        Some(after_fence) => match after_fence.find('\n') {
            // Skip the rest of the opening line, including its newline.
            Some(newline_ix) => FENCE.len() + newline_ix + 1,
            None => FENCE.len(),
        },
        None => 0,
    };

    let has_content = start < text.len();
    let end = if has_content && text.ends_with(FENCE) {
        text.len() - FENCE.len()
    } else {
        text.len()
    };

    // The closing fence may overlap the opening one (e.g. "`````"); clamp to an empty range.
    start..end.max(start)
}
779
780#[cfg(test)]
781mod tests {
782 use super::MarkdownEvent::*;
783 use super::MarkdownTag::*;
784 use super::*;
785
786 const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
787 .union(Options::ENABLE_MATH)
788 .union(Options::ENABLE_DEFINITION_LIST)
789 .union(Options::ENABLE_WIKILINKS);
790
791 #[test]
792 fn all_options_considered() {
793 // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
794 // can be evaluated for inclusion.
795 assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
796 }
797
798 #[test]
799 fn wanted_and_unwanted_options_disjoint() {
800 assert_eq!(
801 PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
802 Options::empty()
803 );
804 }
805
806 #[test]
807 fn test_html_comments() {
808 assert_eq!(
809 parse_markdown_with_options(" <!--\nrdoc-file=string.c\n-->\nReturns", false, false),
810 ParsedMarkdownData {
811 events: vec![
812 (2..30, RootStart),
813 (2..30, Start(HtmlBlock)),
814 (2..2, SubstitutedText(" ".into())),
815 (2..7, Html),
816 (7..26, Html),
817 (26..30, Html),
818 (2..30, End(MarkdownTagEnd::HtmlBlock)),
819 (2..30, RootEnd(0)),
820 (30..37, RootStart),
821 (30..37, Start(Paragraph)),
822 (30..37, Text),
823 (30..37, End(MarkdownTagEnd::Paragraph)),
824 (30..37, RootEnd(1)),
825 ],
826 root_block_starts: vec![2, 30],
827 ..Default::default()
828 }
829 )
830 }
831
832 #[test]
833 fn test_plain_urls_and_escaped_text() {
834 assert_eq!(
835 parse_markdown_with_options(
836 " https://some.url some \\`►\\` text",
837 false,
838 false,
839 ),
840 ParsedMarkdownData {
841 events: vec![
842 (0..51, RootStart),
843 (0..51, Start(Paragraph)),
844 (0..6, SubstitutedText("\u{a0}".into())),
845 (6..12, SubstitutedText("\u{a0}".into())),
846 (12..13, Text),
847 (
848 13..29,
849 Start(Link {
850 link_type: LinkType::Autolink,
851 dest_url: "https://some.url".into(),
852 title: "".into(),
853 id: "".into(),
854 })
855 ),
856 (13..29, Text),
857 (13..29, End(MarkdownTagEnd::Link)),
858 (29..35, Text),
859 (36..37, Text), // Escaped backtick
860 (37..44, SubstitutedText("►".into())),
861 (45..46, Text), // Escaped backtick
862 (46..51, Text),
863 (0..51, End(MarkdownTagEnd::Paragraph)),
864 (0..51, RootEnd(0)),
865 ],
866 root_block_starts: vec![0],
867 ..Default::default()
868 }
869 );
870 }
871
872 #[test]
873 fn test_incomplete_link() {
874 assert_eq!(
875 parse_markdown_with_options(
876 "You can use the [GitHub Search API](https://docs.github.com/en",
877 false,
878 false,
879 )
880 .events,
881 vec![
882 (0..62, RootStart),
883 (0..62, Start(Paragraph)),
884 (0..16, Text),
885 (16..17, Text),
886 (17..34, Text),
887 (34..35, Text),
888 (35..36, Text),
889 (
890 36..62,
891 Start(Link {
892 link_type: LinkType::Autolink,
893 dest_url: "https://docs.github.com/en".into(),
894 title: "".into(),
895 id: "".into()
896 })
897 ),
898 (36..62, Text),
899 (36..62, End(MarkdownTagEnd::Link)),
900 (0..62, End(MarkdownTagEnd::Paragraph)),
901 (0..62, RootEnd(0)),
902 ],
903 );
904 }
905
906 #[test]
907 fn test_smart_punctuation() {
908 assert_eq!(
909 parse_markdown_with_options(
910 "-- --- ... \"double quoted\" 'single quoted' ----------",
911 false,
912 false,
913 ),
914 ParsedMarkdownData {
915 events: vec![
916 (0..53, RootStart),
917 (0..53, Start(Paragraph)),
918 (0..2, SubstitutedText("–".into())),
919 (2..3, Text),
920 (3..6, SubstitutedText("—".into())),
921 (6..7, Text),
922 (7..10, SubstitutedText("…".into())),
923 (10..11, Text),
924 (11..12, SubstitutedText("\u{201c}".into())),
925 (12..25, Text),
926 (25..26, SubstitutedText("\u{201d}".into())),
927 (26..27, Text),
928 (27..28, SubstitutedText("\u{2018}".into())),
929 (28..41, Text),
930 (41..42, SubstitutedText("\u{2019}".into())),
931 (42..43, Text),
932 (43..53, SubstitutedText("–––––".into())),
933 (0..53, End(MarkdownTagEnd::Paragraph)),
934 (0..53, RootEnd(0)),
935 ],
936 root_block_starts: vec![0],
937 ..Default::default()
938 }
939 )
940 }
941
942 #[test]
943 fn test_code_block_metadata() {
944 assert_eq!(
945 parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false, false),
946 ParsedMarkdownData {
947 events: vec![
948 (0..37, RootStart),
949 (
950 0..37,
951 Start(CodeBlock {
952 kind: CodeBlockKind::FencedLang("rust".into()),
953 metadata: CodeBlockMetadata {
954 content_range: 8..34,
955 line_count: 3
956 }
957 })
958 ),
959 (8..34, Text),
960 (0..37, End(MarkdownTagEnd::CodeBlock)),
961 (0..37, RootEnd(0)),
962 ],
963 language_names: {
964 let mut h = HashSet::default();
965 h.insert("rust".into());
966 h
967 },
968 root_block_starts: vec![0],
969 ..Default::default()
970 }
971 );
972 assert_eq!(
973 parse_markdown_with_options(" fn main() {}", false, false),
974 ParsedMarkdownData {
975 events: vec![
976 (4..16, RootStart),
977 (
978 4..16,
979 Start(CodeBlock {
980 kind: CodeBlockKind::Indented,
981 metadata: CodeBlockMetadata {
982 content_range: 4..16,
983 line_count: 1
984 }
985 })
986 ),
987 (4..16, Text),
988 (4..16, End(MarkdownTagEnd::CodeBlock)),
989 (4..16, RootEnd(0)),
990 ],
991 root_block_starts: vec![4],
992 ..Default::default()
993 }
994 );
995 }
996
997 fn assert_code_block_does_not_emit_links(markdown: &str) {
998 let parsed = parse_markdown_with_options(markdown, false, false);
999 let mut code_block_depth = 0;
1000 let mut code_block_count = 0;
1001 let mut saw_text_inside_code_block = false;
1002
1003 for (_, event) in &parsed.events {
1004 match event {
1005 Start(CodeBlock { .. }) => {
1006 code_block_depth += 1;
1007 code_block_count += 1;
1008 }
1009 End(MarkdownTagEnd::CodeBlock) => {
1010 assert!(
1011 code_block_depth > 0,
1012 "encountered a code block end without a matching start"
1013 );
1014 code_block_depth -= 1;
1015 }
1016 Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
1017 assert_eq!(
1018 code_block_depth, 0,
1019 "code blocks should not emit link events"
1020 );
1021 }
1022 Text | SubstitutedText(_) if code_block_depth > 0 => {
1023 saw_text_inside_code_block = true;
1024 }
1025 _ => {}
1026 }
1027 }
1028
1029 assert_eq!(code_block_count, 1, "expected exactly one code block");
1030 assert_eq!(code_block_depth, 0, "unterminated code block");
1031 assert!(
1032 saw_text_inside_code_block,
1033 "expected text inside the code block"
1034 );
1035 }
1036
1037 #[test]
1038 fn test_code_blocks_do_not_autolink_urls() {
1039 assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
1040 assert_code_block_does_not_emit_links(" https://example.com");
1041 assert_code_block_does_not_emit_links(
1042 "```txt\r\nhttps:/\\/example.com\r\nhttps://example.com\r\n```",
1043 );
1044 assert_code_block_does_not_emit_links(
1045 " https:/\\/example.com\r\n https://example.com",
1046 );
1047 }
1048
1049 #[test]
1050 fn test_metadata_blocks_do_not_affect_root_blocks() {
1051 assert_eq!(
1052 parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false, false),
1053 ParsedMarkdownData {
1054 events: vec![
1055 (27..36, RootStart),
1056 (27..36, Start(Paragraph)),
1057 (27..36, Text),
1058 (27..36, End(MarkdownTagEnd::Paragraph)),
1059 (27..36, RootEnd(0)),
1060 ],
1061 root_block_starts: vec![27],
1062 ..Default::default()
1063 }
1064 );
1065 }
1066
#[test]
fn test_table_checkboxes_remain_text_in_cells() {
    // `[x]` / `[ ]` inside table cells must be emitted as plain text rather
    // than as task-list markers.
    let markdown = concat!(
        "| Done | Task |\n",
        "|------|---------|\n",
        "| [x] | Fix bug |\n",
        "| [ ] | Add feature |",
    );
    let parsed = parse_markdown_with_options(markdown, false, false);

    let mut inside_table = false;
    let mut found_marker = false;
    let mut finished_cells: Vec<String> = Vec::new();
    let mut cell_buffer = String::new();

    // Rebuild the text of every table cell from the source spans of its
    // `Text` events, and note any task-list marker seen inside the table.
    for (span, event) in parsed.events.iter() {
        match event {
            Start(Table(_)) => inside_table = true,
            End(MarkdownTagEnd::Table) => inside_table = false,
            Start(TableCell) => cell_buffer.clear(),
            End(MarkdownTagEnd::TableCell) if inside_table => {
                finished_cells.push(cell_buffer.clone());
            }
            Text if inside_table => cell_buffer.push_str(&markdown[span.start..span.end]),
            TaskListMarker(_) if inside_table => found_marker = true,
            _ => {}
        }
    }

    // Keep only the cells whose trimmed content is a checkbox literal.
    let checkbox_cells: Vec<&str> = finished_cells
        .iter()
        .map(|text| text.trim())
        .filter(|text| matches!(*text, "[x]" | "[X]" | "[ ]"))
        .collect();

    assert!(
        !found_marker,
        "Table checkboxes should remain text, not task-list markers"
    );
    assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
}
1109
#[test]
fn test_extract_code_content_range() {
    // (input, expected content range) pairs.
    let cases = [
        // Matching backtick runs of length three, two and one.
        ("```let x = 5;```", 3..13),
        ("``let x = 5;``", 2..12),
        ("`let x = 5;`", 1..11),
        // No backticks at all: the whole input is returned.
        ("plain text", 0..10),
        // Opening and closing runs differ in length: the whole input is returned.
        ("``let x = 5;`", 0..13),
    ];

    for (input, expected) in cases {
        assert_eq!(extract_code_content_range(input), expected);
    }
}
1127
#[test]
fn test_extract_code_block_content_range() {
    // (input, expected content range) pairs.
    let cases = [
        // Fenced block with an info string; the range starts after the first line.
        ("```rust\nlet x = 5;\n```", 8..19),
        // No fence: the whole input is returned.
        ("plain text", 0..10),
        // Multi-line fenced block.
        ("```python\nprint('hello')\nprint('world')\n```", 10..40),
        // Malformed input
        ("`````", 3..3),
    ];

    for (input, expected) in cases {
        assert_eq!(extract_code_block_content_range(input), expected);
    }
}
1143
#[test]
fn test_footnotes() {
    // One paragraph containing a footnote reference (bytes 0..45), a blank
    // line, then the matching footnote definition (bytes 46..81).
    let source =
        "Text with a footnote[^1] and some more text.\n\n[^1]: This is the footnote content.";
    let parsed = parse_markdown_with_options(source, false, false);

    let expected_events = vec![
        // Paragraph holding the reference.
        (0..45, RootStart),
        (0..45, Start(Paragraph)),
        (0..20, Text),
        (20..24, FootnoteReference("1".into())),
        (24..44, Text),
        (0..45, End(MarkdownTagEnd::Paragraph)),
        (0..45, RootEnd(0)),
        // Footnote definition; its inner paragraph starts after the `[^1]: ` marker.
        (46..81, RootStart),
        (46..81, Start(FootnoteDefinition("1".into()))),
        (52..81, Start(Paragraph)),
        (52..81, Text),
        (52..81, End(MarkdownTagEnd::Paragraph)),
        (46..81, End(MarkdownTagEnd::FootnoteDefinition)),
        (46..81, RootEnd(1)),
    ];
    assert_eq!(parsed.events, expected_events);

    // Exactly one definition is recorded. The offset stored for label "1" is
    // 52, which is where the definition's inner paragraph begins.
    let definitions = &parsed.footnote_definitions;
    assert_eq!(definitions.len(), 1);
    assert_eq!(definitions.get("1").copied(), Some(52));
}
1173
#[test]
fn test_footnote_definitions_multiple() {
    // Two references followed by two separate definitions: both labels must
    // end up in the definitions map.
    let source = "Text[^a] and[^b].\n\n[^a]: First.\n\n[^b]: Second.";
    let parsed = parse_markdown_with_options(source, false, false);

    let definitions = &parsed.footnote_definitions;
    assert_eq!(definitions.len(), 2);
    for label in ["a", "b"] {
        assert!(definitions.contains_key(label));
    }
}
1185
#[test]
fn test_links_split_across_fragments() {
    // Verifies that a URL is still detected as a single autolink when its text
    // is split across several events: a backslash escape drops one source byte
    // from the text, and a numeric character reference is emitted as a
    // `SubstitutedText` event in the middle of the URL.
    //
    // The inputs must spell the character references out (`&#46;`, `&#8205;`)
    // instead of containing the decoded characters: the expected byte ranges
    // below (62 and 55 bytes in total, with 5- and 7-byte `SubstitutedText`
    // spans) only line up with the un-decoded source text.
    assert_eq!(
        parse_markdown_with_options(
            "https:/\\/example.com is equivalent to https://example&#46;com!",
            false,
            false,
        )
        .events,
        vec![
            (0..62, RootStart),
            (0..62, Start(Paragraph)),
            (
                0..20,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            // `https:/` then, skipping the backslash at byte 7, `/example.com`.
            (0..7, Text),
            (8..20, Text),
            (0..20, End(MarkdownTagEnd::Link)),
            (20..38, Text),
            (
                38..61,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            (38..53, Text),
            // `&#46;` (5 source bytes) is replaced by ".".
            (53..58, SubstitutedText(".".into())),
            (58..61, Text),
            (38..61, End(MarkdownTagEnd::Link)),
            // The trailing `!` is not part of the link.
            (61..62, Text),
            (0..62, End(MarkdownTagEnd::Paragraph)),
            (0..62, RootEnd(0)),
        ],
    );

    // Same idea with multi-byte characters: `é` is 2 bytes, `☕` is 3 bytes and
    // `&#8205;` (7 source bytes) is replaced by a zero-width joiner.
    assert_eq!(
        parse_markdown_with_options(
            "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
            false,
            false,
        )
        .events,
        [
            (0..55, RootStart),
            (0..55, Start(Paragraph)),
            (0..6, Text),
            (
                6..43,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            // Text before the backslash at byte 29, then `/é`.
            (6..29, Text),
            (30..33, Text),
            (33..40, SubstitutedText("\u{200d}".into())),
            (40..43, Text),
            (6..43, End(MarkdownTagEnd::Link)),
            (43..55, Text),
            (0..55, End(MarkdownTagEnd::Paragraph)),
            (0..55, RootEnd(0)),
        ]
    );
}
1265
#[test]
fn test_heading_slugs() {
    // Five headings, the last repeating the text of the first.
    let source = "# Hello World\n\n## Code `block`\n\n### Third Level\n\n#### Fourth Level\n\n## Hello World";
    let parsed = parse_markdown_with_options(source, false, true);

    // Every heading gets its own slug; the repeated heading carries a `-1` suffix.
    let expected_slugs = [
        "hello-world",
        "code-block",
        "third-level",
        "fourth-level",
        "hello-world-1",
    ];

    assert_eq!(parsed.heading_slugs.len(), 5);
    for slug in expected_slugs {
        assert!(parsed.heading_slugs.contains_key(slug));
    }
}
1280
#[test]
fn test_heading_source_index_for_slug() {
    // Two headings with identical text: the value stored for the first slug
    // must be smaller than the value stored for the deduplicated second slug.
    let source = "# Duplicate\n\nText\n\n## Duplicate\n\nMore text";
    let parsed = parse_markdown_with_options(source, false, true);

    let index_for = |slug: &str| parsed.heading_slugs.get(slug).copied();
    let (first, second) = (index_for("duplicate"), index_for("duplicate-1"));

    assert!(first.is_some());
    assert!(second.is_some());

    let first_index = first.expect("first slug missing");
    let second_index = second.expect("second slug missing");
    assert!(first_index < second_index);
}
1294
#[test]
fn test_heading_slug_collision_with_dedup_suffix() {
    // The second "Foo" is deduplicated to `foo-1`, which is also the natural
    // slug of the third heading "Foo 1"; that one must be pushed on to `foo-1-1`.
    let source = "# Foo\n\n## Foo\n\n## Foo 1";
    let parsed = parse_markdown_with_options(source, false, true);

    let slugs = &parsed.heading_slugs;
    assert_eq!(slugs.len(), 3);
    for expected in ["foo", "foo-1", "foo-1-1"] {
        assert!(slugs.contains_key(expected));
    }
}
1303}