1use collections::{BTreeMap, HashMap, HashSet};
2use gpui::SharedString;
3use linkify::LinkFinder;
4pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
5use pulldown_cmark::{
6 Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
7};
8use std::{ops::Range, sync::Arc};
9use util::markdown::generate_heading_slug;
10
11use crate::{html, path_range::PathWithRange};
12
/// The pulldown-cmark extensions that `parse_markdown_with_options` passes to
/// `Parser::new_ext`.
///
/// The test module checks that every option pulldown-cmark offers is either
/// listed here or explicitly rejected there.
pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
24
/// Accumulator used while converting the pulldown-cmark stream into
/// `MarkdownEvent`s; see `ParseState::push_event`.
#[derive(Default)]
struct ParseState {
    /// Every event emitted so far, paired with its byte range in the source.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of each root-level block, in the order the blocks were opened.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events that have not yet been matched by an `End`.
    depth: usize,
}
31
/// Everything `parse_markdown_with_options` extracts from one Markdown source.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// The converted event stream, each event paired with its byte range in the source.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that do not contain a `/`.
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contain a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of each root-level block; `MarkdownEvent::RootEnd` indexes into this.
    pub root_block_starts: Vec<usize>,
    /// Successfully parsed HTML blocks, keyed by the start offset of the block.
    /// Only populated when HTML parsing was requested.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
    /// Heading slug mapped to a source offset inside that heading.
    /// Only populated when heading slugs were requested.
    pub heading_slugs: HashMap<SharedString, usize>,
}
42
43impl ParseState {
44 fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
45 match &event {
46 MarkdownEvent::Start(_) => {
47 if self.depth == 0 {
48 self.root_block_starts.push(range.start);
49 self.events.push((range.clone(), MarkdownEvent::RootStart));
50 }
51 self.depth += 1;
52 self.events.push((range, event));
53 }
54 MarkdownEvent::End(_) => {
55 self.events.push((range.clone(), event));
56 if self.depth > 0 {
57 self.depth -= 1;
58 if self.depth == 0 {
59 let root_block_index = self.root_block_starts.len() - 1;
60 self.events
61 .push((range, MarkdownEvent::RootEnd(root_block_index)));
62 }
63 }
64 }
65 MarkdownEvent::Rule => {
66 if self.depth == 0 && !range.is_empty() {
67 self.root_block_starts.push(range.start);
68 let root_block_index = self.root_block_starts.len() - 1;
69 self.events.push((range.clone(), MarkdownEvent::RootStart));
70 self.events.push((range.clone(), event));
71 self.events
72 .push((range, MarkdownEvent::RootEnd(root_block_index)));
73 } else {
74 self.events.push((range, event));
75 }
76 }
77 _ => {
78 self.events.push((range, event));
79 }
80 }
81 }
82}
83
/// Once the per-slug counter reaches this value, `build_heading_slugs` stops
/// searching for a free `slug-N` name and leaves the heading without a slug.
const MAX_DUPLICATE_HEADING_SLUGS: usize = 128;
85
86fn build_heading_slugs(
87 source: &str,
88 events: &[(Range<usize>, MarkdownEvent)],
89) -> HashMap<SharedString, usize> {
90 let mut slugs = HashMap::default();
91 let mut slug_counts: HashMap<String, usize> = HashMap::default();
92 let mut inside_heading = false;
93 let mut heading_text = String::new();
94 let mut heading_source_start: Option<usize> = None;
95
96 for (range, event) in events {
97 match event {
98 MarkdownEvent::Start(MarkdownTag::Heading { .. }) => {
99 inside_heading = true;
100 heading_text.clear();
101 heading_source_start = None;
102 }
103 MarkdownEvent::End(MarkdownTagEnd::Heading(_)) => {
104 if inside_heading {
105 let source_offset = heading_source_start.unwrap_or(range.start);
106 let base_slug = generate_heading_slug(&heading_text);
107 let count = slug_counts.entry(base_slug.clone()).or_insert(0);
108 let mut slug = if *count == 0 {
109 base_slug.clone()
110 } else {
111 format!("{base_slug}-{count}")
112 };
113 *count += 1;
114 while slugs.contains_key(slug.as_str()) {
115 let Some(count) = slug_counts.get_mut(&base_slug) else {
116 slug.clear();
117 break;
118 };
119 if *count >= MAX_DUPLICATE_HEADING_SLUGS {
120 slug.clear();
121 break;
122 }
123 slug = format!("{base_slug}-{count}");
124 *count += 1;
125 }
126 if !slug.is_empty() {
127 slugs.insert(SharedString::from(slug), source_offset);
128 }
129 inside_heading = false;
130 }
131 }
132 MarkdownEvent::Text | MarkdownEvent::Code if inside_heading => {
133 if heading_source_start.is_none() {
134 heading_source_start = Some(range.start);
135 }
136 heading_text.push_str(&source[range.clone()]);
137 }
138 MarkdownEvent::SubstitutedText(substituted) if inside_heading => {
139 if heading_source_start.is_none() {
140 heading_source_start = Some(range.start);
141 }
142 heading_text.push_str(substituted);
143 }
144 _ => {}
145 }
146 }
147
148 slugs
149}
150
151pub(crate) fn parse_markdown_with_options(
152 text: &str,
153 parse_html: bool,
154 parse_heading_slugs: bool,
155) -> ParsedMarkdownData {
156 let mut state = ParseState::default();
157 let mut language_names = HashSet::default();
158 let mut language_paths = HashSet::default();
159 let mut html_blocks = BTreeMap::default();
160 let mut within_link = false;
161 let mut within_code_block = false;
162 let mut within_metadata = false;
163 let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
164 .into_offset_iter()
165 .peekable();
166 while let Some((pulldown_event, range)) = parser.next() {
167 if within_metadata {
168 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
169 pulldown_event
170 {
171 within_metadata = false;
172 }
173 continue;
174 }
175 match pulldown_event {
176 pulldown_cmark::Event::Start(tag) => {
177 if let pulldown_cmark::Tag::HtmlBlock = &tag {
178 state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));
179
180 if parse_html {
181 if let Some(block) =
182 html::html_parser::parse_html_block(&text[range.clone()], range.clone())
183 {
184 html_blocks.insert(range.start, block);
185
186 while let Some((event, end_range)) = parser.next() {
187 if let pulldown_cmark::Event::End(
188 pulldown_cmark::TagEnd::HtmlBlock,
189 ) = event
190 {
191 state.push_event(
192 end_range,
193 MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
194 );
195 break;
196 }
197 }
198 }
199 }
200 continue;
201 }
202
203 let tag = match tag {
204 pulldown_cmark::Tag::Link {
205 link_type,
206 dest_url,
207 title,
208 id,
209 } => {
210 within_link = true;
211 MarkdownTag::Link {
212 link_type,
213 dest_url: SharedString::from(dest_url.into_string()),
214 title: SharedString::from(title.into_string()),
215 id: SharedString::from(id.into_string()),
216 }
217 }
218 pulldown_cmark::Tag::MetadataBlock(_kind) => {
219 within_metadata = true;
220 continue;
221 }
222 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
223 within_code_block = true;
224 MarkdownTag::CodeBlock {
225 kind: CodeBlockKind::Indented,
226 metadata: CodeBlockMetadata {
227 content_range: range.clone(),
228 line_count: 1,
229 },
230 }
231 }
232 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
233 ref info,
234 )) => {
235 within_code_block = true;
236 let content_range = extract_code_block_content_range(&text[range.clone()]);
237 let content_range =
238 content_range.start + range.start..content_range.end + range.start;
239
240 // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
241 let line_count = text[content_range.clone()]
242 .bytes()
243 .filter(|c| *c == b'\n')
244 .count();
245 let metadata = CodeBlockMetadata {
246 content_range,
247 line_count,
248 };
249
250 let info = info.trim();
251 let kind = if info.is_empty() {
252 CodeBlockKind::Fenced
253 // Languages should never contain a slash, and PathRanges always should.
254 // (Models are told to specify them relative to a workspace root.)
255 } else if info.contains('/') {
256 let path_range = PathWithRange::new(info);
257 language_paths.insert(path_range.path.clone());
258 CodeBlockKind::FencedSrc(path_range)
259 } else {
260 let language = SharedString::from(info.to_string());
261 language_names.insert(language.clone());
262 CodeBlockKind::FencedLang(language)
263 };
264
265 MarkdownTag::CodeBlock { kind, metadata }
266 }
267 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
268 pulldown_cmark::Tag::Heading {
269 level,
270 id,
271 classes,
272 attrs,
273 } => {
274 let id = id.map(|id| SharedString::from(id.into_string()));
275 let classes = classes
276 .into_iter()
277 .map(|c| SharedString::from(c.into_string()))
278 .collect();
279 let attrs = attrs
280 .into_iter()
281 .map(|(key, value)| {
282 (
283 SharedString::from(key.into_string()),
284 value.map(|v| SharedString::from(v.into_string())),
285 )
286 })
287 .collect();
288 MarkdownTag::Heading {
289 level,
290 id,
291 classes,
292 attrs,
293 }
294 }
295 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
296 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
297 pulldown_cmark::Tag::Item => MarkdownTag::Item,
298 pulldown_cmark::Tag::FootnoteDefinition(label) => {
299 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
300 }
301 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
302 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
303 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
304 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
305 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
306 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
307 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
308 pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
309 pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
310 pulldown_cmark::Tag::Image {
311 link_type,
312 dest_url,
313 title,
314 id,
315 } => MarkdownTag::Image {
316 link_type,
317 dest_url: SharedString::from(dest_url.into_string()),
318 title: SharedString::from(title.into_string()),
319 id: SharedString::from(id.into_string()),
320 },
321 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
322 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
323 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
324 pulldown_cmark::Tag::DefinitionListDefinition => {
325 MarkdownTag::DefinitionListDefinition
326 }
327 };
328 state.push_event(range, MarkdownEvent::Start(tag))
329 }
330 pulldown_cmark::Event::End(tag) => {
331 if let pulldown_cmark::TagEnd::Link = tag {
332 within_link = false;
333 } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
334 within_code_block = false;
335 }
336 state.push_event(range, MarkdownEvent::End(tag));
337 }
338 pulldown_cmark::Event::Text(parsed) => {
339 fn event_for(
340 text: &str,
341 range: Range<usize>,
342 str: &str,
343 ) -> (Range<usize>, MarkdownEvent) {
344 if str == &text[range.clone()] {
345 (range, MarkdownEvent::Text)
346 } else {
347 (range, MarkdownEvent::SubstitutedText(str.to_owned()))
348 }
349 }
350
351 if within_code_block {
352 let (range, event) = event_for(text, range, &parsed);
353 state.push_event(range, event);
354 continue;
355 }
356
357 #[derive(Debug)]
358 struct TextRange<'a> {
359 source_range: Range<usize>,
360 merged_range: Range<usize>,
361 parsed: CowStr<'a>,
362 }
363
364 let mut last_len = parsed.len();
365 let mut ranges = vec![TextRange {
366 source_range: range.clone(),
367 merged_range: 0..last_len,
368 parsed,
369 }];
370
371 while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
372 || (parse_html
373 && matches!(
374 parser.peek(),
375 Some((pulldown_cmark::Event::InlineHtml(_), _))
376 ))
377 {
378 let Some((next_event, next_range)) = parser.next() else {
379 unreachable!()
380 };
381 let next_text = match next_event {
382 pulldown_cmark::Event::Text(next_event) => next_event,
383 pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
384 _ => unreachable!(),
385 };
386 let next_len = last_len + next_text.len();
387 ranges.push(TextRange {
388 source_range: next_range.clone(),
389 merged_range: last_len..next_len,
390 parsed: next_text,
391 });
392 last_len = next_len;
393 }
394
395 let mut merged_text =
396 String::with_capacity(ranges.last().unwrap().merged_range.end);
397 for range in &ranges {
398 merged_text.push_str(&range.parsed);
399 }
400
401 let mut ranges = ranges.into_iter().peekable();
402
403 if !within_link && !within_code_block {
404 let mut finder = LinkFinder::new();
405 finder.kinds(&[linkify::LinkKind::Url]);
406
407 // Find links in the merged text
408 for link in finder.links(&merged_text) {
409 let link_start_in_merged = link.start();
410 let link_end_in_merged = link.end();
411
412 while ranges
413 .peek()
414 .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
415 {
416 let range = ranges.next().unwrap();
417 let (range, event) = event_for(text, range.source_range, &range.parsed);
418 state.push_event(range, event);
419 }
420
421 let Some(range) = ranges.peek_mut() else {
422 continue;
423 };
424 let prefix_len = link_start_in_merged - range.merged_range.start;
425 if prefix_len > 0 {
426 let (head, tail) = range.parsed.split_at(prefix_len);
427 let (event_range, event) = event_for(
428 text,
429 range.source_range.start..range.source_range.start + prefix_len,
430 head,
431 );
432 state.push_event(event_range, event);
433 range.parsed = CowStr::Boxed(tail.into());
434 range.merged_range.start += prefix_len;
435 range.source_range.start += prefix_len;
436 }
437
438 let link_start_in_source = range.source_range.start;
439 let mut link_end_in_source = range.source_range.end;
440 let mut link_events = Vec::new();
441
442 while ranges
443 .peek()
444 .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
445 {
446 let range = ranges.next().unwrap();
447 link_end_in_source = range.source_range.end;
448 link_events.push(event_for(text, range.source_range, &range.parsed));
449 }
450
451 if let Some(range) = ranges.peek_mut() {
452 let prefix_len = link_end_in_merged - range.merged_range.start;
453 if prefix_len > 0 {
454 let (head, tail) = range.parsed.split_at(prefix_len);
455 link_events.push(event_for(
456 text,
457 range.source_range.start..range.source_range.start + prefix_len,
458 head,
459 ));
460 range.parsed = CowStr::Boxed(tail.into());
461 range.merged_range.start += prefix_len;
462 range.source_range.start += prefix_len;
463 link_end_in_source = range.source_range.start;
464 }
465 }
466 let link_range = link_start_in_source..link_end_in_source;
467
468 state.push_event(
469 link_range.clone(),
470 MarkdownEvent::Start(MarkdownTag::Link {
471 link_type: LinkType::Autolink,
472 dest_url: SharedString::from(link.as_str().to_string()),
473 title: SharedString::default(),
474 id: SharedString::default(),
475 }),
476 );
477 for (range, event) in link_events {
478 state.push_event(range, event);
479 }
480 state.push_event(
481 link_range.clone(),
482 MarkdownEvent::End(MarkdownTagEnd::Link),
483 );
484 }
485 }
486
487 for range in ranges {
488 let (range, event) = event_for(text, range.source_range, &range.parsed);
489 state.push_event(range, event);
490 }
491 }
492 pulldown_cmark::Event::Code(_) => {
493 let content_range = extract_code_content_range(&text[range.clone()]);
494 let content_range =
495 content_range.start + range.start..content_range.end + range.start;
496 state.push_event(content_range, MarkdownEvent::Code)
497 }
498 pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
499 pulldown_cmark::Event::InlineHtml(_) => {
500 state.push_event(range, MarkdownEvent::InlineHtml)
501 }
502 pulldown_cmark::Event::FootnoteReference(_) => {
503 state.push_event(range, MarkdownEvent::FootnoteReference)
504 }
505 pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
506 pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
507 pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
508 pulldown_cmark::Event::TaskListMarker(checked) => {
509 state.push_event(range, MarkdownEvent::TaskListMarker(checked))
510 }
511 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
512 }
513 }
514
515 let heading_slugs = if parse_heading_slugs {
516 build_heading_slugs(text, &state.events)
517 } else {
518 HashMap::default()
519 };
520
521 ParsedMarkdownData {
522 events: state.events,
523 language_names,
524 language_paths,
525 root_block_starts: state.root_block_starts,
526 html_blocks,
527 heading_slugs,
528 }
529}
530
531pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
532 let mut events = Vec::new();
533 let mut finder = LinkFinder::new();
534 finder.kinds(&[linkify::LinkKind::Url]);
535 let mut text_range = Range {
536 start: 0,
537 end: text.len(),
538 };
539 for link in finder.links(text) {
540 let link_range = link.start()..link.end();
541
542 if link_range.start > text_range.start {
543 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
544 }
545
546 events.push((
547 link_range.clone(),
548 MarkdownEvent::Start(MarkdownTag::Link {
549 link_type: LinkType::Autolink,
550 dest_url: SharedString::from(link.as_str().to_string()),
551 title: SharedString::default(),
552 id: SharedString::default(),
553 }),
554 ));
555 events.push((link_range.clone(), MarkdownEvent::Text));
556 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
557
558 text_range.start = link_range.end;
559 }
560
561 if text_range.end > text_range.start {
562 events.push((text_range, MarkdownEvent::Text));
563 }
564
565 events
566}
567
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are stored next to the byte range of the Markdown source they were
/// produced from, so most variants do not carry their own text.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. Carries the text to use instead of the source range.
    SubstitutedText(String),
    /// An inline code node. The parser in this file reports the range of the code
    /// without its backtick delimiters.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    /// Emitted immediately before the block's own first event.
    RootStart,
    /// End of a root-level block. Contains the root block index, i.e. the block's
    /// position in the list of root block start offsets.
    RootEnd(usize),
}
606
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings are
/// replaced with `SharedString`.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The quote kind reported by pulldown-cmark is not preserved.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,
    /// Superscript text.
    Superscript,
    /// Subscript text.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block. `parse_markdown_with_options` skips metadata blocks
    /// instead of emitting this tag.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The term being defined in a definition list.
    DefinitionListTitle,
    /// A definition belonging to the preceding term of a definition list.
    DefinitionListDefinition,
}
691
/// How a code block was written in the source, and what its info string named.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block created by indentation rather than by fences.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is used when the info string is empty.
    Fenced,
    /// A fenced block whose info string contains no `/`; carries the trimmed info string.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/` and was parsed as a path with range.
    FencedSrc(PathWithRange),
}
703
/// Extra information recorded for every code block.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content in the source. For fenced blocks the
    /// fences and the info line are excluded; for indented blocks this is the
    /// range of the whole block as reported by the parser.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes in the content; always 1 for
    /// indented blocks.
    pub line_count: usize,
}
709
/// Returns the byte range of the code inside an inline code span.
///
/// `text` is the whole span including its backtick delimiters. The result is
/// relative to `text` and excludes the delimiters when the span starts and ends
/// with the same number of backticks. If `text` has no leading backticks, or the
/// delimiters do not match, the full range `0..text.len()` is returned.
///
/// Following the CommonMark code span rule, when the delimited content both
/// begins and ends with a space and is not made up solely of spaces, one space
/// is dropped from each side, so `` `` `a` `` `` yields the range of `` `a` ``.
/// Line endings at the edges of the content are not treated as padding here.
fn extract_code_content_range(text: &str) -> Range<usize> {
    let text_len = text.len();

    // A backtick is a single byte, so counting bytes yields byte offsets directly.
    let start_ticks = text.bytes().take_while(|&b| b == b'`').count();
    if start_ticks == 0 {
        return 0..text_len;
    }

    let end_ticks = text.bytes().rev().take_while(|&b| b == b'`').count();
    // The second condition rejects spans such as "``", where the opening and
    // closing runs would overlap.
    if end_ticks != start_ticks || text_len < start_ticks + end_ticks {
        return 0..text_len;
    }

    let mut range = start_ticks..text_len - end_ticks;
    let content = &text.as_bytes()[range.clone()];
    let padded_both_sides = content.first() == Some(&b' ') && content.last() == Some(&b' ');
    // Content made only of spaces is kept verbatim; this check also guarantees
    // there are at least three bytes, so the range cannot invert below.
    if padded_both_sides && content.iter().any(|&b| b != b' ') {
        range.start += 1;
        range.end -= 1;
    }
    range
}
730
/// Returns the byte range of the content of a fenced code block.
///
/// `text` is the whole block including its fences; the result is relative to
/// `text`. When `text` opens with a backtick or tilde fence, the content starts
/// after the first newline (skipping the rest of the fence and the info string),
/// or right after the first three fence characters if there is no newline. A
/// trailing run of three or more fence characters is treated as the closing
/// fence and excluded, so fences longer than three characters and `~~~` fences
/// are handled. Text without an opening fence is only trimmed of a trailing
/// backtick fence. The returned range is never inverted: malformed input such
/// as "`````" yields an empty range.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let opens_with_tildes = text.starts_with("~~~");
    // The closing fence must use the same character as the opening one.
    let fence_char = if opens_with_tildes { '~' } else { '`' };

    let mut range = 0..text.len();
    if opens_with_tildes || text.starts_with("```") {
        range.start += 3;

        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    if !range.is_empty() {
        let closing_fence_len = text.len() - text.trim_end_matches(fence_char).len();
        if closing_fence_len >= 3 {
            range.end -= closing_fence_len;
        }
    }
    // With no newline, the closing run can overlap the opening fence.
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
749
750#[cfg(test)]
751mod tests {
752 use super::MarkdownEvent::*;
753 use super::MarkdownTag::*;
754 use super::*;
755
 /// pulldown-cmark options that are deliberately left out of `PARSE_OPTIONS`.
 const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
     .union(Options::ENABLE_MATH)
     .union(Options::ENABLE_DEFINITION_LIST)
     .union(Options::ENABLE_WIKILINKS);
760
 /// Every option pulldown-cmark offers must be either enabled or explicitly rejected.
 #[test]
 fn all_options_considered() {
     // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
     // can be evaluated for inclusion.
     assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
 }
767
 /// No option may be listed as both enabled and rejected.
 #[test]
 fn wanted_and_unwanted_options_disjoint() {
     assert_eq!(
         PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
         Options::empty()
     );
 }
775
 /// With HTML parsing disabled, an HTML comment is emitted as raw `Html` events
 /// inside an `HtmlBlock`, followed by the paragraph as a second root block.
 #[test]
 fn test_html_comments() {
     assert_eq!(
         parse_markdown_with_options(" <!--\nrdoc-file=string.c\n-->\nReturns", false, false),
         ParsedMarkdownData {
             events: vec![
                 (2..30, RootStart),
                 (2..30, Start(HtmlBlock)),
                 (2..2, SubstitutedText(" ".into())),
                 (2..7, Html),
                 (7..26, Html),
                 (26..30, Html),
                 (2..30, End(MarkdownTagEnd::HtmlBlock)),
                 (2..30, RootEnd(0)),
                 (30..37, RootStart),
                 (30..37, Start(Paragraph)),
                 (30..37, Text),
                 (30..37, End(MarkdownTagEnd::Paragraph)),
                 (30..37, RootEnd(1)),
             ],
             root_block_starts: vec![2, 30],
             ..Default::default()
         }
     )
 }
801
 /// A bare URL in a paragraph is wrapped in autolink events, while substituted
 /// and escaped text around it is reported as separate events.
 #[test]
 fn test_plain_urls_and_escaped_text() {
     assert_eq!(
         parse_markdown_with_options(
             " https://some.url some \\`►\\` text",
             false,
             false,
         ),
         ParsedMarkdownData {
             events: vec![
                 (0..51, RootStart),
                 (0..51, Start(Paragraph)),
                 (0..6, SubstitutedText("\u{a0}".into())),
                 (6..12, SubstitutedText("\u{a0}".into())),
                 (12..13, Text),
                 (
                     13..29,
                     Start(Link {
                         link_type: LinkType::Autolink,
                         dest_url: "https://some.url".into(),
                         title: "".into(),
                         id: "".into(),
                     })
                 ),
                 (13..29, Text),
                 (13..29, End(MarkdownTagEnd::Link)),
                 (29..35, Text),
                 (36..37, Text), // Escaped backtick
                 (37..44, SubstitutedText("►".into())),
                 (45..46, Text), // Escaped backtick
                 (46..51, Text),
                 (0..51, End(MarkdownTagEnd::Paragraph)),
                 (0..51, RootEnd(0)),
             ],
             root_block_starts: vec![0],
             ..Default::default()
         }
     );
 }
841
 /// An unterminated inline link is not a Markdown link; its URL is still
 /// picked up as an autolink and the bracket syntax stays plain text.
 #[test]
 fn test_incomplete_link() {
     assert_eq!(
         parse_markdown_with_options(
             "You can use the [GitHub Search API](https://docs.github.com/en",
             false,
             false,
         )
         .events,
         vec![
             (0..62, RootStart),
             (0..62, Start(Paragraph)),
             (0..16, Text),
             (16..17, Text),
             (17..34, Text),
             (34..35, Text),
             (35..36, Text),
             (
                 36..62,
                 Start(Link {
                     link_type: LinkType::Autolink,
                     dest_url: "https://docs.github.com/en".into(),
                     title: "".into(),
                     id: "".into()
                 })
             ),
             (36..62, Text),
             (36..62, End(MarkdownTagEnd::Link)),
             (0..62, End(MarkdownTagEnd::Paragraph)),
             (0..62, RootEnd(0)),
         ],
     );
 }
875
 /// Smart punctuation replacements (dashes, ellipsis, curly quotes) are
 /// reported as `SubstitutedText` over the range of the original characters.
 #[test]
 fn test_smart_punctuation() {
     assert_eq!(
         parse_markdown_with_options(
             "-- --- ... \"double quoted\" 'single quoted' ----------",
             false,
             false,
         ),
         ParsedMarkdownData {
             events: vec![
                 (0..53, RootStart),
                 (0..53, Start(Paragraph)),
                 (0..2, SubstitutedText("–".into())),
                 (2..3, Text),
                 (3..6, SubstitutedText("—".into())),
                 (6..7, Text),
                 (7..10, SubstitutedText("…".into())),
                 (10..11, Text),
                 (11..12, SubstitutedText("\u{201c}".into())),
                 (12..25, Text),
                 (25..26, SubstitutedText("\u{201d}".into())),
                 (26..27, Text),
                 (27..28, SubstitutedText("\u{2018}".into())),
                 (28..41, Text),
                 (41..42, SubstitutedText("\u{2019}".into())),
                 (42..43, Text),
                 (43..53, SubstitutedText("–––––".into())),
                 (0..53, End(MarkdownTagEnd::Paragraph)),
                 (0..53, RootEnd(0)),
             ],
             root_block_starts: vec![0],
             ..Default::default()
         }
     )
 }
911
 /// Checks the content range, line count and recorded language for a fenced
 /// code block, and the metadata produced for an indented one.
 #[test]
 fn test_code_block_metadata() {
     // Fenced block: the content range excludes the fences and the info line.
     assert_eq!(
         parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false, false),
         ParsedMarkdownData {
             events: vec![
                 (0..37, RootStart),
                 (
                     0..37,
                     Start(CodeBlock {
                         kind: CodeBlockKind::FencedLang("rust".into()),
                         metadata: CodeBlockMetadata {
                             content_range: 8..34,
                             line_count: 3
                         }
                     })
                 ),
                 (8..34, Text),
                 (0..37, End(MarkdownTagEnd::CodeBlock)),
                 (0..37, RootEnd(0)),
             ],
             language_names: {
                 let mut h = HashSet::default();
                 h.insert("rust".into());
                 h
             },
             root_block_starts: vec![0],
             ..Default::default()
         }
     );
     // Indented block: the content range is the block's own range and the line
     // count is fixed at 1.
     assert_eq!(
         parse_markdown_with_options(" fn main() {}", false, false),
         ParsedMarkdownData {
             events: vec![
                 (4..16, RootStart),
                 (
                     4..16,
                     Start(CodeBlock {
                         kind: CodeBlockKind::Indented,
                         metadata: CodeBlockMetadata {
                             content_range: 4..16,
                             line_count: 1
                         }
                     })
                 ),
                 (4..16, Text),
                 (4..16, End(MarkdownTagEnd::CodeBlock)),
                 (4..16, RootEnd(0)),
             ],
             root_block_starts: vec![4],
             ..Default::default()
         }
     );
 }
966
 /// Parses `markdown` and asserts that it contains exactly one properly
 /// terminated code block, that the block contains some text, and that no
 /// link start or end event occurs inside it.
 fn assert_code_block_does_not_emit_links(markdown: &str) {
     let parsed = parse_markdown_with_options(markdown, false, false);
     // Number of code blocks currently open while walking the events.
     let mut code_block_depth = 0;
     let mut code_block_count = 0;
     let mut saw_text_inside_code_block = false;

     for (_, event) in &parsed.events {
         match event {
             Start(CodeBlock { .. }) => {
                 code_block_depth += 1;
                 code_block_count += 1;
             }
             End(MarkdownTagEnd::CodeBlock) => {
                 assert!(
                     code_block_depth > 0,
                     "encountered a code block end without a matching start"
                 );
                 code_block_depth -= 1;
             }
             Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
                 assert_eq!(
                     code_block_depth, 0,
                     "code blocks should not emit link events"
                 );
             }
             Text | SubstitutedText(_) if code_block_depth > 0 => {
                 saw_text_inside_code_block = true;
             }
             _ => {}
         }
     }

     assert_eq!(code_block_count, 1, "expected exactly one code block");
     assert_eq!(code_block_depth, 0, "unterminated code block");
     assert!(
         saw_text_inside_code_block,
         "expected text inside the code block"
     );
 }
1006
 /// URLs inside code blocks must not be autolinked, including inputs that use
 /// CRLF line endings and an escaped slash.
 #[test]
 fn test_code_blocks_do_not_autolink_urls() {
     assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
     assert_code_block_does_not_emit_links(" https://example.com");
     assert_code_block_does_not_emit_links(
         "```txt\r\nhttps:/\\/example.com\r\nhttps://example.com\r\n```",
     );
     assert_code_block_does_not_emit_links(
         " https:/\\/example.com\r\n https://example.com",
     );
 }
1018
 /// A `+++` metadata block produces no events and no root block; the paragraph
 /// after it is root block 0.
 #[test]
 fn test_metadata_blocks_do_not_affect_root_blocks() {
     assert_eq!(
         parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false, false),
         ParsedMarkdownData {
             events: vec![
                 (27..36, RootStart),
                 (27..36, Start(Paragraph)),
                 (27..36, Text),
                 (27..36, End(MarkdownTagEnd::Paragraph)),
                 (27..36, RootEnd(0)),
             ],
             root_block_starts: vec![27],
             ..Default::default()
         }
     );
 }
1036
 /// `[x]` and `[ ]` inside table cells must stay ordinary text instead of
 /// turning into task-list markers.
 #[test]
 fn test_table_checkboxes_remain_text_in_cells() {
     let markdown = "\
| Done | Task |
|------|---------|
| [x] | Fix bug |
| [ ] | Add feature |";
     let parsed = parse_markdown_with_options(markdown, false, false);

     let mut in_table = false;
     let mut saw_task_list_marker = false;
     // Concatenated `Text` of every table cell, in document order.
     let mut cell_texts = Vec::new();
     let mut current_cell = String::new();

     for (range, event) in &parsed.events {
         match event {
             Start(Table(_)) => in_table = true,
             End(MarkdownTagEnd::Table) => in_table = false,
             Start(TableCell) => current_cell.clear(),
             End(MarkdownTagEnd::TableCell) => {
                 if in_table {
                     cell_texts.push(current_cell.clone());
                 }
             }
             Text if in_table => current_cell.push_str(&markdown[range.clone()]),
             TaskListMarker(_) if in_table => saw_task_list_marker = true,
             _ => {}
         }
     }

     // Keep only the cells whose whole trimmed content is a checkbox.
     let checkbox_cells: Vec<&str> = cell_texts
         .iter()
         .map(|cell| cell.trim())
         .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
         .collect();

     assert!(
         !saw_task_list_marker,
         "Table checkboxes should remain text, not task-list markers"
     );
     assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
 }
1079
 /// Inline code spans: matching backtick delimiters are excluded from the
 /// range; input without delimiters or with mismatched ones is returned whole.
 #[test]
 fn test_extract_code_content_range() {
     let input = "```let x = 5;```";
     assert_eq!(extract_code_content_range(input), 3..13);

     let input = "``let x = 5;``";
     assert_eq!(extract_code_content_range(input), 2..12);

     let input = "`let x = 5;`";
     assert_eq!(extract_code_content_range(input), 1..11);

     // No delimiters at all.
     let input = "plain text";
     assert_eq!(extract_code_content_range(input), 0..10);

     // Mismatched delimiter lengths.
     let input = "``let x = 5;`";
     assert_eq!(extract_code_content_range(input), 0..13);
 }
1097
#[test]
fn test_extract_code_block_content_range() {
    // Each case is (input, expected content range).
    let cases = [
        // Fenced block with an info string: the range starts after the first line.
        ("```rust\nlet x = 5;\n```", 8..19),
        // Not a fenced block: the whole input is the content.
        ("plain text", 0..10),
        // Multi-line body.
        ("```python\nprint('hello')\nprint('world')\n```", 10..40),
        // Malformed input
        ("`````", 3..3),
    ];

    for (input, expected) in cases {
        assert_eq!(extract_code_block_content_range(input), expected);
    }
}
1113
#[test]
fn test_links_split_across_fragments() {
    // Verifies that a bare URL is still detected as a single autolink when its
    // text is delivered as several fragments, either because of a backslash
    // escape or because of a numeric character reference inside the URL.
    // Note: in real usage, pulldown_cmark creates separate text events for the
    // escaped character; we are verifying our parser handles that correctly.
    //
    // The inputs must contain the character references literally (`&#46;`,
    // `&#8205;`), not the decoded characters: the expected byte ranges below
    // count the 5 and 7 source bytes those references occupy, and the
    // `SubstitutedText` events carry their decoded values.

    // First URL: `\/` escape at byte 7 splits the text into 0..7 and 8..20.
    // Second URL: `&#46;` at 53..58 is substituted with ".".
    assert_eq!(
        parse_markdown_with_options(
            "https:/\\/example.com is equivalent to https://example&#46;com!",
            false,
            false,
        )
        .events,
        vec![
            (0..62, RootStart),
            (0..62, Start(Paragraph)),
            (
                0..20,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            (0..7, Text),
            (8..20, Text),
            (0..20, End(MarkdownTagEnd::Link)),
            (20..38, Text),
            (
                38..61,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            (38..53, Text),
            (53..58, SubstitutedText(".".into())),
            (58..61, Text),
            (38..61, End(MarkdownTagEnd::Link)),
            (61..62, Text),
            (0..62, End(MarkdownTagEnd::Paragraph)),
            (0..62, RootEnd(0)),
        ],
    );

    // Multi-byte characters in the URL: `\/` escape at byte 29, a two-byte `é`,
    // then `&#8205;` (zero-width joiner) at 33..40 and a three-byte `☕`.
    assert_eq!(
        parse_markdown_with_options(
            "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
            false,
            false,
        )
        .events,
        [
            (0..55, RootStart),
            (0..55, Start(Paragraph)),
            (0..6, Text),
            (
                6..43,
                Start(Link {
                    link_type: LinkType::Autolink,
                    dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                    title: "".into(),
                    id: "".into()
                })
            ),
            (6..29, Text),
            (30..33, Text),
            (33..40, SubstitutedText("\u{200d}".into())),
            (40..43, Text),
            (6..43, End(MarkdownTagEnd::Link)),
            (43..55, Text),
            (0..55, End(MarkdownTagEnd::Paragraph)),
            (0..55, RootEnd(0)),
        ]
    );
}
1193
#[test]
fn test_heading_slugs() {
    // Five headings, the first and last sharing the same text.
    // NOTE(review): the third argument is `true` here; presumably it enables
    // heading-slug collection — confirm against `parse_markdown_with_options`.
    let source =
        "# Hello World\n\n## Code `block`\n\n### Third Level\n\n#### Fourth Level\n\n## Hello World";
    let parsed = parse_markdown_with_options(source, false, true);

    // The repeated "Hello World" heading is expected under a `-1` suffix.
    let expected_slugs = [
        "hello-world",
        "code-block",
        "third-level",
        "fourth-level",
        "hello-world-1",
    ];

    assert_eq!(parsed.heading_slugs.len(), 5);
    for slug in expected_slugs {
        assert!(parsed.heading_slugs.contains_key(slug));
    }
}
1208
#[test]
fn test_heading_source_index_for_slug() {
    // Two headings with identical text; the value stored for the first slug
    // must be smaller than the value stored for the deduplicated second slug.
    let source = "# Duplicate\n\nText\n\n## Duplicate\n\nMore text";
    let parsed = parse_markdown_with_options(source, false, true);
    let slugs = &parsed.heading_slugs;

    let first = slugs.get("duplicate").copied();
    let second = slugs.get("duplicate-1").copied();
    assert!(first.is_some());
    assert!(second.is_some());

    let first_index = first.expect("first slug missing");
    let second_index = second.expect("second slug missing");
    assert!(first_index < second_index);
}
1222
#[test]
fn test_heading_slug_collision_with_dedup_suffix() {
    // "Foo" appears twice, and a third heading "Foo 1" would naturally collide
    // with the suffixed slug of the second "Foo". All three must end up with
    // distinct keys.
    let source = "# Foo\n\n## Foo\n\n## Foo 1";
    let parsed = parse_markdown_with_options(source, false, true);

    assert_eq!(parsed.heading_slugs.len(), 3);
    for slug in ["foo", "foo-1", "foo-1-1"] {
        assert!(parsed.heading_slugs.contains_key(slug));
    }
}
1231}