1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{collections::BTreeMap, ops::Range, sync::Arc};
8
9use collections::HashSet;
10
11use crate::{html, path_range::PathWithRange};
12
/// The `pulldown_cmark` extensions enabled for every parse performed by
/// `parse_markdown_with_options`, which passes this set to `Parser::new_ext`.
pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
24
/// Accumulator used while converting the `pulldown_cmark` event stream.
///
/// Events are appended through `push_event`, which also tracks tag nesting so
/// that top-level blocks can be bracketed with `RootStart` / `RootEnd` events.
#[derive(Default)]
struct ParseState {
    /// Every event emitted so far, paired with its byte range in the source text.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of each root block, in the order the blocks were opened.
    /// `RootEnd` events carry an index into this list.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events that have not yet been matched by an `End`.
    depth: usize,
}
31
/// The result of `parse_markdown_with_options`.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// The flattened event stream, each event paired with its byte range in the
    /// source text.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that do not contain a `/`.
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contain a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of every root block, indexed by the value carried in
    /// `MarkdownEvent::RootEnd`.
    pub root_block_starts: Vec<usize>,
    /// HTML blocks that were successfully pre-parsed, keyed by the start offset
    /// of the block. Only populated when HTML parsing was requested.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
}
41
42impl ParseState {
43 fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
44 match &event {
45 MarkdownEvent::Start(_) => {
46 if self.depth == 0 {
47 self.root_block_starts.push(range.start);
48 self.events.push((range.clone(), MarkdownEvent::RootStart));
49 }
50 self.depth += 1;
51 self.events.push((range, event));
52 }
53 MarkdownEvent::End(_) => {
54 self.events.push((range.clone(), event));
55 if self.depth > 0 {
56 self.depth -= 1;
57 if self.depth == 0 {
58 let root_block_index = self.root_block_starts.len() - 1;
59 self.events
60 .push((range, MarkdownEvent::RootEnd(root_block_index)));
61 }
62 }
63 }
64 MarkdownEvent::Rule => {
65 if self.depth == 0 && !range.is_empty() {
66 self.root_block_starts.push(range.start);
67 let root_block_index = self.root_block_starts.len() - 1;
68 self.events.push((range.clone(), MarkdownEvent::RootStart));
69 self.events.push((range.clone(), event));
70 self.events
71 .push((range, MarkdownEvent::RootEnd(root_block_index)));
72 } else {
73 self.events.push((range, event));
74 }
75 }
76 _ => {
77 self.events.push((range, event));
78 }
79 }
80 }
81}
82
/// Parses `text` as Markdown and flattens the `pulldown_cmark` event stream into
/// owned `MarkdownEvent`s paired with byte ranges into `text`.
///
/// On top of a plain event conversion this function:
/// - brackets every top-level block with `RootStart` / `RootEnd` (see `ParseState`),
/// - drops metadata blocks and math events,
/// - collects the language names and paths found in fenced code block info strings,
/// - detects bare URLs in text that is outside links and code blocks and wraps
///   them in autolink `Link` events,
/// - when `parse_html` is true, stores successfully pre-parsed HTML blocks in
///   `html_blocks` (skipping their raw events) and absorbs inline HTML events that
///   directly follow a text event into that text run as empty fragments.
pub(crate) fn parse_markdown_with_options(text: &str, parse_html: bool) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    let mut within_link = false;
    let mut within_code_block = false;
    let mut within_metadata = false;
    // Peekable so that runs of adjacent text events can be merged below.
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Everything inside a metadata block is discarded, including its end tag.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // The block was pre-parsed, so skip its raw events and
                            // emit only the matching end tag.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    // Otherwise the block's inner events are handled by the main loop.
                    continue;
                }

                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        // Remembered so that URLs in the link text are not auto-linked again.
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        within_code_block = true;
                        // Indented blocks report the whole block range as content and a
                        // fixed line count of one.
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        within_code_block = true;
                        // The helper returns a range relative to the block; shift it so
                        // that it indexes into `text`.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        // Convert the borrowed heading attributes into owned strings.
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                } else if let pulldown_cmark::TagEnd::CodeBlock = tag {
                    within_code_block = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                // Produces `Text` when the parsed string equals the source slice at
                // `range`, and `SubstitutedText` carrying the parsed string otherwise.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }

                // Code block text is passed through untouched: no merging, no link detection.
                if within_code_block {
                    let (range, event) = event_for(text, range, &parsed);
                    state.push_event(range, event);
                    continue;
                }

                // One text fragment of a merged run.
                #[derive(Debug)]
                struct TextRange<'a> {
                    // Byte range of the fragment in `text`.
                    source_range: Range<usize>,
                    // Byte range of the fragment in the merged string built below.
                    merged_range: Range<usize>,
                    // The fragment's parsed string.
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Pull in every directly following text event so that a URL split
                // across several events can still be found. With `parse_html`, inline
                // HTML events are pulled in too, contributing an empty string.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link && !within_code_block {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Emit the fragments that end before the link starts.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // If the link starts inside this fragment, emit the part before
                        // the link and keep the remainder.
                        // NOTE(review): `prefix_len` is measured in the parsed string but
                        // is also added to the source offsets, which assumes parsed and
                        // source bytes line up for the fragment being split; confirm for
                        // substituted text.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        let mut link_events = Vec::new();

                        // Collect the fragments that lie entirely inside the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // If the link ends inside the next fragment, its head belongs to
                        // the link and its tail is kept for later.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        // Emit the link as Start(Link), its text fragments, End(Link).
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Emit whatever is left after the last link (or everything, if link
                // detection was skipped).
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Narrow the range to the code between the backtick delimiters.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(_) => {
                state.push_event(range, MarkdownEvent::FootnoteReference)
            }
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events are ignored.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
    }
}
451
452pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
453 let mut events = Vec::new();
454 let mut finder = LinkFinder::new();
455 finder.kinds(&[linkify::LinkKind::Url]);
456 let mut text_range = Range {
457 start: 0,
458 end: text.len(),
459 };
460 for link in finder.links(text) {
461 let link_range = link.start()..link.end();
462
463 if link_range.start > text_range.start {
464 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
465 }
466
467 events.push((
468 link_range.clone(),
469 MarkdownEvent::Start(MarkdownTag::Link {
470 link_type: LinkType::Autolink,
471 dest_url: SharedString::from(link.as_str().to_string()),
472 title: SharedString::default(),
473 id: SharedString::default(),
474 }),
475 ));
476 events.push((link_range.clone(), MarkdownEvent::Text));
477 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
478
479 text_range.start = link_range.end;
480 }
481
482 if text_range.end > text_range.start {
483 events.push((text_range, MarkdownEvent::Text));
484 }
485
486 events
487}
488
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are stored alongside a byte range into the markdown source; most
/// variants carry no payload because their content is read from that range.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(String),
    /// An inline code node. The parser narrows the associated range to the
    /// code between the backtick delimiters.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    /// Emitted immediately before the block's own first event.
    RootStart,
    /// End of a root-level block. Contains the root block index, i.e. the
    /// block's position in the list of root block start offsets.
    RootEnd(usize),
}
527
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The parser does not preserve the block quote kind
    /// reported by `pulldown_cmark`.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,
    /// Superscript text.
    Superscript,
    /// Subscript text.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block. The parser in this file skips metadata blocks instead
    /// of emitting this tag.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// The definition belonging to a definition list title.
    DefinitionListDefinition,
}
612
/// How a code block was written in the markdown source.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block created by indentation rather than by fences.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is used when the info string after the fence is empty.
    Fenced,
    /// A fenced block whose info string contains no `/`; the string is taken
    /// as the language name.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/`; the string is parsed
    /// as a path with an optional range.
    FencedSrc(PathWithRange),
}
624
/// Extra information recorded for every code block.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content in the markdown source. For fenced
    /// blocks this excludes the fence lines; for indented blocks it is the
    /// range of the whole block.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`;
    /// for indented blocks the parser always stores 1.
    pub line_count: usize,
}
630
/// Returns the byte range of the code inside an inline code span.
///
/// `text` is the full span including its backtick delimiters. When the span
/// starts and ends with backtick runs of the same non-zero length that do not
/// overlap, the range between the two runs is returned. In every other case
/// (no delimiters, mismatched delimiters, or a string made only of backticks)
/// the whole of `text` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    fn is_backtick(byte: &&u8) -> bool {
        **byte == b'`'
    }

    // A backtick is a single ASCII byte, so counting bytes equals counting chars.
    let bytes = text.as_bytes();
    let total = bytes.len();
    let opening = bytes.iter().take_while(is_backtick).count();
    let closing = bytes.iter().rev().take_while(is_backtick).count();

    let delimited = opening > 0 && opening == closing && opening + closing <= total;
    if delimited {
        opening..total - closing
    } else {
        0..total
    }
}
651
/// Returns the byte range, relative to `text`, of the code inside a fenced
/// code block.
///
/// `text` is expected to be the source of a single fenced code block starting
/// at its opening fence. A fence is a run of at least three backticks or at
/// least three tildes.
///
/// - If `text` starts with a fence, the range starts after the first newline
///   (i.e. after the fence and its info string). If the opening line has no
///   newline, the range starts right after the first three fence characters.
/// - If `text` ends with a run of the same fence character that is at least
///   three characters long and at least as long as the opening fence, that
///   whole run is excluded. A shorter run is content of a block that has not
///   been closed yet and is kept.
/// - If `text` does not start with a fence, the whole text is returned, minus
///   a trailing run of three or more backticks.
///
/// The returned range is never inverted: when both rules overlap, an empty
/// range at the start position is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const MIN_FENCE_LEN: usize = 3;

    let bytes = text.as_bytes();
    let leading_run = |fence: u8| bytes.iter().take_while(|&&byte| byte == fence).count();

    // Work out which fence character opens the block, if any. Without an opening
    // fence, backticks are still assumed for the closing check.
    let tilde_run = leading_run(b'~');
    let backtick_run = leading_run(b'`');
    let (fence_byte, opening_len) = if tilde_run >= MIN_FENCE_LEN {
        (b'~', tilde_run)
    } else if backtick_run >= MIN_FENCE_LEN {
        (b'`', backtick_run)
    } else {
        (b'`', 0)
    };

    let mut range = 0..text.len();
    if opening_len >= MIN_FENCE_LEN {
        // The first three bytes are ASCII fence characters, so this is a char
        // boundary. Any further fence characters and the info string sit before
        // the first newline and are skipped together with it.
        range.start = MIN_FENCE_LEN;
        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    // A closing fence must use the opening character and be at least as long.
    let closing_len = bytes
        .iter()
        .rev()
        .take_while(|&&byte| byte == fence_byte)
        .count();
    if !range.is_empty() && closing_len >= opening_len.max(MIN_FENCE_LEN) {
        range.end = text.len() - closing_len;
    }

    // The closing run may reach back into the opening fence (e.g. "`````").
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
670
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    // Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST)
        .union(Options::ENABLE_WIKILINKS);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    /// An indented HTML comment forms one HTML root block, followed by a paragraph.
    #[test]
    fn test_html_comments() {
        // The comment is indented by two spaces: the block is asserted to start
        // at offset 2.
        assert_eq!(
            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false),
            ParsedMarkdownData {
                events: vec![
                    (2..30, RootStart),
                    (2..30, Start(HtmlBlock)),
                    // Synthesized indentation with an empty source range; two spaces is
                    // inferred from the input's indent, not pinned by an offset.
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (2..30, RootEnd(0)),
                    (30..37, RootStart),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph)),
                    (30..37, RootEnd(1)),
                ],
                root_block_starts: vec![2, 30],
                ..Default::default()
            }
        )
    }

    /// Entities become substituted text, bare URLs become autolinks and escaped
    /// characters are emitted without their backslash.
    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The entities must be spelled out in the input: `&nbsp;` occupies 6 source
        // bytes and `&#9658;` occupies 7, matching the asserted ranges.
        assert_eq!(
            parse_markdown_with_options(
                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
                false
            ),
            ParsedMarkdownData {
                events: vec![
                    (0..51, RootStart),
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph)),
                    (0..51, RootEnd(0)),
                ],
                root_block_starts: vec![0],
                ..Default::default()
            }
        );
    }

    /// An unterminated markdown link is plain text, but the URL inside it is
    /// still auto-linked.
    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown_with_options(
                "You can use the [GitHub Search API](https://docs.github.com/en",
                false
            )
            .events,
            vec![
                (0..62, RootStart),
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph)),
                (0..62, RootEnd(0)),
            ],
        );
    }

    /// Smart punctuation is reported as substituted text over the original ranges.
    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown_with_options(
                "-- --- ... \"double quoted\" 'single quoted' ----------",
                false
            ),
            ParsedMarkdownData {
                events: vec![
                    (0..53, RootStart),
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())),
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph)),
                    (0..53, RootEnd(0)),
                ],
                root_block_starts: vec![0],
                ..Default::default()
            }
        )
    }

    /// Content range and line count are recorded for fenced and indented blocks.
    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false),
            ParsedMarkdownData {
                events: vec![
                    (0..37, RootStart),
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                    (0..37, RootEnd(0)),
                ],
                language_names: {
                    let mut h = HashSet::default();
                    h.insert("rust".into());
                    h
                },
                root_block_starts: vec![0],
                ..Default::default()
            }
        );
        // Four leading spaces are required for an indented code block; the block
        // is asserted to start at offset 4.
        assert_eq!(
            parse_markdown_with_options("    fn main() {}", false),
            ParsedMarkdownData {
                events: vec![
                    (4..16, RootStart),
                    (
                        4..16,
                        Start(CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: 4..16,
                                line_count: 1
                            }
                        })
                    ),
                    (4..16, Text),
                    (4..16, End(MarkdownTagEnd::CodeBlock)),
                    (4..16, RootEnd(0)),
                ],
                root_block_starts: vec![4],
                ..Default::default()
            }
        );
    }

    /// Asserts that `markdown` parses to exactly one code block that contains
    /// text and no link events.
    fn assert_code_block_does_not_emit_links(markdown: &str) {
        let parsed = parse_markdown_with_options(markdown, false);
        let mut code_block_depth = 0;
        let mut code_block_count = 0;
        let mut saw_text_inside_code_block = false;

        for (_, event) in &parsed.events {
            match event {
                Start(CodeBlock { .. }) => {
                    code_block_depth += 1;
                    code_block_count += 1;
                }
                End(MarkdownTagEnd::CodeBlock) => {
                    assert!(
                        code_block_depth > 0,
                        "encountered a code block end without a matching start"
                    );
                    code_block_depth -= 1;
                }
                Start(Link { .. }) | End(MarkdownTagEnd::Link) => {
                    assert_eq!(
                        code_block_depth, 0,
                        "code blocks should not emit link events"
                    );
                }
                Text | SubstitutedText(_) if code_block_depth > 0 => {
                    saw_text_inside_code_block = true;
                }
                _ => {}
            }
        }

        assert_eq!(code_block_count, 1, "expected exactly one code block");
        assert_eq!(code_block_depth, 0, "unterminated code block");
        assert!(
            saw_text_inside_code_block,
            "expected text inside the code block"
        );
    }

    #[test]
    fn test_code_blocks_do_not_autolink_urls() {
        assert_code_block_does_not_emit_links("```txt\nhttps://example.com\n```");
        // Indented code blocks need four leading spaces on every line.
        assert_code_block_does_not_emit_links("    https://example.com");
        assert_code_block_does_not_emit_links(
            "```txt\r\nhttps:/\\/example.com\r\nhttps://example.com\r\n```",
        );
        assert_code_block_does_not_emit_links(
            "    https:/\\/example.com\r\n    https://example.com",
        );
    }

    /// A `+++` metadata block produces no events and no root block.
    #[test]
    fn test_metadata_blocks_do_not_affect_root_blocks() {
        assert_eq!(
            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false),
            ParsedMarkdownData {
                events: vec![
                    (27..36, RootStart),
                    (27..36, Start(Paragraph)),
                    (27..36, Text),
                    (27..36, End(MarkdownTagEnd::Paragraph)),
                    (27..36, RootEnd(0)),
                ],
                root_block_starts: vec![27],
                ..Default::default()
            }
        );
    }

    /// `[x]` / `[ ]` inside table cells stay plain text.
    #[test]
    fn test_table_checkboxes_remain_text_in_cells() {
        let markdown = "\
| Done | Task |
|------|---------|
| [x] | Fix bug |
| [ ] | Add feature |";
        let parsed = parse_markdown_with_options(markdown, false);

        let mut in_table = false;
        let mut saw_task_list_marker = false;
        let mut cell_texts = Vec::new();
        let mut current_cell = String::new();

        for (range, event) in &parsed.events {
            match event {
                Start(Table(_)) => in_table = true,
                End(MarkdownTagEnd::Table) => in_table = false,
                Start(TableCell) => current_cell.clear(),
                End(MarkdownTagEnd::TableCell) => {
                    if in_table {
                        cell_texts.push(current_cell.clone());
                    }
                }
                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
                TaskListMarker(_) if in_table => saw_task_list_marker = true,
                _ => {}
            }
        }

        let checkbox_cells: Vec<&str> = cell_texts
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
            .collect();

        assert!(
            !saw_task_list_marker,
            "Table checkboxes should remain text, not task-list markers"
        );
        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
    }

    #[test]
    fn test_extract_code_content_range() {
        let input = "```let x = 5;```";
        assert_eq!(extract_code_content_range(input), 3..13);

        let input = "``let x = 5;``";
        assert_eq!(extract_code_content_range(input), 2..12);

        let input = "`let x = 5;`";
        assert_eq!(extract_code_content_range(input), 1..11);

        let input = "plain text";
        assert_eq!(extract_code_content_range(input), 0..10);

        // Mismatched delimiters: the whole input is returned.
        let input = "``let x = 5;`";
        assert_eq!(extract_code_content_range(input), 0..13);
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        //
        // The second URL spells its dot as the 5-byte entity `&#46;`, which is what
        // the asserted `53..58` substituted range refers to.
        assert_eq!(
            parse_markdown_with_options(
                "https:/\\/example.com is equivalent to https://example&#46;com!",
                false
            )
            .events,
            vec![
                (0..62, RootStart),
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph)),
                (0..62, RootEnd(0)),
            ],
        );

        // The zero-width joiner is written as the 7-byte entity `&#8205;`, matching
        // the asserted `33..40` substituted range.
        assert_eq!(
            parse_markdown_with_options(
                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
                false
            )
            .events,
            [
                (0..55, RootStart),
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph)),
                (0..55, RootEnd(0)),
            ]
        );
    }
}