1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{collections::BTreeMap, ops::Range, sync::Arc};
8
9use collections::HashSet;
10
11use crate::{html, path_range::PathWithRange};
12
/// The `pulldown_cmark` extensions enabled for every parse performed by this module.
///
/// The unit tests in this file check that each option `pulldown_cmark` offers is either
/// part of this set or explicitly listed there as unwanted.
pub const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM)
    .union(Options::ENABLE_SUPERSCRIPT)
    .union(Options::ENABLE_SUBSCRIPT);
24
/// Accumulator used while translating parser events: collects the emitted events and
/// tracks where root-level blocks begin.
#[derive(Default)]
struct ParseState {
    /// Events emitted so far, each paired with a byte range.
    events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Start offset of every root-level block, in the order the blocks were opened.
    root_block_starts: Vec<usize>,
    /// Number of `Start` events not yet matched by an `End` event; `0` means root level.
    depth: usize,
}
31
/// The result of parsing a Markdown document with `parse_markdown_with_options`.
#[derive(Debug, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct ParsedMarkdownData {
    /// All emitted events, each paired with a byte range into the parsed text.
    pub events: Vec<(Range<usize>, MarkdownEvent)>,
    /// Info strings of fenced code blocks that were treated as language names.
    pub language_names: HashSet<SharedString>,
    /// Paths taken from fenced code block info strings that contained a `/`.
    pub language_paths: HashSet<Arc<str>>,
    /// Start offset of every root-level block, in document order.
    pub root_block_starts: Vec<usize>,
    /// Successfully parsed HTML blocks, keyed by the start offset of the block.
    pub html_blocks: BTreeMap<usize, html::html_parser::ParsedHtmlBlock>,
}
41
42impl ParseState {
43 fn push_event(&mut self, range: Range<usize>, event: MarkdownEvent) {
44 match &event {
45 MarkdownEvent::Start(_) => {
46 if self.depth == 0 {
47 self.root_block_starts.push(range.start);
48 self.events.push((range.clone(), MarkdownEvent::RootStart));
49 }
50 self.depth += 1;
51 self.events.push((range, event));
52 }
53 MarkdownEvent::End(_) => {
54 self.events.push((range.clone(), event));
55 if self.depth > 0 {
56 self.depth -= 1;
57 if self.depth == 0 {
58 let root_block_index = self.root_block_starts.len() - 1;
59 self.events
60 .push((range, MarkdownEvent::RootEnd(root_block_index)));
61 }
62 }
63 }
64 MarkdownEvent::Rule => {
65 if self.depth == 0 && !range.is_empty() {
66 self.root_block_starts.push(range.start);
67 let root_block_index = self.root_block_starts.len() - 1;
68 self.events.push((range.clone(), MarkdownEvent::RootStart));
69 self.events.push((range.clone(), event));
70 self.events
71 .push((range, MarkdownEvent::RootEnd(root_block_index)));
72 } else {
73 self.events.push((range, event));
74 }
75 }
76 _ => {
77 self.events.push((range, event));
78 }
79 }
80 }
81}
82
/// Parses `text` as Markdown (using `PARSE_OPTIONS`) and converts the `pulldown_cmark`
/// event stream into owned `MarkdownEvent`s paired with byte ranges into `text`.
///
/// Besides the events, the result records:
/// * the info strings of fenced code blocks, as language names, or as paths when the
///   info string contains a `/`;
/// * the start offset of every root-level block;
/// * when `parse_html` is `true`, the HTML blocks that
///   `html::html_parser::parse_html_block` was able to parse, keyed by start offset.
///
/// Metadata blocks and math events are dropped. URLs that `linkify` finds in text
/// outside of a Markdown link are wrapped in synthetic `LinkType::Autolink` link events.
pub(crate) fn parse_markdown_with_options(text: &str, parse_html: bool) -> ParsedMarkdownData {
    let mut state = ParseState::default();
    let mut language_names = HashSet::default();
    let mut language_paths = HashSet::default();
    let mut html_blocks = BTreeMap::default();
    let mut within_link = false;
    let mut within_metadata = false;
    let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
        .into_offset_iter()
        .peekable();
    while let Some((pulldown_event, range)) = parser.next() {
        // Everything inside a metadata block is skipped, including its closing tag.
        if within_metadata {
            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
                pulldown_event
            {
                within_metadata = false;
            }
            continue;
        }
        match pulldown_event {
            pulldown_cmark::Event::Start(tag) => {
                if let pulldown_cmark::Tag::HtmlBlock = &tag {
                    state.push_event(range.clone(), MarkdownEvent::Start(MarkdownTag::HtmlBlock));

                    if parse_html {
                        if let Some(block) =
                            html::html_parser::parse_html_block(&text[range.clone()], range.clone())
                        {
                            html_blocks.insert(range.start, block);

                            // The block was parsed as a whole, so its inner events are
                            // discarded; only the closing tag is forwarded.
                            while let Some((event, end_range)) = parser.next() {
                                if let pulldown_cmark::Event::End(
                                    pulldown_cmark::TagEnd::HtmlBlock,
                                ) = event
                                {
                                    state.push_event(
                                        end_range,
                                        MarkdownEvent::End(MarkdownTagEnd::HtmlBlock),
                                    );
                                    break;
                                }
                            }
                        }
                    }
                    continue;
                }

                let tag = match tag {
                    pulldown_cmark::Tag::Link {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => {
                        // Suppresses URL auto-detection for text inside this link.
                        within_link = true;
                        MarkdownTag::Link {
                            link_type,
                            dest_url: SharedString::from(dest_url.into_string()),
                            title: SharedString::from(title.into_string()),
                            id: SharedString::from(id.into_string()),
                        }
                    }
                    pulldown_cmark::Tag::MetadataBlock(_kind) => {
                        within_metadata = true;
                        continue;
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
                        MarkdownTag::CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: range.clone(),
                                // Indented blocks always report a single line here.
                                line_count: 1,
                            },
                        }
                    }
                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
                        ref info,
                    )) => {
                        // The helper returns a range relative to the block; shift it so
                        // that it indexes into `text`.
                        let content_range = extract_code_block_content_range(&text[range.clone()]);
                        let content_range =
                            content_range.start + range.start..content_range.end + range.start;

                        // Valid to use bytes since multi-byte UTF-8 doesn't use ASCII chars.
                        let line_count = text[content_range.clone()]
                            .bytes()
                            .filter(|c| *c == b'\n')
                            .count();
                        let metadata = CodeBlockMetadata {
                            content_range,
                            line_count,
                        };

                        let info = info.trim();
                        let kind = if info.is_empty() {
                            CodeBlockKind::Fenced
                            // Languages should never contain a slash, and PathRanges always should.
                            // (Models are told to specify them relative to a workspace root.)
                        } else if info.contains('/') {
                            let path_range = PathWithRange::new(info);
                            language_paths.insert(path_range.path.clone());
                            CodeBlockKind::FencedSrc(path_range)
                        } else {
                            let language = SharedString::from(info.to_string());
                            language_names.insert(language.clone());
                            CodeBlockKind::FencedLang(language)
                        };

                        MarkdownTag::CodeBlock { kind, metadata }
                    }
                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
                    pulldown_cmark::Tag::Heading {
                        level,
                        id,
                        classes,
                        attrs,
                    } => {
                        let id = id.map(|id| SharedString::from(id.into_string()));
                        let classes = classes
                            .into_iter()
                            .map(|c| SharedString::from(c.into_string()))
                            .collect();
                        let attrs = attrs
                            .into_iter()
                            .map(|(key, value)| {
                                (
                                    SharedString::from(key.into_string()),
                                    value.map(|v| SharedString::from(v.into_string())),
                                )
                            })
                            .collect();
                        MarkdownTag::Heading {
                            level,
                            id,
                            classes,
                            attrs,
                        }
                    }
                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
                    }
                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
                    pulldown_cmark::Tag::Superscript => MarkdownTag::Superscript,
                    pulldown_cmark::Tag::Subscript => MarkdownTag::Subscript,
                    pulldown_cmark::Tag::Image {
                        link_type,
                        dest_url,
                        title,
                        id,
                    } => MarkdownTag::Image {
                        link_type,
                        dest_url: SharedString::from(dest_url.into_string()),
                        title: SharedString::from(title.into_string()),
                        id: SharedString::from(id.into_string()),
                    },
                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock, // this is handled above separately
                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
                    pulldown_cmark::Tag::DefinitionListDefinition => {
                        MarkdownTag::DefinitionListDefinition
                    }
                };
                state.push_event(range, MarkdownEvent::Start(tag))
            }
            pulldown_cmark::Event::End(tag) => {
                if let pulldown_cmark::TagEnd::Link = tag {
                    within_link = false;
                }
                state.push_event(range, MarkdownEvent::End(tag));
            }
            pulldown_cmark::Event::Text(parsed) => {
                // Emits `Text` when the parsed string equals the source slice for `range`,
                // and `SubstitutedText` carrying the parsed string otherwise.
                fn event_for(
                    text: &str,
                    range: Range<usize>,
                    str: &str,
                ) -> (Range<usize>, MarkdownEvent) {
                    if str == &text[range.clone()] {
                        (range, MarkdownEvent::Text)
                    } else {
                        (range, MarkdownEvent::SubstitutedText(str.to_owned()))
                    }
                }
                // One parser text fragment: where it sits in the source, where it sits in
                // the concatenation of all fragments of this run, and its parsed content.
                #[derive(Debug)]
                struct TextRange<'a> {
                    source_range: Range<usize>,
                    merged_range: Range<usize>,
                    parsed: CowStr<'a>,
                }

                let mut last_len = parsed.len();
                let mut ranges = vec![TextRange {
                    source_range: range.clone(),
                    merged_range: 0..last_len,
                    parsed,
                }];

                // Gather the run of directly following text events so that a URL split
                // across several fragments can still be detected. With `parse_html`,
                // inline HTML inside the run is gathered too, contributing an empty string.
                while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _)))
                    || (parse_html
                        && matches!(
                            parser.peek(),
                            Some((pulldown_cmark::Event::InlineHtml(_), _))
                        ))
                {
                    let Some((next_event, next_range)) = parser.next() else {
                        unreachable!()
                    };
                    let next_text = match next_event {
                        pulldown_cmark::Event::Text(next_event) => next_event,
                        pulldown_cmark::Event::InlineHtml(_) => CowStr::Borrowed(""),
                        _ => unreachable!(),
                    };
                    let next_len = last_len + next_text.len();
                    ranges.push(TextRange {
                        source_range: next_range.clone(),
                        merged_range: last_len..next_len,
                        parsed: next_text,
                    });
                    last_len = next_len;
                }

                let mut merged_text =
                    String::with_capacity(ranges.last().unwrap().merged_range.end);
                for range in &ranges {
                    merged_text.push_str(&range.parsed);
                }

                let mut ranges = ranges.into_iter().peekable();

                if !within_link {
                    let mut finder = LinkFinder::new();
                    finder.kinds(&[linkify::LinkKind::Url]);

                    // Find links in the merged text
                    for link in finder.links(&merged_text) {
                        let link_start_in_merged = link.start();
                        let link_end_in_merged = link.end();

                        // Fragments that end before the link starts are emitted unchanged.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            let (range, event) = event_for(text, range.source_range, &range.parsed);
                            state.push_event(range, event);
                        }

                        let Some(range) = ranges.peek_mut() else {
                            continue;
                        };
                        // The link starts inside this fragment: emit the part before the
                        // link and keep the remainder for the link itself.
                        // NOTE(review): the source range is advanced by a length measured
                        // in the parsed text, which presumes parsed and source bytes line
                        // up for this fragment — confirm for substituted text.
                        let prefix_len = link_start_in_merged - range.merged_range.start;
                        if prefix_len > 0 {
                            let (head, tail) = range.parsed.split_at(prefix_len);
                            let (event_range, event) = event_for(
                                text,
                                range.source_range.start..range.source_range.start + prefix_len,
                                head,
                            );
                            state.push_event(event_range, event);
                            range.parsed = CowStr::Boxed(tail.into());
                            range.merged_range.start += prefix_len;
                            range.source_range.start += prefix_len;
                        }

                        let link_start_in_source = range.source_range.start;
                        let mut link_end_in_source = range.source_range.end;
                        let mut link_events = Vec::new();

                        // Fragments that lie entirely inside the link.
                        while ranges
                            .peek()
                            .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
                        {
                            let range = ranges.next().unwrap();
                            link_end_in_source = range.source_range.end;
                            link_events.push(event_for(text, range.source_range, &range.parsed));
                        }

                        // The link ends inside the next fragment: its head belongs to the
                        // link, its tail stays queued for later processing.
                        if let Some(range) = ranges.peek_mut() {
                            let prefix_len = link_end_in_merged - range.merged_range.start;
                            if prefix_len > 0 {
                                let (head, tail) = range.parsed.split_at(prefix_len);
                                link_events.push(event_for(
                                    text,
                                    range.source_range.start..range.source_range.start + prefix_len,
                                    head,
                                ));
                                range.parsed = CowStr::Boxed(tail.into());
                                range.merged_range.start += prefix_len;
                                range.source_range.start += prefix_len;
                                link_end_in_source = range.source_range.start;
                            }
                        }
                        let link_range = link_start_in_source..link_end_in_source;

                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::Start(MarkdownTag::Link {
                                link_type: LinkType::Autolink,
                                dest_url: SharedString::from(link.as_str().to_string()),
                                title: SharedString::default(),
                                id: SharedString::default(),
                            }),
                        );
                        for (range, event) in link_events {
                            state.push_event(range, event);
                        }
                        state.push_event(
                            link_range.clone(),
                            MarkdownEvent::End(MarkdownTagEnd::Link),
                        );
                    }
                }

                // Whatever remains after the last link (or everything, inside a link).
                for range in ranges {
                    let (range, event) = event_for(text, range.source_range, &range.parsed);
                    state.push_event(range, event);
                }
            }
            pulldown_cmark::Event::Code(_) => {
                // Report the range of the code itself, without the backtick delimiters.
                let content_range = extract_code_content_range(&text[range.clone()]);
                let content_range =
                    content_range.start + range.start..content_range.end + range.start;
                state.push_event(content_range, MarkdownEvent::Code)
            }
            pulldown_cmark::Event::Html(_) => state.push_event(range, MarkdownEvent::Html),
            pulldown_cmark::Event::InlineHtml(_) => {
                state.push_event(range, MarkdownEvent::InlineHtml)
            }
            pulldown_cmark::Event::FootnoteReference(_) => {
                state.push_event(range, MarkdownEvent::FootnoteReference)
            }
            pulldown_cmark::Event::SoftBreak => state.push_event(range, MarkdownEvent::SoftBreak),
            pulldown_cmark::Event::HardBreak => state.push_event(range, MarkdownEvent::HardBreak),
            pulldown_cmark::Event::Rule => state.push_event(range, MarkdownEvent::Rule),
            pulldown_cmark::Event::TaskListMarker(checked) => {
                state.push_event(range, MarkdownEvent::TaskListMarker(checked))
            }
            // Math events are ignored.
            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
        }
    }

    ParsedMarkdownData {
        events: state.events,
        language_names,
        language_paths,
        root_block_starts: state.root_block_starts,
        html_blocks,
    }
}
439
440pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
441 let mut events = Vec::new();
442 let mut finder = LinkFinder::new();
443 finder.kinds(&[linkify::LinkKind::Url]);
444 let mut text_range = Range {
445 start: 0,
446 end: text.len(),
447 };
448 for link in finder.links(text) {
449 let link_range = link.start()..link.end();
450
451 if link_range.start > text_range.start {
452 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
453 }
454
455 events.push((
456 link_range.clone(),
457 MarkdownEvent::Start(MarkdownTag::Link {
458 link_type: LinkType::Autolink,
459 dest_url: SharedString::from(link.as_str().to_string()),
460 title: SharedString::default(),
461 id: SharedString::default(),
462 }),
463 ));
464 events.push((link_range.clone(), MarkdownEvent::Text));
465 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
466
467 text_range.start = link_range.end;
468 }
469
470 if text_range.end > text_range.start {
471 events.push((text_range, MarkdownEvent::Text));
472 }
473
474 events
475}
476
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events do not carry their text; they are stored next to a byte range into the
/// Markdown source, with `SubstitutedText` being the exception that owns its content.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(String),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
    /// Start of a root-level block (a top-level structural element like a paragraph, heading, list, etc.).
    RootStart,
    /// End of a root-level block. Contains the root block index.
    RootEnd(usize),
}
515
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,
    /// Superscript text.
    Superscript,
    /// Subscript text.
    Subscript,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block. `parse_markdown_with_options` in this file skips metadata
    /// blocks instead of emitting this tag.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of an entry in a definition list.
    DefinitionListTitle,
    /// A definition belonging to a definition list title.
    DefinitionListDefinition,
}
600
/// How a code block was written in the source and what, if anything, its info string named.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block created by indentation rather than by fences.
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is a fenced block whose info string is empty.
    Fenced,
    /// A fenced block whose info string contains no `/`; holds the trimmed info string.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/`; holds the parsed path and range.
    FencedSrc(PathWithRange),
}
612
/// Extra information computed for a code block while parsing.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the parsed text. For fenced blocks the
    /// fence lines are excluded; for indented blocks this is the block's own range.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`; indented
    /// blocks always report `1`.
    pub line_count: usize,
}
618
/// Returns the byte range of the content of an inline code span, relative to `text`.
///
/// `text` is the full source of the span including its backtick delimiters. The
/// delimiters are stripped only when the span opens and closes with the same number of
/// backticks and the two runs do not overlap; otherwise the whole of `text` is returned.
fn extract_code_content_range(text: &str) -> Range<usize> {
    fn is_backtick(byte: &&u8) -> bool {
        **byte == b'`'
    }

    let whole = 0..text.len();
    // A backtick is ASCII, so counting bytes is equivalent to counting characters.
    let bytes = text.as_bytes();

    let opening = bytes.iter().take_while(is_backtick).count();
    if opening == 0 {
        return whole;
    }

    let closing = bytes.iter().rev().take_while(is_backtick).count();
    if opening == closing && opening + closing <= text.len() {
        opening..text.len() - closing
    } else {
        whole
    }
}
639
/// Returns the byte range of the code inside a fenced code block, relative to `text`.
///
/// `text` is the source of the whole block, starting at the opening fence. A fence is a
/// run of at least three backticks or three tildes. The returned range starts after the
/// first line (the opening fence and its info string) and ends before the closing
/// fence, when there is one.
///
/// A trailing run of fence characters counts as the closing fence only if it is at
/// least as long as the opening fence, so that longer fences (e.g. four backticks
/// wrapping a block that itself contains triple backticks) are stripped completely and
/// shorter runs are kept as content. When `text` does not start with a fence, a
/// trailing run of three or more backticks is still stripped.
///
/// The result is never an inverted range: if stripping would move the end before the
/// start, an empty range at the start position is returned.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const MIN_FENCE_LEN: usize = 3;

    let bytes = text.as_bytes();
    // Fence characters are ASCII, so byte-wise counting never splits a character.
    let opening = bytes
        .first()
        .copied()
        .filter(|byte| matches!(byte, b'`' | b'~'))
        .map(|byte| (byte, bytes.iter().take_while(|b| **b == byte).count()))
        .filter(|(_, len)| *len >= MIN_FENCE_LEN);

    let mut range = 0..text.len();
    if opening.is_some() {
        // Skipping the minimum fence length is enough: when the first line ends in a
        // newline the start jumps past it regardless of how long the fence is, and
        // without a newline this keeps the start where callers have always seen it.
        range.start += MIN_FENCE_LEN;

        if let Some(newline_ix) = text[range.clone()].find('\n') {
            range.start += newline_ix + 1;
        }
    }

    let (fence_byte, min_closing_len) = opening.unwrap_or((b'`', MIN_FENCE_LEN));
    let closing_len = bytes
        .iter()
        .rev()
        .take_while(|b| **b == fence_byte)
        .count();
    if !range.is_empty() && closing_len >= min_closing_len {
        range.end -= closing_len;
    }
    if range.start > range.end {
        range.end = range.start;
    }
    range
}
658
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    // Options that `pulldown_cmark` offers but that are deliberately left disabled.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST)
        .union(Options::ENABLE_WIKILINKS);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    // An HTML comment indented by two spaces: the block starts at offset 2, and the
    // indentation shows up as an empty-range `SubstitutedText`.
    #[test]
    fn test_html_comments() {
        assert_eq!(
            parse_markdown_with_options("  <!--\nrdoc-file=string.c\n-->\nReturns", false),
            ParsedMarkdownData {
                events: vec![
                    (2..30, RootStart),
                    (2..30, Start(HtmlBlock)),
                    (2..2, SubstitutedText("  ".into())),
                    (2..7, Html),
                    (7..26, Html),
                    (26..30, Html),
                    (2..30, End(MarkdownTagEnd::HtmlBlock)),
                    (2..30, RootEnd(0)),
                    (30..37, RootStart),
                    (30..37, Start(Paragraph)),
                    (30..37, Text),
                    (30..37, End(MarkdownTagEnd::Paragraph)),
                    (30..37, RootEnd(1)),
                ],
                root_block_starts: vec![2, 30],
                ..Default::default()
            }
        )
    }

    // Character references (`&nbsp;`, `&#9658;`) become `SubstitutedText`, escaped
    // backticks become plain text without their backslash, and the bare URL is linked.
    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown_with_options(
                "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text",
                false
            ),
            ParsedMarkdownData {
                events: vec![
                    (0..51, RootStart),
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph)),
                    (0..51, RootEnd(0)),
                ],
                root_block_starts: vec![0],
                ..Default::default()
            }
        );
    }

    // An unterminated inline link is not a Markdown link, so its URL is auto-linked.
    #[test]
    fn test_incomplete_link() {
        assert_eq!(
            parse_markdown_with_options(
                "You can use the [GitHub Search API](https://docs.github.com/en",
                false
            )
            .events,
            vec![
                (0..62, RootStart),
                (0..62, Start(Paragraph)),
                (0..16, Text),
                (16..17, Text),
                (17..34, Text),
                (34..35, Text),
                (35..36, Text),
                (
                    36..62,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://docs.github.com/en".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (36..62, Text),
                (36..62, End(MarkdownTagEnd::Link)),
                (0..62, End(MarkdownTagEnd::Paragraph)),
                (0..62, RootEnd(0)),
            ],
        );
    }

    // Dashes, ellipses and quotes are replaced by their typographic forms.
    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown_with_options(
                "-- --- ... \"double quoted\" 'single quoted' ----------",
                false
            ),
            ParsedMarkdownData {
                events: vec![
                    (0..53, RootStart),
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())),
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph)),
                    (0..53, RootEnd(0)),
                ],
                root_block_starts: vec![0],
                ..Default::default()
            }
        )
    }

    // Content range and line count for a fenced block, then for a block indented by
    // four spaces.
    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown_with_options("```rust\nfn main() {\n let a = 1;\n}\n```", false),
            ParsedMarkdownData {
                events: vec![
                    (0..37, RootStart),
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                    (0..37, RootEnd(0)),
                ],
                language_names: {
                    let mut h = HashSet::default();
                    h.insert("rust".into());
                    h
                },
                root_block_starts: vec![0],
                ..Default::default()
            }
        );
        assert_eq!(
            parse_markdown_with_options("    fn main() {}", false),
            ParsedMarkdownData {
                events: vec![
                    (4..16, RootStart),
                    (
                        4..16,
                        Start(CodeBlock {
                            kind: CodeBlockKind::Indented,
                            metadata: CodeBlockMetadata {
                                content_range: 4..16,
                                line_count: 1
                            }
                        })
                    ),
                    (4..16, Text),
                    (4..16, End(MarkdownTagEnd::CodeBlock)),
                    (4..16, RootEnd(0)),
                ],
                root_block_starts: vec![4],
                ..Default::default()
            }
        );
    }

    // A `+++` metadata block produces no events and no root block.
    #[test]
    fn test_metadata_blocks_do_not_affect_root_blocks() {
        assert_eq!(
            parse_markdown_with_options("+++\ntitle = \"Example\"\n+++\n\nParagraph", false),
            ParsedMarkdownData {
                events: vec![
                    (27..36, RootStart),
                    (27..36, Start(Paragraph)),
                    (27..36, Text),
                    (27..36, End(MarkdownTagEnd::Paragraph)),
                    (27..36, RootEnd(0)),
                ],
                root_block_starts: vec![27],
                ..Default::default()
            }
        );
    }

    // `[x]` / `[ ]` inside table cells must stay text rather than becoming task-list markers.
    #[test]
    fn test_table_checkboxes_remain_text_in_cells() {
        let markdown = "\
| Done | Task |
|------|---------|
| [x] | Fix bug |
| [ ] | Add feature |";
        let parsed = parse_markdown_with_options(markdown, false);

        let mut in_table = false;
        let mut saw_task_list_marker = false;
        let mut cell_texts = Vec::new();
        let mut current_cell = String::new();

        // Collect the source text of every table cell.
        for (range, event) in &parsed.events {
            match event {
                Start(Table(_)) => in_table = true,
                End(MarkdownTagEnd::Table) => in_table = false,
                Start(TableCell) => current_cell.clear(),
                End(MarkdownTagEnd::TableCell) => {
                    if in_table {
                        cell_texts.push(current_cell.clone());
                    }
                }
                Text if in_table => current_cell.push_str(&markdown[range.clone()]),
                TaskListMarker(_) if in_table => saw_task_list_marker = true,
                _ => {}
            }
        }

        let checkbox_cells: Vec<&str> = cell_texts
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| *cell == "[x]" || *cell == "[X]" || *cell == "[ ]")
            .collect();

        assert!(
            !saw_task_list_marker,
            "Table checkboxes should remain text, not task-list markers"
        );
        assert_eq!(checkbox_cells, vec!["[x]", "[ ]"]);
    }

    #[test]
    fn test_extract_code_content_range() {
        let input = "```let x = 5;```";
        assert_eq!(extract_code_content_range(input), 3..13);

        let input = "``let x = 5;``";
        assert_eq!(extract_code_content_range(input), 2..12);

        let input = "`let x = 5;`";
        assert_eq!(extract_code_content_range(input), 1..11);

        let input = "plain text";
        assert_eq!(extract_code_content_range(input), 0..10);

        // Mismatched delimiter lengths: nothing is stripped.
        let input = "``let x = 5;`";
        assert_eq!(extract_code_content_range(input), 0..13);
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);

        // Malformed input
        let input = "`````";
        assert_eq!(extract_code_block_content_range(input), 3..3);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // This test verifies that links split across multiple text fragments due to escaping or other issues
        // are correctly detected and processed
        // Note: In real usage, pulldown_cmark creates separate text events for the escaped character
        // We're verifying our parser can handle this correctly
        //
        // The second URL contains the numeric character reference `&#46;`, which is
        // decoded to `.` and therefore reported as `SubstitutedText` inside the link.
        assert_eq!(
            parse_markdown_with_options(
                "https:/\\/example.com is equivalent to https://example&#46;com!",
                false
            )
            .events,
            vec![
                (0..62, RootStart),
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph)),
                (0..62, RootEnd(0)),
            ],
        );

        // Same idea with multi-byte characters and a zero-width joiner written as the
        // character reference `&#8205;`.
        assert_eq!(
            parse_markdown_with_options(
                "Visit https://example.com/cat\\/é&#8205;☕ for coffee!",
                false
            )
            .events,
            [
                (0..55, RootStart),
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph)),
                (0..55, RootEnd(0)),
            ]
        );
    }
}