1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
8
9use crate::path_range::PathWithRange;
10
/// The pulldown-cmark extensions enabled for every call to `parse_markdown`.
///
/// The flags are combined with `Options::union` so that the whole expression
/// can be evaluated in this `const` initializer.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
20
21pub fn parse_markdown(
22 text: &str,
23) -> (
24 Vec<(Range<usize>, MarkdownEvent)>,
25 HashSet<SharedString>,
26 HashSet<Arc<Path>>,
27) {
28 let mut events = Vec::new();
29 let mut language_names = HashSet::new();
30 let mut language_paths = HashSet::new();
31 let mut within_link = false;
32 let mut within_metadata = false;
33 let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
34 .into_offset_iter()
35 .peekable();
36 while let Some((pulldown_event, mut range)) = parser.next() {
37 if within_metadata {
38 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
39 pulldown_event
40 {
41 within_metadata = false;
42 }
43 continue;
44 }
45 match pulldown_event {
46 pulldown_cmark::Event::Start(tag) => {
47 let tag = match tag {
48 pulldown_cmark::Tag::Link {
49 link_type,
50 dest_url,
51 title,
52 id,
53 } => {
54 within_link = true;
55 MarkdownTag::Link {
56 link_type,
57 dest_url: SharedString::from(dest_url.into_string()),
58 title: SharedString::from(title.into_string()),
59 id: SharedString::from(id.into_string()),
60 }
61 }
62 pulldown_cmark::Tag::MetadataBlock(kind) => {
63 within_metadata = true;
64 MarkdownTag::MetadataBlock(kind)
65 }
66 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
67 MarkdownTag::CodeBlock {
68 kind: CodeBlockKind::Indented,
69 metadata: CodeBlockMetadata {
70 content_range: range.start + 1..range.end + 1,
71 line_count: 1,
72 },
73 }
74 }
75 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
76 ref info,
77 )) => {
78 let content_range = extract_code_block_content_range(&text[range.clone()]);
79 let content_range =
80 content_range.start + range.start..content_range.end + range.start;
81
82 let line_count = text[content_range.clone()]
83 .bytes()
84 .filter(|c| *c == b'\n')
85 .count();
86 let metadata = CodeBlockMetadata {
87 content_range,
88 line_count,
89 };
90
91 let info = info.trim();
92 let kind = if info.is_empty() {
93 CodeBlockKind::Fenced
94 // Languages should never contain a slash, and PathRanges always should.
95 // (Models are told to specify them relative to a workspace root.)
96 } else if info.contains('/') {
97 let path_range = PathWithRange::new(info);
98 language_paths.insert(path_range.path.clone());
99 CodeBlockKind::FencedSrc(path_range)
100 } else {
101 let language = SharedString::from(info.to_string());
102 language_names.insert(language.clone());
103 CodeBlockKind::FencedLang(language)
104 };
105
106 MarkdownTag::CodeBlock { kind, metadata }
107 }
108 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
109 pulldown_cmark::Tag::Heading {
110 level,
111 id,
112 classes,
113 attrs,
114 } => {
115 let id = id.map(|id| SharedString::from(id.into_string()));
116 let classes = classes
117 .into_iter()
118 .map(|c| SharedString::from(c.into_string()))
119 .collect();
120 let attrs = attrs
121 .into_iter()
122 .map(|(key, value)| {
123 (
124 SharedString::from(key.into_string()),
125 value.map(|v| SharedString::from(v.into_string())),
126 )
127 })
128 .collect();
129 MarkdownTag::Heading {
130 level,
131 id,
132 classes,
133 attrs,
134 }
135 }
136 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
137 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
138 pulldown_cmark::Tag::Item => MarkdownTag::Item,
139 pulldown_cmark::Tag::FootnoteDefinition(label) => {
140 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
141 }
142 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
143 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
144 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
145 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
146 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
147 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
148 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
149 pulldown_cmark::Tag::Image {
150 link_type,
151 dest_url,
152 title,
153 id,
154 } => MarkdownTag::Image {
155 link_type,
156 dest_url: SharedString::from(dest_url.into_string()),
157 title: SharedString::from(title.into_string()),
158 id: SharedString::from(id.into_string()),
159 },
160 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
161 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
162 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
163 pulldown_cmark::Tag::DefinitionListDefinition => {
164 MarkdownTag::DefinitionListDefinition
165 }
166 };
167 events.push((range, MarkdownEvent::Start(tag)))
168 }
169 pulldown_cmark::Event::End(tag) => {
170 if let pulldown_cmark::TagEnd::Link = tag {
171 within_link = false;
172 }
173 events.push((range, MarkdownEvent::End(tag)));
174 }
175 pulldown_cmark::Event::Text(parsed) => {
176 fn event_for(
177 text: &str,
178 range: Range<usize>,
179 str: &str,
180 ) -> (Range<usize>, MarkdownEvent) {
181 if str == &text[range.clone()] {
182 (range, MarkdownEvent::Text)
183 } else {
184 (range, MarkdownEvent::SubstitutedText(str.to_owned()))
185 }
186 }
187 struct TextRange<'a> {
188 source_range: Range<usize>,
189 merged_range: Range<usize>,
190 parsed: CowStr<'a>,
191 }
192
193 let mut last_len = parsed.len();
194 let mut ranges = vec![TextRange {
195 source_range: range.clone(),
196 merged_range: 0..last_len,
197 parsed,
198 }];
199
200 while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
201 let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
202 else {
203 unreachable!()
204 };
205 let next_len = last_len + next_event.len();
206 ranges.push(TextRange {
207 source_range: next_range.clone(),
208 merged_range: last_len..next_len,
209 parsed: next_event,
210 });
211 last_len = next_len;
212 }
213
214 let mut merged_text =
215 String::with_capacity(ranges.last().unwrap().merged_range.end);
216 for range in &ranges {
217 merged_text.push_str(&range.parsed);
218 }
219
220 let mut ranges = ranges.into_iter().peekable();
221
222 if !within_link {
223 let mut finder = LinkFinder::new();
224 finder.kinds(&[linkify::LinkKind::Url]);
225
226 // Find links in the merged text
227 for link in finder.links(&merged_text) {
228 let link_start_in_merged = link.start();
229 let link_end_in_merged = link.end();
230
231 while ranges
232 .peek()
233 .is_some_and(|range| range.merged_range.end <= link_start_in_merged)
234 {
235 let range = ranges.next().unwrap();
236 events.push(event_for(text, range.source_range, &range.parsed));
237 }
238
239 let range = ranges.peek_mut().unwrap();
240 let prefix_len = link_start_in_merged - range.merged_range.start;
241 if prefix_len > 0 {
242 let (head, tail) = range.parsed.split_at(prefix_len);
243 events.push(event_for(
244 text,
245 range.source_range.start..range.source_range.start + prefix_len,
246 &head,
247 ));
248 range.parsed = CowStr::Boxed(tail.into());
249 range.merged_range.start += prefix_len;
250 range.source_range.start += prefix_len;
251 }
252
253 let link_start_in_source = range.source_range.start;
254 let mut link_events = Vec::new();
255
256 while ranges
257 .peek()
258 .is_some_and(|range| range.merged_range.end <= link_end_in_merged)
259 {
260 let range = ranges.next().unwrap();
261 link_events.push(event_for(text, range.source_range, &range.parsed));
262 }
263
264 let range = ranges.peek_mut().unwrap();
265 let prefix_len = link_end_in_merged - range.merged_range.start;
266 if prefix_len > 0 {
267 let (head, tail) = range.parsed.split_at(prefix_len);
268 link_events.push(event_for(
269 text,
270 range.source_range.start..range.source_range.start + prefix_len,
271 head,
272 ));
273 range.parsed = CowStr::Boxed(tail.into());
274 range.merged_range.start += prefix_len;
275 range.source_range.start += prefix_len;
276 }
277 let link_range = link_start_in_source..range.source_range.start;
278
279 events.push((
280 link_range.clone(),
281 MarkdownEvent::Start(MarkdownTag::Link {
282 link_type: LinkType::Autolink,
283 dest_url: SharedString::from(link.as_str().to_string()),
284 title: SharedString::default(),
285 id: SharedString::default(),
286 }),
287 ));
288 events.extend(link_events);
289 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
290 }
291 }
292
293 for range in ranges {
294 events.push(event_for(text, range.source_range, &range.parsed));
295 }
296 }
297 pulldown_cmark::Event::Code(_) => {
298 range.start += 1;
299 range.end -= 1;
300 events.push((range, MarkdownEvent::Code))
301 }
302 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
303 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
304 pulldown_cmark::Event::FootnoteReference(_) => {
305 events.push((range, MarkdownEvent::FootnoteReference))
306 }
307 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
308 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
309 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
310 pulldown_cmark::Event::TaskListMarker(checked) => {
311 events.push((range, MarkdownEvent::TaskListMarker(checked)))
312 }
313 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
314 }
315 }
316 (events, language_names, language_paths)
317}
318
319pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
320 let mut events = Vec::new();
321 let mut finder = LinkFinder::new();
322 finder.kinds(&[linkify::LinkKind::Url]);
323 let mut text_range = Range {
324 start: 0,
325 end: text.len(),
326 };
327 for link in finder.links(text) {
328 let link_range = link.start()..link.end();
329
330 if link_range.start > text_range.start {
331 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
332 }
333
334 events.push((
335 link_range.clone(),
336 MarkdownEvent::Start(MarkdownTag::Link {
337 link_type: LinkType::Autolink,
338 dest_url: SharedString::from(link.as_str().to_string()),
339 title: SharedString::default(),
340 id: SharedString::default(),
341 }),
342 ));
343 events.push((link_range.clone(), MarkdownEvent::Text));
344 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
345
346 text_range.start = link_range.end;
347 }
348
349 if text_range.end > text_range.start {
350 events.push((text_range, MarkdownEvent::Text));
351 }
352
353 events
354}
355
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no payload: the content is identified by the byte range
/// that accompanies each event in the parser's output.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    ///
    /// NOTE(review): `parse_markdown` emits `Start(MarkdownTag::MetadataBlock)`
    /// but skips the matching end event, so metadata blocks are an exception
    /// to the balance guarantee — confirm that consumers expect this.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. Carries the text to display.
    SubstitutedText(String),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
}
390
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings
/// are stored as `SharedString`s.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The block quote kind reported by pulldown-cmark is not kept.
    BlockQuote,

    /// A code block.
    CodeBlock {
        kind: CodeBlockKind,
        metadata: CodeBlockMetadata,
    },

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Used for both header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// The definition belonging to a definition list title.
    DefinitionListDefinition,
}
473
/// The flavor of a code block, and what its info string (if any) refers to.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// An indented code block.
    Indented,
    /// A fenced code block whose info string is empty.
    Fenced,
    /// A fenced code block whose info string contains no `/`; the info string
    /// is taken to be a language name.
    FencedLang(SharedString),
    /// A fenced code block whose info string contains a `/`; the info string
    /// is parsed as a path, optionally with a line range.
    FencedSrc(PathWithRange),
}
485
/// Extra information recorded for a code block while parsing.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the block's content within the markdown source. For
    /// fenced blocks this excludes the opening fence line and the closing fence.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`.
    /// `parse_markdown` always records `1` for indented blocks.
    pub line_count: usize,
}
491
/// Returns the byte range of the content of a fenced code block, relative to
/// the start of `text`.
///
/// `text` is expected to be the source of a single code block, beginning at
/// its opening fence. The opening fence is a run of at least three backticks
/// or tildes; the remainder of that line (the info string) is skipped as well.
/// A closing fence is stripped when the content ends with a run of the same
/// fence character that is at least as long as the opening fence.
///
/// When `text` does not start with a fence, the whole text is treated as
/// content, except that a trailing run of three or more backticks is still
/// stripped.
///
/// The returned range is never inverted: `start <= end <= text.len()` holds
/// even for degenerate input such as a lone ```` fence.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    const MIN_FENCE_LEN: usize = 3;
    let bytes = text.as_bytes();

    // Detect the opening fence: a leading run of at least three identical
    // backticks or tildes.
    let opening_fence = match bytes.first().copied() {
        Some(fence_byte @ (b'`' | b'~')) => {
            let fence_len = bytes.iter().take_while(|&&byte| byte == fence_byte).count();
            (fence_len >= MIN_FENCE_LEN).then_some((fence_byte, fence_len))
        }
        _ => None,
    };

    let (fence_byte, fence_len, start) = match opening_fence {
        Some((fence_byte, fence_len)) => {
            // Content begins on the line after the fence; without a newline it
            // begins right after the fence itself.
            let start = text[fence_len..]
                .find('\n')
                .map_or(fence_len, |newline_ix| fence_len + newline_ix + 1);
            (fence_byte, fence_len, start)
        }
        // No opening fence: only look for a trailing triple-backtick fence.
        None => (b'`', MIN_FENCE_LEN, 0),
    };

    // Only look for the closing fence inside the remaining content, so the
    // opening fence can never be mistaken for the closing one.
    let closing_len = bytes[start..]
        .iter()
        .rev()
        .take_while(|&&byte| byte == fence_byte)
        .count();
    let end = if closing_len >= fence_len {
        text.len() - closing_len
    } else {
        text.len()
    };

    start..end
}
507
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input uses HTML entities on purpose: each `&nbsp;` is 6 source
        // bytes and `&#9658;` is 7, which is what the expected ranges encode.
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);
    }

    #[test]
    fn test_links_split_across_fragments() {
        // Verifies that links split across multiple text fragments (because of
        // escapes or HTML entities inside the URL) are detected and processed
        // correctly. pulldown_cmark emits a separate text event for an escaped
        // character and for each entity, so a single URL can arrive in pieces.
        //
        // `&#46;` (".") occupies source bytes 53..58.
        assert_eq!(
            parse_markdown("https:/\\/example.com is equivalent to https://example&#46;com!").0,
            vec![
                (0..62, Start(Paragraph)),
                (
                    0..20,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (0..7, Text),
                (8..20, Text),
                (0..20, End(MarkdownTagEnd::Link)),
                (20..38, Text),
                (
                    38..61,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (38..53, Text),
                (53..58, SubstitutedText(".".into())),
                (58..61, Text),
                (38..61, End(MarkdownTagEnd::Link)),
                (61..62, Text),
                (0..62, End(MarkdownTagEnd::Paragraph))
            ],
        );

        // `&#8205;` (zero-width joiner) occupies source bytes 33..40.
        assert_eq!(
            parse_markdown("Visit https://example.com/cat\\/é&#8205;☕ for coffee!").0,
            [
                (0..55, Start(Paragraph)),
                (0..6, Text),
                (
                    6..43,
                    Start(Link {
                        link_type: LinkType::Autolink,
                        dest_url: "https://example.com/cat/é\u{200d}☕".into(),
                        title: "".into(),
                        id: "".into()
                    })
                ),
                (6..29, Text),
                (30..33, Text),
                (33..40, SubstitutedText("\u{200d}".into())),
                (40..43, Text),
                (6..43, End(MarkdownTagEnd::Link)),
                (43..55, Text),
                (0..55, End(MarkdownTagEnd::Paragraph))
            ]
        );
    }
}