1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{
8 collections::HashSet,
9 ops::{Deref, Range},
10 path::Path,
11 sync::Arc,
12};
13
14use crate::path_range::PathWithRange;
15
16const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
17 .union(Options::ENABLE_FOOTNOTES)
18 .union(Options::ENABLE_STRIKETHROUGH)
19 .union(Options::ENABLE_TASKLISTS)
20 .union(Options::ENABLE_SMART_PUNCTUATION)
21 .union(Options::ENABLE_HEADING_ATTRIBUTES)
22 .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
23 .union(Options::ENABLE_OLD_FOOTNOTES)
24 .union(Options::ENABLE_GFM);
25
26pub fn parse_markdown(
27 text: &str,
28) -> (
29 Vec<(Range<usize>, MarkdownEvent)>,
30 HashSet<SharedString>,
31 HashSet<Arc<Path>>,
32) {
33 let mut events = Vec::new();
34 let mut language_names = HashSet::new();
35 let mut language_paths = HashSet::new();
36 let mut within_link = false;
37 let mut within_metadata = false;
38 for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
39 if within_metadata {
40 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
41 pulldown_event
42 {
43 within_metadata = false;
44 }
45 continue;
46 }
47 match pulldown_event {
48 pulldown_cmark::Event::Start(tag) => {
49 let tag = match tag {
50 pulldown_cmark::Tag::Link {
51 link_type,
52 dest_url,
53 title,
54 id,
55 } => {
56 within_link = true;
57 MarkdownTag::Link {
58 link_type,
59 dest_url: SharedString::from(dest_url.into_string()),
60 title: SharedString::from(title.into_string()),
61 id: SharedString::from(id.into_string()),
62 }
63 }
64 pulldown_cmark::Tag::MetadataBlock(kind) => {
65 within_metadata = true;
66 MarkdownTag::MetadataBlock(kind)
67 }
68 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
69 MarkdownTag::CodeBlock {
70 kind: CodeBlockKind::Indented,
71 metadata: CodeBlockMetadata {
72 content_range: range.start + 1..range.end + 1,
73 line_count: 1,
74 },
75 }
76 }
77 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
78 ref info,
79 )) => {
80 let content_range = extract_code_block_content_range(&text[range.clone()]);
81 let content_range =
82 content_range.start + range.start..content_range.end + range.start;
83
84 let line_count = text[content_range.clone()]
85 .bytes()
86 .filter(|c| *c == b'\n')
87 .count();
88 let metadata = CodeBlockMetadata {
89 content_range,
90 line_count,
91 };
92
93 let info = info.trim();
94 let kind = if info.is_empty() {
95 CodeBlockKind::Fenced
96 // Languages should never contain a slash, and PathRanges always should.
97 // (Models are told to specify them relative to a workspace root.)
98 } else if info.contains('/') {
99 let path_range = PathWithRange::new(info);
100 language_paths.insert(path_range.path.clone());
101 CodeBlockKind::FencedSrc(path_range)
102 } else {
103 let language = SharedString::from(info.to_string());
104 language_names.insert(language.clone());
105 CodeBlockKind::FencedLang(language)
106 };
107
108 MarkdownTag::CodeBlock { kind, metadata }
109 }
110 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
111 pulldown_cmark::Tag::Heading {
112 level,
113 id,
114 classes,
115 attrs,
116 } => {
117 let id = id.map(|id| SharedString::from(id.into_string()));
118 let classes = classes
119 .into_iter()
120 .map(|c| SharedString::from(c.into_string()))
121 .collect();
122 let attrs = attrs
123 .into_iter()
124 .map(|(key, value)| {
125 (
126 SharedString::from(key.into_string()),
127 value.map(|v| SharedString::from(v.into_string())),
128 )
129 })
130 .collect();
131 MarkdownTag::Heading {
132 level,
133 id,
134 classes,
135 attrs,
136 }
137 }
138 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
139 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
140 pulldown_cmark::Tag::Item => MarkdownTag::Item,
141 pulldown_cmark::Tag::FootnoteDefinition(label) => {
142 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
143 }
144 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
145 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
146 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
147 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
148 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
149 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
150 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
151 pulldown_cmark::Tag::Image {
152 link_type,
153 dest_url,
154 title,
155 id,
156 } => MarkdownTag::Image {
157 link_type,
158 dest_url: SharedString::from(dest_url.into_string()),
159 title: SharedString::from(title.into_string()),
160 id: SharedString::from(id.into_string()),
161 },
162 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
163 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
164 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
165 pulldown_cmark::Tag::DefinitionListDefinition => {
166 MarkdownTag::DefinitionListDefinition
167 }
168 };
169 events.push((range, MarkdownEvent::Start(tag)))
170 }
171 pulldown_cmark::Event::End(tag) => {
172 if let pulldown_cmark::TagEnd::Link = tag {
173 within_link = false;
174 }
175 events.push((range, MarkdownEvent::End(tag)));
176 }
177 pulldown_cmark::Event::Text(parsed) => {
178 // `parsed` will share bytes with the input unless a substitution like handling of
179 // HTML entities or smart punctuation has occurred. When these substitutions occur,
180 // `parsed` only consists of the result of a single substitution.
181 if !cow_str_points_inside(&parsed, text) {
182 events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
183 } else {
184 // Automatically detect links in text if not already within a markdown link.
185 if !within_link {
186 let mut finder = LinkFinder::new();
187 finder.kinds(&[linkify::LinkKind::Url]);
188 let text_range = range.clone();
189 for link in finder.links(&text[text_range.clone()]) {
190 let link_range =
191 text_range.start + link.start()..text_range.start + link.end();
192
193 if link_range.start > range.start {
194 events.push((range.start..link_range.start, MarkdownEvent::Text));
195 }
196
197 events.push((
198 link_range.clone(),
199 MarkdownEvent::Start(MarkdownTag::Link {
200 link_type: LinkType::Autolink,
201 dest_url: SharedString::from(link.as_str().to_string()),
202 title: SharedString::default(),
203 id: SharedString::default(),
204 }),
205 ));
206
207 events.push((link_range.clone(), MarkdownEvent::Text));
208 events.push((
209 link_range.clone(),
210 MarkdownEvent::End(MarkdownTagEnd::Link),
211 ));
212
213 range.start = link_range.end;
214 }
215 }
216 if range.start < range.end {
217 events.push((range, MarkdownEvent::Text));
218 }
219 }
220 }
221 pulldown_cmark::Event::Code(_) => {
222 range.start += 1;
223 range.end -= 1;
224 events.push((range, MarkdownEvent::Code))
225 }
226 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
227 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
228 pulldown_cmark::Event::FootnoteReference(_) => {
229 events.push((range, MarkdownEvent::FootnoteReference))
230 }
231 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
232 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
233 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
234 pulldown_cmark::Event::TaskListMarker(checked) => {
235 events.push((range, MarkdownEvent::TaskListMarker(checked)))
236 }
237 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
238 }
239 }
240 (events, language_names, language_paths)
241}
242
243pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
244 let mut events = Vec::new();
245 let mut finder = LinkFinder::new();
246 finder.kinds(&[linkify::LinkKind::Url]);
247 let mut text_range = Range {
248 start: 0,
249 end: text.len(),
250 };
251 for link in finder.links(text) {
252 let link_range = link.start()..link.end();
253
254 if link_range.start > text_range.start {
255 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
256 }
257
258 events.push((
259 link_range.clone(),
260 MarkdownEvent::Start(MarkdownTag::Link {
261 link_type: LinkType::Autolink,
262 dest_url: SharedString::from(link.as_str().to_string()),
263 title: SharedString::default(),
264 id: SharedString::default(),
265 }),
266 ));
267 events.push((link_range.clone(), MarkdownEvent::Text));
268 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
269
270 text_range.start = link_range.end;
271 }
272
273 if text_range.end > text_range.start {
274 events.push((text_range, MarkdownEvent::Text));
275 }
276
277 events
278}
279
280/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
281/// parse result for rendering without resorting to unsafe lifetime coercion.
282#[derive(Clone, Debug, PartialEq)]
283pub enum MarkdownEvent {
284 /// Start of a tagged element. Events that are yielded after this event
285 /// and before its corresponding `End` event are inside this element.
286 /// Start and end events are guaranteed to be balanced.
287 Start(MarkdownTag),
288 /// End of a tagged element.
289 End(MarkdownTagEnd),
290 /// Text that uses the associated range from the markdown source.
291 Text,
292 /// Text that differs from the markdown source - typically due to substitution of HTML entities
293 /// and smart punctuation.
294 SubstitutedText(CompactStr),
295 /// An inline code node.
296 Code,
297 /// An HTML node.
298 Html,
299 /// An inline HTML node.
300 InlineHtml,
301 /// A reference to a footnote with given label, which may or may not be defined
302 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
303 /// occur in any order.
304 FootnoteReference,
305 /// A soft line break.
306 SoftBreak,
307 /// A hard line break.
308 HardBreak,
309 /// A horizontal ruler.
310 Rule,
311 /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
312 TaskListMarker(bool),
313}
314
315/// Tags for elements that can contain other elements.
316#[derive(Clone, Debug, PartialEq)]
317pub enum MarkdownTag {
318 /// A paragraph of text and other inline elements.
319 Paragraph,
320
321 /// A heading, with optional identifier, classes and custom attributes.
322 /// The identifier is prefixed with `#` and the last one in the attributes
323 /// list is chosen, classes are prefixed with `.` and custom attributes
324 /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
325 Heading {
326 level: HeadingLevel,
327 id: Option<SharedString>,
328 classes: Vec<SharedString>,
329 /// The first item of the tuple is the attr and second one the value.
330 attrs: Vec<(SharedString, Option<SharedString>)>,
331 },
332
333 BlockQuote,
334
335 /// A code block.
336 CodeBlock {
337 kind: CodeBlockKind,
338 metadata: CodeBlockMetadata,
339 },
340
341 /// A HTML block.
342 HtmlBlock,
343
344 /// A list. If the list is ordered the field indicates the number of the first item.
345 /// Contains only list items.
346 List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
347
348 /// A list item.
349 Item,
350
351 /// A footnote definition. The value contained is the footnote's label by which it can
352 /// be referred to.
353 FootnoteDefinition(SharedString),
354
355 /// A table. Contains a vector describing the text-alignment for each of its columns.
356 Table(Vec<Alignment>),
357
358 /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
359 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
360 TableHead,
361
362 /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
363 TableRow,
364 TableCell,
365
366 // span-level tags
367 Emphasis,
368 Strong,
369 Strikethrough,
370
371 /// A link.
372 Link {
373 link_type: LinkType,
374 dest_url: SharedString,
375 title: SharedString,
376 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
377 id: SharedString,
378 },
379
380 /// An image. The first field is the link type, the second the destination URL and the third is a title,
381 /// the fourth is the link identifier.
382 Image {
383 link_type: LinkType,
384 dest_url: SharedString,
385 title: SharedString,
386 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
387 id: SharedString,
388 },
389
390 /// A metadata block.
391 MetadataBlock(MetadataBlockKind),
392
393 DefinitionList,
394 DefinitionListTitle,
395 DefinitionListDefinition,
396}
397
398#[derive(Clone, Debug, PartialEq)]
399pub enum CodeBlockKind {
400 Indented,
401 /// "Fenced" means "surrounded by triple backticks."
402 /// There can optionally be either a language after the backticks (like in traditional Markdown)
403 /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
404 /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
405 Fenced,
406 FencedLang(SharedString),
407 FencedSrc(PathWithRange),
408}
409
/// Where the content of a code block lies and how many lines it has.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the code block's content within the parsed Markdown text.
    pub content_range: Range<usize>,
    /// Number of lines recorded for the content. For fenced blocks the parser
    /// counts the newline bytes inside `content_range`.
    pub line_count: usize,
}
415
/// Returns the byte range, relative to `text`, of the content of a code block
/// whose full source is `text`.
///
/// - If `text` starts with a backtick fence (three or more backticks), the content
///   starts after the first newline following the fence. When there is no newline
///   yet, it starts directly after the fence.
/// - If the remaining text ends with a run of backticks that is long enough to
///   close the block, that run is excluded. A closing run must be at least as long
///   as the opening fence; when `text` does not start with a fence, three
///   backticks suffice.
/// - Anything else (e.g. `"plain text"`) is returned unchanged as `0..text.len()`.
///
/// The returned range always satisfies `start <= end` and lies on char boundaries,
/// so it can be used to slice `text` directly. This also holds for partial input
/// such as `"````"`, where the opening fence must not be mistaken for a closing one.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    // Backticks are ASCII, so byte counts are valid slice offsets.
    let opening_len = text.bytes().take_while(|b| *b == b'`').count();

    let mut start = 0;
    let mut min_closing_len = 3;
    if opening_len >= 3 {
        start = opening_len;
        min_closing_len = opening_len;

        // Skip the rest of the fence line, i.e. the info string.
        if let Some(newline_ix) = text[start..].find('\n') {
            start += newline_ix + 1;
        }
    }

    // Only look for the closing fence after the content start, so that the opening
    // fence can never be counted a second time.
    let closing_len = text[start..]
        .bytes()
        .rev()
        .take_while(|b| *b == b'`')
        .count();

    let end = if closing_len >= min_closing_len {
        text.len() - closing_len
    } else {
        text.len()
    };
    start..end
}
431
432/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
433/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
434///
435/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
436#[derive(Clone)]
437pub enum CompactStr {
438 Boxed(Box<str>),
439 Inlined(InlineStr),
440}
441
442impl std::fmt::Debug for CompactStr {
443 fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
444 self.deref().fmt(formatter)
445 }
446}
447
448impl Deref for CompactStr {
449 type Target = str;
450
451 fn deref(&self) -> &str {
452 match self {
453 CompactStr::Boxed(b) => b,
454 CompactStr::Inlined(i) => i,
455 }
456 }
457}
458
459impl From<&str> for CompactStr {
460 fn from(s: &str) -> Self {
461 if let Ok(inlined) = s.try_into() {
462 CompactStr::Inlined(inlined)
463 } else {
464 CompactStr::Boxed(s.into())
465 }
466 }
467}
468
469impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
470 fn from(cow_str: pulldown_cmark::CowStr) -> Self {
471 match cow_str {
472 pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
473 pulldown_cmark::CowStr::Borrowed(b) => b.into(),
474 pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
475 }
476 }
477}
478
479impl PartialEq for CompactStr {
480 fn eq(&self, other: &Self) -> bool {
481 self.deref() == other.deref()
482 }
483}
484
485fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
486 match substring {
487 pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
488 pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
489 pulldown_cmark::CowStr::Inlined(_) => false,
490 }
491}
492
/// Returns whether `substring` starts at an address that lies within the bytes of
/// `container`.
///
/// Only the start address of `substring` is examined; its length and contents are
/// not. The address one past the last byte of `container` does not count as inside,
/// so nothing points inside an empty `container`.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // `as_ptr_range` yields the half-open address range `[start, end)` of the
    // container's bytes, which avoids manual pointer arithmetic and `unsafe`.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
498
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Parser options that were evaluated and deliberately left disabled.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input must spell the HTML entities out: each `&nbsp;` (6 bytes) and the
        // `&#9658;` (7 bytes) is what produces a `SubstitutedText` event below, and the
        // expected byte ranges are measured on this escaped form.
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);
    }
}