1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{
8 collections::HashSet,
9 ops::{Deref, Range},
10 path::Path,
11 sync::Arc,
12};
13
14use crate::path_range::PathWithRange;
15
/// The `pulldown_cmark` extensions enabled for every parse performed by
/// [`parse_markdown`]: tables, footnotes (including the old footnote style),
/// strikethrough, task lists, smart punctuation, heading attributes,
/// `+++`-delimited metadata blocks and GFM.
///
/// The `tests` module checks that this set, together with the options that are
/// deliberately left disabled, covers `Options::all()`, so a newly added
/// `pulldown_cmark` option has to be consciously sorted into one of the two sets.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
25
26pub fn parse_markdown(
27 text: &str,
28) -> (
29 Vec<(Range<usize>, MarkdownEvent)>,
30 HashSet<SharedString>,
31 HashSet<Arc<Path>>,
32) {
33 let mut events = Vec::new();
34 let mut language_names = HashSet::new();
35 let mut language_paths = HashSet::new();
36 let mut within_link = false;
37 let mut within_metadata = false;
38 for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
39 if within_metadata {
40 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
41 pulldown_event
42 {
43 within_metadata = false;
44 }
45 continue;
46 }
47 match pulldown_event {
48 pulldown_cmark::Event::Start(tag) => {
49 let tag = match tag {
50 pulldown_cmark::Tag::Link {
51 link_type,
52 dest_url,
53 title,
54 id,
55 } => {
56 within_link = true;
57 MarkdownTag::Link {
58 link_type,
59 dest_url: SharedString::from(dest_url.into_string()),
60 title: SharedString::from(title.into_string()),
61 id: SharedString::from(id.into_string()),
62 }
63 }
64 pulldown_cmark::Tag::MetadataBlock(kind) => {
65 within_metadata = true;
66 MarkdownTag::MetadataBlock(kind)
67 }
68 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
69 ref info,
70 )) => {
71 let info = info.trim();
72 MarkdownTag::CodeBlock(if info.is_empty() {
73 CodeBlockKind::Fenced
74 // Languages should never contain a slash, and PathRanges always should.
75 // (Models are told to specify them relative to a workspace root.)
76 } else if info.contains('/') {
77 let path_range = PathWithRange::new(info);
78 language_paths.insert(path_range.path.clone());
79 CodeBlockKind::FencedSrc(path_range)
80 } else {
81 let language = SharedString::from(info.to_string());
82 language_names.insert(language.clone());
83 CodeBlockKind::FencedLang(language)
84 })
85 }
86 tag => tag.into(),
87 };
88 events.push((range, MarkdownEvent::Start(tag)))
89 }
90 pulldown_cmark::Event::End(tag) => {
91 if let pulldown_cmark::TagEnd::Link = tag {
92 within_link = false;
93 }
94 events.push((range, MarkdownEvent::End(tag)));
95 }
96 pulldown_cmark::Event::Text(parsed) => {
97 // `parsed` will share bytes with the input unless a substitution like handling of
98 // HTML entities or smart punctuation has occurred. When these substitutions occur,
99 // `parsed` only consists of the result of a single substitution.
100 if !cow_str_points_inside(&parsed, text) {
101 events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
102 } else {
103 // Automatically detect links in text if not already within a markdown link.
104 if !within_link {
105 let mut finder = LinkFinder::new();
106 finder.kinds(&[linkify::LinkKind::Url]);
107 let text_range = range.clone();
108 for link in finder.links(&text[text_range.clone()]) {
109 let link_range =
110 text_range.start + link.start()..text_range.start + link.end();
111
112 if link_range.start > range.start {
113 events.push((range.start..link_range.start, MarkdownEvent::Text));
114 }
115
116 events.push((
117 link_range.clone(),
118 MarkdownEvent::Start(MarkdownTag::Link {
119 link_type: LinkType::Autolink,
120 dest_url: SharedString::from(link.as_str().to_string()),
121 title: SharedString::default(),
122 id: SharedString::default(),
123 }),
124 ));
125
126 events.push((link_range.clone(), MarkdownEvent::Text));
127 events.push((
128 link_range.clone(),
129 MarkdownEvent::End(MarkdownTagEnd::Link),
130 ));
131
132 range.start = link_range.end;
133 }
134 }
135 if range.start < range.end {
136 events.push((range, MarkdownEvent::Text));
137 }
138 }
139 }
140 pulldown_cmark::Event::Code(_) => {
141 range.start += 1;
142 range.end -= 1;
143 events.push((range, MarkdownEvent::Code))
144 }
145 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
146 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
147 pulldown_cmark::Event::FootnoteReference(_) => {
148 events.push((range, MarkdownEvent::FootnoteReference))
149 }
150 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
151 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
152 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
153 pulldown_cmark::Event::TaskListMarker(checked) => {
154 events.push((range, MarkdownEvent::TaskListMarker(checked)))
155 }
156 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
157 }
158 }
159 (events, language_names, language_paths)
160}
161
162pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
163 let mut events = Vec::new();
164 let mut finder = LinkFinder::new();
165 finder.kinds(&[linkify::LinkKind::Url]);
166 let mut text_range = Range {
167 start: 0,
168 end: text.len(),
169 };
170 for link in finder.links(text) {
171 let link_range = link.start()..link.end();
172
173 if link_range.start > text_range.start {
174 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
175 }
176
177 events.push((
178 link_range.clone(),
179 MarkdownEvent::Start(MarkdownTag::Link {
180 link_type: LinkType::Autolink,
181 dest_url: SharedString::from(link.as_str().to_string()),
182 title: SharedString::default(),
183 id: SharedString::default(),
184 }),
185 ));
186 events.push((link_range.clone(), MarkdownEvent::Text));
187 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
188
189 text_range.start = link_range.end;
190 }
191
192 if text_range.end > text_range.start {
193 events.push((text_range, MarkdownEvent::Text));
194 }
195
196 events
197}
198
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no payload: the parsers in this file pair every event with the
/// byte range of the source it came from, and the content is read from the source via
/// that range.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    ///
    /// NOTE(review): `parse_markdown` emits `Start(MetadataBlock)` but skips the matching
    /// end event along with the block's contents, so that one tag is an exception.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. The replacement text is stored in the event itself.
    SubstitutedText(CompactStr),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
}
233
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings are replaced
/// by `SharedString`s and code blocks use this crate's own [`CodeBlockKind`].
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The block quote kind reported by `pulldown_cmark` is discarded
    /// during conversion.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A table cell.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list, mirroring `pulldown_cmark::Tag::DefinitionList`.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// A definition belonging to a definition list title.
    DefinitionListDefinition,
}
313
/// The flavor of a code block.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. an info string of `path/to/foo.rs#L123-456` instead of `rust`.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block that is not fenced (mirrors `pulldown_cmark::CodeBlockKind::Indented`).
    Indented,
    /// A fenced code block whose info string is empty after trimming whitespace.
    Fenced,
    /// A fenced code block whose info string contains no `/`; the trimmed info string is
    /// taken to be the language name.
    FencedLang(SharedString),
    /// A fenced code block whose info string contains a `/`; the trimmed info string is
    /// parsed as a path with an optional range.
    FencedSrc(PathWithRange),
}
325
326impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
327 fn from(tag: pulldown_cmark::Tag) -> Self {
328 match tag {
329 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
330 pulldown_cmark::Tag::Heading {
331 level,
332 id,
333 classes,
334 attrs,
335 } => {
336 let id = id.map(|id| SharedString::from(id.into_string()));
337 let classes = classes
338 .into_iter()
339 .map(|c| SharedString::from(c.into_string()))
340 .collect();
341 let attrs = attrs
342 .into_iter()
343 .map(|(key, value)| {
344 (
345 SharedString::from(key.into_string()),
346 value.map(|v| SharedString::from(v.into_string())),
347 )
348 })
349 .collect();
350 MarkdownTag::Heading {
351 level,
352 id,
353 classes,
354 attrs,
355 }
356 }
357 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
358 pulldown_cmark::Tag::CodeBlock(kind) => match kind {
359 pulldown_cmark::CodeBlockKind::Indented => {
360 MarkdownTag::CodeBlock(CodeBlockKind::Indented)
361 }
362 pulldown_cmark::CodeBlockKind::Fenced(info) => {
363 let info = info.trim();
364 MarkdownTag::CodeBlock(if info.is_empty() {
365 CodeBlockKind::Fenced
366 } else if info.contains('/') {
367 // Languages should never contain a slash, and PathRanges always should.
368 // (Models are told to specify them relative to a workspace root.)
369 CodeBlockKind::FencedSrc(PathWithRange::new(info))
370 } else {
371 CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
372 })
373 }
374 },
375 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
376 pulldown_cmark::Tag::Item => MarkdownTag::Item,
377 pulldown_cmark::Tag::FootnoteDefinition(label) => {
378 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
379 }
380 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
381 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
382 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
383 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
384 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
385 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
386 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
387 pulldown_cmark::Tag::Link {
388 link_type,
389 dest_url,
390 title,
391 id,
392 } => MarkdownTag::Link {
393 link_type,
394 dest_url: SharedString::from(dest_url.into_string()),
395 title: SharedString::from(title.into_string()),
396 id: SharedString::from(id.into_string()),
397 },
398 pulldown_cmark::Tag::Image {
399 link_type,
400 dest_url,
401 title,
402 id,
403 } => MarkdownTag::Image {
404 link_type,
405 dest_url: SharedString::from(dest_url.into_string()),
406 title: SharedString::from(title.into_string()),
407 id: SharedString::from(id.into_string()),
408 },
409 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
410 pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
411 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
412 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
413 pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
414 }
415 }
416}
417
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
///
/// Dereferences to `str`; equality and `Debug` output are based on that string content,
/// not on which variant holds it.
#[derive(Clone)]
pub enum CompactStr {
    /// A heap-allocated string, used when the text does not fit in an `InlineStr`.
    Boxed(Box<str>),
    /// A short string stored in a `pulldown_cmark::InlineStr`.
    Inlined(InlineStr),
}
427
428impl std::fmt::Debug for CompactStr {
429 fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
430 self.deref().fmt(formatter)
431 }
432}
433
434impl Deref for CompactStr {
435 type Target = str;
436
437 fn deref(&self) -> &str {
438 match self {
439 CompactStr::Boxed(b) => b,
440 CompactStr::Inlined(i) => i,
441 }
442 }
443}
444
445impl From<&str> for CompactStr {
446 fn from(s: &str) -> Self {
447 if let Ok(inlined) = s.try_into() {
448 CompactStr::Inlined(inlined)
449 } else {
450 CompactStr::Boxed(s.into())
451 }
452 }
453}
454
455impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
456 fn from(cow_str: pulldown_cmark::CowStr) -> Self {
457 match cow_str {
458 pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
459 pulldown_cmark::CowStr::Borrowed(b) => b.into(),
460 pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
461 }
462 }
463}
464
465impl PartialEq for CompactStr {
466 fn eq(&self, other: &Self) -> bool {
467 self.deref() == other.deref()
468 }
469}
470
471fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
472 match substring {
473 pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
474 pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
475 pulldown_cmark::CowStr::Inlined(_) => false,
476 }
477}
478
/// Reports whether the first byte of `substring` lies within the memory of `container`.
///
/// Only the start address is examined: the result is `true` when it falls in the
/// half-open address range `[container.start, container.start + container.len())`.
/// Consequently an empty `container` never contains anything, and a `substring` that
/// starts exactly one past the end of `container` is reported as outside.
///
/// The comparison is done on raw pointers without dereferencing them, so no `unsafe`
/// is required.
fn str_points_inside(substring: &str, container: &str) -> bool {
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
484
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Parser options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input must spell the HTML entities out (`&nbsp;` is 6 bytes, `&#9658;` is 7):
        // the expected byte ranges below are offsets into this exact 51-byte source, and the
        // entities are what make the parser report `SubstitutedText` instead of `Text`.
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // Every smart-punctuation replacement is reported as `SubstitutedText` over the range
        // of the ASCII characters it replaces; untouched text stays plain `Text`.
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (42..43, Text),
                    (43..53, SubstitutedText("–––––".into())),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}