1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{
8 collections::HashSet,
9 ops::{Deref, Range},
10 path::Path,
11 sync::Arc,
12};
13
14use crate::path_range::PathWithRange;
15
/// The `pulldown_cmark` extensions enabled for every parse in this module.
///
/// The `tests` module checks that this set and the deliberately excluded options
/// (YAML-style metadata blocks, math, definition lists) are disjoint and together
/// cover `Options::all()`, so a newly added upstream option fails the tests until it
/// is classified.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
25
26pub fn parse_markdown(
27 text: &str,
28) -> (
29 Vec<(Range<usize>, MarkdownEvent)>,
30 HashSet<SharedString>,
31 HashSet<Arc<Path>>,
32) {
33 let mut events = Vec::new();
34 let mut language_names = HashSet::new();
35 let mut language_paths = HashSet::new();
36 let mut within_link = false;
37 let mut within_metadata = false;
38 for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
39 if within_metadata {
40 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
41 pulldown_event
42 {
43 within_metadata = false;
44 }
45 continue;
46 }
47 match pulldown_event {
48 pulldown_cmark::Event::Start(tag) => {
49 let tag = match tag {
50 pulldown_cmark::Tag::Link {
51 link_type,
52 dest_url,
53 title,
54 id,
55 } => {
56 within_link = true;
57 MarkdownTag::Link {
58 link_type,
59 dest_url: SharedString::from(dest_url.into_string()),
60 title: SharedString::from(title.into_string()),
61 id: SharedString::from(id.into_string()),
62 }
63 }
64 pulldown_cmark::Tag::MetadataBlock(kind) => {
65 within_metadata = true;
66 MarkdownTag::MetadataBlock(kind)
67 }
68 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
69 ref info,
70 )) => {
71 let info = info.trim();
72 MarkdownTag::CodeBlock(if info.is_empty() {
73 CodeBlockKind::Fenced
74 // Languages should never contain a slash, and PathRanges always should.
75 // (Models are told to specify them relative to a workspace root.)
76 } else if info.contains('/') {
77 let path_range = PathWithRange::new(info);
78 language_paths.insert(path_range.path.clone());
79 CodeBlockKind::FencedSrc(path_range)
80 } else {
81 let language = SharedString::from(info.to_string());
82 language_names.insert(language.clone());
83 CodeBlockKind::FencedLang(language)
84 })
85 }
86 tag => tag.into(),
87 };
88 events.push((range, MarkdownEvent::Start(tag)))
89 }
90 pulldown_cmark::Event::End(tag) => {
91 if let pulldown_cmark::TagEnd::Link = tag {
92 within_link = false;
93 }
94 events.push((range, MarkdownEvent::End(tag)));
95 }
96 pulldown_cmark::Event::Text(parsed) => {
97 // `parsed` will share bytes with the input unless a substitution like handling of
98 // HTML entities or smart punctuation has occurred. When these substitutions occur,
99 // `parsed` only consists of the result of a single substitution.
100 if !cow_str_points_inside(&parsed, text) {
101 // Attempt to detect cases where the assumptions here are not valid or the
102 // behavior has changed.
103 if parsed.len() > 4 {
104 log::error!(
105 "Bug in markdown parser. \
106 pulldown_cmark::Event::Text expected to a substituted HTML entity, \
107 but it was longer than expected.\n\
108 Source: {}\n\
109 Parsed: {}",
110 &text[range.clone()],
111 parsed
112 );
113 }
114 events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
115 } else {
116 // Automatically detect links in text if not already within a markdown link.
117 if !within_link {
118 let mut finder = LinkFinder::new();
119 finder.kinds(&[linkify::LinkKind::Url]);
120 let text_range = range.clone();
121 for link in finder.links(&text[text_range.clone()]) {
122 let link_range =
123 text_range.start + link.start()..text_range.start + link.end();
124
125 if link_range.start > range.start {
126 events.push((range.start..link_range.start, MarkdownEvent::Text));
127 }
128
129 events.push((
130 link_range.clone(),
131 MarkdownEvent::Start(MarkdownTag::Link {
132 link_type: LinkType::Autolink,
133 dest_url: SharedString::from(link.as_str().to_string()),
134 title: SharedString::default(),
135 id: SharedString::default(),
136 }),
137 ));
138
139 events.push((link_range.clone(), MarkdownEvent::Text));
140 events.push((
141 link_range.clone(),
142 MarkdownEvent::End(MarkdownTagEnd::Link),
143 ));
144
145 range.start = link_range.end;
146 }
147 }
148 if range.start < range.end {
149 events.push((range, MarkdownEvent::Text));
150 }
151 }
152 }
153 pulldown_cmark::Event::Code(_) => {
154 range.start += 1;
155 range.end -= 1;
156 events.push((range, MarkdownEvent::Code))
157 }
158 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
159 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
160 pulldown_cmark::Event::FootnoteReference(_) => {
161 events.push((range, MarkdownEvent::FootnoteReference))
162 }
163 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
164 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
165 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
166 pulldown_cmark::Event::TaskListMarker(checked) => {
167 events.push((range, MarkdownEvent::TaskListMarker(checked)))
168 }
169 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
170 }
171 }
172 (events, language_names, language_paths)
173}
174
175pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
176 let mut events = Vec::new();
177 let mut finder = LinkFinder::new();
178 finder.kinds(&[linkify::LinkKind::Url]);
179 let mut text_range = Range {
180 start: 0,
181 end: text.len(),
182 };
183 for link in finder.links(text) {
184 let link_range = link.start()..link.end();
185
186 if link_range.start > text_range.start {
187 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
188 }
189
190 events.push((
191 link_range.clone(),
192 MarkdownEvent::Start(MarkdownTag::Link {
193 link_type: LinkType::Autolink,
194 dest_url: SharedString::from(link.as_str().to_string()),
195 title: SharedString::default(),
196 id: SharedString::default(),
197 }),
198 ));
199 events.push((link_range.clone(), MarkdownEvent::Text));
200 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
201
202 text_range.start = link_range.end;
203 }
204
205 if text_range.end > text_range.start {
206 events.push((text_range, MarkdownEvent::Text));
207 }
208
209 events
210}
211
212/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
213/// parse result for rendering without resorting to unsafe lifetime coercion.
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no text of their own: each event is stored next to a byte range
/// into the markdown source, and the text is read from that range.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    ///
    /// NOTE(review): `parse_markdown` pushes `Start(MetadataBlock)` but skips the matching
    /// `End` event, so the balance guarantee does not hold for metadata blocks.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation. The substituted text is stored in the event.
    SubstitutedText(CompactStr),
    /// An inline code node. Its text is read from the associated source range.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote, which may or may not be defined by an event with a
    /// `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order. The label is not stored in the event; only the source range is kept.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
}
246
247/// Tags for elements that can contain other elements.
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings are converted
/// to `SharedString`, and fenced code blocks use this module's `CodeBlockKind`.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The block quote kind reported by `pulldown_cmark` is discarded.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// An HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A table cell.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image, described by its link type, destination URL, title and link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of a definition list entry.
    DefinitionListTitle,
    /// The definition belonging to a definition list title.
    DefinitionListDefinition,
}
326
/// The flavor of a code block.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block that is not fenced.
    Indented,
    /// A fenced code block whose info string is empty after trimming.
    Fenced,
    /// A fenced code block whose trimmed info string contains no `/`; the string is
    /// treated as a language name.
    FencedLang(SharedString),
    /// A fenced code block whose trimmed info string contains a `/`; the string is
    /// parsed as a path with an optional range.
    FencedSrc(PathWithRange),
}
338
339impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
340 fn from(tag: pulldown_cmark::Tag) -> Self {
341 match tag {
342 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
343 pulldown_cmark::Tag::Heading {
344 level,
345 id,
346 classes,
347 attrs,
348 } => {
349 let id = id.map(|id| SharedString::from(id.into_string()));
350 let classes = classes
351 .into_iter()
352 .map(|c| SharedString::from(c.into_string()))
353 .collect();
354 let attrs = attrs
355 .into_iter()
356 .map(|(key, value)| {
357 (
358 SharedString::from(key.into_string()),
359 value.map(|v| SharedString::from(v.into_string())),
360 )
361 })
362 .collect();
363 MarkdownTag::Heading {
364 level,
365 id,
366 classes,
367 attrs,
368 }
369 }
370 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
371 pulldown_cmark::Tag::CodeBlock(kind) => match kind {
372 pulldown_cmark::CodeBlockKind::Indented => {
373 MarkdownTag::CodeBlock(CodeBlockKind::Indented)
374 }
375 pulldown_cmark::CodeBlockKind::Fenced(info) => {
376 let info = info.trim();
377 MarkdownTag::CodeBlock(if info.is_empty() {
378 CodeBlockKind::Fenced
379 } else if info.contains('/') {
380 // Languages should never contain a slash, and PathRanges always should.
381 // (Models are told to specify them relative to a workspace root.)
382 CodeBlockKind::FencedSrc(PathWithRange::new(info))
383 } else {
384 CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
385 })
386 }
387 },
388 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
389 pulldown_cmark::Tag::Item => MarkdownTag::Item,
390 pulldown_cmark::Tag::FootnoteDefinition(label) => {
391 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
392 }
393 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
394 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
395 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
396 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
397 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
398 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
399 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
400 pulldown_cmark::Tag::Link {
401 link_type,
402 dest_url,
403 title,
404 id,
405 } => MarkdownTag::Link {
406 link_type,
407 dest_url: SharedString::from(dest_url.into_string()),
408 title: SharedString::from(title.into_string()),
409 id: SharedString::from(id.into_string()),
410 },
411 pulldown_cmark::Tag::Image {
412 link_type,
413 dest_url,
414 title,
415 id,
416 } => MarkdownTag::Image {
417 link_type,
418 dest_url: SharedString::from(dest_url.into_string()),
419 title: SharedString::from(title.into_string()),
420 id: SharedString::from(id.into_string()),
421 },
422 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
423 pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
424 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
425 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
426 pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
427 }
428 }
429}
430
431/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
432/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
433///
434/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrowed` case.
#[derive(Clone, Debug)]
pub enum CompactStr {
    /// A heap-allocated string.
    Boxed(Box<str>),
    /// A string stored inline in a `pulldown_cmark::InlineStr`.
    Inlined(InlineStr),
}
440
441impl Deref for CompactStr {
442 type Target = str;
443
444 fn deref(&self) -> &str {
445 match self {
446 CompactStr::Boxed(b) => b,
447 CompactStr::Inlined(i) => i,
448 }
449 }
450}
451
452impl From<&str> for CompactStr {
453 fn from(s: &str) -> Self {
454 if let Ok(inlined) = s.try_into() {
455 CompactStr::Inlined(inlined)
456 } else {
457 CompactStr::Boxed(s.into())
458 }
459 }
460}
461
462impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
463 fn from(cow_str: pulldown_cmark::CowStr) -> Self {
464 match cow_str {
465 pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
466 pulldown_cmark::CowStr::Borrowed(b) => b.into(),
467 pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
468 }
469 }
470}
471
472impl PartialEq for CompactStr {
473 fn eq(&self, other: &Self) -> bool {
474 self.deref() == other.deref()
475 }
476}
477
478fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
479 match substring {
480 pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
481 pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
482 pulldown_cmark::CowStr::Inlined(_) => false,
483 }
484}
485
/// Reports whether `substring` starts at an address inside `container`'s buffer.
///
/// Only the start address of `substring` is examined; its length is not. The address
/// range of `container` is half-open, so a slice starting exactly one past the last
/// byte of `container` is reported as outside.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // `as_ptr_range` yields `start..one-past-the-end` without any `unsafe` pointer math.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
491
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input must spell the HTML entities out: the expected events below rely on
        // `&nbsp;` occupying 6 bytes (0..6, 6..12) and `&#9658;` occupying 7 bytes (37..44)
        // of a 51-byte input, each replaced by its decoded character.
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        // Each smart-punctuation replacement is reported as its own `SubstitutedText`
        // event covering the source characters it replaces.
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}