1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{
8 collections::HashSet,
9 ops::{Deref, Range},
10 path::PathBuf,
11};
12
13use crate::path_range::PathRange;
14
/// Markdown extensions enabled for every parse performed by `parse_markdown`.
///
/// The test module in this file asserts that these options, together with the
/// options listed there as unwanted, cover every option `pulldown_cmark`
/// offers, so that newly added options are consciously evaluated.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
24
25pub fn parse_markdown(
26 text: &str,
27) -> (
28 Vec<(Range<usize>, MarkdownEvent)>,
29 HashSet<SharedString>,
30 HashSet<PathBuf>,
31) {
32 let mut events = Vec::new();
33 let mut language_names = HashSet::new();
34 let mut language_paths = HashSet::new();
35 let mut within_link = false;
36 let mut within_metadata = false;
37 for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
38 if within_metadata {
39 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
40 pulldown_event
41 {
42 within_metadata = false;
43 }
44 continue;
45 }
46 match pulldown_event {
47 pulldown_cmark::Event::Start(tag) => {
48 let tag = match tag {
49 pulldown_cmark::Tag::Link {
50 link_type,
51 dest_url,
52 title,
53 id,
54 } => {
55 within_link = true;
56 MarkdownTag::Link {
57 link_type,
58 dest_url: SharedString::from(dest_url.into_string()),
59 title: SharedString::from(title.into_string()),
60 id: SharedString::from(id.into_string()),
61 }
62 }
63 pulldown_cmark::Tag::MetadataBlock(kind) => {
64 within_metadata = true;
65 MarkdownTag::MetadataBlock(kind)
66 }
67 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
68 ref info,
69 )) => {
70 let info = info.trim();
71 MarkdownTag::CodeBlock(if info.is_empty() {
72 CodeBlockKind::Fenced
73 // Languages should never contain a slash, and PathRanges always should.
74 // (Models are told to specify them relative to a workspace root.)
75 } else if info.contains('/') {
76 let path_range = PathRange::new(info);
77 language_paths.insert(path_range.path.clone());
78 CodeBlockKind::FencedSrc(path_range)
79 } else {
80 let language = SharedString::from(info.to_string());
81 language_names.insert(language.clone());
82 CodeBlockKind::FencedLang(language)
83 })
84 }
85 tag => tag.into(),
86 };
87 events.push((range, MarkdownEvent::Start(tag)))
88 }
89 pulldown_cmark::Event::End(tag) => {
90 if let pulldown_cmark::TagEnd::Link = tag {
91 within_link = false;
92 }
93 events.push((range, MarkdownEvent::End(tag)));
94 }
95 pulldown_cmark::Event::Text(parsed) => {
96 // `parsed` will share bytes with the input unless a substitution like handling of
97 // HTML entities or smart punctuation has occurred. When these substitutions occur,
98 // `parsed` only consists of the result of a single substitution.
99 if !cow_str_points_inside(&parsed, text) {
100 // Attempt to detect cases where the assumptions here are not valid or the
101 // behavior has changed.
102 if parsed.len() > 4 {
103 log::error!(
104 "Bug in markdown parser. \
105 pulldown_cmark::Event::Text expected to a substituted HTML entity, \
106 but it was longer than expected.\n\
107 Source: {}\n\
108 Parsed: {}",
109 &text[range.clone()],
110 parsed
111 );
112 }
113 events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
114 } else {
115 // Automatically detect links in text if not already within a markdown link.
116 if !within_link {
117 let mut finder = LinkFinder::new();
118 finder.kinds(&[linkify::LinkKind::Url]);
119 let text_range = range.clone();
120 for link in finder.links(&text[text_range.clone()]) {
121 let link_range =
122 text_range.start + link.start()..text_range.start + link.end();
123
124 if link_range.start > range.start {
125 events.push((range.start..link_range.start, MarkdownEvent::Text));
126 }
127
128 events.push((
129 link_range.clone(),
130 MarkdownEvent::Start(MarkdownTag::Link {
131 link_type: LinkType::Autolink,
132 dest_url: SharedString::from(link.as_str().to_string()),
133 title: SharedString::default(),
134 id: SharedString::default(),
135 }),
136 ));
137
138 events.push((link_range.clone(), MarkdownEvent::Text));
139 events.push((
140 link_range.clone(),
141 MarkdownEvent::End(MarkdownTagEnd::Link),
142 ));
143
144 range.start = link_range.end;
145 }
146 }
147 if range.start < range.end {
148 events.push((range, MarkdownEvent::Text));
149 }
150 }
151 }
152 pulldown_cmark::Event::Code(_) => {
153 range.start += 1;
154 range.end -= 1;
155 events.push((range, MarkdownEvent::Code))
156 }
157 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
158 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
159 pulldown_cmark::Event::FootnoteReference(_) => {
160 events.push((range, MarkdownEvent::FootnoteReference))
161 }
162 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
163 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
164 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
165 pulldown_cmark::Event::TaskListMarker(checked) => {
166 events.push((range, MarkdownEvent::TaskListMarker(checked)))
167 }
168 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
169 }
170 }
171 (events, language_names, language_paths)
172}
173
174pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
175 let mut events = Vec::new();
176 let mut finder = LinkFinder::new();
177 finder.kinds(&[linkify::LinkKind::Url]);
178 let mut text_range = Range {
179 start: 0,
180 end: text.len(),
181 };
182 for link in finder.links(text) {
183 let link_range = link.start()..link.end();
184
185 if link_range.start > text_range.start {
186 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
187 }
188
189 events.push((
190 link_range.clone(),
191 MarkdownEvent::Start(MarkdownTag::Link {
192 link_type: LinkType::Autolink,
193 dest_url: SharedString::from(link.as_str().to_string()),
194 title: SharedString::default(),
195 id: SharedString::default(),
196 }),
197 ));
198 events.push((link_range.clone(), MarkdownEvent::Text));
199 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
200
201 text_range.start = link_range.end;
202 }
203
204 if text_range.end > text_range.start {
205 events.push((text_range, MarkdownEvent::Text));
206 }
207
208 events
209}
210
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Events are produced together with a byte range into the parsed source;
/// most variants carry no payload and rely on that range instead.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that is taken verbatim from the associated range of the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(CompactStr),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
}
245
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table header or table row.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of an entry in a definition list.
    DefinitionListTitle,
    /// The definition of an entry in a definition list.
    DefinitionListDefinition,
}
325
/// How a code block was written in the markdown source.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block that is not fenced.
    Indented,
    /// A fenced code block whose info string is empty after trimming.
    Fenced,
    /// A fenced code block whose trimmed info string contains no `/`; the info string is kept
    /// as the language name.
    FencedLang(SharedString),
    /// A fenced code block whose trimmed info string contains a `/`; the info string is parsed
    /// as a `PathRange`.
    FencedSrc(PathRange),
}
337
338impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
339 fn from(tag: pulldown_cmark::Tag) -> Self {
340 match tag {
341 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
342 pulldown_cmark::Tag::Heading {
343 level,
344 id,
345 classes,
346 attrs,
347 } => {
348 let id = id.map(|id| SharedString::from(id.into_string()));
349 let classes = classes
350 .into_iter()
351 .map(|c| SharedString::from(c.into_string()))
352 .collect();
353 let attrs = attrs
354 .into_iter()
355 .map(|(key, value)| {
356 (
357 SharedString::from(key.into_string()),
358 value.map(|v| SharedString::from(v.into_string())),
359 )
360 })
361 .collect();
362 MarkdownTag::Heading {
363 level,
364 id,
365 classes,
366 attrs,
367 }
368 }
369 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
370 pulldown_cmark::Tag::CodeBlock(kind) => match kind {
371 pulldown_cmark::CodeBlockKind::Indented => {
372 MarkdownTag::CodeBlock(CodeBlockKind::Indented)
373 }
374 pulldown_cmark::CodeBlockKind::Fenced(info) => {
375 let info = info.trim();
376 MarkdownTag::CodeBlock(if info.is_empty() {
377 CodeBlockKind::Fenced
378 } else if info.contains('/') {
379 // Languages should never contain a slash, and PathRanges always should.
380 // (Models are told to specify them relative to a workspace root.)
381 CodeBlockKind::FencedSrc(PathRange::new(info))
382 } else {
383 CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
384 })
385 }
386 },
387 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
388 pulldown_cmark::Tag::Item => MarkdownTag::Item,
389 pulldown_cmark::Tag::FootnoteDefinition(label) => {
390 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
391 }
392 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
393 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
394 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
395 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
396 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
397 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
398 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
399 pulldown_cmark::Tag::Link {
400 link_type,
401 dest_url,
402 title,
403 id,
404 } => MarkdownTag::Link {
405 link_type,
406 dest_url: SharedString::from(dest_url.into_string()),
407 title: SharedString::from(title.into_string()),
408 id: SharedString::from(id.into_string()),
409 },
410 pulldown_cmark::Tag::Image {
411 link_type,
412 dest_url,
413 title,
414 id,
415 } => MarkdownTag::Image {
416 link_type,
417 dest_url: SharedString::from(dest_url.into_string()),
418 title: SharedString::from(title.into_string()),
419 id: SharedString::from(id.into_string()),
420 },
421 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
422 pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
423 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
424 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
425 pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
426 }
427 }
428}
429
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
#[derive(Clone, Debug)]
pub enum CompactStr {
    /// A heap-allocated string, used when the contents do not fit in an `InlineStr`.
    Boxed(Box<str>),
    /// A short string stored inline, without a heap allocation.
    Inlined(InlineStr),
}
439
440impl Deref for CompactStr {
441 type Target = str;
442
443 fn deref(&self) -> &str {
444 match self {
445 CompactStr::Boxed(b) => b,
446 CompactStr::Inlined(i) => i,
447 }
448 }
449}
450
451impl From<&str> for CompactStr {
452 fn from(s: &str) -> Self {
453 if let Ok(inlined) = s.try_into() {
454 CompactStr::Inlined(inlined)
455 } else {
456 CompactStr::Boxed(s.into())
457 }
458 }
459}
460
461impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
462 fn from(cow_str: pulldown_cmark::CowStr) -> Self {
463 match cow_str {
464 pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
465 pulldown_cmark::CowStr::Borrowed(b) => b.into(),
466 pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
467 }
468 }
469}
470
471impl PartialEq for CompactStr {
472 fn eq(&self, other: &Self) -> bool {
473 self.deref() == other.deref()
474 }
475}
476
477fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
478 match substring {
479 pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
480 pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
481 pulldown_cmark::CowStr::Inlined(_) => false,
482 }
483}
484
/// Reports whether `substring` starts at an address inside `container`'s bytes.
///
/// Only the start address of `substring` is checked, against the half-open address range
/// of `container`. An empty `container` therefore never matches, and neither does a
/// `substring` that starts exactly at the end of `container`.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // Comparing raw pointers is safe; only dereferencing them would need `unsafe`.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
490
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input must contain the HTML entities in their escaped form: the expected ranges
        // below count 6 bytes for each non-breaking-space entity and 7 bytes for the numeric
        // entity, and the parser reports them as `SubstitutedText`.
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}