1use gpui::SharedString;
2use linkify::LinkFinder;
3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
4use pulldown_cmark::{
5 Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
6};
7use std::{
8 collections::HashSet,
9 ops::{Deref, Range},
10};
11
12const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
13 .union(Options::ENABLE_FOOTNOTES)
14 .union(Options::ENABLE_STRIKETHROUGH)
15 .union(Options::ENABLE_TASKLISTS)
16 .union(Options::ENABLE_SMART_PUNCTUATION)
17 .union(Options::ENABLE_HEADING_ATTRIBUTES)
18 .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
19 .union(Options::ENABLE_OLD_FOOTNOTES)
20 .union(Options::ENABLE_GFM);
21
22pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSet<SharedString>) {
23 let mut events = Vec::new();
24 let mut languages = HashSet::new();
25 let mut within_link = false;
26 let mut within_metadata = false;
27 for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
28 if within_metadata {
29 if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
30 pulldown_event
31 {
32 within_metadata = false;
33 }
34 continue;
35 }
36 match pulldown_event {
37 pulldown_cmark::Event::Start(tag) => {
38 match tag {
39 pulldown_cmark::Tag::Link { .. } => within_link = true,
40 pulldown_cmark::Tag::MetadataBlock { .. } => within_metadata = true,
41 pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
42 ref language,
43 )) => {
44 languages.insert(SharedString::from(language.to_string()));
45 }
46 _ => {}
47 }
48 events.push((range, MarkdownEvent::Start(tag.into())))
49 }
50 pulldown_cmark::Event::End(tag) => {
51 if let pulldown_cmark::TagEnd::Link = tag {
52 within_link = false;
53 }
54 events.push((range, MarkdownEvent::End(tag)));
55 }
56 pulldown_cmark::Event::Text(parsed) => {
57 // `parsed` will share bytes with the input unless a substitution like handling of
58 // HTML entities or smart punctuation has occurred. When these substitutions occur,
59 // `parsed` only consists of the result of a single substitution.
60 if !cow_str_points_inside(&parsed, text) {
61 // Attempt to detect cases where the assumptions here are not valid or the
62 // behavior has changed.
63 if parsed.len() > 4 {
64 log::error!(
65 "Bug in markdown parser. \
66 pulldown_cmark::Event::Text expected to a substituted HTML entity, \
67 but it was longer than expected.\n\
68 Source: {}\n\
69 Parsed: {}",
70 &text[range.clone()],
71 parsed
72 );
73 }
74 events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
75 } else {
76 // Automatically detect links in text if not already within a markdown link.
77 if !within_link {
78 let mut finder = LinkFinder::new();
79 finder.kinds(&[linkify::LinkKind::Url]);
80 let text_range = range.clone();
81 for link in finder.links(&text[text_range.clone()]) {
82 let link_range =
83 text_range.start + link.start()..text_range.start + link.end();
84
85 if link_range.start > range.start {
86 events.push((range.start..link_range.start, MarkdownEvent::Text));
87 }
88
89 events.push((
90 link_range.clone(),
91 MarkdownEvent::Start(MarkdownTag::Link {
92 link_type: LinkType::Autolink,
93 dest_url: SharedString::from(link.as_str().to_string()),
94 title: SharedString::default(),
95 id: SharedString::default(),
96 }),
97 ));
98
99 events.push((link_range.clone(), MarkdownEvent::Text));
100 events.push((
101 link_range.clone(),
102 MarkdownEvent::End(MarkdownTagEnd::Link),
103 ));
104
105 range.start = link_range.end;
106 }
107 }
108 if range.start < range.end {
109 events.push((range, MarkdownEvent::Text));
110 }
111 }
112 }
113 pulldown_cmark::Event::Code(_) => {
114 range.start += 1;
115 range.end -= 1;
116 events.push((range, MarkdownEvent::Code))
117 }
118 pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
119 pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
120 pulldown_cmark::Event::FootnoteReference(_) => {
121 events.push((range, MarkdownEvent::FootnoteReference))
122 }
123 pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
124 pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
125 pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
126 pulldown_cmark::Event::TaskListMarker(checked) => {
127 events.push((range, MarkdownEvent::TaskListMarker(checked)))
128 }
129 pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
130 }
131 }
132 (events, languages)
133}
134
135pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
136 let mut events = Vec::new();
137 let mut finder = LinkFinder::new();
138 finder.kinds(&[linkify::LinkKind::Url]);
139 let mut text_range = Range {
140 start: 0,
141 end: text.len(),
142 };
143 for link in finder.links(text) {
144 let link_range = link.start()..link.end();
145
146 if link_range.start > text_range.start {
147 events.push((text_range.start..link_range.start, MarkdownEvent::Text));
148 }
149
150 events.push((
151 link_range.clone(),
152 MarkdownEvent::Start(MarkdownTag::Link {
153 link_type: LinkType::Autolink,
154 dest_url: SharedString::from(link.as_str().to_string()),
155 title: SharedString::default(),
156 id: SharedString::default(),
157 }),
158 ));
159 events.push((link_range.clone(), MarkdownEvent::Text));
160 events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
161
162 text_range.start = link_range.end;
163 }
164
165 if text_range.end > text_range.start {
166 events.push((text_range, MarkdownEvent::Text));
167 }
168
169 events
170}
171
172/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
173/// parse result for rendering without resorting to unsafe lifetime coercion.
174#[derive(Clone, Debug, PartialEq)]
175pub enum MarkdownEvent {
176 /// Start of a tagged element. Events that are yielded after this event
177 /// and before its corresponding `End` event are inside this element.
178 /// Start and end events are guaranteed to be balanced.
179 Start(MarkdownTag),
180 /// End of a tagged element.
181 End(MarkdownTagEnd),
182 /// Text that uses the associated range from the mardown source.
183 Text,
184 /// Text that differs from the markdown source - typically due to substitution of HTML entities
185 /// and smart punctuation.
186 SubstitutedText(CompactStr),
187 /// An inline code node.
188 Code,
189 /// An HTML node.
190 Html,
191 /// An inline HTML node.
192 InlineHtml,
193 /// A reference to a footnote with given label, which may or may not be defined
194 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
195 /// occur in any order.
196 FootnoteReference,
197 /// A soft line break.
198 SoftBreak,
199 /// A hard line break.
200 HardBreak,
201 /// A horizontal ruler.
202 Rule,
203 /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
204 TaskListMarker(bool),
205}
206
207/// Tags for elements that can contain other elements.
208#[derive(Clone, Debug, PartialEq)]
209pub enum MarkdownTag {
210 /// A paragraph of text and other inline elements.
211 Paragraph,
212
213 /// A heading, with optional identifier, classes and custom attributes.
214 /// The identifier is prefixed with `#` and the last one in the attributes
215 /// list is chosen, classes are prefixed with `.` and custom attributes
216 /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
217 Heading {
218 level: HeadingLevel,
219 id: Option<SharedString>,
220 classes: Vec<SharedString>,
221 /// The first item of the tuple is the attr and second one the value.
222 attrs: Vec<(SharedString, Option<SharedString>)>,
223 },
224
225 BlockQuote,
226
227 /// A code block.
228 CodeBlock(CodeBlockKind),
229
230 /// A HTML block.
231 HtmlBlock,
232
233 /// A list. If the list is ordered the field indicates the number of the first item.
234 /// Contains only list items.
235 List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
236
237 /// A list item.
238 Item,
239
240 /// A footnote definition. The value contained is the footnote's label by which it can
241 /// be referred to.
242 FootnoteDefinition(SharedString),
243
244 /// A table. Contains a vector describing the text-alignment for each of its columns.
245 Table(Vec<Alignment>),
246
247 /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
248 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
249 TableHead,
250
251 /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
252 TableRow,
253 TableCell,
254
255 // span-level tags
256 Emphasis,
257 Strong,
258 Strikethrough,
259
260 /// A link.
261 Link {
262 link_type: LinkType,
263 dest_url: SharedString,
264 title: SharedString,
265 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
266 id: SharedString,
267 },
268
269 /// An image. The first field is the link type, the second the destination URL and the third is a title,
270 /// the fourth is the link identifier.
271 Image {
272 link_type: LinkType,
273 dest_url: SharedString,
274 title: SharedString,
275 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
276 id: SharedString,
277 },
278
279 /// A metadata block.
280 MetadataBlock(MetadataBlockKind),
281
282 DefinitionList,
283 DefinitionListTitle,
284 DefinitionListDefinition,
285}
286
287#[derive(Clone, Debug, PartialEq)]
288pub enum CodeBlockKind {
289 Indented,
290 /// The value contained in the tag describes the language of the code, which may be empty.
291 Fenced(SharedString),
292}
293
294impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
295 fn from(tag: pulldown_cmark::Tag) -> Self {
296 match tag {
297 pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
298 pulldown_cmark::Tag::Heading {
299 level,
300 id,
301 classes,
302 attrs,
303 } => {
304 let id = id.map(|id| SharedString::from(id.into_string()));
305 let classes = classes
306 .into_iter()
307 .map(|c| SharedString::from(c.into_string()))
308 .collect();
309 let attrs = attrs
310 .into_iter()
311 .map(|(key, value)| {
312 (
313 SharedString::from(key.into_string()),
314 value.map(|v| SharedString::from(v.into_string())),
315 )
316 })
317 .collect();
318 MarkdownTag::Heading {
319 level,
320 id,
321 classes,
322 attrs,
323 }
324 }
325 pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
326 pulldown_cmark::Tag::CodeBlock(kind) => match kind {
327 pulldown_cmark::CodeBlockKind::Indented => {
328 MarkdownTag::CodeBlock(CodeBlockKind::Indented)
329 }
330 pulldown_cmark::CodeBlockKind::Fenced(info) => MarkdownTag::CodeBlock(
331 CodeBlockKind::Fenced(SharedString::from(info.into_string())),
332 ),
333 },
334 pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
335 pulldown_cmark::Tag::Item => MarkdownTag::Item,
336 pulldown_cmark::Tag::FootnoteDefinition(label) => {
337 MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
338 }
339 pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
340 pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
341 pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
342 pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
343 pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
344 pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
345 pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
346 pulldown_cmark::Tag::Link {
347 link_type,
348 dest_url,
349 title,
350 id,
351 } => MarkdownTag::Link {
352 link_type,
353 dest_url: SharedString::from(dest_url.into_string()),
354 title: SharedString::from(title.into_string()),
355 id: SharedString::from(id.into_string()),
356 },
357 pulldown_cmark::Tag::Image {
358 link_type,
359 dest_url,
360 title,
361 id,
362 } => MarkdownTag::Image {
363 link_type,
364 dest_url: SharedString::from(dest_url.into_string()),
365 title: SharedString::from(title.into_string()),
366 id: SharedString::from(id.into_string()),
367 },
368 pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
369 pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
370 pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
371 pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
372 pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
373 }
374 }
375}
376
377/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
378/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
379///
380/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
381#[derive(Clone, Debug)]
382pub enum CompactStr {
383 Boxed(Box<str>),
384 Inlined(InlineStr),
385}
386
387impl Deref for CompactStr {
388 type Target = str;
389
390 fn deref(&self) -> &str {
391 match self {
392 CompactStr::Boxed(b) => b,
393 CompactStr::Inlined(i) => i,
394 }
395 }
396}
397
398impl From<&str> for CompactStr {
399 fn from(s: &str) -> Self {
400 if let Ok(inlined) = s.try_into() {
401 CompactStr::Inlined(inlined)
402 } else {
403 CompactStr::Boxed(s.into())
404 }
405 }
406}
407
408impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
409 fn from(cow_str: pulldown_cmark::CowStr) -> Self {
410 match cow_str {
411 pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
412 pulldown_cmark::CowStr::Borrowed(b) => b.into(),
413 pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
414 }
415 }
416}
417
418impl PartialEq for CompactStr {
419 fn eq(&self, other: &Self) -> bool {
420 self.deref() == other.deref()
421 }
422}
423
424fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
425 match substring {
426 pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
427 pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
428 pulldown_cmark::CowStr::Inlined(_) => false,
429 }
430}
431
/// Reports whether `substring` starts at an address inside the buffer of `container`.
///
/// Only the start address of `substring` is examined; its length is not taken into account.
/// A `substring` that starts exactly at the end of `container` (which includes any string
/// compared against an empty `container`) is reported as outside.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // Raw pointers can be compared safely, and the slice supplies its own one-past-the-end
    // pointer, so no `unsafe` pointer arithmetic is needed.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
437
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        // The input must spell out the HTML entities (two non-breaking spaces and a numeric
        // character reference): the expected ranges below are byte offsets into this encoded
        // form, and the entities are what produce the `SubstitutedText` events.
        let source = "&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text";
        // Guards against the entities being decoded in the literal by tooling.
        assert_eq!(source.len(), 51);

        assert_eq!(
            parse_markdown(source),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    (37..44, SubstitutedText("►".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("–".into())),
                    (2..3, Text),
                    (3..6, SubstitutedText("—".into())),
                    (6..7, Text),
                    (7..10, SubstitutedText("…".into())),
                    (10..11, Text),
                    (11..12, SubstitutedText("“".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("”".into())),
                    (26..27, Text),
                    (27..28, SubstitutedText("‘".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("’".into())),
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new()
            )
        )
    }
}