parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{
  8    collections::HashSet,
  9    ops::{Deref, Range},
 10};
 11
/// The set of `pulldown_cmark` extensions enabled whenever markdown is parsed by this module.
///
/// The options that are deliberately left disabled are listed in `UNWANTED_OPTIONS` in the test
/// module, where a test asserts that the two sets together cover every option `pulldown_cmark`
/// offers.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
 21
 22pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSet<SharedString>) {
 23    let mut events = Vec::new();
 24    let mut languages = HashSet::new();
 25    let mut within_link = false;
 26    let mut within_metadata = false;
 27    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 28        if within_metadata {
 29            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 30                pulldown_event
 31            {
 32                within_metadata = false;
 33            }
 34            continue;
 35        }
 36        match pulldown_event {
 37            pulldown_cmark::Event::Start(tag) => {
 38                match tag {
 39                    pulldown_cmark::Tag::Link { .. } => within_link = true,
 40                    pulldown_cmark::Tag::MetadataBlock { .. } => within_metadata = true,
 41                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 42                        ref language,
 43                    )) => {
 44                        languages.insert(SharedString::from(language.to_string()));
 45                    }
 46                    _ => {}
 47                }
 48                events.push((range, MarkdownEvent::Start(tag.into())))
 49            }
 50            pulldown_cmark::Event::End(tag) => {
 51                if let pulldown_cmark::TagEnd::Link = tag {
 52                    within_link = false;
 53                }
 54                events.push((range, MarkdownEvent::End(tag)));
 55            }
 56            pulldown_cmark::Event::Text(parsed) => {
 57                // `parsed` will share bytes with the input unless a substitution like handling of
 58                // HTML entities or smart punctuation has occurred. When these substitutions occur,
 59                // `parsed` only consists of the result of a single substitution.
 60                if !cow_str_points_inside(&parsed, text) {
 61                    // Attempt to detect cases where the assumptions here are not valid or the
 62                    // behavior has changed.
 63                    if parsed.len() > 4 {
 64                        log::error!(
 65                            "Bug in markdown parser. \
 66                            pulldown_cmark::Event::Text expected to a substituted HTML entity, \
 67                            but it was longer than expected.\n\
 68                            Source: {}\n\
 69                            Parsed: {}",
 70                            &text[range.clone()],
 71                            parsed
 72                        );
 73                    }
 74                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
 75                } else {
 76                    // Automatically detect links in text if not already within a markdown link.
 77                    if !within_link {
 78                        let mut finder = LinkFinder::new();
 79                        finder.kinds(&[linkify::LinkKind::Url]);
 80                        let text_range = range.clone();
 81                        for link in finder.links(&text[text_range.clone()]) {
 82                            let link_range =
 83                                text_range.start + link.start()..text_range.start + link.end();
 84
 85                            if link_range.start > range.start {
 86                                events.push((range.start..link_range.start, MarkdownEvent::Text));
 87                            }
 88
 89                            events.push((
 90                                link_range.clone(),
 91                                MarkdownEvent::Start(MarkdownTag::Link {
 92                                    link_type: LinkType::Autolink,
 93                                    dest_url: SharedString::from(link.as_str().to_string()),
 94                                    title: SharedString::default(),
 95                                    id: SharedString::default(),
 96                                }),
 97                            ));
 98
 99                            events.push((link_range.clone(), MarkdownEvent::Text));
100                            events.push((
101                                link_range.clone(),
102                                MarkdownEvent::End(MarkdownTagEnd::Link),
103                            ));
104
105                            range.start = link_range.end;
106                        }
107                    }
108                    if range.start < range.end {
109                        events.push((range, MarkdownEvent::Text));
110                    }
111                }
112            }
113            pulldown_cmark::Event::Code(_) => {
114                range.start += 1;
115                range.end -= 1;
116                events.push((range, MarkdownEvent::Code))
117            }
118            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
119            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
120            pulldown_cmark::Event::FootnoteReference(_) => {
121                events.push((range, MarkdownEvent::FootnoteReference))
122            }
123            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
124            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
125            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
126            pulldown_cmark::Event::TaskListMarker(checked) => {
127                events.push((range, MarkdownEvent::TaskListMarker(checked)))
128            }
129            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
130        }
131    }
132    (events, languages)
133}
134
135pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
136    let mut events = Vec::new();
137    let mut finder = LinkFinder::new();
138    finder.kinds(&[linkify::LinkKind::Url]);
139    let mut text_range = Range {
140        start: 0,
141        end: text.len(),
142    };
143    for link in finder.links(text) {
144        let link_range = link.start()..link.end();
145
146        if link_range.start > text_range.start {
147            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
148        }
149
150        events.push((
151            link_range.clone(),
152            MarkdownEvent::Start(MarkdownTag::Link {
153                link_type: LinkType::Autolink,
154                dest_url: SharedString::from(link.as_str().to_string()),
155                title: SharedString::default(),
156                id: SharedString::default(),
157            }),
158        ));
159        events.push((link_range.clone(), MarkdownEvent::Text));
160        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
161
162        text_range.start = link_range.end;
163    }
164
165    if text_range.end > text_range.start {
166        events.push((text_range, MarkdownEvent::Text));
167    }
168
169    events
170}
171
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no payload: the parse functions in this file pair every event with the
/// byte range of the source it refers to, and the content is read from there.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(CompactStr),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
}
206
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings are replaced by
/// `SharedString`s.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The kind carried by `pulldown_cmark::Tag::BlockQuote` is not retained.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A table cell.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image, described by its link type, destination URL, title and link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of an entry in a definition list.
    DefinitionListTitle,
    /// A definition belonging to an entry in a definition list.
    DefinitionListDefinition,
}
286
/// The kind of a code block: owned counterpart of `pulldown_cmark::CodeBlockKind`.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block without a fence, and therefore without an info string.
    Indented,
    /// The value contained in the tag describes the language of the code, which may be empty.
    Fenced(SharedString),
}
293
294impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
295    fn from(tag: pulldown_cmark::Tag) -> Self {
296        match tag {
297            pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
298            pulldown_cmark::Tag::Heading {
299                level,
300                id,
301                classes,
302                attrs,
303            } => {
304                let id = id.map(|id| SharedString::from(id.into_string()));
305                let classes = classes
306                    .into_iter()
307                    .map(|c| SharedString::from(c.into_string()))
308                    .collect();
309                let attrs = attrs
310                    .into_iter()
311                    .map(|(key, value)| {
312                        (
313                            SharedString::from(key.into_string()),
314                            value.map(|v| SharedString::from(v.into_string())),
315                        )
316                    })
317                    .collect();
318                MarkdownTag::Heading {
319                    level,
320                    id,
321                    classes,
322                    attrs,
323                }
324            }
325            pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
326            pulldown_cmark::Tag::CodeBlock(kind) => match kind {
327                pulldown_cmark::CodeBlockKind::Indented => {
328                    MarkdownTag::CodeBlock(CodeBlockKind::Indented)
329                }
330                pulldown_cmark::CodeBlockKind::Fenced(info) => MarkdownTag::CodeBlock(
331                    CodeBlockKind::Fenced(SharedString::from(info.into_string())),
332                ),
333            },
334            pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
335            pulldown_cmark::Tag::Item => MarkdownTag::Item,
336            pulldown_cmark::Tag::FootnoteDefinition(label) => {
337                MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
338            }
339            pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
340            pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
341            pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
342            pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
343            pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
344            pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
345            pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
346            pulldown_cmark::Tag::Link {
347                link_type,
348                dest_url,
349                title,
350                id,
351            } => MarkdownTag::Link {
352                link_type,
353                dest_url: SharedString::from(dest_url.into_string()),
354                title: SharedString::from(title.into_string()),
355                id: SharedString::from(id.into_string()),
356            },
357            pulldown_cmark::Tag::Image {
358                link_type,
359                dest_url,
360                title,
361                id,
362            } => MarkdownTag::Image {
363                link_type,
364                dest_url: SharedString::from(dest_url.into_string()),
365                title: SharedString::from(title.into_string()),
366                id: SharedString::from(id.into_string()),
367            },
368            pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
369            pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
370            pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
371            pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
372            pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
373        }
374    }
375}
376
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrowed` case.
#[derive(Clone, Debug)]
pub enum CompactStr {
    /// A heap-allocated string.
    Boxed(Box<str>),
    /// A string stored inline in a `pulldown_cmark::InlineStr`.
    Inlined(InlineStr),
}
386
387impl Deref for CompactStr {
388    type Target = str;
389
390    fn deref(&self) -> &str {
391        match self {
392            CompactStr::Boxed(b) => b,
393            CompactStr::Inlined(i) => i,
394        }
395    }
396}
397
398impl From<&str> for CompactStr {
399    fn from(s: &str) -> Self {
400        if let Ok(inlined) = s.try_into() {
401            CompactStr::Inlined(inlined)
402        } else {
403            CompactStr::Boxed(s.into())
404        }
405    }
406}
407
408impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
409    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
410        match cow_str {
411            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
412            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
413            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
414        }
415    }
416}
417
418impl PartialEq for CompactStr {
419    fn eq(&self, other: &Self) -> bool {
420        self.deref() == other.deref()
421    }
422}
423
424fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
425    match substring {
426        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
427        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
428        pulldown_cmark::CowStr::Inlined(_) => false,
429    }
430}
431
/// Reports whether `substring` starts at an address within `container`'s bytes.
///
/// Only the start pointer of `substring` is checked, against the half-open address range
/// `[start, start + len)` of `container`; the contents of the strings are never compared.
fn str_points_inside(substring: &str, container: &str) -> bool {
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
437
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options offered by `pulldown_cmark` that are deliberately not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // right single quote
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new()
            )
        )
    }
}