parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{Alignment, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser};
  5use std::{collections::HashSet, ops::Range};
  6
  7const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
  8    .union(Options::ENABLE_FOOTNOTES)
  9    .union(Options::ENABLE_STRIKETHROUGH)
 10    .union(Options::ENABLE_TASKLISTS)
 11    .union(Options::ENABLE_SMART_PUNCTUATION)
 12    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 13    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 14    .union(Options::ENABLE_OLD_FOOTNOTES)
 15    .union(Options::ENABLE_GFM);
 16
 17pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSet<SharedString>) {
 18    let mut events = Vec::new();
 19    let mut languages = HashSet::new();
 20    let mut within_link = false;
 21    let mut within_metadata = false;
 22    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 23        if within_metadata {
 24            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 25                pulldown_event
 26            {
 27                within_metadata = false;
 28            }
 29            continue;
 30        }
 31        match pulldown_event {
 32            pulldown_cmark::Event::Start(tag) => {
 33                match tag {
 34                    pulldown_cmark::Tag::Link { .. } => within_link = true,
 35                    pulldown_cmark::Tag::MetadataBlock { .. } => within_metadata = true,
 36                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 37                        ref language,
 38                    )) => {
 39                        languages.insert(SharedString::from(language.to_string()));
 40                    }
 41                    _ => {}
 42                }
 43                events.push((range, MarkdownEvent::Start(tag.into())))
 44            }
 45            pulldown_cmark::Event::End(tag) => {
 46                if let pulldown_cmark::TagEnd::Link = tag {
 47                    within_link = false;
 48                }
 49                events.push((range, MarkdownEvent::End(tag)));
 50            }
 51            pulldown_cmark::Event::Text(parsed) => {
 52                // Automatically detect links in text if we're not already within a markdown
 53                // link.
 54                let mut parsed = parsed.as_ref();
 55                if !within_link {
 56                    let mut finder = LinkFinder::new();
 57                    finder.kinds(&[linkify::LinkKind::Url]);
 58                    let text_range = range.clone();
 59                    for link in finder.links(&text[text_range.clone()]) {
 60                        let link_range =
 61                            text_range.start + link.start()..text_range.start + link.end();
 62
 63                        if link_range.start > range.start {
 64                            let (text, tail) = parsed.split_at(link_range.start - range.start);
 65                            events.push((
 66                                range.start..link_range.start,
 67                                MarkdownEvent::Text(SharedString::new(text)),
 68                            ));
 69                            parsed = tail;
 70                        }
 71
 72                        events.push((
 73                            link_range.clone(),
 74                            MarkdownEvent::Start(MarkdownTag::Link {
 75                                link_type: LinkType::Autolink,
 76                                dest_url: SharedString::from(link.as_str().to_string()),
 77                                title: SharedString::default(),
 78                                id: SharedString::default(),
 79                            }),
 80                        ));
 81
 82                        let (link_text, tail) = parsed.split_at(link_range.end - link_range.start);
 83                        events.push((
 84                            link_range.clone(),
 85                            MarkdownEvent::Text(SharedString::new(link_text)),
 86                        ));
 87                        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
 88
 89                        range.start = link_range.end;
 90                        parsed = tail;
 91                    }
 92                }
 93                if range.start < range.end {
 94                    events.push((range, MarkdownEvent::Text(SharedString::new(parsed))));
 95                }
 96            }
 97            pulldown_cmark::Event::Code(_) => {
 98                range.start += 1;
 99                range.end -= 1;
100                events.push((range, MarkdownEvent::Code))
101            }
102            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
103            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
104            pulldown_cmark::Event::FootnoteReference(_) => {
105                events.push((range, MarkdownEvent::FootnoteReference))
106            }
107            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
108            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
109            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
110            pulldown_cmark::Event::TaskListMarker(checked) => {
111                events.push((range, MarkdownEvent::TaskListMarker(checked)))
112            }
113            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
114        }
115    }
116    (events, languages)
117}
118
119pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
120    let mut events = Vec::new();
121    let mut finder = LinkFinder::new();
122    finder.kinds(&[linkify::LinkKind::Url]);
123    let mut text_range = Range {
124        start: 0,
125        end: text.len(),
126    };
127    for link in finder.links(text) {
128        let link_range = link.start()..link.end();
129
130        if link_range.start > text_range.start {
131            let (head, tail) = text.split_at(link_range.start - text_range.start);
132            events.push((
133                text_range.start..link_range.start,
134                MarkdownEvent::Text(SharedString::new(head)),
135            ));
136            text = tail;
137        }
138
139        let (link_text, tail) = text.split_at(link_range.end - link_range.start);
140        events.push((
141            link_range.clone(),
142            MarkdownEvent::Start(MarkdownTag::Link {
143                link_type: LinkType::Autolink,
144                dest_url: SharedString::from(link.as_str().to_string()),
145                title: SharedString::default(),
146                id: SharedString::default(),
147            }),
148        ));
149        events.push((
150            link_range.clone(),
151            MarkdownEvent::Text(SharedString::new(link_text)),
152        ));
153        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
154
155        text_range.start = link_range.end;
156        text = tail;
157    }
158
159    if text_range.end > text_range.start {
160        events.push((text_range, MarkdownEvent::Text(SharedString::new(text))));
161    }
162
163    events
164}
165
166/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
167/// parse result for rendering without resorting to unsafe lifetime coercion.
168#[derive(Clone, Debug, PartialEq)]
169pub enum MarkdownEvent {
170    /// Start of a tagged element. Events that are yielded after this event
171    /// and before its corresponding `End` event are inside this element.
172    /// Start and end events are guaranteed to be balanced.
173    Start(MarkdownTag),
174    /// End of a tagged element.
175    End(MarkdownTagEnd),
176    /// A text node.
177    Text(SharedString),
178    /// An inline code node.
179    Code,
180    /// An HTML node.
181    Html,
182    /// An inline HTML node.
183    InlineHtml,
184    /// A reference to a footnote with given label, which may or may not be defined
185    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
186    /// occur in any order.
187    FootnoteReference,
188    /// A soft line break.
189    SoftBreak,
190    /// A hard line break.
191    HardBreak,
192    /// A horizontal ruler.
193    Rule,
194    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
195    TaskListMarker(bool),
196}
197
198/// Tags for elements that can contain other elements.
199#[derive(Clone, Debug, PartialEq)]
200pub enum MarkdownTag {
201    /// A paragraph of text and other inline elements.
202    Paragraph,
203
204    /// A heading, with optional identifier, classes and custom attributes.
205    /// The identifier is prefixed with `#` and the last one in the attributes
206    /// list is chosen, classes are prefixed with `.` and custom attributes
207    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
208    Heading {
209        level: HeadingLevel,
210        id: Option<SharedString>,
211        classes: Vec<SharedString>,
212        /// The first item of the tuple is the attr and second one the value.
213        attrs: Vec<(SharedString, Option<SharedString>)>,
214    },
215
216    BlockQuote,
217
218    /// A code block.
219    CodeBlock(CodeBlockKind),
220
221    /// A HTML block.
222    HtmlBlock,
223
224    /// A list. If the list is ordered the field indicates the number of the first item.
225    /// Contains only list items.
226    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
227
228    /// A list item.
229    Item,
230
231    /// A footnote definition. The value contained is the footnote's label by which it can
232    /// be referred to.
233    FootnoteDefinition(SharedString),
234
235    /// A table. Contains a vector describing the text-alignment for each of its columns.
236    Table(Vec<Alignment>),
237
238    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
239    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
240    TableHead,
241
242    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
243    TableRow,
244    TableCell,
245
246    // span-level tags
247    Emphasis,
248    Strong,
249    Strikethrough,
250
251    /// A link.
252    Link {
253        link_type: LinkType,
254        dest_url: SharedString,
255        title: SharedString,
256        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
257        id: SharedString,
258    },
259
260    /// An image. The first field is the link type, the second the destination URL and the third is a title,
261    /// the fourth is the link identifier.
262    Image {
263        link_type: LinkType,
264        dest_url: SharedString,
265        title: SharedString,
266        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
267        id: SharedString,
268    },
269
270    /// A metadata block.
271    MetadataBlock(MetadataBlockKind),
272
273    DefinitionList,
274    DefinitionListTitle,
275    DefinitionListDefinition,
276}
277
278#[derive(Clone, Debug, PartialEq)]
279pub enum CodeBlockKind {
280    Indented,
281    /// The value contained in the tag describes the language of the code, which may be empty.
282    Fenced(SharedString),
283}
284
285impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
286    fn from(tag: pulldown_cmark::Tag) -> Self {
287        match tag {
288            pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
289            pulldown_cmark::Tag::Heading {
290                level,
291                id,
292                classes,
293                attrs,
294            } => {
295                let id = id.map(|id| SharedString::from(id.into_string()));
296                let classes = classes
297                    .into_iter()
298                    .map(|c| SharedString::from(c.into_string()))
299                    .collect();
300                let attrs = attrs
301                    .into_iter()
302                    .map(|(key, value)| {
303                        (
304                            SharedString::from(key.into_string()),
305                            value.map(|v| SharedString::from(v.into_string())),
306                        )
307                    })
308                    .collect();
309                MarkdownTag::Heading {
310                    level,
311                    id,
312                    classes,
313                    attrs,
314                }
315            }
316            pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
317            pulldown_cmark::Tag::CodeBlock(kind) => match kind {
318                pulldown_cmark::CodeBlockKind::Indented => {
319                    MarkdownTag::CodeBlock(CodeBlockKind::Indented)
320                }
321                pulldown_cmark::CodeBlockKind::Fenced(info) => MarkdownTag::CodeBlock(
322                    CodeBlockKind::Fenced(SharedString::from(info.into_string())),
323                ),
324            },
325            pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
326            pulldown_cmark::Tag::Item => MarkdownTag::Item,
327            pulldown_cmark::Tag::FootnoteDefinition(label) => {
328                MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
329            }
330            pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
331            pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
332            pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
333            pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
334            pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
335            pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
336            pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
337            pulldown_cmark::Tag::Link {
338                link_type,
339                dest_url,
340                title,
341                id,
342            } => MarkdownTag::Link {
343                link_type,
344                dest_url: SharedString::from(dest_url.into_string()),
345                title: SharedString::from(title.into_string()),
346                id: SharedString::from(id.into_string()),
347            },
348            pulldown_cmark::Tag::Image {
349                link_type,
350                dest_url,
351                title,
352                id,
353            } => MarkdownTag::Image {
354                link_type,
355                dest_url: SharedString::from(dest_url.into_string()),
356                title: SharedString::from(title.into_string()),
357                id: SharedString::from(id.into_string()),
358            },
359            pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
360            pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
361            pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
362            pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
363            pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
364        }
365    }
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Options that are intentionally not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_DEFINITION_LIST
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_YAML_STYLE_METADATA_BLOCKS);

    #[test]
    fn all_options_considered() {
        // Fails as soon as pulldown_cmark gains an option that is listed in neither
        // set, so that the new option gets evaluated for inclusion.
        let considered = PARSE_OPTIONS | UNWANTED_OPTIONS;
        assert_eq!(considered, Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        // An option must not be both enabled and declared unwanted.
        let overlap = PARSE_OPTIONS & UNWANTED_OPTIONS;
        assert_eq!(overlap, Options::empty());
    }
}