parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{
  8    collections::HashSet,
  9    ops::{Deref, Range},
 10    path::Path,
 11    sync::Arc,
 12};
 13
 14use crate::path_range::PathWithRange;
 15
/// The `pulldown_cmark` extensions that `parse_markdown` enables for every parse.
///
/// The tests in this file assert that each option known to `pulldown_cmark` is either part of
/// this set or of their `UNWANTED_OPTIONS` set, so that newly added options get evaluated.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
 25
 26pub fn parse_markdown(
 27    text: &str,
 28) -> (
 29    Vec<(Range<usize>, MarkdownEvent)>,
 30    HashSet<SharedString>,
 31    HashSet<Arc<Path>>,
 32) {
 33    let mut events = Vec::new();
 34    let mut language_names = HashSet::new();
 35    let mut language_paths = HashSet::new();
 36    let mut within_link = false;
 37    let mut within_metadata = false;
 38    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 39        if within_metadata {
 40            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 41                pulldown_event
 42            {
 43                within_metadata = false;
 44            }
 45            continue;
 46        }
 47        match pulldown_event {
 48            pulldown_cmark::Event::Start(tag) => {
 49                let tag = match tag {
 50                    pulldown_cmark::Tag::Link {
 51                        link_type,
 52                        dest_url,
 53                        title,
 54                        id,
 55                    } => {
 56                        within_link = true;
 57                        MarkdownTag::Link {
 58                            link_type,
 59                            dest_url: SharedString::from(dest_url.into_string()),
 60                            title: SharedString::from(title.into_string()),
 61                            id: SharedString::from(id.into_string()),
 62                        }
 63                    }
 64                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 65                        within_metadata = true;
 66                        MarkdownTag::MetadataBlock(kind)
 67                    }
 68                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 69                        ref info,
 70                    )) => {
 71                        let info = info.trim();
 72                        MarkdownTag::CodeBlock(if info.is_empty() {
 73                            CodeBlockKind::Fenced
 74                            // Languages should never contain a slash, and PathRanges always should.
 75                            // (Models are told to specify them relative to a workspace root.)
 76                        } else if info.contains('/') {
 77                            let path_range = PathWithRange::new(info);
 78                            language_paths.insert(path_range.path.clone());
 79                            CodeBlockKind::FencedSrc(path_range)
 80                        } else {
 81                            let language = SharedString::from(info.to_string());
 82                            language_names.insert(language.clone());
 83                            CodeBlockKind::FencedLang(language)
 84                        })
 85                    }
 86                    tag => tag.into(),
 87                };
 88                events.push((range, MarkdownEvent::Start(tag)))
 89            }
 90            pulldown_cmark::Event::End(tag) => {
 91                if let pulldown_cmark::TagEnd::Link = tag {
 92                    within_link = false;
 93                }
 94                events.push((range, MarkdownEvent::End(tag)));
 95            }
 96            pulldown_cmark::Event::Text(parsed) => {
 97                // `parsed` will share bytes with the input unless a substitution like handling of
 98                // HTML entities or smart punctuation has occurred. When these substitutions occur,
 99                // `parsed` only consists of the result of a single substitution.
100                if !cow_str_points_inside(&parsed, text) {
101                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
102                } else {
103                    // Automatically detect links in text if not already within a markdown link.
104                    if !within_link {
105                        let mut finder = LinkFinder::new();
106                        finder.kinds(&[linkify::LinkKind::Url]);
107                        let text_range = range.clone();
108                        for link in finder.links(&text[text_range.clone()]) {
109                            let link_range =
110                                text_range.start + link.start()..text_range.start + link.end();
111
112                            if link_range.start > range.start {
113                                events.push((range.start..link_range.start, MarkdownEvent::Text));
114                            }
115
116                            events.push((
117                                link_range.clone(),
118                                MarkdownEvent::Start(MarkdownTag::Link {
119                                    link_type: LinkType::Autolink,
120                                    dest_url: SharedString::from(link.as_str().to_string()),
121                                    title: SharedString::default(),
122                                    id: SharedString::default(),
123                                }),
124                            ));
125
126                            events.push((link_range.clone(), MarkdownEvent::Text));
127                            events.push((
128                                link_range.clone(),
129                                MarkdownEvent::End(MarkdownTagEnd::Link),
130                            ));
131
132                            range.start = link_range.end;
133                        }
134                    }
135                    if range.start < range.end {
136                        events.push((range, MarkdownEvent::Text));
137                    }
138                }
139            }
140            pulldown_cmark::Event::Code(_) => {
141                range.start += 1;
142                range.end -= 1;
143                events.push((range, MarkdownEvent::Code))
144            }
145            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
146            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
147            pulldown_cmark::Event::FootnoteReference(_) => {
148                events.push((range, MarkdownEvent::FootnoteReference))
149            }
150            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
151            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
152            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
153            pulldown_cmark::Event::TaskListMarker(checked) => {
154                events.push((range, MarkdownEvent::TaskListMarker(checked)))
155            }
156            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
157        }
158    }
159    (events, language_names, language_paths)
160}
161
162pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
163    let mut events = Vec::new();
164    let mut finder = LinkFinder::new();
165    finder.kinds(&[linkify::LinkKind::Url]);
166    let mut text_range = Range {
167        start: 0,
168        end: text.len(),
169    };
170    for link in finder.links(text) {
171        let link_range = link.start()..link.end();
172
173        if link_range.start > text_range.start {
174            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
175        }
176
177        events.push((
178            link_range.clone(),
179            MarkdownEvent::Start(MarkdownTag::Link {
180                link_type: LinkType::Autolink,
181                dest_url: SharedString::from(link.as_str().to_string()),
182                title: SharedString::default(),
183                id: SharedString::default(),
184            }),
185        ));
186        events.push((link_range.clone(), MarkdownEvent::Text));
187        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
188
189        text_range.start = link_range.end;
190    }
191
192    if text_range.end > text_range.start {
193        events.push((text_range, MarkdownEvent::Text));
194    }
195
196    events
197}
198
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no text of their own: the parse functions in this file pair every
/// event with the byte range of the source it relates to.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are meant to be balanced; note that `parse_markdown` emits the
    /// start of a metadata block but skips the block's contents together with its end event.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(CompactStr),
    /// An inline code node. `parse_markdown` narrows the associated range so that it leaves
    /// out the enclosing backticks.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote, which may or may not be defined by an event with a
    /// `MarkdownTag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order. The footnote's label is not stored in this variant.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains `true` when it is checked.
    TaskListMarker(bool),
}
233
/// Tags for elements that can contain other elements.
///
/// This mirrors `pulldown_cmark::Tag`, with every borrowed string replaced by an owned
/// `SharedString` (see the `From<pulldown_cmark::Tag>` implementation).
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote. The quote kind reported by `pulldown_cmark` is discarded on conversion.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows and body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table header or table row.
    TableCell,

    // span-level tags
    /// Emphasized inline content.
    Emphasis,
    /// Strongly emphasized inline content.
    Strong,
    /// Struck-through inline content.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image, described by its link type, destination URL, title and link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    // Definition-list tags, converted one to one from the `pulldown_cmark` tags of the same
    // names. NOTE(review): the tests list the definition-list option among the options this
    // parser does not enable, so `parse_markdown` presumably never yields these - confirm.
    DefinitionList,
    DefinitionListTitle,
    DefinitionListDefinition,
}
313
/// How a code block was written and what, if anything, its info string says.
///
/// "Fenced" means "surrounded by triple backticks."
/// There can optionally be either a language after the backticks (like in traditional Markdown)
/// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
/// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block that is not fenced (`pulldown_cmark`'s indented kind).
    Indented,
    /// A fenced code block whose info string is empty after trimming.
    Fenced,
    /// A fenced code block whose trimmed info string contains no `/`; the string is taken to
    /// be a language name.
    FencedLang(SharedString),
    /// A fenced code block whose trimmed info string contains a `/`; the string is parsed
    /// with `PathWithRange::new`.
    FencedSrc(PathWithRange),
}
325
326impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
327    fn from(tag: pulldown_cmark::Tag) -> Self {
328        match tag {
329            pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
330            pulldown_cmark::Tag::Heading {
331                level,
332                id,
333                classes,
334                attrs,
335            } => {
336                let id = id.map(|id| SharedString::from(id.into_string()));
337                let classes = classes
338                    .into_iter()
339                    .map(|c| SharedString::from(c.into_string()))
340                    .collect();
341                let attrs = attrs
342                    .into_iter()
343                    .map(|(key, value)| {
344                        (
345                            SharedString::from(key.into_string()),
346                            value.map(|v| SharedString::from(v.into_string())),
347                        )
348                    })
349                    .collect();
350                MarkdownTag::Heading {
351                    level,
352                    id,
353                    classes,
354                    attrs,
355                }
356            }
357            pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
358            pulldown_cmark::Tag::CodeBlock(kind) => match kind {
359                pulldown_cmark::CodeBlockKind::Indented => {
360                    MarkdownTag::CodeBlock(CodeBlockKind::Indented)
361                }
362                pulldown_cmark::CodeBlockKind::Fenced(info) => {
363                    let info = info.trim();
364                    MarkdownTag::CodeBlock(if info.is_empty() {
365                        CodeBlockKind::Fenced
366                    } else if info.contains('/') {
367                        // Languages should never contain a slash, and PathRanges always should.
368                        // (Models are told to specify them relative to a workspace root.)
369                        CodeBlockKind::FencedSrc(PathWithRange::new(info))
370                    } else {
371                        CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
372                    })
373                }
374            },
375            pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
376            pulldown_cmark::Tag::Item => MarkdownTag::Item,
377            pulldown_cmark::Tag::FootnoteDefinition(label) => {
378                MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
379            }
380            pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
381            pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
382            pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
383            pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
384            pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
385            pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
386            pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
387            pulldown_cmark::Tag::Link {
388                link_type,
389                dest_url,
390                title,
391                id,
392            } => MarkdownTag::Link {
393                link_type,
394                dest_url: SharedString::from(dest_url.into_string()),
395                title: SharedString::from(title.into_string()),
396                id: SharedString::from(id.into_string()),
397            },
398            pulldown_cmark::Tag::Image {
399                link_type,
400                dest_url,
401                title,
402                id,
403            } => MarkdownTag::Image {
404                link_type,
405                dest_url: SharedString::from(dest_url.into_string()),
406                title: SharedString::from(title.into_string()),
407                id: SharedString::from(id.into_string()),
408            },
409            pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
410            pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
411            pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
412            pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
413            pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
414        }
415    }
416}
417
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
#[derive(Clone)]
pub enum CompactStr {
    /// A heap-allocated string; used when the text cannot be converted to an `InlineStr`.
    Boxed(Box<str>),
    /// A string stored inline in a `pulldown_cmark::InlineStr`.
    Inlined(InlineStr),
}
427
428impl std::fmt::Debug for CompactStr {
429    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
430        self.deref().fmt(formatter)
431    }
432}
433
434impl Deref for CompactStr {
435    type Target = str;
436
437    fn deref(&self) -> &str {
438        match self {
439            CompactStr::Boxed(b) => b,
440            CompactStr::Inlined(i) => i,
441        }
442    }
443}
444
445impl From<&str> for CompactStr {
446    fn from(s: &str) -> Self {
447        if let Ok(inlined) = s.try_into() {
448            CompactStr::Inlined(inlined)
449        } else {
450            CompactStr::Boxed(s.into())
451        }
452    }
453}
454
455impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
456    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
457        match cow_str {
458            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
459            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
460            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
461        }
462    }
463}
464
465impl PartialEq for CompactStr {
466    fn eq(&self, other: &Self) -> bool {
467        self.deref() == other.deref()
468    }
469}
470
471fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
472    match substring {
473        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
474        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
475        pulldown_cmark::CowStr::Inlined(_) => false,
476    }
477}
478
/// Reports whether `substring` starts at an address inside `container`'s buffer.
///
/// Only the start address of `substring` is examined, and the address one past the end of
/// `container` does not count as inside. String contents are never compared: an equal string
/// stored elsewhere yields `false`.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // Comparing raw pointers is safe; no pointer arithmetic or dereference is needed.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
484
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // right single quote
                    (42..43, Text),
                    // Ten hyphens become five en dashes.
                    (
                        43..53,
                        SubstitutedText("\u{2013}\u{2013}\u{2013}\u{2013}\u{2013}".into())
                    ),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}