parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{
  8    collections::HashSet,
  9    ops::{Deref, Range},
 10    path::Path,
 11    sync::Arc,
 12};
 13
 14use crate::path_range::PathWithRange;
 15
/// The set of `pulldown_cmark` extensions that `parse_markdown` enables.
///
/// The flags are combined with `union` so the whole expression can be
/// evaluated in a `const` context. Options that are deliberately left out are
/// listed in `UNWANTED_OPTIONS` in this file's test module, which also checks
/// that every option `pulldown_cmark` knows about is in exactly one of the two
/// sets.
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
    .union(Options::ENABLE_FOOTNOTES)
    .union(Options::ENABLE_STRIKETHROUGH)
    .union(Options::ENABLE_TASKLISTS)
    .union(Options::ENABLE_SMART_PUNCTUATION)
    .union(Options::ENABLE_HEADING_ATTRIBUTES)
    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
    .union(Options::ENABLE_OLD_FOOTNOTES)
    .union(Options::ENABLE_GFM);
 25
 26pub fn parse_markdown(
 27    text: &str,
 28) -> (
 29    Vec<(Range<usize>, MarkdownEvent)>,
 30    HashSet<SharedString>,
 31    HashSet<Arc<Path>>,
 32) {
 33    let mut events = Vec::new();
 34    let mut language_names = HashSet::new();
 35    let mut language_paths = HashSet::new();
 36    let mut within_link = false;
 37    let mut within_metadata = false;
 38    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 39        if within_metadata {
 40            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 41                pulldown_event
 42            {
 43                within_metadata = false;
 44            }
 45            continue;
 46        }
 47        match pulldown_event {
 48            pulldown_cmark::Event::Start(tag) => {
 49                let tag = match tag {
 50                    pulldown_cmark::Tag::Link {
 51                        link_type,
 52                        dest_url,
 53                        title,
 54                        id,
 55                    } => {
 56                        within_link = true;
 57                        MarkdownTag::Link {
 58                            link_type,
 59                            dest_url: SharedString::from(dest_url.into_string()),
 60                            title: SharedString::from(title.into_string()),
 61                            id: SharedString::from(id.into_string()),
 62                        }
 63                    }
 64                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 65                        within_metadata = true;
 66                        MarkdownTag::MetadataBlock(kind)
 67                    }
 68                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 69                        ref info,
 70                    )) => {
 71                        let info = info.trim();
 72                        MarkdownTag::CodeBlock(if info.is_empty() {
 73                            CodeBlockKind::Fenced
 74                            // Languages should never contain a slash, and PathRanges always should.
 75                            // (Models are told to specify them relative to a workspace root.)
 76                        } else if info.contains('/') {
 77                            let path_range = PathWithRange::new(info);
 78                            language_paths.insert(path_range.path.clone());
 79                            CodeBlockKind::FencedSrc(path_range)
 80                        } else {
 81                            let language = SharedString::from(info.to_string());
 82                            language_names.insert(language.clone());
 83                            CodeBlockKind::FencedLang(language)
 84                        })
 85                    }
 86                    tag => tag.into(),
 87                };
 88                events.push((range, MarkdownEvent::Start(tag)))
 89            }
 90            pulldown_cmark::Event::End(tag) => {
 91                if let pulldown_cmark::TagEnd::Link = tag {
 92                    within_link = false;
 93                }
 94                events.push((range, MarkdownEvent::End(tag)));
 95            }
 96            pulldown_cmark::Event::Text(parsed) => {
 97                // `parsed` will share bytes with the input unless a substitution like handling of
 98                // HTML entities or smart punctuation has occurred. When these substitutions occur,
 99                // `parsed` only consists of the result of a single substitution.
100                if !cow_str_points_inside(&parsed, text) {
101                    // Attempt to detect cases where the assumptions here are not valid or the
102                    // behavior has changed.
103                    if parsed.len() > 4 {
104                        log::error!(
105                            "Bug in markdown parser. \
106                            pulldown_cmark::Event::Text expected to a substituted HTML entity, \
107                            but it was longer than expected.\n\
108                            Source: {}\n\
109                            Parsed: {}",
110                            &text[range.clone()],
111                            parsed
112                        );
113                    }
114                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
115                } else {
116                    // Automatically detect links in text if not already within a markdown link.
117                    if !within_link {
118                        let mut finder = LinkFinder::new();
119                        finder.kinds(&[linkify::LinkKind::Url]);
120                        let text_range = range.clone();
121                        for link in finder.links(&text[text_range.clone()]) {
122                            let link_range =
123                                text_range.start + link.start()..text_range.start + link.end();
124
125                            if link_range.start > range.start {
126                                events.push((range.start..link_range.start, MarkdownEvent::Text));
127                            }
128
129                            events.push((
130                                link_range.clone(),
131                                MarkdownEvent::Start(MarkdownTag::Link {
132                                    link_type: LinkType::Autolink,
133                                    dest_url: SharedString::from(link.as_str().to_string()),
134                                    title: SharedString::default(),
135                                    id: SharedString::default(),
136                                }),
137                            ));
138
139                            events.push((link_range.clone(), MarkdownEvent::Text));
140                            events.push((
141                                link_range.clone(),
142                                MarkdownEvent::End(MarkdownTagEnd::Link),
143                            ));
144
145                            range.start = link_range.end;
146                        }
147                    }
148                    if range.start < range.end {
149                        events.push((range, MarkdownEvent::Text));
150                    }
151                }
152            }
153            pulldown_cmark::Event::Code(_) => {
154                range.start += 1;
155                range.end -= 1;
156                events.push((range, MarkdownEvent::Code))
157            }
158            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
159            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
160            pulldown_cmark::Event::FootnoteReference(_) => {
161                events.push((range, MarkdownEvent::FootnoteReference))
162            }
163            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
164            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
165            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
166            pulldown_cmark::Event::TaskListMarker(checked) => {
167                events.push((range, MarkdownEvent::TaskListMarker(checked)))
168            }
169            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
170        }
171    }
172    (events, language_names, language_paths)
173}
174
175pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
176    let mut events = Vec::new();
177    let mut finder = LinkFinder::new();
178    finder.kinds(&[linkify::LinkKind::Url]);
179    let mut text_range = Range {
180        start: 0,
181        end: text.len(),
182    };
183    for link in finder.links(text) {
184        let link_range = link.start()..link.end();
185
186        if link_range.start > text_range.start {
187            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
188        }
189
190        events.push((
191            link_range.clone(),
192            MarkdownEvent::Start(MarkdownTag::Link {
193                link_type: LinkType::Autolink,
194                dest_url: SharedString::from(link.as_str().to_string()),
195                title: SharedString::default(),
196                id: SharedString::default(),
197            }),
198        ));
199        events.push((link_range.clone(), MarkdownEvent::Text));
200        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
201
202        text_range.start = link_range.end;
203    }
204
205    if text_range.end > text_range.start {
206        events.push((text_range, MarkdownEvent::Text));
207    }
208
209    events
210}
211
/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
/// parse result for rendering without resorting to unsafe lifetime coercion.
///
/// Most variants carry no payload: the text they refer to is recovered from
/// the source range that accompanies each event.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownEvent {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(MarkdownTag),
    /// End of a tagged element.
    End(MarkdownTagEnd),
    /// Text that uses the associated range from the markdown source.
    Text,
    /// Text that differs from the markdown source - typically due to substitution of HTML entities
    /// and smart punctuation.
    SubstitutedText(CompactStr),
    /// An inline code node.
    Code,
    /// An HTML node.
    Html,
    /// An inline HTML node.
    InlineHtml,
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference,
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
}
246
/// Tags for elements that can contain other elements.
///
/// This is the owned counterpart of `pulldown_cmark::Tag`: borrowed strings
/// are replaced by `SharedString`s.
#[derive(Clone, Debug, PartialEq)]
pub enum MarkdownTag {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading, with optional identifier, classes and custom attributes.
    /// The identifier is prefixed with `#` and the last one in the attributes
    /// list is chosen, classes are prefixed with `.` and custom attributes
    /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
    Heading {
        level: HeadingLevel,
        id: Option<SharedString>,
        classes: Vec<SharedString>,
        /// The first item of the tuple is the attr and second one the value.
        attrs: Vec<(SharedString, Option<SharedString>)>,
    },

    /// A block quote.
    BlockQuote,

    /// A code block.
    CodeBlock(CodeBlockKind),

    /// A HTML block.
    HtmlBlock,

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)

    /// A list item.
    Item,

    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(SharedString),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),

    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,

    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A single cell of a table row or table header.
    TableCell,

    // span-level tags
    /// Emphasized text.
    Emphasis,
    /// Strongly emphasized text.
    Strong,
    /// Struck-through text.
    Strikethrough,

    /// A link.
    Link {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// An image. The first field is the link type, the second the destination URL and the third is a title,
    /// the fourth is the link identifier.
    Image {
        link_type: LinkType,
        dest_url: SharedString,
        title: SharedString,
        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
        id: SharedString,
    },

    /// A metadata block.
    MetadataBlock(MetadataBlockKind),

    /// A definition list.
    DefinitionList,
    /// The title (term) of an entry in a definition list.
    DefinitionListTitle,
    /// The definition of an entry in a definition list.
    DefinitionListDefinition,
}
326
/// The flavor of a code block, derived from how it is delimited and from the
/// info string (if any) that follows the opening fence.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind {
    /// A code block that is not fenced (maps from `pulldown_cmark::CodeBlockKind::Indented`).
    Indented,
    /// "Fenced" means "surrounded by triple backticks."
    /// There can optionally be either a language after the backticks (like in traditional Markdown)
    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
    ///
    /// This variant is used when the info string is empty after trimming.
    Fenced,
    /// A fenced block whose info string is non-empty and contains no `/`;
    /// the trimmed info string is stored as the language name.
    FencedLang(SharedString),
    /// A fenced block whose info string contains a `/` and is therefore
    /// interpreted as a path, optionally with a range.
    FencedSrc(PathWithRange),
}
338
339impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
340    fn from(tag: pulldown_cmark::Tag) -> Self {
341        match tag {
342            pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
343            pulldown_cmark::Tag::Heading {
344                level,
345                id,
346                classes,
347                attrs,
348            } => {
349                let id = id.map(|id| SharedString::from(id.into_string()));
350                let classes = classes
351                    .into_iter()
352                    .map(|c| SharedString::from(c.into_string()))
353                    .collect();
354                let attrs = attrs
355                    .into_iter()
356                    .map(|(key, value)| {
357                        (
358                            SharedString::from(key.into_string()),
359                            value.map(|v| SharedString::from(v.into_string())),
360                        )
361                    })
362                    .collect();
363                MarkdownTag::Heading {
364                    level,
365                    id,
366                    classes,
367                    attrs,
368                }
369            }
370            pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
371            pulldown_cmark::Tag::CodeBlock(kind) => match kind {
372                pulldown_cmark::CodeBlockKind::Indented => {
373                    MarkdownTag::CodeBlock(CodeBlockKind::Indented)
374                }
375                pulldown_cmark::CodeBlockKind::Fenced(info) => {
376                    let info = info.trim();
377                    MarkdownTag::CodeBlock(if info.is_empty() {
378                        CodeBlockKind::Fenced
379                    } else if info.contains('/') {
380                        // Languages should never contain a slash, and PathRanges always should.
381                        // (Models are told to specify them relative to a workspace root.)
382                        CodeBlockKind::FencedSrc(PathWithRange::new(info))
383                    } else {
384                        CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
385                    })
386                }
387            },
388            pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
389            pulldown_cmark::Tag::Item => MarkdownTag::Item,
390            pulldown_cmark::Tag::FootnoteDefinition(label) => {
391                MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
392            }
393            pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
394            pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
395            pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
396            pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
397            pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
398            pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
399            pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
400            pulldown_cmark::Tag::Link {
401                link_type,
402                dest_url,
403                title,
404                id,
405            } => MarkdownTag::Link {
406                link_type,
407                dest_url: SharedString::from(dest_url.into_string()),
408                title: SharedString::from(title.into_string()),
409                id: SharedString::from(id.into_string()),
410            },
411            pulldown_cmark::Tag::Image {
412                link_type,
413                dest_url,
414                title,
415                id,
416            } => MarkdownTag::Image {
417                link_type,
418                dest_url: SharedString::from(dest_url.into_string()),
419                title: SharedString::from(title.into_string()),
420                id: SharedString::from(id.into_string()),
421            },
422            pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
423            pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
424            pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
425            pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
426            pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
427        }
428    }
429}
430
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
///
/// Same as `pulldown_cmark::CowStr` but without the `Borrowed` case.
#[derive(Clone, Debug)]
pub enum CompactStr {
    /// A heap-allocated string.
    Boxed(Box<str>),
    /// A short string stored inline, without a heap allocation.
    Inlined(InlineStr),
}
440
441impl Deref for CompactStr {
442    type Target = str;
443
444    fn deref(&self) -> &str {
445        match self {
446            CompactStr::Boxed(b) => b,
447            CompactStr::Inlined(i) => i,
448        }
449    }
450}
451
452impl From<&str> for CompactStr {
453    fn from(s: &str) -> Self {
454        if let Ok(inlined) = s.try_into() {
455            CompactStr::Inlined(inlined)
456        } else {
457            CompactStr::Boxed(s.into())
458        }
459    }
460}
461
462impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
463    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
464        match cow_str {
465            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
466            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
467            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
468        }
469    }
470}
471
472impl PartialEq for CompactStr {
473    fn eq(&self, other: &Self) -> bool {
474        self.deref() == other.deref()
475    }
476}
477
478fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
479    match substring {
480        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
481        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
482        pulldown_cmark::CowStr::Inlined(_) => false,
483    }
484}
485
/// Returns `true` when the first byte of `substring` is located within the
/// memory occupied by `container`.
///
/// Only the start address is compared; the contents and the length of
/// `substring` are not inspected. A pointer equal to the one-past-the-end
/// address of `container` (e.g. an empty slice at its very end) does not
/// count as inside, and an empty `container` contains nothing.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // `as_ptr_range` yields `start..one_past_end`, so the half-open `contains`
    // check needs no `unsafe` pointer arithmetic.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
491
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are intentionally not enabled in `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // right single quote
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}