parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{
  8    collections::HashSet,
  9    ops::{Deref, Range},
 10    path::PathBuf,
 11};
 12
 13use crate::path_range::PathRange;
 14
 15const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 16    .union(Options::ENABLE_FOOTNOTES)
 17    .union(Options::ENABLE_STRIKETHROUGH)
 18    .union(Options::ENABLE_TASKLISTS)
 19    .union(Options::ENABLE_SMART_PUNCTUATION)
 20    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 21    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 22    .union(Options::ENABLE_OLD_FOOTNOTES)
 23    .union(Options::ENABLE_GFM);
 24
 25pub fn parse_markdown(
 26    text: &str,
 27) -> (
 28    Vec<(Range<usize>, MarkdownEvent)>,
 29    HashSet<SharedString>,
 30    HashSet<PathBuf>,
 31) {
 32    let mut events = Vec::new();
 33    let mut language_names = HashSet::new();
 34    let mut language_paths = HashSet::new();
 35    let mut within_link = false;
 36    let mut within_metadata = false;
 37    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 38        if within_metadata {
 39            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 40                pulldown_event
 41            {
 42                within_metadata = false;
 43            }
 44            continue;
 45        }
 46        match pulldown_event {
 47            pulldown_cmark::Event::Start(tag) => {
 48                let tag = match tag {
 49                    pulldown_cmark::Tag::Link {
 50                        link_type,
 51                        dest_url,
 52                        title,
 53                        id,
 54                    } => {
 55                        within_link = true;
 56                        MarkdownTag::Link {
 57                            link_type,
 58                            dest_url: SharedString::from(dest_url.into_string()),
 59                            title: SharedString::from(title.into_string()),
 60                            id: SharedString::from(id.into_string()),
 61                        }
 62                    }
 63                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 64                        within_metadata = true;
 65                        MarkdownTag::MetadataBlock(kind)
 66                    }
 67                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 68                        ref info,
 69                    )) => {
 70                        let info = info.trim();
 71                        MarkdownTag::CodeBlock(if info.is_empty() {
 72                            CodeBlockKind::Fenced
 73                            // Languages should never contain a slash, and PathRanges always should.
 74                            // (Models are told to specify them relative to a workspace root.)
 75                        } else if info.contains('/') {
 76                            let path_range = PathRange::new(info);
 77                            language_paths.insert(path_range.path.clone());
 78                            CodeBlockKind::FencedSrc(path_range)
 79                        } else {
 80                            let language = SharedString::from(info.to_string());
 81                            language_names.insert(language.clone());
 82                            CodeBlockKind::FencedLang(language)
 83                        })
 84                    }
 85                    tag => tag.into(),
 86                };
 87                events.push((range, MarkdownEvent::Start(tag)))
 88            }
 89            pulldown_cmark::Event::End(tag) => {
 90                if let pulldown_cmark::TagEnd::Link = tag {
 91                    within_link = false;
 92                }
 93                events.push((range, MarkdownEvent::End(tag)));
 94            }
 95            pulldown_cmark::Event::Text(parsed) => {
 96                // `parsed` will share bytes with the input unless a substitution like handling of
 97                // HTML entities or smart punctuation has occurred. When these substitutions occur,
 98                // `parsed` only consists of the result of a single substitution.
 99                if !cow_str_points_inside(&parsed, text) {
100                    // Attempt to detect cases where the assumptions here are not valid or the
101                    // behavior has changed.
102                    if parsed.len() > 4 {
103                        log::error!(
104                            "Bug in markdown parser. \
105                            pulldown_cmark::Event::Text expected to a substituted HTML entity, \
106                            but it was longer than expected.\n\
107                            Source: {}\n\
108                            Parsed: {}",
109                            &text[range.clone()],
110                            parsed
111                        );
112                    }
113                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
114                } else {
115                    // Automatically detect links in text if not already within a markdown link.
116                    if !within_link {
117                        let mut finder = LinkFinder::new();
118                        finder.kinds(&[linkify::LinkKind::Url]);
119                        let text_range = range.clone();
120                        for link in finder.links(&text[text_range.clone()]) {
121                            let link_range =
122                                text_range.start + link.start()..text_range.start + link.end();
123
124                            if link_range.start > range.start {
125                                events.push((range.start..link_range.start, MarkdownEvent::Text));
126                            }
127
128                            events.push((
129                                link_range.clone(),
130                                MarkdownEvent::Start(MarkdownTag::Link {
131                                    link_type: LinkType::Autolink,
132                                    dest_url: SharedString::from(link.as_str().to_string()),
133                                    title: SharedString::default(),
134                                    id: SharedString::default(),
135                                }),
136                            ));
137
138                            events.push((link_range.clone(), MarkdownEvent::Text));
139                            events.push((
140                                link_range.clone(),
141                                MarkdownEvent::End(MarkdownTagEnd::Link),
142                            ));
143
144                            range.start = link_range.end;
145                        }
146                    }
147                    if range.start < range.end {
148                        events.push((range, MarkdownEvent::Text));
149                    }
150                }
151            }
152            pulldown_cmark::Event::Code(_) => {
153                range.start += 1;
154                range.end -= 1;
155                events.push((range, MarkdownEvent::Code))
156            }
157            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
158            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
159            pulldown_cmark::Event::FootnoteReference(_) => {
160                events.push((range, MarkdownEvent::FootnoteReference))
161            }
162            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
163            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
164            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
165            pulldown_cmark::Event::TaskListMarker(checked) => {
166                events.push((range, MarkdownEvent::TaskListMarker(checked)))
167            }
168            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
169        }
170    }
171    (events, language_names, language_paths)
172}
173
174pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
175    let mut events = Vec::new();
176    let mut finder = LinkFinder::new();
177    finder.kinds(&[linkify::LinkKind::Url]);
178    let mut text_range = Range {
179        start: 0,
180        end: text.len(),
181    };
182    for link in finder.links(text) {
183        let link_range = link.start()..link.end();
184
185        if link_range.start > text_range.start {
186            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
187        }
188
189        events.push((
190            link_range.clone(),
191            MarkdownEvent::Start(MarkdownTag::Link {
192                link_type: LinkType::Autolink,
193                dest_url: SharedString::from(link.as_str().to_string()),
194                title: SharedString::default(),
195                id: SharedString::default(),
196            }),
197        ));
198        events.push((link_range.clone(), MarkdownEvent::Text));
199        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
200
201        text_range.start = link_range.end;
202    }
203
204    if text_range.end > text_range.start {
205        events.push((text_range, MarkdownEvent::Text));
206    }
207
208    events
209}
210
211/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
212/// parse result for rendering without resorting to unsafe lifetime coercion.
213#[derive(Clone, Debug, PartialEq)]
214pub enum MarkdownEvent {
215    /// Start of a tagged element. Events that are yielded after this event
216    /// and before its corresponding `End` event are inside this element.
217    /// Start and end events are guaranteed to be balanced.
218    Start(MarkdownTag),
219    /// End of a tagged element.
220    End(MarkdownTagEnd),
221    /// Text that uses the associated range from the mardown source.
222    Text,
223    /// Text that differs from the markdown source - typically due to substitution of HTML entities
224    /// and smart punctuation.
225    SubstitutedText(CompactStr),
226    /// An inline code node.
227    Code,
228    /// An HTML node.
229    Html,
230    /// An inline HTML node.
231    InlineHtml,
232    /// A reference to a footnote with given label, which may or may not be defined
233    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
234    /// occur in any order.
235    FootnoteReference,
236    /// A soft line break.
237    SoftBreak,
238    /// A hard line break.
239    HardBreak,
240    /// A horizontal ruler.
241    Rule,
242    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
243    TaskListMarker(bool),
244}
245
246/// Tags for elements that can contain other elements.
247#[derive(Clone, Debug, PartialEq)]
248pub enum MarkdownTag {
249    /// A paragraph of text and other inline elements.
250    Paragraph,
251
252    /// A heading, with optional identifier, classes and custom attributes.
253    /// The identifier is prefixed with `#` and the last one in the attributes
254    /// list is chosen, classes are prefixed with `.` and custom attributes
255    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
256    Heading {
257        level: HeadingLevel,
258        id: Option<SharedString>,
259        classes: Vec<SharedString>,
260        /// The first item of the tuple is the attr and second one the value.
261        attrs: Vec<(SharedString, Option<SharedString>)>,
262    },
263
264    BlockQuote,
265
266    /// A code block.
267    CodeBlock(CodeBlockKind),
268
269    /// A HTML block.
270    HtmlBlock,
271
272    /// A list. If the list is ordered the field indicates the number of the first item.
273    /// Contains only list items.
274    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
275
276    /// A list item.
277    Item,
278
279    /// A footnote definition. The value contained is the footnote's label by which it can
280    /// be referred to.
281    FootnoteDefinition(SharedString),
282
283    /// A table. Contains a vector describing the text-alignment for each of its columns.
284    Table(Vec<Alignment>),
285
286    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
287    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
288    TableHead,
289
290    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
291    TableRow,
292    TableCell,
293
294    // span-level tags
295    Emphasis,
296    Strong,
297    Strikethrough,
298
299    /// A link.
300    Link {
301        link_type: LinkType,
302        dest_url: SharedString,
303        title: SharedString,
304        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
305        id: SharedString,
306    },
307
308    /// An image. The first field is the link type, the second the destination URL and the third is a title,
309    /// the fourth is the link identifier.
310    Image {
311        link_type: LinkType,
312        dest_url: SharedString,
313        title: SharedString,
314        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
315        id: SharedString,
316    },
317
318    /// A metadata block.
319    MetadataBlock(MetadataBlockKind),
320
321    DefinitionList,
322    DefinitionListTitle,
323    DefinitionListDefinition,
324}
325
326#[derive(Clone, Debug, PartialEq)]
327pub enum CodeBlockKind {
328    Indented,
329    /// "Fenced" means "surrounded by triple backticks."
330    /// There can optionally be either a language after the backticks (like in traditional Markdown)
331    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
332    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
333    Fenced,
334    FencedLang(SharedString),
335    FencedSrc(PathRange),
336}
337
338impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
339    fn from(tag: pulldown_cmark::Tag) -> Self {
340        match tag {
341            pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
342            pulldown_cmark::Tag::Heading {
343                level,
344                id,
345                classes,
346                attrs,
347            } => {
348                let id = id.map(|id| SharedString::from(id.into_string()));
349                let classes = classes
350                    .into_iter()
351                    .map(|c| SharedString::from(c.into_string()))
352                    .collect();
353                let attrs = attrs
354                    .into_iter()
355                    .map(|(key, value)| {
356                        (
357                            SharedString::from(key.into_string()),
358                            value.map(|v| SharedString::from(v.into_string())),
359                        )
360                    })
361                    .collect();
362                MarkdownTag::Heading {
363                    level,
364                    id,
365                    classes,
366                    attrs,
367                }
368            }
369            pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
370            pulldown_cmark::Tag::CodeBlock(kind) => match kind {
371                pulldown_cmark::CodeBlockKind::Indented => {
372                    MarkdownTag::CodeBlock(CodeBlockKind::Indented)
373                }
374                pulldown_cmark::CodeBlockKind::Fenced(info) => {
375                    let info = info.trim();
376                    MarkdownTag::CodeBlock(if info.is_empty() {
377                        CodeBlockKind::Fenced
378                    } else if info.contains('/') {
379                        // Languages should never contain a slash, and PathRanges always should.
380                        // (Models are told to specify them relative to a workspace root.)
381                        CodeBlockKind::FencedSrc(PathRange::new(info))
382                    } else {
383                        CodeBlockKind::FencedLang(SharedString::from(info.to_string()))
384                    })
385                }
386            },
387            pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
388            pulldown_cmark::Tag::Item => MarkdownTag::Item,
389            pulldown_cmark::Tag::FootnoteDefinition(label) => {
390                MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
391            }
392            pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
393            pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
394            pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
395            pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
396            pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
397            pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
398            pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
399            pulldown_cmark::Tag::Link {
400                link_type,
401                dest_url,
402                title,
403                id,
404            } => MarkdownTag::Link {
405                link_type,
406                dest_url: SharedString::from(dest_url.into_string()),
407                title: SharedString::from(title.into_string()),
408                id: SharedString::from(id.into_string()),
409            },
410            pulldown_cmark::Tag::Image {
411                link_type,
412                dest_url,
413                title,
414                id,
415            } => MarkdownTag::Image {
416                link_type,
417                dest_url: SharedString::from(dest_url.into_string()),
418                title: SharedString::from(title.into_string()),
419                id: SharedString::from(id.into_string()),
420            },
421            pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
422            pulldown_cmark::Tag::MetadataBlock(kind) => MarkdownTag::MetadataBlock(kind),
423            pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
424            pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
425            pulldown_cmark::Tag::DefinitionListDefinition => MarkdownTag::DefinitionListDefinition,
426        }
427    }
428}
429
430/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
431/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
432///
433/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
434#[derive(Clone, Debug)]
435pub enum CompactStr {
436    Boxed(Box<str>),
437    Inlined(InlineStr),
438}
439
440impl Deref for CompactStr {
441    type Target = str;
442
443    fn deref(&self) -> &str {
444        match self {
445            CompactStr::Boxed(b) => b,
446            CompactStr::Inlined(i) => i,
447        }
448    }
449}
450
451impl From<&str> for CompactStr {
452    fn from(s: &str) -> Self {
453        if let Ok(inlined) = s.try_into() {
454            CompactStr::Inlined(inlined)
455        } else {
456            CompactStr::Boxed(s.into())
457        }
458    }
459}
460
461impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
462    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
463        match cow_str {
464            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
465            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
466            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
467        }
468    }
469}
470
471impl PartialEq for CompactStr {
472    fn eq(&self, other: &Self) -> bool {
473        self.deref() == other.deref()
474    }
475}
476
477fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
478    match substring {
479        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
480        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
481        pulldown_cmark::CowStr::Inlined(_) => false,
482    }
483}
484
/// Returns whether `substring` starts at an address that lies within the bytes of `container`.
///
/// Only the start address of `substring` is examined; its length is not taken into account.
/// The check is against the half-open range from `container`'s first byte up to, but not
/// including, its one-past-the-end address, so an empty slice taken at the very end of
/// `container` is reported as outside.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // `as_ptr_range` yields the same start and one-past-the-end pointers that the previous
    // `unsafe` pointer arithmetic computed, without needing `unsafe`.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
490
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are deliberately left out of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    // The expected substitutions below are written as `\u{...}` escapes so that the non-ASCII
    // characters cannot be lost to an encoding problem.

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
            (
                vec![
                    (0..42, Start(Paragraph)),
                    (0..2, SubstitutedText("\u{2013}".into())), // en dash
                    (2..3, Text),
                    (3..6, SubstitutedText("\u{2014}".into())), // em dash
                    (6..7, Text),
                    (7..10, SubstitutedText("\u{2026}".into())), // ellipsis
                    (10..11, Text),
                    (11..12, SubstitutedText("\u{201c}".into())), // left double quote
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())), // right double quote
                    (26..27, Text),
                    (27..28, SubstitutedText("\u{2018}".into())), // left single quote
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())), // right single quote
                    (0..42, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }
}