parser.rs

  1use gpui::SharedString;
  2use linkify::LinkFinder;
  3pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
  4use pulldown_cmark::{
  5    Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
  6};
  7use std::{
  8    collections::HashSet,
  9    ops::{Deref, Range},
 10    path::Path,
 11    sync::Arc,
 12};
 13
 14use crate::path_range::PathWithRange;
 15
 16const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
 17    .union(Options::ENABLE_FOOTNOTES)
 18    .union(Options::ENABLE_STRIKETHROUGH)
 19    .union(Options::ENABLE_TASKLISTS)
 20    .union(Options::ENABLE_SMART_PUNCTUATION)
 21    .union(Options::ENABLE_HEADING_ATTRIBUTES)
 22    .union(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS)
 23    .union(Options::ENABLE_OLD_FOOTNOTES)
 24    .union(Options::ENABLE_GFM);
 25
 26pub fn parse_markdown(
 27    text: &str,
 28) -> (
 29    Vec<(Range<usize>, MarkdownEvent)>,
 30    HashSet<SharedString>,
 31    HashSet<Arc<Path>>,
 32) {
 33    let mut events = Vec::new();
 34    let mut language_names = HashSet::new();
 35    let mut language_paths = HashSet::new();
 36    let mut within_link = false;
 37    let mut within_metadata = false;
 38    for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
 39        if within_metadata {
 40            if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
 41                pulldown_event
 42            {
 43                within_metadata = false;
 44            }
 45            continue;
 46        }
 47        match pulldown_event {
 48            pulldown_cmark::Event::Start(tag) => {
 49                let tag = match tag {
 50                    pulldown_cmark::Tag::Link {
 51                        link_type,
 52                        dest_url,
 53                        title,
 54                        id,
 55                    } => {
 56                        within_link = true;
 57                        MarkdownTag::Link {
 58                            link_type,
 59                            dest_url: SharedString::from(dest_url.into_string()),
 60                            title: SharedString::from(title.into_string()),
 61                            id: SharedString::from(id.into_string()),
 62                        }
 63                    }
 64                    pulldown_cmark::Tag::MetadataBlock(kind) => {
 65                        within_metadata = true;
 66                        MarkdownTag::MetadataBlock(kind)
 67                    }
 68                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Indented) => {
 69                        MarkdownTag::CodeBlock {
 70                            kind: CodeBlockKind::Indented,
 71                            metadata: CodeBlockMetadata {
 72                                content_range: range.start + 1..range.end + 1,
 73                                line_count: 1,
 74                            },
 75                        }
 76                    }
 77                    pulldown_cmark::Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(
 78                        ref info,
 79                    )) => {
 80                        let content_range = extract_code_block_content_range(&text[range.clone()]);
 81                        let content_range =
 82                            content_range.start + range.start..content_range.end + range.start;
 83
 84                        let line_count = text[content_range.clone()]
 85                            .bytes()
 86                            .filter(|c| *c == b'\n')
 87                            .count();
 88                        let metadata = CodeBlockMetadata {
 89                            content_range,
 90                            line_count,
 91                        };
 92
 93                        let info = info.trim();
 94                        let kind = if info.is_empty() {
 95                            CodeBlockKind::Fenced
 96                            // Languages should never contain a slash, and PathRanges always should.
 97                            // (Models are told to specify them relative to a workspace root.)
 98                        } else if info.contains('/') {
 99                            let path_range = PathWithRange::new(info);
100                            language_paths.insert(path_range.path.clone());
101                            CodeBlockKind::FencedSrc(path_range)
102                        } else {
103                            let language = SharedString::from(info.to_string());
104                            language_names.insert(language.clone());
105                            CodeBlockKind::FencedLang(language)
106                        };
107
108                        MarkdownTag::CodeBlock { kind, metadata }
109                    }
110                    pulldown_cmark::Tag::Paragraph => MarkdownTag::Paragraph,
111                    pulldown_cmark::Tag::Heading {
112                        level,
113                        id,
114                        classes,
115                        attrs,
116                    } => {
117                        let id = id.map(|id| SharedString::from(id.into_string()));
118                        let classes = classes
119                            .into_iter()
120                            .map(|c| SharedString::from(c.into_string()))
121                            .collect();
122                        let attrs = attrs
123                            .into_iter()
124                            .map(|(key, value)| {
125                                (
126                                    SharedString::from(key.into_string()),
127                                    value.map(|v| SharedString::from(v.into_string())),
128                                )
129                            })
130                            .collect();
131                        MarkdownTag::Heading {
132                            level,
133                            id,
134                            classes,
135                            attrs,
136                        }
137                    }
138                    pulldown_cmark::Tag::BlockQuote(_kind) => MarkdownTag::BlockQuote,
139                    pulldown_cmark::Tag::List(start_number) => MarkdownTag::List(start_number),
140                    pulldown_cmark::Tag::Item => MarkdownTag::Item,
141                    pulldown_cmark::Tag::FootnoteDefinition(label) => {
142                        MarkdownTag::FootnoteDefinition(SharedString::from(label.to_string()))
143                    }
144                    pulldown_cmark::Tag::Table(alignments) => MarkdownTag::Table(alignments),
145                    pulldown_cmark::Tag::TableHead => MarkdownTag::TableHead,
146                    pulldown_cmark::Tag::TableRow => MarkdownTag::TableRow,
147                    pulldown_cmark::Tag::TableCell => MarkdownTag::TableCell,
148                    pulldown_cmark::Tag::Emphasis => MarkdownTag::Emphasis,
149                    pulldown_cmark::Tag::Strong => MarkdownTag::Strong,
150                    pulldown_cmark::Tag::Strikethrough => MarkdownTag::Strikethrough,
151                    pulldown_cmark::Tag::Image {
152                        link_type,
153                        dest_url,
154                        title,
155                        id,
156                    } => MarkdownTag::Image {
157                        link_type,
158                        dest_url: SharedString::from(dest_url.into_string()),
159                        title: SharedString::from(title.into_string()),
160                        id: SharedString::from(id.into_string()),
161                    },
162                    pulldown_cmark::Tag::HtmlBlock => MarkdownTag::HtmlBlock,
163                    pulldown_cmark::Tag::DefinitionList => MarkdownTag::DefinitionList,
164                    pulldown_cmark::Tag::DefinitionListTitle => MarkdownTag::DefinitionListTitle,
165                    pulldown_cmark::Tag::DefinitionListDefinition => {
166                        MarkdownTag::DefinitionListDefinition
167                    }
168                };
169                events.push((range, MarkdownEvent::Start(tag)))
170            }
171            pulldown_cmark::Event::End(tag) => {
172                if let pulldown_cmark::TagEnd::Link = tag {
173                    within_link = false;
174                }
175                events.push((range, MarkdownEvent::End(tag)));
176            }
177            pulldown_cmark::Event::Text(parsed) => {
178                // `parsed` will share bytes with the input unless a substitution like handling of
179                // HTML entities or smart punctuation has occurred. When these substitutions occur,
180                // `parsed` only consists of the result of a single substitution.
181                if !cow_str_points_inside(&parsed, text) {
182                    events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
183                } else {
184                    // Automatically detect links in text if not already within a markdown link.
185                    if !within_link {
186                        let mut finder = LinkFinder::new();
187                        finder.kinds(&[linkify::LinkKind::Url]);
188                        let text_range = range.clone();
189                        for link in finder.links(&text[text_range.clone()]) {
190                            let link_range =
191                                text_range.start + link.start()..text_range.start + link.end();
192
193                            if link_range.start > range.start {
194                                events.push((range.start..link_range.start, MarkdownEvent::Text));
195                            }
196
197                            events.push((
198                                link_range.clone(),
199                                MarkdownEvent::Start(MarkdownTag::Link {
200                                    link_type: LinkType::Autolink,
201                                    dest_url: SharedString::from(link.as_str().to_string()),
202                                    title: SharedString::default(),
203                                    id: SharedString::default(),
204                                }),
205                            ));
206
207                            events.push((link_range.clone(), MarkdownEvent::Text));
208                            events.push((
209                                link_range.clone(),
210                                MarkdownEvent::End(MarkdownTagEnd::Link),
211                            ));
212
213                            range.start = link_range.end;
214                        }
215                    }
216                    if range.start < range.end {
217                        events.push((range, MarkdownEvent::Text));
218                    }
219                }
220            }
221            pulldown_cmark::Event::Code(_) => {
222                range.start += 1;
223                range.end -= 1;
224                events.push((range, MarkdownEvent::Code))
225            }
226            pulldown_cmark::Event::Html(_) => events.push((range, MarkdownEvent::Html)),
227            pulldown_cmark::Event::InlineHtml(_) => events.push((range, MarkdownEvent::InlineHtml)),
228            pulldown_cmark::Event::FootnoteReference(_) => {
229                events.push((range, MarkdownEvent::FootnoteReference))
230            }
231            pulldown_cmark::Event::SoftBreak => events.push((range, MarkdownEvent::SoftBreak)),
232            pulldown_cmark::Event::HardBreak => events.push((range, MarkdownEvent::HardBreak)),
233            pulldown_cmark::Event::Rule => events.push((range, MarkdownEvent::Rule)),
234            pulldown_cmark::Event::TaskListMarker(checked) => {
235                events.push((range, MarkdownEvent::TaskListMarker(checked)))
236            }
237            pulldown_cmark::Event::InlineMath(_) | pulldown_cmark::Event::DisplayMath(_) => {}
238        }
239    }
240    (events, language_names, language_paths)
241}
242
243pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
244    let mut events = Vec::new();
245    let mut finder = LinkFinder::new();
246    finder.kinds(&[linkify::LinkKind::Url]);
247    let mut text_range = Range {
248        start: 0,
249        end: text.len(),
250    };
251    for link in finder.links(text) {
252        let link_range = link.start()..link.end();
253
254        if link_range.start > text_range.start {
255            events.push((text_range.start..link_range.start, MarkdownEvent::Text));
256        }
257
258        events.push((
259            link_range.clone(),
260            MarkdownEvent::Start(MarkdownTag::Link {
261                link_type: LinkType::Autolink,
262                dest_url: SharedString::from(link.as_str().to_string()),
263                title: SharedString::default(),
264                id: SharedString::default(),
265            }),
266        ));
267        events.push((link_range.clone(), MarkdownEvent::Text));
268        events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
269
270        text_range.start = link_range.end;
271    }
272
273    if text_range.end > text_range.start {
274        events.push((text_range, MarkdownEvent::Text));
275    }
276
277    events
278}
279
280/// A static-lifetime equivalent of pulldown_cmark::Event so we can cache the
281/// parse result for rendering without resorting to unsafe lifetime coercion.
282#[derive(Clone, Debug, PartialEq)]
283pub enum MarkdownEvent {
284    /// Start of a tagged element. Events that are yielded after this event
285    /// and before its corresponding `End` event are inside this element.
286    /// Start and end events are guaranteed to be balanced.
287    Start(MarkdownTag),
288    /// End of a tagged element.
289    End(MarkdownTagEnd),
290    /// Text that uses the associated range from the markdown source.
291    Text,
292    /// Text that differs from the markdown source - typically due to substitution of HTML entities
293    /// and smart punctuation.
294    SubstitutedText(CompactStr),
295    /// An inline code node.
296    Code,
297    /// An HTML node.
298    Html,
299    /// An inline HTML node.
300    InlineHtml,
301    /// A reference to a footnote with given label, which may or may not be defined
302    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
303    /// occur in any order.
304    FootnoteReference,
305    /// A soft line break.
306    SoftBreak,
307    /// A hard line break.
308    HardBreak,
309    /// A horizontal ruler.
310    Rule,
311    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
312    TaskListMarker(bool),
313}
314
315/// Tags for elements that can contain other elements.
316#[derive(Clone, Debug, PartialEq)]
317pub enum MarkdownTag {
318    /// A paragraph of text and other inline elements.
319    Paragraph,
320
321    /// A heading, with optional identifier, classes and custom attributes.
322    /// The identifier is prefixed with `#` and the last one in the attributes
323    /// list is chosen, classes are prefixed with `.` and custom attributes
324    /// have no prefix and can optionally have a value (`myattr` o `myattr=myvalue`).
325    Heading {
326        level: HeadingLevel,
327        id: Option<SharedString>,
328        classes: Vec<SharedString>,
329        /// The first item of the tuple is the attr and second one the value.
330        attrs: Vec<(SharedString, Option<SharedString>)>,
331    },
332
333    BlockQuote,
334
335    /// A code block.
336    CodeBlock {
337        kind: CodeBlockKind,
338        metadata: CodeBlockMetadata,
339    },
340
341    /// A HTML block.
342    HtmlBlock,
343
344    /// A list. If the list is ordered the field indicates the number of the first item.
345    /// Contains only list items.
346    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
347
348    /// A list item.
349    Item,
350
351    /// A footnote definition. The value contained is the footnote's label by which it can
352    /// be referred to.
353    FootnoteDefinition(SharedString),
354
355    /// A table. Contains a vector describing the text-alignment for each of its columns.
356    Table(Vec<Alignment>),
357
358    /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
359    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
360    TableHead,
361
362    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
363    TableRow,
364    TableCell,
365
366    // span-level tags
367    Emphasis,
368    Strong,
369    Strikethrough,
370
371    /// A link.
372    Link {
373        link_type: LinkType,
374        dest_url: SharedString,
375        title: SharedString,
376        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
377        id: SharedString,
378    },
379
380    /// An image. The first field is the link type, the second the destination URL and the third is a title,
381    /// the fourth is the link identifier.
382    Image {
383        link_type: LinkType,
384        dest_url: SharedString,
385        title: SharedString,
386        /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
387        id: SharedString,
388    },
389
390    /// A metadata block.
391    MetadataBlock(MetadataBlockKind),
392
393    DefinitionList,
394    DefinitionListTitle,
395    DefinitionListDefinition,
396}
397
398#[derive(Clone, Debug, PartialEq)]
399pub enum CodeBlockKind {
400    Indented,
401    /// "Fenced" means "surrounded by triple backticks."
402    /// There can optionally be either a language after the backticks (like in traditional Markdown)
403    /// or, if an agent is specifying a path for a source location in the project, it can be a PathRange,
404    /// e.g. ```path/to/foo.rs#L123-456 instead of ```rust
405    Fenced,
406    FencedLang(SharedString),
407    FencedSrc(PathWithRange),
408}
409
/// Where the content of a code block lives in the markdown source.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct CodeBlockMetadata {
    /// Byte range of the code block's content within the parsed markdown text. For fenced
    /// blocks this excludes the fence lines.
    pub content_range: Range<usize>,
    /// For fenced blocks, the number of `\n` bytes inside `content_range`; indented blocks are
    /// always recorded with a count of 1.
    pub line_count: usize,
}
415
/// Given the source of a fenced code block (fences included), returns the range of `text` that
/// holds the block's content, i.e. everything between the opening fence line and the closing
/// fence.
///
/// - The opening fence may be a run of three or more backticks or tildes; the content starts
///   after the first newline that follows it (or directly after the fence when there is no
///   newline yet, e.g. while the block is still being streamed).
/// - A trailing run of the fence character is treated as the closing fence only when it is at
///   least as long as the opening fence; otherwise the block is considered unclosed and the
///   content extends to the end of `text`.
/// - When `text` does not start with a fence, the whole text is returned, minus a trailing run
///   of three or more backticks if present.
///
/// The returned range always satisfies `start <= end`, so it is safe to slice `text` with it.
pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
    let bytes = text.as_bytes();

    // Detect the opening fence: its character and its length.
    let opening_fence = match bytes.first().copied() {
        Some(fence_byte @ (b'`' | b'~')) => {
            let fence_len = bytes.iter().take_while(|&&b| b == fence_byte).count();
            (fence_len >= 3).then_some((fence_byte, fence_len))
        }
        _ => None,
    };

    let (fence_byte, fence_len, start) = match opening_fence {
        Some((fence_byte, fence_len)) => {
            // Skip the rest of the fence line (the info string).
            let start = match text[fence_len..].find('\n') {
                Some(newline_ix) => fence_len + newline_ix + 1,
                None => fence_len,
            };
            (fence_byte, fence_len, start)
        }
        // No recognizable opening fence: only a trailing triple-backtick fence is stripped.
        None => (b'`', 3, 0),
    };

    // Only look for the closing fence after `start`, so that it can never overlap the opening
    // fence (which previously produced an inverted range for inputs such as "````").
    let closing_len = bytes[start..]
        .iter()
        .rev()
        .take_while(|&&b| b == fence_byte)
        .count();
    let end = if closing_len >= fence_len {
        text.len() - closing_len
    } else {
        text.len()
    };

    start..end
}
431
432/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
433/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
434///
435/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
436#[derive(Clone)]
437pub enum CompactStr {
438    Boxed(Box<str>),
439    Inlined(InlineStr),
440}
441
442impl std::fmt::Debug for CompactStr {
443    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
444        self.deref().fmt(formatter)
445    }
446}
447
448impl Deref for CompactStr {
449    type Target = str;
450
451    fn deref(&self) -> &str {
452        match self {
453            CompactStr::Boxed(b) => b,
454            CompactStr::Inlined(i) => i,
455        }
456    }
457}
458
459impl From<&str> for CompactStr {
460    fn from(s: &str) -> Self {
461        if let Ok(inlined) = s.try_into() {
462            CompactStr::Inlined(inlined)
463        } else {
464            CompactStr::Boxed(s.into())
465        }
466    }
467}
468
469impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
470    fn from(cow_str: pulldown_cmark::CowStr) -> Self {
471        match cow_str {
472            pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
473            pulldown_cmark::CowStr::Borrowed(b) => b.into(),
474            pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
475        }
476    }
477}
478
479impl PartialEq for CompactStr {
480    fn eq(&self, other: &Self) -> bool {
481        self.deref() == other.deref()
482    }
483}
484
485fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
486    match substring {
487        pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
488        pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
489        pulldown_cmark::CowStr::Inlined(_) => false,
490    }
491}
492
/// Returns `true` when the first byte of `substring` is located inside the memory occupied by
/// `container`, i.e. when its address lies in `[container start, container end)`.
///
/// This compares addresses only; the contents of the strings are never inspected. A
/// `substring` that starts exactly at the end of `container` (which is only possible for an
/// empty slice there) is reported as outside.
fn str_points_inside(substring: &str, container: &str) -> bool {
    // Raw pointers are ordered by address, so the half-open pointer range of the container can
    // be queried directly without any `unsafe` pointer arithmetic.
    container
        .as_bytes()
        .as_ptr_range()
        .contains(&substring.as_ptr())
}
498
#[cfg(test)]
mod tests {
    use super::MarkdownEvent::*;
    use super::MarkdownTag::*;
    use super::*;

    /// Options that are intentionally not part of `PARSE_OPTIONS`.
    const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
        .union(Options::ENABLE_MATH)
        .union(Options::ENABLE_DEFINITION_LIST);

    #[test]
    fn all_options_considered() {
        // The purpose of this is to fail when new options are added to pulldown_cmark, so that they
        // can be evaluated for inclusion.
        assert_eq!(PARSE_OPTIONS.union(UNWANTED_OPTIONS), Options::all());
    }

    #[test]
    fn wanted_and_unwanted_options_disjoint() {
        assert_eq!(
            PARSE_OPTIONS.intersection(UNWANTED_OPTIONS),
            Options::empty()
        );
    }

    // Substituted characters are written as `\u{...}` escapes so that the expectations survive
    // tools that mangle non-ASCII source text.

    #[test]
    fn test_plain_urls_and_escaped_text() {
        assert_eq!(
            parse_markdown("&nbsp;&nbsp; https://some.url some \\`&#9658;\\` text"),
            (
                vec![
                    (0..51, Start(Paragraph)),
                    (0..6, SubstitutedText("\u{a0}".into())),
                    (6..12, SubstitutedText("\u{a0}".into())),
                    (12..13, Text),
                    (
                        13..29,
                        Start(Link {
                            link_type: LinkType::Autolink,
                            dest_url: "https://some.url".into(),
                            title: "".into(),
                            id: "".into(),
                        })
                    ),
                    (13..29, Text),
                    (13..29, End(MarkdownTagEnd::Link)),
                    (29..35, Text),
                    (36..37, Text), // Escaped backtick
                    // `&#9658;` is U+25BA (black right-pointing pointer).
                    (37..44, SubstitutedText("\u{25ba}".into())),
                    (45..46, Text), // Escaped backtick
                    (46..51, Text),
                    (0..51, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        );
    }

    #[test]
    fn test_smart_punctuation() {
        assert_eq!(
            parse_markdown("-- --- ... \"double quoted\" 'single quoted' ----------"),
            (
                vec![
                    (0..53, Start(Paragraph)),
                    // `--` becomes an en dash.
                    (0..2, SubstitutedText("\u{2013}".into())),
                    (2..3, Text),
                    // `---` becomes an em dash.
                    (3..6, SubstitutedText("\u{2014}".into())),
                    (6..7, Text),
                    // `...` becomes a horizontal ellipsis.
                    (7..10, SubstitutedText("\u{2026}".into())),
                    (10..11, Text),
                    // Straight double quotes become left/right curly double quotes.
                    (11..12, SubstitutedText("\u{201c}".into())),
                    (12..25, Text),
                    (25..26, SubstitutedText("\u{201d}".into())),
                    (26..27, Text),
                    // Straight single quotes become left/right curly single quotes.
                    (27..28, SubstitutedText("\u{2018}".into())),
                    (28..41, Text),
                    (41..42, SubstitutedText("\u{2019}".into())),
                    (42..43, Text),
                    // Ten hyphens become five en dashes.
                    (
                        43..53,
                        SubstitutedText("\u{2013}\u{2013}\u{2013}\u{2013}\u{2013}".into())
                    ),
                    (0..53, End(MarkdownTagEnd::Paragraph))
                ],
                HashSet::new(),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_code_block_metadata() {
        assert_eq!(
            parse_markdown("```rust\nfn main() {\n let a = 1;\n}\n```"),
            (
                vec![
                    (
                        0..37,
                        Start(CodeBlock {
                            kind: CodeBlockKind::FencedLang("rust".into()),
                            metadata: CodeBlockMetadata {
                                content_range: 8..34,
                                line_count: 3
                            }
                        })
                    ),
                    (8..34, Text),
                    (0..37, End(MarkdownTagEnd::CodeBlock)),
                ],
                HashSet::from(["rust".into()]),
                HashSet::new()
            )
        )
    }

    #[test]
    fn test_extract_code_block_content_range() {
        let input = "```rust\nlet x = 5;\n```";
        assert_eq!(extract_code_block_content_range(input), 8..19);

        let input = "plain text";
        assert_eq!(extract_code_block_content_range(input), 0..10);

        let input = "```python\nprint('hello')\nprint('world')\n```";
        assert_eq!(extract_code_block_content_range(input), 10..40);
    }
}