@@ -1,8 +1,13 @@
use gpui::SharedString;
use linkify::LinkFinder;
pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
-use pulldown_cmark::{Alignment, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser};
-use std::{collections::HashSet, ops::Range};
+use pulldown_cmark::{
+ Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
+};
+use std::{
+ collections::HashSet,
+ ops::{Deref, Range},
+};
const PARSE_OPTIONS: Options = Options::ENABLE_TABLES
.union(Options::ENABLE_FOOTNOTES)
@@ -49,49 +54,60 @@ pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSe
events.push((range, MarkdownEvent::End(tag)));
}
pulldown_cmark::Event::Text(parsed) => {
- // Automatically detect links in text if we're not already within a markdown
- // link.
- let mut parsed = parsed.as_ref();
- if !within_link {
- let mut finder = LinkFinder::new();
- finder.kinds(&[linkify::LinkKind::Url]);
- let text_range = range.clone();
- for link in finder.links(&text[text_range.clone()]) {
- let link_range =
- text_range.start + link.start()..text_range.start + link.end();
-
- if link_range.start > range.start {
- let (text, tail) = parsed.split_at(link_range.start - range.start);
+ // `parsed` will share bytes with the input unless a substitution like handling of
+ // HTML entities or smart punctuation has occurred. When these substitutions occur,
+ // `parsed` only consists of the result of a single substitution.
+ if !cow_str_points_inside(&parsed, text) {
+ // Attempt to detect cases where the assumptions here are not valid or the
+ // behavior has changed.
+ if parsed.len() > 4 {
+ log::error!(
+ "Bug in markdown parser. \
+ pulldown_cmark::Event::Text expected to a substituted HTML entity, \
+ but it was longer than expected.\n\
+ Source: {}\n\
+ Parsed: {}",
+ &text[range.clone()],
+ parsed
+ );
+ }
+ events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
+ } else {
+ // Automatically detect links in text if not already within a markdown link.
+ if !within_link {
+ let mut finder = LinkFinder::new();
+ finder.kinds(&[linkify::LinkKind::Url]);
+ let text_range = range.clone();
+ for link in finder.links(&text[text_range.clone()]) {
+ let link_range =
+ text_range.start + link.start()..text_range.start + link.end();
+
+ if link_range.start > range.start {
+ events.push((range.start..link_range.start, MarkdownEvent::Text));
+ }
+
events.push((
- range.start..link_range.start,
- MarkdownEvent::Text(SharedString::new(text)),
+ link_range.clone(),
+ MarkdownEvent::Start(MarkdownTag::Link {
+ link_type: LinkType::Autolink,
+ dest_url: SharedString::from(link.as_str().to_string()),
+ title: SharedString::default(),
+ id: SharedString::default(),
+ }),
+ ));
+
+ events.push((link_range.clone(), MarkdownEvent::Text));
+ events.push((
+ link_range.clone(),
+ MarkdownEvent::End(MarkdownTagEnd::Link),
));
- parsed = tail;
- }
- events.push((
- link_range.clone(),
- MarkdownEvent::Start(MarkdownTag::Link {
- link_type: LinkType::Autolink,
- dest_url: SharedString::from(link.as_str().to_string()),
- title: SharedString::default(),
- id: SharedString::default(),
- }),
- ));
-
- let (link_text, tail) = parsed.split_at(link_range.end - link_range.start);
- events.push((
- link_range.clone(),
- MarkdownEvent::Text(SharedString::new(link_text)),
- ));
- events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
-
- range.start = link_range.end;
- parsed = tail;
+ range.start = link_range.end;
+ }
+ }
+ if range.start < range.end {
+ events.push((range, MarkdownEvent::Text));
}
- }
- if range.start < range.end {
- events.push((range, MarkdownEvent::Text(SharedString::new(parsed))));
}
}
pulldown_cmark::Event::Code(_) => {
@@ -116,7 +132,7 @@ pub fn parse_markdown(text: &str) -> (Vec<(Range<usize>, MarkdownEvent)>, HashSe
(events, languages)
}
-pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
+pub fn parse_links_only(text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
let mut events = Vec::new();
let mut finder = LinkFinder::new();
finder.kinds(&[linkify::LinkKind::Url]);
@@ -128,15 +144,9 @@ pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
let link_range = link.start()..link.end();
if link_range.start > text_range.start {
- let (head, tail) = text.split_at(link_range.start - text_range.start);
- events.push((
- text_range.start..link_range.start,
- MarkdownEvent::Text(SharedString::new(head)),
- ));
- text = tail;
+ events.push((text_range.start..link_range.start, MarkdownEvent::Text));
}
- let (link_text, tail) = text.split_at(link_range.end - link_range.start);
events.push((
link_range.clone(),
MarkdownEvent::Start(MarkdownTag::Link {
@@ -146,18 +156,14 @@ pub fn parse_links_only(mut text: &str) -> Vec<(Range<usize>, MarkdownEvent)> {
id: SharedString::default(),
}),
));
- events.push((
- link_range.clone(),
- MarkdownEvent::Text(SharedString::new(link_text)),
- ));
+ events.push((link_range.clone(), MarkdownEvent::Text));
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
text_range.start = link_range.end;
- text = tail;
}
if text_range.end > text_range.start {
- events.push((text_range, MarkdownEvent::Text(SharedString::new(text))));
+ events.push((text_range, MarkdownEvent::Text));
}
events
@@ -173,8 +179,11 @@ pub enum MarkdownEvent {
Start(MarkdownTag),
/// End of a tagged element.
End(MarkdownTagEnd),
- /// A text node.
- Text(SharedString),
+ /// Text that uses the associated range from the mardown source.
+ Text,
+ /// Text that differs from the markdown source - typically due to substitution of HTML entities
+ /// and smart punctuation.
+ SubstitutedText(CompactStr),
/// An inline code node.
Code,
/// An HTML node.
@@ -365,8 +374,71 @@ impl From<pulldown_cmark::Tag<'_>> for MarkdownTag {
}
}
+/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
+/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
+///
+/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
+#[derive(Clone, Debug)]
+pub enum CompactStr {
+ Boxed(Box<str>),
+ Inlined(InlineStr),
+}
+
+impl Deref for CompactStr {
+ type Target = str;
+
+ fn deref(&self) -> &str {
+ match self {
+ CompactStr::Boxed(b) => b,
+ CompactStr::Inlined(i) => i,
+ }
+ }
+}
+
+impl From<&str> for CompactStr {
+ fn from(s: &str) -> Self {
+ if let Ok(inlined) = s.try_into() {
+ CompactStr::Inlined(inlined)
+ } else {
+ CompactStr::Boxed(s.into())
+ }
+ }
+}
+
+impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
+ fn from(cow_str: pulldown_cmark::CowStr) -> Self {
+ match cow_str {
+ pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
+ pulldown_cmark::CowStr::Borrowed(b) => b.into(),
+ pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
+ }
+ }
+}
+
+impl PartialEq for CompactStr {
+ fn eq(&self, other: &Self) -> bool {
+ self.deref() == other.deref()
+ }
+}
+
+fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
+ match substring {
+ pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
+ pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
+ pulldown_cmark::CowStr::Inlined(_) => false,
+ }
+}
+
+fn str_points_inside(substring: &str, container: &str) -> bool {
+ let substring_ptr = substring.as_ptr();
+ let container_ptr = container.as_ptr();
+ unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) }
+}
+
#[cfg(test)]
mod tests {
+ use super::MarkdownEvent::*;
+ use super::MarkdownTag::*;
use super::*;
const UNWANTED_OPTIONS: Options = Options::ENABLE_YAML_STYLE_METADATA_BLOCKS
@@ -387,4 +459,64 @@ mod tests {
Options::empty()
);
}
+
+ #[test]
+ fn test_plain_urls_and_escaped_text() {
+ assert_eq!(
+ parse_markdown(" https://some.url some \\`►\\` text"),
+ (
+ vec![
+ (0..51, Start(Paragraph)),
+ (0..6, SubstitutedText("\u{a0}".into())),
+ (6..12, SubstitutedText("\u{a0}".into())),
+ (12..13, Text),
+ (
+ 13..29,
+ Start(Link {
+ link_type: LinkType::Autolink,
+ dest_url: "https://some.url".into(),
+ title: "".into(),
+ id: "".into(),
+ })
+ ),
+ (13..29, Text),
+ (13..29, End(MarkdownTagEnd::Link)),
+ (29..35, Text),
+ (36..37, Text), // Escaped backtick
+ (37..44, SubstitutedText("►".into())),
+ (45..46, Text), // Escaped backtick
+ (46..51, Text),
+ (0..51, End(MarkdownTagEnd::Paragraph))
+ ],
+ HashSet::new()
+ )
+ );
+ }
+
+ #[test]
+ fn test_smart_punctuation() {
+ assert_eq!(
+ parse_markdown("-- --- ... \"double quoted\" 'single quoted'"),
+ (
+ vec![
+ (0..42, Start(Paragraph)),
+ (0..2, SubstitutedText("–".into())),
+ (2..3, Text),
+ (3..6, SubstitutedText("—".into())),
+ (6..7, Text),
+ (7..10, SubstitutedText("…".into())),
+ (10..11, Text),
+ (11..12, SubstitutedText("“".into())),
+ (12..25, Text),
+ (25..26, SubstitutedText("”".into())),
+ (26..27, Text),
+ (27..28, SubstitutedText("‘".into())),
+ (28..41, Text),
+ (41..42, SubstitutedText("’".into())),
+ (0..42, End(MarkdownTagEnd::Paragraph))
+ ],
+ HashSet::new()
+ )
+ )
+ }
}