markdown.rs

  1use std::fmt::{Display, Formatter};
  2
  3/// Generates a URL-friendly slug from heading text (e.g. "Hello World" β†’ "hello-world").
  4pub fn generate_heading_slug(text: &str) -> String {
  5    text.trim()
  6        .chars()
  7        .filter_map(|c| {
  8            if c.is_alphanumeric() || c == '-' || c == '_' {
  9                Some(c.to_lowercase().next().unwrap_or(c))
 10            } else if c == ' ' {
 11                Some('-')
 12            } else {
 13                None
 14            }
 15        })
 16        .collect()
 17}
 18
 19/// Returns true if the URL starts with a URI scheme (RFC 3986 Β§3.1).
 20fn has_uri_scheme(url: &str) -> bool {
 21    let mut chars = url.chars();
 22    match chars.next() {
 23        Some(c) if c.is_ascii_alphabetic() => {}
 24        _ => return false,
 25    }
 26    for c in chars {
 27        if c == ':' {
 28            return true;
 29        }
 30        if !(c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.') {
 31            return false;
 32        }
 33    }
 34    false
 35}
 36
 37/// Splits a relative URL into its path and `#fragment` parts.
 38/// Absolute URLs are returned as-is with no fragment.
 39pub fn split_local_url_fragment(url: &str) -> (&str, Option<&str>) {
 40    if has_uri_scheme(url) {
 41        return (url, None);
 42    }
 43    match url.find('#') {
 44        Some(pos) => {
 45            let path = &url[..pos];
 46            let fragment = &url[pos + 1..];
 47            (
 48                path,
 49                if fragment.is_empty() {
 50                    None
 51                } else {
 52                    Some(fragment)
 53                },
 54            )
 55        }
 56        None => (url, None),
 57    }
 58}
 59
 60/// Indicates that the wrapped `String` is markdown text.
 61#[derive(Debug, Clone)]
 62pub struct MarkdownString(pub String);
 63
 64impl Display for MarkdownString {
 65    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
 66        write!(f, "{}", self.0)
 67    }
 68}
 69
 70/// Escapes markdown special characters in markdown text blocks. Markdown code blocks follow
 71/// different rules and `MarkdownInlineCode` or `MarkdownCodeBlock` should be used in that case.
 72///
 73/// Also escapes the following markdown extensions:
 74///
 75/// * `^` for superscripts
 76/// * `$` for inline math
 77/// * `~` for strikethrough
 78///
 79/// Escape of some characters is unnecessary, because while they are involved in markdown syntax,
 80/// the other characters involved are escaped:
 81///
 82/// * `!`, `]`, `(`, and `)` are used in link syntax, but `[` is escaped so these are parsed as
 83///   plaintext.
 84///
 85/// * `;` is used in HTML entity syntax, but `&` is escaped, so they are parsed as plaintext.
 86///
 87/// TODO: There is one escape this doesn't do currently. Period after numbers at the start of the
 88/// line (`[0-9]*\.`) should also be escaped to avoid it being interpreted as a list item.
 89pub struct MarkdownEscaped<'a>(pub &'a str);
 90
 91/// Implements `Display` to format markdown inline code (wrapped in backticks), handling code that
 92/// contains backticks and spaces. All whitespace is treated as a single space character. For text
 93/// that does not contain whitespace other than ' ', this escaping roundtrips through
 94/// pulldown-cmark.
 95///
 96/// When used in tables, `|` should be escaped like `\|` in the text provided to this function.
 97pub struct MarkdownInlineCode<'a>(pub &'a str);
 98
 99/// Implements `Display` to format markdown code blocks, wrapped in 3 or more backticks as needed.
100pub struct MarkdownCodeBlock<'a> {
101    pub tag: &'a str,
102    pub text: &'a str,
103}
104
105impl Display for MarkdownEscaped<'_> {
106    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
107        let mut start_of_unescaped = None;
108        for (ix, c) in self.0.char_indices() {
109            match c {
110                // Always escaped.
111                '\\' | '`' | '*' | '_' | '[' | '^' | '$' | '~' | '&' |
112                // TODO: these only need to be escaped when they are the first non-whitespace
113                // character of the line of a block. There should probably be both an `escape_block`
114                // which does this and an `escape_inline` method which does not escape these.
115                '#' | '+' | '=' | '-' => {
116                    match start_of_unescaped {
117                        None => {}
118                        Some(start_of_unescaped) => {
119                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
120                        }
121                    }
122                    write!(formatter, "\\")?;
123                    // Can include this char in the "unescaped" text since a
124                    // backslash was just emitted.
125                    start_of_unescaped = Some(ix);
126                }
127                // Escaped since `<` is used in opening HTML tags. `&lt;` is used since Markdown
128                // supports HTML entities, and this allows the text to be used directly in HTML.
129                '<' => {
130                    match start_of_unescaped {
131                        None => {}
132                        Some(start_of_unescaped) => {
133                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
134                        }
135                    }
136                    write!(formatter, "&lt;")?;
137                    start_of_unescaped = None;
138                }
139                // Escaped since `>` is used for blockquotes. `&gt;` is used since Markdown supports
140                // HTML entities, and this allows the text to be used directly in HTML.
141                '>' => {
142                    match start_of_unescaped {
143                        None => {}
144                        Some(start_of_unescaped) => {
145                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
146                        }
147                    }
148                    write!(formatter, "&gt;")?;
149                    start_of_unescaped = None;
150                }
151                _ => {
152                    if start_of_unescaped.is_none() {
153                        start_of_unescaped = Some(ix);
154                    }
155                }
156            }
157        }
158        if let Some(start_of_unescaped) = start_of_unescaped {
159            write!(formatter, "{}", &self.0[start_of_unescaped..])?;
160        }
161        Ok(())
162    }
163}
164
165impl Display for MarkdownInlineCode<'_> {
166    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
167        // Apache License 2.0, same as this crate.
168        //
169        // Copied from `pulldown-cmark-to-cmark-20.0.0` with modifications:
170        //
171        // * Handling of all whitespace. pulldown-cmark-to-cmark is anticipating
172        // `Code` events parsed by pulldown-cmark.
173        //
174        // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L290
175
176        let mut all_whitespace = true;
177        let text = self
178            .0
179            .chars()
180            .map(|c| {
181                if c.is_whitespace() {
182                    ' '
183                } else {
184                    all_whitespace = false;
185                    c
186                }
187            })
188            .collect::<String>();
189
190        // When inline code has leading and trailing ' ' characters, additional space is needed
191        // to escape it, unless all characters are space.
192        if all_whitespace {
193            write!(formatter, "`{text}`")
194        } else {
195            // More backticks are needed to delimit the inline code than the maximum number of
196            // backticks in a consecutive run.
197            let backticks = "`".repeat(count_max_consecutive_chars(&text, '`') + 1);
198            let space = match text.as_bytes() {
199                &[b'`', ..] | &[.., b'`'] => " ", // Space needed to separate backtick.
200                &[b' ', .., b' '] => " ",         // Space needed to escape inner space.
201                _ => "",                          // No space needed.
202            };
203            write!(formatter, "{backticks}{space}{text}{space}{backticks}")
204        }
205    }
206}
207
208impl Display for MarkdownCodeBlock<'_> {
209    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
210        let tag = self.tag;
211        let text = self.text;
212        let backticks = "`".repeat(3.max(count_max_consecutive_chars(text, '`') + 1));
213        write!(formatter, "{backticks}{tag}\n{text}\n{backticks}\n")
214    }
215}
216
// Derived from `pulldown-cmark-to-cmark-20.0.0` with changed names.
// https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L1063
// Apache License 2.0, same as this code.
//
// Returns the length, in characters, of the longest uninterrupted run of `search` in `text`,
// or 0 when `search` does not occur.
fn count_max_consecutive_chars(text: &str, search: char) -> usize {
    // Splitting on every character that is NOT `search` leaves only the runs of `search`
    // (plus empty pieces), so the longest piece is the longest run.
    text.split(|c: char| c != search)
        .map(|run| run.chars().count())
        .max()
        .unwrap_or(0)
}
237
#[cfg(test)]
mod tests {
    use super::*;

    /// A document exercising headings, emphasis, links, code, lists, math and entities must
    /// come out with every construct neutralised.
    #[test]
    fn test_markdown_escaped() {
        let input = r#"
        # Heading

        Another heading
        ===

        Another heading variant
        ---

        Paragraph with [link](https://example.com) and `code`, *emphasis*, and ~strikethrough~.

        ```
        code block
        ```

        List with varying leaders:
          - Item 1
          * Item 2
          + Item 3

        Some math:  $`\sqrt{3x-1}+(1+x)^2`$

        HTML entity: &nbsp;
        "#;

        let expected = r#"
        \# Heading

        Another heading
        \=\=\=

        Another heading variant
        \-\-\-

        Paragraph with \[link](https://example.com) and \`code\`, \*emphasis\*, and \~strikethrough\~.

        \`\`\`
        code block
        \`\`\`

        List with varying leaders:
          \- Item 1
          \* Item 2
          \+ Item 3

        Some math:  \$\`\\sqrt{3x\-1}\+(1\+x)\^2\`\$

        HTML entity: \&nbsp;
        "#;

        let escaped = MarkdownEscaped(input).to_string();
        assert_eq!(escaped, expected);
    }

    /// Each pair is (raw code, expected inline-code markdown).
    #[test]
    fn test_markdown_inline_code() {
        let cases = [
            (" ", "` `"),
            ("text", "`text`"),
            ("text ", "`text `"),
            (" text ", "`  text  `"),
            ("`", "`` ` ``"),
            ("``", "``` `` ```"),
            ("`text`", "`` `text` ``"),
            (
                "some `text` no leading or trailing backticks",
                "``some `text` no leading or trailing backticks``",
            ),
        ];
        for (code, expected) in cases {
            assert_eq!(MarkdownInlineCode(code).to_string(), expected);
        }
    }

    /// Each pair is (text to scan, explanation shown on failure); the longest run is always 3.
    #[test]
    fn test_count_max_consecutive_chars() {
        let cases = [
            (
                "``a```b``",
                "the highest seen consecutive segment of backticks counts",
            ),
            ("```a``b`", "it can't be downgraded later"),
        ];
        for (text, reason) in cases {
            assert_eq!(count_max_consecutive_chars(text, '`'), 3, "{}", reason);
        }
    }

    /// Each pair is (url, expected (path, fragment)).
    #[test]
    fn test_split_local_url_fragment() {
        let cases = [
            ("#heading", ("", Some("heading"))),
            ("./file.md#heading", ("./file.md", Some("heading"))),
            ("./file.md", ("./file.md", None)),
            ("https://example.com#frag", ("https://example.com#frag", None)),
            ("mailto:user@example.com", ("mailto:user@example.com", None)),
            ("#", ("", None)),
            ("../other.md#section", ("../other.md", Some("section"))),
            ("123:not-a-scheme#frag", ("123:not-a-scheme", Some("frag"))),
        ];
        for (url, expected) in cases {
            assert_eq!(split_local_url_fragment(url), expected);
        }
    }

    /// Each pair is (heading text, expected slug).
    #[test]
    fn test_generate_heading_slug() {
        let cases = [
            ("Hello World", "hello-world"),
            ("Hello  World", "hello--world"),
            ("Hello-World", "hello-world"),
            ("Some **bold** text", "some-bold-text"),
            ("Let's try with Ü", "lets-try-with-ü"),
            ("heading with 123 numbers", "heading-with-123-numbers"),
            ("What about (parens)?", "what-about-parens"),
            ("  leading spaces  ", "leading-spaces"),
        ];
        for (heading, expected) in cases {
            assert_eq!(generate_heading_slug(heading), expected);
        }
    }
}