markdown.rs

  1use std::fmt::{Display, Formatter};
  2
  3/// Markdown text.
  4#[derive(Debug, Clone)]
  5pub struct MarkdownString(pub String);
  6
  7impl Display for MarkdownString {
  8    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
  9        write!(f, "{}", self.0)
 10    }
 11}
 12
 13impl MarkdownString {
 14    /// Escapes markdown special characters.
 15    ///
 16    /// Also escapes the following markdown extensions:
 17    ///
 18    /// * `^` for superscripts
 19    /// * `$` for inline math
 20    /// * `~` for strikethrough
 21    ///
 22    /// Escape of some characters is unnecessary, because while they are involved in markdown syntax,
 23    /// the other characters involved are escaped:
 24    ///
 25    /// * `!`, `]`, `(`, and `)` are used in link syntax, but `[` is escaped so these are parsed as
 26    /// plaintext.
 27    ///
 28    /// * `;` is used in HTML entity syntax, but `&` is escaped, so they are parsed as plaintext.
 29    ///
 30    /// TODO: There is one escape this doesn't do currently. Period after numbers at the start of the
 31    /// line (`[0-9]*\.`) should also be escaped to avoid it being interpreted as a list item.
 32    pub fn escape(text: &str) -> Self {
 33        let mut chunks = Vec::new();
 34        let mut start_of_unescaped = None;
 35        for (ix, c) in text.char_indices() {
 36            match c {
 37                // Always escaped.
 38                '\\' | '`' | '*' | '_' | '[' | '^' | '$' | '~' | '&' |
 39                // TODO: these only need to be escaped when they are the first non-whitespace
 40                // character of the line of a block. There should probably be both an `escape_block`
 41                // which does this and an `escape_inline` method which does not escape these.
 42                '#' | '+' | '=' | '-' => {
 43                    match start_of_unescaped {
 44                        None => {}
 45                        Some(start_of_unescaped) => {
 46                            chunks.push(&text[start_of_unescaped..ix]);
 47                        }
 48                    }
 49                    chunks.push("\\");
 50                    // Can include this char in the "unescaped" text since a
 51                    // backslash was just emitted.
 52                    start_of_unescaped = Some(ix);
 53                }
 54                // Escaped since `<` is used in opening HTML tags. `&lt;` is used since Markdown
 55                // supports HTML entities, and this allows the text to be used directly in HTML.
 56                '<' => {
 57                    match start_of_unescaped {
 58                        None => {}
 59                        Some(start_of_unescaped) => {
 60                            chunks.push(&text[start_of_unescaped..ix]);
 61                        }
 62                    }
 63                    chunks.push("&lt;");
 64                    start_of_unescaped = None;
 65                }
 66                // Escaped since `>` is used for blockquotes. `&gt;` is used since Markdown supports
 67                // HTML entities, and this allows the text to be used directly in HTML.
 68                '>' => {
 69                    match start_of_unescaped {
 70                        None => {}
 71                        Some(start_of_unescaped) => {
 72                            chunks.push(&text[start_of_unescaped..ix]);
 73                        }
 74                    }
 75                    chunks.push("gt;");
 76                    start_of_unescaped = None;
 77                }
 78                _ => {
 79                    if start_of_unescaped.is_none() {
 80                        start_of_unescaped = Some(ix);
 81                    }
 82                }
 83            }
 84        }
 85        if let Some(start_of_unescaped) = start_of_unescaped {
 86            chunks.push(&text[start_of_unescaped..])
 87        }
 88        Self(chunks.concat())
 89    }
 90
 91    /// Returns markdown for inline code (wrapped in backticks), handling code that contains backticks
 92    /// and spaces. All whitespace is treated as a single space character. For text that does not
 93    /// contain whitespace other than ' ', this escaping roundtrips through pulldown-cmark.
 94    ///
 95    /// When used in tables, `|` should be escaped like `\|` in the text provided to this function.
 96    pub fn inline_code(text: &str) -> Self {
 97        // Apache License 2.0, same as this crate.
 98        //
 99        // Copied from `pulldown-cmark-to-cmark-20.0.0` with modifications:
100        //
101        // * Handling of all whitespace. pulldown-cmark-to-cmark is anticipating
102        // `Code` events parsed by pulldown-cmark.
103        //
104        // * Direct return of string.
105        //
106        // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L290
107
108        let mut all_whitespace = true;
109        let text = text
110            .chars()
111            .map(|c| {
112                if c.is_whitespace() {
113                    ' '
114                } else {
115                    all_whitespace = false;
116                    c
117                }
118            })
119            .collect::<String>();
120
121        // When inline code has leading and trailing ' ' characters, additional space is needed
122        // to escape it, unless all characters are space.
123        if all_whitespace {
124            Self(format!("`{text}`"))
125        } else {
126            // More backticks are needed to delimit the inline code than the maximum number of
127            // backticks in a consecutive run.
128            let backticks = "`".repeat(count_max_consecutive_chars(&text, '`') + 1);
129            let space = match text.as_bytes() {
130                &[b'`', ..] | &[.., b'`'] => " ", // Space needed to separate backtick.
131                &[b' ', .., b' '] => " ",         // Space needed to escape inner space.
132                _ => "",                          // No space needed.
133            };
134            Self(format!("{backticks}{space}{text}{space}{backticks}"))
135        }
136    }
137}
138
// Copied from `pulldown-cmark-to-cmark-20.0.0` with changed names.
// https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L1063
// Apache License 2.0, same as this code.
//
// Returns the length, in characters, of the longest run of consecutive `search` characters in
// `text`, or 0 when `search` does not occur.
fn count_max_consecutive_chars(text: &str, search: char) -> usize {
    // Carry (longest finished run, length of the run currently being read) through the text.
    let (longest, trailing) = text
        .chars()
        .fold((0usize, 0usize), |(longest, run), ch| {
            if ch == search {
                (longest, run + 1)
            } else {
                (longest.max(run), 0)
            }
        });
    // A run that reaches the end of the text is never closed by a different character.
    longest.max(trailing)
}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Escapes a document containing headings, links, code, emphasis, lists, math and an HTML
    /// entity, and compares the result with the hand-escaped version.
    #[test]
    fn test_markdown_string_escape() {
        let raw = r#"
        # Heading

        Another heading
        ===

        Another heading variant
        ---

        Paragraph with [link](https://example.com) and `code`, *emphasis*, and ~strikethrough~.

        ```
        code block
        ```

        List with varying leaders:
          - Item 1
          * Item 2
          + Item 3

        Some math:  $`\sqrt{3x-1}+(1+x)^2`$

        HTML entity: &nbsp;
        "#;

        let escaped = r#"
        \# Heading

        Another heading
        \=\=\=

        Another heading variant
        \-\-\-

        Paragraph with \[link](https://example.com) and \`code\`, \*emphasis\*, and \~strikethrough\~.

        \`\`\`
        code block
        \`\`\`

        List with varying leaders:
          \- Item 1
          \* Item 2
          \+ Item 3

        Some math:  \$\`\\sqrt{3x\-1}\+(1\+x)\^2\`\$

        HTML entity: \&nbsp;
        "#;

        let actual = MarkdownString::escape(raw);
        assert_eq!(actual.0, escaped);
    }

    /// Checks the backtick and space padding chosen for a range of inline code inputs.
    #[test]
    fn test_markdown_string_inline_code() {
        // (code passed in, markdown expected back)
        let cases = [
            (" ", "` `"),
            ("text", "`text`"),
            ("text ", "`text `"),
            (" text ", "`  text  `"),
            ("`", "`` ` ``"),
            ("``", "``` `` ```"),
            ("`text`", "`` `text` ``"),
            (
                "some `text` no leading or trailing backticks",
                "``some `text` no leading or trailing backticks``",
            ),
        ];

        for (code, markdown) in cases {
            assert_eq!(
                MarkdownString::inline_code(code).0,
                markdown,
                "input: {code:?}"
            );
        }
    }

    /// The longest backtick run wins regardless of where in the text it appears.
    #[test]
    fn test_count_max_consecutive_chars() {
        let longest_in_middle = count_max_consecutive_chars("``a```b``", '`');
        assert_eq!(
            longest_in_middle, 3,
            "the highest seen consecutive segment of backticks counts"
        );

        let longest_at_start = count_max_consecutive_chars("```a``b`", '`');
        assert_eq!(longest_at_start, 3, "it can't be downgraded later");
    }
}