markdown.rs

  1use std::fmt::{Display, Formatter};
  2
  3/// Markdown text.
  4#[derive(Debug, Clone)]
  5pub struct MarkdownString(pub String);
  6
  7impl Display for MarkdownString {
  8    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
  9        write!(f, "{}", self.0)
 10    }
 11}
 12
 13impl MarkdownString {
 14    /// Escapes markdown special characters in markdown text blocks. Markdown code blocks follow
 15    /// different rules and `MarkdownString::inline_code` or `MarkdownString::code_block` should be
 16    /// used in that case.
 17    ///
 18    /// Also escapes the following markdown extensions:
 19    ///
 20    /// * `^` for superscripts
 21    /// * `$` for inline math
 22    /// * `~` for strikethrough
 23    ///
 24    /// Escape of some characters is unnecessary, because while they are involved in markdown syntax,
 25    /// the other characters involved are escaped:
 26    ///
 27    /// * `!`, `]`, `(`, and `)` are used in link syntax, but `[` is escaped so these are parsed as
 28    /// plaintext.
 29    ///
 30    /// * `;` is used in HTML entity syntax, but `&` is escaped, so they are parsed as plaintext.
 31    ///
 32    /// TODO: There is one escape this doesn't do currently. Period after numbers at the start of the
 33    /// line (`[0-9]*\.`) should also be escaped to avoid it being interpreted as a list item.
 34    pub fn escape(text: &str) -> Self {
 35        let mut chunks = Vec::new();
 36        let mut start_of_unescaped = None;
 37        for (ix, c) in text.char_indices() {
 38            match c {
 39                // Always escaped.
 40                '\\' | '`' | '*' | '_' | '[' | '^' | '$' | '~' | '&' |
 41                // TODO: these only need to be escaped when they are the first non-whitespace
 42                // character of the line of a block. There should probably be both an `escape_block`
 43                // which does this and an `escape_inline` method which does not escape these.
 44                '#' | '+' | '=' | '-' => {
 45                    match start_of_unescaped {
 46                        None => {}
 47                        Some(start_of_unescaped) => {
 48                            chunks.push(&text[start_of_unescaped..ix]);
 49                        }
 50                    }
 51                    chunks.push("\\");
 52                    // Can include this char in the "unescaped" text since a
 53                    // backslash was just emitted.
 54                    start_of_unescaped = Some(ix);
 55                }
 56                // Escaped since `<` is used in opening HTML tags. `&lt;` is used since Markdown
 57                // supports HTML entities, and this allows the text to be used directly in HTML.
 58                '<' => {
 59                    match start_of_unescaped {
 60                        None => {}
 61                        Some(start_of_unescaped) => {
 62                            chunks.push(&text[start_of_unescaped..ix]);
 63                        }
 64                    }
 65                    chunks.push("&lt;");
 66                    start_of_unescaped = None;
 67                }
 68                // Escaped since `>` is used for blockquotes. `&gt;` is used since Markdown supports
 69                // HTML entities, and this allows the text to be used directly in HTML.
 70                '>' => {
 71                    match start_of_unescaped {
 72                        None => {}
 73                        Some(start_of_unescaped) => {
 74                            chunks.push(&text[start_of_unescaped..ix]);
 75                        }
 76                    }
 77                    chunks.push("gt;");
 78                    start_of_unescaped = None;
 79                }
 80                _ => {
 81                    if start_of_unescaped.is_none() {
 82                        start_of_unescaped = Some(ix);
 83                    }
 84                }
 85            }
 86        }
 87        if let Some(start_of_unescaped) = start_of_unescaped {
 88            chunks.push(&text[start_of_unescaped..])
 89        }
 90        Self(chunks.concat())
 91    }
 92
 93    /// Returns markdown for inline code (wrapped in backticks), handling code that contains backticks
 94    /// and spaces. All whitespace is treated as a single space character. For text that does not
 95    /// contain whitespace other than ' ', this escaping roundtrips through pulldown-cmark.
 96    ///
 97    /// When used in tables, `|` should be escaped like `\|` in the text provided to this function.
 98    pub fn inline_code(text: &str) -> Self {
 99        // Apache License 2.0, same as this crate.
100        //
101        // Copied from `pulldown-cmark-to-cmark-20.0.0` with modifications:
102        //
103        // * Handling of all whitespace. pulldown-cmark-to-cmark is anticipating
104        // `Code` events parsed by pulldown-cmark.
105        //
106        // * Direct return of string.
107        //
108        // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L290
109
110        let mut all_whitespace = true;
111        let text = text
112            .chars()
113            .map(|c| {
114                if c.is_whitespace() {
115                    ' '
116                } else {
117                    all_whitespace = false;
118                    c
119                }
120            })
121            .collect::<String>();
122
123        // When inline code has leading and trailing ' ' characters, additional space is needed
124        // to escape it, unless all characters are space.
125        if all_whitespace {
126            Self(format!("`{text}`"))
127        } else {
128            // More backticks are needed to delimit the inline code than the maximum number of
129            // backticks in a consecutive run.
130            let backticks = "`".repeat(count_max_consecutive_chars(&text, '`') + 1);
131            let space = match text.as_bytes() {
132                &[b'`', ..] | &[.., b'`'] => " ", // Space needed to separate backtick.
133                &[b' ', .., b' '] => " ",         // Space needed to escape inner space.
134                _ => "",                          // No space needed.
135            };
136            Self(format!("{backticks}{space}{text}{space}{backticks}"))
137        }
138    }
139
140    /// Returns markdown for code blocks, wrapped in 3 or more backticks as needed.
141    pub fn code_block(tag: &str, text: &str) -> Self {
142        let backticks = "`".repeat(3.max(count_max_consecutive_chars(text, '`') + 1));
143        Self(format!("{backticks}{tag}\n{text}\n{backticks}\n"))
144    }
145}
146
// Copied from `pulldown-cmark-to-cmark-20.0.0` with changed names.
// https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L1063
// Apache License 2.0, same as this code.
//
// Returns the length (in chars) of the longest uninterrupted run of `search` within `text`, or 0
// when `search` does not occur at all.
fn count_max_consecutive_chars(text: &str, search: char) -> usize {
    // The accumulator is `(longest run seen so far, length of the run currently in progress)`.
    text.chars()
        .fold((0usize, 0usize), |(longest, run), ch| {
            if ch == search {
                let run = run + 1;
                (longest.max(run), run)
            } else {
                (longest, 0)
            }
        })
        .0
}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises backslash escaping across headings, emphasis, links, code fences, list leaders,
    /// math and HTML entities in one multi-line document.
    #[test]
    fn test_markdown_string_escape() {
        let input = r#"
        # Heading

        Another heading
        ===

        Another heading variant
        ---

        Paragraph with [link](https://example.com) and `code`, *emphasis*, and ~strikethrough~.

        ```
        code block
        ```

        List with varying leaders:
          - Item 1
          * Item 2
          + Item 3

        Some math:  $`\sqrt{3x-1}+(1+x)^2`$

        HTML entity: &nbsp;
        "#;

        let expected = r#"
        \# Heading

        Another heading
        \=\=\=

        Another heading variant
        \-\-\-

        Paragraph with \[link](https://example.com) and \`code\`, \*emphasis\*, and \~strikethrough\~.

        \`\`\`
        code block
        \`\`\`

        List with varying leaders:
          \- Item 1
          \* Item 2
          \+ Item 3

        Some math:  \$\`\\sqrt{3x\-1}\+(1\+x)\^2\`\$

        HTML entity: \&nbsp;
        "#;

        assert_eq!(MarkdownString::escape(input).0, expected);
    }

    /// Text without any special characters (including the empty string) is returned unchanged.
    #[test]
    fn test_markdown_string_escape_passthrough() {
        assert_eq!(MarkdownString::escape("").0, "");
        assert_eq!(
            MarkdownString::escape("plain text, no specials!").0,
            "plain text, no specials!"
        );
    }

    #[test]
    fn test_markdown_string_inline_code() {
        assert_eq!(MarkdownString::inline_code(" ").0, "` `");
        assert_eq!(MarkdownString::inline_code("text").0, "`text`");
        assert_eq!(MarkdownString::inline_code("text ").0, "`text `");
        assert_eq!(MarkdownString::inline_code(" text ").0, "`  text  `");
        assert_eq!(MarkdownString::inline_code("`").0, "`` ` ``");
        assert_eq!(MarkdownString::inline_code("``").0, "``` `` ```");
        assert_eq!(MarkdownString::inline_code("`text`").0, "`` `text` ``");
        assert_eq!(
            MarkdownString::inline_code("some `text` no leading or trailing backticks").0,
            "``some `text` no leading or trailing backticks``"
        );
    }

    /// Every whitespace character (tabs, newlines, ...) is replaced by a single ' '.
    #[test]
    fn test_markdown_string_inline_code_whitespace() {
        assert_eq!(MarkdownString::inline_code("a\tb\nc").0, "`a b c`");
        assert_eq!(MarkdownString::inline_code(" \t ").0, "`   `");
    }

    /// The fence is at least three backticks and always longer than any backtick run in the body.
    #[test]
    fn test_markdown_string_code_block() {
        assert_eq!(
            MarkdownString::code_block("rust", "let x = 1;").0,
            "```rust\nlet x = 1;\n```\n"
        );
        assert_eq!(
            MarkdownString::code_block("", "uses `inline` ticks").0,
            "```\nuses `inline` ticks\n```\n"
        );
        assert_eq!(
            MarkdownString::code_block("", "a ``` b").0,
            "````\na ``` b\n````\n"
        );
        assert_eq!(
            MarkdownString::code_block("md", "`````").0,
            "``````md\n`````\n``````\n"
        );
    }

    #[test]
    fn test_markdown_string_display() {
        assert_eq!(MarkdownString("*raw*".to_string()).to_string(), "*raw*");
    }

    #[test]
    fn test_count_max_consecutive_chars() {
        assert_eq!(
            count_max_consecutive_chars("``a```b``", '`'),
            3,
            "the highest seen consecutive segment of backticks counts"
        );
        assert_eq!(
            count_max_consecutive_chars("```a``b`", '`'),
            3,
            "it can't be downgraded later"
        );
        assert_eq!(
            count_max_consecutive_chars("", '`'),
            0,
            "empty text has no runs"
        );
        assert_eq!(
            count_max_consecutive_chars("no ticks here", '`'),
            0,
            "text without the searched char has no runs"
        );
    }
}