// markdown_writer.rs

  1use std::collections::VecDeque;
  2use std::sync::OnceLock;
  3
  4use anyhow::Result;
  5use markup5ever_rcdom::{Handle, NodeData};
  6use regex::Regex;
  7
  8use crate::html_element::HtmlElement;
  9
 10fn empty_line_regex() -> &'static Regex {
 11    static REGEX: OnceLock<Regex> = OnceLock::new();
 12    REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
 13}
 14
 15fn more_than_three_newlines_regex() -> &'static Regex {
 16    static REGEX: OnceLock<Regex> = OnceLock::new();
 17    REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
 18}
 19
/// CSS class name that `MarkdownWriter::is_inside_item_name` looks for on the
/// enclosing elements; per that method's documentation, rustdoc uses such elements
/// to display Rust items in a list.
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
 21
/// The decision made by `MarkdownWriter::start_tag` about how `visit_node` should
/// proceed with the element that was just opened.
enum StartTagOutcome {
    /// Keep going: the element is pushed onto the element stack, its children are
    /// visited, and `end_tag` is called for it afterwards.
    Continue,
    /// Abandon the element: `visit_node` returns immediately, so neither its
    /// children nor its end tag are processed.
    Skip,
}
 26
/// Walks an HTML DOM tree and accumulates a Markdown rendering of it.
///
/// Create one with `MarkdownWriter::new` and consume it with `MarkdownWriter::run`.
pub struct MarkdownWriter {
    /// The elements that have been opened but not yet closed, outermost first:
    /// `visit_node` pushes an element before visiting its children and pops it
    /// again before calling `end_tag`.
    current_element_stack: VecDeque<HtmlElement>,
    /// The number of columns in the current `<table>`.
    ///
    /// Incremented for every `<th>` start tag and reset to zero when a `<table>`
    /// is closed.
    current_table_columns: usize,
    /// Whether the next `<th>` is the first one since the last `</thead>`; the
    /// first cell is written without a leading space before its `|`.
    is_first_th: bool,
    /// Whether the next `<td>` is the first one since the last `</tr>`; the first
    /// cell is written without a leading space before its `|`.
    is_first_td: bool,
    /// The Markdown output.
    markdown: String,
}
 36
 37impl MarkdownWriter {
 38    pub fn new() -> Self {
 39        Self {
 40            current_element_stack: VecDeque::new(),
 41            current_table_columns: 0,
 42            is_first_th: true,
 43            is_first_td: true,
 44            markdown: String::new(),
 45        }
 46    }
 47
 48    fn is_inside(&self, tag: &str) -> bool {
 49        self.current_element_stack
 50            .iter()
 51            .any(|parent_element| parent_element.tag == tag)
 52    }
 53
 54    /// Appends the given string slice onto the end of the Markdown output.
 55    fn push_str(&mut self, str: &str) {
 56        self.markdown.push_str(str);
 57    }
 58
 59    /// Appends a newline to the end of the Markdown output.
 60    fn push_newline(&mut self) {
 61        self.push_str("\n");
 62    }
 63
 64    /// Appends a blank line to the end of the Markdown output.
 65    fn push_blank_line(&mut self) {
 66        self.push_str("\n\n");
 67    }
 68
 69    pub fn run(mut self, root_node: &Handle) -> Result<String> {
 70        self.visit_node(&root_node)?;
 71        Ok(Self::prettify_markdown(self.markdown))
 72    }
 73
 74    fn prettify_markdown(markdown: String) -> String {
 75        let markdown = empty_line_regex().replace_all(&markdown, "");
 76        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
 77
 78        markdown.trim().to_string()
 79    }
 80
 81    fn visit_node(&mut self, node: &Handle) -> Result<()> {
 82        let mut current_element = None;
 83
 84        match node.data {
 85            NodeData::Document
 86            | NodeData::Doctype { .. }
 87            | NodeData::ProcessingInstruction { .. }
 88            | NodeData::Comment { .. } => {
 89                // Currently left unimplemented, as we're not interested in this data
 90                // at this time.
 91            }
 92            NodeData::Element {
 93                ref name,
 94                ref attrs,
 95                ..
 96            } => {
 97                let tag_name = name.local.to_string();
 98                if !tag_name.is_empty() {
 99                    current_element = Some(HtmlElement {
100                        tag: tag_name,
101                        attrs: attrs.clone(),
102                    });
103                }
104            }
105            NodeData::Text { ref contents } => {
106                let text = contents.borrow().to_string();
107                self.visit_text(text)?;
108            }
109        }
110
111        if let Some(current_element) = current_element.as_ref() {
112            match self.start_tag(&current_element) {
113                StartTagOutcome::Continue => {}
114                StartTagOutcome::Skip => return Ok(()),
115            }
116
117            self.current_element_stack
118                .push_back(current_element.clone());
119        }
120
121        for child in node.children.borrow().iter() {
122            self.visit_node(child)?;
123        }
124
125        if let Some(current_element) = current_element {
126            self.current_element_stack.pop_back();
127            self.end_tag(&current_element);
128        }
129
130        Ok(())
131    }
132
133    fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
134        if tag.is_inline() && self.is_inside("p") {
135            if let Some(parent) = self.current_element_stack.iter().last() {
136                if !parent.is_inline() {
137                    if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) {
138                        self.push_str(" ");
139                    }
140                }
141            }
142        }
143
144        match tag.tag.as_str() {
145            "head" | "script" | "nav" => return StartTagOutcome::Skip,
146            "h1" => self.push_str("\n\n# "),
147            "h2" => self.push_str("\n\n## "),
148            "h3" => self.push_str("\n\n### "),
149            "h4" => self.push_str("\n\n#### "),
150            "h5" => self.push_str("\n\n##### "),
151            "h6" => self.push_str("\n\n###### "),
152            "p" => self.push_blank_line(),
153            "strong" => self.push_str("**"),
154            "em" => self.push_str("_"),
155            "code" => {
156                if !self.is_inside("pre") {
157                    self.push_str("`");
158                }
159            }
160            "pre" => {
161                let classes = tag.classes();
162                let is_rust = classes.iter().any(|class| class == "rust");
163                let language = is_rust
164                    .then(|| "rs")
165                    .or_else(|| {
166                        classes.iter().find_map(|class| {
167                            if let Some((_, language)) = class.split_once("language-") {
168                                Some(language.trim())
169                            } else {
170                                None
171                            }
172                        })
173                    })
174                    .unwrap_or("");
175
176                self.push_str(&format!("\n\n```{language}\n"));
177            }
178            "ul" | "ol" => self.push_newline(),
179            "li" => self.push_str("- "),
180            "thead" => self.push_blank_line(),
181            "tr" => self.push_newline(),
182            "th" => {
183                self.current_table_columns += 1;
184                if self.is_first_th {
185                    self.is_first_th = false;
186                } else {
187                    self.push_str(" ");
188                }
189                self.push_str("| ");
190            }
191            "td" => {
192                if self.is_first_td {
193                    self.is_first_td = false;
194                } else {
195                    self.push_str(" ");
196                }
197                self.push_str("| ");
198            }
199            "summary" => {
200                if tag.has_class("hideme") {
201                    return StartTagOutcome::Skip;
202                }
203            }
204            "button" => {
205                if tag.attr("id").as_deref() == Some("copy-path") {
206                    return StartTagOutcome::Skip;
207                }
208            }
209            "div" | "span" => {
210                let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
211                if tag.has_any_classes(&classes_to_skip) {
212                    return StartTagOutcome::Skip;
213                }
214
215                if self.is_inside_item_name() && tag.has_class("stab") {
216                    self.push_str(" [");
217                }
218            }
219            _ => {}
220        }
221
222        StartTagOutcome::Continue
223    }
224
225    fn end_tag(&mut self, tag: &HtmlElement) {
226        match tag.tag.as_str() {
227            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
228            "strong" => self.push_str("**"),
229            "em" => self.push_str("_"),
230            "code" => {
231                if !self.is_inside("pre") {
232                    self.push_str("`");
233                }
234            }
235            "pre" => self.push_str("\n```\n"),
236            "ul" | "ol" => self.push_newline(),
237            "li" => self.push_newline(),
238            "thead" => {
239                self.push_newline();
240                for ix in 0..self.current_table_columns {
241                    if ix > 0 {
242                        self.push_str(" ");
243                    }
244                    self.push_str("| ---");
245                }
246                self.push_str(" |");
247                self.is_first_th = true;
248            }
249            "tr" => {
250                self.push_str(" |");
251                self.is_first_td = true;
252            }
253            "table" => {
254                self.current_table_columns = 0;
255            }
256            "div" | "span" => {
257                if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
258                    self.push_str(": ");
259                }
260
261                if self.is_inside_item_name() && tag.has_class("stab") {
262                    self.push_str("]");
263                }
264            }
265            _ => {}
266        }
267    }
268
269    fn visit_text(&mut self, text: String) -> Result<()> {
270        if self.is_inside("pre") {
271            self.push_str(&text);
272            return Ok(());
273        }
274
275        let text = text
276            .trim_matches(|char| char == '\n' || char == '\r' || char == 'ยง')
277            .replace('\n', " ");
278
279        if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") {
280            self.push_str(&format!("`{text}`"));
281            return Ok(());
282        }
283
284        self.push_str(&text);
285
286        Ok(())
287    }
288
289    /// Returns whether we're currently inside of an `.item-name` element, which
290    /// rustdoc uses to display Rust items in a list.
291    fn is_inside_item_name(&self) -> bool {
292        self.current_element_stack
293            .iter()
294            .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
295    }
296}