markdown_writer.rs

  1use std::cell::RefCell;
  2use std::collections::VecDeque;
  3use std::sync::OnceLock;
  4
  5use anyhow::Result;
  6use html5ever::Attribute;
  7use markup5ever_rcdom::{Handle, NodeData};
  8use regex::Regex;
  9
 10fn empty_line_regex() -> &'static Regex {
 11    static REGEX: OnceLock<Regex> = OnceLock::new();
 12    REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
 13}
 14
 15fn more_than_three_newlines_regex() -> &'static Regex {
 16    static REGEX: OnceLock<Regex> = OnceLock::new();
 17    REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
 18}
 19
 20#[derive(Debug, Clone)]
 21struct HtmlElement {
 22    tag: String,
 23    attrs: RefCell<Vec<Attribute>>,
 24}
 25
 26enum StartTagOutcome {
 27    Continue,
 28    Skip,
 29}
 30
 31pub struct MarkdownWriter {
 32    current_element_stack: VecDeque<HtmlElement>,
 33    /// The Markdown output.
 34    markdown: String,
 35}
 36
 37impl MarkdownWriter {
 38    pub fn new() -> Self {
 39        Self {
 40            current_element_stack: VecDeque::new(),
 41            markdown: String::new(),
 42        }
 43    }
 44
 45    fn is_inside(&self, tag: &str) -> bool {
 46        self.current_element_stack
 47            .iter()
 48            .any(|parent_element| parent_element.tag == tag)
 49    }
 50
 51    /// Appends the given string slice onto the end of the Markdown output.
 52    fn push_str(&mut self, str: &str) {
 53        self.markdown.push_str(str);
 54    }
 55
 56    /// Appends a newline to the end of the Markdown output.
 57    fn push_newline(&mut self) {
 58        self.push_str("\n");
 59    }
 60
 61    pub fn run(mut self, root_node: &Handle) -> Result<String> {
 62        self.visit_node(&root_node)?;
 63        Ok(Self::prettify_markdown(self.markdown))
 64    }
 65
 66    fn prettify_markdown(markdown: String) -> String {
 67        let markdown = empty_line_regex().replace_all(&markdown, "");
 68        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
 69
 70        markdown.trim().to_string()
 71    }
 72
 73    fn visit_node(&mut self, node: &Handle) -> Result<()> {
 74        let mut current_element = None;
 75
 76        match node.data {
 77            NodeData::Document
 78            | NodeData::Doctype { .. }
 79            | NodeData::ProcessingInstruction { .. }
 80            | NodeData::Comment { .. } => {
 81                // Currently left unimplemented, as we're not interested in this data
 82                // at this time.
 83            }
 84            NodeData::Element {
 85                ref name,
 86                ref attrs,
 87                ..
 88            } => {
 89                let tag_name = name.local.to_string();
 90                if !tag_name.is_empty() {
 91                    current_element = Some(HtmlElement {
 92                        tag: tag_name,
 93                        attrs: attrs.clone(),
 94                    });
 95                }
 96            }
 97            NodeData::Text { ref contents } => {
 98                let text = contents.borrow().to_string();
 99                self.visit_text(text)?;
100            }
101        }
102
103        if let Some(current_element) = current_element.as_ref() {
104            match self.start_tag(&current_element) {
105                StartTagOutcome::Continue => {}
106                StartTagOutcome::Skip => return Ok(()),
107            }
108
109            self.current_element_stack
110                .push_back(current_element.clone());
111        }
112
113        for child in node.children.borrow().iter() {
114            self.visit_node(child)?;
115        }
116
117        if let Some(current_element) = current_element {
118            self.current_element_stack.pop_back();
119            self.end_tag(&current_element);
120        }
121
122        Ok(())
123    }
124
125    fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
126        match tag.tag.as_str() {
127            "head" | "script" | "nav" => return StartTagOutcome::Skip,
128            "h1" => self.push_str("\n\n# "),
129            "h2" => self.push_str("\n\n## "),
130            "h3" => self.push_str("\n\n### "),
131            "h4" => self.push_str("\n\n#### "),
132            "h5" => self.push_str("\n\n##### "),
133            "h6" => self.push_str("\n\n###### "),
134            "code" => {
135                if !self.is_inside("pre") {
136                    self.push_str("`")
137                }
138            }
139            "pre" => {
140                let attrs = tag.attrs.borrow();
141                let classes = attrs
142                    .iter()
143                    .find(|attr| attr.name.local.to_string() == "class")
144                    .map(|attr| {
145                        attr.value
146                            .split(' ')
147                            .map(|class| class.trim())
148                            .collect::<Vec<_>>()
149                    })
150                    .unwrap_or_default();
151                let is_rust = classes.into_iter().any(|class| class == "rust");
152                let language = if is_rust { "rs" } else { "" };
153
154                self.push_str(&format!("\n```{language}\n"))
155            }
156            "ul" | "ol" => self.push_newline(),
157            "li" => self.push_str("- "),
158            "summary" => {
159                if tag.attrs.borrow().iter().any(|attr| {
160                    attr.name.local.to_string() == "class" && attr.value.to_string() == "hideme"
161                }) {
162                    return StartTagOutcome::Skip;
163                }
164            }
165            "div" | "span" => {
166                let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
167
168                if tag.attrs.borrow().iter().any(|attr| {
169                    attr.name.local.to_string() == "class"
170                        && attr
171                            .value
172                            .split(' ')
173                            .any(|class| classes_to_skip.contains(&class.trim()))
174                }) {
175                    return StartTagOutcome::Skip;
176                }
177
178                if tag.attrs.borrow().iter().any(|attr| {
179                    attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
180                }) {
181                    self.push_str("`");
182                }
183            }
184            _ => {}
185        }
186
187        StartTagOutcome::Continue
188    }
189
190    fn end_tag(&mut self, tag: &HtmlElement) {
191        match tag.tag.as_str() {
192            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
193            "code" => {
194                if !self.is_inside("pre") {
195                    self.push_str("`")
196                }
197            }
198            "pre" => self.push_str("\n```\n"),
199            "ul" | "ol" => self.push_newline(),
200            "li" => self.push_newline(),
201            "div" => {
202                if tag.attrs.borrow().iter().any(|attr| {
203                    attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
204                }) {
205                    self.push_str("`: ");
206                }
207            }
208            _ => {}
209        }
210    }
211
212    fn visit_text(&mut self, text: String) -> Result<()> {
213        if self.is_inside("pre") {
214            self.push_str(&text);
215            return Ok(());
216        }
217
218        let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == 'ยง');
219        self.push_str(trimmed_text);
220
221        Ok(())
222    }
223}