1use std::cell::RefCell;
2use std::collections::VecDeque;
3use std::sync::OnceLock;
4
5use anyhow::Result;
6use html5ever::Attribute;
7use markup5ever_rcdom::{Handle, NodeData};
8use regex::Regex;
9
10fn empty_line_regex() -> &'static Regex {
11 static REGEX: OnceLock<Regex> = OnceLock::new();
12 REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
13}
14
15fn more_than_three_newlines_regex() -> &'static Regex {
16 static REGEX: OnceLock<Regex> = OnceLock::new();
17 REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
18}
19
20#[derive(Debug, Clone)]
21struct HtmlElement {
22 tag: String,
23 attrs: RefCell<Vec<Attribute>>,
24}
25
/// The decision made when an element's start tag is encountered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StartTagOutcome {
    /// Keep processing the element: descend into its children.
    Continue,
    /// Ignore the element entirely, including everything nested inside it.
    Skip,
}
30
31pub struct MarkdownWriter {
32 current_element_stack: VecDeque<HtmlElement>,
33 /// The Markdown output.
34 markdown: String,
35}
36
37impl MarkdownWriter {
38 pub fn new() -> Self {
39 Self {
40 current_element_stack: VecDeque::new(),
41 markdown: String::new(),
42 }
43 }
44
45 fn is_inside(&self, tag: &str) -> bool {
46 self.current_element_stack
47 .iter()
48 .any(|parent_element| parent_element.tag == tag)
49 }
50
51 /// Appends the given string slice onto the end of the Markdown output.
52 fn push_str(&mut self, str: &str) {
53 self.markdown.push_str(str);
54 }
55
56 /// Appends a newline to the end of the Markdown output.
57 fn push_newline(&mut self) {
58 self.push_str("\n");
59 }
60
61 pub fn run(mut self, root_node: &Handle) -> Result<String> {
62 self.visit_node(&root_node)?;
63 Ok(Self::prettify_markdown(self.markdown))
64 }
65
66 fn prettify_markdown(markdown: String) -> String {
67 let markdown = empty_line_regex().replace_all(&markdown, "");
68 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
69
70 markdown.trim().to_string()
71 }
72
73 fn visit_node(&mut self, node: &Handle) -> Result<()> {
74 let mut current_element = None;
75
76 match node.data {
77 NodeData::Document
78 | NodeData::Doctype { .. }
79 | NodeData::ProcessingInstruction { .. }
80 | NodeData::Comment { .. } => {
81 // Currently left unimplemented, as we're not interested in this data
82 // at this time.
83 }
84 NodeData::Element {
85 ref name,
86 ref attrs,
87 ..
88 } => {
89 let tag_name = name.local.to_string();
90 if !tag_name.is_empty() {
91 current_element = Some(HtmlElement {
92 tag: tag_name,
93 attrs: attrs.clone(),
94 });
95 }
96 }
97 NodeData::Text { ref contents } => {
98 let text = contents.borrow().to_string();
99 self.visit_text(text)?;
100 }
101 }
102
103 if let Some(current_element) = current_element.as_ref() {
104 match self.start_tag(¤t_element) {
105 StartTagOutcome::Continue => {}
106 StartTagOutcome::Skip => return Ok(()),
107 }
108
109 self.current_element_stack
110 .push_back(current_element.clone());
111 }
112
113 for child in node.children.borrow().iter() {
114 self.visit_node(child)?;
115 }
116
117 if let Some(current_element) = current_element {
118 self.current_element_stack.pop_back();
119 self.end_tag(¤t_element);
120 }
121
122 Ok(())
123 }
124
125 fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
126 match tag.tag.as_str() {
127 "head" | "script" | "nav" => return StartTagOutcome::Skip,
128 "h1" => self.push_str("\n\n# "),
129 "h2" => self.push_str("\n\n## "),
130 "h3" => self.push_str("\n\n### "),
131 "h4" => self.push_str("\n\n#### "),
132 "h5" => self.push_str("\n\n##### "),
133 "h6" => self.push_str("\n\n###### "),
134 "code" => {
135 if !self.is_inside("pre") {
136 self.push_str("`")
137 }
138 }
139 "pre" => {
140 let attrs = tag.attrs.borrow();
141 let classes = attrs
142 .iter()
143 .find(|attr| attr.name.local.to_string() == "class")
144 .map(|attr| {
145 attr.value
146 .split(' ')
147 .map(|class| class.trim())
148 .collect::<Vec<_>>()
149 })
150 .unwrap_or_default();
151 let is_rust = classes.into_iter().any(|class| class == "rust");
152 let language = if is_rust { "rs" } else { "" };
153
154 self.push_str(&format!("\n```{language}\n"))
155 }
156 "ul" | "ol" => self.push_newline(),
157 "li" => self.push_str("- "),
158 "summary" => {
159 if tag.attrs.borrow().iter().any(|attr| {
160 attr.name.local.to_string() == "class" && attr.value.to_string() == "hideme"
161 }) {
162 return StartTagOutcome::Skip;
163 }
164 }
165 "div" | "span" => {
166 let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
167
168 if tag.attrs.borrow().iter().any(|attr| {
169 attr.name.local.to_string() == "class"
170 && attr
171 .value
172 .split(' ')
173 .any(|class| classes_to_skip.contains(&class.trim()))
174 }) {
175 return StartTagOutcome::Skip;
176 }
177
178 if tag.attrs.borrow().iter().any(|attr| {
179 attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
180 }) {
181 self.push_str("`");
182 }
183 }
184 _ => {}
185 }
186
187 StartTagOutcome::Continue
188 }
189
190 fn end_tag(&mut self, tag: &HtmlElement) {
191 match tag.tag.as_str() {
192 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
193 "code" => {
194 if !self.is_inside("pre") {
195 self.push_str("`")
196 }
197 }
198 "pre" => self.push_str("\n```\n"),
199 "ul" | "ol" => self.push_newline(),
200 "li" => self.push_newline(),
201 "div" => {
202 if tag.attrs.borrow().iter().any(|attr| {
203 attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
204 }) {
205 self.push_str("`: ");
206 }
207 }
208 _ => {}
209 }
210 }
211
212 fn visit_text(&mut self, text: String) -> Result<()> {
213 if self.is_inside("pre") {
214 self.push_str(&text);
215 return Ok(());
216 }
217
218 let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == 'ยง');
219 self.push_str(trimmed_text);
220
221 Ok(())
222 }
223}