1use std::collections::VecDeque;
2use std::sync::OnceLock;
3
4use anyhow::Result;
5use markup5ever_rcdom::{Handle, NodeData};
6use regex::Regex;
7
8use crate::html_element::HtmlElement;
9
10fn empty_line_regex() -> &'static Regex {
11 static REGEX: OnceLock<Regex> = OnceLock::new();
12 REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
13}
14
15fn more_than_three_newlines_regex() -> &'static Regex {
16 static REGEX: OnceLock<Regex> = OnceLock::new();
17 REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
18}
19
/// The class name of the element that rustdoc uses to display a Rust item in a
/// list.
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
21
/// What the traversal should do with an element once its start tag has been
/// handled.
enum StartTagOutcome {
    /// Keep going: visit the element's children and then handle its end tag.
    Continue,
    /// Drop the element: its children are not visited and its end tag is not
    /// handled.
    Skip,
}
26
/// Walks an HTML node tree and accumulates a Markdown rendering of it.
pub struct MarkdownWriter {
    /// The elements enclosing the node currently being visited, outermost
    /// first (elements are pushed on the back when entered and popped when
    /// left).
    current_element_stack: VecDeque<HtmlElement>,
    /// The number of columns in the current `<table>`.
    current_table_columns: usize,
    /// Whether the next `<th>` is the first of its header; later cells are
    /// preceded by a separating space. Reset when a `<thead>` ends.
    is_first_th: bool,
    /// Whether the next `<td>` is the first of its row; later cells are
    /// preceded by a separating space. Reset when a `<tr>` ends.
    is_first_td: bool,
    /// The Markdown output.
    markdown: String,
}
36
37impl MarkdownWriter {
38 pub fn new() -> Self {
39 Self {
40 current_element_stack: VecDeque::new(),
41 current_table_columns: 0,
42 is_first_th: true,
43 is_first_td: true,
44 markdown: String::new(),
45 }
46 }
47
48 fn is_inside(&self, tag: &str) -> bool {
49 self.current_element_stack
50 .iter()
51 .any(|parent_element| parent_element.tag == tag)
52 }
53
54 /// Appends the given string slice onto the end of the Markdown output.
55 fn push_str(&mut self, str: &str) {
56 self.markdown.push_str(str);
57 }
58
59 /// Appends a newline to the end of the Markdown output.
60 fn push_newline(&mut self) {
61 self.push_str("\n");
62 }
63
64 /// Appends a blank line to the end of the Markdown output.
65 fn push_blank_line(&mut self) {
66 self.push_str("\n\n");
67 }
68
69 pub fn run(mut self, root_node: &Handle) -> Result<String> {
70 self.visit_node(&root_node)?;
71 Ok(Self::prettify_markdown(self.markdown))
72 }
73
74 fn prettify_markdown(markdown: String) -> String {
75 let markdown = empty_line_regex().replace_all(&markdown, "");
76 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
77
78 markdown.trim().to_string()
79 }
80
81 fn visit_node(&mut self, node: &Handle) -> Result<()> {
82 let mut current_element = None;
83
84 match node.data {
85 NodeData::Document
86 | NodeData::Doctype { .. }
87 | NodeData::ProcessingInstruction { .. }
88 | NodeData::Comment { .. } => {
89 // Currently left unimplemented, as we're not interested in this data
90 // at this time.
91 }
92 NodeData::Element {
93 ref name,
94 ref attrs,
95 ..
96 } => {
97 let tag_name = name.local.to_string();
98 if !tag_name.is_empty() {
99 current_element = Some(HtmlElement {
100 tag: tag_name,
101 attrs: attrs.clone(),
102 });
103 }
104 }
105 NodeData::Text { ref contents } => {
106 let text = contents.borrow().to_string();
107 self.visit_text(text)?;
108 }
109 }
110
111 if let Some(current_element) = current_element.as_ref() {
112 match self.start_tag(¤t_element) {
113 StartTagOutcome::Continue => {}
114 StartTagOutcome::Skip => return Ok(()),
115 }
116
117 self.current_element_stack
118 .push_back(current_element.clone());
119 }
120
121 for child in node.children.borrow().iter() {
122 self.visit_node(child)?;
123 }
124
125 if let Some(current_element) = current_element {
126 self.current_element_stack.pop_back();
127 self.end_tag(¤t_element);
128 }
129
130 Ok(())
131 }
132
133 fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
134 if tag.is_inline() && self.is_inside("p") {
135 if let Some(parent) = self.current_element_stack.iter().last() {
136 if !parent.is_inline() {
137 if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) {
138 self.push_str(" ");
139 }
140 }
141 }
142 }
143
144 match tag.tag.as_str() {
145 "head" | "script" | "nav" => return StartTagOutcome::Skip,
146 "h1" => self.push_str("\n\n# "),
147 "h2" => self.push_str("\n\n## "),
148 "h3" => self.push_str("\n\n### "),
149 "h4" => self.push_str("\n\n#### "),
150 "h5" => self.push_str("\n\n##### "),
151 "h6" => self.push_str("\n\n###### "),
152 "p" => self.push_blank_line(),
153 "strong" => self.push_str("**"),
154 "em" => self.push_str("_"),
155 "code" => {
156 if !self.is_inside("pre") {
157 self.push_str("`");
158 }
159 }
160 "pre" => {
161 let classes = tag.classes();
162 let is_rust = classes.iter().any(|class| class == "rust");
163 let language = is_rust
164 .then(|| "rs")
165 .or_else(|| {
166 classes.iter().find_map(|class| {
167 if let Some((_, language)) = class.split_once("language-") {
168 Some(language.trim())
169 } else {
170 None
171 }
172 })
173 })
174 .unwrap_or("");
175
176 self.push_str(&format!("\n\n```{language}\n"));
177 }
178 "ul" | "ol" => self.push_newline(),
179 "li" => self.push_str("- "),
180 "thead" => self.push_blank_line(),
181 "tr" => self.push_newline(),
182 "th" => {
183 self.current_table_columns += 1;
184 if self.is_first_th {
185 self.is_first_th = false;
186 } else {
187 self.push_str(" ");
188 }
189 self.push_str("| ");
190 }
191 "td" => {
192 if self.is_first_td {
193 self.is_first_td = false;
194 } else {
195 self.push_str(" ");
196 }
197 self.push_str("| ");
198 }
199 "summary" => {
200 if tag.has_class("hideme") {
201 return StartTagOutcome::Skip;
202 }
203 }
204 "button" => {
205 if tag.attr("id").as_deref() == Some("copy-path") {
206 return StartTagOutcome::Skip;
207 }
208 }
209 "div" | "span" => {
210 let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
211 if tag.has_any_classes(&classes_to_skip) {
212 return StartTagOutcome::Skip;
213 }
214
215 if self.is_inside_item_name() && tag.has_class("stab") {
216 self.push_str(" [");
217 }
218 }
219 _ => {}
220 }
221
222 StartTagOutcome::Continue
223 }
224
225 fn end_tag(&mut self, tag: &HtmlElement) {
226 match tag.tag.as_str() {
227 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
228 "strong" => self.push_str("**"),
229 "em" => self.push_str("_"),
230 "code" => {
231 if !self.is_inside("pre") {
232 self.push_str("`");
233 }
234 }
235 "pre" => self.push_str("\n```\n"),
236 "ul" | "ol" => self.push_newline(),
237 "li" => self.push_newline(),
238 "thead" => {
239 self.push_newline();
240 for ix in 0..self.current_table_columns {
241 if ix > 0 {
242 self.push_str(" ");
243 }
244 self.push_str("| ---");
245 }
246 self.push_str(" |");
247 self.is_first_th = true;
248 }
249 "tr" => {
250 self.push_str(" |");
251 self.is_first_td = true;
252 }
253 "table" => {
254 self.current_table_columns = 0;
255 }
256 "div" | "span" => {
257 if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
258 self.push_str(": ");
259 }
260
261 if self.is_inside_item_name() && tag.has_class("stab") {
262 self.push_str("]");
263 }
264 }
265 _ => {}
266 }
267 }
268
269 fn visit_text(&mut self, text: String) -> Result<()> {
270 if self.is_inside("pre") {
271 self.push_str(&text);
272 return Ok(());
273 }
274
275 let text = text
276 .trim_matches(|char| char == '\n' || char == '\r' || char == 'ยง')
277 .replace('\n', " ");
278
279 if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") {
280 self.push_str(&format!("`{text}`"));
281 return Ok(());
282 }
283
284 self.push_str(&text);
285
286 Ok(())
287 }
288
289 /// Returns whether we're currently inside of an `.item-name` element, which
290 /// rustdoc uses to display Rust items in a list.
291 fn is_inside_item_name(&self) -> bool {
292 self.current_element_stack
293 .iter()
294 .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
295 }
296}