1use std::collections::VecDeque;
2use std::sync::OnceLock;
3
4use anyhow::Result;
5use markup5ever_rcdom::{Handle, NodeData};
6use regex::Regex;
7
8use crate::html_element::HtmlElement;
9
10fn empty_line_regex() -> &'static Regex {
11 static REGEX: OnceLock<Regex> = OnceLock::new();
12 REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
13}
14
15fn more_than_three_newlines_regex() -> &'static Regex {
16 static REGEX: OnceLock<Regex> = OnceLock::new();
17 REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
18}
19
/// The result of handling the start of an HTML tag.
///
/// Returned by [`HandleTag::handle_tag_start`] to tell the writer how to
/// proceed with the element that was just opened.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StartTagOutcome {
    /// Keep going: remaining handlers are consulted and, if none of them
    /// skips, the element's children are visited.
    Continue,
    /// Stop processing this element: no further handlers are consulted, its
    /// children are not visited, and no end-tag callbacks run for it.
    Skip,
}
24
25pub struct MarkdownWriter {
26 current_element_stack: VecDeque<HtmlElement>,
27 pub(crate) markdown: String,
28}
29
30impl MarkdownWriter {
31 pub fn new() -> Self {
32 Self {
33 current_element_stack: VecDeque::new(),
34 markdown: String::new(),
35 }
36 }
37
38 pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
39 &self.current_element_stack
40 }
41
42 pub fn is_inside(&self, tag: &str) -> bool {
43 self.current_element_stack
44 .iter()
45 .any(|parent_element| parent_element.tag == tag)
46 }
47
48 /// Appends the given string slice onto the end of the Markdown output.
49 pub fn push_str(&mut self, str: &str) {
50 self.markdown.push_str(str);
51 }
52
53 /// Appends a newline to the end of the Markdown output.
54 pub fn push_newline(&mut self) {
55 self.push_str("\n");
56 }
57
58 /// Appends a blank line to the end of the Markdown output.
59 pub fn push_blank_line(&mut self) {
60 self.push_str("\n\n");
61 }
62
63 pub fn run(
64 mut self,
65 root_node: &Handle,
66 mut handlers: Vec<Box<dyn HandleTag>>,
67 ) -> Result<String> {
68 self.visit_node(&root_node, &mut handlers)?;
69 Ok(Self::prettify_markdown(self.markdown))
70 }
71
72 fn prettify_markdown(markdown: String) -> String {
73 let markdown = empty_line_regex().replace_all(&markdown, "");
74 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
75
76 markdown.trim().to_string()
77 }
78
79 fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
80 let mut current_element = None;
81
82 match node.data {
83 NodeData::Document
84 | NodeData::Doctype { .. }
85 | NodeData::ProcessingInstruction { .. }
86 | NodeData::Comment { .. } => {
87 // Currently left unimplemented, as we're not interested in this data
88 // at this time.
89 }
90 NodeData::Element {
91 ref name,
92 ref attrs,
93 ..
94 } => {
95 let tag_name = name.local.to_string();
96 if !tag_name.is_empty() {
97 current_element = Some(HtmlElement {
98 tag: tag_name,
99 attrs: attrs.clone(),
100 });
101 }
102 }
103 NodeData::Text { ref contents } => {
104 let text = contents.borrow().to_string();
105 self.visit_text(text, handlers)?;
106 }
107 }
108
109 if let Some(current_element) = current_element.as_ref() {
110 match self.start_tag(¤t_element, handlers) {
111 StartTagOutcome::Continue => {}
112 StartTagOutcome::Skip => return Ok(()),
113 }
114
115 self.current_element_stack
116 .push_back(current_element.clone());
117 }
118
119 for child in node.children.borrow().iter() {
120 self.visit_node(child, handlers)?;
121 }
122
123 if let Some(current_element) = current_element {
124 self.current_element_stack.pop_back();
125 self.end_tag(¤t_element, handlers);
126 }
127
128 Ok(())
129 }
130
131 fn start_tag(
132 &mut self,
133 tag: &HtmlElement,
134 handlers: &mut [Box<dyn HandleTag>],
135 ) -> StartTagOutcome {
136 for handler in handlers {
137 if handler.should_handle(tag.tag.as_str()) {
138 match handler.handle_tag_start(tag, self) {
139 StartTagOutcome::Continue => {}
140 StartTagOutcome::Skip => return StartTagOutcome::Skip,
141 }
142 }
143 }
144
145 StartTagOutcome::Continue
146 }
147
148 fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
149 for handler in handlers {
150 if handler.should_handle(tag.tag.as_str()) {
151 handler.handle_tag_end(tag, self);
152 }
153 }
154 }
155
156 fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
157 for handler in handlers {
158 match handler.handle_text(&text, self) {
159 HandlerOutcome::Handled => return Ok(()),
160 HandlerOutcome::NoOp => {}
161 }
162 }
163
164 let text = text
165 .trim_matches(|char| char == '\n' || char == '\r')
166 .replace('\n', " ");
167
168 self.push_str(&text);
169
170 Ok(())
171 }
172}
173
/// The result of offering a text node to a handler.
///
/// Returned by [`HandleTag::handle_text`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler took care of the text: no further handlers are consulted
    /// and the writer does not apply its default text treatment.
    Handled,
    /// The handler did nothing with the text; the next handler (or the
    /// writer's default treatment) gets a chance.
    NoOp,
}
178
179pub trait HandleTag {
180 /// Returns whether this handler should handle the given tag.
181 fn should_handle(&self, tag: &str) -> bool;
182
183 /// Handles the start of the given tag.
184 fn handle_tag_start(
185 &mut self,
186 _tag: &HtmlElement,
187 _writer: &mut MarkdownWriter,
188 ) -> StartTagOutcome {
189 StartTagOutcome::Continue
190 }
191
192 /// Handles the end of the given tag.
193 fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
194
195 fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
196 HandlerOutcome::NoOp
197 }
198}