1use std::cell::RefCell;
2use std::collections::VecDeque;
3use std::rc::Rc;
4use std::sync::OnceLock;
5
6use anyhow::Result;
7use markup5ever_rcdom::{Handle, NodeData};
8use regex::Regex;
9
10use crate::html_element::HtmlElement;
11
12fn empty_line_regex() -> &'static Regex {
13 static REGEX: OnceLock<Regex> = OnceLock::new();
14 REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
15}
16
17fn more_than_three_newlines_regex() -> &'static Regex {
18 static REGEX: OnceLock<Regex> = OnceLock::new();
19 REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
20}
21
/// The decision a tag handler returns when it is shown the start of an element.
///
/// The standard traits are derived so that outcomes can be logged, copied and
/// compared (for example in handler tests).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StartTagOutcome {
    /// Keep processing the element: remaining handlers are consulted and the
    /// element's children are visited.
    Continue,
    /// Abandon the element: no further handlers are consulted for it, its
    /// children are not visited, and its end-tag handlers are not run.
    Skip,
}
26
/// A shared handle to a [`HandleTag`] implementation.
///
/// The handler is reference-counted (`Rc`) and wrapped in a `RefCell` so that
/// it can be borrowed mutably at runtime while it processes a tag or text.
pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
28
/// Accumulates Markdown text while an HTML node tree is being visited.
pub struct MarkdownWriter {
    /// Elements whose children are currently being visited. An element is
    /// pushed onto the back when it is entered and popped from the back when
    /// it is left, so the outermost element sits at the front.
    current_element_stack: VecDeque<HtmlElement>,
    /// The Markdown produced so far.
    pub(crate) markdown: String,
}
33
34impl MarkdownWriter {
35 pub fn new() -> Self {
36 Self {
37 current_element_stack: VecDeque::new(),
38 markdown: String::new(),
39 }
40 }
41
42 pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
43 &self.current_element_stack
44 }
45
46 pub fn is_inside(&self, tag: &str) -> bool {
47 self.current_element_stack
48 .iter()
49 .any(|parent_element| parent_element.tag() == tag)
50 }
51
52 /// Appends the given string slice onto the end of the Markdown output.
53 pub fn push_str(&mut self, str: &str) {
54 self.markdown.push_str(str);
55 }
56
57 /// Appends a newline to the end of the Markdown output.
58 pub fn push_newline(&mut self) {
59 self.push_str("\n");
60 }
61
62 /// Appends a blank line to the end of the Markdown output.
63 pub fn push_blank_line(&mut self) {
64 self.push_str("\n\n");
65 }
66
67 pub fn run(mut self, root_node: &Handle, handlers: &mut Vec<TagHandler>) -> Result<String> {
68 self.visit_node(&root_node, handlers)?;
69 Ok(Self::prettify_markdown(self.markdown))
70 }
71
72 fn prettify_markdown(markdown: String) -> String {
73 let markdown = empty_line_regex().replace_all(&markdown, "");
74 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
75
76 markdown.trim().to_string()
77 }
78
79 fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
80 let mut current_element = None;
81
82 match node.data {
83 NodeData::Document
84 | NodeData::Doctype { .. }
85 | NodeData::ProcessingInstruction { .. }
86 | NodeData::Comment { .. } => {
87 // Currently left unimplemented, as we're not interested in this data
88 // at this time.
89 }
90 NodeData::Element {
91 ref name,
92 ref attrs,
93 ..
94 } => {
95 let tag_name = name.local.to_string();
96 if !tag_name.is_empty() {
97 current_element = Some(HtmlElement::new(tag_name, attrs.clone()));
98 }
99 }
100 NodeData::Text { ref contents } => {
101 let text = contents.borrow().to_string();
102 self.visit_text(text, handlers)?;
103 }
104 }
105
106 if let Some(current_element) = current_element.as_ref() {
107 match self.start_tag(¤t_element, handlers) {
108 StartTagOutcome::Continue => {}
109 StartTagOutcome::Skip => return Ok(()),
110 }
111
112 self.current_element_stack
113 .push_back(current_element.clone());
114 }
115
116 for child in node.children.borrow().iter() {
117 self.visit_node(child, handlers)?;
118 }
119
120 if let Some(current_element) = current_element {
121 self.current_element_stack.pop_back();
122 self.end_tag(¤t_element, handlers);
123 }
124
125 Ok(())
126 }
127
128 fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
129 for handler in handlers {
130 if handler.borrow().should_handle(tag.tag()) {
131 match handler.borrow_mut().handle_tag_start(tag, self) {
132 StartTagOutcome::Continue => {}
133 StartTagOutcome::Skip => return StartTagOutcome::Skip,
134 }
135 }
136 }
137
138 StartTagOutcome::Continue
139 }
140
141 fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
142 for handler in handlers {
143 if handler.borrow().should_handle(tag.tag()) {
144 handler.borrow_mut().handle_tag_end(tag, self);
145 }
146 }
147 }
148
149 fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
150 for handler in handlers {
151 match handler.borrow_mut().handle_text(&text, self) {
152 HandlerOutcome::Handled => return Ok(()),
153 HandlerOutcome::NoOp => {}
154 }
155 }
156
157 let text = text
158 .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
159 .replace('\n', " ");
160
161 self.push_str(&text);
162
163 Ok(())
164 }
165}
166
/// The result a handler reports after being offered a piece of text.
///
/// The standard traits are derived so that outcomes can be logged, copied and
/// compared (for example in handler tests).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler took care of the text; no later handler is consulted and
    /// the writer does not append the text itself.
    Handled,
    /// The handler did nothing with the text; processing moves on to the next
    /// handler, or to the writer's default rendering if none remain.
    NoOp,
}
171
/// A pluggable handler that turns HTML elements and text into Markdown.
///
/// Every method except [`HandleTag::should_handle`] has a default
/// implementation that does nothing, so an implementor only overrides the
/// hooks it needs.
pub trait HandleTag {
    /// Returns whether this handler should handle the given tag.
    ///
    /// The tag hooks below are only invoked for tag names this accepts.
    fn should_handle(&self, tag: &str) -> bool;

    /// Handles the start of the given tag.
    ///
    /// Called before the element's children are visited. Returning
    /// [`StartTagOutcome::Skip`] abandons the element; the default returns
    /// [`StartTagOutcome::Continue`].
    fn handle_tag_start(
        &mut self,
        _tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        StartTagOutcome::Continue
    }

    /// Handles the end of the given tag.
    ///
    /// Called after the element's children have been visited. The default
    /// does nothing.
    fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}

    /// Handles a piece of text.
    ///
    /// The writer offers text to handlers without checking
    /// [`HandleTag::should_handle`] first. Returning
    /// [`HandlerOutcome::Handled`] stops further processing of the text; the
    /// default returns [`HandlerOutcome::NoOp`].
    fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
        HandlerOutcome::NoOp
    }
}