1use std::collections::VecDeque;
2use std::rc::Rc;
3use std::{cell::RefCell, sync::LazyLock};
4
5use anyhow::Result;
6use markup5ever_rcdom::{Handle, NodeData};
7use regex::Regex;
8
9use crate::html_element::HtmlElement;
10
11fn empty_line_regex() -> &'static Regex {
12 static REGEX: LazyLock<Regex> =
13 LazyLock::new(|| Regex::new(r"^\s*$").expect("Failed to create empty_line_regex"));
14 ®EX
15}
16
17fn more_than_three_newlines_regex() -> &'static Regex {
18 static REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
19 ®EX
20}
21
/// The decision a tag handler makes when an element's start tag is encountered.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StartTagOutcome {
    /// Keep processing: remaining handlers are consulted and, if none of them
    /// skips, the element's children are visited and its end tag is handled.
    Continue,
    /// Stop processing this element: no further handlers are consulted, its
    /// children are not visited, and its end tag is not handled.
    Skip,
}
26
/// A reference-counted, interior-mutable handle to a [`HandleTag`] implementation.
///
/// The writer borrows the handler immutably to ask `should_handle` and mutably
/// to invoke its callbacks.
pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
28
/// Walks an HTML node tree and accumulates Markdown output, delegating
/// element- and text-specific behavior to [`TagHandler`]s.
pub struct MarkdownWriter {
    /// Elements whose children are currently being visited; the most recently
    /// entered element is at the back.
    current_element_stack: VecDeque<HtmlElement>,
    /// The Markdown produced so far.
    pub(crate) markdown: String,
}
33
34impl Default for MarkdownWriter {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl MarkdownWriter {
41 pub fn new() -> Self {
42 Self {
43 current_element_stack: VecDeque::new(),
44 markdown: String::new(),
45 }
46 }
47
48 pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
49 &self.current_element_stack
50 }
51
52 pub fn is_inside(&self, tag: &str) -> bool {
53 self.current_element_stack
54 .iter()
55 .any(|parent_element| parent_element.tag() == tag)
56 }
57
58 /// Appends the given string slice onto the end of the Markdown output.
59 pub fn push_str(&mut self, str: &str) {
60 self.markdown.push_str(str);
61 }
62
63 /// Appends a newline to the end of the Markdown output.
64 pub fn push_newline(&mut self) {
65 self.push_str("\n");
66 }
67
68 /// Appends a blank line to the end of the Markdown output.
69 pub fn push_blank_line(&mut self) {
70 self.push_str("\n\n");
71 }
72
73 pub fn run(mut self, root_node: &Handle, handlers: &mut [TagHandler]) -> Result<String> {
74 self.visit_node(root_node, handlers)?;
75 Ok(Self::prettify_markdown(self.markdown))
76 }
77
78 fn prettify_markdown(markdown: String) -> String {
79 let markdown = empty_line_regex().replace_all(&markdown, "");
80 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
81
82 markdown.trim().to_string()
83 }
84
85 fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
86 let mut current_element = None;
87
88 match node.data {
89 NodeData::Document
90 | NodeData::Doctype { .. }
91 | NodeData::ProcessingInstruction { .. }
92 | NodeData::Comment { .. } => {
93 // Currently left unimplemented, as we're not interested in this data
94 // at this time.
95 }
96 NodeData::Element {
97 ref name,
98 ref attrs,
99 ..
100 } => {
101 let tag_name = name.local.to_string();
102 if !tag_name.is_empty() {
103 current_element = Some(HtmlElement::new(tag_name, attrs.clone()));
104 }
105 }
106 NodeData::Text { ref contents } => {
107 let text = contents.borrow().to_string();
108 self.visit_text(text, handlers)?;
109 }
110 }
111
112 if let Some(current_element) = current_element.as_ref() {
113 match self.start_tag(current_element, handlers) {
114 StartTagOutcome::Continue => {}
115 StartTagOutcome::Skip => return Ok(()),
116 }
117
118 self.current_element_stack
119 .push_back(current_element.clone());
120 }
121
122 for child in node.children.borrow().iter() {
123 self.visit_node(child, handlers)?;
124 }
125
126 if let Some(current_element) = current_element {
127 self.current_element_stack.pop_back();
128 self.end_tag(¤t_element, handlers);
129 }
130
131 Ok(())
132 }
133
134 fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
135 for handler in handlers {
136 if handler.borrow().should_handle(tag.tag()) {
137 match handler.borrow_mut().handle_tag_start(tag, self) {
138 StartTagOutcome::Continue => {}
139 StartTagOutcome::Skip => return StartTagOutcome::Skip,
140 }
141 }
142 }
143
144 StartTagOutcome::Continue
145 }
146
147 fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
148 for handler in handlers {
149 if handler.borrow().should_handle(tag.tag()) {
150 handler.borrow_mut().handle_tag_end(tag, self);
151 }
152 }
153 }
154
155 fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
156 for handler in handlers {
157 match handler.borrow_mut().handle_text(&text, self) {
158 HandlerOutcome::Handled => return Ok(()),
159 HandlerOutcome::NoOp => {}
160 }
161 }
162
163 let text = text
164 .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
165 .replace('\n', " ");
166
167 self.push_str(&text);
168
169 Ok(())
170 }
171}
172
/// The result of offering a text node to a handler.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler took care of the text: no further handlers are consulted and
    /// the writer does not emit its default rendering of the text.
    Handled,
    /// The handler did nothing with the text: the next handler is consulted and,
    /// if no handler handles it, the writer emits its default rendering.
    NoOp,
}
177
/// Callbacks invoked by [`MarkdownWriter`] while it walks an HTML tree.
///
/// Only `should_handle` is required; the remaining methods have default
/// implementations that leave the output untouched.
pub trait HandleTag {
    /// Returns whether this handler should handle the given tag.
    ///
    /// The writer checks this before calling `handle_tag_start` and
    /// `handle_tag_end`; it is not checked before `handle_text`.
    fn should_handle(&self, tag: &str) -> bool;

    /// Handles the start of the given tag.
    ///
    /// The default implementation does nothing and returns
    /// [`StartTagOutcome::Continue`].
    fn handle_tag_start(
        &mut self,
        _tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        StartTagOutcome::Continue
    }

    /// Handles the end of the given tag.
    ///
    /// The default implementation does nothing.
    fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}

    /// Handles the contents of a text node.
    ///
    /// The writer offers every text node to every handler, regardless of
    /// `should_handle`. Returning [`HandlerOutcome::Handled`] stops the writer
    /// from consulting further handlers and from emitting the text itself. The
    /// default implementation returns [`HandlerOutcome::NoOp`].
    fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
        HandlerOutcome::NoOp
    }
}