1use std::collections::VecDeque;
2use std::rc::Rc;
3use std::{cell::RefCell, sync::LazyLock};
4
5use anyhow::Result;
6use markup5ever_rcdom::{Handle, NodeData};
7use regex::Regex;
8
9use crate::html_element::HtmlElement;
10
11fn empty_line_regex() -> &'static Regex {
12 static REGEX: LazyLock<Regex> =
13 LazyLock::new(|| Regex::new(r"^\s*$").expect("Failed to create empty_line_regex"));
14 ®EX
15}
16
17fn more_than_three_newlines_regex() -> &'static Regex {
18 static REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
19 ®EX
20}
21
/// The decision a tag handler returns when an element's start tag is visited.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StartTagOutcome {
    /// Keep processing the element: remaining handlers are consulted and the
    /// element's children are visited.
    Continue,
    /// Stop processing the element: no further handlers are consulted, its
    /// children are not visited, and no end-tag handling runs for it.
    Skip,
}
26
27pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
28
29pub struct MarkdownWriter {
30 current_element_stack: VecDeque<HtmlElement>,
31 pub(crate) markdown: String,
32}
33
34impl Default for MarkdownWriter {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl MarkdownWriter {
41 pub fn new() -> Self {
42 Self {
43 current_element_stack: VecDeque::new(),
44 markdown: String::new(),
45 }
46 }
47
48 pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
49 &self.current_element_stack
50 }
51
52 pub fn is_inside(&self, tag: &str) -> bool {
53 self.current_element_stack
54 .iter()
55 .any(|parent_element| parent_element.tag() == tag)
56 }
57
58 /// Appends the given string slice onto the end of the Markdown output.
59 pub fn push_str(&mut self, str: &str) {
60 self.markdown.push_str(str);
61 }
62
63 /// Appends a newline to the end of the Markdown output.
64 pub fn push_newline(&mut self) {
65 self.push_str("\n");
66 }
67
68 /// Appends a blank line to the end of the Markdown output.
69 pub fn push_blank_line(&mut self) {
70 self.push_str("\n\n");
71 }
72
73 pub fn run(mut self, root_node: &Handle, handlers: &mut [TagHandler]) -> Result<String> {
74 self.visit_node(root_node, handlers)?;
75 Ok(Self::prettify_markdown(self.markdown))
76 }
77
78 fn prettify_markdown(markdown: String) -> String {
79 let markdown = empty_line_regex().replace_all(&markdown, "");
80 let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
81
82 markdown.trim().to_string()
83 }
84
85 fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
86 let mut current_element = None;
87
88 match node.data {
89 NodeData::Document
90 | NodeData::Doctype { .. }
91 | NodeData::ProcessingInstruction { .. }
92 | NodeData::Comment { .. } => {
93 // Currently left unimplemented, as we're not interested in this data
94 // at this time.
95 }
96 NodeData::Element {
97 ref name,
98 ref attrs,
99 ..
100 } => {
101 let tag_name = name.local.to_string();
102 if !tag_name.is_empty() {
103 current_element = Some(HtmlElement::new(tag_name, attrs.clone()));
104 }
105 }
106 NodeData::Text { ref contents } => {
107 let text = contents.borrow().to_string();
108 self.visit_text(text, handlers)?;
109 }
110 }
111
112 if let Some(current_element) = current_element.as_ref() {
113 match self.start_tag(current_element, handlers) {
114 StartTagOutcome::Continue => {}
115 StartTagOutcome::Skip => return Ok(()),
116 }
117
118 self.current_element_stack
119 .push_back(current_element.clone());
120 }
121
122 if self.current_element_stack.len() < 200 {
123 for child in node.children.borrow().iter() {
124 self.visit_node(child, handlers)?;
125 }
126 }
127
128 if let Some(current_element) = current_element {
129 self.current_element_stack.pop_back();
130 self.end_tag(¤t_element, handlers);
131 }
132
133 Ok(())
134 }
135
136 fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
137 for handler in handlers {
138 if handler.borrow().should_handle(tag.tag()) {
139 match handler.borrow_mut().handle_tag_start(tag, self) {
140 StartTagOutcome::Continue => {}
141 StartTagOutcome::Skip => return StartTagOutcome::Skip,
142 }
143 }
144 }
145
146 StartTagOutcome::Continue
147 }
148
149 fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
150 for handler in handlers {
151 if handler.borrow().should_handle(tag.tag()) {
152 handler.borrow_mut().handle_tag_end(tag, self);
153 }
154 }
155 }
156
157 fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
158 for handler in handlers {
159 match handler.borrow_mut().handle_text(&text, self) {
160 HandlerOutcome::Handled => return Ok(()),
161 HandlerOutcome::NoOp => {}
162 }
163 }
164
165 let text = text
166 .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
167 .replace('\n', " ");
168
169 self.push_str(&text);
170
171 Ok(())
172 }
173}
174
/// The result a tag handler returns after being offered a piece of text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler consumed the text: no further handlers are offered it and
    /// the writer's default text output is skipped.
    Handled,
    /// The handler did nothing with the text: the next handler is tried, and
    /// if none handles it the writer emits the text itself.
    NoOp,
}
179
180pub trait HandleTag {
181 /// Returns whether this handler should handle the given tag.
182 fn should_handle(&self, tag: &str) -> bool;
183
184 /// Handles the start of the given tag.
185 fn handle_tag_start(
186 &mut self,
187 _tag: &HtmlElement,
188 _writer: &mut MarkdownWriter,
189 ) -> StartTagOutcome {
190 StartTagOutcome::Continue
191 }
192
193 /// Handles the end of the given tag.
194 fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
195
196 fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
197 HandlerOutcome::NoOp
198 }
199}