markdown_writer.rs

  1use std::collections::VecDeque;
  2use std::sync::OnceLock;
  3
  4use anyhow::Result;
  5use markup5ever_rcdom::{Handle, NodeData};
  6use regex::Regex;
  7
  8use crate::html_element::HtmlElement;
  9
 10fn empty_line_regex() -> &'static Regex {
 11    static REGEX: OnceLock<Regex> = OnceLock::new();
 12    REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
 13}
 14
 15fn more_than_three_newlines_regex() -> &'static Regex {
 16    static REGEX: OnceLock<Regex> = OnceLock::new();
 17    REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
 18}
 19
 20pub enum StartTagOutcome {
 21    Continue,
 22    Skip,
 23}
 24
 25pub struct MarkdownWriter {
 26    current_element_stack: VecDeque<HtmlElement>,
 27    pub(crate) markdown: String,
 28}
 29
 30impl MarkdownWriter {
 31    pub fn new() -> Self {
 32        Self {
 33            current_element_stack: VecDeque::new(),
 34            markdown: String::new(),
 35        }
 36    }
 37
 38    pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
 39        &self.current_element_stack
 40    }
 41
 42    pub fn is_inside(&self, tag: &str) -> bool {
 43        self.current_element_stack
 44            .iter()
 45            .any(|parent_element| parent_element.tag == tag)
 46    }
 47
 48    /// Appends the given string slice onto the end of the Markdown output.
 49    pub fn push_str(&mut self, str: &str) {
 50        self.markdown.push_str(str);
 51    }
 52
 53    /// Appends a newline to the end of the Markdown output.
 54    pub fn push_newline(&mut self) {
 55        self.push_str("\n");
 56    }
 57
 58    /// Appends a blank line to the end of the Markdown output.
 59    pub fn push_blank_line(&mut self) {
 60        self.push_str("\n\n");
 61    }
 62
 63    pub fn run(
 64        mut self,
 65        root_node: &Handle,
 66        mut handlers: Vec<Box<dyn HandleTag>>,
 67    ) -> Result<String> {
 68        self.visit_node(&root_node, &mut handlers)?;
 69        Ok(Self::prettify_markdown(self.markdown))
 70    }
 71
 72    fn prettify_markdown(markdown: String) -> String {
 73        let markdown = empty_line_regex().replace_all(&markdown, "");
 74        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
 75
 76        markdown.trim().to_string()
 77    }
 78
 79    fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
 80        let mut current_element = None;
 81
 82        match node.data {
 83            NodeData::Document
 84            | NodeData::Doctype { .. }
 85            | NodeData::ProcessingInstruction { .. }
 86            | NodeData::Comment { .. } => {
 87                // Currently left unimplemented, as we're not interested in this data
 88                // at this time.
 89            }
 90            NodeData::Element {
 91                ref name,
 92                ref attrs,
 93                ..
 94            } => {
 95                let tag_name = name.local.to_string();
 96                if !tag_name.is_empty() {
 97                    current_element = Some(HtmlElement {
 98                        tag: tag_name,
 99                        attrs: attrs.clone(),
100                    });
101                }
102            }
103            NodeData::Text { ref contents } => {
104                let text = contents.borrow().to_string();
105                self.visit_text(text, handlers)?;
106            }
107        }
108
109        if let Some(current_element) = current_element.as_ref() {
110            match self.start_tag(&current_element, handlers) {
111                StartTagOutcome::Continue => {}
112                StartTagOutcome::Skip => return Ok(()),
113            }
114
115            self.current_element_stack
116                .push_back(current_element.clone());
117        }
118
119        for child in node.children.borrow().iter() {
120            self.visit_node(child, handlers)?;
121        }
122
123        if let Some(current_element) = current_element {
124            self.current_element_stack.pop_back();
125            self.end_tag(&current_element, handlers);
126        }
127
128        Ok(())
129    }
130
131    fn start_tag(
132        &mut self,
133        tag: &HtmlElement,
134        handlers: &mut [Box<dyn HandleTag>],
135    ) -> StartTagOutcome {
136        for handler in handlers {
137            if handler.should_handle(tag.tag.as_str()) {
138                match handler.handle_tag_start(tag, self) {
139                    StartTagOutcome::Continue => {}
140                    StartTagOutcome::Skip => return StartTagOutcome::Skip,
141                }
142            }
143        }
144
145        StartTagOutcome::Continue
146    }
147
148    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
149        for handler in handlers {
150            if handler.should_handle(tag.tag.as_str()) {
151                handler.handle_tag_end(tag, self);
152            }
153        }
154    }
155
156    fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
157        for handler in handlers {
158            match handler.handle_text(&text, self) {
159                HandlerOutcome::Handled => return Ok(()),
160                HandlerOutcome::NoOp => {}
161            }
162        }
163
164        let text = text
165            .trim_matches(|char| char == '\n' || char == '\r')
166            .replace('\n', " ");
167
168        self.push_str(&text);
169
170        Ok(())
171    }
172}
173
/// The result of offering a text node to a handler through
/// [`HandleTag::handle_text`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler consumed the text: no further handlers are asked and the
    /// writer's default text output is skipped.
    Handled,
    /// The handler did nothing with the text: the next handler is tried, and
    /// if none handles it the writer emits the text itself.
    NoOp,
}
178
179pub trait HandleTag {
180    /// Returns whether this handler should handle the given tag.
181    fn should_handle(&self, tag: &str) -> bool;
182
183    /// Handles the start of the given tag.
184    fn handle_tag_start(
185        &mut self,
186        _tag: &HtmlElement,
187        _writer: &mut MarkdownWriter,
188    ) -> StartTagOutcome {
189        StartTagOutcome::Continue
190    }
191
192    /// Handles the end of the given tag.
193    fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
194
195    fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
196        HandlerOutcome::NoOp
197    }
198}