markdown_writer.rs

  1use std::collections::VecDeque;
  2use std::rc::Rc;
  3use std::{cell::RefCell, sync::LazyLock};
  4
  5use anyhow::Result;
  6use markup5ever_rcdom::{Handle, NodeData};
  7use regex::Regex;
  8
  9use crate::html_element::HtmlElement;
 10
 11fn empty_line_regex() -> &'static Regex {
 12    static REGEX: LazyLock<Regex> =
 13        LazyLock::new(|| Regex::new(r"^\s*$").expect("Failed to create empty_line_regex"));
 14    &REGEX
 15}
 16
 17fn more_than_three_newlines_regex() -> &'static Regex {
 18    static REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
 19    &REGEX
 20}
 21
 22pub enum StartTagOutcome {
 23    Continue,
 24    Skip,
 25}
 26
 27pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
 28
 29pub struct MarkdownWriter {
 30    current_element_stack: VecDeque<HtmlElement>,
 31    pub(crate) markdown: String,
 32}
 33
 34impl Default for MarkdownWriter {
 35    fn default() -> Self {
 36        Self::new()
 37    }
 38}
 39
 40impl MarkdownWriter {
 41    pub fn new() -> Self {
 42        Self {
 43            current_element_stack: VecDeque::new(),
 44            markdown: String::new(),
 45        }
 46    }
 47
 48    pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
 49        &self.current_element_stack
 50    }
 51
 52    pub fn is_inside(&self, tag: &str) -> bool {
 53        self.current_element_stack
 54            .iter()
 55            .any(|parent_element| parent_element.tag() == tag)
 56    }
 57
 58    /// Appends the given string slice onto the end of the Markdown output.
 59    pub fn push_str(&mut self, str: &str) {
 60        self.markdown.push_str(str);
 61    }
 62
 63    /// Appends a newline to the end of the Markdown output.
 64    pub fn push_newline(&mut self) {
 65        self.push_str("\n");
 66    }
 67
 68    /// Appends a blank line to the end of the Markdown output.
 69    pub fn push_blank_line(&mut self) {
 70        self.push_str("\n\n");
 71    }
 72
 73    pub fn run(mut self, root_node: &Handle, handlers: &mut [TagHandler]) -> Result<String> {
 74        self.visit_node(root_node, handlers)?;
 75        Ok(Self::prettify_markdown(self.markdown))
 76    }
 77
 78    fn prettify_markdown(markdown: String) -> String {
 79        let markdown = empty_line_regex().replace_all(&markdown, "");
 80        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
 81
 82        markdown.trim().to_string()
 83    }
 84
 85    fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
 86        let mut current_element = None;
 87
 88        match node.data {
 89            NodeData::Document
 90            | NodeData::Doctype { .. }
 91            | NodeData::ProcessingInstruction { .. }
 92            | NodeData::Comment { .. } => {
 93                // Currently left unimplemented, as we're not interested in this data
 94                // at this time.
 95            }
 96            NodeData::Element {
 97                ref name,
 98                ref attrs,
 99                ..
100            } => {
101                let tag_name = name.local.to_string();
102                if !tag_name.is_empty() {
103                    current_element = Some(HtmlElement::new(tag_name, attrs.clone()));
104                }
105            }
106            NodeData::Text { ref contents } => {
107                let text = contents.borrow().to_string();
108                self.visit_text(text, handlers)?;
109            }
110        }
111
112        if let Some(current_element) = current_element.as_ref() {
113            match self.start_tag(current_element, handlers) {
114                StartTagOutcome::Continue => {}
115                StartTagOutcome::Skip => return Ok(()),
116            }
117
118            self.current_element_stack
119                .push_back(current_element.clone());
120        }
121
122        if self.current_element_stack.len() < 200 {
123            for child in node.children.borrow().iter() {
124                self.visit_node(child, handlers)?;
125            }
126        }
127
128        if let Some(current_element) = current_element {
129            self.current_element_stack.pop_back();
130            self.end_tag(&current_element, handlers);
131        }
132
133        Ok(())
134    }
135
136    fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
137        for handler in handlers {
138            if handler.borrow().should_handle(tag.tag()) {
139                match handler.borrow_mut().handle_tag_start(tag, self) {
140                    StartTagOutcome::Continue => {}
141                    StartTagOutcome::Skip => return StartTagOutcome::Skip,
142                }
143            }
144        }
145
146        StartTagOutcome::Continue
147    }
148
149    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
150        for handler in handlers {
151            if handler.borrow().should_handle(tag.tag()) {
152                handler.borrow_mut().handle_tag_end(tag, self);
153            }
154        }
155    }
156
157    fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
158        for handler in handlers {
159            match handler.borrow_mut().handle_text(&text, self) {
160                HandlerOutcome::Handled => return Ok(()),
161                HandlerOutcome::NoOp => {}
162            }
163        }
164
165        let text = text
166            .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
167            .replace('\n', " ");
168
169        self.push_str(&text);
170
171        Ok(())
172    }
173}
174
/// The result a [`HandleTag`] implementation returns from `handle_text`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler took care of the text: no further handlers are consulted and
    /// the writer's default text output is skipped.
    Handled,
    /// The handler did nothing with the text: processing moves on to the next
    /// handler, or to the writer's default text output if none is left.
    NoOp,
}
179
180pub trait HandleTag {
181    /// Returns whether this handler should handle the given tag.
182    fn should_handle(&self, tag: &str) -> bool;
183
184    /// Handles the start of the given tag.
185    fn handle_tag_start(
186        &mut self,
187        _tag: &HtmlElement,
188        _writer: &mut MarkdownWriter,
189    ) -> StartTagOutcome {
190        StartTagOutcome::Continue
191    }
192
193    /// Handles the end of the given tag.
194    fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
195
196    fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
197        HandlerOutcome::NoOp
198    }
199}