markdown_writer.rs

  1use std::cell::RefCell;
  2use std::collections::VecDeque;
  3use std::rc::Rc;
  4use std::sync::OnceLock;
  5
  6use anyhow::Result;
  7use markup5ever_rcdom::{Handle, NodeData};
  8use regex::Regex;
  9
 10use crate::html_element::HtmlElement;
 11
 12fn empty_line_regex() -> &'static Regex {
 13    static REGEX: OnceLock<Regex> = OnceLock::new();
 14    REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
 15}
 16
 17fn more_than_three_newlines_regex() -> &'static Regex {
 18    static REGEX: OnceLock<Regex> = OnceLock::new();
 19    REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
 20}
 21
 22pub enum StartTagOutcome {
 23    Continue,
 24    Skip,
 25}
 26
 27pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
 28
 29pub struct MarkdownWriter {
 30    current_element_stack: VecDeque<HtmlElement>,
 31    pub(crate) markdown: String,
 32}
 33
 34impl MarkdownWriter {
 35    pub fn new() -> Self {
 36        Self {
 37            current_element_stack: VecDeque::new(),
 38            markdown: String::new(),
 39        }
 40    }
 41
 42    pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
 43        &self.current_element_stack
 44    }
 45
 46    pub fn is_inside(&self, tag: &str) -> bool {
 47        self.current_element_stack
 48            .iter()
 49            .any(|parent_element| parent_element.tag() == tag)
 50    }
 51
 52    /// Appends the given string slice onto the end of the Markdown output.
 53    pub fn push_str(&mut self, str: &str) {
 54        self.markdown.push_str(str);
 55    }
 56
 57    /// Appends a newline to the end of the Markdown output.
 58    pub fn push_newline(&mut self) {
 59        self.push_str("\n");
 60    }
 61
 62    /// Appends a blank line to the end of the Markdown output.
 63    pub fn push_blank_line(&mut self) {
 64        self.push_str("\n\n");
 65    }
 66
 67    pub fn run(mut self, root_node: &Handle, handlers: &mut Vec<TagHandler>) -> Result<String> {
 68        self.visit_node(&root_node, handlers)?;
 69        Ok(Self::prettify_markdown(self.markdown))
 70    }
 71
 72    fn prettify_markdown(markdown: String) -> String {
 73        let markdown = empty_line_regex().replace_all(&markdown, "");
 74        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
 75
 76        markdown.trim().to_string()
 77    }
 78
 79    fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
 80        let mut current_element = None;
 81
 82        match node.data {
 83            NodeData::Document
 84            | NodeData::Doctype { .. }
 85            | NodeData::ProcessingInstruction { .. }
 86            | NodeData::Comment { .. } => {
 87                // Currently left unimplemented, as we're not interested in this data
 88                // at this time.
 89            }
 90            NodeData::Element {
 91                ref name,
 92                ref attrs,
 93                ..
 94            } => {
 95                let tag_name = name.local.to_string();
 96                if !tag_name.is_empty() {
 97                    current_element = Some(HtmlElement::new(tag_name, attrs.clone()));
 98                }
 99            }
100            NodeData::Text { ref contents } => {
101                let text = contents.borrow().to_string();
102                self.visit_text(text, handlers)?;
103            }
104        }
105
106        if let Some(current_element) = current_element.as_ref() {
107            match self.start_tag(&current_element, handlers) {
108                StartTagOutcome::Continue => {}
109                StartTagOutcome::Skip => return Ok(()),
110            }
111
112            self.current_element_stack
113                .push_back(current_element.clone());
114        }
115
116        for child in node.children.borrow().iter() {
117            self.visit_node(child, handlers)?;
118        }
119
120        if let Some(current_element) = current_element {
121            self.current_element_stack.pop_back();
122            self.end_tag(&current_element, handlers);
123        }
124
125        Ok(())
126    }
127
128    fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
129        for handler in handlers {
130            if handler.borrow().should_handle(tag.tag()) {
131                match handler.borrow_mut().handle_tag_start(tag, self) {
132                    StartTagOutcome::Continue => {}
133                    StartTagOutcome::Skip => return StartTagOutcome::Skip,
134                }
135            }
136        }
137
138        StartTagOutcome::Continue
139    }
140
141    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
142        for handler in handlers {
143            if handler.borrow().should_handle(tag.tag()) {
144                handler.borrow_mut().handle_tag_end(tag, self);
145            }
146        }
147    }
148
149    fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
150        for handler in handlers {
151            match handler.borrow_mut().handle_text(&text, self) {
152                HandlerOutcome::Handled => return Ok(()),
153                HandlerOutcome::NoOp => {}
154            }
155        }
156
157        let text = text
158            .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
159            .replace('\n', " ");
160
161        self.push_str(&text);
162
163        Ok(())
164    }
165}
166
/// The result a handler reports after being offered a piece of text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler consumed the text: no further handlers are consulted and the writer's
    /// default text output is not produced.
    Handled,
    /// The handler did nothing with the text: processing continues with the next handler.
    NoOp,
}
171
172pub trait HandleTag {
173    /// Returns whether this handler should handle the given tag.
174    fn should_handle(&self, tag: &str) -> bool;
175
176    /// Handles the start of the given tag.
177    fn handle_tag_start(
178        &mut self,
179        _tag: &HtmlElement,
180        _writer: &mut MarkdownWriter,
181    ) -> StartTagOutcome {
182        StartTagOutcome::Continue
183    }
184
185    /// Handles the end of the given tag.
186    fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
187
188    fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
189        HandlerOutcome::NoOp
190    }
191}