wikipedia.rs

  1use crate::html_element::HtmlElement;
  2use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
  3use crate::HandleTag;
  4
  5pub struct WikipediaChromeRemover;
  6
  7impl HandleTag for WikipediaChromeRemover {
  8    fn should_handle(&self, _tag: &str) -> bool {
  9        true
 10    }
 11
 12    fn handle_tag_start(
 13        &mut self,
 14        tag: &HtmlElement,
 15        _writer: &mut MarkdownWriter,
 16    ) -> StartTagOutcome {
 17        match tag.tag() {
 18            "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
 19            "sup" => {
 20                if tag.has_class("reference") {
 21                    return StartTagOutcome::Skip;
 22                }
 23            }
 24            "div" | "span" | "a" => {
 25                if tag.attr("id").as_deref() == Some("p-lang-btn") {
 26                    return StartTagOutcome::Skip;
 27                }
 28
 29                if tag.attr("id").as_deref() == Some("p-search") {
 30                    return StartTagOutcome::Skip;
 31                }
 32
 33                let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
 34                if tag.has_any_classes(&classes_to_skip) {
 35                    return StartTagOutcome::Skip;
 36                }
 37            }
 38            _ => {}
 39        }
 40
 41        StartTagOutcome::Continue
 42    }
 43}
 44
 45pub struct WikipediaInfoboxHandler;
 46
 47impl HandleTag for WikipediaInfoboxHandler {
 48    fn should_handle(&self, tag: &str) -> bool {
 49        tag == "table"
 50    }
 51
 52    fn handle_tag_start(
 53        &mut self,
 54        tag: &HtmlElement,
 55        _writer: &mut MarkdownWriter,
 56    ) -> StartTagOutcome {
 57        match tag.tag() {
 58            "table" => {
 59                if tag.has_class("infobox") {
 60                    return StartTagOutcome::Skip;
 61                }
 62            }
 63            _ => {}
 64        }
 65
 66        StartTagOutcome::Continue
 67    }
 68}
 69
 70pub struct WikipediaCodeHandler {
 71    language: Option<String>,
 72}
 73
 74impl WikipediaCodeHandler {
 75    pub fn new() -> Self {
 76        Self { language: None }
 77    }
 78}
 79
 80impl HandleTag for WikipediaCodeHandler {
 81    fn should_handle(&self, tag: &str) -> bool {
 82        match tag {
 83            "div" | "pre" | "code" => true,
 84            _ => false,
 85        }
 86    }
 87
 88    fn handle_tag_start(
 89        &mut self,
 90        tag: &HtmlElement,
 91        writer: &mut MarkdownWriter,
 92    ) -> StartTagOutcome {
 93        match tag.tag() {
 94            "code" => {
 95                if !writer.is_inside("pre") {
 96                    writer.push_str("`");
 97                }
 98            }
 99            "div" => {
100                let classes = tag.classes();
101                self.language = classes.iter().find_map(|class| {
102                    if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
103                        Some(language.trim().to_owned())
104                    } else {
105                        None
106                    }
107                });
108            }
109            "pre" => {
110                writer.push_blank_line();
111                writer.push_str("```");
112                if let Some(language) = self.language.take() {
113                    writer.push_str(&language);
114                }
115                writer.push_newline();
116            }
117            _ => {}
118        }
119
120        StartTagOutcome::Continue
121    }
122
123    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
124        match tag.tag() {
125            "code" => {
126                if !writer.is_inside("pre") {
127                    writer.push_str("`");
128                }
129            }
130            "pre" => writer.push_str("\n```\n"),
131            _ => {}
132        }
133    }
134
135    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
136        if writer.is_inside("pre") {
137            writer.push_str(&text);
138            return HandlerOutcome::Handled;
139        }
140
141        HandlerOutcome::NoOp
142    }
143}
144
#[cfg(test)]
mod tests {
    use std::cell::RefCell;
    use std::rc::Rc;

    use indoc::indoc;
    use pretty_assertions::assert_eq;

    use crate::{convert_html_to_markdown, markdown, TagHandler};

    use super::*;

    /// Builds the handler chain used by the tests in this module: the generic
    /// Markdown handlers followed by [`WikipediaChromeRemover`].
    fn wikipedia_handlers() -> Vec<TagHandler> {
        let paragraphs: TagHandler = Rc::new(RefCell::new(markdown::ParagraphHandler));
        let headings: TagHandler = Rc::new(RefCell::new(markdown::HeadingHandler));
        let lists: TagHandler = Rc::new(RefCell::new(markdown::ListHandler));
        let styled_text: TagHandler = Rc::new(RefCell::new(markdown::StyledTextHandler));
        let chrome_remover: TagHandler = Rc::new(RefCell::new(WikipediaChromeRemover));

        vec![paragraphs, headings, lists, styled_text, chrome_remover]
    }

    /// Citation markers (`<sup class="reference">`) must not appear in the
    /// converted Markdown.
    #[test]
    fn test_citation_references_get_removed() {
        let html = indoc! {r##"
            <p>Rust began as a personal project in 2006 by <a href="/wiki/Mozilla" title="Mozilla">Mozilla</a> Research employee Graydon Hoare.<sup id="cite_ref-MITTechReview_23-0" class="reference"><a href="#cite_note-MITTechReview-23">[20]</a></sup> Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental <a href="/wiki/Browser_engine" title="Browser engine">browser engine</a> called <a href="/wiki/Servo_(software)" title="Servo (software)">Servo</a>,<sup id="cite_ref-infoq2012_24-0" class="reference"><a href="#cite_note-infoq2012-24">[21]</a></sup> which was officially announced by Mozilla in 2010.<sup id="cite_ref-MattAsay_25-0" class="reference"><a href="#cite_note-MattAsay-25">[22]</a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[23]</a></sup> Rust's memory and ownership system was influenced by <a href="/wiki/Region-based_memory_management" title="Region-based memory management">region-based memory management</a> in languages such as <a href="/wiki/Cyclone_(programming_language)" title="Cyclone (programming language)">Cyclone</a> and ML Kit.<sup id="cite_ref-influences_8-13" class="reference"><a href="#cite_note-influences-8">[5]</a></sup>
            </p>
        "##};
        let expected = indoc! {"
            Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare.  Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo,  which was officially announced by Mozilla in 2010.  Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.
        "}
        .trim();

        let mut handlers = wikipedia_handlers();
        let actual = convert_html_to_markdown(html.as_bytes(), &mut handlers).unwrap();

        assert_eq!(actual, expected)
    }
}