1use crate::HandleTag;
  2use crate::html_element::HtmlElement;
  3use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
  4
  5pub struct WikipediaChromeRemover;
  6
  7impl HandleTag for WikipediaChromeRemover {
  8    fn should_handle(&self, _tag: &str) -> bool {
  9        true
 10    }
 11
 12    fn handle_tag_start(
 13        &mut self,
 14        tag: &HtmlElement,
 15        _writer: &mut MarkdownWriter,
 16    ) -> StartTagOutcome {
 17        match tag.tag() {
 18            "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
 19            "sup" => {
 20                if tag.has_class("reference") {
 21                    return StartTagOutcome::Skip;
 22                }
 23            }
 24            "div" | "span" | "a" => {
 25                if tag.attr("id").as_deref() == Some("p-lang-btn") {
 26                    return StartTagOutcome::Skip;
 27                }
 28
 29                if tag.attr("id").as_deref() == Some("p-search") {
 30                    return StartTagOutcome::Skip;
 31                }
 32
 33                let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
 34                if tag.has_any_classes(&classes_to_skip) {
 35                    return StartTagOutcome::Skip;
 36                }
 37            }
 38            _ => {}
 39        }
 40
 41        StartTagOutcome::Continue
 42    }
 43}
 44
 45pub struct WikipediaInfoboxHandler;
 46
 47impl HandleTag for WikipediaInfoboxHandler {
 48    fn should_handle(&self, tag: &str) -> bool {
 49        tag == "table"
 50    }
 51
 52    fn handle_tag_start(
 53        &mut self,
 54        tag: &HtmlElement,
 55        _writer: &mut MarkdownWriter,
 56    ) -> StartTagOutcome {
 57        if tag.tag() == "table" && tag.has_class("infobox") {
 58            return StartTagOutcome::Skip;
 59        }
 60
 61        StartTagOutcome::Continue
 62    }
 63}
 64
 65pub struct WikipediaCodeHandler {
 66    language: Option<String>,
 67}
 68
 69impl WikipediaCodeHandler {
 70    pub const fn new() -> Self {
 71        Self { language: None }
 72    }
 73}
 74
 75impl Default for WikipediaCodeHandler {
 76    fn default() -> Self {
 77        Self::new()
 78    }
 79}
 80
 81impl HandleTag for WikipediaCodeHandler {
 82    fn should_handle(&self, tag: &str) -> bool {
 83        matches!(tag, "div" | "pre" | "code")
 84    }
 85
 86    fn handle_tag_start(
 87        &mut self,
 88        tag: &HtmlElement,
 89        writer: &mut MarkdownWriter,
 90    ) -> StartTagOutcome {
 91        match tag.tag() {
 92            "code" => {
 93                if !writer.is_inside("pre") {
 94                    writer.push_str("`");
 95                }
 96            }
 97            "div" => {
 98                let classes = tag.classes();
 99                self.language = classes.iter().find_map(|class| {
100                    if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
101                        Some(language.trim().to_owned())
102                    } else {
103                        None
104                    }
105                });
106            }
107            "pre" => {
108                writer.push_blank_line();
109                writer.push_str("```");
110                if let Some(language) = self.language.take() {
111                    writer.push_str(&language);
112                }
113                writer.push_newline();
114            }
115            _ => {}
116        }
117
118        StartTagOutcome::Continue
119    }
120
121    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
122        match tag.tag() {
123            "code" => {
124                if !writer.is_inside("pre") {
125                    writer.push_str("`");
126                }
127            }
128            "pre" => writer.push_str("\n```\n"),
129            _ => {}
130        }
131    }
132
133    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
134        if writer.is_inside("pre") {
135            writer.push_str(text);
136            return HandlerOutcome::Handled;
137        }
138
139        HandlerOutcome::NoOp
140    }
141}
142
#[cfg(test)]
mod tests {
    use std::cell::RefCell;
    use std::rc::Rc;

    use indoc::indoc;
    use pretty_assertions::assert_eq;

    use crate::{TagHandler, convert_html_to_markdown, markdown};

    use super::*;

    /// Handler stack used by the tests: the generic Markdown handlers for
    /// paragraphs, headings, lists, and styled text, followed by the
    /// Wikipedia chrome remover.
    fn wikipedia_handlers() -> Vec<TagHandler> {
        vec![
            Rc::new(RefCell::new(markdown::ParagraphHandler)),
            Rc::new(RefCell::new(markdown::HeadingHandler)),
            Rc::new(RefCell::new(markdown::ListHandler)),
            Rc::new(RefCell::new(markdown::StyledTextHandler)),
            Rc::new(RefCell::new(WikipediaChromeRemover)),
        ]
    }

    /// `<sup class="reference">` citation markers must not appear in the output.
    #[test]
    fn test_citation_references_get_removed() {
        let source_html = indoc! {r##"
            <p>Rust began as a personal project in 2006 by <a href="/wiki/Mozilla" title="Mozilla">Mozilla</a> Research employee Graydon Hoare.<sup id="cite_ref-MITTechReview_23-0" class="reference"><a href="#cite_note-MITTechReview-23">[20]</a></sup> Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental <a href="/wiki/Browser_engine" title="Browser engine">browser engine</a> called <a href="/wiki/Servo_(software)" title="Servo (software)">Servo</a>,<sup id="cite_ref-infoq2012_24-0" class="reference"><a href="#cite_note-infoq2012-24">[21]</a></sup> which was officially announced by Mozilla in 2010.<sup id="cite_ref-MattAsay_25-0" class="reference"><a href="#cite_note-MattAsay-25">[22]</a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[23]</a></sup> Rust's memory and ownership system was influenced by <a href="/wiki/Region-based_memory_management" title="Region-based memory management">region-based memory management</a> in languages such as <a href="/wiki/Cyclone_(programming_language)" title="Cyclone (programming language)">Cyclone</a> and ML Kit.<sup id="cite_ref-influences_8-13" class="reference"><a href="#cite_note-influences-8">[5]</a></sup>
            </p>
        "##};

        let mut handlers = wikipedia_handlers();
        let rendered = convert_html_to_markdown(source_html.as_bytes(), &mut handlers).unwrap();

        let expected_markdown = indoc! {"
            Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare.  Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo,  which was officially announced by Mozilla in 2010.  Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.
        "}
        .trim();

        assert_eq!(rendered, expected_markdown);
    }
}