1use crate::HandleTag;
2use crate::html_element::HtmlElement;
3use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
4
/// Tag handler that strips Wikipedia page chrome from the converted output.
///
/// The handler carries no state; see its `HandleTag` implementation for the
/// exact set of elements that are skipped.
#[derive(Debug, Clone, Copy, Default)]
pub struct WikipediaChromeRemover;
6
7impl HandleTag for WikipediaChromeRemover {
8 fn should_handle(&self, _tag: &str) -> bool {
9 true
10 }
11
12 fn handle_tag_start(
13 &mut self,
14 tag: &HtmlElement,
15 _writer: &mut MarkdownWriter,
16 ) -> StartTagOutcome {
17 match tag.tag() {
18 "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
19 "sup" => {
20 if tag.has_class("reference") {
21 return StartTagOutcome::Skip;
22 }
23 }
24 "div" | "span" | "a" => {
25 if tag.attr("id").as_deref() == Some("p-lang-btn") {
26 return StartTagOutcome::Skip;
27 }
28
29 if tag.attr("id").as_deref() == Some("p-search") {
30 return StartTagOutcome::Skip;
31 }
32
33 let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
34 if tag.has_any_classes(&classes_to_skip) {
35 return StartTagOutcome::Skip;
36 }
37 }
38 _ => {}
39 }
40
41 StartTagOutcome::Continue
42 }
43}
44
/// Tag handler that drops Wikipedia infobox tables from the converted output.
///
/// The handler carries no state; its `HandleTag` implementation skips every
/// `table` element that has the `infobox` class.
#[derive(Debug, Clone, Copy, Default)]
pub struct WikipediaInfoboxHandler;
46
47impl HandleTag for WikipediaInfoboxHandler {
48 fn should_handle(&self, tag: &str) -> bool {
49 tag == "table"
50 }
51
52 fn handle_tag_start(
53 &mut self,
54 tag: &HtmlElement,
55 _writer: &mut MarkdownWriter,
56 ) -> StartTagOutcome {
57 if tag.tag() == "table" && tag.has_class("infobox") {
58 return StartTagOutcome::Skip;
59 }
60
61 StartTagOutcome::Continue
62 }
63}
64
/// Tag handler that renders Wikipedia code samples as Markdown code.
///
/// `code` elements outside a `pre` become inline code spans, and `pre`
/// elements become fenced code blocks.
#[derive(Debug)]
pub struct WikipediaCodeHandler {
    /// Language taken from a `mw-highlight-lang-*` class on the most recently
    /// opened `div`, if any. It is consumed (reset to `None`) when the next
    /// `pre` element opens its fence.
    language: Option<String>,
}
68
69impl WikipediaCodeHandler {
70 pub fn new() -> Self {
71 Self { language: None }
72 }
73}
74
75impl Default for WikipediaCodeHandler {
76 fn default() -> Self {
77 Self::new()
78 }
79}
80
81impl HandleTag for WikipediaCodeHandler {
82 fn should_handle(&self, tag: &str) -> bool {
83 matches!(tag, "div" | "pre" | "code")
84 }
85
86 fn handle_tag_start(
87 &mut self,
88 tag: &HtmlElement,
89 writer: &mut MarkdownWriter,
90 ) -> StartTagOutcome {
91 match tag.tag() {
92 "code" => {
93 if !writer.is_inside("pre") {
94 writer.push_str("`");
95 }
96 }
97 "div" => {
98 let classes = tag.classes();
99 self.language = classes.iter().find_map(|class| {
100 if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
101 Some(language.trim().to_owned())
102 } else {
103 None
104 }
105 });
106 }
107 "pre" => {
108 writer.push_blank_line();
109 writer.push_str("```");
110 if let Some(language) = self.language.take() {
111 writer.push_str(&language);
112 }
113 writer.push_newline();
114 }
115 _ => {}
116 }
117
118 StartTagOutcome::Continue
119 }
120
121 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
122 match tag.tag() {
123 "code" => {
124 if !writer.is_inside("pre") {
125 writer.push_str("`");
126 }
127 }
128 "pre" => writer.push_str("\n```\n"),
129 _ => {}
130 }
131 }
132
133 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
134 if writer.is_inside("pre") {
135 writer.push_str(text);
136 return HandlerOutcome::Handled;
137 }
138
139 HandlerOutcome::NoOp
140 }
141}
142
#[cfg(test)]
mod tests {
    use std::cell::RefCell;
    use std::rc::Rc;

    use indoc::indoc;
    use pretty_assertions::assert_eq;

    use crate::{TagHandler, convert_html_to_markdown, markdown};

    use super::*;

    /// Handler chain used by the tests: the generic Markdown handlers for
    /// paragraphs, headings, lists and styled text, followed by the
    /// Wikipedia chrome remover.
    fn wikipedia_handlers() -> Vec<TagHandler> {
        let paragraphs: TagHandler = Rc::new(RefCell::new(markdown::ParagraphHandler));
        let headings: TagHandler = Rc::new(RefCell::new(markdown::HeadingHandler));
        let lists: TagHandler = Rc::new(RefCell::new(markdown::ListHandler));
        let styled_text: TagHandler = Rc::new(RefCell::new(markdown::StyledTextHandler));
        let chrome_remover: TagHandler = Rc::new(RefCell::new(WikipediaChromeRemover));

        vec![paragraphs, headings, lists, styled_text, chrome_remover]
    }

    /// Citation markers (`<sup class="reference">…</sup>`) must disappear
    /// from the converted paragraph, including their bracketed link text.
    #[test]
    fn test_citation_references_get_removed() {
        let page_fragment = indoc! {r##"
            <p>Rust began as a personal project in 2006 by <a href="/wiki/Mozilla" title="Mozilla">Mozilla</a> Research employee Graydon Hoare.<sup id="cite_ref-MITTechReview_23-0" class="reference"><a href="#cite_note-MITTechReview-23">[20]</a></sup> Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental <a href="/wiki/Browser_engine" title="Browser engine">browser engine</a> called <a href="/wiki/Servo_(software)" title="Servo (software)">Servo</a>,<sup id="cite_ref-infoq2012_24-0" class="reference"><a href="#cite_note-infoq2012-24">[21]</a></sup> which was officially announced by Mozilla in 2010.<sup id="cite_ref-MattAsay_25-0" class="reference"><a href="#cite_note-MattAsay-25">[22]</a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[23]</a></sup> Rust's memory and ownership system was influenced by <a href="/wiki/Region-based_memory_management" title="Region-based memory management">region-based memory management</a> in languages such as <a href="/wiki/Cyclone_(programming_language)" title="Cyclone (programming language)">Cyclone</a> and ML Kit.<sup id="cite_ref-influences_8-13" class="reference"><a href="#cite_note-influences-8">[5]</a></sup>
            </p>
        "##};
        let want = indoc! {"
            Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare. Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo, which was officially announced by Mozilla in 2010. Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.
        "}
        .trim();

        let mut handlers = wikipedia_handlers();
        let got = convert_html_to_markdown(page_fragment.as_bytes(), &mut handlers).unwrap();

        assert_eq!(got, want);
    }
}