1use crate::html_element::HtmlElement;
2use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
3use crate::HandleTag;
4
5pub struct WikipediaChromeRemover;
6
7impl HandleTag for WikipediaChromeRemover {
8 fn should_handle(&self, _tag: &str) -> bool {
9 true
10 }
11
12 fn handle_tag_start(
13 &mut self,
14 tag: &HtmlElement,
15 _writer: &mut MarkdownWriter,
16 ) -> StartTagOutcome {
17 match tag.tag() {
18 "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
19 "sup" => {
20 if tag.has_class("reference") {
21 return StartTagOutcome::Skip;
22 }
23 }
24 "div" | "span" | "a" => {
25 if tag.attr("id").as_deref() == Some("p-lang-btn") {
26 return StartTagOutcome::Skip;
27 }
28
29 if tag.attr("id").as_deref() == Some("p-search") {
30 return StartTagOutcome::Skip;
31 }
32
33 let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
34 if tag.has_any_classes(&classes_to_skip) {
35 return StartTagOutcome::Skip;
36 }
37 }
38 _ => {}
39 }
40
41 StartTagOutcome::Continue
42 }
43}
44
45pub struct WikipediaInfoboxHandler;
46
47impl HandleTag for WikipediaInfoboxHandler {
48 fn should_handle(&self, tag: &str) -> bool {
49 tag == "table"
50 }
51
52 fn handle_tag_start(
53 &mut self,
54 tag: &HtmlElement,
55 _writer: &mut MarkdownWriter,
56 ) -> StartTagOutcome {
57 match tag.tag() {
58 "table" => {
59 if tag.has_class("infobox") {
60 return StartTagOutcome::Skip;
61 }
62 }
63 _ => {}
64 }
65
66 StartTagOutcome::Continue
67 }
68}
69
/// Tag handler that turns Wikipedia code markup into Markdown code spans and
/// fenced code blocks.
///
/// The handler is stateful: it remembers the language announced by a
/// highlighting `<div>` so the following `<pre>` can label its fence with it.
#[derive(Debug, Default)]
pub struct WikipediaCodeHandler {
    /// Language parsed from the most recently opened `<div>`'s
    /// `mw-highlight-lang-*` class, if any. Consumed (reset to `None`) when
    /// the next `<pre>` opens.
    language: Option<String>,
}

impl WikipediaCodeHandler {
    /// Creates a handler with no pending code-block language.
    ///
    /// Equivalent to [`WikipediaCodeHandler::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
79
80impl HandleTag for WikipediaCodeHandler {
81 fn should_handle(&self, tag: &str) -> bool {
82 match tag {
83 "div" | "pre" | "code" => true,
84 _ => false,
85 }
86 }
87
88 fn handle_tag_start(
89 &mut self,
90 tag: &HtmlElement,
91 writer: &mut MarkdownWriter,
92 ) -> StartTagOutcome {
93 match tag.tag() {
94 "code" => {
95 if !writer.is_inside("pre") {
96 writer.push_str("`");
97 }
98 }
99 "div" => {
100 let classes = tag.classes();
101 self.language = classes.iter().find_map(|class| {
102 if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
103 Some(language.trim().to_owned())
104 } else {
105 None
106 }
107 });
108 }
109 "pre" => {
110 writer.push_blank_line();
111 writer.push_str("```");
112 if let Some(language) = self.language.take() {
113 writer.push_str(&language);
114 }
115 writer.push_newline();
116 }
117 _ => {}
118 }
119
120 StartTagOutcome::Continue
121 }
122
123 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
124 match tag.tag() {
125 "code" => {
126 if !writer.is_inside("pre") {
127 writer.push_str("`");
128 }
129 }
130 "pre" => writer.push_str("\n```\n"),
131 _ => {}
132 }
133 }
134
135 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
136 if writer.is_inside("pre") {
137 writer.push_str(&text);
138 return HandlerOutcome::Handled;
139 }
140
141 HandlerOutcome::NoOp
142 }
143}
144
#[cfg(test)]
mod tests {
    use std::cell::RefCell;
    use std::rc::Rc;

    use indoc::indoc;
    use pretty_assertions::assert_eq;

    use crate::{convert_html_to_markdown, markdown, TagHandler};

    use super::*;

    /// Handler chain used by the tests: the generic Markdown handlers for
    /// paragraphs, headings, lists and styled text, followed by
    /// `WikipediaChromeRemover`.
    fn wikipedia_handlers() -> Vec<TagHandler> {
        let paragraphs: TagHandler = Rc::new(RefCell::new(markdown::ParagraphHandler));
        let headings: TagHandler = Rc::new(RefCell::new(markdown::HeadingHandler));
        let lists: TagHandler = Rc::new(RefCell::new(markdown::ListHandler));
        let styled_text: TagHandler = Rc::new(RefCell::new(markdown::StyledTextHandler));
        let chrome_remover: TagHandler = Rc::new(RefCell::new(WikipediaChromeRemover));

        vec![paragraphs, headings, lists, styled_text, chrome_remover]
    }

    /// `<sup class="reference">` citation markers (e.g. `[20]`) must vanish
    /// from the output while the surrounding sentence text is kept.
    #[test]
    fn test_citation_references_get_removed() {
        let input_html = indoc! {r##"
            <p>Rust began as a personal project in 2006 by <a href="/wiki/Mozilla" title="Mozilla">Mozilla</a> Research employee Graydon Hoare.<sup id="cite_ref-MITTechReview_23-0" class="reference"><a href="#cite_note-MITTechReview-23">[20]</a></sup> Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental <a href="/wiki/Browser_engine" title="Browser engine">browser engine</a> called <a href="/wiki/Servo_(software)" title="Servo (software)">Servo</a>,<sup id="cite_ref-infoq2012_24-0" class="reference"><a href="#cite_note-infoq2012-24">[21]</a></sup> which was officially announced by Mozilla in 2010.<sup id="cite_ref-MattAsay_25-0" class="reference"><a href="#cite_note-MattAsay-25">[22]</a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[23]</a></sup> Rust's memory and ownership system was influenced by <a href="/wiki/Region-based_memory_management" title="Region-based memory management">region-based memory management</a> in languages such as <a href="/wiki/Cyclone_(programming_language)" title="Cyclone (programming language)">Cyclone</a> and ML Kit.<sup id="cite_ref-influences_8-13" class="reference"><a href="#cite_note-influences-8">[5]</a></sup>
            </p>
        "##};
        let expected_markdown = indoc! {"
            Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare. Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo, which was officially announced by Mozilla in 2010. Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.
        "}
        .trim();

        let mut handlers = wikipedia_handlers();
        let actual_markdown =
            convert_html_to_markdown(input_html.as_bytes(), &mut handlers).unwrap();

        assert_eq!(actual_markdown, expected_markdown);
    }
}