Handle Wikipedia code blocks in `/fetch` command (#12780)

Marshall Bowers created

This PR extends the `/fetch` command with support for Wikipedia code
blocks.

Release Notes:

- N/A

Change summary

crates/assistant/src/slash_command/fetch_command.rs |   5 
crates/html_to_markdown/src/markdown_writer.rs      |   2 
crates/html_to_markdown/src/structure/wikipedia.rs  | 104 ++++++++++++++
3 files changed, 107 insertions(+), 4 deletions(-)

Detailed changes

crates/assistant/src/slash_command/fetch_command.rs 🔗

@@ -43,12 +43,15 @@ impl FetchSlashCommand {
             Box::new(markdown::ListHandler),
             Box::new(markdown::TableHandler::new()),
             Box::new(markdown::StyledTextHandler),
-            Box::new(markdown::CodeHandler),
         ];
         if url.contains("wikipedia.org") {
             use html_to_markdown::structure::wikipedia;
 
             handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
+            handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler));
+            handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new()));
+        } else {
+            handlers.push(Box::new(markdown::CodeHandler));
         }
 
         convert_html_to_markdown(&body[..], handlers)

crates/html_to_markdown/src/markdown_writer.rs 🔗

@@ -162,7 +162,7 @@ impl MarkdownWriter {
         }
 
         let text = text
-            .trim_matches(|char| char == '\n' || char == '\r')
+            .trim_matches(|char| char == '\n' || char == '\r' || char == '\t')
             .replace('\n', " ");
 
         self.push_str(&text);

crates/html_to_markdown/src/structure/wikipedia.rs 🔗

@@ -1,5 +1,5 @@
 use crate::html_element::HtmlElement;
-use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
+use crate::markdown_writer::{HandlerOutcome, MarkdownWriter, StartTagOutcome};
 use crate::HandleTag;
 
 pub struct WikipediaChromeRemover;
@@ -30,7 +30,7 @@ impl HandleTag for WikipediaChromeRemover {
                     return StartTagOutcome::Skip;
                 }
 
-                let classes_to_skip = ["mw-editsection", "mw-jump-link"];
+                let classes_to_skip = ["noprint", "mw-editsection", "mw-jump-link"];
                 if tag.has_any_classes(&classes_to_skip) {
                     return StartTagOutcome::Skip;
                 }
@@ -42,6 +42,106 @@ impl HandleTag for WikipediaChromeRemover {
     }
 }
 
+pub struct WikipediaInfoboxHandler;
+
+impl HandleTag for WikipediaInfoboxHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        tag == "table"
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "table" => {
+                if tag.has_class("infobox") {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
+pub struct WikipediaCodeHandler {
+    language: Option<String>,
+}
+
+impl WikipediaCodeHandler {
+    pub fn new() -> Self {
+        Self { language: None }
+    }
+}
+
+impl HandleTag for WikipediaCodeHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "div" | "pre" | "code" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "code" => {
+                if !writer.is_inside("pre") {
+                    writer.push_str("`");
+                }
+            }
+            "div" => {
+                let classes = tag.classes();
+                self.language = classes.iter().find_map(|class| {
+                    if let Some((_, language)) = class.split_once("mw-highlight-lang-") {
+                        Some(language.trim().to_owned())
+                    } else {
+                        None
+                    }
+                });
+            }
+            "pre" => {
+                writer.push_blank_line();
+                writer.push_str("```");
+                if let Some(language) = self.language.take() {
+                    writer.push_str(&language);
+                }
+                writer.push_newline();
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        match tag.tag.as_str() {
+            "code" => {
+                if !writer.is_inside("pre") {
+                    writer.push_str("`");
+                }
+            }
+            "pre" => writer.push_str("\n```\n"),
+            _ => {}
+        }
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("pre") {
+            writer.push_str(&text);
+            return HandlerOutcome::Handled;
+        }
+
+        HandlerOutcome::NoOp
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use indoc::indoc;