Add basic Wikipedia support to `/fetch` (#12777)

Marshall Bowers created

This PR extends the `/fetch` slash command with initial support for
Wikipedia's HTML structure.

Release Notes:

- N/A

Change summary

crates/assistant/src/slash_command/fetch_command.rs | 18 +++
crates/html_to_markdown/src/html_to_markdown.rs     | 61 +++++--------
crates/html_to_markdown/src/markdown.rs             | 52 +++++++++++
crates/html_to_markdown/src/structure.rs            |  1 
crates/html_to_markdown/src/structure/wikipedia.rs  | 66 ++++++++++++++
5 files changed, 157 insertions(+), 41 deletions(-)

Detailed changes

crates/assistant/src/slash_command/fetch_command.rs 🔗

@@ -5,7 +5,7 @@ use anyhow::{anyhow, bail, Context, Result};
 use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
 use futures::AsyncReadExt;
 use gpui::{AppContext, Task, WeakView};
-use html_to_markdown::convert_html_to_markdown;
+use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag};
 use http::{AsyncBody, HttpClient, HttpClientWithUrl};
 use language::LspAdapterDelegate;
 use ui::{prelude::*, ButtonLike, ElevationIndex};
@@ -37,7 +37,21 @@ impl FetchSlashCommand {
             );
         }
 
-        convert_html_to_markdown(&body[..])
+        let mut handlers: Vec<Box<dyn HandleTag>> = vec![
+            Box::new(markdown::ParagraphHandler),
+            Box::new(markdown::HeadingHandler),
+            Box::new(markdown::ListHandler),
+            Box::new(markdown::TableHandler::new()),
+            Box::new(markdown::StyledTextHandler),
+            Box::new(markdown::CodeHandler),
+        ];
+        if url.contains("wikipedia.org") {
+            use html_to_markdown::structure::wikipedia;
+
+            handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
+        }
+
+        convert_html_to_markdown(&body[..], handlers)
     }
 }
 

crates/html_to_markdown/src/html_to_markdown.rs 🔗

@@ -1,11 +1,9 @@
 //! Provides conversion from rustdoc's HTML output to Markdown.
 
-#![deny(missing_docs)]
-
 mod html_element;
-mod markdown;
+pub mod markdown;
 mod markdown_writer;
-mod structure;
+pub mod structure;
 
 use std::io::Read;
 
@@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom;
 use crate::markdown::{
     HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
 };
-use crate::markdown_writer::{HandleTag, MarkdownWriter};
+use crate::markdown_writer::MarkdownWriter;
+
+pub use crate::markdown_writer::HandleTag;
 
 /// Converts the provided HTML to Markdown.
-pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
+pub fn convert_html_to_markdown(
+    html: impl Read,
+    handlers: Vec<Box<dyn HandleTag>>,
+) -> Result<String> {
     let dom = parse_html(html).context("failed to parse HTML")?;
 
-    let handlers: Vec<Box<dyn HandleTag>> = vec![
-        Box::new(ParagraphHandler),
-        Box::new(HeadingHandler),
-        Box::new(ListHandler),
-        Box::new(TableHandler::new()),
-        Box::new(StyledTextHandler),
-        Box::new(structure::rustdoc::RustdocChromeRemover),
-        Box::new(structure::rustdoc::RustdocHeadingHandler),
-        Box::new(structure::rustdoc::RustdocCodeHandler),
-        Box::new(structure::rustdoc::RustdocItemHandler),
-    ];
-
     let markdown_writer = MarkdownWriter::new();
     let markdown = markdown_writer
         .run(&dom.document, handlers)
@@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
 
 /// Converts the provided rustdoc HTML to Markdown.
 pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
-    let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
-
-    let handlers: Vec<Box<dyn HandleTag>> = vec![
-        Box::new(ParagraphHandler),
-        Box::new(HeadingHandler),
-        Box::new(ListHandler),
-        Box::new(TableHandler::new()),
-        Box::new(StyledTextHandler),
-        Box::new(structure::rustdoc::RustdocChromeRemover),
-        Box::new(structure::rustdoc::RustdocHeadingHandler),
-        Box::new(structure::rustdoc::RustdocCodeHandler),
-        Box::new(structure::rustdoc::RustdocItemHandler),
-    ];
-
-    let markdown_writer = MarkdownWriter::new();
-    let markdown = markdown_writer
-        .run(&dom.document, handlers)
-        .context("failed to convert rustdoc HTML to Markdown")?;
-
-    Ok(markdown)
+    convert_html_to_markdown(
+        html,
+        vec![
+            Box::new(ParagraphHandler),
+            Box::new(HeadingHandler),
+            Box::new(ListHandler),
+            Box::new(TableHandler::new()),
+            Box::new(StyledTextHandler),
+            Box::new(structure::rustdoc::RustdocChromeRemover),
+            Box::new(structure::rustdoc::RustdocHeadingHandler),
+            Box::new(structure::rustdoc::RustdocCodeHandler),
+            Box::new(structure::rustdoc::RustdocItemHandler),
+        ],
+    )
 }
 
 fn parse_html(mut html: impl Read) -> Result<RcDom> {

crates/html_to_markdown/src/markdown.rs 🔗

@@ -1,5 +1,5 @@
 use crate::html_element::HtmlElement;
-use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
+use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
 
 pub struct ParagraphHandler;
 
@@ -214,3 +214,53 @@ impl HandleTag for StyledTextHandler {
         }
     }
 }
+
+pub struct CodeHandler;
+
+impl HandleTag for CodeHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "pre" | "code" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "code" => {
+                if !writer.is_inside("pre") {
+                    writer.push_str("`");
+                }
+            }
+            "pre" => writer.push_str("\n\n```\n"),
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        match tag.tag.as_str() {
+            "code" => {
+                if !writer.is_inside("pre") {
+                    writer.push_str("`");
+                }
+            }
+            "pre" => writer.push_str("\n```\n"),
+            _ => {}
+        }
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("pre") {
+            writer.push_str(&text);
+            return HandlerOutcome::Handled;
+        }
+
+        HandlerOutcome::NoOp
+    }
+}

crates/html_to_markdown/src/structure/wikipedia.rs 🔗

@@ -0,0 +1,80 @@
+use crate::html_element::HtmlElement;
+use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
+use crate::HandleTag;
+
+pub struct WikipediaChromeRemover;
+
+impl HandleTag for WikipediaChromeRemover {
+    fn should_handle(&self, _tag: &str) -> bool {
+        true
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
+            "sup" => {
+                if tag.has_class("reference") {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            "div" | "span" | "a" => {
+                if tag.attr("id").as_deref() == Some("p-lang-btn") {
+                    return StartTagOutcome::Skip;
+                }
+
+                if tag.attr("id").as_deref() == Some("p-search") {
+                    return StartTagOutcome::Skip;
+                }
+
+                let classes_to_skip = ["mw-editsection", "mw-jump-link"];
+                if tag.has_any_classes(&classes_to_skip) {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use indoc::indoc;
+    use pretty_assertions::assert_eq;
+
+    use crate::{convert_html_to_markdown, markdown};
+
+    use super::*;
+
+    fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
+        vec![
+            Box::new(markdown::ParagraphHandler),
+            Box::new(markdown::HeadingHandler),
+            Box::new(markdown::ListHandler),
+            Box::new(markdown::StyledTextHandler),
+            Box::new(WikipediaChromeRemover),
+        ]
+    }
+
+    #[test]
+    fn test_citation_references_get_removed() {
+        let html = indoc! {r##"