assistant: Strip out general website chrome in `/fetch` command (#13264)

Marshall Bowers created

This PR updates the `/fetch` command to strip out general website chrome
(the `<head>`, `<script>`, `<style>`, and `<nav>` elements) that is unlikely
to contain meaningful content on any website.

Release Notes:

- N/A

Change summary

crates/assistant/src/slash_command/fetch_command.rs |  1 +
crates/html_to_markdown/src/markdown.rs             | 24 +++++++++++++++
2 files changed, 25 insertions(+)

Detailed changes

crates/assistant/src/slash_command/fetch_command.rs 🔗

@@ -62,6 +62,7 @@ impl FetchSlashCommand {
         match content_type {
             ContentType::Html => {
                 let mut handlers: Vec<TagHandler> = vec![
+                    Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
                     Rc::new(RefCell::new(markdown::ParagraphHandler)),
                     Rc::new(RefCell::new(markdown::HeadingHandler)),
                     Rc::new(RefCell::new(markdown::ListHandler)),

crates/html_to_markdown/src/markdown.rs 🔗

@@ -1,6 +1,30 @@
 use crate::html_element::HtmlElement;
 use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
 
+pub struct WebpageChromeRemover;
+
+impl HandleTag for WebpageChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "head" | "script" | "style" | "nav" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag() {
+            "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
 pub struct ParagraphHandler;
 
 impl HandleTag for ParagraphHandler {