Add tag handler for collecting crate items from rustdoc output (#12903)

Marshall Bowers created

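This PR adds a tag handler, `RustdocItemCollector`, for collecting crate items
from rustdoc's HTML output.

So that a caller can keep a handle to the collector and read the collected
items back after conversion, tag handlers are now passed as shared
`Rc<RefCell<dyn HandleTag>>` values (the new `TagHandler` alias) instead of
`Box<dyn HandleTag>`, and `convert_rustdoc_to_markdown` now returns the
collected items alongside the Markdown.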

This will serve as the foundation for getting more insight into a
crate's contents.

Release Notes:

- N/A

Change summary

Cargo.lock                                            |   2 
crates/assistant/src/slash_command/fetch_command.rs   |  28 +-
crates/assistant/src/slash_command/rustdoc_command.rs |  14 
crates/html_to_markdown/Cargo.toml                    |   2 
crates/html_to_markdown/src/html_to_markdown.rs       |  81 +++++--
crates/html_to_markdown/src/markdown_writer.rs        |  34 +-
crates/html_to_markdown/src/structure/rustdoc.rs      | 132 +++++++++++++
crates/html_to_markdown/src/structure/wikipedia.rs    |  19 +
8 files changed, 237 insertions(+), 75 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -5072,10 +5072,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "html5ever",
+ "indexmap 1.9.3",
  "indoc",
  "markup5ever_rcdom",
  "pretty_assertions",
  "regex",
+ "strum",
 ]
 
 [[package]]

crates/assistant/src/slash_command/fetch_command.rs 🔗

@@ -1,3 +1,5 @@
+use std::cell::RefCell;
+use std::rc::Rc;
 use std::sync::atomic::AtomicBool;
 use std::sync::Arc;
 
@@ -5,7 +7,7 @@ use anyhow::{anyhow, bail, Context, Result};
 use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
 use futures::AsyncReadExt;
 use gpui::{AppContext, Task, WeakView};
-use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag};
+use html_to_markdown::{convert_html_to_markdown, markdown, TagHandler};
 use http::{AsyncBody, HttpClient, HttpClientWithUrl};
 use language::LspAdapterDelegate;
 use ui::{prelude::*, ButtonLike, ElevationIndex};
@@ -59,24 +61,26 @@ impl FetchSlashCommand {
 
         match content_type {
             ContentType::Html => {
-                let mut handlers: Vec<Box<dyn HandleTag>> = vec![
-                    Box::new(markdown::ParagraphHandler),
-                    Box::new(markdown::HeadingHandler),
-                    Box::new(markdown::ListHandler),
-                    Box::new(markdown::TableHandler::new()),
-                    Box::new(markdown::StyledTextHandler),
+                let mut handlers: Vec<TagHandler> = vec![
+                    Rc::new(RefCell::new(markdown::ParagraphHandler)),
+                    Rc::new(RefCell::new(markdown::HeadingHandler)),
+                    Rc::new(RefCell::new(markdown::ListHandler)),
+                    Rc::new(RefCell::new(markdown::TableHandler::new())),
+                    Rc::new(RefCell::new(markdown::StyledTextHandler)),
                 ];
                 if url.contains("wikipedia.org") {
                     use html_to_markdown::structure::wikipedia;
 
-                    handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
-                    handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler));
-                    handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new()));
+                    handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaChromeRemover)));
+                    handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaInfoboxHandler)));
+                    handlers.push(Rc::new(
+                        RefCell::new(wikipedia::WikipediaCodeHandler::new()),
+                    ));
                 } else {
-                    handlers.push(Box::new(markdown::CodeHandler));
+                    handlers.push(Rc::new(RefCell::new(markdown::CodeHandler)));
                 }
 
-                convert_html_to_markdown(&body[..], handlers)
+                convert_html_to_markdown(&body[..], &mut handlers)
             }
             ContentType::Plaintext => Ok(std::str::from_utf8(&body)?.to_owned()),
             ContentType::Json => {

crates/assistant/src/slash_command/rustdoc_command.rs 🔗

@@ -42,10 +42,9 @@ impl RustdocSlashCommand {
             local_cargo_doc_path.push("index.html");
 
             if let Ok(contents) = fs.load(&local_cargo_doc_path).await {
-                return Ok((
-                    RustdocSource::Local,
-                    convert_rustdoc_to_markdown(contents.as_bytes())?,
-                ));
+                let (markdown, _items) = convert_rustdoc_to_markdown(contents.as_bytes())?;
+
+                return Ok((RustdocSource::Local, markdown));
             }
         }
 
@@ -78,10 +77,9 @@ impl RustdocSlashCommand {
             );
         }
 
-        Ok((
-            RustdocSource::DocsDotRs,
-            convert_rustdoc_to_markdown(&body[..])?,
-        ))
+        let (markdown, _items) = convert_rustdoc_to_markdown(&body[..])?;
+
+        Ok((RustdocSource::DocsDotRs, markdown))
     }
 
     fn path_to_cargo_toml(project: Model<Project>, cx: &mut AppContext) -> Option<Arc<Path>> {

crates/html_to_markdown/Cargo.toml 🔗

@@ -14,8 +14,10 @@ path = "src/html_to_markdown.rs"
 [dependencies]
 anyhow.workspace = true
 html5ever.workspace = true
+indexmap.workspace = true
 markup5ever_rcdom.workspace = true
 regex.workspace = true
+strum.workspace = true
 
 [dev-dependencies]
 indoc.workspace = true

crates/html_to_markdown/src/html_to_markdown.rs 🔗

@@ -5,7 +5,9 @@ pub mod markdown;
 mod markdown_writer;
 pub mod structure;
 
+use std::cell::RefCell;
 use std::io::Read;
+use std::rc::Rc;
 
 use anyhow::{Context, Result};
 use html5ever::driver::ParseOpts;
@@ -19,13 +21,11 @@ use crate::markdown::{
 };
 use crate::markdown_writer::MarkdownWriter;
 
-pub use crate::markdown_writer::HandleTag;
+pub use crate::markdown_writer::{HandleTag, TagHandler};
+use crate::structure::rustdoc::RustdocItem;
 
 /// Converts the provided HTML to Markdown.
-pub fn convert_html_to_markdown(
-    html: impl Read,
-    handlers: Vec<Box<dyn HandleTag>>,
-) -> Result<String> {
+pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec<TagHandler>) -> Result<String> {
     let dom = parse_html(html).context("failed to parse HTML")?;
 
     let markdown_writer = MarkdownWriter::new();
@@ -37,21 +37,32 @@ pub fn convert_html_to_markdown(
 }
 
 /// Converts the provided rustdoc HTML to Markdown.
-pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
-    convert_html_to_markdown(
-        html,
-        vec![
-            Box::new(ParagraphHandler),
-            Box::new(HeadingHandler),
-            Box::new(ListHandler),
-            Box::new(TableHandler::new()),
-            Box::new(StyledTextHandler),
-            Box::new(structure::rustdoc::RustdocChromeRemover),
-            Box::new(structure::rustdoc::RustdocHeadingHandler),
-            Box::new(structure::rustdoc::RustdocCodeHandler),
-            Box::new(structure::rustdoc::RustdocItemHandler),
-        ],
-    )
+pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<RustdocItem>)> {
+    let item_collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new()));
+
+    let mut handlers: Vec<TagHandler> = vec![
+        Rc::new(RefCell::new(ParagraphHandler)),
+        Rc::new(RefCell::new(HeadingHandler)),
+        Rc::new(RefCell::new(ListHandler)),
+        Rc::new(RefCell::new(TableHandler::new())),
+        Rc::new(RefCell::new(StyledTextHandler)),
+        Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
+        Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
+        Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
+        Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
+        item_collector.clone(),
+    ];
+
+    let markdown = convert_html_to_markdown(html, &mut handlers)?;
+
+    let items = item_collector
+        .borrow()
+        .items
+        .values()
+        .cloned()
+        .collect::<Vec<_>>();
+
+    Ok((markdown, items))
 }
 
 fn parse_html(mut html: impl Read) -> Result<RcDom> {
@@ -77,6 +88,20 @@ mod tests {
 
     use super::*;
 
+    fn rustdoc_handlers() -> Vec<TagHandler> {
+        vec![
+            Rc::new(RefCell::new(ParagraphHandler)),
+            Rc::new(RefCell::new(HeadingHandler)),
+            Rc::new(RefCell::new(ListHandler)),
+            Rc::new(RefCell::new(TableHandler::new())),
+            Rc::new(RefCell::new(StyledTextHandler)),
+            Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
+            Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
+            Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
+            Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
+        ]
+    }
+
     #[test]
     fn test_main_heading_buttons_get_removed() {
         let html = indoc! {r##"
@@ -93,7 +118,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -113,7 +138,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -159,7 +184,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -178,7 +203,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -220,7 +245,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -252,7 +277,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -288,7 +313,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }
@@ -342,7 +367,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
             expected
         )
     }

crates/html_to_markdown/src/markdown_writer.rs 🔗

@@ -1,4 +1,6 @@
+use std::cell::RefCell;
 use std::collections::VecDeque;
+use std::rc::Rc;
 use std::sync::OnceLock;
 
 use anyhow::Result;
@@ -22,6 +24,8 @@ pub enum StartTagOutcome {
     Skip,
 }
 
+pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
+
 pub struct MarkdownWriter {
     current_element_stack: VecDeque<HtmlElement>,
     pub(crate) markdown: String,
@@ -60,12 +64,8 @@ impl MarkdownWriter {
         self.push_str("\n\n");
     }
 
-    pub fn run(
-        mut self,
-        root_node: &Handle,
-        mut handlers: Vec<Box<dyn HandleTag>>,
-    ) -> Result<String> {
-        self.visit_node(&root_node, &mut handlers)?;
+    pub fn run(mut self, root_node: &Handle, handlers: &mut Vec<TagHandler>) -> Result<String> {
+        self.visit_node(&root_node, handlers)?;
         Ok(Self::prettify_markdown(self.markdown))
     }
 
@@ -76,7 +76,7 @@ impl MarkdownWriter {
         markdown.trim().to_string()
     }
 
-    fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+    fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
         let mut current_element = None;
 
         match node.data {
@@ -128,14 +128,10 @@ impl MarkdownWriter {
         Ok(())
     }
 
-    fn start_tag(
-        &mut self,
-        tag: &HtmlElement,
-        handlers: &mut [Box<dyn HandleTag>],
-    ) -> StartTagOutcome {
+    fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
         for handler in handlers {
-            if handler.should_handle(tag.tag.as_str()) {
-                match handler.handle_tag_start(tag, self) {
+            if handler.borrow().should_handle(tag.tag.as_str()) {
+                match handler.borrow_mut().handle_tag_start(tag, self) {
                     StartTagOutcome::Continue => {}
                     StartTagOutcome::Skip => return StartTagOutcome::Skip,
                 }
@@ -145,17 +141,17 @@ impl MarkdownWriter {
         StartTagOutcome::Continue
     }
 
-    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
+    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
         for handler in handlers {
-            if handler.should_handle(tag.tag.as_str()) {
-                handler.handle_tag_end(tag, self);
+            if handler.borrow().should_handle(tag.tag.as_str()) {
+                handler.borrow_mut().handle_tag_end(tag, self);
             }
         }
     }
 
-    fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+    fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
         for handler in handlers {
-            match handler.handle_text(&text, self) {
+            match handler.borrow_mut().handle_text(&text, self) {
                 HandlerOutcome::Handled => return Ok(()),
                 HandlerOutcome::NoOp => {}
             }

crates/html_to_markdown/src/structure/rustdoc.rs 🔗

@@ -1,3 +1,6 @@
+use indexmap::IndexMap;
+use strum::{EnumIter, IntoEnumIterator};
+
 use crate::html_element::HtmlElement;
 use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
 
@@ -203,3 +206,132 @@ impl HandleTag for RustdocChromeRemover {
         StartTagOutcome::Continue
     }
 }
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy, EnumIter)]
+pub enum RustdocItemKind {
+    Mod,
+    Macro,
+    Struct,
+    Enum,
+    Constant,
+    Trait,
+    Function,
+    TypeAlias,
+    AttributeMacro,
+    DeriveMacro,
+}
+
+impl RustdocItemKind {
+    const fn class(&self) -> &'static str {
+        match self {
+            Self::Mod => "mod",
+            Self::Macro => "macro",
+            Self::Struct => "struct",
+            Self::Enum => "enum",
+            Self::Constant => "constant",
+            Self::Trait => "trait",
+            Self::Function => "fn",
+            Self::TypeAlias => "type",
+            Self::AttributeMacro => "attr",
+            Self::DeriveMacro => "derive",
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct RustdocItem {
+    pub kind: RustdocItemKind,
+    pub name: String,
+}
+
+impl RustdocItem {
+    pub fn url_path(&self) -> String {
+        let name = &self.name;
+        match self.kind {
+            RustdocItemKind::Mod => format!("{name}/index.html"),
+            RustdocItemKind::Macro
+            | RustdocItemKind::Struct
+            | RustdocItemKind::Enum
+            | RustdocItemKind::Constant
+            | RustdocItemKind::Trait
+            | RustdocItemKind::Function
+            | RustdocItemKind::TypeAlias
+            | RustdocItemKind::AttributeMacro
+            | RustdocItemKind::DeriveMacro => {
+                format!("{kind}.{name}.html", kind = self.kind.class())
+            }
+        }
+    }
+}
+
+pub struct RustdocItemCollector {
+    pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+}
+
+impl RustdocItemCollector {
+    pub fn new() -> Self {
+        Self {
+            items: IndexMap::new(),
+        }
+    }
+
+    fn parse_item(tag: &HtmlElement) -> Option<RustdocItem> {
+        if tag.tag.as_str() != "a" {
+            return None;
+        }
+
+        let href = tag.attr("href")?;
+        if href == "#" {
+            return None;
+        }
+
+        for kind in RustdocItemKind::iter() {
+            if tag.has_class(kind.class()) {
+                let name = href
+                    .trim_start_matches(&format!("{}.", kind.class()))
+                    .trim_end_matches("/index.html")
+                    .trim_end_matches(".html");
+
+                return Some(RustdocItem {
+                    kind,
+                    name: name.to_owned(),
+                });
+            }
+        }
+
+        None
+    }
+}
+
+impl HandleTag for RustdocItemCollector {
+    fn should_handle(&self, tag: &str) -> bool {
+        tag == "a"
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "a" => {
+                let is_reexport = writer.current_element_stack().iter().any(|element| {
+                    if let Some(id) = element.attr("id") {
+                        id.starts_with("reexport.")
+                    } else {
+                        false
+                    }
+                });
+
+                if !is_reexport {
+                    if let Some(item) = Self::parse_item(tag) {
+                        self.items.insert((item.kind, item.name.clone()), item);
+                    }
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}

crates/html_to_markdown/src/structure/wikipedia.rs 🔗

@@ -144,20 +144,23 @@ impl HandleTag for WikipediaCodeHandler {
 
 #[cfg(test)]
 mod tests {
+    use std::cell::RefCell;
+    use std::rc::Rc;
+
     use indoc::indoc;
     use pretty_assertions::assert_eq;
 
-    use crate::{convert_html_to_markdown, markdown};
+    use crate::{convert_html_to_markdown, markdown, TagHandler};
 
     use super::*;
 
-    fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
+    fn wikipedia_handlers() -> Vec<TagHandler> {
         vec![
-            Box::new(markdown::ParagraphHandler),
-            Box::new(markdown::HeadingHandler),
-            Box::new(markdown::ListHandler),
-            Box::new(markdown::StyledTextHandler),
-            Box::new(WikipediaChromeRemover),
+            Rc::new(RefCell::new(markdown::ParagraphHandler)),
+            Rc::new(RefCell::new(markdown::HeadingHandler)),
+            Rc::new(RefCell::new(markdown::ListHandler)),
+            Rc::new(RefCell::new(markdown::StyledTextHandler)),
+            Rc::new(RefCell::new(WikipediaChromeRemover)),
         ]
     }
 
@@ -173,7 +176,7 @@ mod tests {
         .trim();
 
         assert_eq!(
-            convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(),
+            convert_html_to_markdown(html.as_bytes(), &mut wikipedia_handlers()).unwrap(),
             expected
         )
     }