Start on rustdoc crawler (#12942)

Marshall Bowers and Richard created

This PR adds a first pass at a rustdoc crawler.

We'll be using this to get information about a crate from the rustdoc
artifacts for use in the Assistant.

Release Notes:

- N/A

---------

Co-authored-by: Richard <richard@zed.dev>

Change summary

Cargo.lock                                       |  13 +
Cargo.toml                                       |   2 
crates/html_to_markdown/src/html_to_markdown.rs  |   2 
crates/html_to_markdown/src/structure/rustdoc.rs |  60 +++-
crates/rustdoc/Cargo.toml                        |  23 +
crates/rustdoc/LICENSE-GPL                       |   1 
crates/rustdoc/src/crawler.rs                    | 211 ++++++++++++++++++
crates/rustdoc/src/rustdoc.rs                    |   1 
8 files changed, 293 insertions(+), 20 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -8664,6 +8664,19 @@ dependencies = [
  "semver",
 ]
 
+[[package]]
+name = "rustdoc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "collections",
+ "fs",
+ "futures 0.3.28",
+ "html_to_markdown",
+ "http 0.1.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.37.23"

Cargo.toml 🔗

@@ -79,6 +79,7 @@ members = [
     "crates/rich_text",
     "crates/rope",
     "crates/rpc",
+    "crates/rustdoc",
     "crates/task",
     "crates/tasks_ui",
     "crates/search",
@@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
 rich_text = { path = "crates/rich_text" }
 rope = { path = "crates/rope" }
 rpc = { path = "crates/rpc" }
+rustdoc = { path = "crates/rustdoc" }
 task = { path = "crates/task" }
 tasks_ui = { path = "crates/tasks_ui" }
 search = { path = "crates/search" }

crates/html_to_markdown/src/structure/rustdoc.rs 🔗

@@ -1,4 +1,6 @@
-use indexmap::IndexMap;
+use std::sync::Arc;
+
+use indexmap::IndexSet;
 use strum::{EnumIter, IntoEnumIterator};
 
 use crate::html_element::HtmlElement;
@@ -238,17 +240,25 @@ impl RustdocItemKind {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
 pub struct RustdocItem {
     pub kind: RustdocItemKind,
-    pub name: String,
+    /// The item path, up until the name of the item.
+    pub path: Vec<Arc<str>>,
+    /// The name of the item.
+    pub name: Arc<str>,
 }
 
 impl RustdocItem {
     pub fn url_path(&self) -> String {
         let name = &self.name;
+        let mut path_components = self.path.clone();
+
         match self.kind {
-            RustdocItemKind::Mod => format!("{name}/index.html"),
+            RustdocItemKind::Mod => {
+                path_components.push(name.clone());
+                path_components.push("index.html".into());
+            }
             RustdocItemKind::Macro
             | RustdocItemKind::Struct
             | RustdocItemKind::Enum
@@ -258,20 +268,23 @@ impl RustdocItem {
             | RustdocItemKind::TypeAlias
             | RustdocItemKind::AttributeMacro
             | RustdocItemKind::DeriveMacro => {
-                format!("{kind}.{name}.html", kind = self.kind.class())
+                path_components
+                    .push(format!("{kind}.{name}.html", kind = self.kind.class()).into());
             }
         }
+
+        path_components.join("/")
     }
 }
 
 pub struct RustdocItemCollector {
-    pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+    pub items: IndexSet<RustdocItem>,
 }
 
 impl RustdocItemCollector {
     pub fn new() -> Self {
         Self {
-            items: IndexMap::new(),
+            items: IndexSet::new(),
         }
     }
 
@@ -281,21 +294,30 @@ impl RustdocItemCollector {
         }
 
         let href = tag.attr("href")?;
-        if href == "#" {
+        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
             return None;
         }
 
         for kind in RustdocItemKind::iter() {
             if tag.has_class(kind.class()) {
-                let name = href
-                    .trim_start_matches(&format!("{}.", kind.class()))
-                    .trim_end_matches("/index.html")
-                    .trim_end_matches(".html");
-
-                return Some(RustdocItem {
-                    kind,
-                    name: name.to_owned(),
-                });
+                let mut parts = href.trim_end_matches("/index.html").split('/');
+
+                if let Some(last_component) = parts.next_back() {
+                    let last_component = match last_component.split_once('#') {
+                        Some((component, _fragment)) => component,
+                        None => last_component,
+                    };
+
+                    let name = last_component
+                        .trim_start_matches(&format!("{}.", kind.class()))
+                        .trim_end_matches(".html");
+
+                    return Some(RustdocItem {
+                        kind,
+                        name: name.into(),
+                        path: parts.map(Into::into).collect(),
+                    });
+                }
             }
         }
 
@@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector {
             "a" => {
                 let is_reexport = writer.current_element_stack().iter().any(|element| {
                     if let Some(id) = element.attr("id") {
-                        id.starts_with("reexport.")
+                        id.starts_with("reexport.") || id.starts_with("method.")
                     } else {
                         false
                     }
@@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector {
 
                 if !is_reexport {
                     if let Some(item) = Self::parse_item(tag) {
-                        self.items.insert((item.kind, item.name.clone()), item);
+                        self.items.insert(item);
                     }
                 }
             }

crates/rustdoc/Cargo.toml 🔗

@@ -0,0 +1,23 @@
+[package]
+name = "rustdoc"
+version = "0.1.0"
+edition = "2021"
+publish = false
+license = "GPL-3.0-or-later"
+
+[lints]
+workspace = true
+
+[lib]
+path = "src/rustdoc.rs"
+
+[dependencies]
+anyhow.workspace = true
+async-trait.workspace = true
+collections.workspace = true
+fs.workspace = true
+futures.workspace = true
+html_to_markdown.workspace = true
+http.workspace = true
+
+[dev-dependencies]

crates/rustdoc/src/crawler.rs 🔗

@@ -0,0 +1,211 @@
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use anyhow::{bail, Context, Result};
+use async_trait::async_trait;
+use collections::{HashSet, VecDeque};
+use fs::Fs;
+use futures::AsyncReadExt;
+use html_to_markdown::convert_rustdoc_to_markdown;
+use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
+use http::{AsyncBody, HttpClient, HttpClientWithUrl};
+
+/// Identifies where a set of rustdoc pages was obtained from.
+#[derive(Debug, Clone, Copy)]
+pub enum RustdocSource {
+    /// The docs were sourced from local `cargo doc` output.
+    Local,
+    /// The docs were sourced from `docs.rs`.
+    DocsDotRs,
+}
+
+/// A source of rustdoc HTML pages for a crate.
+#[async_trait]
+pub trait RustdocProvider {
+    /// Fetches the rustdoc page for `item` within the crate named `crate_name`.
+    ///
+    /// `item` is `None` when the crate's index page is requested.
+    ///
+    /// Returns `Ok(Some(page))` with the page contents, or `Ok(None)` when the
+    /// provider has no page to offer; the crawler skips items for which `None`
+    /// is returned.
+    async fn fetch_page(
+        &self,
+        crate_name: &str,
+        item: Option<&RustdocItem>,
+    ) -> Result<Option<String>>;
+}
+
+/// A [`RustdocProvider`] that reads pages from a Cargo workspace's
+/// `target/doc` directory.
+pub struct LocalProvider {
+    /// File system handle used to load the generated HTML files.
+    fs: Arc<dyn Fs>,
+    /// Root of the Cargo workspace whose `target/doc` output is read.
+    cargo_workspace_root: PathBuf,
+}
+
+impl LocalProvider {
+    /// Builds a provider that reads docs beneath `cargo_workspace_root`
+    /// through `fs`.
+    pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
+        LocalProvider {
+            fs,
+            cargo_workspace_root,
+        }
+    }
+}
+
+#[async_trait]
+impl RustdocProvider for LocalProvider {
+    /// Loads the rustdoc page for `item` (or the crate index page when `item`
+    /// is `None`) from `<workspace root>/target/doc/<crate_name>/`.
+    ///
+    /// Returns `Ok(None)` when the file cannot be loaded.
+    async fn fetch_page(
+        &self,
+        crate_name: &str,
+        item: Option<&RustdocItem>,
+    ) -> Result<Option<String>> {
+        let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
+        local_cargo_doc_path.push(crate_name);
+        match item {
+            // `url_path` covers the item's module path *and* its per-kind file
+            // name (`<name>/index.html` for modules, `<kind>.<name>.html`
+            // otherwise). Using only `item.path` would load the parent
+            // module's index page instead of the item's own page.
+            Some(item) => local_cargo_doc_path.push(item.url_path()),
+            None => local_cargo_doc_path.push("index.html"),
+        }
+
+        // A page that cannot be loaded is reported as "no page" rather than
+        // as an error, so the caller can simply skip it.
+        let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
+            return Ok(None);
+        };
+
+        Ok(Some(contents))
+    }
+}
+
+/// A [`RustdocProvider`] that downloads pages from `docs.rs`.
+pub struct DocsDotRsProvider {
+    /// HTTP client used to issue the requests.
+    http_client: Arc<HttpClientWithUrl>,
+}
+
+impl DocsDotRsProvider {
+    /// Builds a provider that performs its requests through `http_client`.
+    pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
+        DocsDotRsProvider { http_client }
+    }
+}
+
+#[async_trait]
+impl RustdocProvider for DocsDotRsProvider {
+    /// Downloads the rustdoc page for `item` (or the crate index page when
+    /// `item` is `None`) from the `latest` version of `crate_name` on docs.rs.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the request or reading the body fails, when the response
+    /// carries a 4xx/5xx status, or when the body is not valid UTF-8.
+    async fn fetch_page(
+        &self,
+        crate_name: &str,
+        item: Option<&RustdocItem>,
+    ) -> Result<Option<String>> {
+        let version = "latest";
+        let item_path = item
+            .map(|item| format!("/{}", item.url_path()))
+            .unwrap_or_default();
+        let url = format!("https://docs.rs/{crate_name}/{version}/{crate_name}{item_path}");
+
+        // NOTE(review): the trailing `true` is presumably the client's
+        // "follow redirects" flag — confirm against `HttpClient::get`.
+        let mut response = self
+            .http_client
+            .get(&url, AsyncBody::default(), true)
+            .await
+            .with_context(|| format!("error fetching {url}"))?;
+
+        let mut body = Vec::new();
+        response
+            .body_mut()
+            .read_to_end(&mut body)
+            .await
+            .context("error reading docs.rs response body")?;
+
+        // Server-side failures must not be mistaken for documentation pages
+        // either, so both error classes are rejected.
+        let status = response.status();
+        if status.is_client_error() || status.is_server_error() {
+            let text = String::from_utf8_lossy(body.as_slice());
+            bail!("status error {}, response: {text:?}", status.as_u16());
+        }
+
+        Ok(Some(String::from_utf8(body)?))
+    }
+}
+
+/// A [`RustdocItem`] queued for crawling, together with debugging breadcrumbs.
+pub struct RustdocItemWithHistory {
+    /// The item whose page is to be fetched.
+    pub item: RustdocItem,
+    /// URL paths recorded while discovering this item: the parent's history
+    /// followed by this item's own URL path. Only present in debug builds,
+    /// where it is included in the fetch-failure error message.
+    #[cfg(debug_assertions)]
+    pub history: Vec<String>,
+}
+
+/// Walks the rustdoc pages of a crate, following the item links found on each
+/// page.
+pub struct RustdocCrawler {
+    /// Where pages are fetched from.
+    provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
+}
+
+impl RustdocCrawler {
+    /// Creates a crawler that fetches its pages through `provider`.
+    pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
+        Self { provider }
+    }
+
+    /// Crawls the documentation of `crate_name` breadth-first, starting from
+    /// the crate's index page.
+    ///
+    /// Returns `Ok(None)` when the provider has no index page for the crate.
+    /// Otherwise every reachable item page is fetched and converted once, and
+    /// an empty string is returned (the converted Markdown is currently
+    /// discarded).
+    ///
+    /// # Errors
+    ///
+    /// Fails as soon as fetching or converting any page fails.
+    pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
+        let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
+            return Ok(None);
+        };
+
+        let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;
+
+        // Items are marked as seen at the moment they are queued, so an item
+        // that is linked from several pages is fetched only once.
+        let mut seen_items = HashSet::default();
+        let mut items_to_visit: VecDeque<RustdocItemWithHistory> = VecDeque::new();
+        for item in items {
+            if seen_items.insert(item.clone()) {
+                items_to_visit.push_back(RustdocItemWithHistory {
+                    item,
+                    #[cfg(debug_assertions)]
+                    history: Vec::new(),
+                });
+            }
+        }
+
+        while let Some(item_with_history) = items_to_visit.pop_front() {
+            let item = &item_with_history.item;
+
+            let Some(result) = self
+                .provider
+                .fetch_page(&crate_name, Some(item))
+                .await
+                .with_context(|| {
+                    #[cfg(debug_assertions)]
+                    {
+                        format!(
+                            "failed to fetch {item:?}: {history:?}",
+                            history = item_with_history.history
+                        )
+                    }
+
+                    #[cfg(not(debug_assertions))]
+                    {
+                        format!("failed to fetch {item:?}")
+                    }
+                })?
+            else {
+                continue;
+            };
+
+            let (_markdown, children) = convert_rustdoc_to_markdown(result.as_bytes())?;
+
+            // Links on a page are relative to the directory that page lives
+            // in: the item's own directory for a module, the parent module's
+            // directory for everything else.
+            let mut parent_path = item.path.clone();
+            if matches!(item.kind, RustdocItemKind::Mod) {
+                parent_path.push(item.name.clone());
+            }
+
+            for mut child in children {
+                // The parent's location comes first, followed by whatever
+                // directories the child's own link contained.
+                let mut full_path = parent_path.clone();
+                full_path.append(&mut child.path);
+                child.path = full_path;
+
+                if !seen_items.insert(child.clone()) {
+                    continue;
+                }
+
+                items_to_visit.push_back(RustdocItemWithHistory {
+                    #[cfg(debug_assertions)]
+                    history: {
+                        let mut history = item_with_history.history.clone();
+                        history.push(child.url_path());
+                        history
+                    },
+                    item: child,
+                });
+            }
+        }
+
+        Ok(Some(String::new()))
+    }
+}