Detailed changes
@@ -8664,6 +8664,19 @@ dependencies = [
"semver",
]
+[[package]]
+name = "rustdoc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "collections",
+ "fs",
+ "futures 0.3.28",
+ "html_to_markdown",
+ "http 0.1.0",
+]
+
[[package]]
name = "rustix"
version = "0.37.23"
@@ -79,6 +79,7 @@ members = [
"crates/rich_text",
"crates/rope",
"crates/rpc",
+ "crates/rustdoc",
"crates/task",
"crates/tasks_ui",
"crates/search",
@@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
rich_text = { path = "crates/rich_text" }
rope = { path = "crates/rope" }
rpc = { path = "crates/rpc" }
+rustdoc = { path = "crates/rustdoc" }
task = { path = "crates/task" }
tasks_ui = { path = "crates/tasks_ui" }
search = { path = "crates/search" }
@@ -58,7 +58,7 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<Rustd
let items = item_collector
.borrow()
.items
- .values()
+ .iter()
.cloned()
.collect::<Vec<_>>();
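With the collector now backed by an `IndexSet` (see below), `convert_rustdoc_to_markdown` yields the collected items in first-seen order. A minimal sketch of the calling convention; the HTML snippet is hypothetical, and whether a given anchor is actually collected depends on the surrounding page structure:

    use html_to_markdown::convert_rustdoc_to_markdown;

    fn print_items() -> anyhow::Result<()> {
        // Any `impl Read` source works; a byte slice keeps the sketch self-contained.
        let html = r#"<a class="struct" href="struct.Foo.html">Foo</a>"#;
        let (markdown, items) = convert_rustdoc_to_markdown(html.as_bytes())?;
        println!("{markdown}");
        for item in items {
            println!("{:?} {}", item.kind, item.url_path());
        }
        Ok(())
    }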
@@ -1,4 +1,6 @@
-use indexmap::IndexMap;
+use std::sync::Arc;
+
+use indexmap::IndexSet;
use strum::{EnumIter, IntoEnumIterator};
use crate::html_element::HtmlElement;
@@ -238,17 +240,25 @@ impl RustdocItemKind {
}
}
-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
pub struct RustdocItem {
pub kind: RustdocItemKind,
- pub name: String,
+ /// The path to the item, excluding the item's own name.
+ pub path: Vec<Arc<str>>,
+ /// The name of the item.
+ pub name: Arc<str>,
}
impl RustdocItem {
pub fn url_path(&self) -> String {
let name = &self.name;
+ let mut path_components = self.path.clone();
+
match self.kind {
- RustdocItemKind::Mod => format!("{name}/index.html"),
+ RustdocItemKind::Mod => {
+ path_components.push(name.clone());
+ path_components.push("index.html".into());
+ }
RustdocItemKind::Macro
| RustdocItemKind::Struct
| RustdocItemKind::Enum
@@ -258,20 +268,23 @@ impl RustdocItem {
| RustdocItemKind::TypeAlias
| RustdocItemKind::AttributeMacro
| RustdocItemKind::DeriveMacro => {
- format!("{kind}.{name}.html", kind = self.kind.class())
+ path_components
+ .push(format!("{kind}.{name}.html", kind = self.kind.class()).into());
}
}
+
+ path_components.join("/")
}
}
pub struct RustdocItemCollector {
- pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+ pub items: IndexSet<RustdocItem>,
}
impl RustdocItemCollector {
pub fn new() -> Self {
Self {
- items: IndexMap::new(),
+ items: IndexSet::new(),
}
}
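For illustration, how the new `path` field feeds into `url_path` (values invented for the example; this assumes `RustdocItemKind::Struct.class()` is `"struct"`, matching rustdoc's file naming):

    let item = RustdocItem {
        kind: RustdocItemKind::Struct,
        path: vec!["collections".into()],
        name: "HashMap".into(),
    };
    // Nested items now include their parent modules in the URL:
    assert_eq!(item.url_path(), "collections/struct.HashMap.html");

    let module = RustdocItem {
        kind: RustdocItemKind::Mod,
        path: Vec::new(),
        name: "collections".into(),
    };
    // Modules resolve to their index page:
    assert_eq!(module.url_path(), "collections/index.html");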
@@ -281,21 +294,30 @@ impl RustdocItemCollector {
}
let href = tag.attr("href")?;
- if href == "#" {
+ if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
return None;
}
for kind in RustdocItemKind::iter() {
if tag.has_class(kind.class()) {
- let name = href
- .trim_start_matches(&format!("{}.", kind.class()))
- .trim_end_matches("/index.html")
- .trim_end_matches(".html");
-
- return Some(RustdocItem {
- kind,
- name: name.to_owned(),
- });
+ let mut parts = href.trim_end_matches("/index.html").split('/');
+
+ if let Some(last_component) = parts.next_back() {
+ let last_component = match last_component.split_once('#') {
+ Some((component, _fragment)) => component,
+ None => last_component,
+ };
+
+ let name = last_component
+ .trim_start_matches(&format!("{}.", kind.class()))
+ .trim_end_matches(".html");
+
+ return Some(RustdocItem {
+ kind,
+ name: name.into(),
+ path: parts.map(Into::into).collect(),
+ });
+ }
}
}
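A hedged trace of what `parse_item` now produces for a few hypothetical hrefs (assuming in each case that the anchor carries the matching item class):

    // href = "ser/struct.Serializer.html" -> kind: Struct, path: ["ser"], name: "Serializer"
    // href = "de/index.html"              -> kind: Mod,    path: [],      name: "de"
    // href = "struct.App.html#method.run" -> kind: Struct, path: [],      name: "App"
    // href = "#implementations"           -> skipped (in-page fragment)
    // href = "https://docs.rs/serde"      -> skipped (external link)
    // href = "../trait.Serialize.html"    -> skipped (parent-relative link)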
@@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector {
"a" => {
let is_reexport = writer.current_element_stack().iter().any(|element| {
if let Some(id) = element.attr("id") {
- id.starts_with("reexport.")
+ id.starts_with("reexport.") || id.starts_with("method.")
} else {
false
}
@@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector {
if !is_reexport {
if let Some(item) = Self::parse_item(tag) {
- self.items.insert((item.kind, item.name.clone()), item);
+ self.items.insert(item);
}
}
}
@@ -0,0 +1,23 @@
+[package]
+name = "rustdoc"
+version = "0.1.0"
+edition = "2021"
+publish = false
+license = "GPL-3.0-or-later"
+
+[lints]
+workspace = true
+
+[lib]
+path = "src/rustdoc.rs"
+
+[dependencies]
+anyhow.workspace = true
+async-trait.workspace = true
+collections.workspace = true
+fs.workspace = true
+futures.workspace = true
+html_to_markdown.workspace = true
+http.workspace = true
+
+[dev-dependencies]
@@ -0,0 +1 @@
+../../LICENSE-GPL
@@ -0,0 +1,211 @@
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use anyhow::{bail, Context, Result};
+use async_trait::async_trait;
+use collections::{HashSet, VecDeque};
+use fs::Fs;
+use futures::AsyncReadExt;
+use html_to_markdown::convert_rustdoc_to_markdown;
+use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
+use http::{AsyncBody, HttpClient, HttpClientWithUrl};
+
+#[derive(Debug, Clone, Copy)]
+pub enum RustdocSource {
+ /// The docs were sourced from local `cargo doc` output.
+ Local,
+ /// The docs were sourced from `docs.rs`.
+ DocsDotRs,
+}
+
+#[async_trait]
+pub trait RustdocProvider {
+ async fn fetch_page(
+ &self,
+ crate_name: &str,
+ item: Option<&RustdocItem>,
+ ) -> Result<Option<String>>;
+}
+
+pub struct LocalProvider {
+ fs: Arc<dyn Fs>,
+ cargo_workspace_root: PathBuf,
+}
+
+impl LocalProvider {
+ pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
+ Self {
+ fs,
+ cargo_workspace_root,
+ }
+ }
+}
+
+#[async_trait]
+impl RustdocProvider for LocalProvider {
+ async fn fetch_page(
+ &self,
+ crate_name: &str,
+ item: Option<&RustdocItem>,
+ ) -> Result<Option<String>> {
+ let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
+ local_cargo_doc_path.push(&crate_name);
+ // Mirror the docs.rs provider: resolve the item's own page via `url_path()`
+ // instead of always loading the enclosing `index.html`.
+ match item {
+ Some(item) => local_cargo_doc_path.push(item.url_path()),
+ None => local_cargo_doc_path.push("index.html"),
+ }
+
+ let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
+ return Ok(None);
+ };
+
+ Ok(Some(contents))
+ }
+}
+
+pub struct DocsDotRsProvider {
+ http_client: Arc<HttpClientWithUrl>,
+}
+
+impl DocsDotRsProvider {
+ pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
+ Self { http_client }
+ }
+}
+
+#[async_trait]
+impl RustdocProvider for DocsDotRsProvider {
+ async fn fetch_page(
+ &self,
+ crate_name: &str,
+ item: Option<&RustdocItem>,
+ ) -> Result<Option<String>> {
+ let version = "latest";
+ let path = format!(
+ "{crate_name}/{version}/{crate_name}{item_path}",
+ item_path = item
+ .map(|item| format!("/{}", item.url_path()))
+ .unwrap_or_default()
+ );
+
+ println!("Fetching {}", &format!("https://docs.rs/{path}"));
+
+ let mut response = self
+ .http_client
+ .get(
+ &format!("https://docs.rs/{path}"),
+ AsyncBody::default(),
+ true,
+ )
+ .await?;
+
+ let mut body = Vec::new();
+ response
+ .body_mut()
+ .read_to_end(&mut body)
+ .await
+ .context("error reading docs.rs response body")?;
+
+ if response.status().is_client_error() {
+ let text = String::from_utf8_lossy(body.as_slice());
+ bail!(
+ "status error {}, response: {text:?}",
+ response.status().as_u16()
+ );
+ }
+
+ Ok(Some(String::from_utf8(body)?))
+ }
+}
+
+pub struct RustdocItemWithHistory {
+ pub item: RustdocItem,
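+ /// The sequence of `url_path()`s followed to reach this item (tracked only in debug builds).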
+ #[cfg(debug_assertions)]
+ pub history: Vec<String>,
+}
+
+pub struct RustdocCrawler {
+ provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
+}
+
+impl RustdocCrawler {
+ pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
+ Self { provider }
+ }
+
+ pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
+ let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
+ return Ok(None);
+ };
+
+ let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;
+
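+ // Breadth-first crawl: `seen_items` guards against revisiting pages; the queue holds items still to fetch.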
+ let mut seen_items = HashSet::default();
+ let mut items_to_visit: VecDeque<RustdocItemWithHistory> =
+ VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory {
+ item,
+ #[cfg(debug_assertions)]
+ history: Vec::new(),
+ }));
+
+ while let Some(item_with_history) = items_to_visit.pop_front() {
+ let item = &item_with_history.item;
+ println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);
+
+ let Some(result) = self
+ .provider
+ .fetch_page(&crate_name, Some(item))
+ .await
+ .with_context(|| {
+ #[cfg(debug_assertions)]
+ {
+ format!(
+ "failed to fetch {item:?}: {history:?}",
+ history = item_with_history.history
+ )
+ }
+
+ #[cfg(not(debug_assertions))]
+ {
+ format!("failed to fetch {item:?}")
+ }
+ })?
+ else {
+ continue;
+ };
+
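+ // Convert the page to extract its items; the markdown itself is currently discarded.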
+ let (_markdown, mut items) = convert_rustdoc_to_markdown(result.as_bytes())?;
+
+ seen_items.insert(item.clone());
+
+ for child in &mut items {
+ // Child paths are parsed relative to the page they appear on, so
+ // prefix them with the parent's path (plus the parent's name when
+ // the parent is a module) rather than appending it afterwards.
+ let mut full_path = item.path.clone();
+ if let RustdocItemKind::Mod = item.kind {
+ full_path.push(item.name.clone());
+ }
+ full_path.append(&mut child.path);
+ child.path = full_path;
+ }
+
+ let unseen_items = items
+ .into_iter()
+ .map(|item| RustdocItemWithHistory {
+ #[cfg(debug_assertions)]
+ history: {
+ let mut history = item_with_history.history.clone();
+ history.push(item.url_path());
+ history
+ },
+ item,
+ })
+ .filter(|item| !seen_items.contains(&item.item));
+
+ items_to_visit.extend(unseen_items);
+ }
+
+ Ok(Some(String::new()))
+ }
+}
@@ -0,0 +1 @@
+pub mod crawler;
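Taken together, the new crate exposes a small crawling API. A usage sketch under stated assumptions: the `Arc<dyn Fs>` comes from the caller, the `Fs` trait object is `Send + Sync` as elsewhere in the workspace, and (as written above) `crawl` discards per-page markdown and returns an empty string on success, so the traversal itself is the observable effect:

    use std::path::PathBuf;
    use std::sync::Arc;

    use fs::Fs;
    use rustdoc::crawler::{LocalProvider, RustdocCrawler};

    async fn crawl_local_docs(fs: Arc<dyn Fs>, workspace_root: PathBuf) -> anyhow::Result<()> {
        let provider = LocalProvider::new(fs, workspace_root);
        let crawler = RustdocCrawler::new(Box::new(provider));

        // Visits every item page reachable from `target/doc/serde/index.html`.
        let output = crawler.crawl("serde".into()).await?;
        println!("crawl finished: {}", output.is_some());
        Ok(())
    }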