crawler.rs

  1use std::path::PathBuf;
  2use std::sync::Arc;
  3
  4use anyhow::{bail, Context, Result};
  5use async_trait::async_trait;
  6use collections::{HashSet, VecDeque};
  7use fs::Fs;
  8use futures::AsyncReadExt;
  9use html_to_markdown::convert_rustdoc_to_markdown;
 10use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
 11use http::{AsyncBody, HttpClient, HttpClientWithUrl};
 12
 13#[derive(Debug, Clone, Copy)]
 14pub enum RustdocSource {
 15    /// The docs were sourced from local `cargo doc` output.
 16    Local,
 17    /// The docs were sourced from `docs.rs`.
 18    DocsDotRs,
 19}
 20
 21#[async_trait]
 22pub trait RustdocProvider {
 23    async fn fetch_page(
 24        &self,
 25        crate_name: &str,
 26        item: Option<&RustdocItem>,
 27    ) -> Result<Option<String>>;
 28}
 29
 30pub struct LocalProvider {
 31    fs: Arc<dyn Fs>,
 32    cargo_workspace_root: PathBuf,
 33}
 34
 35impl LocalProvider {
 36    pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
 37        Self {
 38            fs,
 39            cargo_workspace_root,
 40        }
 41    }
 42}
 43
 44#[async_trait]
 45impl RustdocProvider for LocalProvider {
 46    async fn fetch_page(
 47        &self,
 48        crate_name: &str,
 49        item: Option<&RustdocItem>,
 50    ) -> Result<Option<String>> {
 51        let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
 52        local_cargo_doc_path.push(&crate_name);
 53        if let Some(item) = item {
 54            if !item.path.is_empty() {
 55                local_cargo_doc_path.push(item.path.join("/"));
 56            }
 57        }
 58        local_cargo_doc_path.push("index.html");
 59
 60        let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
 61            return Ok(None);
 62        };
 63
 64        Ok(Some(contents))
 65    }
 66}
 67
 68pub struct DocsDotRsProvider {
 69    http_client: Arc<HttpClientWithUrl>,
 70}
 71
 72impl DocsDotRsProvider {
 73    pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
 74        Self { http_client }
 75    }
 76}
 77
 78#[async_trait]
 79impl RustdocProvider for DocsDotRsProvider {
 80    async fn fetch_page(
 81        &self,
 82        crate_name: &str,
 83        item: Option<&RustdocItem>,
 84    ) -> Result<Option<String>> {
 85        let version = "latest";
 86        let path = format!(
 87            "{crate_name}/{version}/{crate_name}{item_path}",
 88            item_path = item
 89                .map(|item| format!("/{}", item.url_path()))
 90                .unwrap_or_default()
 91        );
 92
 93        println!("Fetching {}", &format!("https://docs.rs/{path}"));
 94
 95        let mut response = self
 96            .http_client
 97            .get(
 98                &format!("https://docs.rs/{path}"),
 99                AsyncBody::default(),
100                true,
101            )
102            .await?;
103
104        let mut body = Vec::new();
105        response
106            .body_mut()
107            .read_to_end(&mut body)
108            .await
109            .context("error reading docs.rs response body")?;
110
111        if response.status().is_client_error() {
112            let text = String::from_utf8_lossy(body.as_slice());
113            bail!(
114                "status error {}, response: {text:?}",
115                response.status().as_u16()
116            );
117        }
118
119        Ok(Some(String::from_utf8(body)?))
120    }
121}
122
123pub struct RustdocItemWithHistory {
124    pub item: RustdocItem,
125    #[cfg(debug_assertions)]
126    pub history: Vec<String>,
127}
128
129pub struct RustdocCrawler {
130    provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
131}
132
133impl RustdocCrawler {
134    pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
135        Self { provider }
136    }
137
138    pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
139        let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
140            return Ok(None);
141        };
142
143        let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;
144
145        let mut seen_items = HashSet::default();
146        let mut items_to_visit: VecDeque<RustdocItemWithHistory> =
147            VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory {
148                item,
149                #[cfg(debug_assertions)]
150                history: Vec::new(),
151            }));
152
153        while let Some(item_with_history) = items_to_visit.pop_front() {
154            let item = &item_with_history.item;
155            println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);
156
157            let Some(result) = self
158                .provider
159                .fetch_page(&crate_name, Some(&item))
160                .await
161                .with_context(|| {
162                    #[cfg(debug_assertions)]
163                    {
164                        format!(
165                            "failed to fetch {item:?}: {history:?}",
166                            history = item_with_history.history
167                        )
168                    }
169
170                    #[cfg(not(debug_assertions))]
171                    {
172                        format!("failed to fetch {item:?}")
173                    }
174                })?
175            else {
176                continue;
177            };
178
179            let (_markdown, mut items) = convert_rustdoc_to_markdown(result.as_bytes())?;
180
181            seen_items.insert(item.clone());
182
183            for child in &mut items {
184                child.path.extend(item.path.clone());
185                match item.kind {
186                    RustdocItemKind::Mod => {
187                        child.path.push(item.name.clone());
188                    }
189                    _ => {}
190                }
191            }
192
193            let unseen_items = items
194                .into_iter()
195                .map(|item| RustdocItemWithHistory {
196                    #[cfg(debug_assertions)]
197                    history: {
198                        let mut history = item_with_history.history.clone();
199                        history.push(item.url_path());
200                        history
201                    },
202                    item,
203                })
204                .filter(|item| !seen_items.contains(&item.item));
205
206            items_to_visit.extend(unseen_items);
207        }
208
209        Ok(Some(String::new()))
210    }
211}