crawler.rs

  1use std::path::PathBuf;
  2use std::sync::Arc;
  3
  4use anyhow::{bail, Context, Result};
  5use async_trait::async_trait;
  6use collections::{HashSet, VecDeque};
  7use fs::Fs;
  8use futures::AsyncReadExt;
  9use http::{AsyncBody, HttpClient, HttpClientWithUrl};
 10use indexmap::IndexMap;
 11
 12use crate::{convert_rustdoc_to_markdown, RustdocItem, RustdocItemKind};
 13
 14#[derive(Debug, Clone, Copy)]
 15pub enum RustdocSource {
 16    /// The docs were sourced from local `cargo doc` output.
 17    Local,
 18    /// The docs were sourced from `docs.rs`.
 19    DocsDotRs,
 20}
 21
/// A source of rustdoc pages for a crate.
#[async_trait]
pub trait RustdocProvider {
    /// Fetches the page for `item` within the crate named `crate_name`.
    ///
    /// Passing `None` for `item` requests the crate's root page.
    ///
    /// `Ok(None)` signals that no page is available; the crawler in this file
    /// skips such items (or returns `Ok(None)` itself when the root page is
    /// missing). `Err` is propagated to the caller.
    async fn fetch_page(
        &self,
        crate_name: &str,
        item: Option<&RustdocItem>,
    ) -> Result<Option<String>>;
}
 30
/// A [`RustdocProvider`] that reads pages from a Cargo workspace's
/// `target/doc` directory.
pub struct LocalProvider {
    /// Filesystem handle used to load page contents.
    fs: Arc<dyn Fs>,
    /// Root of the Cargo workspace; pages are looked up under `target/doc`
    /// relative to this directory.
    cargo_workspace_root: PathBuf,
}
 35
 36impl LocalProvider {
 37    pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
 38        Self {
 39            fs,
 40            cargo_workspace_root,
 41        }
 42    }
 43}
 44
 45#[async_trait]
 46impl RustdocProvider for LocalProvider {
 47    async fn fetch_page(
 48        &self,
 49        crate_name: &str,
 50        item: Option<&RustdocItem>,
 51    ) -> Result<Option<String>> {
 52        let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
 53        local_cargo_doc_path.push(&crate_name);
 54        if let Some(item) = item {
 55            local_cargo_doc_path.push(item.url_path());
 56        } else {
 57            local_cargo_doc_path.push("index.html");
 58        }
 59
 60        println!("Fetching {}", local_cargo_doc_path.display());
 61
 62        let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
 63            return Ok(None);
 64        };
 65
 66        Ok(Some(contents))
 67    }
 68}
 69
/// A [`RustdocProvider`] that downloads pages from `https://docs.rs`.
pub struct DocsDotRsProvider {
    /// HTTP client used to issue the page requests.
    http_client: Arc<HttpClientWithUrl>,
}
 73
 74impl DocsDotRsProvider {
 75    pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
 76        Self { http_client }
 77    }
 78}
 79
 80#[async_trait]
 81impl RustdocProvider for DocsDotRsProvider {
 82    async fn fetch_page(
 83        &self,
 84        crate_name: &str,
 85        item: Option<&RustdocItem>,
 86    ) -> Result<Option<String>> {
 87        let version = "latest";
 88        let path = format!(
 89            "{crate_name}/{version}/{crate_name}{item_path}",
 90            item_path = item
 91                .map(|item| format!("/{}", item.url_path()))
 92                .unwrap_or_default()
 93        );
 94
 95        println!("Fetching {}", &format!("https://docs.rs/{path}"));
 96
 97        let mut response = self
 98            .http_client
 99            .get(
100                &format!("https://docs.rs/{path}"),
101                AsyncBody::default(),
102                true,
103            )
104            .await?;
105
106        let mut body = Vec::new();
107        response
108            .body_mut()
109            .read_to_end(&mut body)
110            .await
111            .context("error reading docs.rs response body")?;
112
113        if response.status().is_client_error() {
114            let text = String::from_utf8_lossy(body.as_slice());
115            bail!(
116                "status error {}, response: {text:?}",
117                response.status().as_u16()
118            );
119        }
120
121        Ok(Some(String::from_utf8(body)?))
122    }
123}
124
/// A queued [`RustdocItem`] together with (in debug builds) the trail of
/// pages that led the crawler to it.
#[derive(Debug)]
struct RustdocItemWithHistory {
    /// The item whose page is to be fetched.
    pub item: RustdocItem,
    /// URL paths of the referenced items along the chain that reached this
    /// item, ending with this item's own URL path. Empty for items found on
    /// the crate root page. Only compiled in when `debug_assertions` is on;
    /// it is used to enrich fetch-failure error messages.
    #[cfg(debug_assertions)]
    pub history: Vec<String>,
}
131
/// The result of crawling a crate's documentation.
pub struct CrateDocs {
    /// Markdown converted from the crate's root page.
    pub crate_root_markdown: String,
    /// Markdown for every item page that was fetched, keyed by item and kept
    /// in the order the pages were inserted during the crawl.
    pub items: IndexMap<RustdocItem, String>,
}
136
/// Crawls a crate's rustdoc pages, starting at the crate root and following
/// the items each page references.
pub struct RustdocCrawler {
    /// Where pages are fetched from.
    provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
}
140
141impl RustdocCrawler {
142    pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
143        Self { provider }
144    }
145
146    pub async fn crawl(&self, crate_name: String) -> Result<Option<CrateDocs>> {
147        let Some(crate_root_content) = self.provider.fetch_page(&crate_name, None).await? else {
148            return Ok(None);
149        };
150
151        let (crate_root_markdown, items) =
152            convert_rustdoc_to_markdown(crate_root_content.as_bytes())?;
153
154        let mut docs_by_item = IndexMap::new();
155        let mut seen_items = HashSet::from_iter(items.clone());
156        let mut items_to_visit: VecDeque<RustdocItemWithHistory> =
157            VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory {
158                item,
159                #[cfg(debug_assertions)]
160                history: Vec::new(),
161            }));
162
163        while let Some(item_with_history) = items_to_visit.pop_front() {
164            let item = &item_with_history.item;
165
166            println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);
167
168            let Some(result) = self
169                .provider
170                .fetch_page(&crate_name, Some(&item))
171                .await
172                .with_context(|| {
173                    #[cfg(debug_assertions)]
174                    {
175                        format!(
176                            "failed to fetch {item:?}: {history:?}",
177                            history = item_with_history.history
178                        )
179                    }
180
181                    #[cfg(not(debug_assertions))]
182                    {
183                        format!("failed to fetch {item:?}")
184                    }
185                })?
186            else {
187                continue;
188            };
189
190            let (markdown, referenced_items) = convert_rustdoc_to_markdown(result.as_bytes())?;
191
192            docs_by_item.insert(item.clone(), markdown);
193
194            let parent_item = item;
195            for mut item in referenced_items {
196                if seen_items.contains(&item) {
197                    continue;
198                }
199
200                seen_items.insert(item.clone());
201
202                item.path.extend(parent_item.path.clone());
203                match parent_item.kind {
204                    RustdocItemKind::Mod => {
205                        item.path.push(parent_item.name.clone());
206                    }
207                    _ => {}
208                }
209
210                items_to_visit.push_back(RustdocItemWithHistory {
211                    #[cfg(debug_assertions)]
212                    history: {
213                        let mut history = item_with_history.history.clone();
214                        history.push(item.url_path());
215                        history
216                    },
217                    item,
218                });
219            }
220        }
221
222        Ok(Some(CrateDocs {
223            crate_root_markdown,
224            items: docs_by_item,
225        }))
226    }
227}