hexdocs.rs

  1use std::cell::RefCell;
  2use std::collections::BTreeSet;
  3use std::io::{self, Read};
  4use std::rc::Rc;
  5
  6use html_to_markdown::markdown::{
  7    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
  8};
  9use html_to_markdown::{
 10    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
 11    StartTagOutcome, TagHandler,
 12};
 13use zed_extension_api::{
 14    http_client::{HttpMethod, HttpRequest, RedirectPolicy},
 15    KeyValueStore, Result,
 16};
 17
 18pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
 19    let headers = vec![(
 20        "User-Agent".to_string(),
 21        "Zed (Gleam Extension)".to_string(),
 22    )];
 23
 24    let response = HttpRequest::builder()
 25        .method(HttpMethod::Get)
 26        .url(format!("https://hexdocs.pm/{package}"))
 27        .headers(headers.clone())
 28        .redirect_policy(RedirectPolicy::FollowAll)
 29        .build()?
 30        .fetch()?;
 31
 32    let (package_root_markdown, modules) =
 33        convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 34
 35    database.insert(&package, &package_root_markdown)?;
 36
 37    for module in modules {
 38        let response = HttpRequest::builder()
 39            .method(HttpMethod::Get)
 40            .url(format!("https://hexdocs.pm/{package}/{module}.html"))
 41            .headers(headers.clone())
 42            .redirect_policy(RedirectPolicy::FollowAll)
 43            .build()?
 44            .fetch()?;
 45
 46        let (markdown, _modules) =
 47            convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 48
 49        database.insert(&format!("{module} ({package})"), &markdown)?;
 50    }
 51
 52    Ok(())
 53}
 54
 55pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
 56    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
 57
 58    let mut handlers: Vec<TagHandler> = vec![
 59        module_collector.clone(),
 60        Rc::new(RefCell::new(GleamChromeRemover)),
 61        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
 62        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
 63        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
 64        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
 65        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
 66    ];
 67
 68    let markdown = convert_html_to_markdown(html, &mut handlers)
 69        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
 70
 71    let modules = module_collector
 72        .borrow()
 73        .modules
 74        .iter()
 75        .cloned()
 76        .collect::<Vec<_>>();
 77
 78    Ok((markdown, modules))
 79}
 80
 81/// A higher-order handler that skips all content from the `nav`.
 82///
 83/// We still need to traverse the `nav` for collecting information, but
 84/// we don't want to include any of its content in the resulting Markdown.
 85pub struct NavSkipper<T: HandleTag> {
 86    handler: T,
 87}
 88
 89impl<T: HandleTag> NavSkipper<T> {
 90    pub fn new(handler: T) -> Self {
 91        Self { handler }
 92    }
 93}
 94
 95impl<T: HandleTag> HandleTag for NavSkipper<T> {
 96    fn should_handle(&self, tag: &str) -> bool {
 97        tag == "nav" || self.handler.should_handle(tag)
 98    }
 99
100    fn handle_tag_start(
101        &mut self,
102        tag: &HtmlElement,
103        writer: &mut MarkdownWriter,
104    ) -> StartTagOutcome {
105        if writer.is_inside("nav") {
106            return StartTagOutcome::Continue;
107        }
108
109        self.handler.handle_tag_start(tag, writer)
110    }
111
112    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
113        if writer.is_inside("nav") {
114            return;
115        }
116
117        self.handler.handle_tag_end(tag, writer)
118    }
119
120    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
121        if writer.is_inside("nav") {
122            return HandlerOutcome::Handled;
123        }
124
125        self.handler.handle_text(text, writer)
126    }
127}
128
129pub struct GleamChromeRemover;
130
131impl HandleTag for GleamChromeRemover {
132    fn should_handle(&self, tag: &str) -> bool {
133        match tag {
134            "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
135            _ => false,
136        }
137    }
138
139    fn handle_tag_start(
140        &mut self,
141        tag: &HtmlElement,
142        _writer: &mut MarkdownWriter,
143    ) -> StartTagOutcome {
144        match tag.tag() {
145            "head" | "script" | "style" | "svg" | "header" | "footer" => {
146                return StartTagOutcome::Skip;
147            }
148            "a" => {
149                if tag.attr("onclick").is_some() {
150                    return StartTagOutcome::Skip;
151                }
152            }
153            _ => {}
154        }
155
156        StartTagOutcome::Continue
157    }
158}
159
160pub struct GleamModuleCollector {
161    modules: BTreeSet<String>,
162    has_seen_modules_header: bool,
163}
164
165impl GleamModuleCollector {
166    pub fn new() -> Self {
167        Self {
168            modules: BTreeSet::new(),
169            has_seen_modules_header: false,
170        }
171    }
172
173    fn parse_module(tag: &HtmlElement) -> Option<String> {
174        if tag.tag() != "a" {
175            return None;
176        }
177
178        let href = tag.attr("href")?;
179        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
180            return None;
181        }
182
183        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
184
185        Some(module_name.to_owned())
186    }
187}
188
189impl HandleTag for GleamModuleCollector {
190    fn should_handle(&self, tag: &str) -> bool {
191        match tag {
192            "h2" | "a" => true,
193            _ => false,
194        }
195    }
196
197    fn handle_tag_start(
198        &mut self,
199        tag: &HtmlElement,
200        writer: &mut MarkdownWriter,
201    ) -> StartTagOutcome {
202        match tag.tag() {
203            "a" => {
204                if self.has_seen_modules_header && writer.is_inside("li") {
205                    if let Some(module_name) = Self::parse_module(tag) {
206                        self.modules.insert(module_name);
207                    }
208                }
209            }
210            _ => {}
211        }
212
213        StartTagOutcome::Continue
214    }
215
216    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
217        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
218            self.has_seen_modules_header = true;
219        }
220
221        HandlerOutcome::NoOp
222    }
223}