hexdocs.rs

  1use std::cell::RefCell;
  2use std::collections::BTreeSet;
  3use std::io::Read;
  4use std::rc::Rc;
  5
  6use html_to_markdown::markdown::{
  7    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
  8};
  9use html_to_markdown::{
 10    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
 11    StartTagOutcome, TagHandler,
 12};
 13use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
 14
 15pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
 16    let response = zed::fetch(&HttpRequest {
 17        url: format!("https://hexdocs.pm/{package}"),
 18    })?;
 19
 20    let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
 21
 22    database.insert(&package, &package_root_markdown)?;
 23
 24    for module in modules {
 25        let response = zed::fetch(&HttpRequest {
 26            url: format!("https://hexdocs.pm/{package}/{module}.html"),
 27        })?;
 28
 29        let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
 30
 31        database.insert(&format!("{module} ({package})"), &markdown)?;
 32    }
 33
 34    Ok(())
 35}
 36
 37pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
 38    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
 39
 40    let mut handlers: Vec<TagHandler> = vec![
 41        module_collector.clone(),
 42        Rc::new(RefCell::new(GleamChromeRemover)),
 43        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
 44        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
 45        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
 46        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
 47        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
 48    ];
 49
 50    let markdown = convert_html_to_markdown(html, &mut handlers)
 51        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
 52
 53    let modules = module_collector
 54        .borrow()
 55        .modules
 56        .iter()
 57        .cloned()
 58        .collect::<Vec<_>>();
 59
 60    Ok((markdown, modules))
 61}
 62
 63/// A higher-order handler that skips all content from the `nav`.
 64///
 65/// We still need to traverse the `nav` for collecting information, but
 66/// we don't want to include any of its content in the resulting Markdown.
 67pub struct NavSkipper<T: HandleTag> {
 68    handler: T,
 69}
 70
 71impl<T: HandleTag> NavSkipper<T> {
 72    pub fn new(handler: T) -> Self {
 73        Self { handler }
 74    }
 75}
 76
 77impl<T: HandleTag> HandleTag for NavSkipper<T> {
 78    fn should_handle(&self, tag: &str) -> bool {
 79        tag == "nav" || self.handler.should_handle(tag)
 80    }
 81
 82    fn handle_tag_start(
 83        &mut self,
 84        tag: &HtmlElement,
 85        writer: &mut MarkdownWriter,
 86    ) -> StartTagOutcome {
 87        if writer.is_inside("nav") {
 88            return StartTagOutcome::Continue;
 89        }
 90
 91        self.handler.handle_tag_start(tag, writer)
 92    }
 93
 94    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
 95        if writer.is_inside("nav") {
 96            return;
 97        }
 98
 99        self.handler.handle_tag_end(tag, writer)
100    }
101
102    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
103        if writer.is_inside("nav") {
104            return HandlerOutcome::Handled;
105        }
106
107        self.handler.handle_text(text, writer)
108    }
109}
110
111pub struct GleamChromeRemover;
112
113impl HandleTag for GleamChromeRemover {
114    fn should_handle(&self, tag: &str) -> bool {
115        match tag {
116            "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
117            _ => false,
118        }
119    }
120
121    fn handle_tag_start(
122        &mut self,
123        tag: &HtmlElement,
124        _writer: &mut MarkdownWriter,
125    ) -> StartTagOutcome {
126        match tag.tag() {
127            "head" | "script" | "style" | "svg" | "header" | "footer" => {
128                return StartTagOutcome::Skip;
129            }
130            "a" => {
131                if tag.attr("onclick").is_some() {
132                    return StartTagOutcome::Skip;
133                }
134            }
135            _ => {}
136        }
137
138        StartTagOutcome::Continue
139    }
140}
141
142pub struct GleamModuleCollector {
143    modules: BTreeSet<String>,
144    has_seen_modules_header: bool,
145}
146
147impl GleamModuleCollector {
148    pub fn new() -> Self {
149        Self {
150            modules: BTreeSet::new(),
151            has_seen_modules_header: false,
152        }
153    }
154
155    fn parse_module(tag: &HtmlElement) -> Option<String> {
156        if tag.tag() != "a" {
157            return None;
158        }
159
160        let href = tag.attr("href")?;
161        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
162            return None;
163        }
164
165        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
166
167        Some(module_name.to_owned())
168    }
169}
170
171impl HandleTag for GleamModuleCollector {
172    fn should_handle(&self, tag: &str) -> bool {
173        match tag {
174            "h2" | "a" => true,
175            _ => false,
176        }
177    }
178
179    fn handle_tag_start(
180        &mut self,
181        tag: &HtmlElement,
182        writer: &mut MarkdownWriter,
183    ) -> StartTagOutcome {
184        match tag.tag() {
185            "a" => {
186                if self.has_seen_modules_header && writer.is_inside("li") {
187                    if let Some(module_name) = Self::parse_module(tag) {
188                        self.modules.insert(module_name);
189                    }
190                }
191            }
192            _ => {}
193        }
194
195        StartTagOutcome::Continue
196    }
197
198    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
199        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
200            self.has_seen_modules_header = true;
201        }
202
203        HandlerOutcome::NoOp
204    }
205}