hexdocs.rs

  1use std::cell::RefCell;
  2use std::collections::BTreeSet;
  3use std::io::{self, Read};
  4use std::rc::Rc;
  5
  6use html_to_markdown::markdown::{
  7    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
  8};
  9use html_to_markdown::{
 10    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
 11    StartTagOutcome, TagHandler,
 12};
 13use zed_extension_api::{self as zed, HttpMethod, HttpRequest, KeyValueStore, Result};
 14
 15pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
 16    let headers = vec![(
 17        "User-Agent".to_string(),
 18        "Zed (Gleam Extension)".to_string(),
 19    )];
 20
 21    let response = zed::fetch(&HttpRequest {
 22        method: HttpMethod::Get,
 23        url: format!("https://hexdocs.pm/{package}"),
 24        headers: headers.clone(),
 25        body: None,
 26    })?;
 27
 28    let (package_root_markdown, modules) =
 29        convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 30
 31    database.insert(&package, &package_root_markdown)?;
 32
 33    for module in modules {
 34        let response = zed::fetch(&HttpRequest {
 35            method: HttpMethod::Get,
 36            url: format!("https://hexdocs.pm/{package}/{module}.html"),
 37            headers: headers.clone(),
 38            body: None,
 39        })?;
 40
 41        let (markdown, _modules) =
 42            convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 43
 44        database.insert(&format!("{module} ({package})"), &markdown)?;
 45    }
 46
 47    Ok(())
 48}
 49
 50pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
 51    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
 52
 53    let mut handlers: Vec<TagHandler> = vec![
 54        module_collector.clone(),
 55        Rc::new(RefCell::new(GleamChromeRemover)),
 56        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
 57        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
 58        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
 59        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
 60        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
 61    ];
 62
 63    let markdown = convert_html_to_markdown(html, &mut handlers)
 64        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
 65
 66    let modules = module_collector
 67        .borrow()
 68        .modules
 69        .iter()
 70        .cloned()
 71        .collect::<Vec<_>>();
 72
 73    Ok((markdown, modules))
 74}
 75
 76/// A higher-order handler that skips all content from the `nav`.
 77///
 78/// We still need to traverse the `nav` for collecting information, but
 79/// we don't want to include any of its content in the resulting Markdown.
 80pub struct NavSkipper<T: HandleTag> {
 81    handler: T,
 82}
 83
 84impl<T: HandleTag> NavSkipper<T> {
 85    pub fn new(handler: T) -> Self {
 86        Self { handler }
 87    }
 88}
 89
 90impl<T: HandleTag> HandleTag for NavSkipper<T> {
 91    fn should_handle(&self, tag: &str) -> bool {
 92        tag == "nav" || self.handler.should_handle(tag)
 93    }
 94
 95    fn handle_tag_start(
 96        &mut self,
 97        tag: &HtmlElement,
 98        writer: &mut MarkdownWriter,
 99    ) -> StartTagOutcome {
100        if writer.is_inside("nav") {
101            return StartTagOutcome::Continue;
102        }
103
104        self.handler.handle_tag_start(tag, writer)
105    }
106
107    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
108        if writer.is_inside("nav") {
109            return;
110        }
111
112        self.handler.handle_tag_end(tag, writer)
113    }
114
115    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
116        if writer.is_inside("nav") {
117            return HandlerOutcome::Handled;
118        }
119
120        self.handler.handle_text(text, writer)
121    }
122}
123
124pub struct GleamChromeRemover;
125
126impl HandleTag for GleamChromeRemover {
127    fn should_handle(&self, tag: &str) -> bool {
128        match tag {
129            "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
130            _ => false,
131        }
132    }
133
134    fn handle_tag_start(
135        &mut self,
136        tag: &HtmlElement,
137        _writer: &mut MarkdownWriter,
138    ) -> StartTagOutcome {
139        match tag.tag() {
140            "head" | "script" | "style" | "svg" | "header" | "footer" => {
141                return StartTagOutcome::Skip;
142            }
143            "a" => {
144                if tag.attr("onclick").is_some() {
145                    return StartTagOutcome::Skip;
146                }
147            }
148            _ => {}
149        }
150
151        StartTagOutcome::Continue
152    }
153}
154
155pub struct GleamModuleCollector {
156    modules: BTreeSet<String>,
157    has_seen_modules_header: bool,
158}
159
160impl GleamModuleCollector {
161    pub fn new() -> Self {
162        Self {
163            modules: BTreeSet::new(),
164            has_seen_modules_header: false,
165        }
166    }
167
168    fn parse_module(tag: &HtmlElement) -> Option<String> {
169        if tag.tag() != "a" {
170            return None;
171        }
172
173        let href = tag.attr("href")?;
174        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
175            return None;
176        }
177
178        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
179
180        Some(module_name.to_owned())
181    }
182}
183
184impl HandleTag for GleamModuleCollector {
185    fn should_handle(&self, tag: &str) -> bool {
186        match tag {
187            "h2" | "a" => true,
188            _ => false,
189        }
190    }
191
192    fn handle_tag_start(
193        &mut self,
194        tag: &HtmlElement,
195        writer: &mut MarkdownWriter,
196    ) -> StartTagOutcome {
197        match tag.tag() {
198            "a" => {
199                if self.has_seen_modules_header && writer.is_inside("li") {
200                    if let Some(module_name) = Self::parse_module(tag) {
201                        self.modules.insert(module_name);
202                    }
203                }
204            }
205            _ => {}
206        }
207
208        StartTagOutcome::Continue
209    }
210
211    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
212        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
213            self.has_seen_modules_header = true;
214        }
215
216        HandlerOutcome::NoOp
217    }
218}