hexdocs.rs

  1use std::cell::RefCell;
  2use std::collections::BTreeSet;
  3use std::io::{self, Read};
  4use std::rc::Rc;
  5
  6use html_to_markdown::markdown::{
  7    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
  8};
  9use html_to_markdown::{
 10    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
 11    StartTagOutcome, TagHandler,
 12};
 13use zed_extension_api::{
 14    self as zed, HttpMethod, HttpRequest, KeyValueStore, RedirectPolicy, Result,
 15};
 16
 17pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
 18    let headers = vec![(
 19        "User-Agent".to_string(),
 20        "Zed (Gleam Extension)".to_string(),
 21    )];
 22
 23    let response = zed::fetch(&HttpRequest {
 24        method: HttpMethod::Get,
 25        url: format!("https://hexdocs.pm/{package}"),
 26        headers: headers.clone(),
 27        body: None,
 28        redirect_policy: RedirectPolicy::FollowAll,
 29    })?;
 30
 31    let (package_root_markdown, modules) =
 32        convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 33
 34    database.insert(&package, &package_root_markdown)?;
 35
 36    for module in modules {
 37        let response = zed::fetch(&HttpRequest {
 38            method: HttpMethod::Get,
 39            url: format!("https://hexdocs.pm/{package}/{module}.html"),
 40            headers: headers.clone(),
 41            body: None,
 42            redirect_policy: RedirectPolicy::FollowAll,
 43        })?;
 44
 45        let (markdown, _modules) =
 46            convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
 47
 48        database.insert(&format!("{module} ({package})"), &markdown)?;
 49    }
 50
 51    Ok(())
 52}
 53
 54pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
 55    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
 56
 57    let mut handlers: Vec<TagHandler> = vec![
 58        module_collector.clone(),
 59        Rc::new(RefCell::new(GleamChromeRemover)),
 60        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
 61        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
 62        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
 63        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
 64        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
 65    ];
 66
 67    let markdown = convert_html_to_markdown(html, &mut handlers)
 68        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
 69
 70    let modules = module_collector
 71        .borrow()
 72        .modules
 73        .iter()
 74        .cloned()
 75        .collect::<Vec<_>>();
 76
 77    Ok((markdown, modules))
 78}
 79
 80/// A higher-order handler that skips all content from the `nav`.
 81///
 82/// We still need to traverse the `nav` for collecting information, but
 83/// we don't want to include any of its content in the resulting Markdown.
 84pub struct NavSkipper<T: HandleTag> {
 85    handler: T,
 86}
 87
 88impl<T: HandleTag> NavSkipper<T> {
 89    pub fn new(handler: T) -> Self {
 90        Self { handler }
 91    }
 92}
 93
 94impl<T: HandleTag> HandleTag for NavSkipper<T> {
 95    fn should_handle(&self, tag: &str) -> bool {
 96        tag == "nav" || self.handler.should_handle(tag)
 97    }
 98
 99    fn handle_tag_start(
100        &mut self,
101        tag: &HtmlElement,
102        writer: &mut MarkdownWriter,
103    ) -> StartTagOutcome {
104        if writer.is_inside("nav") {
105            return StartTagOutcome::Continue;
106        }
107
108        self.handler.handle_tag_start(tag, writer)
109    }
110
111    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
112        if writer.is_inside("nav") {
113            return;
114        }
115
116        self.handler.handle_tag_end(tag, writer)
117    }
118
119    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
120        if writer.is_inside("nav") {
121            return HandlerOutcome::Handled;
122        }
123
124        self.handler.handle_text(text, writer)
125    }
126}
127
128pub struct GleamChromeRemover;
129
130impl HandleTag for GleamChromeRemover {
131    fn should_handle(&self, tag: &str) -> bool {
132        match tag {
133            "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
134            _ => false,
135        }
136    }
137
138    fn handle_tag_start(
139        &mut self,
140        tag: &HtmlElement,
141        _writer: &mut MarkdownWriter,
142    ) -> StartTagOutcome {
143        match tag.tag() {
144            "head" | "script" | "style" | "svg" | "header" | "footer" => {
145                return StartTagOutcome::Skip;
146            }
147            "a" => {
148                if tag.attr("onclick").is_some() {
149                    return StartTagOutcome::Skip;
150                }
151            }
152            _ => {}
153        }
154
155        StartTagOutcome::Continue
156    }
157}
158
159pub struct GleamModuleCollector {
160    modules: BTreeSet<String>,
161    has_seen_modules_header: bool,
162}
163
164impl GleamModuleCollector {
165    pub fn new() -> Self {
166        Self {
167            modules: BTreeSet::new(),
168            has_seen_modules_header: false,
169        }
170    }
171
172    fn parse_module(tag: &HtmlElement) -> Option<String> {
173        if tag.tag() != "a" {
174            return None;
175        }
176
177        let href = tag.attr("href")?;
178        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
179            return None;
180        }
181
182        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
183
184        Some(module_name.to_owned())
185    }
186}
187
188impl HandleTag for GleamModuleCollector {
189    fn should_handle(&self, tag: &str) -> bool {
190        match tag {
191            "h2" | "a" => true,
192            _ => false,
193        }
194    }
195
196    fn handle_tag_start(
197        &mut self,
198        tag: &HtmlElement,
199        writer: &mut MarkdownWriter,
200    ) -> StartTagOutcome {
201        match tag.tag() {
202            "a" => {
203                if self.has_seen_modules_header && writer.is_inside("li") {
204                    if let Some(module_name) = Self::parse_module(tag) {
205                        self.modules.insert(module_name);
206                    }
207                }
208            }
209            _ => {}
210        }
211
212        StartTagOutcome::Continue
213    }
214
215    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
216        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
217            self.has_seen_modules_header = true;
218        }
219
220        HandlerOutcome::NoOp
221    }
222}