diff --git a/extensions/gleam/src/gleam.rs b/extensions/gleam/src/gleam.rs
index 2907bcf6ada44b155db9d6d5918835acacf51bf5..27478a2915f0a7e9a8bdf189108f8e23d152d50a 100644
--- a/extensions/gleam/src/gleam.rs
+++ b/extensions/gleam/src/gleam.rs
@@ -1,7 +1,6 @@
-use html_to_markdown::{convert_html_to_markdown, TagHandler};
-use std::cell::RefCell;
+mod hexdocs;
+
 use std::fs;
-use std::rc::Rc;
 use zed::lsp::CompletionKind;
 use zed::{
     CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
@@ -9,6 +8,8 @@ use zed::{
 };
 use zed_extension_api::{self as zed, Result};
 
+use crate::hexdocs::convert_hexdocs_to_markdown;
+
 struct GleamExtension {
     cached_binary_path: Option<String>,
 }
@@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
                     ),
                 })?;
 
-                let mut handlers: Vec<TagHandler> = vec![
-                    Rc::new(RefCell::new(
-                        html_to_markdown::markdown::WebpageChromeRemover,
-                    )),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
-                ];
-
-                let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
-                    .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
+                let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
 
                 let mut text = String::new();
                 text.push_str(&markdown);
@@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
         database: &KeyValueStore,
     ) -> Result<(), String> {
         match provider.as_str() {
-            "gleam-hexdocs" => {
-                let response = zed::fetch(&HttpRequest {
-                    url: format!("https://hexdocs.pm/{package}"),
-                })?;
-
-                let mut handlers: Vec<TagHandler> = vec![
-                    Rc::new(RefCell::new(
-                        html_to_markdown::markdown::WebpageChromeRemover,
-                    )),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
-                ];
-
-                let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
-                    .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
-
-                Ok(database.insert(&package, &markdown)?)
-            }
+            "gleam-hexdocs" => hexdocs::index(package, database),
             _ => Ok(()),
         }
     }
diff --git a/extensions/gleam/src/hexdocs.rs b/extensions/gleam/src/hexdocs.rs
new file mode 100644
index 0000000000000000000000000000000000000000..cc21746934cc14506a9438d1cba18f172e76c6b4
--- /dev/null
+++ b/extensions/gleam/src/hexdocs.rs
@@ -0,0 +1,225 @@
+//! Converts HexDocs pages for Gleam packages to Markdown and indexes them.
+
+use std::cell::RefCell;
+use std::collections::BTreeSet;
+use std::io::Read;
+use std::rc::Rc;
+
+use html_to_markdown::markdown::{
+    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
+};
+use html_to_markdown::{
+    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
+    StartTagOutcome, TagHandler,
+};
+use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
+
+/// Indexes the HexDocs documentation for `package` into `database`.
+///
+/// The package's root page is fetched, converted to Markdown, and stored under
+/// the package name. Each module collected from that page is then fetched from
+/// `https://hexdocs.pm/{package}/{module}.html` and stored under its module name.
+///
+/// # Errors
+///
+/// Returns the first error from a fetch, conversion, or insert; no rollback of
+/// earlier inserts is attempted.
+pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
+    let response = zed::fetch(&HttpRequest {
+        url: format!("https://hexdocs.pm/{package}"),
+    })?;
+
+    let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
+
+    database.insert(&package, &package_root_markdown)?;
+
+    for module in modules {
+        let response = zed::fetch(&HttpRequest {
+            url: format!("https://hexdocs.pm/{package}/{module}.html"),
+        })?;
+
+        let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
+
+        // NOTE(review): the key is the bare module name, so two packages that
+        // expose the same module path would overwrite each other — confirm
+        // whether the key should be namespaced by package.
+        database.insert(&module, &markdown)?;
+    }
+
+    Ok(())
+}
+
+/// Converts a HexDocs HTML page to Markdown.
+///
+/// Returns the Markdown together with the module names collected from the
+/// page's "Modules" navigation section, in sorted order without duplicates.
+///
+/// # Errors
+///
+/// Returns an error if the HTML could not be converted to Markdown.
+pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
+    // Shared handle: one clone goes into the handler list, the other is used
+    // to read the collected modules back after the conversion.
+    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
+
+    let mut handlers: Vec<TagHandler> = vec![
+        module_collector.clone(),
+        Rc::new(RefCell::new(GleamChromeRemover)),
+        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
+        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
+    ];
+
+    let markdown = convert_html_to_markdown(html, &mut handlers)
+        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
+
+    let modules = module_collector
+        .borrow()
+        .modules
+        .iter()
+        .cloned()
+        .collect::<Vec<_>>();
+
+    Ok((markdown, modules))
+}
+
+/// A higher-order handler that skips all content from the `nav`.
+///
+/// We still need to traverse the `nav` for collecting information, but
+/// we don't want to include any of its content in the resulting Markdown.
+pub struct NavSkipper<T> {
+    handler: T,
+}
+
+impl<T> NavSkipper<T> {
+    /// Wraps `handler`, which will be bypassed while inside a `nav`.
+    pub fn new(handler: T) -> Self {
+        Self { handler }
+    }
+}
+
+impl<T: HandleTag> HandleTag for NavSkipper<T> {
+    fn should_handle(&self, tag: &str) -> bool {
+        tag == "nav" || self.handler.should_handle(tag)
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        if writer.is_inside("nav") {
+            return StartTagOutcome::Continue;
+        }
+
+        self.handler.handle_tag_start(tag, writer)
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        if writer.is_inside("nav") {
+            return;
+        }
+
+        self.handler.handle_tag_end(tag, writer)
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("nav") {
+            return HandlerOutcome::Handled;
+        }
+
+        self.handler.handle_text(text, writer)
+    }
+}
+
+/// Skips page chrome: `head`, `script`, `style`, `svg`, `header`, and `footer`
+/// elements, plus any `a` element that carries an `onclick` attribute.
+pub struct GleamChromeRemover;
+
+impl HandleTag for GleamChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "head" | "script" | "style" | "svg" | "header" | "footer" | "a")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag() {
+            "head" | "script" | "style" | "svg" | "header" | "footer" => StartTagOutcome::Skip,
+            "a" if tag.attr("onclick").is_some() => StartTagOutcome::Skip,
+            _ => StartTagOutcome::Continue,
+        }
+    }
+}
+
+/// Collects the names of the modules linked from list items that follow the
+/// "Modules" header in the `nav`.
+#[derive(Default)]
+pub struct GleamModuleCollector {
+    modules: BTreeSet<String>,
+    has_seen_modules_header: bool,
+}
+
+impl GleamModuleCollector {
+    /// Creates a collector with no modules recorded.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Extracts a module name from the `href` of an `a` tag.
+    ///
+    /// Returns `None` for non-`a` tags, for tags without an `href`, and for
+    /// hrefs that are fragments (`#...`), `http(s)://` URLs, or start with `../`.
+    fn parse_module(tag: &HtmlElement) -> Option<String> {
+        if tag.tag() != "a" {
+            return None;
+        }
+
+        let href = tag.attr("href")?;
+        let is_absolute = href.starts_with("https://") || href.starts_with("http://");
+        if href.starts_with('#') || is_absolute || href.starts_with("../") {
+            return None;
+        }
+
+        // Strip at most one leading `./` and one trailing `.html`, so only the
+        // link decoration is removed and never part of the module name itself.
+        let module_name: &str = &href;
+        let module_name = module_name.strip_prefix("./").unwrap_or(module_name);
+        let module_name = module_name.strip_suffix(".html").unwrap_or(module_name);
+
+        Some(module_name.to_owned())
+    }
+}
+
+impl HandleTag for GleamModuleCollector {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "h2" | "a")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        // Only links inside list items that come after the "Modules" header count.
+        if tag.tag() == "a" && self.has_seen_modules_header && writer.is_inside("li") {
+            if let Some(module_name) = Self::parse_module(tag) {
+                self.modules.insert(module_name);
+            }
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
+            self.has_seen_modules_header = true;
+        }
+
+        HandlerOutcome::NoOp
+    }
+}