@@ -1,7 +1,6 @@
-use html_to_markdown::{convert_html_to_markdown, TagHandler};
-use std::cell::RefCell;
+mod hexdocs;
+
use std::fs;
-use std::rc::Rc;
use zed::lsp::CompletionKind;
use zed::{
CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
@@ -9,6 +8,8 @@ use zed::{
};
use zed_extension_api::{self as zed, Result};
+use crate::hexdocs::convert_hexdocs_to_markdown;
+
struct GleamExtension {
cached_binary_path: Option<String>,
}
@@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
),
})?;
- let mut handlers: Vec<TagHandler> = vec![
- Rc::new(RefCell::new(
- html_to_markdown::markdown::WebpageChromeRemover,
- )),
- Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
- Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
- ];
-
- let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
- .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
+ let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
let mut text = String::new();
text.push_str(&markdown);
@@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
database: &KeyValueStore,
) -> Result<(), String> {
match provider.as_str() {
- "gleam-hexdocs" => {
- let response = zed::fetch(&HttpRequest {
- url: format!("https://hexdocs.pm/{package}"),
- })?;
-
- let mut handlers: Vec<TagHandler> = vec![
- Rc::new(RefCell::new(
- html_to_markdown::markdown::WebpageChromeRemover,
- )),
- Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
- Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
- Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
- ];
-
- let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
- .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
-
- Ok(database.insert(&package, &markdown)?)
- }
+ "gleam-hexdocs" => hexdocs::index(package, database),
_ => Ok(()),
}
}
@@ -0,0 +1,205 @@
+use std::cell::RefCell;
+use std::collections::BTreeSet;
+use std::io::Read;
+use std::rc::Rc;
+
+use html_to_markdown::markdown::{
+ HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
+};
+use html_to_markdown::{
+ convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
+ StartTagOutcome, TagHandler,
+};
+use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
+
+/// Indexes the HexDocs pages for `package` into `database`.
+///
+/// The package's root page is stored under the package name. Each module
+/// discovered on that page is then fetched and stored under its module name.
+/// The first fetch, conversion, or insert that fails aborts the indexing run.
+pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
+    let root_url = format!("https://hexdocs.pm/{package}");
+    let root_page = zed::fetch(&HttpRequest { url: root_url })?;
+    let (root_markdown, module_names) = convert_hexdocs_to_markdown(root_page.body.as_bytes())?;
+    database.insert(&package, &root_markdown)?;
+
+    for module in module_names {
+        let module_url = format!("https://hexdocs.pm/{package}/{module}.html");
+        let module_page = zed::fetch(&HttpRequest { url: module_url })?;
+        // Only the root page is crawled for modules, so links found here are dropped.
+        let (module_markdown, _) = convert_hexdocs_to_markdown(module_page.body.as_bytes())?;
+        database.insert(&module, &module_markdown)?;
+    }
+
+    Ok(())
+}
+
+/// Converts a HexDocs HTML page to Markdown, skipping page chrome and `nav` content.
+///
+/// Returns the Markdown together with the module names gathered from the page,
+/// sorted and free of duplicates. Fails if the HTML cannot be converted.
+pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
+    // Hold on to a typed handle so the collected modules can be read back afterwards.
+    let collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
+    let mut handlers: Vec<TagHandler> = vec![
+        collector.clone(),
+        Rc::new(RefCell::new(GleamChromeRemover)),
+        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
+        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
+    ];
+
+    let markdown = match convert_html_to_markdown(html, &mut handlers) {
+        Ok(markdown) => markdown,
+        Err(err) => return Err(format!("failed to convert docs to Markdown {err}")),
+    };
+
+    let modules: Vec<String> = collector.borrow().modules.iter().cloned().collect();
+    Ok((markdown, modules))
+}
+
+/// Wraps another handler and mutes it while the writer is inside a `nav`.
+///
+/// The `nav` is still traversed so that other handlers can inspect it.
+pub struct NavSkipper<T: HandleTag> {
+    handler: T,
+}
+
+impl<T: HandleTag> NavSkipper<T> {
+    /// Wraps `handler` so that it ignores everything inside a `nav`.
+    pub fn new(handler: T) -> Self {
+        NavSkipper { handler }
+    }
+}
+
+impl<T: HandleTag> HandleTag for NavSkipper<T> {
+    /// Claims `nav` in addition to whatever the wrapped handler claims.
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "nav") || self.handler.should_handle(tag)
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        if writer.is_inside("nav") {
+            StartTagOutcome::Continue
+        } else {
+            self.handler.handle_tag_start(tag, writer)
+        }
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        if !writer.is_inside("nav") {
+            self.handler.handle_tag_end(tag, writer);
+        }
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        // Text inside the `nav` is reported as handled without reaching the wrapped handler.
+        if writer.is_inside("nav") {
+            HandlerOutcome::Handled
+        } else {
+            self.handler.handle_text(text, writer)
+        }
+    }
+}
+
+/// Drops page chrome that should not end up in the Markdown.
+///
+/// Skips `head`, `script`, `style`, `svg`, `header`, and `footer` elements,
+/// as well as any `a` element that has an `onclick` attribute.
+pub struct GleamChromeRemover;
+
+impl HandleTag for GleamChromeRemover {
+    /// Claims the chrome tags plus `a`, which is filtered further on its start tag.
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "head" | "script" | "style" | "svg" | "header" | "footer" | "a")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let is_chrome = match tag.tag() {
+            "head" | "script" | "style" | "svg" | "header" | "footer" => true,
+            "a" => tag.attr("onclick").is_some(),
+            _ => false,
+        };
+
+        if is_chrome {
+            StartTagOutcome::Skip
+        } else {
+            StartTagOutcome::Continue
+        }
+    }
+}
+
+/// Collects the names of modules linked from a HexDocs page, sorted and deduplicated.
+pub struct GleamModuleCollector {
+    modules: BTreeSet<String>,
+    has_seen_modules_header: bool,
+}
+
+impl GleamModuleCollector {
+    pub fn new() -> Self {
+        Self {
+            modules: BTreeSet::new(),
+            has_seen_modules_header: false,
+        }
+    }
+
+    /// Returns the module named by an `a` tag's page-relative `href`, or `None` for
+    /// fragments, `../` and `/` paths, URLs with a scheme, and empty names.
+    fn parse_module(tag: &HtmlElement) -> Option<String> {
+        if tag.tag() != "a" {
+            return None;
+        }
+        let href = tag.attr("href")?;
+        if href.starts_with(['#', '/']) || href.starts_with("../") || href.contains("://") {
+            return None;
+        }
+        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
+        (!module_name.is_empty()).then(|| module_name.to_owned())
+    }
+}
+
+impl HandleTag for GleamModuleCollector {
+    /// Watches `h2` and `a` tags.
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "h2" | "a")
+    }
+
+    /// Records the module behind an `a` tag found in an `li` after the "Modules" header.
+    ///
+    /// Always returns `Continue`.
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let is_module_link =
+            tag.tag() == "a" && self.has_seen_modules_header && writer.is_inside("li");
+        if is_module_link {
+            // `extend` with an `Option` inserts the name when there is one.
+            self.modules.extend(Self::parse_module(tag));
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    /// Flags the "Modules" header when its text appears in an `h2` inside the `nav`.
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        let is_modules_header =
+            writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules";
+        if is_modules_header {
+            self.has_seen_modules_header = true;
+        }
+
+        HandlerOutcome::NoOp
+    }
+}