gleam: Improve indexing of HexDocs (#13787)

Marshall Bowers created

This PR improves the indexing of HexDocs content for Gleam packages.

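We now index each of the modules in the package in addition to the
package's root page, instead of indexing just the root. The list of
modules is collected from the "Modules" section of the navigation on
the package's root page.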

Release Notes:

- N/A

Change summary

extensions/gleam/src/gleam.rs   |  43 +------
extensions/gleam/src/hexdocs.rs | 205 +++++++++++++++++++++++++++++++++++
2 files changed, 211 insertions(+), 37 deletions(-)

Detailed changes

extensions/gleam/src/gleam.rs 🔗

@@ -1,7 +1,6 @@
-use html_to_markdown::{convert_html_to_markdown, TagHandler};
-use std::cell::RefCell;
+mod hexdocs;
+
 use std::fs;
-use std::rc::Rc;
 use zed::lsp::CompletionKind;
 use zed::{
     CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
@@ -9,6 +8,8 @@ use zed::{
 };
 use zed_extension_api::{self as zed, Result};
 
+use crate::hexdocs::convert_hexdocs_to_markdown;
+
 struct GleamExtension {
     cached_binary_path: Option<String>,
 }
@@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
                     ),
                 })?;
 
-                let mut handlers: Vec<TagHandler> = vec![
-                    Rc::new(RefCell::new(
-                        html_to_markdown::markdown::WebpageChromeRemover,
-                    )),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
-                ];
-
-                let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
-                    .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
+                let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
 
                 let mut text = String::new();
                 text.push_str(&markdown);
@@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
         database: &KeyValueStore,
     ) -> Result<(), String> {
         match provider.as_str() {
-            "gleam-hexdocs" => {
-                let response = zed::fetch(&HttpRequest {
-                    url: format!("https://hexdocs.pm/{package}"),
-                })?;
-
-                let mut handlers: Vec<TagHandler> = vec![
-                    Rc::new(RefCell::new(
-                        html_to_markdown::markdown::WebpageChromeRemover,
-                    )),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
-                    Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
-                ];
-
-                let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
-                    .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
-
-                Ok(database.insert(&package, &markdown)?)
-            }
+            "gleam-hexdocs" => hexdocs::index(package, database),
             _ => Ok(()),
         }
     }

extensions/gleam/src/hexdocs.rs 🔗

@@ -0,0 +1,205 @@
+use std::cell::RefCell;
+use std::collections::BTreeSet;
+use std::io::Read;
+use std::rc::Rc;
+
+use html_to_markdown::markdown::{
+    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
+};
+use html_to_markdown::{
+    convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
+    StartTagOutcome, TagHandler,
+};
+use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
+
+pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
+    let response = zed::fetch(&HttpRequest {
+        url: format!("https://hexdocs.pm/{package}"),
+    })?;
+
+    let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
+
+    database.insert(&package, &package_root_markdown)?;
+
+    for module in modules {
+        let response = zed::fetch(&HttpRequest {
+            url: format!("https://hexdocs.pm/{package}/{module}.html"),
+        })?;
+
+        let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
+
+        database.insert(&module, &markdown)?;
+    }
+
+    Ok(())
+}
+
+pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
+    let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
+
+    let mut handlers: Vec<TagHandler> = vec![
+        module_collector.clone(),
+        Rc::new(RefCell::new(GleamChromeRemover)),
+        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
+        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
+        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
+    ];
+
+    let markdown = convert_html_to_markdown(html, &mut handlers)
+        .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
+
+    let modules = module_collector
+        .borrow()
+        .modules
+        .iter()
+        .cloned()
+        .collect::<Vec<_>>();
+
+    Ok((markdown, modules))
+}
+
+/// A higher-order handler that skips all content from the `nav`.
+///
+/// We still need to traverse the `nav` for collecting information, but
+/// we don't want to include any of its content in the resulting Markdown.
+pub struct NavSkipper<T: HandleTag> {
+    handler: T,
+}
+
+impl<T: HandleTag> NavSkipper<T> {
+    pub fn new(handler: T) -> Self {
+        Self { handler }
+    }
+}
+
+impl<T: HandleTag> HandleTag for NavSkipper<T> {
+    fn should_handle(&self, tag: &str) -> bool {
+        tag == "nav" || self.handler.should_handle(tag)
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        if writer.is_inside("nav") {
+            return StartTagOutcome::Continue;
+        }
+
+        self.handler.handle_tag_start(tag, writer)
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        if writer.is_inside("nav") {
+            return;
+        }
+
+        self.handler.handle_tag_end(tag, writer)
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("nav") {
+            return HandlerOutcome::Handled;
+        }
+
+        self.handler.handle_text(text, writer)
+    }
+}
+
+pub struct GleamChromeRemover;
+
+impl HandleTag for GleamChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag() {
+            "head" | "script" | "style" | "svg" | "header" | "footer" => {
+                return StartTagOutcome::Skip;
+            }
+            "a" => {
+                if tag.attr("onclick").is_some() {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
+pub struct GleamModuleCollector {
+    modules: BTreeSet<String>,
+    has_seen_modules_header: bool,
+}
+
+impl GleamModuleCollector {
+    pub fn new() -> Self {
+        Self {
+            modules: BTreeSet::new(),
+            has_seen_modules_header: false,
+        }
+    }
+
+    fn parse_module(tag: &HtmlElement) -> Option<String> {
+        if tag.tag() != "a" {
+            return None;
+        }
+
+        let href = tag.attr("href")?;
+        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
+            return None;
+        }
+
+        let module_name = href.trim_start_matches("./").trim_end_matches(".html");
+
+        Some(module_name.to_owned())
+    }
+}
+
+impl HandleTag for GleamModuleCollector {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "h2" | "a" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag() {
+            "a" => {
+                if self.has_seen_modules_header && writer.is_inside("li") {
+                    if let Some(module_name) = Self::parse_module(tag) {
+                        self.modules.insert(module_name);
+                    }
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
+            self.has_seen_modules_header = true;
+        }
+
+        HandlerOutcome::NoOp
+    }
+}