1use std::cell::RefCell;
2use std::collections::BTreeSet;
3use std::io::Read;
4use std::rc::Rc;
5
6use html_to_markdown::markdown::{
7 HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
8};
9use html_to_markdown::{
10 convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
11 StartTagOutcome, TagHandler,
12};
13use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
14
15pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
16 let response = zed::fetch(&HttpRequest {
17 url: format!("https://hexdocs.pm/{package}"),
18 })?;
19
20 let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
21
22 database.insert(&package, &package_root_markdown)?;
23
24 for module in modules {
25 let response = zed::fetch(&HttpRequest {
26 url: format!("https://hexdocs.pm/{package}/{module}.html"),
27 })?;
28
29 let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
30
31 database.insert(&format!("{module} ({package})"), &markdown)?;
32 }
33
34 Ok(())
35}
36
37pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
38 let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
39
40 let mut handlers: Vec<TagHandler> = vec![
41 module_collector.clone(),
42 Rc::new(RefCell::new(GleamChromeRemover)),
43 Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
44 Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
45 Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
46 Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
47 Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
48 ];
49
50 let markdown = convert_html_to_markdown(html, &mut handlers)
51 .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
52
53 let modules = module_collector
54 .borrow()
55 .modules
56 .iter()
57 .cloned()
58 .collect::<Vec<_>>();
59
60 Ok((markdown, modules))
61}
62
63/// A higher-order handler that skips all content from the `nav`.
64///
65/// We still need to traverse the `nav` for collecting information, but
66/// we don't want to include any of its content in the resulting Markdown.
67pub struct NavSkipper<T: HandleTag> {
68 handler: T,
69}
70
71impl<T: HandleTag> NavSkipper<T> {
72 pub fn new(handler: T) -> Self {
73 Self { handler }
74 }
75}
76
77impl<T: HandleTag> HandleTag for NavSkipper<T> {
78 fn should_handle(&self, tag: &str) -> bool {
79 tag == "nav" || self.handler.should_handle(tag)
80 }
81
82 fn handle_tag_start(
83 &mut self,
84 tag: &HtmlElement,
85 writer: &mut MarkdownWriter,
86 ) -> StartTagOutcome {
87 if writer.is_inside("nav") {
88 return StartTagOutcome::Continue;
89 }
90
91 self.handler.handle_tag_start(tag, writer)
92 }
93
94 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
95 if writer.is_inside("nav") {
96 return;
97 }
98
99 self.handler.handle_tag_end(tag, writer)
100 }
101
102 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
103 if writer.is_inside("nav") {
104 return HandlerOutcome::Handled;
105 }
106
107 self.handler.handle_text(text, writer)
108 }
109}
110
111pub struct GleamChromeRemover;
112
113impl HandleTag for GleamChromeRemover {
114 fn should_handle(&self, tag: &str) -> bool {
115 match tag {
116 "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
117 _ => false,
118 }
119 }
120
121 fn handle_tag_start(
122 &mut self,
123 tag: &HtmlElement,
124 _writer: &mut MarkdownWriter,
125 ) -> StartTagOutcome {
126 match tag.tag() {
127 "head" | "script" | "style" | "svg" | "header" | "footer" => {
128 return StartTagOutcome::Skip;
129 }
130 "a" => {
131 if tag.attr("onclick").is_some() {
132 return StartTagOutcome::Skip;
133 }
134 }
135 _ => {}
136 }
137
138 StartTagOutcome::Continue
139 }
140}
141
142pub struct GleamModuleCollector {
143 modules: BTreeSet<String>,
144 has_seen_modules_header: bool,
145}
146
147impl GleamModuleCollector {
148 pub fn new() -> Self {
149 Self {
150 modules: BTreeSet::new(),
151 has_seen_modules_header: false,
152 }
153 }
154
155 fn parse_module(tag: &HtmlElement) -> Option<String> {
156 if tag.tag() != "a" {
157 return None;
158 }
159
160 let href = tag.attr("href")?;
161 if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
162 return None;
163 }
164
165 let module_name = href.trim_start_matches("./").trim_end_matches(".html");
166
167 Some(module_name.to_owned())
168 }
169}
170
171impl HandleTag for GleamModuleCollector {
172 fn should_handle(&self, tag: &str) -> bool {
173 match tag {
174 "h2" | "a" => true,
175 _ => false,
176 }
177 }
178
179 fn handle_tag_start(
180 &mut self,
181 tag: &HtmlElement,
182 writer: &mut MarkdownWriter,
183 ) -> StartTagOutcome {
184 match tag.tag() {
185 "a" => {
186 if self.has_seen_modules_header && writer.is_inside("li") {
187 if let Some(module_name) = Self::parse_module(tag) {
188 self.modules.insert(module_name);
189 }
190 }
191 }
192 _ => {}
193 }
194
195 StartTagOutcome::Continue
196 }
197
198 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
199 if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
200 self.has_seen_modules_header = true;
201 }
202
203 HandlerOutcome::NoOp
204 }
205}