1use std::cell::RefCell;
2use std::collections::BTreeSet;
3use std::io::{self, Read};
4use std::rc::Rc;
5
6use html_to_markdown::markdown::{
7 HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
8};
9use html_to_markdown::{
10 convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
11 StartTagOutcome, TagHandler,
12};
13use zed_extension_api::{self as zed, HttpMethod, HttpRequest, KeyValueStore, Result};
14
15pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
16 let headers = vec![(
17 "User-Agent".to_string(),
18 "Zed (Gleam Extension)".to_string(),
19 )];
20
21 let response = zed::fetch(&HttpRequest {
22 method: HttpMethod::Get,
23 url: format!("https://hexdocs.pm/{package}"),
24 headers: headers.clone(),
25 body: None,
26 })?;
27
28 let (package_root_markdown, modules) =
29 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
30
31 database.insert(&package, &package_root_markdown)?;
32
33 for module in modules {
34 let response = zed::fetch(&HttpRequest {
35 method: HttpMethod::Get,
36 url: format!("https://hexdocs.pm/{package}/{module}.html"),
37 headers: headers.clone(),
38 body: None,
39 })?;
40
41 let (markdown, _modules) =
42 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
43
44 database.insert(&format!("{module} ({package})"), &markdown)?;
45 }
46
47 Ok(())
48}
49
50pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
51 let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
52
53 let mut handlers: Vec<TagHandler> = vec![
54 module_collector.clone(),
55 Rc::new(RefCell::new(GleamChromeRemover)),
56 Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
57 Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
58 Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
59 Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
60 Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
61 ];
62
63 let markdown = convert_html_to_markdown(html, &mut handlers)
64 .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
65
66 let modules = module_collector
67 .borrow()
68 .modules
69 .iter()
70 .cloned()
71 .collect::<Vec<_>>();
72
73 Ok((markdown, modules))
74}
75
76/// A higher-order handler that skips all content from the `nav`.
77///
78/// We still need to traverse the `nav` for collecting information, but
79/// we don't want to include any of its content in the resulting Markdown.
80pub struct NavSkipper<T: HandleTag> {
81 handler: T,
82}
83
84impl<T: HandleTag> NavSkipper<T> {
85 pub fn new(handler: T) -> Self {
86 Self { handler }
87 }
88}
89
90impl<T: HandleTag> HandleTag for NavSkipper<T> {
91 fn should_handle(&self, tag: &str) -> bool {
92 tag == "nav" || self.handler.should_handle(tag)
93 }
94
95 fn handle_tag_start(
96 &mut self,
97 tag: &HtmlElement,
98 writer: &mut MarkdownWriter,
99 ) -> StartTagOutcome {
100 if writer.is_inside("nav") {
101 return StartTagOutcome::Continue;
102 }
103
104 self.handler.handle_tag_start(tag, writer)
105 }
106
107 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
108 if writer.is_inside("nav") {
109 return;
110 }
111
112 self.handler.handle_tag_end(tag, writer)
113 }
114
115 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
116 if writer.is_inside("nav") {
117 return HandlerOutcome::Handled;
118 }
119
120 self.handler.handle_text(text, writer)
121 }
122}
123
124pub struct GleamChromeRemover;
125
126impl HandleTag for GleamChromeRemover {
127 fn should_handle(&self, tag: &str) -> bool {
128 match tag {
129 "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
130 _ => false,
131 }
132 }
133
134 fn handle_tag_start(
135 &mut self,
136 tag: &HtmlElement,
137 _writer: &mut MarkdownWriter,
138 ) -> StartTagOutcome {
139 match tag.tag() {
140 "head" | "script" | "style" | "svg" | "header" | "footer" => {
141 return StartTagOutcome::Skip;
142 }
143 "a" => {
144 if tag.attr("onclick").is_some() {
145 return StartTagOutcome::Skip;
146 }
147 }
148 _ => {}
149 }
150
151 StartTagOutcome::Continue
152 }
153}
154
155pub struct GleamModuleCollector {
156 modules: BTreeSet<String>,
157 has_seen_modules_header: bool,
158}
159
160impl GleamModuleCollector {
161 pub fn new() -> Self {
162 Self {
163 modules: BTreeSet::new(),
164 has_seen_modules_header: false,
165 }
166 }
167
168 fn parse_module(tag: &HtmlElement) -> Option<String> {
169 if tag.tag() != "a" {
170 return None;
171 }
172
173 let href = tag.attr("href")?;
174 if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
175 return None;
176 }
177
178 let module_name = href.trim_start_matches("./").trim_end_matches(".html");
179
180 Some(module_name.to_owned())
181 }
182}
183
184impl HandleTag for GleamModuleCollector {
185 fn should_handle(&self, tag: &str) -> bool {
186 match tag {
187 "h2" | "a" => true,
188 _ => false,
189 }
190 }
191
192 fn handle_tag_start(
193 &mut self,
194 tag: &HtmlElement,
195 writer: &mut MarkdownWriter,
196 ) -> StartTagOutcome {
197 match tag.tag() {
198 "a" => {
199 if self.has_seen_modules_header && writer.is_inside("li") {
200 if let Some(module_name) = Self::parse_module(tag) {
201 self.modules.insert(module_name);
202 }
203 }
204 }
205 _ => {}
206 }
207
208 StartTagOutcome::Continue
209 }
210
211 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
212 if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
213 self.has_seen_modules_header = true;
214 }
215
216 HandlerOutcome::NoOp
217 }
218}