1use std::cell::RefCell;
2use std::collections::BTreeSet;
3use std::io::{self, Read};
4use std::rc::Rc;
5
6use html_to_markdown::markdown::{
7 HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
8};
9use html_to_markdown::{
10 convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
11 StartTagOutcome, TagHandler,
12};
13use zed_extension_api::{
14 self as zed, HttpMethod, HttpRequest, KeyValueStore, RedirectPolicy, Result,
15};
16
17pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
18 let headers = vec![(
19 "User-Agent".to_string(),
20 "Zed (Gleam Extension)".to_string(),
21 )];
22
23 let response = zed::fetch(&HttpRequest {
24 method: HttpMethod::Get,
25 url: format!("https://hexdocs.pm/{package}"),
26 headers: headers.clone(),
27 body: None,
28 redirect_policy: RedirectPolicy::FollowAll,
29 })?;
30
31 let (package_root_markdown, modules) =
32 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
33
34 database.insert(&package, &package_root_markdown)?;
35
36 for module in modules {
37 let response = zed::fetch(&HttpRequest {
38 method: HttpMethod::Get,
39 url: format!("https://hexdocs.pm/{package}/{module}.html"),
40 headers: headers.clone(),
41 body: None,
42 redirect_policy: RedirectPolicy::FollowAll,
43 })?;
44
45 let (markdown, _modules) =
46 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
47
48 database.insert(&format!("{module} ({package})"), &markdown)?;
49 }
50
51 Ok(())
52}
53
54pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
55 let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
56
57 let mut handlers: Vec<TagHandler> = vec![
58 module_collector.clone(),
59 Rc::new(RefCell::new(GleamChromeRemover)),
60 Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
61 Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
62 Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
63 Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
64 Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
65 ];
66
67 let markdown = convert_html_to_markdown(html, &mut handlers)
68 .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
69
70 let modules = module_collector
71 .borrow()
72 .modules
73 .iter()
74 .cloned()
75 .collect::<Vec<_>>();
76
77 Ok((markdown, modules))
78}
79
80/// A higher-order handler that skips all content from the `nav`.
81///
82/// We still need to traverse the `nav` for collecting information, but
83/// we don't want to include any of its content in the resulting Markdown.
84pub struct NavSkipper<T: HandleTag> {
85 handler: T,
86}
87
88impl<T: HandleTag> NavSkipper<T> {
89 pub fn new(handler: T) -> Self {
90 Self { handler }
91 }
92}
93
94impl<T: HandleTag> HandleTag for NavSkipper<T> {
95 fn should_handle(&self, tag: &str) -> bool {
96 tag == "nav" || self.handler.should_handle(tag)
97 }
98
99 fn handle_tag_start(
100 &mut self,
101 tag: &HtmlElement,
102 writer: &mut MarkdownWriter,
103 ) -> StartTagOutcome {
104 if writer.is_inside("nav") {
105 return StartTagOutcome::Continue;
106 }
107
108 self.handler.handle_tag_start(tag, writer)
109 }
110
111 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
112 if writer.is_inside("nav") {
113 return;
114 }
115
116 self.handler.handle_tag_end(tag, writer)
117 }
118
119 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
120 if writer.is_inside("nav") {
121 return HandlerOutcome::Handled;
122 }
123
124 self.handler.handle_text(text, writer)
125 }
126}
127
128pub struct GleamChromeRemover;
129
130impl HandleTag for GleamChromeRemover {
131 fn should_handle(&self, tag: &str) -> bool {
132 match tag {
133 "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
134 _ => false,
135 }
136 }
137
138 fn handle_tag_start(
139 &mut self,
140 tag: &HtmlElement,
141 _writer: &mut MarkdownWriter,
142 ) -> StartTagOutcome {
143 match tag.tag() {
144 "head" | "script" | "style" | "svg" | "header" | "footer" => {
145 return StartTagOutcome::Skip;
146 }
147 "a" => {
148 if tag.attr("onclick").is_some() {
149 return StartTagOutcome::Skip;
150 }
151 }
152 _ => {}
153 }
154
155 StartTagOutcome::Continue
156 }
157}
158
159pub struct GleamModuleCollector {
160 modules: BTreeSet<String>,
161 has_seen_modules_header: bool,
162}
163
164impl GleamModuleCollector {
165 pub fn new() -> Self {
166 Self {
167 modules: BTreeSet::new(),
168 has_seen_modules_header: false,
169 }
170 }
171
172 fn parse_module(tag: &HtmlElement) -> Option<String> {
173 if tag.tag() != "a" {
174 return None;
175 }
176
177 let href = tag.attr("href")?;
178 if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
179 return None;
180 }
181
182 let module_name = href.trim_start_matches("./").trim_end_matches(".html");
183
184 Some(module_name.to_owned())
185 }
186}
187
188impl HandleTag for GleamModuleCollector {
189 fn should_handle(&self, tag: &str) -> bool {
190 match tag {
191 "h2" | "a" => true,
192 _ => false,
193 }
194 }
195
196 fn handle_tag_start(
197 &mut self,
198 tag: &HtmlElement,
199 writer: &mut MarkdownWriter,
200 ) -> StartTagOutcome {
201 match tag.tag() {
202 "a" => {
203 if self.has_seen_modules_header && writer.is_inside("li") {
204 if let Some(module_name) = Self::parse_module(tag) {
205 self.modules.insert(module_name);
206 }
207 }
208 }
209 _ => {}
210 }
211
212 StartTagOutcome::Continue
213 }
214
215 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
216 if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
217 self.has_seen_modules_header = true;
218 }
219
220 HandlerOutcome::NoOp
221 }
222}