1use std::cell::RefCell;
2use std::collections::BTreeSet;
3use std::io::{self, Read};
4use std::rc::Rc;
5
6use html_to_markdown::markdown::{
7 HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
8};
9use html_to_markdown::{
10 convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
11 StartTagOutcome, TagHandler,
12};
13use zed_extension_api::{
14 http_client::{HttpMethod, HttpRequest, RedirectPolicy},
15 KeyValueStore, Result,
16};
17
18pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
19 let headers = vec![(
20 "User-Agent".to_string(),
21 "Zed (Gleam Extension)".to_string(),
22 )];
23
24 let response = HttpRequest::builder()
25 .method(HttpMethod::Get)
26 .url(format!("https://hexdocs.pm/{package}"))
27 .headers(headers.clone())
28 .redirect_policy(RedirectPolicy::FollowAll)
29 .build()?
30 .fetch()?;
31
32 let (package_root_markdown, modules) =
33 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
34
35 database.insert(&package, &package_root_markdown)?;
36
37 for module in modules {
38 let response = HttpRequest::builder()
39 .method(HttpMethod::Get)
40 .url(format!("https://hexdocs.pm/{package}/{module}.html"))
41 .headers(headers.clone())
42 .redirect_policy(RedirectPolicy::FollowAll)
43 .build()?
44 .fetch()?;
45
46 let (markdown, _modules) =
47 convert_hexdocs_to_markdown(&mut io::Cursor::new(&response.body))?;
48
49 database.insert(&format!("{module} ({package})"), &markdown)?;
50 }
51
52 Ok(())
53}
54
55pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
56 let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
57
58 let mut handlers: Vec<TagHandler> = vec![
59 module_collector.clone(),
60 Rc::new(RefCell::new(GleamChromeRemover)),
61 Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
62 Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
63 Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
64 Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
65 Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
66 ];
67
68 let markdown = convert_html_to_markdown(html, &mut handlers)
69 .map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
70
71 let modules = module_collector
72 .borrow()
73 .modules
74 .iter()
75 .cloned()
76 .collect::<Vec<_>>();
77
78 Ok((markdown, modules))
79}
80
81/// A higher-order handler that skips all content from the `nav`.
82///
83/// We still need to traverse the `nav` for collecting information, but
84/// we don't want to include any of its content in the resulting Markdown.
85pub struct NavSkipper<T: HandleTag> {
86 handler: T,
87}
88
89impl<T: HandleTag> NavSkipper<T> {
90 pub fn new(handler: T) -> Self {
91 Self { handler }
92 }
93}
94
95impl<T: HandleTag> HandleTag for NavSkipper<T> {
96 fn should_handle(&self, tag: &str) -> bool {
97 tag == "nav" || self.handler.should_handle(tag)
98 }
99
100 fn handle_tag_start(
101 &mut self,
102 tag: &HtmlElement,
103 writer: &mut MarkdownWriter,
104 ) -> StartTagOutcome {
105 if writer.is_inside("nav") {
106 return StartTagOutcome::Continue;
107 }
108
109 self.handler.handle_tag_start(tag, writer)
110 }
111
112 fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
113 if writer.is_inside("nav") {
114 return;
115 }
116
117 self.handler.handle_tag_end(tag, writer)
118 }
119
120 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
121 if writer.is_inside("nav") {
122 return HandlerOutcome::Handled;
123 }
124
125 self.handler.handle_text(text, writer)
126 }
127}
128
129pub struct GleamChromeRemover;
130
131impl HandleTag for GleamChromeRemover {
132 fn should_handle(&self, tag: &str) -> bool {
133 match tag {
134 "head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
135 _ => false,
136 }
137 }
138
139 fn handle_tag_start(
140 &mut self,
141 tag: &HtmlElement,
142 _writer: &mut MarkdownWriter,
143 ) -> StartTagOutcome {
144 match tag.tag() {
145 "head" | "script" | "style" | "svg" | "header" | "footer" => {
146 return StartTagOutcome::Skip;
147 }
148 "a" => {
149 if tag.attr("onclick").is_some() {
150 return StartTagOutcome::Skip;
151 }
152 }
153 _ => {}
154 }
155
156 StartTagOutcome::Continue
157 }
158}
159
160pub struct GleamModuleCollector {
161 modules: BTreeSet<String>,
162 has_seen_modules_header: bool,
163}
164
165impl GleamModuleCollector {
166 pub fn new() -> Self {
167 Self {
168 modules: BTreeSet::new(),
169 has_seen_modules_header: false,
170 }
171 }
172
173 fn parse_module(tag: &HtmlElement) -> Option<String> {
174 if tag.tag() != "a" {
175 return None;
176 }
177
178 let href = tag.attr("href")?;
179 if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
180 return None;
181 }
182
183 let module_name = href.trim_start_matches("./").trim_end_matches(".html");
184
185 Some(module_name.to_owned())
186 }
187}
188
189impl HandleTag for GleamModuleCollector {
190 fn should_handle(&self, tag: &str) -> bool {
191 match tag {
192 "h2" | "a" => true,
193 _ => false,
194 }
195 }
196
197 fn handle_tag_start(
198 &mut self,
199 tag: &HtmlElement,
200 writer: &mut MarkdownWriter,
201 ) -> StartTagOutcome {
202 match tag.tag() {
203 "a" => {
204 if self.has_seen_modules_header && writer.is_inside("li") {
205 if let Some(module_name) = Self::parse_module(tag) {
206 self.modules.insert(module_name);
207 }
208 }
209 }
210 _ => {}
211 }
212
213 StartTagOutcome::Continue
214 }
215
216 fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
217 if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
218 self.has_seen_modules_header = true;
219 }
220
221 HandlerOutcome::NoOp
222 }
223}