Cargo.lock 🔗
@@ -5072,10 +5072,12 @@ version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
+ "indexmap 1.9.3",
"indoc",
"markup5ever_rcdom",
"pretty_assertions",
"regex",
+ "strum",
]
[[package]]
Marshall Bowers created
This PR adds a tag handler for collecting crate items from rustdoc's
HTML output.
This will serve as the foundation for getting more insight into a
crate's contents.
Release Notes:
- N/A
Cargo.lock | 2
crates/assistant/src/slash_command/fetch_command.rs | 28 +-
crates/assistant/src/slash_command/rustdoc_command.rs | 14
crates/html_to_markdown/Cargo.toml | 2
crates/html_to_markdown/src/html_to_markdown.rs | 81 +++++--
crates/html_to_markdown/src/markdown_writer.rs | 34 +-
crates/html_to_markdown/src/structure/rustdoc.rs | 132 +++++++++++++
crates/html_to_markdown/src/structure/wikipedia.rs | 19 +
8 files changed, 237 insertions(+), 75 deletions(-)
@@ -5072,10 +5072,12 @@ version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
+ "indexmap 1.9.3",
"indoc",
"markup5ever_rcdom",
"pretty_assertions",
"regex",
+ "strum",
]
[[package]]
@@ -1,3 +1,5 @@
+use std::cell::RefCell;
+use std::rc::Rc;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
@@ -5,7 +7,7 @@ use anyhow::{anyhow, bail, Context, Result};
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
use futures::AsyncReadExt;
use gpui::{AppContext, Task, WeakView};
-use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag};
+use html_to_markdown::{convert_html_to_markdown, markdown, TagHandler};
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
use ui::{prelude::*, ButtonLike, ElevationIndex};
@@ -59,24 +61,26 @@ impl FetchSlashCommand {
match content_type {
ContentType::Html => {
- let mut handlers: Vec<Box<dyn HandleTag>> = vec![
- Box::new(markdown::ParagraphHandler),
- Box::new(markdown::HeadingHandler),
- Box::new(markdown::ListHandler),
- Box::new(markdown::TableHandler::new()),
- Box::new(markdown::StyledTextHandler),
+ let mut handlers: Vec<TagHandler> = vec![
+ Rc::new(RefCell::new(markdown::ParagraphHandler)),
+ Rc::new(RefCell::new(markdown::HeadingHandler)),
+ Rc::new(RefCell::new(markdown::ListHandler)),
+ Rc::new(RefCell::new(markdown::TableHandler::new())),
+ Rc::new(RefCell::new(markdown::StyledTextHandler)),
];
if url.contains("wikipedia.org") {
use html_to_markdown::structure::wikipedia;
- handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
- handlers.push(Box::new(wikipedia::WikipediaInfoboxHandler));
- handlers.push(Box::new(wikipedia::WikipediaCodeHandler::new()));
+ handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaChromeRemover)));
+ handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaInfoboxHandler)));
+ handlers.push(Rc::new(
+ RefCell::new(wikipedia::WikipediaCodeHandler::new()),
+ ));
} else {
- handlers.push(Box::new(markdown::CodeHandler));
+ handlers.push(Rc::new(RefCell::new(markdown::CodeHandler)));
}
- convert_html_to_markdown(&body[..], handlers)
+ convert_html_to_markdown(&body[..], &mut handlers)
}
ContentType::Plaintext => Ok(std::str::from_utf8(&body)?.to_owned()),
ContentType::Json => {
@@ -42,10 +42,9 @@ impl RustdocSlashCommand {
local_cargo_doc_path.push("index.html");
if let Ok(contents) = fs.load(&local_cargo_doc_path).await {
- return Ok((
- RustdocSource::Local,
- convert_rustdoc_to_markdown(contents.as_bytes())?,
- ));
+ let (markdown, _items) = convert_rustdoc_to_markdown(contents.as_bytes())?;
+
+ return Ok((RustdocSource::Local, markdown));
}
}
@@ -78,10 +77,9 @@ impl RustdocSlashCommand {
);
}
- Ok((
- RustdocSource::DocsDotRs,
- convert_rustdoc_to_markdown(&body[..])?,
- ))
+ let (markdown, _items) = convert_rustdoc_to_markdown(&body[..])?;
+
+ Ok((RustdocSource::DocsDotRs, markdown))
}
fn path_to_cargo_toml(project: Model<Project>, cx: &mut AppContext) -> Option<Arc<Path>> {
@@ -14,8 +14,10 @@ path = "src/html_to_markdown.rs"
[dependencies]
anyhow.workspace = true
html5ever.workspace = true
+indexmap.workspace = true
markup5ever_rcdom.workspace = true
regex.workspace = true
+strum.workspace = true
[dev-dependencies]
indoc.workspace = true
@@ -5,7 +5,9 @@ pub mod markdown;
mod markdown_writer;
pub mod structure;
+use std::cell::RefCell;
use std::io::Read;
+use std::rc::Rc;
use anyhow::{Context, Result};
use html5ever::driver::ParseOpts;
@@ -19,13 +21,11 @@ use crate::markdown::{
};
use crate::markdown_writer::MarkdownWriter;
-pub use crate::markdown_writer::HandleTag;
+pub use crate::markdown_writer::{HandleTag, TagHandler};
+use crate::structure::rustdoc::RustdocItem;
/// Converts the provided HTML to Markdown.
-pub fn convert_html_to_markdown(
- html: impl Read,
- handlers: Vec<Box<dyn HandleTag>>,
-) -> Result<String> {
+pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec<TagHandler>) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
let markdown_writer = MarkdownWriter::new();
@@ -37,21 +37,32 @@ pub fn convert_html_to_markdown(
}
/// Converts the provided rustdoc HTML to Markdown.
-pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
- convert_html_to_markdown(
- html,
- vec![
- Box::new(ParagraphHandler),
- Box::new(HeadingHandler),
- Box::new(ListHandler),
- Box::new(TableHandler::new()),
- Box::new(StyledTextHandler),
- Box::new(structure::rustdoc::RustdocChromeRemover),
- Box::new(structure::rustdoc::RustdocHeadingHandler),
- Box::new(structure::rustdoc::RustdocCodeHandler),
- Box::new(structure::rustdoc::RustdocItemHandler),
- ],
- )
+pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<RustdocItem>)> {
+ let item_collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new()));
+
+ let mut handlers: Vec<TagHandler> = vec![
+ Rc::new(RefCell::new(ParagraphHandler)),
+ Rc::new(RefCell::new(HeadingHandler)),
+ Rc::new(RefCell::new(ListHandler)),
+ Rc::new(RefCell::new(TableHandler::new())),
+ Rc::new(RefCell::new(StyledTextHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
+ item_collector.clone(),
+ ];
+
+ let markdown = convert_html_to_markdown(html, &mut handlers)?;
+
+ let items = item_collector
+ .borrow()
+ .items
+ .values()
+ .cloned()
+ .collect::<Vec<_>>();
+
+ Ok((markdown, items))
}
fn parse_html(mut html: impl Read) -> Result<RcDom> {
@@ -77,6 +88,20 @@ mod tests {
use super::*;
+ fn rustdoc_handlers() -> Vec<TagHandler> {
+ vec![
+ Rc::new(RefCell::new(ParagraphHandler)),
+ Rc::new(RefCell::new(HeadingHandler)),
+ Rc::new(RefCell::new(ListHandler)),
+ Rc::new(RefCell::new(TableHandler::new())),
+ Rc::new(RefCell::new(StyledTextHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
+ Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
+ ]
+ }
+
#[test]
fn test_main_heading_buttons_get_removed() {
let html = indoc! {r##"
@@ -93,7 +118,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -113,7 +138,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -159,7 +184,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -178,7 +203,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -220,7 +245,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -252,7 +277,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -288,7 +313,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -342,7 +367,7 @@ mod tests {
.trim();
assert_eq!(
- convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
expected
)
}
@@ -1,4 +1,6 @@
+use std::cell::RefCell;
use std::collections::VecDeque;
+use std::rc::Rc;
use std::sync::OnceLock;
use anyhow::Result;
@@ -22,6 +24,8 @@ pub enum StartTagOutcome {
Skip,
}
+pub type TagHandler = Rc<RefCell<dyn HandleTag>>;
+
pub struct MarkdownWriter {
current_element_stack: VecDeque<HtmlElement>,
pub(crate) markdown: String,
@@ -60,12 +64,8 @@ impl MarkdownWriter {
self.push_str("\n\n");
}
- pub fn run(
- mut self,
- root_node: &Handle,
- mut handlers: Vec<Box<dyn HandleTag>>,
- ) -> Result<String> {
- self.visit_node(&root_node, &mut handlers)?;
+ pub fn run(mut self, root_node: &Handle, handlers: &mut Vec<TagHandler>) -> Result<String> {
+ self.visit_node(&root_node, handlers)?;
Ok(Self::prettify_markdown(self.markdown))
}
@@ -76,7 +76,7 @@ impl MarkdownWriter {
markdown.trim().to_string()
}
- fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+ fn visit_node(&mut self, node: &Handle, handlers: &mut [TagHandler]) -> Result<()> {
let mut current_element = None;
match node.data {
@@ -128,14 +128,10 @@ impl MarkdownWriter {
Ok(())
}
- fn start_tag(
- &mut self,
- tag: &HtmlElement,
- handlers: &mut [Box<dyn HandleTag>],
- ) -> StartTagOutcome {
+ fn start_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) -> StartTagOutcome {
for handler in handlers {
- if handler.should_handle(tag.tag.as_str()) {
- match handler.handle_tag_start(tag, self) {
+ if handler.borrow().should_handle(tag.tag.as_str()) {
+ match handler.borrow_mut().handle_tag_start(tag, self) {
StartTagOutcome::Continue => {}
StartTagOutcome::Skip => return StartTagOutcome::Skip,
}
@@ -145,17 +141,17 @@ impl MarkdownWriter {
StartTagOutcome::Continue
}
- fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
+ fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [TagHandler]) {
for handler in handlers {
- if handler.should_handle(tag.tag.as_str()) {
- handler.handle_tag_end(tag, self);
+ if handler.borrow().should_handle(tag.tag.as_str()) {
+ handler.borrow_mut().handle_tag_end(tag, self);
}
}
}
- fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+ fn visit_text(&mut self, text: String, handlers: &mut [TagHandler]) -> Result<()> {
for handler in handlers {
- match handler.handle_text(&text, self) {
+ match handler.borrow_mut().handle_text(&text, self) {
HandlerOutcome::Handled => return Ok(()),
HandlerOutcome::NoOp => {}
}
@@ -1,3 +1,6 @@
+use indexmap::IndexMap;
+use strum::{EnumIter, IntoEnumIterator};
+
use crate::html_element::HtmlElement;
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
@@ -203,3 +206,132 @@ impl HandleTag for RustdocChromeRemover {
StartTagOutcome::Continue
}
}
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy, EnumIter)]
+pub enum RustdocItemKind {
+ Mod,
+ Macro,
+ Struct,
+ Enum,
+ Constant,
+ Trait,
+ Function,
+ TypeAlias,
+ AttributeMacro,
+ DeriveMacro,
+}
+
+impl RustdocItemKind {
+ const fn class(&self) -> &'static str {
+ match self {
+ Self::Mod => "mod",
+ Self::Macro => "macro",
+ Self::Struct => "struct",
+ Self::Enum => "enum",
+ Self::Constant => "constant",
+ Self::Trait => "trait",
+ Self::Function => "fn",
+ Self::TypeAlias => "type",
+ Self::AttributeMacro => "attr",
+ Self::DeriveMacro => "derive",
+ }
+ }
+}
+
+#[derive(Debug, Clone)]
+pub struct RustdocItem {
+ pub kind: RustdocItemKind,
+ pub name: String,
+}
+
+impl RustdocItem {
+ pub fn url_path(&self) -> String {
+ let name = &self.name;
+ match self.kind {
+ RustdocItemKind::Mod => format!("{name}/index.html"),
+ RustdocItemKind::Macro
+ | RustdocItemKind::Struct
+ | RustdocItemKind::Enum
+ | RustdocItemKind::Constant
+ | RustdocItemKind::Trait
+ | RustdocItemKind::Function
+ | RustdocItemKind::TypeAlias
+ | RustdocItemKind::AttributeMacro
+ | RustdocItemKind::DeriveMacro => {
+ format!("{kind}.{name}.html", kind = self.kind.class())
+ }
+ }
+ }
+}
+
+pub struct RustdocItemCollector {
+ pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+}
+
+impl RustdocItemCollector {
+ pub fn new() -> Self {
+ Self {
+ items: IndexMap::new(),
+ }
+ }
+
+ fn parse_item(tag: &HtmlElement) -> Option<RustdocItem> {
+ if tag.tag.as_str() != "a" {
+ return None;
+ }
+
+ let href = tag.attr("href")?;
+ if href == "#" {
+ return None;
+ }
+
+ for kind in RustdocItemKind::iter() {
+ if tag.has_class(kind.class()) {
+ let name = href
+ .trim_start_matches(&format!("{}.", kind.class()))
+ .trim_end_matches("/index.html")
+ .trim_end_matches(".html");
+
+ return Some(RustdocItem {
+ kind,
+ name: name.to_owned(),
+ });
+ }
+ }
+
+ None
+ }
+}
+
+impl HandleTag for RustdocItemCollector {
+ fn should_handle(&self, tag: &str) -> bool {
+ tag == "a"
+ }
+
+ fn handle_tag_start(
+ &mut self,
+ tag: &HtmlElement,
+ writer: &mut MarkdownWriter,
+ ) -> StartTagOutcome {
+ match tag.tag.as_str() {
+ "a" => {
+ let is_reexport = writer.current_element_stack().iter().any(|element| {
+ if let Some(id) = element.attr("id") {
+ id.starts_with("reexport.")
+ } else {
+ false
+ }
+ });
+
+ if !is_reexport {
+ if let Some(item) = Self::parse_item(tag) {
+ self.items.insert((item.kind, item.name.clone()), item);
+ }
+ }
+ }
+ _ => {}
+ }
+
+ StartTagOutcome::Continue
+ }
+}
@@ -144,20 +144,23 @@ impl HandleTag for WikipediaCodeHandler {
#[cfg(test)]
mod tests {
+ use std::cell::RefCell;
+ use std::rc::Rc;
+
use indoc::indoc;
use pretty_assertions::assert_eq;
- use crate::{convert_html_to_markdown, markdown};
+ use crate::{convert_html_to_markdown, markdown, TagHandler};
use super::*;
- fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
+ fn wikipedia_handlers() -> Vec<TagHandler> {
vec![
- Box::new(markdown::ParagraphHandler),
- Box::new(markdown::HeadingHandler),
- Box::new(markdown::ListHandler),
- Box::new(markdown::StyledTextHandler),
- Box::new(WikipediaChromeRemover),
+ Rc::new(RefCell::new(markdown::ParagraphHandler)),
+ Rc::new(RefCell::new(markdown::HeadingHandler)),
+ Rc::new(RefCell::new(markdown::ListHandler)),
+ Rc::new(RefCell::new(markdown::StyledTextHandler)),
+ Rc::new(RefCell::new(WikipediaChromeRemover)),
]
}
@@ -173,7 +176,7 @@ mod tests {
.trim();
assert_eq!(
- convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap(),
+ convert_html_to_markdown(html.as_bytes(), &mut wikipedia_handlers()).unwrap(),
expected
)
}