From 2d9479667f5b100c1769f76cad0e4ca0a3cf1163 Mon Sep 17 00:00:00 2001 From: Marshall Bowers Date: Tue, 4 Jun 2024 16:14:26 -0400 Subject: [PATCH] Make HTML to Markdown conversion more pluggable (#12653) This PR overhauls the HTML to Markdown conversion functionality in order to make it more pluggable. This will ultimately allow for supporting a variety of different HTML input structures (both natively and via extensions). As part of this, the `rustdoc_to_markdown` crate has been renamed to `html_to_markdown`. The `MarkdownWriter` now accepts a list of trait objects that can be used to drive the conversion of the HTML into Markdown. Right now we have some generic handler implementations for going from plain HTML elements to their Markdown equivalents, as well as some rustdoc-specific ones. Release Notes: - N/A --- Cargo.lock | 26 +- Cargo.toml | 4 +- crates/assistant/Cargo.toml | 2 +- .../src/slash_command/fetch_command.rs | 2 +- .../src/slash_command/rustdoc_command.rs | 2 +- .../Cargo.toml | 4 +- .../LICENSE-GPL | 0 .../examples/test.rs | 2 +- .../src/html_element.rs | 0 .../src/html_to_markdown.rs} | 33 +- crates/html_to_markdown/src/markdown.rs | 135 ++++++++ .../html_to_markdown/src/markdown_writer.rs | 198 ++++++++++++ crates/html_to_markdown/src/structure.rs | 1 + .../html_to_markdown/src/structure/rustdoc.rs | 286 +++++++++++++++++ .../src/markdown_writer.rs | 296 ------------------ 15 files changed, 671 insertions(+), 320 deletions(-) rename crates/{rustdoc_to_markdown => html_to_markdown}/Cargo.toml (83%) rename crates/{rustdoc_to_markdown => html_to_markdown}/LICENSE-GPL (100%) rename crates/{rustdoc_to_markdown => html_to_markdown}/examples/test.rs (92%) rename crates/{rustdoc_to_markdown => html_to_markdown}/src/html_element.rs (100%) rename crates/{rustdoc_to_markdown/src/rustdoc_to_markdown.rs => html_to_markdown/src/html_to_markdown.rs} (93%) create mode 100644 crates/html_to_markdown/src/markdown.rs create mode 100644 crates/html_to_markdown/src/markdown_writer.rs create mode 100644 crates/html_to_markdown/src/structure.rs create mode 100644 crates/html_to_markdown/src/structure/rustdoc.rs delete mode 100644 crates/rustdoc_to_markdown/src/markdown_writer.rs diff --git a/Cargo.lock b/Cargo.lock index fb6ad1f833c076a597d8ae47257b7a48a5f89773..a9394d1ed1e5f0c5efcec5af046eb5ef90e4b0dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -353,6 +353,7 @@ dependencies = [ "fuzzy", "gpui", "heed", + "html_to_markdown", "http 0.1.0", "indoc", "language", @@ -367,7 +368,6 @@ dependencies = [ "rand 0.8.5", "regex", "rope", - "rustdoc_to_markdown", "schemars", "search", "semantic_index", @@ -5067,6 +5067,18 @@ dependencies = [ "syn 2.0.59", ] +[[package]] +name = "html_to_markdown" +version = "0.1.0" +dependencies = [ + "anyhow", + "html5ever", + "indoc", + "markup5ever_rcdom", + "pretty_assertions", + "regex", +] + [[package]] name = "http" version = "0.1.0" @@ -8618,18 +8630,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rustdoc_to_markdown" -version = "0.1.0" -dependencies = [ - "anyhow", - "html5ever", - "indoc", - "markup5ever_rcdom", - "pretty_assertions", - "regex", -] - [[package]] name = "rustix" version = "0.37.23" diff --git a/Cargo.toml b/Cargo.toml index 7c3cf3762b206e620182c7cd9980de6ec18e9ad1..cb58423a818a52dbca5b5479e8033bf4d66f659c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ members = [ "crates/gpui", "crates/gpui_macros", "crates/headless", + "crates/html_to_markdown", "crates/http", "crates/image_viewer", "crates/inline_completion_button", @@ -76,7 +77,6 @@ members = [ "crates/rich_text", "crates/rope", "crates/rpc", - "crates/rustdoc_to_markdown", "crates/task", "crates/tasks_ui", "crates/search", @@ -187,6 +187,7 @@ google_ai = { path = "crates/google_ai" } gpui = { path = "crates/gpui" } gpui_macros = { path = "crates/gpui_macros" } headless = { path = "crates/headless" } +html_to_markdown = { path = "crates/html_to_markdown" } http = { path = "crates/http" } install_cli = { path = "crates/install_cli" } image_viewer = { path = "crates/image_viewer" } @@ -223,7 +224,6 @@ dev_server_projects = { path = "crates/dev_server_projects" } rich_text = { path = "crates/rich_text" } rope = { path = "crates/rope" } rpc = { path = "crates/rpc" } -rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" } task = { path = "crates/task" } tasks_ui = { path = "crates/tasks_ui" } search = { path = "crates/search" } diff --git a/crates/assistant/Cargo.toml b/crates/assistant/Cargo.toml index 029da5d553c048c4d10f236da46e8191ec6e2e59..06df24d69a3912fffe296d1b266ea430bde012bd 100644 --- a/crates/assistant/Cargo.toml +++ b/crates/assistant/Cargo.toml @@ -28,6 +28,7 @@ futures.workspace = true fuzzy.workspace = true gpui.workspace = true heed.workspace = true +html_to_markdown.workspace = true http.workspace = true indoc.workspace = true language.workspace = true @@ -40,7 +41,6 @@ parking_lot.workspace = true project.workspace = true regex.workspace = true rope.workspace = true -rustdoc_to_markdown.workspace = true schemars.workspace = true search.workspace = true semantic_index.workspace = true diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs index 6bd870c1b6d3e36f14c0a3f39eb6e5589e42f9b4..37483cbb1ae89d70dab37a0f5824c8486ed357ca 100644 --- a/crates/assistant/src/slash_command/fetch_command.rs +++ b/crates/assistant/src/slash_command/fetch_command.rs @@ -5,9 +5,9 @@ use anyhow::{anyhow, bail, Context, Result}; use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection}; use futures::AsyncReadExt; use gpui::{AppContext, Task, WeakView}; +use html_to_markdown::convert_html_to_markdown; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; -use rustdoc_to_markdown::convert_html_to_markdown; use ui::{prelude::*, ButtonLike, ElevationIndex}; use workspace::Workspace; diff --git a/crates/assistant/src/slash_command/rustdoc_command.rs b/crates/assistant/src/slash_command/rustdoc_command.rs index 85015798b2b4b8d66e6c8b6d71beb73d4fd956d2..cf48dc28dcc88dff65bb9e9531fc80101665bd14 100644 --- a/crates/assistant/src/slash_command/rustdoc_command.rs +++ b/crates/assistant/src/slash_command/rustdoc_command.rs @@ -7,10 +7,10 @@ use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutp use fs::Fs; use futures::AsyncReadExt; use gpui::{AppContext, Model, Task, WeakView}; +use html_to_markdown::convert_rustdoc_to_markdown; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; use project::{Project, ProjectPath}; -use rustdoc_to_markdown::convert_rustdoc_to_markdown; use ui::{prelude::*, ButtonLike, ElevationIndex}; use workspace::Workspace; diff --git a/crates/rustdoc_to_markdown/Cargo.toml b/crates/html_to_markdown/Cargo.toml similarity index 83% rename from crates/rustdoc_to_markdown/Cargo.toml rename to crates/html_to_markdown/Cargo.toml index 58a60bc7bf330ecf8b1f05b65a128ffb16a3356a..bac60ef9a638575479903b78fe3f54e889bdb31e 100644 --- a/crates/rustdoc_to_markdown/Cargo.toml +++ b/crates/html_to_markdown/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "rustdoc_to_markdown" +name = "html_to_markdown" version = "0.1.0" edition = "2021" publish = false @@ -9,7 +9,7 @@ license = "GPL-3.0-or-later" workspace = true [lib] -path = "src/rustdoc_to_markdown.rs" +path = "src/html_to_markdown.rs" [dependencies] anyhow.workspace = true diff --git a/crates/rustdoc_to_markdown/LICENSE-GPL b/crates/html_to_markdown/LICENSE-GPL similarity index 100% rename from crates/rustdoc_to_markdown/LICENSE-GPL rename to crates/html_to_markdown/LICENSE-GPL diff --git a/crates/rustdoc_to_markdown/examples/test.rs b/crates/html_to_markdown/examples/test.rs similarity index 92% rename from crates/rustdoc_to_markdown/examples/test.rs rename to crates/html_to_markdown/examples/test.rs index 38a85874dfd1f337c51c7f3bbd089f6dc7dab0c8..3937a7b3b3d123856866fdac30563131fe0df087 100644 --- a/crates/rustdoc_to_markdown/examples/test.rs +++ b/crates/html_to_markdown/examples/test.rs @@ -1,5 +1,5 @@ +use html_to_markdown::convert_rustdoc_to_markdown; use indoc::indoc; -use rustdoc_to_markdown::convert_rustdoc_to_markdown; pub fn main() { let html = indoc! {" diff --git a/crates/rustdoc_to_markdown/src/html_element.rs b/crates/html_to_markdown/src/html_element.rs similarity index 100% rename from crates/rustdoc_to_markdown/src/html_element.rs rename to crates/html_to_markdown/src/html_element.rs diff --git a/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs b/crates/html_to_markdown/src/html_to_markdown.rs similarity index 93% rename from crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs rename to crates/html_to_markdown/src/html_to_markdown.rs index 05d0b531289af8a2df4fd5e6bee7abc375fcd72c..34eec2b00181dd2bc3ab57b7492c098f33a32ad9 100644 --- a/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs +++ b/crates/html_to_markdown/src/html_to_markdown.rs @@ -3,7 +3,9 @@ #![deny(missing_docs)] mod html_element; +mod markdown; mod markdown_writer; +mod structure; use std::io::Read; @@ -14,15 +16,28 @@ use html5ever::tendril::TendrilSink; use html5ever::tree_builder::TreeBuilderOpts; use markup5ever_rcdom::RcDom; -use crate::markdown_writer::MarkdownWriter; +use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler}; +use crate::markdown_writer::{HandleTag, MarkdownWriter}; /// Converts the provided HTML to Markdown. pub fn convert_html_to_markdown(html: impl Read) -> Result { let dom = parse_html(html).context("failed to parse HTML")?; + let handlers: Vec> = vec![ + Box::new(ParagraphHandler), + Box::new(HeadingHandler), + Box::new(ListHandler), + Box::new(StyledTextHandler), + Box::new(structure::rustdoc::RustdocChromeRemover), + Box::new(structure::rustdoc::RustdocHeadingHandler), + Box::new(structure::rustdoc::RustdocCodeHandler), + Box::new(structure::rustdoc::RustdocTableHandler::new()), + Box::new(structure::rustdoc::RustdocItemHandler), + ]; + let markdown_writer = MarkdownWriter::new(); let markdown = markdown_writer - .run(&dom.document) + .run(&dom.document, handlers) .context("failed to convert HTML to Markdown")?; Ok(markdown) @@ -32,9 +47,21 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result { pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result { let dom = parse_html(html).context("failed to parse rustdoc HTML")?; + let handlers: Vec> = vec![ + Box::new(ParagraphHandler), + Box::new(HeadingHandler), + Box::new(ListHandler), + Box::new(StyledTextHandler), + Box::new(structure::rustdoc::RustdocChromeRemover), + Box::new(structure::rustdoc::RustdocHeadingHandler), + Box::new(structure::rustdoc::RustdocCodeHandler), + Box::new(structure::rustdoc::RustdocTableHandler::new()), + Box::new(structure::rustdoc::RustdocItemHandler), + ]; + let markdown_writer = MarkdownWriter::new(); let markdown = markdown_writer - .run(&dom.document) + .run(&dom.document, handlers) .context("failed to convert rustdoc HTML to Markdown")?; Ok(markdown) diff --git a/crates/html_to_markdown/src/markdown.rs b/crates/html_to_markdown/src/markdown.rs new file mode 100644 index 0000000000000000000000000000000000000000..f6af5794b5ca44f1f00e690138e6ef7861ad28bf --- /dev/null +++ b/crates/html_to_markdown/src/markdown.rs @@ -0,0 +1,135 @@ +use crate::html_element::HtmlElement; +use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome}; + +pub struct ParagraphHandler; + +impl HandleTag for ParagraphHandler { + fn should_handle(&self, _tag: &str) -> bool { + true + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + if tag.is_inline() && writer.is_inside("p") { + if let Some(parent) = writer.current_element_stack().iter().last() { + if !parent.is_inline() { + if !(writer.markdown.ends_with(' ') || writer.markdown.ends_with('\n')) { + writer.push_str(" "); + } + } + } + } + + match tag.tag.as_str() { + "p" => writer.push_blank_line(), + _ => {} + } + + StartTagOutcome::Continue + } +} + +pub struct HeadingHandler; + +impl HandleTag for HeadingHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "h1" => writer.push_str("\n\n# "), + "h2" => writer.push_str("\n\n## "), + "h3" => writer.push_str("\n\n### "), + "h4" => writer.push_str("\n\n#### "), + "h5" => writer.push_str("\n\n##### "), + "h6" => writer.push_str("\n\n###### "), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => writer.push_blank_line(), + _ => {} + } + } +} + +pub struct ListHandler; + +impl HandleTag for ListHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "ul" | "ol" | "li" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "ul" | "ol" => writer.push_newline(), + "li" => writer.push_str("- "), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "ul" | "ol" => writer.push_newline(), + "li" => writer.push_newline(), + _ => {} + } + } +} + +pub struct StyledTextHandler; + +impl HandleTag for StyledTextHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "strong" | "em" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "strong" => writer.push_str("**"), + "em" => writer.push_str("_"), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "strong" => writer.push_str("**"), + "em" => writer.push_str("_"), + _ => {} + } + } +} diff --git a/crates/html_to_markdown/src/markdown_writer.rs b/crates/html_to_markdown/src/markdown_writer.rs new file mode 100644 index 0000000000000000000000000000000000000000..436f895d7e25d1e5f96ee3a5cf1ab4429cab5dab --- /dev/null +++ b/crates/html_to_markdown/src/markdown_writer.rs @@ -0,0 +1,198 @@ +use std::collections::VecDeque; +use std::sync::OnceLock; + +use anyhow::Result; +use markup5ever_rcdom::{Handle, NodeData}; +use regex::Regex; + +use crate::html_element::HtmlElement; + +fn empty_line_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap()) +} + +fn more_than_three_newlines_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap()) +} + +pub enum StartTagOutcome { + Continue, + Skip, +} + +pub struct MarkdownWriter { + current_element_stack: VecDeque, + pub(crate) markdown: String, +} + +impl MarkdownWriter { + pub fn new() -> Self { + Self { + current_element_stack: VecDeque::new(), + markdown: String::new(), + } + } + + pub fn current_element_stack(&self) -> &VecDeque { + &self.current_element_stack + } + + pub fn is_inside(&self, tag: &str) -> bool { + self.current_element_stack + .iter() + .any(|parent_element| parent_element.tag == tag) + } + + /// Appends the given string slice onto the end of the Markdown output. + pub fn push_str(&mut self, str: &str) { + self.markdown.push_str(str); + } + + /// Appends a newline to the end of the Markdown output. + pub fn push_newline(&mut self) { + self.push_str("\n"); + } + + /// Appends a blank line to the end of the Markdown output. + pub fn push_blank_line(&mut self) { + self.push_str("\n\n"); + } + + pub fn run( + mut self, + root_node: &Handle, + mut handlers: Vec>, + ) -> Result { + self.visit_node(&root_node, &mut handlers)?; + Ok(Self::prettify_markdown(self.markdown)) + } + + fn prettify_markdown(markdown: String) -> String { + let markdown = empty_line_regex().replace_all(&markdown, ""); + let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n"); + + markdown.trim().to_string() + } + + fn visit_node(&mut self, node: &Handle, handlers: &mut [Box]) -> Result<()> { + let mut current_element = None; + + match node.data { + NodeData::Document + | NodeData::Doctype { .. } + | NodeData::ProcessingInstruction { .. } + | NodeData::Comment { .. } => { + // Currently left unimplemented, as we're not interested in this data + // at this time. + } + NodeData::Element { + ref name, + ref attrs, + .. + } => { + let tag_name = name.local.to_string(); + if !tag_name.is_empty() { + current_element = Some(HtmlElement { + tag: tag_name, + attrs: attrs.clone(), + }); + } + } + NodeData::Text { ref contents } => { + let text = contents.borrow().to_string(); + self.visit_text(text, handlers)?; + } + } + + if let Some(current_element) = current_element.as_ref() { + match self.start_tag(¤t_element, handlers) { + StartTagOutcome::Continue => {} + StartTagOutcome::Skip => return Ok(()), + } + + self.current_element_stack + .push_back(current_element.clone()); + } + + for child in node.children.borrow().iter() { + self.visit_node(child, handlers)?; + } + + if let Some(current_element) = current_element { + self.current_element_stack.pop_back(); + self.end_tag(¤t_element, handlers); + } + + Ok(()) + } + + fn start_tag( + &mut self, + tag: &HtmlElement, + handlers: &mut [Box], + ) -> StartTagOutcome { + for handler in handlers { + if handler.should_handle(tag.tag.as_str()) { + match handler.handle_tag_start(tag, self) { + StartTagOutcome::Continue => {} + StartTagOutcome::Skip => return StartTagOutcome::Skip, + } + } + } + + StartTagOutcome::Continue + } + + fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box]) { + for handler in handlers { + if handler.should_handle(tag.tag.as_str()) { + handler.handle_tag_end(tag, self); + } + } + } + + fn visit_text(&mut self, text: String, handlers: &mut [Box]) -> Result<()> { + for handler in handlers { + match handler.handle_text(&text, self) { + HandlerOutcome::Handled => return Ok(()), + HandlerOutcome::NoOp => {} + } + } + + let text = text + .trim_matches(|char| char == '\n' || char == '\r') + .replace('\n', " "); + + self.push_str(&text); + + Ok(()) + } +} + +pub enum HandlerOutcome { + Handled, + NoOp, +} + +pub trait HandleTag { + /// Returns whether this handler should handle the given tag. + fn should_handle(&self, tag: &str) -> bool; + + /// Handles the start of the given tag. + fn handle_tag_start( + &mut self, + _tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + StartTagOutcome::Continue + } + + /// Handles the end of the given tag. + fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {} + + fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome { + HandlerOutcome::NoOp + } +} diff --git a/crates/html_to_markdown/src/structure.rs b/crates/html_to_markdown/src/structure.rs new file mode 100644 index 0000000000000000000000000000000000000000..c6505a2ab667724c83d3f7bed3c1ca16a1423bc5 --- /dev/null +++ b/crates/html_to_markdown/src/structure.rs @@ -0,0 +1 @@ +pub mod rustdoc; diff --git a/crates/html_to_markdown/src/structure/rustdoc.rs b/crates/html_to_markdown/src/structure/rustdoc.rs new file mode 100644 index 0000000000000000000000000000000000000000..b1ae7d2425a56ae20d84572f21b34ba69b9bf041 --- /dev/null +++ b/crates/html_to_markdown/src/structure/rustdoc.rs @@ -0,0 +1,286 @@ +use crate::html_element::HtmlElement; +use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome}; + +pub struct RustdocHeadingHandler; + +impl HandleTag for RustdocHeadingHandler { + fn should_handle(&self, _tag: &str) -> bool { + // We're only handling text, so we don't need to visit any tags. + false + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("h1") + || writer.is_inside("h2") + || writer.is_inside("h3") + || writer.is_inside("h4") + || writer.is_inside("h5") + || writer.is_inside("h6") + { + let text = text + .trim_matches(|char| char == '\n' || char == '\r' || char == '§') + .replace('\n', " "); + writer.push_str(&text); + + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocCodeHandler; + +impl HandleTag for RustdocCodeHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "pre" | "code" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => { + let classes = tag.classes(); + let is_rust = classes.iter().any(|class| class == "rust"); + let language = is_rust + .then(|| "rs") + .or_else(|| { + classes.iter().find_map(|class| { + if let Some((_, language)) = class.split_once("language-") { + Some(language.trim()) + } else { + None + } + }) + }) + .unwrap_or(""); + + writer.push_str(&format!("\n\n```{language}\n")); + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => writer.push_str("\n```\n"), + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("pre") { + writer.push_str(&text); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocTableHandler { + /// The number of columns in the current ``. + current_table_columns: usize, + is_first_th: bool, + is_first_td: bool, +} + +impl RustdocTableHandler { + pub fn new() -> Self { + Self { + current_table_columns: 0, + is_first_th: true, + is_first_td: true, + } + } +} + +impl HandleTag for RustdocTableHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "table" | "thead" | "tbody" | "tr" | "th" | "td" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "thead" => writer.push_blank_line(), + "tr" => writer.push_newline(), + "th" => { + self.current_table_columns += 1; + if self.is_first_th { + self.is_first_th = false; + } else { + writer.push_str(" "); + } + writer.push_str("| "); + } + "td" => { + if self.is_first_td { + self.is_first_td = false; + } else { + writer.push_str(" "); + } + writer.push_str("| "); + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "thead" => { + writer.push_newline(); + for ix in 0..self.current_table_columns { + if ix > 0 { + writer.push_str(" "); + } + writer.push_str("| ---"); + } + writer.push_str(" |"); + self.is_first_th = true; + } + "tr" => { + writer.push_str(" |"); + self.is_first_td = true; + } + "table" => { + self.current_table_columns = 0; + } + _ => {} + } + } +} + +const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name"; + +pub struct RustdocItemHandler; + +impl RustdocItemHandler { + /// Returns whether we're currently inside of an `.item-name` element, which + /// rustdoc uses to display Rust items in a list. + fn is_inside_item_name(writer: &MarkdownWriter) -> bool { + writer + .current_element_stack() + .iter() + .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS)) + } +} + +impl HandleTag for RustdocItemHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "div" | "span" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "div" | "span" => { + if Self::is_inside_item_name(writer) && tag.has_class("stab") { + writer.push_str(" ["); + } + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "div" | "span" => { + if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) { + writer.push_str(": "); + } + + if Self::is_inside_item_name(writer) && tag.has_class("stab") { + writer.push_str("]"); + } + } + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if Self::is_inside_item_name(writer) + && !writer.is_inside("span") + && !writer.is_inside("code") + { + writer.push_str(&format!("`{text}`")); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocChromeRemover; + +impl HandleTag for RustdocChromeRemover { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "head" | "script" | "nav" | "summary" | "button" | "div" | "span" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "head" | "script" | "nav" => return StartTagOutcome::Skip, + "summary" => { + if tag.has_class("hideme") { + return StartTagOutcome::Skip; + } + } + "button" => { + if tag.attr("id").as_deref() == Some("copy-path") { + return StartTagOutcome::Skip; + } + } + "div" | "span" => { + let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"]; + if tag.has_any_classes(&classes_to_skip) { + return StartTagOutcome::Skip; + } + } + _ => {} + } + + StartTagOutcome::Continue + } +} diff --git a/crates/rustdoc_to_markdown/src/markdown_writer.rs b/crates/rustdoc_to_markdown/src/markdown_writer.rs deleted file mode 100644 index bafac18a33e6feed4b4ba8cf343f971194e453c8..0000000000000000000000000000000000000000 --- a/crates/rustdoc_to_markdown/src/markdown_writer.rs +++ /dev/null @@ -1,296 +0,0 @@ -use std::collections::VecDeque; -use std::sync::OnceLock; - -use anyhow::Result; -use markup5ever_rcdom::{Handle, NodeData}; -use regex::Regex; - -use crate::html_element::HtmlElement; - -fn empty_line_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap()) -} - -fn more_than_three_newlines_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap()) -} - -const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name"; - -enum StartTagOutcome { - Continue, - Skip, -} - -pub struct MarkdownWriter { - current_element_stack: VecDeque, - /// The number of columns in the current `
`. - current_table_columns: usize, - is_first_th: bool, - is_first_td: bool, - /// The Markdown output. - markdown: String, -} - -impl MarkdownWriter { - pub fn new() -> Self { - Self { - current_element_stack: VecDeque::new(), - current_table_columns: 0, - is_first_th: true, - is_first_td: true, - markdown: String::new(), - } - } - - fn is_inside(&self, tag: &str) -> bool { - self.current_element_stack - .iter() - .any(|parent_element| parent_element.tag == tag) - } - - /// Appends the given string slice onto the end of the Markdown output. - fn push_str(&mut self, str: &str) { - self.markdown.push_str(str); - } - - /// Appends a newline to the end of the Markdown output. - fn push_newline(&mut self) { - self.push_str("\n"); - } - - /// Appends a blank line to the end of the Markdown output. - fn push_blank_line(&mut self) { - self.push_str("\n\n"); - } - - pub fn run(mut self, root_node: &Handle) -> Result { - self.visit_node(&root_node)?; - Ok(Self::prettify_markdown(self.markdown)) - } - - fn prettify_markdown(markdown: String) -> String { - let markdown = empty_line_regex().replace_all(&markdown, ""); - let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n"); - - markdown.trim().to_string() - } - - fn visit_node(&mut self, node: &Handle) -> Result<()> { - let mut current_element = None; - - match node.data { - NodeData::Document - | NodeData::Doctype { .. } - | NodeData::ProcessingInstruction { .. } - | NodeData::Comment { .. } => { - // Currently left unimplemented, as we're not interested in this data - // at this time. - } - NodeData::Element { - ref name, - ref attrs, - .. - } => { - let tag_name = name.local.to_string(); - if !tag_name.is_empty() { - current_element = Some(HtmlElement { - tag: tag_name, - attrs: attrs.clone(), - }); - } - } - NodeData::Text { ref contents } => { - let text = contents.borrow().to_string(); - self.visit_text(text)?; - } - } - - if let Some(current_element) = current_element.as_ref() { - match self.start_tag(¤t_element) { - StartTagOutcome::Continue => {} - StartTagOutcome::Skip => return Ok(()), - } - - self.current_element_stack - .push_back(current_element.clone()); - } - - for child in node.children.borrow().iter() { - self.visit_node(child)?; - } - - if let Some(current_element) = current_element { - self.current_element_stack.pop_back(); - self.end_tag(¤t_element); - } - - Ok(()) - } - - fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome { - if tag.is_inline() && self.is_inside("p") { - if let Some(parent) = self.current_element_stack.iter().last() { - if !parent.is_inline() { - if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) { - self.push_str(" "); - } - } - } - } - - match tag.tag.as_str() { - "head" | "script" | "nav" => return StartTagOutcome::Skip, - "h1" => self.push_str("\n\n# "), - "h2" => self.push_str("\n\n## "), - "h3" => self.push_str("\n\n### "), - "h4" => self.push_str("\n\n#### "), - "h5" => self.push_str("\n\n##### "), - "h6" => self.push_str("\n\n###### "), - "p" => self.push_blank_line(), - "strong" => self.push_str("**"), - "em" => self.push_str("_"), - "code" => { - if !self.is_inside("pre") { - self.push_str("`"); - } - } - "pre" => { - let classes = tag.classes(); - let is_rust = classes.iter().any(|class| class == "rust"); - let language = is_rust - .then(|| "rs") - .or_else(|| { - classes.iter().find_map(|class| { - if let Some((_, language)) = class.split_once("language-") { - Some(language.trim()) - } else { - None - } - }) - }) - .unwrap_or(""); - - self.push_str(&format!("\n\n```{language}\n")); - } - "ul" | "ol" => self.push_newline(), - "li" => self.push_str("- "), - "thead" => self.push_blank_line(), - "tr" => self.push_newline(), - "th" => { - self.current_table_columns += 1; - if self.is_first_th { - self.is_first_th = false; - } else { - self.push_str(" "); - } - self.push_str("| "); - } - "td" => { - if self.is_first_td { - self.is_first_td = false; - } else { - self.push_str(" "); - } - self.push_str("| "); - } - "summary" => { - if tag.has_class("hideme") { - return StartTagOutcome::Skip; - } - } - "button" => { - if tag.attr("id").as_deref() == Some("copy-path") { - return StartTagOutcome::Skip; - } - } - "div" | "span" => { - let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"]; - if tag.has_any_classes(&classes_to_skip) { - return StartTagOutcome::Skip; - } - - if self.is_inside_item_name() && tag.has_class("stab") { - self.push_str(" ["); - } - } - _ => {} - } - - StartTagOutcome::Continue - } - - fn end_tag(&mut self, tag: &HtmlElement) { - match tag.tag.as_str() { - "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"), - "strong" => self.push_str("**"), - "em" => self.push_str("_"), - "code" => { - if !self.is_inside("pre") { - self.push_str("`"); - } - } - "pre" => self.push_str("\n```\n"), - "ul" | "ol" => self.push_newline(), - "li" => self.push_newline(), - "thead" => { - self.push_newline(); - for ix in 0..self.current_table_columns { - if ix > 0 { - self.push_str(" "); - } - self.push_str("| ---"); - } - self.push_str(" |"); - self.is_first_th = true; - } - "tr" => { - self.push_str(" |"); - self.is_first_td = true; - } - "table" => { - self.current_table_columns = 0; - } - "div" | "span" => { - if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) { - self.push_str(": "); - } - - if self.is_inside_item_name() && tag.has_class("stab") { - self.push_str("]"); - } - } - _ => {} - } - } - - fn visit_text(&mut self, text: String) -> Result<()> { - if self.is_inside("pre") { - self.push_str(&text); - return Ok(()); - } - - let text = text - .trim_matches(|char| char == '\n' || char == '\r' || char == '§') - .replace('\n', " "); - - if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") { - self.push_str(&format!("`{text}`")); - return Ok(()); - } - - self.push_str(&text); - - Ok(()) - } - - /// Returns whether we're currently inside of an `.item-name` element, which - /// rustdoc uses to display Rust items in a list. - fn is_inside_item_name(&self) -> bool { - self.current_element_stack - .iter() - .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS)) - } -}