Detailed changes
@@ -353,6 +353,7 @@ dependencies = [
"fuzzy",
"gpui",
"heed",
+ "html_to_markdown",
"http 0.1.0",
"indoc",
"language",
@@ -367,7 +368,6 @@ dependencies = [
"rand 0.8.5",
"regex",
"rope",
- "rustdoc_to_markdown",
"schemars",
"search",
"semantic_index",
@@ -5067,6 +5067,18 @@ dependencies = [
"syn 2.0.59",
]
+[[package]]
+name = "html_to_markdown"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "html5ever",
+ "indoc",
+ "markup5ever_rcdom",
+ "pretty_assertions",
+ "regex",
+]
+
[[package]]
name = "http"
version = "0.1.0"
@@ -8618,18 +8630,6 @@ dependencies = [
"semver",
]
-[[package]]
-name = "rustdoc_to_markdown"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "html5ever",
- "indoc",
- "markup5ever_rcdom",
- "pretty_assertions",
- "regex",
-]
-
[[package]]
name = "rustix"
version = "0.37.23"
@@ -41,6 +41,7 @@ members = [
"crates/gpui",
"crates/gpui_macros",
"crates/headless",
+ "crates/html_to_markdown",
"crates/http",
"crates/image_viewer",
"crates/inline_completion_button",
@@ -76,7 +77,6 @@ members = [
"crates/rich_text",
"crates/rope",
"crates/rpc",
- "crates/rustdoc_to_markdown",
"crates/task",
"crates/tasks_ui",
"crates/search",
@@ -187,6 +187,7 @@ google_ai = { path = "crates/google_ai" }
gpui = { path = "crates/gpui" }
gpui_macros = { path = "crates/gpui_macros" }
headless = { path = "crates/headless" }
+html_to_markdown = { path = "crates/html_to_markdown" }
http = { path = "crates/http" }
install_cli = { path = "crates/install_cli" }
image_viewer = { path = "crates/image_viewer" }
@@ -223,7 +224,6 @@ dev_server_projects = { path = "crates/dev_server_projects" }
rich_text = { path = "crates/rich_text" }
rope = { path = "crates/rope" }
rpc = { path = "crates/rpc" }
-rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" }
task = { path = "crates/task" }
tasks_ui = { path = "crates/tasks_ui" }
search = { path = "crates/search" }
@@ -28,6 +28,7 @@ futures.workspace = true
fuzzy.workspace = true
gpui.workspace = true
heed.workspace = true
+html_to_markdown.workspace = true
http.workspace = true
indoc.workspace = true
language.workspace = true
@@ -40,7 +41,6 @@ parking_lot.workspace = true
project.workspace = true
regex.workspace = true
rope.workspace = true
-rustdoc_to_markdown.workspace = true
schemars.workspace = true
search.workspace = true
semantic_index.workspace = true
@@ -5,9 +5,9 @@ use anyhow::{anyhow, bail, Context, Result};
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
use futures::AsyncReadExt;
use gpui::{AppContext, Task, WeakView};
+use html_to_markdown::convert_html_to_markdown;
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
-use rustdoc_to_markdown::convert_html_to_markdown;
use ui::{prelude::*, ButtonLike, ElevationIndex};
use workspace::Workspace;
@@ -7,10 +7,10 @@ use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutp
use fs::Fs;
use futures::AsyncReadExt;
use gpui::{AppContext, Model, Task, WeakView};
+use html_to_markdown::convert_rustdoc_to_markdown;
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
use project::{Project, ProjectPath};
-use rustdoc_to_markdown::convert_rustdoc_to_markdown;
use ui::{prelude::*, ButtonLike, ElevationIndex};
use workspace::Workspace;
@@ -1,5 +1,5 @@
[package]
-name = "rustdoc_to_markdown"
+name = "html_to_markdown"
version = "0.1.0"
edition = "2021"
publish = false
@@ -9,7 +9,7 @@ license = "GPL-3.0-or-later"
workspace = true
[lib]
-path = "src/rustdoc_to_markdown.rs"
+path = "src/html_to_markdown.rs"
[dependencies]
anyhow.workspace = true
@@ -1,5 +1,5 @@
+use html_to_markdown::convert_rustdoc_to_markdown;
use indoc::indoc;
-use rustdoc_to_markdown::convert_rustdoc_to_markdown;
pub fn main() {
let html = indoc! {"
@@ -3,7 +3,9 @@
#![deny(missing_docs)]
mod html_element;
+mod markdown;
mod markdown_writer;
+mod structure;
use std::io::Read;
@@ -14,15 +16,28 @@ use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use markup5ever_rcdom::RcDom;
-use crate::markdown_writer::MarkdownWriter;
+use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler};
+use crate::markdown_writer::{HandleTag, MarkdownWriter};
/// Converts the provided HTML to Markdown.
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
+ let handlers: Vec<Box<dyn HandleTag>> = vec![
+ Box::new(ParagraphHandler),
+ Box::new(HeadingHandler),
+ Box::new(ListHandler),
+ Box::new(StyledTextHandler),
+ Box::new(structure::rustdoc::RustdocChromeRemover),
+ Box::new(structure::rustdoc::RustdocHeadingHandler),
+ Box::new(structure::rustdoc::RustdocCodeHandler),
+ Box::new(structure::rustdoc::RustdocTableHandler::new()),
+ Box::new(structure::rustdoc::RustdocItemHandler),
+ ];
+
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
- .run(&dom.document)
+ .run(&dom.document, handlers)
.context("failed to convert HTML to Markdown")?;
Ok(markdown)
@@ -32,9 +47,21 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
+ let handlers: Vec<Box<dyn HandleTag>> = vec![
+ Box::new(ParagraphHandler),
+ Box::new(HeadingHandler),
+ Box::new(ListHandler),
+ Box::new(StyledTextHandler),
+ Box::new(structure::rustdoc::RustdocChromeRemover),
+ Box::new(structure::rustdoc::RustdocHeadingHandler),
+ Box::new(structure::rustdoc::RustdocCodeHandler),
+ Box::new(structure::rustdoc::RustdocTableHandler::new()),
+ Box::new(structure::rustdoc::RustdocItemHandler),
+ ];
+
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
- .run(&dom.document)
+ .run(&dom.document, handlers)
.context("failed to convert rustdoc HTML to Markdown")?;
Ok(markdown)
@@ -0,0 +1,135 @@
+use crate::html_element::HtmlElement;
+use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
+
+/// Handles paragraph layout: emits a blank line before each `<p>` and keeps
+/// inline elements inside a paragraph separated from the preceding text.
+pub struct ParagraphHandler;
+
+impl HandleTag for ParagraphHandler {
+    fn should_handle(&self, _tag: &str) -> bool {
+        // Every tag is inspected, since any inline element may need a
+        // separating space when it appears inside a paragraph.
+        true
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        // An inline element whose direct parent is a non-inline element, somewhere
+        // inside a `<p>`, gets a single space in front of it unless the output
+        // already ends in whitespace.
+        let needs_separator = tag.is_inline()
+            && writer.is_inside("p")
+            && matches!(
+                writer.current_element_stack().back(),
+                Some(parent) if !parent.is_inline()
+            )
+            && !writer.markdown.ends_with(' ')
+            && !writer.markdown.ends_with('\n');
+        if needs_separator {
+            writer.push_str(" ");
+        }
+
+        if tag.tag == "p" {
+            writer.push_blank_line();
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
+/// Renders `<h1>` through `<h6>` as Markdown headings using `#` prefixes.
+pub struct HeadingHandler;
+
+impl HandleTag for HeadingHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        // Each heading starts on its own line, preceded by a blank line.
+        let prefix = match tag.tag.as_str() {
+            "h1" => Some("\n\n# "),
+            "h2" => Some("\n\n## "),
+            "h3" => Some("\n\n### "),
+            "h4" => Some("\n\n#### "),
+            "h5" => Some("\n\n##### "),
+            "h6" => Some("\n\n###### "),
+            _ => None,
+        };
+        if let Some(prefix) = prefix {
+            writer.push_str(prefix);
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        // Close every heading level with a blank line.
+        if self.should_handle(tag.tag.as_str()) {
+            writer.push_blank_line();
+        }
+    }
+}
+
+/// Renders `<ul>`, `<ol>` and `<li>` as Markdown list items prefixed with `- `.
+pub struct ListHandler;
+
+impl HandleTag for ListHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "ul" | "ol" | "li")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let name = tag.tag.as_str();
+        if name == "ul" || name == "ol" {
+            // Lists begin on a fresh line.
+            writer.push_newline();
+        } else if name == "li" {
+            // Ordered and unordered items both use the same bullet marker.
+            writer.push_str("- ");
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        // Both the list containers and the individual items end with a newline.
+        if matches!(tag.tag.as_str(), "ul" | "ol" | "li") {
+            writer.push_newline();
+        }
+    }
+}
+
+/// Renders `<strong>` as `**bold**` and `<em>` as `_emphasis_`.
+pub struct StyledTextHandler;
+
+impl StyledTextHandler {
+    /// Returns the Markdown delimiter that both opens and closes the given tag,
+    /// or `None` for tags this handler does not style.
+    fn delimiter(tag: &str) -> Option<&'static str> {
+        match tag {
+            "strong" => Some("**"),
+            "em" => Some("_"),
+            _ => None,
+        }
+    }
+}
+
+impl HandleTag for StyledTextHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "strong" | "em")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        if let Some(delimiter) = Self::delimiter(tag.tag.as_str()) {
+            writer.push_str(delimiter);
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        // The closing delimiter is identical to the opening one.
+        if let Some(delimiter) = Self::delimiter(tag.tag.as_str()) {
+            writer.push_str(delimiter);
+        }
+    }
+}
@@ -0,0 +1,198 @@
+use std::collections::VecDeque;
+use std::sync::OnceLock;
+
+use anyhow::Result;
+use markup5ever_rcdom::{Handle, NodeData};
+use regex::Regex;
+
+use crate::html_element::HtmlElement;
+
+/// Returns the lazily-compiled regex `^\s*$`.
+///
+/// The pattern carries no `(?m)` flag, so `^` and `$` anchor to the start and
+/// end of the whole input rather than to individual lines.
+fn empty_line_regex() -> &'static Regex {
+    static EMPTY_LINE: OnceLock<Regex> = OnceLock::new();
+    EMPTY_LINE.get_or_init(|| {
+        let compiled = Regex::new(r"^\s*$");
+        compiled.unwrap()
+    })
+}
+
+/// Returns the lazily-compiled regex `\n{3,}`, which matches a run of three
+/// or more consecutive newlines.
+fn more_than_three_newlines_regex() -> &'static Regex {
+    static NEWLINE_RUN: OnceLock<Regex> = OnceLock::new();
+    NEWLINE_RUN.get_or_init(|| {
+        let compiled = Regex::new(r"\n{3,}");
+        compiled.unwrap()
+    })
+}
+
+/// The result of handling the start of a tag, telling the writer how to
+/// proceed with the element.
+pub enum StartTagOutcome {
+ /// Keep processing: the remaining handlers run, the element is pushed onto
+ /// the element stack, and its children are visited.
+ Continue,
+ /// Abandon the element: no further handlers run, its children are not
+ /// visited, and no end-tag handling happens for it.
+ Skip,
+}
+
+/// Walks an HTML DOM and accumulates Markdown output, delegating tag and text
+/// handling to a list of `HandleTag` implementations.
+pub struct MarkdownWriter {
+ /// The elements whose children are currently being visited; each element
+ /// is pushed to the back when entered and popped when left.
+ current_element_stack: VecDeque<HtmlElement>,
+ /// The Markdown output produced so far.
+ pub(crate) markdown: String,
+}
+
+impl MarkdownWriter {
+ pub fn new() -> Self {
+ Self {
+ current_element_stack: VecDeque::new(),
+ markdown: String::new(),
+ }
+ }
+
+ pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
+ &self.current_element_stack
+ }
+
+ pub fn is_inside(&self, tag: &str) -> bool {
+ self.current_element_stack
+ .iter()
+ .any(|parent_element| parent_element.tag == tag)
+ }
+
+ /// Appends the given string slice onto the end of the Markdown output.
+ pub fn push_str(&mut self, str: &str) {
+ self.markdown.push_str(str);
+ }
+
+ /// Appends a newline to the end of the Markdown output.
+ pub fn push_newline(&mut self) {
+ self.push_str("\n");
+ }
+
+ /// Appends a blank line to the end of the Markdown output.
+ pub fn push_blank_line(&mut self) {
+ self.push_str("\n\n");
+ }
+
+ pub fn run(
+ mut self,
+ root_node: &Handle,
+ mut handlers: Vec<Box<dyn HandleTag>>,
+ ) -> Result<String> {
+ self.visit_node(&root_node, &mut handlers)?;
+ Ok(Self::prettify_markdown(self.markdown))
+ }
+
+ fn prettify_markdown(markdown: String) -> String {
+ let markdown = empty_line_regex().replace_all(&markdown, "");
+ let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
+
+ markdown.trim().to_string()
+ }
+
+ fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+ let mut current_element = None;
+
+ match node.data {
+ NodeData::Document
+ | NodeData::Doctype { .. }
+ | NodeData::ProcessingInstruction { .. }
+ | NodeData::Comment { .. } => {
+ // Currently left unimplemented, as we're not interested in this data
+ // at this time.
+ }
+ NodeData::Element {
+ ref name,
+ ref attrs,
+ ..
+ } => {
+ let tag_name = name.local.to_string();
+ if !tag_name.is_empty() {
+ current_element = Some(HtmlElement {
+ tag: tag_name,
+ attrs: attrs.clone(),
+ });
+ }
+ }
+ NodeData::Text { ref contents } => {
+ let text = contents.borrow().to_string();
+ self.visit_text(text, handlers)?;
+ }
+ }
+
+ if let Some(current_element) = current_element.as_ref() {
+ match self.start_tag(¤t_element, handlers) {
+ StartTagOutcome::Continue => {}
+ StartTagOutcome::Skip => return Ok(()),
+ }
+
+ self.current_element_stack
+ .push_back(current_element.clone());
+ }
+
+ for child in node.children.borrow().iter() {
+ self.visit_node(child, handlers)?;
+ }
+
+ if let Some(current_element) = current_element {
+ self.current_element_stack.pop_back();
+ self.end_tag(¤t_element, handlers);
+ }
+
+ Ok(())
+ }
+
+ fn start_tag(
+ &mut self,
+ tag: &HtmlElement,
+ handlers: &mut [Box<dyn HandleTag>],
+ ) -> StartTagOutcome {
+ for handler in handlers {
+ if handler.should_handle(tag.tag.as_str()) {
+ match handler.handle_tag_start(tag, self) {
+ StartTagOutcome::Continue => {}
+ StartTagOutcome::Skip => return StartTagOutcome::Skip,
+ }
+ }
+ }
+
+ StartTagOutcome::Continue
+ }
+
+ fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
+ for handler in handlers {
+ if handler.should_handle(tag.tag.as_str()) {
+ handler.handle_tag_end(tag, self);
+ }
+ }
+ }
+
+ fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
+ for handler in handlers {
+ match handler.handle_text(&text, self) {
+ HandlerOutcome::Handled => return Ok(()),
+ HandlerOutcome::NoOp => {}
+ }
+ }
+
+ let text = text
+ .trim_matches(|char| char == '\n' || char == '\r')
+ .replace('\n', " ");
+
+ self.push_str(&text);
+
+ Ok(())
+ }
+}
+
+/// The result of offering a text node to a handler.
+pub enum HandlerOutcome {
+ /// The handler consumed the text; no further handlers are consulted and
+ /// the writer's default text handling is skipped.
+ Handled,
+ /// The handler did nothing with the text; the next handler is tried.
+ NoOp,
+}
+
+/// A pluggable handler that turns HTML tags and text into Markdown by writing
+/// to a `MarkdownWriter`.
+pub trait HandleTag {
+ /// Returns whether this handler should handle the given tag.
+ ///
+ /// This gates `handle_tag_start` and `handle_tag_end` only; `handle_text`
+ /// is called for every text node regardless of this method's result.
+ fn should_handle(&self, tag: &str) -> bool;
+
+ /// Handles the start of the given tag.
+ ///
+ /// The default implementation does nothing and continues processing.
+ fn handle_tag_start(
+ &mut self,
+ _tag: &HtmlElement,
+ _writer: &mut MarkdownWriter,
+ ) -> StartTagOutcome {
+ StartTagOutcome::Continue
+ }
+
+ /// Handles the end of the given tag.
+ ///
+ /// The default implementation does nothing.
+ fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
+
+ /// Handles a text node.
+ ///
+ /// Returning `HandlerOutcome::Handled` stops the text from being offered
+ /// to later handlers. The default implementation leaves the text untouched.
+ fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
+ HandlerOutcome::NoOp
+ }
+}
@@ -0,0 +1 @@
+pub mod rustdoc;
@@ -0,0 +1,286 @@
+use crate::html_element::HtmlElement;
+use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
+
+pub struct RustdocHeadingHandler;
+
+impl HandleTag for RustdocHeadingHandler {
+ fn should_handle(&self, _tag: &str) -> bool {
+ // We're only handling text, so we don't need to visit any tags.
+ false
+ }
+
+ fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+ if writer.is_inside("h1")
+ || writer.is_inside("h2")
+ || writer.is_inside("h3")
+ || writer.is_inside("h4")
+ || writer.is_inside("h5")
+ || writer.is_inside("h6")
+ {
+ let text = text
+ .trim_matches(|char| char == '\n' || char == '\r' || char == 'ยง')
+ .replace('\n', " ");
+ writer.push_str(&text);
+
+ return HandlerOutcome::Handled;
+ }
+
+ HandlerOutcome::NoOp
+ }
+}
+
+/// Renders `<code>` as inline code spans and `<pre>` as fenced code blocks,
+/// picking the fence language from the element's classes.
+pub struct RustdocCodeHandler;
+
+impl HandleTag for RustdocCodeHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "pre" | "code")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            // A `<code>` nested in a `<pre>` is already covered by the fence.
+            "code" if !writer.is_inside("pre") => writer.push_str("`"),
+            "pre" => {
+                let classes = tag.classes();
+                // A `rust` class wins; otherwise use the first `language-*` class.
+                let language = if classes.iter().any(|class| class == "rust") {
+                    "rs"
+                } else {
+                    classes
+                        .iter()
+                        .find_map(|class| {
+                            class
+                                .split_once("language-")
+                                .map(|(_, language)| language.trim())
+                        })
+                        .unwrap_or("")
+                };
+
+                writer.push_str(&format!("\n\n```{language}\n"));
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        match tag.tag.as_str() {
+            "code" if !writer.is_inside("pre") => writer.push_str("`"),
+            "pre" => writer.push_str("\n```\n"),
+            _ => {}
+        }
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if !writer.is_inside("pre") {
+            return HandlerOutcome::NoOp;
+        }
+
+        // Preformatted text is emitted verbatim, preserving its line breaks.
+        writer.push_str(text);
+        HandlerOutcome::Handled
+    }
+}
+
+/// Renders HTML tables as Markdown pipe tables, tracking the state needed to
+/// emit cell separators and the header divider row.
+pub struct RustdocTableHandler {
+ /// The number of columns in the current `<table>`.
+ ///
+ /// Incremented for every `<th>` and reset to zero when a `<table>` ends.
+ current_table_columns: usize,
+ /// Whether the next `<th>` is the first one seen since the last `<thead>`
+ /// ended; the first cell is not preceded by a space.
+ is_first_th: bool,
+ /// Whether the next `<td>` is the first one seen since the last `<tr>`
+ /// ended; the first cell is not preceded by a space.
+ is_first_td: bool,
+}
+
+impl RustdocTableHandler {
+ /// Creates a handler with no columns counted and both "first cell" flags set.
+ pub fn new() -> Self {
+ Self {
+ current_table_columns: 0,
+ is_first_th: true,
+ is_first_td: true,
+ }
+ }
+}
+
+impl HandleTag for RustdocTableHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "table" | "thead" | "tbody" | "tr" | "th" | "td")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "thead" => writer.push_blank_line(),
+            "tr" => writer.push_newline(),
+            "th" => {
+                self.current_table_columns += 1;
+                // Clear the "first" flag; every cell after the first is
+                // separated from its predecessor by a space.
+                let was_first = std::mem::replace(&mut self.is_first_th, false);
+                if !was_first {
+                    writer.push_str(" ");
+                }
+                writer.push_str("| ");
+            }
+            "td" => {
+                let was_first = std::mem::replace(&mut self.is_first_td, false);
+                if !was_first {
+                    writer.push_str(" ");
+                }
+                writer.push_str("| ");
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        match tag.tag.as_str() {
+            "thead" => {
+                // Emit the divider row: one `| ---` per header column.
+                writer.push_newline();
+                let divider = vec!["| ---"; self.current_table_columns].join(" ");
+                writer.push_str(&divider);
+                writer.push_str(" |");
+                self.is_first_th = true;
+            }
+            "tr" => {
+                // Close the row and get ready for the next row's first cell.
+                writer.push_str(" |");
+                self.is_first_td = true;
+            }
+            "table" => self.current_table_columns = 0,
+            _ => {}
+        }
+    }
+}
+
+const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
+
+/// Formats rustdoc item listings: item names are wrapped in backticks and
+/// followed by `: `, and `.stab` badges inside them are wrapped in brackets.
+pub struct RustdocItemHandler;
+
+impl RustdocItemHandler {
+    /// Returns whether we're currently inside of an `.item-name` element, which
+    /// rustdoc uses to display Rust items in a list.
+    fn is_inside_item_name(writer: &MarkdownWriter) -> bool {
+        let stack = writer.current_element_stack();
+        stack
+            .iter()
+            .any(|ancestor| ancestor.has_class(RUSTDOC_ITEM_NAME_CLASS))
+    }
+}
+
+impl HandleTag for RustdocItemHandler {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "div" | "span")
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let is_container = matches!(tag.tag.as_str(), "div" | "span");
+        // Open a bracket for a `.stab` badge nested in an item name.
+        if is_container && Self::is_inside_item_name(writer) && tag.has_class("stab") {
+            writer.push_str(" [");
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        if !matches!(tag.tag.as_str(), "div" | "span") {
+            return;
+        }
+
+        // The item name is separated from what follows by a colon.
+        if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
+            writer.push_str(": ");
+        }
+
+        // Close the bracket opened for a `.stab` badge.
+        if Self::is_inside_item_name(writer) && tag.has_class("stab") {
+            writer.push_str("]");
+        }
+    }
+
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        let inside_span_or_code = writer.is_inside("span") || writer.is_inside("code");
+        if !Self::is_inside_item_name(writer) || inside_span_or_code {
+            return HandlerOutcome::NoOp;
+        }
+
+        // Item-name text outside of any `<span>` or `<code>` is emitted
+        // verbatim, wrapped in backticks.
+        writer.push_str(&format!("`{text}`"));
+        HandlerOutcome::Handled
+    }
+}
+
+/// Drops the parts of a rustdoc page that are page chrome rather than
+/// documentation content, by skipping the matching elements entirely.
+pub struct RustdocChromeRemover;
+
+impl HandleTag for RustdocChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(
+            tag,
+            "head" | "script" | "nav" | "summary" | "button" | "div" | "span"
+        )
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        /// Classes on `<div>`/`<span>` elements that cause them to be skipped.
+        const CHROME_CLASSES: [&str; 3] = ["nav-container", "sidebar-elems", "out-of-band"];
+
+        let is_chrome = match tag.tag.as_str() {
+            // These elements are always skipped.
+            "head" | "script" | "nav" => true,
+            "summary" => tag.has_class("hideme"),
+            "button" => tag.attr("id").as_deref() == Some("copy-path"),
+            "div" | "span" => tag.has_any_classes(&CHROME_CLASSES),
+            _ => false,
+        };
+
+        if is_chrome {
+            StartTagOutcome::Skip
+        } else {
+            StartTagOutcome::Continue
+        }
+    }
+}
@@ -1,296 +0,0 @@
-use std::collections::VecDeque;
-use std::sync::OnceLock;
-
-use anyhow::Result;
-use markup5ever_rcdom::{Handle, NodeData};
-use regex::Regex;
-
-use crate::html_element::HtmlElement;
-
-fn empty_line_regex() -> &'static Regex {
- static REGEX: OnceLock<Regex> = OnceLock::new();
- REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
-}
-
-fn more_than_three_newlines_regex() -> &'static Regex {
- static REGEX: OnceLock<Regex> = OnceLock::new();
- REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
-}
-
-const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
-
-enum StartTagOutcome {
- Continue,
- Skip,
-}
-
-pub struct MarkdownWriter {
- current_element_stack: VecDeque<HtmlElement>,
- /// The number of columns in the current `<table>`.
- current_table_columns: usize,
- is_first_th: bool,
- is_first_td: bool,
- /// The Markdown output.
- markdown: String,
-}
-
-impl MarkdownWriter {
- pub fn new() -> Self {
- Self {
- current_element_stack: VecDeque::new(),
- current_table_columns: 0,
- is_first_th: true,
- is_first_td: true,
- markdown: String::new(),
- }
- }
-
- fn is_inside(&self, tag: &str) -> bool {
- self.current_element_stack
- .iter()
- .any(|parent_element| parent_element.tag == tag)
- }
-
- /// Appends the given string slice onto the end of the Markdown output.
- fn push_str(&mut self, str: &str) {
- self.markdown.push_str(str);
- }
-
- /// Appends a newline to the end of the Markdown output.
- fn push_newline(&mut self) {
- self.push_str("\n");
- }
-
- /// Appends a blank line to the end of the Markdown output.
- fn push_blank_line(&mut self) {
- self.push_str("\n\n");
- }
-
- pub fn run(mut self, root_node: &Handle) -> Result<String> {
- self.visit_node(&root_node)?;
- Ok(Self::prettify_markdown(self.markdown))
- }
-
- fn prettify_markdown(markdown: String) -> String {
- let markdown = empty_line_regex().replace_all(&markdown, "");
- let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
-
- markdown.trim().to_string()
- }
-
- fn visit_node(&mut self, node: &Handle) -> Result<()> {
- let mut current_element = None;
-
- match node.data {
- NodeData::Document
- | NodeData::Doctype { .. }
- | NodeData::ProcessingInstruction { .. }
- | NodeData::Comment { .. } => {
- // Currently left unimplemented, as we're not interested in this data
- // at this time.
- }
- NodeData::Element {
- ref name,
- ref attrs,
- ..
- } => {
- let tag_name = name.local.to_string();
- if !tag_name.is_empty() {
- current_element = Some(HtmlElement {
- tag: tag_name,
- attrs: attrs.clone(),
- });
- }
- }
- NodeData::Text { ref contents } => {
- let text = contents.borrow().to_string();
- self.visit_text(text)?;
- }
- }
-
- if let Some(current_element) = current_element.as_ref() {
- match self.start_tag(&current_element) {
- StartTagOutcome::Continue => {}
- StartTagOutcome::Skip => return Ok(()),
- }
-
- self.current_element_stack
- .push_back(current_element.clone());
- }
-
- for child in node.children.borrow().iter() {
- self.visit_node(child)?;
- }
-
- if let Some(current_element) = current_element {
- self.current_element_stack.pop_back();
- self.end_tag(&current_element);
- }
-
- Ok(())
- }
-
- fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
- if tag.is_inline() && self.is_inside("p") {
- if let Some(parent) = self.current_element_stack.iter().last() {
- if !parent.is_inline() {
- if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) {
- self.push_str(" ");
- }
- }
- }
- }
-
- match tag.tag.as_str() {
- "head" | "script" | "nav" => return StartTagOutcome::Skip,
- "h1" => self.push_str("\n\n# "),
- "h2" => self.push_str("\n\n## "),
- "h3" => self.push_str("\n\n### "),
- "h4" => self.push_str("\n\n#### "),
- "h5" => self.push_str("\n\n##### "),
- "h6" => self.push_str("\n\n###### "),
- "p" => self.push_blank_line(),
- "strong" => self.push_str("**"),
- "em" => self.push_str("_"),
- "code" => {
- if !self.is_inside("pre") {
- self.push_str("`");
- }
- }
- "pre" => {
- let classes = tag.classes();
- let is_rust = classes.iter().any(|class| class == "rust");
- let language = is_rust
- .then(|| "rs")
- .or_else(|| {
- classes.iter().find_map(|class| {
- if let Some((_, language)) = class.split_once("language-") {
- Some(language.trim())
- } else {
- None
- }
- })
- })
- .unwrap_or("");
-
- self.push_str(&format!("\n\n```{language}\n"));
- }
- "ul" | "ol" => self.push_newline(),
- "li" => self.push_str("- "),
- "thead" => self.push_blank_line(),
- "tr" => self.push_newline(),
- "th" => {
- self.current_table_columns += 1;
- if self.is_first_th {
- self.is_first_th = false;
- } else {
- self.push_str(" ");
- }
- self.push_str("| ");
- }
- "td" => {
- if self.is_first_td {
- self.is_first_td = false;
- } else {
- self.push_str(" ");
- }
- self.push_str("| ");
- }
- "summary" => {
- if tag.has_class("hideme") {
- return StartTagOutcome::Skip;
- }
- }
- "button" => {
- if tag.attr("id").as_deref() == Some("copy-path") {
- return StartTagOutcome::Skip;
- }
- }
- "div" | "span" => {
- let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
- if tag.has_any_classes(&classes_to_skip) {
- return StartTagOutcome::Skip;
- }
-
- if self.is_inside_item_name() && tag.has_class("stab") {
- self.push_str(" [");
- }
- }
- _ => {}
- }
-
- StartTagOutcome::Continue
- }
-
- fn end_tag(&mut self, tag: &HtmlElement) {
- match tag.tag.as_str() {
- "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
- "strong" => self.push_str("**"),
- "em" => self.push_str("_"),
- "code" => {
- if !self.is_inside("pre") {
- self.push_str("`");
- }
- }
- "pre" => self.push_str("\n```\n"),
- "ul" | "ol" => self.push_newline(),
- "li" => self.push_newline(),
- "thead" => {
- self.push_newline();
- for ix in 0..self.current_table_columns {
- if ix > 0 {
- self.push_str(" ");
- }
- self.push_str("| ---");
- }
- self.push_str(" |");
- self.is_first_th = true;
- }
- "tr" => {
- self.push_str(" |");
- self.is_first_td = true;
- }
- "table" => {
- self.current_table_columns = 0;
- }
- "div" | "span" => {
- if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
- self.push_str(": ");
- }
-
- if self.is_inside_item_name() && tag.has_class("stab") {
- self.push_str("]");
- }
- }
- _ => {}
- }
- }
-
- fn visit_text(&mut self, text: String) -> Result<()> {
- if self.is_inside("pre") {
- self.push_str(&text);
- return Ok(());
- }
-
- let text = text
- .trim_matches(|char| char == '\n' || char == '\r' || char == '§')
- .replace('\n', " ");
-
- if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") {
- self.push_str(&format!("`{text}`"));
- return Ok(());
- }
-
- self.push_str(&text);
-
- Ok(())
- }
-
- /// Returns whether we're currently inside of an `.item-name` element, which
- /// rustdoc uses to display Rust items in a list.
- fn is_inside_item_name(&self) -> bool {
- self.current_element_stack
- .iter()
- .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
- }
-}