Add `rustdoc_to_markdown` crate (#12445)

Marshall Bowers created

This PR adds a new crate for converting rustdoc output to Markdown.

We're leveraging Servo's `html5ever` to parse the rustdoc HTML, and
then walking the DOM nodes to convert them to a Markdown string.

The Markdown output will continue to be refined, but it's already in a
reasonable place.

Release Notes:

- N/A

Change summary

Cargo.lock                                            | 146 ++++++++-
Cargo.toml                                            |   4 
crates/rustdoc_to_markdown/Cargo.toml                 |  20 +
crates/rustdoc_to_markdown/LICENSE-GPL                |   1 
crates/rustdoc_to_markdown/examples/test.rs           |  29 +
crates/rustdoc_to_markdown/src/markdown_writer.rs     | 201 +++++++++++++
crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs |  36 ++
7 files changed, 420 insertions(+), 17 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -5060,6 +5060,20 @@ version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4d13cdbd5dbb29f9c88095bbdc2590c9cba0d0a1269b983fef6b2cdd7e9f4db1"
 
+[[package]]
+name = "html5ever"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.59",
+]
+
 [[package]]
 name = "http"
 version = "0.1.0"
@@ -5719,7 +5733,7 @@ dependencies = [
  "tree-sitter-embedded-template",
  "tree-sitter-heex",
  "tree-sitter-html",
- "tree-sitter-json 0.20.2",
+ "tree-sitter-json",
  "tree-sitter-markdown",
  "tree-sitter-ruby",
  "tree-sitter-rust",
@@ -5809,7 +5823,7 @@ dependencies = [
  "tree-sitter-gomod",
  "tree-sitter-gowork",
  "tree-sitter-jsdoc",
- "tree-sitter-json 0.20.2",
+ "tree-sitter-json",
  "tree-sitter-markdown",
  "tree-sitter-proto",
  "tree-sitter-python",
@@ -6181,6 +6195,32 @@ dependencies = [
  "workspace",
 ]
 
+[[package]]
+name = "markup5ever"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
+dependencies = [
+ "log",
+ "phf",
+ "phf_codegen",
+ "string_cache",
+ "string_cache_codegen",
+ "tendril",
+]
+
+[[package]]
+name = "markup5ever_rcdom"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18"
+dependencies = [
+ "html5ever",
+ "markup5ever",
+ "tendril",
+ "xml5ever",
+]
+
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -7286,7 +7326,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
 dependencies = [
  "phf_macros",
- "phf_shared",
+ "phf_shared 0.11.2",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
+dependencies = [
+ "phf_generator 0.11.2",
+ "phf_shared 0.11.2",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
+dependencies = [
+ "phf_shared 0.10.0",
+ "rand 0.8.5",
 ]
 
 [[package]]
@@ -7295,7 +7355,7 @@ version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
 dependencies = [
- "phf_shared",
+ "phf_shared 0.11.2",
  "rand 0.8.5",
 ]
 
@@ -7305,13 +7365,22 @@ version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
 dependencies = [
- "phf_generator",
- "phf_shared",
+ "phf_generator 0.11.2",
+ "phf_shared 0.11.2",
  "proc-macro2",
  "quote",
  "syn 2.0.59",
 ]
 
+[[package]]
+name = "phf_shared"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
+dependencies = [
+ "siphasher 0.3.11",
+]
+
 [[package]]
 name = "phf_shared"
 version = "0.11.2"
@@ -7555,6 +7624,12 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "precomputed-hash"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
+
 [[package]]
 name = "prettier"
 version = "0.1.0"
@@ -8554,6 +8629,16 @@ dependencies = [
  "semver",
 ]
 
+[[package]]
+name = "rustdoc_to_markdown"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "html5ever",
+ "indoc",
+ "markup5ever_rcdom",
+]
+
 [[package]]
 name = "rustix"
 version = "0.37.23"
@@ -9118,7 +9203,7 @@ dependencies = [
  "serde_json_lenient",
  "smallvec",
  "tree-sitter",
- "tree-sitter-json 0.19.0",
+ "tree-sitter-json",
  "unindent",
  "util",
 ]
@@ -9802,6 +9887,32 @@ dependencies = [
  "float-cmp",
 ]
 
+[[package]]
+name = "string_cache"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
+dependencies = [
+ "new_debug_unreachable",
+ "once_cell",
+ "parking_lot",
+ "phf_shared 0.10.0",
+ "precomputed-hash",
+ "serde",
+]
+
+[[package]]
+name = "string_cache_codegen"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
+dependencies = [
+ "phf_generator 0.10.0",
+ "phf_shared 0.10.0",
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "stringprep"
 version = "0.1.4"
@@ -10991,16 +11102,6 @@ dependencies = [
  "tree-sitter",
 ]
 
-[[package]]
-name = "tree-sitter-json"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "90b04c4e1a92139535eb9fca4ec8fa9666cc96b618005d3ae35f3c957fa92f92"
-dependencies = [
- "cc",
- "tree-sitter",
-]
-
 [[package]]
 name = "tree-sitter-json"
 version = "0.20.2"
@@ -12937,6 +13038,17 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "054a8e68b76250b253f671d1268cb7f1ae089ec35e195b2efb2a4e9a836d0621"
 
+[[package]]
+name = "xml5ever"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c376f76ed09df711203e20c3ef5ce556f0166fa03d39590016c0fd625437fad"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever",
+]
+
 [[package]]
 name = "xmlparser"
 version = "0.13.5"

Cargo.toml 🔗

@@ -76,6 +76,7 @@ members = [
     "crates/rich_text",
     "crates/rope",
     "crates/rpc",
+    "crates/rustdoc_to_markdown",
     "crates/task",
     "crates/tasks_ui",
     "crates/search",
@@ -220,6 +221,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
 rich_text = { path = "crates/rich_text" }
 rope = { path = "crates/rope" }
 rpc = { path = "crates/rpc" }
+rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" }
 task = { path = "crates/task" }
 tasks_ui = { path = "crates/tasks_ui" }
 search = { path = "crates/search" }
@@ -288,6 +290,7 @@ heed = { version = "0.20.1", features = [
     "read-txn-no-tls",
 ] }
 hex = "0.4.3"
+html5ever = "0.27.0"
 ignore = "0.4.22"
 indoc = "1"
 # We explicitly disable http2 support in isahc.
@@ -300,6 +303,7 @@ lazy_static = "1.4.0"
 libc = "0.2"
 linkify = "0.10.0"
 log = { version = "0.4.16", features = ["kv_unstable_serde"] }
+markup5ever_rcdom = "0.3.0"
 nanoid = "0.4"
 nix = "0.28"
 once_cell = "1.19.0"

crates/rustdoc_to_markdown/Cargo.toml 🔗

@@ -0,0 +1,20 @@
+[package]
+name = "rustdoc_to_markdown"
+version = "0.1.0"
+edition = "2021"
+publish = false
+license = "GPL-3.0-or-later"
+
+[lints]
+workspace = true
+
+[lib]
+path = "src/rustdoc_to_markdown.rs"
+
+[dependencies]
+anyhow.workspace = true
+html5ever.workspace = true
+markup5ever_rcdom.workspace = true
+
+[dev-dependencies]
+indoc.workspace = true

crates/rustdoc_to_markdown/examples/test.rs 🔗

@@ -0,0 +1,29 @@
+use indoc::indoc;
+use rustdoc_to_markdown::convert_rustdoc_to_markdown;
+
+pub fn main() {
+    let html = indoc! {"
+        <html>
+            <body>
+                <h1>Hello World</h1>
+                <p>
+                    Here is some content.
+                </p>
+                <h2>Some items</h2>
+                <ul>
+                    <li>One</li>
+                    <li>Two</li>
+                    <li>Three</li>
+                </ul>
+            </body>
+        </html>
+    "};
+    // To test this out with some real input, try this:
+    //
+    // ```
+    // let html = include_str!("/path/to/zed/target/doc/gpui/index.html");
+    // ```
+    let markdown = convert_rustdoc_to_markdown(html).unwrap();
+
+    println!("{markdown}");
+}

crates/rustdoc_to_markdown/src/markdown_writer.rs 🔗

@@ -0,0 +1,201 @@
+use std::cell::RefCell;
+use std::collections::VecDeque;
+
+use anyhow::Result;
+use html5ever::Attribute;
+use markup5ever_rcdom::{Handle, NodeData};
+
+#[derive(Debug, Clone)]
+struct HtmlElement {
+    tag: String,
+    attrs: RefCell<Vec<Attribute>>,
+}
+
+enum StartTagOutcome {
+    Continue,
+    Skip,
+}
+
+pub struct MarkdownWriter {
+    current_element_stack: VecDeque<HtmlElement>,
+    /// The Markdown output.
+    markdown: String,
+}
+
+impl MarkdownWriter {
+    pub fn new() -> Self {
+        Self {
+            current_element_stack: VecDeque::new(),
+            markdown: String::new(),
+        }
+    }
+
+    fn is_inside(&self, tag: &str) -> bool {
+        self.current_element_stack
+            .iter()
+            .any(|parent_element| parent_element.tag == tag)
+    }
+
+    fn is_inside_heading(&self) -> bool {
+        ["h1", "h2", "h3", "h4", "h5", "h6"]
+            .into_iter()
+            .any(|heading| self.is_inside(heading))
+    }
+
+    /// Appends the given string slice onto the end of the Markdown output.
+    fn push_str(&mut self, str: &str) {
+        self.markdown.push_str(str);
+    }
+
+    /// Appends a newline to the end of the Markdown output.
+    fn push_newline(&mut self) {
+        self.push_str("\n");
+    }
+
+    pub fn run(mut self, root_node: &Handle) -> Result<String> {
+        self.visit_node(&root_node)?;
+        Ok(self.markdown.trim().to_string())
+    }
+
+    fn visit_node(&mut self, node: &Handle) -> Result<()> {
+        let mut current_element = None;
+
+        match node.data {
+            NodeData::Document
+            | NodeData::Doctype { .. }
+            | NodeData::ProcessingInstruction { .. }
+            | NodeData::Comment { .. } => {
+                // Currently left unimplemented, as we're not interested in this data
+                // at this time.
+            }
+            NodeData::Element {
+                ref name,
+                ref attrs,
+                ..
+            } => {
+                let tag_name = name.local.to_string();
+                if !tag_name.is_empty() {
+                    current_element = Some(HtmlElement {
+                        tag: tag_name,
+                        attrs: attrs.clone(),
+                    });
+                }
+            }
+            NodeData::Text { ref contents } => {
+                let text = contents.borrow().to_string();
+                self.visit_text(text)?;
+            }
+        }
+
+        if let Some(current_element) = current_element.as_ref() {
+            match self.start_tag(&current_element) {
+                StartTagOutcome::Continue => {}
+                StartTagOutcome::Skip => return Ok(()),
+            }
+
+            self.current_element_stack
+                .push_back(current_element.clone());
+        }
+
+        for child in node.children.borrow().iter() {
+            self.visit_node(child)?;
+        }
+
+        self.current_element_stack.pop_back();
+
+        if let Some(current_element) = current_element {
+            self.end_tag(&current_element);
+        }
+
+        Ok(())
+    }
+
+    fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "head" | "script" | "nav" => return StartTagOutcome::Skip,
+            "h1" => self.push_str("\n# "),
+            "h2" => self.push_str("\n## "),
+            "h3" => self.push_str("\n### "),
+            "h4" => self.push_str("\n#### "),
+            "h5" => self.push_str("\n##### "),
+            "h6" => self.push_str("\n###### "),
+            "code" => {
+                if !self.is_inside("pre") {
+                    self.push_str("`")
+                }
+            }
+            "pre" => self.push_str("\n```\n"),
+            "ul" | "ol" => self.push_newline(),
+            "li" => self.push_str("- "),
+            "summary" => {
+                if tag.attrs.borrow().iter().any(|attr| {
+                    attr.name.local.to_string() == "class" && attr.value.to_string() == "hideme"
+                }) {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            "div" | "span" => {
+                if tag.attrs.borrow().iter().any(|attr| {
+                    attr.name.local.to_string() == "class"
+                        && attr.value.to_string() == "sidebar-elems"
+                }) {
+                    return StartTagOutcome::Skip;
+                }
+
+                if tag.attrs.borrow().iter().any(|attr| {
+                    attr.name.local.to_string() == "class"
+                        && attr.value.to_string() == "out-of-band"
+                }) {
+                    return StartTagOutcome::Skip;
+                }
+
+                if tag.attrs.borrow().iter().any(|attr| {
+                    attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
+                }) {
+                    self.push_str("`");
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    fn end_tag(&mut self, tag: &HtmlElement) {
+        match tag.tag.as_str() {
+            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
+            "code" => {
+                if !self.is_inside("pre") {
+                    self.push_str("`")
+                }
+            }
+            "pre" => self.push_str("\n```\n"),
+            "ul" | "ol" => self.push_newline(),
+            "li" => self.push_newline(),
+            "div" => {
+                if tag.attrs.borrow().iter().any(|attr| {
+                    attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
+                }) {
+                    self.push_str("`: ");
+                }
+            }
+            _ => {}
+        }
+    }
+
+    fn visit_text(&mut self, text: String) -> Result<()> {
+        if self.is_inside("pre") {
+            self.push_str(&text);
+            return Ok(());
+        }
+
+        if self.is_inside_heading() && self.is_inside("a") {
+            return Ok(());
+        }
+
+        let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == '§');
+        self.push_str(trimmed_text);
+
+        Ok(())
+    }
+}

crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs 🔗

@@ -0,0 +1,36 @@
+//! Provides conversion from rustdoc's HTML output to Markdown.
+
+#![deny(missing_docs)]
+
+mod markdown_writer;
+
+use anyhow::{Context, Result};
+use html5ever::driver::ParseOpts;
+use html5ever::parse_document;
+use html5ever::tendril::TendrilSink;
+use html5ever::tree_builder::TreeBuilderOpts;
+use markup5ever_rcdom::RcDom;
+
+use crate::markdown_writer::MarkdownWriter;
+
+/// Converts the provided rustdoc HTML to Markdown.
+pub fn convert_rustdoc_to_markdown(html: &str) -> Result<String> {
+    let parse_options = ParseOpts {
+        tree_builder: TreeBuilderOpts {
+            drop_doctype: true,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let dom = parse_document(RcDom::default(), parse_options)
+        .from_utf8()
+        .read_from(&mut html.as_bytes())
+        .context("failed to parse rustdoc HTML")?;
+
+    let markdown_writer = MarkdownWriter::new();
+    let markdown = markdown_writer
+        .run(&dom.document)
+        .context("failed to convert rustdoc to HTML")?;
+
+    Ok(markdown)
+}