rustdoc_to_markdown.rs

  1//! Provides conversion from rustdoc's HTML output to Markdown.
  2
  3#![deny(missing_docs)]
  4
  5mod html_element;
  6mod markdown_writer;
  7
  8use std::io::Read;
  9
 10use anyhow::{Context, Result};
 11use html5ever::driver::ParseOpts;
 12use html5ever::parse_document;
 13use html5ever::tendril::TendrilSink;
 14use html5ever::tree_builder::TreeBuilderOpts;
 15use markup5ever_rcdom::RcDom;
 16
 17use crate::markdown_writer::MarkdownWriter;
 18
 19/// Converts the provided HTML to Markdown.
 20pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
 21    let dom = parse_html(html).context("failed to parse HTML")?;
 22
 23    let markdown_writer = MarkdownWriter::new();
 24    let markdown = markdown_writer
 25        .run(&dom.document)
 26        .context("failed to convert HTML to Markdown")?;
 27
 28    Ok(markdown)
 29}
 30
 31/// Converts the provided rustdoc HTML to Markdown.
 32pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
 33    let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
 34
 35    let markdown_writer = MarkdownWriter::new();
 36    let markdown = markdown_writer
 37        .run(&dom.document)
 38        .context("failed to convert rustdoc HTML to Markdown")?;
 39
 40    Ok(markdown)
 41}
 42
 43fn parse_html(mut html: impl Read) -> Result<RcDom> {
 44    let parse_options = ParseOpts {
 45        tree_builder: TreeBuilderOpts {
 46            drop_doctype: true,
 47            ..Default::default()
 48        },
 49        ..Default::default()
 50    };
 51    let dom = parse_document(RcDom::default(), parse_options)
 52        .from_utf8()
 53        .read_from(&mut html)
 54        .context("failed to parse HTML document")?;
 55
 56    Ok(dom)
 57}
 58
 59#[cfg(test)]
 60mod tests {
 61    use indoc::indoc;
 62    use pretty_assertions::assert_eq;
 63
 64    use super::*;
 65
 66    #[test]
 67    fn test_main_heading_buttons_get_removed() {
 68        let html = indoc! {r##"
 69            <div class="main-heading">
 70                <h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
 71                <span class="out-of-band">
 72                    <a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
 73                </span>
 74            </div>
 75        "##};
 76        let expected = indoc! {"
 77            # Crate serde
 78        "}
 79        .trim();
 80
 81        assert_eq!(
 82            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
 83            expected
 84        )
 85    }
 86
 87    #[test]
 88    fn test_single_paragraph() {
 89        let html = indoc! {r#"
 90            <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
 91            <code>axum</code> doesn’t have its own middleware system but instead uses
 92            <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
 93            authorization, and more, for free. It also enables you to share middleware with
 94            applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
 95        "#};
 96        let expected = indoc! {"
 97            In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
 98        "}
 99        .trim();
100
101        assert_eq!(
102            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
103            expected
104        )
105    }
106
107    #[test]
108    fn test_multiple_paragraphs() {
109        let html = indoc! {r##"
110            <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
111            <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
112            structures efficiently and generically.</p>
113            <p>The Serde ecosystem consists of data structures that know how to serialize
114            and deserialize themselves along with data formats that know how to
115            serialize and deserialize other things. Serde provides the layer by which
116            these two groups interact with each other, allowing any supported data
117            structure to be serialized and deserialized using any supported data format.</p>
118            <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
119            usage examples.</p>
120            <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
121            <p>Where many other languages rely on runtime reflection for serializing data,
122            Serde is instead built on Rust’s powerful trait system. A data structure
123            that knows how to serialize and deserialize itself is one that implements
124            Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
125            attribute to automatically generate implementations at compile time). This
126            avoids any overhead of reflection or runtime type information. In fact in
127            many situations the interaction between data structure and data format can
128            be completely optimized away by the Rust compiler, leaving Serde
129            serialization to perform the same speed as a handwritten serializer for the
130            specific selection of data structure and data format.</p>
131        "##};
132        let expected = indoc! {"
133            ## Serde
134
135            Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
136
137            The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
138
139            See the Serde website https://serde.rs/ for additional documentation and usage examples.
140
141            ### Design
142
143            Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
144        "}
145        .trim();
146
147        assert_eq!(
148            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
149            expected
150        )
151    }
152
153    #[test]
154    fn test_styled_text() {
155        let html = indoc! {r#"
156            <p>This text is <strong>bolded</strong>.</p>
157            <p>This text is <em>italicized</em>.</p>
158        "#};
159        let expected = indoc! {"
160            This text is **bolded**.
161
162            This text is _italicized_.
163        "}
164        .trim();
165
166        assert_eq!(
167            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
168            expected
169        )
170    }
171
172    #[test]
173    fn test_rust_code_block() {
174        let html = indoc! {r#"
175            <pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
176            <span class="kw">use </span>std::collections::HashMap;
177
178            <span class="comment">// `Path` gives you the path parameters and deserializes them.
179            </span><span class="kw">async fn </span>path(Path(user_id): Path&lt;u32&gt;) {}
180
181            <span class="comment">// `Query` gives you the query parameters and deserializes them.
182            </span><span class="kw">async fn </span>query(Query(params): Query&lt;HashMap&lt;String, String&gt;&gt;) {}
183
184            <span class="comment">// Buffer the request body and deserialize it as JSON into a
185            // `serde_json::Value`. `Json` supports any type that implements
186            // `serde::Deserialize`.
187            </span><span class="kw">async fn </span>json(Json(payload): Json&lt;serde_json::Value&gt;) {}</code></pre>
188        "#};
189        let expected = indoc! {"
190            ```rs
191            use axum::extract::{Path, Query, Json};
192            use std::collections::HashMap;
193
194            // `Path` gives you the path parameters and deserializes them.
195            async fn path(Path(user_id): Path<u32>) {}
196
197            // `Query` gives you the query parameters and deserializes them.
198            async fn query(Query(params): Query<HashMap<String, String>>) {}
199
200            // Buffer the request body and deserialize it as JSON into a
201            // `serde_json::Value`. `Json` supports any type that implements
202            // `serde::Deserialize`.
203            async fn json(Json(payload): Json<serde_json::Value>) {}
204            ```
205        "}
206        .trim();
207
208        assert_eq!(
209            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
210            expected
211        )
212    }
213
214    #[test]
215    fn test_toml_code_block() {
216        let html = indoc! {r##"
217            <h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
218            <p>To use axum there are a few dependencies you have to pull in as well:</p>
219            <div class="example-wrap"><pre class="language-toml"><code>[dependencies]
220            axum = &quot;&lt;latest-version&gt;&quot;
221            tokio = { version = &quot;&lt;latest-version&gt;&quot;, features = [&quot;full&quot;] }
222            tower = &quot;&lt;latest-version&gt;&quot;
223            </code></pre></div>
224        "##};
225        let expected = indoc! {r#"
226            ## Required dependencies
227
228            To use axum there are a few dependencies you have to pull in as well:
229
230            ```toml
231            [dependencies]
232            axum = "<latest-version>"
233            tokio = { version = "<latest-version>", features = ["full"] }
234            tower = "<latest-version>"
235
236            ```
237        "#}
238        .trim();
239
240        assert_eq!(
241            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
242            expected
243        )
244    }
245
246    #[test]
247    fn test_item_table() {
248        let html = indoc! {r##"
249            <h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
250            <ul class="item-table">
251            <li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
252            <li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
253            <li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
254            <li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
255            <li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
256            <h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
257            <ul class="item-table">
258            <li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
259            </ul>
260        "##};
261        let expected = indoc! {r#"
262            ## Structs
263
264            - `Error`: Errors that can happen when using axum.
265            - `Extension`: Extractor and response for extensions.
266            - `Form` [`form`]: URL encoded extractor and response.
267            - `Json` [`json`]: JSON Extractor / Response.
268            - `Router`: The router type for composing handlers and services.
269
270            ## Functions
271
272            - `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
273        "#}
274        .trim();
275
276        assert_eq!(
277            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
278            expected
279        )
280    }
281
282    #[test]
283    fn test_table() {
284        let html = indoc! {r##"
285            <h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
286            <p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
287            optional dependencies.</p>
288            <p>The following optional features are available:</p>
289            <div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
290            <tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
291            <tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
292            <tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
293            <tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
294            <tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
295            <tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
296            <tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
297            <tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
298            <tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
299            <tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
300            <tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
301            <tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
302            <tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
303            </tbody></table>
304        "##};
305        let expected = indoc! {r#"
306            ## Feature flags
307
308            axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
309
310            The following optional features are available:
311
312            | Name | Description | Default? |
313            | --- | --- | --- |
314            | `http1` | Enables hyper’s `http1` feature | Yes |
315            | `http2` | Enables hyper’s `http2` feature | No |
316            | `json` | Enables the `Json` type and some similar convenience functionality | Yes |
317            | `macros` | Enables optional utility macros | No |
318            | `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
319            | `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
320            | `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
321            | `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
322            | `tower-log` | Enables `tower`’s `log` feature | Yes |
323            | `tracing` | Log rejections from built-in extractors | Yes |
324            | `ws` | Enables WebSockets support via `extract::ws` | No |
325            | `form` | Enables the `Form` extractor | Yes |
326            | `query` | Enables the `Query` extractor | Yes |
327        "#}
328        .trim();
329
330        assert_eq!(
331            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
332            expected
333        )
334    }
335}