html_to_markdown.rs

  1//! Provides conversion from rustdoc's HTML output to Markdown.
  2
  3#![deny(missing_docs)]
  4
  5mod html_element;
  6mod markdown;
  7mod markdown_writer;
  8mod structure;
  9
 10use std::io::Read;
 11
 12use anyhow::{Context, Result};
 13use html5ever::driver::ParseOpts;
 14use html5ever::parse_document;
 15use html5ever::tendril::TendrilSink;
 16use html5ever::tree_builder::TreeBuilderOpts;
 17use markup5ever_rcdom::RcDom;
 18
 19use crate::markdown::{
 20    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
 21};
 22use crate::markdown_writer::{HandleTag, MarkdownWriter};
 23
 24/// Converts the provided HTML to Markdown.
 25pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
 26    let dom = parse_html(html).context("failed to parse HTML")?;
 27
 28    let handlers: Vec<Box<dyn HandleTag>> = vec![
 29        Box::new(ParagraphHandler),
 30        Box::new(HeadingHandler),
 31        Box::new(ListHandler),
 32        Box::new(TableHandler::new()),
 33        Box::new(StyledTextHandler),
 34        Box::new(structure::rustdoc::RustdocChromeRemover),
 35        Box::new(structure::rustdoc::RustdocHeadingHandler),
 36        Box::new(structure::rustdoc::RustdocCodeHandler),
 37        Box::new(structure::rustdoc::RustdocItemHandler),
 38    ];
 39
 40    let markdown_writer = MarkdownWriter::new();
 41    let markdown = markdown_writer
 42        .run(&dom.document, handlers)
 43        .context("failed to convert HTML to Markdown")?;
 44
 45    Ok(markdown)
 46}
 47
 48/// Converts the provided rustdoc HTML to Markdown.
 49pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
 50    let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
 51
 52    let handlers: Vec<Box<dyn HandleTag>> = vec![
 53        Box::new(ParagraphHandler),
 54        Box::new(HeadingHandler),
 55        Box::new(ListHandler),
 56        Box::new(TableHandler::new()),
 57        Box::new(StyledTextHandler),
 58        Box::new(structure::rustdoc::RustdocChromeRemover),
 59        Box::new(structure::rustdoc::RustdocHeadingHandler),
 60        Box::new(structure::rustdoc::RustdocCodeHandler),
 61        Box::new(structure::rustdoc::RustdocItemHandler),
 62    ];
 63
 64    let markdown_writer = MarkdownWriter::new();
 65    let markdown = markdown_writer
 66        .run(&dom.document, handlers)
 67        .context("failed to convert rustdoc HTML to Markdown")?;
 68
 69    Ok(markdown)
 70}
 71
 72fn parse_html(mut html: impl Read) -> Result<RcDom> {
 73    let parse_options = ParseOpts {
 74        tree_builder: TreeBuilderOpts {
 75            drop_doctype: true,
 76            ..Default::default()
 77        },
 78        ..Default::default()
 79    };
 80    let dom = parse_document(RcDom::default(), parse_options)
 81        .from_utf8()
 82        .read_from(&mut html)
 83        .context("failed to parse HTML document")?;
 84
 85    Ok(dom)
 86}
 87
 88#[cfg(test)]
 89mod tests {
 90    use indoc::indoc;
 91    use pretty_assertions::assert_eq;
 92
 93    use super::*;
 94
 95    #[test]
 96    fn test_main_heading_buttons_get_removed() {
 97        let html = indoc! {r##"
 98            <div class="main-heading">
 99                <h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
100                <span class="out-of-band">
101                    <a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
102                </span>
103            </div>
104        "##};
105        let expected = indoc! {"
106            # Crate serde
107        "}
108        .trim();
109
110        assert_eq!(
111            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
112            expected
113        )
114    }
115
116    #[test]
117    fn test_single_paragraph() {
118        let html = indoc! {r#"
119            <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
120            <code>axum</code> doesn’t have its own middleware system but instead uses
121            <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
122            authorization, and more, for free. It also enables you to share middleware with
123            applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
124        "#};
125        let expected = indoc! {"
126            In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
127        "}
128        .trim();
129
130        assert_eq!(
131            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
132            expected
133        )
134    }
135
136    #[test]
137    fn test_multiple_paragraphs() {
138        let html = indoc! {r##"
139            <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
140            <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
141            structures efficiently and generically.</p>
142            <p>The Serde ecosystem consists of data structures that know how to serialize
143            and deserialize themselves along with data formats that know how to
144            serialize and deserialize other things. Serde provides the layer by which
145            these two groups interact with each other, allowing any supported data
146            structure to be serialized and deserialized using any supported data format.</p>
147            <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
148            usage examples.</p>
149            <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
150            <p>Where many other languages rely on runtime reflection for serializing data,
151            Serde is instead built on Rust’s powerful trait system. A data structure
152            that knows how to serialize and deserialize itself is one that implements
153            Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
154            attribute to automatically generate implementations at compile time). This
155            avoids any overhead of reflection or runtime type information. In fact in
156            many situations the interaction between data structure and data format can
157            be completely optimized away by the Rust compiler, leaving Serde
158            serialization to perform the same speed as a handwritten serializer for the
159            specific selection of data structure and data format.</p>
160        "##};
161        let expected = indoc! {"
162            ## Serde
163
164            Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
165
166            The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
167
168            See the Serde website https://serde.rs/ for additional documentation and usage examples.
169
170            ### Design
171
172            Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
173        "}
174        .trim();
175
176        assert_eq!(
177            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
178            expected
179        )
180    }
181
182    #[test]
183    fn test_styled_text() {
184        let html = indoc! {r#"
185            <p>This text is <strong>bolded</strong>.</p>
186            <p>This text is <em>italicized</em>.</p>
187        "#};
188        let expected = indoc! {"
189            This text is **bolded**.
190
191            This text is _italicized_.
192        "}
193        .trim();
194
195        assert_eq!(
196            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
197            expected
198        )
199    }
200
201    #[test]
202    fn test_rust_code_block() {
203        let html = indoc! {r#"
204            <pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
205            <span class="kw">use </span>std::collections::HashMap;
206
207            <span class="comment">// `Path` gives you the path parameters and deserializes them.
208            </span><span class="kw">async fn </span>path(Path(user_id): Path&lt;u32&gt;) {}
209
210            <span class="comment">// `Query` gives you the query parameters and deserializes them.
211            </span><span class="kw">async fn </span>query(Query(params): Query&lt;HashMap&lt;String, String&gt;&gt;) {}
212
213            <span class="comment">// Buffer the request body and deserialize it as JSON into a
214            // `serde_json::Value`. `Json` supports any type that implements
215            // `serde::Deserialize`.
216            </span><span class="kw">async fn </span>json(Json(payload): Json&lt;serde_json::Value&gt;) {}</code></pre>
217        "#};
218        let expected = indoc! {"
219            ```rs
220            use axum::extract::{Path, Query, Json};
221            use std::collections::HashMap;
222
223            // `Path` gives you the path parameters and deserializes them.
224            async fn path(Path(user_id): Path<u32>) {}
225
226            // `Query` gives you the query parameters and deserializes them.
227            async fn query(Query(params): Query<HashMap<String, String>>) {}
228
229            // Buffer the request body and deserialize it as JSON into a
230            // `serde_json::Value`. `Json` supports any type that implements
231            // `serde::Deserialize`.
232            async fn json(Json(payload): Json<serde_json::Value>) {}
233            ```
234        "}
235        .trim();
236
237        assert_eq!(
238            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
239            expected
240        )
241    }
242
243    #[test]
244    fn test_toml_code_block() {
245        let html = indoc! {r##"
246            <h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
247            <p>To use axum there are a few dependencies you have to pull in as well:</p>
248            <div class="example-wrap"><pre class="language-toml"><code>[dependencies]
249            axum = &quot;&lt;latest-version&gt;&quot;
250            tokio = { version = &quot;&lt;latest-version&gt;&quot;, features = [&quot;full&quot;] }
251            tower = &quot;&lt;latest-version&gt;&quot;
252            </code></pre></div>
253        "##};
254        let expected = indoc! {r#"
255            ## Required dependencies
256
257            To use axum there are a few dependencies you have to pull in as well:
258
259            ```toml
260            [dependencies]
261            axum = "<latest-version>"
262            tokio = { version = "<latest-version>", features = ["full"] }
263            tower = "<latest-version>"
264
265            ```
266        "#}
267        .trim();
268
269        assert_eq!(
270            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
271            expected
272        )
273    }
274
275    #[test]
276    fn test_item_table() {
277        let html = indoc! {r##"
278            <h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
279            <ul class="item-table">
280            <li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
281            <li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
282            <li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
283            <li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
284            <li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
285            <h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
286            <ul class="item-table">
287            <li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
288            </ul>
289        "##};
290        let expected = indoc! {r#"
291            ## Structs
292
293            - `Error`: Errors that can happen when using axum.
294            - `Extension`: Extractor and response for extensions.
295            - `Form` [`form`]: URL encoded extractor and response.
296            - `Json` [`json`]: JSON Extractor / Response.
297            - `Router`: The router type for composing handlers and services.
298
299            ## Functions
300
301            - `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
302        "#}
303        .trim();
304
305        assert_eq!(
306            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
307            expected
308        )
309    }
310
311    #[test]
312    fn test_table() {
313        let html = indoc! {r##"
314            <h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
315            <p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
316            optional dependencies.</p>
317            <p>The following optional features are available:</p>
318            <div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
319            <tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
320            <tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
321            <tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
322            <tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
323            <tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
324            <tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
325            <tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
326            <tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
327            <tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
328            <tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
329            <tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
330            <tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
331            <tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
332            </tbody></table>
333        "##};
334        let expected = indoc! {r#"
335            ## Feature flags
336
337            axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
338
339            The following optional features are available:
340
341            | Name | Description | Default? |
342            | --- | --- | --- |
343            | `http1` | Enables hyper’s `http1` feature | Yes |
344            | `http2` | Enables hyper’s `http2` feature | No |
345            | `json` | Enables the `Json` type and some similar convenience functionality | Yes |
346            | `macros` | Enables optional utility macros | No |
347            | `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
348            | `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
349            | `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
350            | `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
351            | `tower-log` | Enables `tower`’s `log` feature | Yes |
352            | `tracing` | Log rejections from built-in extractors | Yes |
353            | `ws` | Enables WebSockets support via `extract::ws` | No |
354            | `form` | Enables the `Form` extractor | Yes |
355            | `query` | Enables the `Query` extractor | Yes |
356        "#}
357        .trim();
358
359        assert_eq!(
360            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
361            expected
362        )
363    }
364}