html_to_markdown.rs

  1//! Provides conversion from rustdoc's HTML output to Markdown.
  2
  3#![deny(missing_docs)]
  4
  5mod html_element;
  6mod markdown;
  7mod markdown_writer;
  8mod structure;
  9
 10use std::io::Read;
 11
 12use anyhow::{Context, Result};
 13use html5ever::driver::ParseOpts;
 14use html5ever::parse_document;
 15use html5ever::tendril::TendrilSink;
 16use html5ever::tree_builder::TreeBuilderOpts;
 17use markup5ever_rcdom::RcDom;
 18
 19use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler};
 20use crate::markdown_writer::{HandleTag, MarkdownWriter};
 21
 22/// Converts the provided HTML to Markdown.
 23pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
 24    let dom = parse_html(html).context("failed to parse HTML")?;
 25
 26    let handlers: Vec<Box<dyn HandleTag>> = vec![
 27        Box::new(ParagraphHandler),
 28        Box::new(HeadingHandler),
 29        Box::new(ListHandler),
 30        Box::new(StyledTextHandler),
 31        Box::new(structure::rustdoc::RustdocChromeRemover),
 32        Box::new(structure::rustdoc::RustdocHeadingHandler),
 33        Box::new(structure::rustdoc::RustdocCodeHandler),
 34        Box::new(structure::rustdoc::RustdocTableHandler::new()),
 35        Box::new(structure::rustdoc::RustdocItemHandler),
 36    ];
 37
 38    let markdown_writer = MarkdownWriter::new();
 39    let markdown = markdown_writer
 40        .run(&dom.document, handlers)
 41        .context("failed to convert HTML to Markdown")?;
 42
 43    Ok(markdown)
 44}
 45
 46/// Converts the provided rustdoc HTML to Markdown.
 47pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
 48    let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
 49
 50    let handlers: Vec<Box<dyn HandleTag>> = vec![
 51        Box::new(ParagraphHandler),
 52        Box::new(HeadingHandler),
 53        Box::new(ListHandler),
 54        Box::new(StyledTextHandler),
 55        Box::new(structure::rustdoc::RustdocChromeRemover),
 56        Box::new(structure::rustdoc::RustdocHeadingHandler),
 57        Box::new(structure::rustdoc::RustdocCodeHandler),
 58        Box::new(structure::rustdoc::RustdocTableHandler::new()),
 59        Box::new(structure::rustdoc::RustdocItemHandler),
 60    ];
 61
 62    let markdown_writer = MarkdownWriter::new();
 63    let markdown = markdown_writer
 64        .run(&dom.document, handlers)
 65        .context("failed to convert rustdoc HTML to Markdown")?;
 66
 67    Ok(markdown)
 68}
 69
 70fn parse_html(mut html: impl Read) -> Result<RcDom> {
 71    let parse_options = ParseOpts {
 72        tree_builder: TreeBuilderOpts {
 73            drop_doctype: true,
 74            ..Default::default()
 75        },
 76        ..Default::default()
 77    };
 78    let dom = parse_document(RcDom::default(), parse_options)
 79        .from_utf8()
 80        .read_from(&mut html)
 81        .context("failed to parse HTML document")?;
 82
 83    Ok(dom)
 84}
 85
 86#[cfg(test)]
 87mod tests {
 88    use indoc::indoc;
 89    use pretty_assertions::assert_eq;
 90
 91    use super::*;
 92
 93    #[test]
 94    fn test_main_heading_buttons_get_removed() {
 95        let html = indoc! {r##"
 96            <div class="main-heading">
 97                <h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
 98                <span class="out-of-band">
 99                    <a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
100                </span>
101            </div>
102        "##};
103        let expected = indoc! {"
104            # Crate serde
105        "}
106        .trim();
107
108        assert_eq!(
109            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
110            expected
111        )
112    }
113
114    #[test]
115    fn test_single_paragraph() {
116        let html = indoc! {r#"
117            <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
118            <code>axum</code> doesn’t have its own middleware system but instead uses
119            <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
120            authorization, and more, for free. It also enables you to share middleware with
121            applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
122        "#};
123        let expected = indoc! {"
124            In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
125        "}
126        .trim();
127
128        assert_eq!(
129            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
130            expected
131        )
132    }
133
134    #[test]
135    fn test_multiple_paragraphs() {
136        let html = indoc! {r##"
137            <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
138            <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
139            structures efficiently and generically.</p>
140            <p>The Serde ecosystem consists of data structures that know how to serialize
141            and deserialize themselves along with data formats that know how to
142            serialize and deserialize other things. Serde provides the layer by which
143            these two groups interact with each other, allowing any supported data
144            structure to be serialized and deserialized using any supported data format.</p>
145            <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
146            usage examples.</p>
147            <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
148            <p>Where many other languages rely on runtime reflection for serializing data,
149            Serde is instead built on Rust’s powerful trait system. A data structure
150            that knows how to serialize and deserialize itself is one that implements
151            Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
152            attribute to automatically generate implementations at compile time). This
153            avoids any overhead of reflection or runtime type information. In fact in
154            many situations the interaction between data structure and data format can
155            be completely optimized away by the Rust compiler, leaving Serde
156            serialization to perform the same speed as a handwritten serializer for the
157            specific selection of data structure and data format.</p>
158        "##};
159        let expected = indoc! {"
160            ## Serde
161
162            Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
163
164            The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
165
166            See the Serde website https://serde.rs/ for additional documentation and usage examples.
167
168            ### Design
169
170            Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
171        "}
172        .trim();
173
174        assert_eq!(
175            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
176            expected
177        )
178    }
179
180    #[test]
181    fn test_styled_text() {
182        let html = indoc! {r#"
183            <p>This text is <strong>bolded</strong>.</p>
184            <p>This text is <em>italicized</em>.</p>
185        "#};
186        let expected = indoc! {"
187            This text is **bolded**.
188
189            This text is _italicized_.
190        "}
191        .trim();
192
193        assert_eq!(
194            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
195            expected
196        )
197    }
198
199    #[test]
200    fn test_rust_code_block() {
201        let html = indoc! {r#"
202            <pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
203            <span class="kw">use </span>std::collections::HashMap;
204
205            <span class="comment">// `Path` gives you the path parameters and deserializes them.
206            </span><span class="kw">async fn </span>path(Path(user_id): Path&lt;u32&gt;) {}
207
208            <span class="comment">// `Query` gives you the query parameters and deserializes them.
209            </span><span class="kw">async fn </span>query(Query(params): Query&lt;HashMap&lt;String, String&gt;&gt;) {}
210
211            <span class="comment">// Buffer the request body and deserialize it as JSON into a
212            // `serde_json::Value`. `Json` supports any type that implements
213            // `serde::Deserialize`.
214            </span><span class="kw">async fn </span>json(Json(payload): Json&lt;serde_json::Value&gt;) {}</code></pre>
215        "#};
216        let expected = indoc! {"
217            ```rs
218            use axum::extract::{Path, Query, Json};
219            use std::collections::HashMap;
220
221            // `Path` gives you the path parameters and deserializes them.
222            async fn path(Path(user_id): Path<u32>) {}
223
224            // `Query` gives you the query parameters and deserializes them.
225            async fn query(Query(params): Query<HashMap<String, String>>) {}
226
227            // Buffer the request body and deserialize it as JSON into a
228            // `serde_json::Value`. `Json` supports any type that implements
229            // `serde::Deserialize`.
230            async fn json(Json(payload): Json<serde_json::Value>) {}
231            ```
232        "}
233        .trim();
234
235        assert_eq!(
236            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
237            expected
238        )
239    }
240
241    #[test]
242    fn test_toml_code_block() {
243        let html = indoc! {r##"
244            <h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
245            <p>To use axum there are a few dependencies you have to pull in as well:</p>
246            <div class="example-wrap"><pre class="language-toml"><code>[dependencies]
247            axum = &quot;&lt;latest-version&gt;&quot;
248            tokio = { version = &quot;&lt;latest-version&gt;&quot;, features = [&quot;full&quot;] }
249            tower = &quot;&lt;latest-version&gt;&quot;
250            </code></pre></div>
251        "##};
252        let expected = indoc! {r#"
253            ## Required dependencies
254
255            To use axum there are a few dependencies you have to pull in as well:
256
257            ```toml
258            [dependencies]
259            axum = "<latest-version>"
260            tokio = { version = "<latest-version>", features = ["full"] }
261            tower = "<latest-version>"
262
263            ```
264        "#}
265        .trim();
266
267        assert_eq!(
268            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
269            expected
270        )
271    }
272
273    #[test]
274    fn test_item_table() {
275        let html = indoc! {r##"
276            <h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
277            <ul class="item-table">
278            <li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
279            <li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
280            <li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
281            <li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
282            <li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
283            <h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
284            <ul class="item-table">
285            <li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
286            </ul>
287        "##};
288        let expected = indoc! {r#"
289            ## Structs
290
291            - `Error`: Errors that can happen when using axum.
292            - `Extension`: Extractor and response for extensions.
293            - `Form` [`form`]: URL encoded extractor and response.
294            - `Json` [`json`]: JSON Extractor / Response.
295            - `Router`: The router type for composing handlers and services.
296
297            ## Functions
298
299            - `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
300        "#}
301        .trim();
302
303        assert_eq!(
304            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
305            expected
306        )
307    }
308
309    #[test]
310    fn test_table() {
311        let html = indoc! {r##"
312            <h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
313            <p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
314            optional dependencies.</p>
315            <p>The following optional features are available:</p>
316            <div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
317            <tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
318            <tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
319            <tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
320            <tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
321            <tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
322            <tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
323            <tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
324            <tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
325            <tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
326            <tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
327            <tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
328            <tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
329            <tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
330            </tbody></table>
331        "##};
332        let expected = indoc! {r#"
333            ## Feature flags
334
335            axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
336
337            The following optional features are available:
338
339            | Name | Description | Default? |
340            | --- | --- | --- |
341            | `http1` | Enables hyper’s `http1` feature | Yes |
342            | `http2` | Enables hyper’s `http2` feature | No |
343            | `json` | Enables the `Json` type and some similar convenience functionality | Yes |
344            | `macros` | Enables optional utility macros | No |
345            | `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
346            | `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
347            | `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
348            | `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
349            | `tower-log` | Enables `tower`’s `log` feature | Yes |
350            | `tracing` | Log rejections from built-in extractors | Yes |
351            | `ws` | Enables WebSockets support via `extract::ws` | No |
352            | `form` | Enables the `Form` extractor | Yes |
353            | `query` | Enables the `Query` extractor | Yes |
354        "#}
355        .trim();
356
357        assert_eq!(
358            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
359            expected
360        )
361    }
362}