html_to_markdown.rs

  1//! Provides conversion from rustdoc's HTML output to Markdown.
  2
  3mod html_element;
  4pub mod markdown;
  5mod markdown_writer;
  6pub mod structure;
  7
  8use std::io::Read;
  9
 10use anyhow::{Context, Result};
 11use html5ever::driver::ParseOpts;
 12use html5ever::parse_document;
 13use html5ever::tendril::TendrilSink;
 14use html5ever::tree_builder::TreeBuilderOpts;
 15use markup5ever_rcdom::RcDom;
 16
 17use crate::markdown::{
 18    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
 19};
 20use crate::markdown_writer::MarkdownWriter;
 21
 22pub use crate::markdown_writer::HandleTag;
 23
 24/// Converts the provided HTML to Markdown.
 25pub fn convert_html_to_markdown(
 26    html: impl Read,
 27    handlers: Vec<Box<dyn HandleTag>>,
 28) -> Result<String> {
 29    let dom = parse_html(html).context("failed to parse HTML")?;
 30
 31    let markdown_writer = MarkdownWriter::new();
 32    let markdown = markdown_writer
 33        .run(&dom.document, handlers)
 34        .context("failed to convert HTML to Markdown")?;
 35
 36    Ok(markdown)
 37}
 38
 39/// Converts the provided rustdoc HTML to Markdown.
 40pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
 41    convert_html_to_markdown(
 42        html,
 43        vec![
 44            Box::new(ParagraphHandler),
 45            Box::new(HeadingHandler),
 46            Box::new(ListHandler),
 47            Box::new(TableHandler::new()),
 48            Box::new(StyledTextHandler),
 49            Box::new(structure::rustdoc::RustdocChromeRemover),
 50            Box::new(structure::rustdoc::RustdocHeadingHandler),
 51            Box::new(structure::rustdoc::RustdocCodeHandler),
 52            Box::new(structure::rustdoc::RustdocItemHandler),
 53        ],
 54    )
 55}
 56
 57fn parse_html(mut html: impl Read) -> Result<RcDom> {
 58    let parse_options = ParseOpts {
 59        tree_builder: TreeBuilderOpts {
 60            drop_doctype: true,
 61            ..Default::default()
 62        },
 63        ..Default::default()
 64    };
 65    let dom = parse_document(RcDom::default(), parse_options)
 66        .from_utf8()
 67        .read_from(&mut html)
 68        .context("failed to parse HTML document")?;
 69
 70    Ok(dom)
 71}
 72
 73#[cfg(test)]
 74mod tests {
 75    use indoc::indoc;
 76    use pretty_assertions::assert_eq;
 77
 78    use super::*;
 79
 80    #[test]
 81    fn test_main_heading_buttons_get_removed() {
 82        let html = indoc! {r##"
 83            <div class="main-heading">
 84                <h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
 85                <span class="out-of-band">
 86                    <a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
 87                </span>
 88            </div>
 89        "##};
 90        let expected = indoc! {"
 91            # Crate serde
 92        "}
 93        .trim();
 94
 95        assert_eq!(
 96            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
 97            expected
 98        )
 99    }
100
101    #[test]
102    fn test_single_paragraph() {
103        let html = indoc! {r#"
104            <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
105            <code>axum</code> doesn’t have its own middleware system but instead uses
106            <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
107            authorization, and more, for free. It also enables you to share middleware with
108            applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
109        "#};
110        let expected = indoc! {"
111            In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
112        "}
113        .trim();
114
115        assert_eq!(
116            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
117            expected
118        )
119    }
120
121    #[test]
122    fn test_multiple_paragraphs() {
123        let html = indoc! {r##"
124            <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
125            <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
126            structures efficiently and generically.</p>
127            <p>The Serde ecosystem consists of data structures that know how to serialize
128            and deserialize themselves along with data formats that know how to
129            serialize and deserialize other things. Serde provides the layer by which
130            these two groups interact with each other, allowing any supported data
131            structure to be serialized and deserialized using any supported data format.</p>
132            <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
133            usage examples.</p>
134            <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
135            <p>Where many other languages rely on runtime reflection for serializing data,
136            Serde is instead built on Rust’s powerful trait system. A data structure
137            that knows how to serialize and deserialize itself is one that implements
138            Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
139            attribute to automatically generate implementations at compile time). This
140            avoids any overhead of reflection or runtime type information. In fact in
141            many situations the interaction between data structure and data format can
142            be completely optimized away by the Rust compiler, leaving Serde
143            serialization to perform the same speed as a handwritten serializer for the
144            specific selection of data structure and data format.</p>
145        "##};
146        let expected = indoc! {"
147            ## Serde
148
149            Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
150
151            The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
152
153            See the Serde website https://serde.rs/ for additional documentation and usage examples.
154
155            ### Design
156
157            Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
158        "}
159        .trim();
160
161        assert_eq!(
162            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
163            expected
164        )
165    }
166
167    #[test]
168    fn test_styled_text() {
169        let html = indoc! {r#"
170            <p>This text is <strong>bolded</strong>.</p>
171            <p>This text is <em>italicized</em>.</p>
172        "#};
173        let expected = indoc! {"
174            This text is **bolded**.
175
176            This text is _italicized_.
177        "}
178        .trim();
179
180        assert_eq!(
181            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
182            expected
183        )
184    }
185
186    #[test]
187    fn test_rust_code_block() {
188        let html = indoc! {r#"
189            <pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
190            <span class="kw">use </span>std::collections::HashMap;
191
192            <span class="comment">// `Path` gives you the path parameters and deserializes them.
193            </span><span class="kw">async fn </span>path(Path(user_id): Path&lt;u32&gt;) {}
194
195            <span class="comment">// `Query` gives you the query parameters and deserializes them.
196            </span><span class="kw">async fn </span>query(Query(params): Query&lt;HashMap&lt;String, String&gt;&gt;) {}
197
198            <span class="comment">// Buffer the request body and deserialize it as JSON into a
199            // `serde_json::Value`. `Json` supports any type that implements
200            // `serde::Deserialize`.
201            </span><span class="kw">async fn </span>json(Json(payload): Json&lt;serde_json::Value&gt;) {}</code></pre>
202        "#};
203        let expected = indoc! {"
204            ```rs
205            use axum::extract::{Path, Query, Json};
206            use std::collections::HashMap;
207
208            // `Path` gives you the path parameters and deserializes them.
209            async fn path(Path(user_id): Path<u32>) {}
210
211            // `Query` gives you the query parameters and deserializes them.
212            async fn query(Query(params): Query<HashMap<String, String>>) {}
213
214            // Buffer the request body and deserialize it as JSON into a
215            // `serde_json::Value`. `Json` supports any type that implements
216            // `serde::Deserialize`.
217            async fn json(Json(payload): Json<serde_json::Value>) {}
218            ```
219        "}
220        .trim();
221
222        assert_eq!(
223            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
224            expected
225        )
226    }
227
228    #[test]
229    fn test_toml_code_block() {
230        let html = indoc! {r##"
231            <h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
232            <p>To use axum there are a few dependencies you have to pull in as well:</p>
233            <div class="example-wrap"><pre class="language-toml"><code>[dependencies]
234            axum = &quot;&lt;latest-version&gt;&quot;
235            tokio = { version = &quot;&lt;latest-version&gt;&quot;, features = [&quot;full&quot;] }
236            tower = &quot;&lt;latest-version&gt;&quot;
237            </code></pre></div>
238        "##};
239        let expected = indoc! {r#"
240            ## Required dependencies
241
242            To use axum there are a few dependencies you have to pull in as well:
243
244            ```toml
245            [dependencies]
246            axum = "<latest-version>"
247            tokio = { version = "<latest-version>", features = ["full"] }
248            tower = "<latest-version>"
249
250            ```
251        "#}
252        .trim();
253
254        assert_eq!(
255            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
256            expected
257        )
258    }
259
260    #[test]
261    fn test_item_table() {
262        let html = indoc! {r##"
263            <h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
264            <ul class="item-table">
265            <li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
266            <li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
267            <li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
268            <li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
269            <li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
270            <h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
271            <ul class="item-table">
272            <li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
273            </ul>
274        "##};
275        let expected = indoc! {r#"
276            ## Structs
277
278            - `Error`: Errors that can happen when using axum.
279            - `Extension`: Extractor and response for extensions.
280            - `Form` [`form`]: URL encoded extractor and response.
281            - `Json` [`json`]: JSON Extractor / Response.
282            - `Router`: The router type for composing handlers and services.
283
284            ## Functions
285
286            - `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
287        "#}
288        .trim();
289
290        assert_eq!(
291            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
292            expected
293        )
294    }
295
296    #[test]
297    fn test_table() {
298        let html = indoc! {r##"
299            <h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
300            <p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
301            optional dependencies.</p>
302            <p>The following optional features are available:</p>
303            <div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
304            <tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
305            <tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
306            <tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
307            <tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
308            <tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
309            <tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
310            <tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
311            <tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
312            <tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
313            <tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
314            <tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
315            <tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
316            <tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
317            </tbody></table>
318        "##};
319        let expected = indoc! {r#"
320            ## Feature flags
321
322            axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
323
324            The following optional features are available:
325
326            | Name | Description | Default? |
327            | --- | --- | --- |
328            | `http1` | Enables hyper’s `http1` feature | Yes |
329            | `http2` | Enables hyper’s `http2` feature | No |
330            | `json` | Enables the `Json` type and some similar convenience functionality | Yes |
331            | `macros` | Enables optional utility macros | No |
332            | `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
333            | `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
334            | `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
335            | `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
336            | `tower-log` | Enables `tower`’s `log` feature | Yes |
337            | `tracing` | Log rejections from built-in extractors | Yes |
338            | `ws` | Enables WebSockets support via `extract::ws` | No |
339            | `form` | Enables the `Form` extractor | Yes |
340            | `query` | Enables the `Query` extractor | Yes |
341        "#}
342        .trim();
343
344        assert_eq!(
345            convert_rustdoc_to_markdown(html.as_bytes()).unwrap(),
346            expected
347        )
348    }
349}