html_to_markdown.rs

  1//! Provides conversion from rustdoc's HTML output to Markdown.
  2
  3mod html_element;
  4pub mod markdown;
  5mod markdown_writer;
  6pub mod structure;
  7
  8use std::cell::RefCell;
  9use std::io::Read;
 10use std::rc::Rc;
 11
 12use anyhow::{Context, Result};
 13use html5ever::driver::ParseOpts;
 14use html5ever::parse_document;
 15use html5ever::tendril::TendrilSink;
 16use html5ever::tree_builder::TreeBuilderOpts;
 17use markup5ever_rcdom::RcDom;
 18
 19use crate::markdown::{
 20    HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
 21};
 22use crate::markdown_writer::MarkdownWriter;
 23
 24pub use crate::markdown_writer::{HandleTag, TagHandler};
 25use crate::structure::rustdoc::RustdocItem;
 26
 27/// Converts the provided HTML to Markdown.
 28pub fn convert_html_to_markdown(html: impl Read, handlers: &mut Vec<TagHandler>) -> Result<String> {
 29    let dom = parse_html(html).context("failed to parse HTML")?;
 30
 31    let markdown_writer = MarkdownWriter::new();
 32    let markdown = markdown_writer
 33        .run(&dom.document, handlers)
 34        .context("failed to convert HTML to Markdown")?;
 35
 36    Ok(markdown)
 37}
 38
 39/// Converts the provided rustdoc HTML to Markdown.
 40pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<RustdocItem>)> {
 41    let item_collector = Rc::new(RefCell::new(structure::rustdoc::RustdocItemCollector::new()));
 42
 43    let mut handlers: Vec<TagHandler> = vec![
 44        Rc::new(RefCell::new(ParagraphHandler)),
 45        Rc::new(RefCell::new(HeadingHandler)),
 46        Rc::new(RefCell::new(ListHandler)),
 47        Rc::new(RefCell::new(TableHandler::new())),
 48        Rc::new(RefCell::new(StyledTextHandler)),
 49        Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
 50        Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
 51        Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
 52        Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
 53        item_collector.clone(),
 54    ];
 55
 56    let markdown = convert_html_to_markdown(html, &mut handlers)?;
 57
 58    let items = item_collector
 59        .borrow()
 60        .items
 61        .values()
 62        .cloned()
 63        .collect::<Vec<_>>();
 64
 65    Ok((markdown, items))
 66}
 67
 68fn parse_html(mut html: impl Read) -> Result<RcDom> {
 69    let parse_options = ParseOpts {
 70        tree_builder: TreeBuilderOpts {
 71            drop_doctype: true,
 72            ..Default::default()
 73        },
 74        ..Default::default()
 75    };
 76    let dom = parse_document(RcDom::default(), parse_options)
 77        .from_utf8()
 78        .read_from(&mut html)
 79        .context("failed to parse HTML document")?;
 80
 81    Ok(dom)
 82}
 83
 84#[cfg(test)]
 85mod tests {
 86    use indoc::indoc;
 87    use pretty_assertions::assert_eq;
 88
 89    use super::*;
 90
 91    fn rustdoc_handlers() -> Vec<TagHandler> {
 92        vec![
 93            Rc::new(RefCell::new(ParagraphHandler)),
 94            Rc::new(RefCell::new(HeadingHandler)),
 95            Rc::new(RefCell::new(ListHandler)),
 96            Rc::new(RefCell::new(TableHandler::new())),
 97            Rc::new(RefCell::new(StyledTextHandler)),
 98            Rc::new(RefCell::new(structure::rustdoc::RustdocChromeRemover)),
 99            Rc::new(RefCell::new(structure::rustdoc::RustdocHeadingHandler)),
100            Rc::new(RefCell::new(structure::rustdoc::RustdocCodeHandler)),
101            Rc::new(RefCell::new(structure::rustdoc::RustdocItemHandler)),
102        ]
103    }
104
105    #[test]
106    fn test_main_heading_buttons_get_removed() {
107        let html = indoc! {r##"
108            <div class="main-heading">
109                <h1>Crate <a class="mod" href="#">serde</a><button id="copy-path" title="Copy item path to clipboard">Copy item path</button></h1>
110                <span class="out-of-band">
111                    <a class="src" href="../src/serde/lib.rs.html#1-340">source</a> · <button id="toggle-all-docs" title="collapse all docs">[<span>−</span>]</button>
112                </span>
113            </div>
114        "##};
115        let expected = indoc! {"
116            # Crate serde
117        "}
118        .trim();
119
120        assert_eq!(
121            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
122            expected
123        )
124    }
125
126    #[test]
127    fn test_single_paragraph() {
128        let html = indoc! {r#"
129            <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
130            <code>axum</code> doesn’t have its own middleware system but instead uses
131            <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
132            authorization, and more, for free. It also enables you to share middleware with
133            applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
134        "#};
135        let expected = indoc! {"
136            In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
137        "}
138        .trim();
139
140        assert_eq!(
141            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
142            expected
143        )
144    }
145
146    #[test]
147    fn test_multiple_paragraphs() {
148        let html = indoc! {r##"
149            <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
150            <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
151            structures efficiently and generically.</p>
152            <p>The Serde ecosystem consists of data structures that know how to serialize
153            and deserialize themselves along with data formats that know how to
154            serialize and deserialize other things. Serde provides the layer by which
155            these two groups interact with each other, allowing any supported data
156            structure to be serialized and deserialized using any supported data format.</p>
157            <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
158            usage examples.</p>
159            <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
160            <p>Where many other languages rely on runtime reflection for serializing data,
161            Serde is instead built on Rust’s powerful trait system. A data structure
162            that knows how to serialize and deserialize itself is one that implements
163            Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
164            attribute to automatically generate implementations at compile time). This
165            avoids any overhead of reflection or runtime type information. In fact in
166            many situations the interaction between data structure and data format can
167            be completely optimized away by the Rust compiler, leaving Serde
168            serialization to perform the same speed as a handwritten serializer for the
169            specific selection of data structure and data format.</p>
170        "##};
171        let expected = indoc! {"
172            ## Serde
173
174            Serde is a framework for _**ser**_ializing and _**de**_serializing Rust data structures efficiently and generically.
175
176            The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.
177
178            See the Serde website https://serde.rs/ for additional documentation and usage examples.
179
180            ### Design
181
182            Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
183        "}
184        .trim();
185
186        assert_eq!(
187            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
188            expected
189        )
190    }
191
192    #[test]
193    fn test_styled_text() {
194        let html = indoc! {r#"
195            <p>This text is <strong>bolded</strong>.</p>
196            <p>This text is <em>italicized</em>.</p>
197        "#};
198        let expected = indoc! {"
199            This text is **bolded**.
200
201            This text is _italicized_.
202        "}
203        .trim();
204
205        assert_eq!(
206            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
207            expected
208        )
209    }
210
211    #[test]
212    fn test_rust_code_block() {
213        let html = indoc! {r#"
214            <pre class="rust rust-example-rendered"><code><span class="kw">use </span>axum::extract::{Path, Query, Json};
215            <span class="kw">use </span>std::collections::HashMap;
216
217            <span class="comment">// `Path` gives you the path parameters and deserializes them.
218            </span><span class="kw">async fn </span>path(Path(user_id): Path&lt;u32&gt;) {}
219
220            <span class="comment">// `Query` gives you the query parameters and deserializes them.
221            </span><span class="kw">async fn </span>query(Query(params): Query&lt;HashMap&lt;String, String&gt;&gt;) {}
222
223            <span class="comment">// Buffer the request body and deserialize it as JSON into a
224            // `serde_json::Value`. `Json` supports any type that implements
225            // `serde::Deserialize`.
226            </span><span class="kw">async fn </span>json(Json(payload): Json&lt;serde_json::Value&gt;) {}</code></pre>
227        "#};
228        let expected = indoc! {"
229            ```rs
230            use axum::extract::{Path, Query, Json};
231            use std::collections::HashMap;
232
233            // `Path` gives you the path parameters and deserializes them.
234            async fn path(Path(user_id): Path<u32>) {}
235
236            // `Query` gives you the query parameters and deserializes them.
237            async fn query(Query(params): Query<HashMap<String, String>>) {}
238
239            // Buffer the request body and deserialize it as JSON into a
240            // `serde_json::Value`. `Json` supports any type that implements
241            // `serde::Deserialize`.
242            async fn json(Json(payload): Json<serde_json::Value>) {}
243            ```
244        "}
245        .trim();
246
247        assert_eq!(
248            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
249            expected
250        )
251    }
252
253    #[test]
254    fn test_toml_code_block() {
255        let html = indoc! {r##"
256            <h2 id="required-dependencies"><a class="doc-anchor" href="#required-dependencies">§</a>Required dependencies</h2>
257            <p>To use axum there are a few dependencies you have to pull in as well:</p>
258            <div class="example-wrap"><pre class="language-toml"><code>[dependencies]
259            axum = &quot;&lt;latest-version&gt;&quot;
260            tokio = { version = &quot;&lt;latest-version&gt;&quot;, features = [&quot;full&quot;] }
261            tower = &quot;&lt;latest-version&gt;&quot;
262            </code></pre></div>
263        "##};
264        let expected = indoc! {r#"
265            ## Required dependencies
266
267            To use axum there are a few dependencies you have to pull in as well:
268
269            ```toml
270            [dependencies]
271            axum = "<latest-version>"
272            tokio = { version = "<latest-version>", features = ["full"] }
273            tower = "<latest-version>"
274
275            ```
276        "#}
277        .trim();
278
279        assert_eq!(
280            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
281            expected
282        )
283    }
284
285    #[test]
286    fn test_item_table() {
287        let html = indoc! {r##"
288            <h2 id="structs" class="section-header">Structs<a href="#structs" class="anchor">§</a></h2>
289            <ul class="item-table">
290            <li><div class="item-name"><a class="struct" href="struct.Error.html" title="struct axum::Error">Error</a></div><div class="desc docblock-short">Errors that can happen when using axum.</div></li>
291            <li><div class="item-name"><a class="struct" href="struct.Extension.html" title="struct axum::Extension">Extension</a></div><div class="desc docblock-short">Extractor and response for extensions.</div></li>
292            <li><div class="item-name"><a class="struct" href="struct.Form.html" title="struct axum::Form">Form</a><span class="stab portability" title="Available on crate feature `form` only"><code>form</code></span></div><div class="desc docblock-short">URL encoded extractor and response.</div></li>
293            <li><div class="item-name"><a class="struct" href="struct.Json.html" title="struct axum::Json">Json</a><span class="stab portability" title="Available on crate feature `json` only"><code>json</code></span></div><div class="desc docblock-short">JSON Extractor / Response.</div></li>
294            <li><div class="item-name"><a class="struct" href="struct.Router.html" title="struct axum::Router">Router</a></div><div class="desc docblock-short">The router type for composing handlers and services.</div></li></ul>
295            <h2 id="functions" class="section-header">Functions<a href="#functions" class="anchor">§</a></h2>
296            <ul class="item-table">
297            <li><div class="item-name"><a class="fn" href="fn.serve.html" title="fn axum::serve">serve</a><span class="stab portability" title="Available on crate feature `tokio` and (crate features `http1` or `http2`) only"><code>tokio</code> and (<code>http1</code> or <code>http2</code>)</span></div><div class="desc docblock-short">Serve the service with the supplied listener.</div></li>
298            </ul>
299        "##};
300        let expected = indoc! {r#"
301            ## Structs
302
303            - `Error`: Errors that can happen when using axum.
304            - `Extension`: Extractor and response for extensions.
305            - `Form` [`form`]: URL encoded extractor and response.
306            - `Json` [`json`]: JSON Extractor / Response.
307            - `Router`: The router type for composing handlers and services.
308
309            ## Functions
310
311            - `serve` [`tokio` and (`http1` or `http2`)]: Serve the service with the supplied listener.
312        "#}
313        .trim();
314
315        assert_eq!(
316            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
317            expected
318        )
319    }
320
321    #[test]
322    fn test_table() {
323        let html = indoc! {r##"
324            <h2 id="feature-flags"><a class="doc-anchor" href="#feature-flags">§</a>Feature flags</h2>
325            <p>axum uses a set of <a href="https://doc.rust-lang.org/cargo/reference/features.html#the-features-section">feature flags</a> to reduce the amount of compiled and
326            optional dependencies.</p>
327            <p>The following optional features are available:</p>
328            <div><table><thead><tr><th>Name</th><th>Description</th><th>Default?</th></tr></thead><tbody>
329            <tr><td><code>http1</code></td><td>Enables hyper’s <code>http1</code> feature</td><td>Yes</td></tr>
330            <tr><td><code>http2</code></td><td>Enables hyper’s <code>http2</code> feature</td><td>No</td></tr>
331            <tr><td><code>json</code></td><td>Enables the <a href="struct.Json.html" title="struct axum::Json"><code>Json</code></a> type and some similar convenience functionality</td><td>Yes</td></tr>
332            <tr><td><code>macros</code></td><td>Enables optional utility macros</td><td>No</td></tr>
333            <tr><td><code>matched-path</code></td><td>Enables capturing of every request’s router path and the <a href="extract/struct.MatchedPath.html" title="struct axum::extract::MatchedPath"><code>MatchedPath</code></a> extractor</td><td>Yes</td></tr>
334            <tr><td><code>multipart</code></td><td>Enables parsing <code>multipart/form-data</code> requests with <a href="extract/struct.Multipart.html" title="struct axum::extract::Multipart"><code>Multipart</code></a></td><td>No</td></tr>
335            <tr><td><code>original-uri</code></td><td>Enables capturing of every request’s original URI and the <a href="extract/struct.OriginalUri.html" title="struct axum::extract::OriginalUri"><code>OriginalUri</code></a> extractor</td><td>Yes</td></tr>
336            <tr><td><code>tokio</code></td><td>Enables <code>tokio</code> as a dependency and <code>axum::serve</code>, <code>SSE</code> and <code>extract::connect_info</code> types.</td><td>Yes</td></tr>
337            <tr><td><code>tower-log</code></td><td>Enables <code>tower</code>’s <code>log</code> feature</td><td>Yes</td></tr>
338            <tr><td><code>tracing</code></td><td>Log rejections from built-in extractors</td><td>Yes</td></tr>
339            <tr><td><code>ws</code></td><td>Enables WebSockets support via <a href="extract/ws/index.html" title="mod axum::extract::ws"><code>extract::ws</code></a></td><td>No</td></tr>
340            <tr><td><code>form</code></td><td>Enables the <code>Form</code> extractor</td><td>Yes</td></tr>
341            <tr><td><code>query</code></td><td>Enables the <code>Query</code> extractor</td><td>Yes</td></tr>
342            </tbody></table>
343        "##};
344        let expected = indoc! {r#"
345            ## Feature flags
346
347            axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
348
349            The following optional features are available:
350
351            | Name | Description | Default? |
352            | --- | --- | --- |
353            | `http1` | Enables hyper’s `http1` feature | Yes |
354            | `http2` | Enables hyper’s `http2` feature | No |
355            | `json` | Enables the `Json` type and some similar convenience functionality | Yes |
356            | `macros` | Enables optional utility macros | No |
357            | `matched-path` | Enables capturing of every request’s router path and the `MatchedPath` extractor | Yes |
358            | `multipart` | Enables parsing `multipart/form-data` requests with `Multipart` | No |
359            | `original-uri` | Enables capturing of every request’s original URI and the `OriginalUri` extractor | Yes |
360            | `tokio` | Enables `tokio` as a dependency and `axum::serve`, `SSE` and `extract::connect_info` types. | Yes |
361            | `tower-log` | Enables `tower`’s `log` feature | Yes |
362            | `tracing` | Log rejections from built-in extractors | Yes |
363            | `ws` | Enables WebSockets support via `extract::ws` | No |
364            | `form` | Enables the `Form` extractor | Yes |
365            | `query` | Enables the `Query` extractor | Yes |
366        "#}
367        .trim();
368
369        assert_eq!(
370            convert_html_to_markdown(html.as_bytes(), &mut rustdoc_handlers()).unwrap(),
371            expected
372        )
373    }
374}