html_to_markdown.rs

 1//! Convert HTML to Markdown.
 2
 3mod html_element;
 4pub mod markdown;
 5mod markdown_writer;
 6pub mod structure;
 7
 8use std::io::Read;
 9
10use anyhow::{Context as _, Result};
11use html5ever::driver::ParseOpts;
12use html5ever::parse_document;
13use html5ever::tendril::TendrilSink;
14use html5ever::tree_builder::TreeBuilderOpts;
15use markup5ever_rcdom::RcDom;
16
17pub use crate::html_element::*;
18pub use crate::markdown_writer::*;
19
20/// Converts the provided HTML to Markdown.
21pub fn convert_html_to_markdown(html: impl Read, handlers: &mut [TagHandler]) -> Result<String> {
22    let dom = parse_html(html).context("failed to parse HTML")?;
23
24    let markdown_writer = MarkdownWriter::new();
25    let markdown = markdown_writer
26        .run(&dom.document, handlers)
27        .context("failed to convert HTML to Markdown")?;
28
29    Ok(markdown)
30}
31
32fn parse_html(mut html: impl Read) -> Result<RcDom> {
33    let parse_options = ParseOpts {
34        tree_builder: TreeBuilderOpts {
35            drop_doctype: true,
36            ..Default::default()
37        },
38        ..Default::default()
39    };
40    let dom = parse_document(RcDom::default(), parse_options)
41        .from_utf8()
42        .read_from(&mut html)
43        .context("failed to parse HTML document")?;
44
45    Ok(dom)
46}