html.rs

  1use anyhow::Result;
  2use html_to_markdown::markdown::{
  3    CodeHandler, HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
  4    WebpageChromeRemover,
  5};
  6use html_to_markdown::{TagHandler, convert_html_to_markdown};
  7use std::cell::RefCell;
  8use std::rc::Rc;
  9
 10/// Convert HTML to Markdown for rendering in the REPL.
 11pub fn html_to_markdown(html: &str) -> Result<String> {
 12    let mut handlers: Vec<TagHandler> = vec![
 13        // WebpageChromeRemover must come first to skip style, script, head, nav tags
 14        Rc::new(RefCell::new(WebpageChromeRemover)),
 15        Rc::new(RefCell::new(ParagraphHandler)),
 16        Rc::new(RefCell::new(HeadingHandler)),
 17        Rc::new(RefCell::new(ListHandler)),
 18        Rc::new(RefCell::new(TableHandler::new())),
 19        Rc::new(RefCell::new(StyledTextHandler)),
 20        Rc::new(RefCell::new(CodeHandler)),
 21    ];
 22
 23    let markdown = convert_html_to_markdown(html.as_bytes(), &mut handlers)?;
 24    Ok(clean_markdown_tables(&markdown))
 25}
 26
 27/// Clean up markdown table formatting and ensure tables have separator rows.
 28fn clean_markdown_tables(markdown: &str) -> String {
 29    let lines: Vec<&str> = markdown.lines().collect();
 30    let mut result: Vec<String> = Vec::new();
 31    let mut in_table = false;
 32    let mut has_separator = false;
 33
 34    for (i, line) in lines.iter().enumerate() {
 35        let trimmed = line.trim();
 36
 37        if trimmed.starts_with('|') {
 38            let normalized = normalize_table_row(trimmed);
 39
 40            if !in_table {
 41                // Starting a new table
 42                in_table = true;
 43                has_separator = false;
 44            }
 45
 46            // Check if this line is a separator row
 47            if trimmed.contains("---") {
 48                has_separator = true;
 49            }
 50
 51            result.push(normalized.clone());
 52
 53            // If this is the first row and no separator exists yet,
 54            // check if next row is a table row (not separator) and add one
 55            if !has_separator {
 56                let next_is_table_row = i + 1 < lines.len()
 57                    && lines[i + 1].trim().starts_with('|')
 58                    && !lines[i + 1].contains("---");
 59
 60                if next_is_table_row {
 61                    // Insert separator after first row
 62                    let col_count = normalized.matches('|').count().saturating_sub(1);
 63                    if col_count > 0 {
 64                        let separator = (0..col_count)
 65                            .map(|_| "---")
 66                            .collect::<Vec<_>>()
 67                            .join(" | ");
 68                        result.push(format!("| {} |", separator));
 69                        has_separator = true;
 70                    }
 71                }
 72            }
 73        } else {
 74            // Not a table row
 75            if !trimmed.is_empty() {
 76                result.push(trimmed.to_string());
 77            }
 78            in_table = false;
 79            has_separator = false;
 80        }
 81    }
 82
 83    result.join("\n")
 84}
 85
 86/// Normalize a table row by trimming cells and ensuring consistent spacing.
 87fn normalize_table_row(row: &str) -> String {
 88    let parts: Vec<&str> = row.split('|').collect();
 89    let normalized: Vec<String> = parts.iter().map(|cell| cell.trim().to_string()).collect();
 90    normalized.join(" | ").trim().to_string()
 91}
 92
 93#[cfg(test)]
 94mod tests {
 95    use super::*;
 96
 97    #[test]
 98    fn test_html_table_to_markdown() {
 99        let html = r#"<table>
100            <thead><tr><th>A</th><th>B</th></tr></thead>
101            <tbody><tr><td>1</td><td>x</td></tr></tbody>
102        </table>"#;
103
104        let md = html_to_markdown(html).unwrap();
105        assert!(md.contains("|"));
106        assert!(md.contains("---"));
107    }
108
109    #[test]
110    fn test_html_with_headings() {
111        let html = "<h1>Title</h1><p>Content</p>";
112        let md = html_to_markdown(html).unwrap();
113        assert!(md.contains("# Title"));
114    }
115
116    #[test]
117    fn test_pandas_dataframe_html() {
118        let html = r#"<table border="1" class="dataframe">
119            <thead><tr><th></th><th>A</th><th>B</th></tr></thead>
120            <tbody>
121                <tr><th>0</th><td>1</td><td>x</td></tr>
122                <tr><th>1</th><td>2</td><td>y</td></tr>
123            </tbody>
124        </table>"#;
125
126        let md = html_to_markdown(html).unwrap();
127        assert!(md.contains("|"));
128        // Verify table rows are properly formatted (start with |)
129        for line in md.lines() {
130            if line.contains("|") {
131                assert!(
132                    line.starts_with("|"),
133                    "Table line should start with |: {:?}",
134                    line
135                );
136            }
137        }
138    }
139
140    #[test]
141    fn test_table_format_normalized() {
142        let html = r#"<table>
143  <thead>
144    <tr><th>Name</th><th>Age</th></tr>
145  </thead>
146  <tbody>
147    <tr><td>Alice</td><td>25</td></tr>
148  </tbody>
149</table>"#;
150
151        let md = html_to_markdown(html).unwrap();
152
153        // Should have clean table format
154        assert!(md.contains("| Name | Age |"));
155        assert!(md.contains("| --- | --- |"));
156        assert!(md.contains("| Alice | 25 |"));
157    }
158
159    #[test]
160    fn test_style_tags_are_filtered() {
161        let html = r#"<style>
162            .dataframe { border: 1px solid; }
163        </style>
164        <table>
165            <thead><tr><th>A</th></tr></thead>
166            <tbody><tr><td>1</td></tr></tbody>
167        </table>"#;
168
169        let md = html_to_markdown(html).unwrap();
170
171        // Style content should not appear in output
172        assert!(!md.contains("dataframe"));
173        assert!(!md.contains("border"));
174        // Table should still be present
175        assert!(md.contains("| A |"));
176    }
177
178    #[test]
179    fn test_table_without_thead() {
180        // Tables without <thead> should still get a separator row
181        let html = r#"<table>
182            <tr><th>Feature</th><th>Supported</th></tr>
183            <tr><td>Tables</td><td>✓</td></tr>
184            <tr><td>Lists</td><td>✓</td></tr>
185        </table>"#;
186
187        let md = html_to_markdown(html).unwrap();
188
189        // Should have separator row inserted after first row
190        assert!(
191            md.contains("| --- | --- |"),
192            "Missing separator row: {}",
193            md
194        );
195        assert!(md.contains("| Feature | Supported |"));
196        assert!(md.contains("| Tables | ✓ |"));
197    }
198}