1use anyhow::Result;
2use html_to_markdown::markdown::{
3 CodeHandler, HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
4 WebpageChromeRemover,
5};
6use html_to_markdown::{TagHandler, convert_html_to_markdown};
7use std::cell::RefCell;
8use std::rc::Rc;
9
10/// Convert HTML to Markdown for rendering in the REPL.
11pub fn html_to_markdown(html: &str) -> Result<String> {
12 let mut handlers: Vec<TagHandler> = vec![
13 // WebpageChromeRemover must come first to skip style, script, head, nav tags
14 Rc::new(RefCell::new(WebpageChromeRemover)),
15 Rc::new(RefCell::new(ParagraphHandler)),
16 Rc::new(RefCell::new(HeadingHandler)),
17 Rc::new(RefCell::new(ListHandler)),
18 Rc::new(RefCell::new(TableHandler::new())),
19 Rc::new(RefCell::new(StyledTextHandler)),
20 Rc::new(RefCell::new(CodeHandler)),
21 ];
22
23 let markdown = convert_html_to_markdown(html.as_bytes(), &mut handlers)?;
24 Ok(clean_markdown_tables(&markdown))
25}
26
27/// Clean up markdown table formatting and ensure tables have separator rows.
28fn clean_markdown_tables(markdown: &str) -> String {
29 let lines: Vec<&str> = markdown.lines().collect();
30 let mut result: Vec<String> = Vec::new();
31 let mut in_table = false;
32 let mut has_separator = false;
33
34 for (i, line) in lines.iter().enumerate() {
35 let trimmed = line.trim();
36
37 if trimmed.starts_with('|') {
38 let normalized = normalize_table_row(trimmed);
39
40 if !in_table {
41 // Starting a new table
42 in_table = true;
43 has_separator = false;
44 }
45
46 // Check if this line is a separator row
47 if trimmed.contains("---") {
48 has_separator = true;
49 }
50
51 result.push(normalized.clone());
52
53 // If this is the first row and no separator exists yet,
54 // check if next row is a table row (not separator) and add one
55 if !has_separator {
56 let next_is_table_row = i + 1 < lines.len()
57 && lines[i + 1].trim().starts_with('|')
58 && !lines[i + 1].contains("---");
59
60 if next_is_table_row {
61 // Insert separator after first row
62 let col_count = normalized.matches('|').count().saturating_sub(1);
63 if col_count > 0 {
64 let separator = (0..col_count)
65 .map(|_| "---")
66 .collect::<Vec<_>>()
67 .join(" | ");
68 result.push(format!("| {} |", separator));
69 has_separator = true;
70 }
71 }
72 }
73 } else {
74 // Not a table row
75 if !trimmed.is_empty() {
76 result.push(trimmed.to_string());
77 }
78 in_table = false;
79 has_separator = false;
80 }
81 }
82
83 result.join("\n")
84}
85
/// Re-space a single table row.
///
/// The row is split on `|`, each piece is trimmed, and the pieces are glued back
/// together with ` | `. Surrounding whitespace is then stripped, so a row whose
/// first piece is empty starts with `|` again. An empty interior cell ends up as
/// two spaces between its pipes.
fn normalize_table_row(row: &str) -> String {
    let mut rebuilt = String::with_capacity(row.len());
    for (index, cell) in row.split('|').enumerate() {
        if index > 0 {
            rebuilt.push_str(" | ");
        }
        rebuilt.push_str(cell.trim());
    }
    rebuilt.trim().to_string()
}
92
#[cfg(test)]
mod tests {
    use super::*;

    /// Convert `html` to Markdown, panicking if the conversion fails.
    fn render(html: &str) -> String {
        html_to_markdown(html).unwrap()
    }

    /// A basic table yields pipe-delimited rows and a dashed separator.
    #[test]
    fn test_html_table_to_markdown() {
        let html = r#"<table>
 <thead><tr><th>A</th><th>B</th></tr></thead>
 <tbody><tr><td>1</td><td>x</td></tr></tbody>
 </table>"#;

        let md = render(html);
        assert!(md.contains("|"));
        assert!(md.contains("---"));
    }

    /// Headings are rendered with the `#` prefix.
    #[test]
    fn test_html_with_headings() {
        let md = render("<h1>Title</h1><p>Content</p>");
        assert!(md.contains("# Title"));
    }

    /// A pandas-style DataFrame table: every line containing a pipe must begin with one.
    #[test]
    fn test_pandas_dataframe_html() {
        let html = r#"<table border="1" class="dataframe">
 <thead><tr><th></th><th>A</th><th>B</th></tr></thead>
 <tbody>
 <tr><th>0</th><td>1</td><td>x</td></tr>
 <tr><th>1</th><td>2</td><td>y</td></tr>
 </tbody>
 </table>"#;

        let md = render(html);
        assert!(md.contains("|"));
        for line in md.lines().filter(|line| line.contains("|")) {
            assert!(
                line.starts_with("|"),
                "Table line should start with |: {:?}",
                line
            );
        }
    }

    /// Header, separator and body rows all come out with single-space padding.
    #[test]
    fn test_table_format_normalized() {
        let html = r#"<table>
 <thead>
 <tr><th>Name</th><th>Age</th></tr>
 </thead>
 <tbody>
 <tr><td>Alice</td><td>25</td></tr>
 </tbody>
</table>"#;

        let md = render(html);

        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 25 |"));
    }

    /// `<style>` content is removed while the table that follows it survives.
    #[test]
    fn test_style_tags_are_filtered() {
        let html = r#"<style>
 .dataframe { border: 1px solid; }
 </style>
 <table>
 <thead><tr><th>A</th></tr></thead>
 <tbody><tr><td>1</td></tr></tbody>
 </table>"#;

        let md = render(html);

        // Nothing from the stylesheet may leak into the Markdown.
        assert!(!md.contains("dataframe"));
        assert!(!md.contains("border"));
        // The table itself is still rendered.
        assert!(md.contains("| A |"));
    }

    /// A table lacking `<thead>` still receives a separator after its first row.
    #[test]
    fn test_table_without_thead() {
        let html = r#"<table>
 <tr><th>Feature</th><th>Supported</th></tr>
 <tr><td>Tables</td><td>✓</td></tr>
 <tr><td>Lists</td><td>✓</td></tr>
 </table>"#;

        let md = render(html);

        // The separator is expected right after the first row.
        assert!(
            md.contains("| --- | --- |"),
            "Missing separator row: {}",
            md
        );
        assert!(md.contains("| Feature | Supported |"));
        assert!(md.contains("| Tables | ✓ |"));
    }
}