repl: Support HTML outputs through `html_to_markdown` (#49646)

Kyle Kelley created

Closes #15555

Adds a very basic renderer for HTML output from Jupyter kernels.

<img width="1061" height="1207" alt="image"
src="https://github.com/user-attachments/assets/1bfb8c71-0e38-4bff-9f0c-bec12721232a"
/>

This is not as full-featured as #48157.

Release Notes:

- Added basic handling of HTML in REPL outputs

Change summary

Cargo.lock                      |   1 
crates/repl/Cargo.toml          |   1 
crates/repl/src/outputs.rs      |  27 +++
crates/repl/src/outputs/html.rs | 198 +++++++++++++++++++++++++++++++++++
4 files changed, 222 insertions(+), 5 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -13911,6 +13911,7 @@ dependencies = [
  "file_icons",
  "futures 0.3.31",
  "gpui",
+ "html_to_markdown",
  "http_client",
  "image",
  "indoc",

crates/repl/Cargo.toml 🔗

@@ -30,6 +30,7 @@ feature_flags.workspace = true
 file_icons.workspace = true
 futures.workspace = true
 gpui.workspace = true
+html_to_markdown.workspace = true
 http_client.workspace = true
 image.workspace = true
 jupyter-websocket-client.workspace = true

crates/repl/src/outputs.rs 🔗

@@ -52,6 +52,8 @@ use table::TableView;
 mod json;
 use json::JsonView;
 
+mod html;
+
 pub mod plain;
 use plain::TerminalOutput;
 
@@ -65,7 +67,8 @@ use settings::Settings;
 /// When deciding what to render from a collection of mediatypes, we need to rank them in order of importance
 fn rank_mime_type(mimetype: &MimeType) -> usize {
     match mimetype {
-        MimeType::DataTable(_) => 6,
+        MimeType::DataTable(_) => 7,
+        MimeType::Html(_) => 6,
         MimeType::Json(_) => 5,
         MimeType::Png(_) => 4,
         MimeType::Jpeg(_) => 3,
@@ -419,6 +422,19 @@ impl Output {
                 content: cx.new(|cx| TableView::new(data, window, cx)),
                 display_id,
             },
+            Some(MimeType::Html(html_content)) => match html::html_to_markdown(html_content) {
+                Ok(markdown_text) => {
+                    let content = cx.new(|cx| MarkdownView::from(markdown_text, cx));
+                    Output::Markdown {
+                        content,
+                        display_id,
+                    }
+                }
+                Err(_) => Output::Plain {
+                    content: cx.new(|cx| TerminalOutput::from(html_content, window, cx)),
+                    display_id,
+                },
+            },
             // Any other media types are not supported
             _ => Output::Message("Unsupported media type".to_string()),
         }
@@ -836,20 +852,23 @@ mod tests {
     #[test]
     fn test_rank_mime_type_ordering() {
         let data_table = MimeType::DataTable(Box::default());
+        let html = MimeType::Html(String::new());
         let json = MimeType::Json(serde_json::json!({}));
         let png = MimeType::Png(String::new());
         let jpeg = MimeType::Jpeg(String::new());
         let markdown = MimeType::Markdown(String::new());
         let plain = MimeType::Plain(String::new());
 
-        assert_eq!(rank_mime_type(&data_table), 6);
+        assert_eq!(rank_mime_type(&data_table), 7);
+        assert_eq!(rank_mime_type(&html), 6);
         assert_eq!(rank_mime_type(&json), 5);
         assert_eq!(rank_mime_type(&png), 4);
         assert_eq!(rank_mime_type(&jpeg), 3);
         assert_eq!(rank_mime_type(&markdown), 2);
         assert_eq!(rank_mime_type(&plain), 1);
 
-        assert!(rank_mime_type(&data_table) > rank_mime_type(&json));
+        assert!(rank_mime_type(&data_table) > rank_mime_type(&html));
+        assert!(rank_mime_type(&html) > rank_mime_type(&json));
         assert!(rank_mime_type(&json) > rank_mime_type(&png));
         assert!(rank_mime_type(&png) > rank_mime_type(&jpeg));
         assert!(rank_mime_type(&jpeg) > rank_mime_type(&markdown));
@@ -858,11 +877,9 @@ mod tests {
 
     #[test]
     fn test_rank_mime_type_unsupported_returns_zero() {
-        let html = MimeType::Html(String::new());
         let svg = MimeType::Svg(String::new());
         let latex = MimeType::Latex(String::new());
 
-        assert_eq!(rank_mime_type(&html), 0);
         assert_eq!(rank_mime_type(&svg), 0);
         assert_eq!(rank_mime_type(&latex), 0);
     }

crates/repl/src/outputs/html.rs 🔗

@@ -0,0 +1,198 @@
+use anyhow::Result;
+use html_to_markdown::markdown::{
+    CodeHandler, HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
+    WebpageChromeRemover,
+};
+use html_to_markdown::{TagHandler, convert_html_to_markdown};
+use std::cell::RefCell;
+use std::rc::Rc;
+
+/// Convert HTML to Markdown for rendering in the REPL.
+pub fn html_to_markdown(html: &str) -> Result<String> {
+    let mut handlers: Vec<TagHandler> = vec![
+        // WebpageChromeRemover must come first to skip style, script, head, nav tags
+        Rc::new(RefCell::new(WebpageChromeRemover)),
+        Rc::new(RefCell::new(ParagraphHandler)),
+        Rc::new(RefCell::new(HeadingHandler)),
+        Rc::new(RefCell::new(ListHandler)),
+        Rc::new(RefCell::new(TableHandler::new())),
+        Rc::new(RefCell::new(StyledTextHandler)),
+        Rc::new(RefCell::new(CodeHandler)),
+    ];
+
+    let markdown = convert_html_to_markdown(html.as_bytes(), &mut handlers)?;
+    Ok(clean_markdown_tables(&markdown))
+}
+
+/// Clean up markdown table formatting and ensure tables have separator rows.
+fn clean_markdown_tables(markdown: &str) -> String {
+    let lines: Vec<&str> = markdown.lines().collect();
+    let mut result: Vec<String> = Vec::new();
+    let mut in_table = false;
+    let mut has_separator = false;
+
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.trim();
+
+        if trimmed.starts_with('|') {
+            let normalized = normalize_table_row(trimmed);
+
+            if !in_table {
+                // Starting a new table
+                in_table = true;
+                has_separator = false;
+            }
+
+            // Check if this line is a separator row
+            if trimmed.contains("---") {
+                has_separator = true;
+            }
+
+            result.push(normalized.clone());
+
+            // If this is the first row and no separator exists yet,
+            // check if next row is a table row (not separator) and add one
+            if !has_separator {
+                let next_is_table_row = i + 1 < lines.len()
+                    && lines[i + 1].trim().starts_with('|')
+                    && !lines[i + 1].contains("---");
+
+                if next_is_table_row {
+                    // Insert separator after first row
+                    let col_count = normalized.matches('|').count().saturating_sub(1);
+                    if col_count > 0 {
+                        let separator = (0..col_count)
+                            .map(|_| "---")
+                            .collect::<Vec<_>>()
+                            .join(" | ");
+                        result.push(format!("| {} |", separator));
+                        has_separator = true;
+                    }
+                }
+            }
+        } else {
+            // Not a table row
+            if !trimmed.is_empty() {
+                result.push(trimmed.to_string());
+            }
+            in_table = false;
+            has_separator = false;
+        }
+    }
+
+    result.join("\n")
+}
+
+/// Normalize a table row by trimming cells and ensuring consistent spacing.
+fn normalize_table_row(row: &str) -> String {
+    let parts: Vec<&str> = row.split('|').collect();
+    let normalized: Vec<String> = parts.iter().map(|cell| cell.trim().to_string()).collect();
+    normalized.join(" | ").trim().to_string()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_html_table_to_markdown() {
+        let html = r#"<table>
+            <thead><tr><th>A</th><th>B</th></tr></thead>
+            <tbody><tr><td>1</td><td>x</td></tr></tbody>
+        </table>"#;
+
+        let md = html_to_markdown(html).unwrap();
+        assert!(md.contains("|"));
+        assert!(md.contains("---"));
+    }
+
+    #[test]
+    fn test_html_with_headings() {
+        let html = "<h1>Title</h1><p>Content</p>";
+        let md = html_to_markdown(html).unwrap();
+        assert!(md.contains("# Title"));
+    }
+
+    #[test]
+    fn test_pandas_dataframe_html() {
+        let html = r#"<table border="1" class="dataframe">
+            <thead><tr><th></th><th>A</th><th>B</th></tr></thead>
+            <tbody>
+                <tr><th>0</th><td>1</td><td>x</td></tr>
+                <tr><th>1</th><td>2</td><td>y</td></tr>
+            </tbody>
+        </table>"#;
+
+        let md = html_to_markdown(html).unwrap();
+        assert!(md.contains("|"));
+        // Verify table rows are properly formatted (start with |)
+        for line in md.lines() {
+            if line.contains("|") {
+                assert!(
+                    line.starts_with("|"),
+                    "Table line should start with |: {:?}",
+                    line
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_table_format_normalized() {
+        let html = r#"<table>
+  <thead>
+    <tr><th>Name</th><th>Age</th></tr>
+  </thead>
+  <tbody>
+    <tr><td>Alice</td><td>25</td></tr>
+  </tbody>
+</table>"#;
+
+        let md = html_to_markdown(html).unwrap();
+
+        // Should have clean table format
+        assert!(md.contains("| Name | Age |"));
+        assert!(md.contains("| --- | --- |"));
+        assert!(md.contains("| Alice | 25 |"));
+    }
+
+    #[test]
+    fn test_style_tags_are_filtered() {
+        let html = r#"<style>
+            .dataframe { border: 1px solid; }
+        </style>
+        <table>
+            <thead><tr><th>A</th></tr></thead>
+            <tbody><tr><td>1</td></tr></tbody>
+        </table>"#;
+
+        let md = html_to_markdown(html).unwrap();
+
+        // Style content should not appear in output
+        assert!(!md.contains("dataframe"));
+        assert!(!md.contains("border"));
+        // Table should still be present
+        assert!(md.contains("| A |"));
+    }
+
+    #[test]
+    fn test_table_without_thead() {
+        // Tables without <thead> should still get a separator row
+        let html = r#"<table>
+            <tr><th>Feature</th><th>Supported</th></tr>
+            <tr><td>Tables</td><td>✓</td></tr>
+            <tr><td>Lists</td><td>✓</td></tr>
+        </table>"#;
+
+        let md = html_to_markdown(html).unwrap();
+
+        // Should have separator row inserted after first row
+        assert!(
+            md.contains("| --- | --- |"),
+            "Missing separator row: {}",
+            md
+        );
+        assert!(md.contains("| Feature | Supported |"));
+        assert!(md.contains("| Tables | ✓ |"));
+    }
+}