From 19404e43df74d1d48d71b91429a3d6fb9982d103 Mon Sep 17 00:00:00 2001 From: Kyle Kelley Date: Thu, 19 Feb 2026 23:55:58 -0800 Subject: [PATCH] repl: Support HTML outputs through `html_to_markdown` (#49646) Closes #15555 Adds a super basic render of html output from jupyter kernels. image Obviously not as full featured as #48157 Release Notes: - Added basic handling of HTML in REPL outputs --- Cargo.lock | 1 + crates/repl/Cargo.toml | 1 + crates/repl/src/outputs.rs | 27 ++++- crates/repl/src/outputs/html.rs | 198 ++++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+), 5 deletions(-) create mode 100644 crates/repl/src/outputs/html.rs diff --git a/Cargo.lock b/Cargo.lock index 4eb85fb4f76269c36087da3677c511d98ddb8407..854584b6bb8367fa3c6d820e43fb9b5bfb05bc13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13911,6 +13911,7 @@ dependencies = [ "file_icons", "futures 0.3.31", "gpui", + "html_to_markdown", "http_client", "image", "indoc", diff --git a/crates/repl/Cargo.toml b/crates/repl/Cargo.toml index 1987ae1a4cb0c3ee963df9983665cd087d7e47b0..f68fbd820c5701e5ce362002a9a1d14541d47e42 100644 --- a/crates/repl/Cargo.toml +++ b/crates/repl/Cargo.toml @@ -30,6 +30,7 @@ feature_flags.workspace = true file_icons.workspace = true futures.workspace = true gpui.workspace = true +html_to_markdown.workspace = true http_client.workspace = true image.workspace = true jupyter-websocket-client.workspace = true diff --git a/crates/repl/src/outputs.rs b/crates/repl/src/outputs.rs index 0fdc2798822504c34737978996fc2a18cccb0e39..8be8c57cceee84435a6d99ba5c611d24c563bec3 100644 --- a/crates/repl/src/outputs.rs +++ b/crates/repl/src/outputs.rs @@ -52,6 +52,8 @@ use table::TableView; mod json; use json::JsonView; +mod html; + pub mod plain; use plain::TerminalOutput; @@ -65,7 +67,8 @@ use settings::Settings; /// When deciding what to render from a collection of mediatypes, we need to rank them in order of importance fn rank_mime_type(mimetype: &MimeType) -> usize { match mimetype { 
- MimeType::DataTable(_) => 6, + MimeType::DataTable(_) => 7, + MimeType::Html(_) => 6, MimeType::Json(_) => 5, MimeType::Png(_) => 4, MimeType::Jpeg(_) => 3, @@ -419,6 +422,19 @@ impl Output { content: cx.new(|cx| TableView::new(data, window, cx)), display_id, }, + Some(MimeType::Html(html_content)) => match html::html_to_markdown(html_content) { + Ok(markdown_text) => { + let content = cx.new(|cx| MarkdownView::from(markdown_text, cx)); + Output::Markdown { + content, + display_id, + } + } + Err(_) => Output::Plain { + content: cx.new(|cx| TerminalOutput::from(html_content, window, cx)), + display_id, + }, + }, // Any other media types are not supported _ => Output::Message("Unsupported media type".to_string()), } @@ -836,20 +852,23 @@ mod tests { #[test] fn test_rank_mime_type_ordering() { let data_table = MimeType::DataTable(Box::default()); + let html = MimeType::Html(String::new()); let json = MimeType::Json(serde_json::json!({})); let png = MimeType::Png(String::new()); let jpeg = MimeType::Jpeg(String::new()); let markdown = MimeType::Markdown(String::new()); let plain = MimeType::Plain(String::new()); - assert_eq!(rank_mime_type(&data_table), 6); + assert_eq!(rank_mime_type(&data_table), 7); + assert_eq!(rank_mime_type(&html), 6); assert_eq!(rank_mime_type(&json), 5); assert_eq!(rank_mime_type(&png), 4); assert_eq!(rank_mime_type(&jpeg), 3); assert_eq!(rank_mime_type(&markdown), 2); assert_eq!(rank_mime_type(&plain), 1); - assert!(rank_mime_type(&data_table) > rank_mime_type(&json)); + assert!(rank_mime_type(&data_table) > rank_mime_type(&html)); + assert!(rank_mime_type(&html) > rank_mime_type(&json)); assert!(rank_mime_type(&json) > rank_mime_type(&png)); assert!(rank_mime_type(&png) > rank_mime_type(&jpeg)); assert!(rank_mime_type(&jpeg) > rank_mime_type(&markdown)); @@ -858,11 +877,9 @@ mod tests { #[test] fn test_rank_mime_type_unsupported_returns_zero() { - let html = MimeType::Html(String::new()); let svg = MimeType::Svg(String::new()); let latex = 
MimeType::Latex(String::new()); - assert_eq!(rank_mime_type(&html), 0); assert_eq!(rank_mime_type(&svg), 0); assert_eq!(rank_mime_type(&latex), 0); } diff --git a/crates/repl/src/outputs/html.rs b/crates/repl/src/outputs/html.rs new file mode 100644 index 0000000000000000000000000000000000000000..c784920c0810b1f5cf30d0da9551443734d9e2a0 --- /dev/null +++ b/crates/repl/src/outputs/html.rs @@ -0,0 +1,198 @@ +use anyhow::Result; +use html_to_markdown::markdown::{ + CodeHandler, HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler, + WebpageChromeRemover, +}; +use html_to_markdown::{TagHandler, convert_html_to_markdown}; +use std::cell::RefCell; +use std::rc::Rc; + +/// Convert HTML to Markdown for rendering in the REPL. +pub fn html_to_markdown(html: &str) -> Result<String> { + let mut handlers: Vec<TagHandler> = vec![ + // WebpageChromeRemover must come first to skip style, script, head, nav tags + Rc::new(RefCell::new(WebpageChromeRemover)), + Rc::new(RefCell::new(ParagraphHandler)), + Rc::new(RefCell::new(HeadingHandler)), + Rc::new(RefCell::new(ListHandler)), + Rc::new(RefCell::new(TableHandler::new())), + Rc::new(RefCell::new(StyledTextHandler)), + Rc::new(RefCell::new(CodeHandler)), + ]; + + let markdown = convert_html_to_markdown(html.as_bytes(), &mut handlers)?; + Ok(clean_markdown_tables(&markdown)) +} + +/// Clean up markdown table formatting and ensure tables have separator rows. 
+fn clean_markdown_tables(markdown: &str) -> String { + let lines: Vec<&str> = markdown.lines().collect(); + let mut result: Vec<String> = Vec::new(); + let mut in_table = false; + let mut has_separator = false; + + for (i, line) in lines.iter().enumerate() { + let trimmed = line.trim(); + + if trimmed.starts_with('|') { + let normalized = normalize_table_row(trimmed); + + if !in_table { + // Starting a new table + in_table = true; + has_separator = false; + } + + // Check if this line is a separator row + if trimmed.contains("---") { + has_separator = true; + } + + result.push(normalized.clone()); + + // If this is the first row and no separator exists yet, + // check if next row is a table row (not separator) and add one + if !has_separator { + let next_is_table_row = i + 1 < lines.len() + && lines[i + 1].trim().starts_with('|') + && !lines[i + 1].contains("---"); + + if next_is_table_row { + // Insert separator after first row + let col_count = normalized.matches('|').count().saturating_sub(1); + if col_count > 0 { + let separator = (0..col_count) + .map(|_| "---") + .collect::<Vec<_>>() + .join(" | "); + result.push(format!("| {} |", separator)); + has_separator = true; + } + } + } + } else { + // Not a table row + if !trimmed.is_empty() { + result.push(trimmed.to_string()); + } + in_table = false; + has_separator = false; + } + } + + result.join("\n") +} + +/// Normalize a table row by trimming cells and ensuring consistent spacing. +fn normalize_table_row(row: &str) -> String { + let parts: Vec<&str> = row.split('|').collect(); + let normalized: Vec<String> = parts.iter().map(|cell| cell.trim().to_string()).collect(); + normalized.join(" | ").trim().to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_html_table_to_markdown() { + let html = r#" + + +
AB
1x
"#; + + let md = html_to_markdown(html).unwrap(); + assert!(md.contains("|")); + assert!(md.contains("---")); + } + + #[test] + fn test_html_with_headings() { + let html = "

Title

Content

"; + let md = html_to_markdown(html).unwrap(); + assert!(md.contains("# Title")); + } + + #[test] + fn test_pandas_dataframe_html() { + let html = r#" + + + + + +
AB
01x
12y
"#; + + let md = html_to_markdown(html).unwrap(); + assert!(md.contains("|")); + // Verify table rows are properly formatted (start with |) + for line in md.lines() { + if line.contains("|") { + assert!( + line.starts_with("|"), + "Table line should start with |: {:?}", + line + ); + } + } + } + + #[test] + fn test_table_format_normalized() { + let html = r#" + + + + + + +
NameAge
Alice25
"#; + + let md = html_to_markdown(html).unwrap(); + + // Should have clean table format + assert!(md.contains("| Name | Age |")); + assert!(md.contains("| --- | --- |")); + assert!(md.contains("| Alice | 25 |")); + } + + #[test] + fn test_style_tags_are_filtered() { + let html = r#" + + + +
A
1
"#; + + let md = html_to_markdown(html).unwrap(); + + // Style content should not appear in output + assert!(!md.contains("dataframe")); + assert!(!md.contains("border")); + // Table should still be present + assert!(md.contains("| A |")); + } + + #[test] + fn test_table_without_thead() { + // Tables without <thead> should still get a separator row + let html = r#" + + + +
FeatureSupported
Tables
Lists
"#; + + let md = html_to_markdown(html).unwrap(); + + // Should have separator row inserted after first row + assert!( + md.contains("| --- | --- |"), + "Missing separator row: {}", + md + ); + assert!(md.contains("| Feature | Supported |")); + assert!(md.contains("| Tables | ✓ |")); + } +}