Detailed changes
@@ -5,7 +5,7 @@ use anyhow::{anyhow, bail, Context, Result};
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
use futures::AsyncReadExt;
use gpui::{AppContext, Task, WeakView};
-use html_to_markdown::convert_html_to_markdown;
+use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag};
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
use ui::{prelude::*, ButtonLike, ElevationIndex};
@@ -37,7 +37,21 @@ impl FetchSlashCommand {
);
}
- convert_html_to_markdown(&body[..])
+ let mut handlers: Vec<Box<dyn HandleTag>> = vec![
+ Box::new(markdown::ParagraphHandler),
+ Box::new(markdown::HeadingHandler),
+ Box::new(markdown::ListHandler),
+ Box::new(markdown::TableHandler::new()),
+ Box::new(markdown::StyledTextHandler),
+ Box::new(markdown::CodeHandler),
+ ];
+ if url.contains("wikipedia.org") {
+ use html_to_markdown::structure::wikipedia;
+
+ handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
+ }
+
+ convert_html_to_markdown(&body[..], handlers)
}
}
@@ -1,11 +1,9 @@
//! Provides conversion from rustdoc's HTML output to Markdown.
-#![deny(missing_docs)]
-
mod html_element;
-mod markdown;
+pub mod markdown;
mod markdown_writer;
-mod structure;
+pub mod structure;
use std::io::Read;
@@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom;
use crate::markdown::{
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
};
-use crate::markdown_writer::{HandleTag, MarkdownWriter};
+use crate::markdown_writer::MarkdownWriter;
+
+pub use crate::markdown_writer::HandleTag;
/// Converts the provided HTML to Markdown.
-pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
+pub fn convert_html_to_markdown(
+ html: impl Read,
+ handlers: Vec<Box<dyn HandleTag>>,
+) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
- let handlers: Vec<Box<dyn HandleTag>> = vec![
- Box::new(ParagraphHandler),
- Box::new(HeadingHandler),
- Box::new(ListHandler),
- Box::new(TableHandler::new()),
- Box::new(StyledTextHandler),
- Box::new(structure::rustdoc::RustdocChromeRemover),
- Box::new(structure::rustdoc::RustdocHeadingHandler),
- Box::new(structure::rustdoc::RustdocCodeHandler),
- Box::new(structure::rustdoc::RustdocItemHandler),
- ];
-
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document, handlers)
@@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
/// Converts the provided rustdoc HTML to Markdown.
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
- let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
-
- let handlers: Vec<Box<dyn HandleTag>> = vec![
- Box::new(ParagraphHandler),
- Box::new(HeadingHandler),
- Box::new(ListHandler),
- Box::new(TableHandler::new()),
- Box::new(StyledTextHandler),
- Box::new(structure::rustdoc::RustdocChromeRemover),
- Box::new(structure::rustdoc::RustdocHeadingHandler),
- Box::new(structure::rustdoc::RustdocCodeHandler),
- Box::new(structure::rustdoc::RustdocItemHandler),
- ];
-
- let markdown_writer = MarkdownWriter::new();
- let markdown = markdown_writer
- .run(&dom.document, handlers)
- .context("failed to convert rustdoc HTML to Markdown")?;
-
- Ok(markdown)
+ convert_html_to_markdown(
+ html,
+ vec![
+ Box::new(ParagraphHandler),
+ Box::new(HeadingHandler),
+ Box::new(ListHandler),
+ Box::new(TableHandler::new()),
+ Box::new(StyledTextHandler),
+ Box::new(structure::rustdoc::RustdocChromeRemover),
+ Box::new(structure::rustdoc::RustdocHeadingHandler),
+ Box::new(structure::rustdoc::RustdocCodeHandler),
+ Box::new(structure::rustdoc::RustdocItemHandler),
+ ],
+ )
}
fn parse_html(mut html: impl Read) -> Result<RcDom> {
@@ -1,5 +1,5 @@
use crate::html_element::HtmlElement;
-use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
+use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
pub struct ParagraphHandler;
@@ -214,3 +214,53 @@ impl HandleTag for StyledTextHandler {
}
}
}
+
+/// Tag handler that renders `<pre>` and `<code>` elements as Markdown code.
+pub struct CodeHandler;
+
+impl HandleTag for CodeHandler {
+    /// Claims only the `pre` and `code` tags.
+    fn should_handle(&self, tag: &str) -> bool {
+        matches!(tag, "pre" | "code")
+    }
+
+    /// Writes the opening Markdown for the element.
+    ///
+    /// A `pre` element opens a fenced block (preceded by a blank line). A `code`
+    /// element gets an opening backtick unless `writer.is_inside("pre")` reports
+    /// true. Always returns `StartTagOutcome::Continue`.
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let name = tag.tag.as_str();
+        if name == "pre" {
+            writer.push_str("\n\n```\n");
+        } else if name == "code" && !writer.is_inside("pre") {
+            writer.push_str("`");
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    /// Writes the closing Markdown for the element, mirroring `handle_tag_start`:
+    /// a closing fence for `pre`, or a closing backtick for a `code` element when
+    /// `writer.is_inside("pre")` reports false.
+    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
+        let name = tag.tag.as_str();
+        if name == "pre" {
+            writer.push_str("\n```\n");
+        } else if name == "code" && !writer.is_inside("pre") {
+            writer.push_str("`");
+        }
+    }
+
+    /// Pushes `text` to the writer unchanged and reports `HandlerOutcome::Handled`
+    /// when `writer.is_inside("pre")` is true; otherwise reports
+    /// `HandlerOutcome::NoOp` without writing anything.
+    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
+        if !writer.is_inside("pre") {
+            return HandlerOutcome::NoOp;
+        }
+
+        // `text` is already a `&str`, so it is handed over without re-borrowing.
+        writer.push_str(text);
+        HandlerOutcome::Handled
+    }
+}
@@ -1 +1,2 @@
pub mod rustdoc;
+pub mod wikipedia;
@@ -0,0 +1,80 @@
+use crate::html_element::HtmlElement;
+use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
+use crate::HandleTag;
+
+/// Tag handler that skips selected Wikipedia page elements, chosen by tag name,
+/// `id` attribute, or class.
+pub struct WikipediaChromeRemover;
+
+impl HandleTag for WikipediaChromeRemover {
+    /// Accepts every tag; the actual filtering is done in `handle_tag_start`.
+    fn should_handle(&self, _tag: &str) -> bool {
+        true
+    }
+
+    /// Returns `StartTagOutcome::Skip` for:
+    /// - any `head`, `script`, `style`, or `nav` element;
+    /// - a `sup` element for which `has_class("reference")` is true;
+    /// - a `div`, `span`, or `a` element whose `id` is `p-lang-btn` or `p-search`,
+    ///   or which has the `mw-editsection` or `mw-jump-link` class.
+    ///
+    /// Every other element yields `StartTagOutcome::Continue`.
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        let should_skip = match tag.tag.as_str() {
+            "head" | "script" | "style" | "nav" => true,
+            "sup" => tag.has_class("reference"),
+            "div" | "span" | "a" => {
+                // The `id` check comes first, as before; classes are only
+                // consulted when the `id` does not match.
+                matches!(
+                    tag.attr("id").as_deref(),
+                    Some("p-lang-btn" | "p-search")
+                ) || tag.has_any_classes(&["mw-editsection", "mw-jump-link"])
+            }
+            _ => false,
+        };
+
+        if should_skip {
+            StartTagOutcome::Skip
+        } else {
+            StartTagOutcome::Continue
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+ use indoc::indoc;
+ use pretty_assertions::assert_eq;
+
+ use crate::{convert_html_to_markdown, markdown};
+
+ use super::*;
+
+ fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
+ vec![
+ Box::new(markdown::ParagraphHandler),
+ Box::new(markdown::HeadingHandler),
+ Box::new(markdown::ListHandler),
+ Box::new(markdown::StyledTextHandler),
+ Box::new(WikipediaChromeRemover),
+ ]
+ }
+
+ #[test]
+ fn test_citation_references_get_removed() {
+ let html = indoc! {r##"