Cargo.lock 🔗
@@ -715,6 +715,8 @@ dependencies = [
"feature_flags",
"futures 0.3.31",
"gpui",
+ "html_to_markdown",
+ "http_client",
"itertools 0.14.0",
"language",
"language_model",
Marshall Bowers created
This PR adds a new `fetch` tool to the set of tools the Assistant has
available.
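This tool accepts a URL, fetches its content, and returns it as text for the model: HTML pages are converted to Markdown, plain text is returned as-is, and JSON is pretty-printed inside a fenced `json` code block.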
<img width="1394" alt="Screenshot 2025-03-18 at 11 52 21 AM"
src="https://github.com/user-attachments/assets/e5bcde14-a0dd-4835-9d42-8f45def68f4d"
/>
<img width="1394" alt="Screenshot 2025-03-18 at 11 52 37 AM"
src="https://github.com/user-attachments/assets/3bcce4f5-f61b-40d7-8b30-2c673ce3c06a"
/>
Release Notes:
- N/A
Cargo.lock | 2
crates/assistant_eval/src/headless_assistant.rs | 2
crates/assistant_tools/Cargo.toml | 8
crates/assistant_tools/src/assistant_tools.rs | 8
crates/assistant_tools/src/fetch_tool.rs | 153 ++++++++++++++
crates/assistant_tools/src/fetch_tool/description.md | 1
crates/zed/src/main.rs | 2
7 files changed, 170 insertions(+), 6 deletions(-)
@@ -715,6 +715,8 @@ dependencies = [
"feature_flags",
"futures 0.3.31",
"gpui",
+ "html_to_markdown",
+ "http_client",
"itertools 0.14.0",
"language",
"language_model",
@@ -163,7 +163,7 @@ pub fn init(cx: &mut App) -> Arc<HeadlessAppState> {
language::init(cx);
language_model::init(client.clone(), cx);
language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
- assistant_tools::init(cx);
+ assistant_tools::init(client.http_client().clone(), cx);
context_server::init(cx);
let stdout_is_a_pty = false;
let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
@@ -18,8 +18,10 @@ chrono.workspace = true
collections.workspace = true
feature_flags.workspace = true
futures.workspace = true
-itertools.workspace = true
gpui.workspace = true
+html_to_markdown.workspace = true
+http_client.workspace = true
+itertools.workspace = true
language.workspace = true
language_model.workspace = true
project.workspace = true
@@ -27,17 +29,17 @@ release_channel.workspace = true
schemars.workspace = true
serde.workspace = true
serde_json.workspace = true
+settings.workspace = true
theme.workspace = true
ui.workspace = true
util.workspace = true
workspace.workspace = true
worktree.workspace = true
-settings.workspace = true
[dev-dependencies]
-rand.workspace = true
collections = { workspace = true, features = ["test-support"] }
gpui = { workspace = true, features = ["test-support"] }
language = { workspace = true, features = ["test-support"] }
project = { workspace = true, features = ["test-support"] }
+rand.workspace = true
workspace = { workspace = true, features = ["test-support"] }
@@ -2,6 +2,7 @@ mod bash_tool;
mod delete_path_tool;
mod diagnostics_tool;
mod edit_files_tool;
+mod fetch_tool;
mod list_directory_tool;
mod now_tool;
mod path_search_tool;
@@ -9,13 +10,17 @@ mod read_file_tool;
mod regex_search_tool;
mod thinking_tool;
+use std::sync::Arc;
+
use assistant_tool::ToolRegistry;
use gpui::App;
+use http_client::HttpClientWithUrl;
use crate::bash_tool::BashTool;
use crate::delete_path_tool::DeletePathTool;
use crate::diagnostics_tool::DiagnosticsTool;
use crate::edit_files_tool::EditFilesTool;
+use crate::fetch_tool::FetchTool;
use crate::list_directory_tool::ListDirectoryTool;
use crate::now_tool::NowTool;
use crate::path_search_tool::PathSearchTool;
@@ -23,7 +28,7 @@ use crate::read_file_tool::ReadFileTool;
use crate::regex_search_tool::RegexSearchTool;
use crate::thinking_tool::ThinkingTool;
-pub fn init(cx: &mut App) {
+pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
assistant_tool::init(cx);
crate::edit_files_tool::log::init(cx);
@@ -38,4 +43,5 @@ pub fn init(cx: &mut App) {
registry.register_tool(ReadFileTool);
registry.register_tool(RegexSearchTool);
registry.register_tool(ThinkingTool);
+ registry.register_tool(FetchTool::new(http_client));
}
@@ -0,0 +1,153 @@
+use std::cell::RefCell;
+use std::rc::Rc;
+use std::sync::Arc;
+
+use anyhow::{anyhow, bail, Context as _, Result};
+use assistant_tool::{ActionLog, Tool};
+use futures::AsyncReadExt as _;
+use gpui::{App, AppContext as _, Entity, Task};
+use html_to_markdown::{convert_html_to_markdown, markdown, TagHandler};
+use http_client::{AsyncBody, HttpClientWithUrl};
+use language_model::LanguageModelRequestMessage;
+use project::Project;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+/// The kinds of response body the fetch tool distinguishes when turning a
+/// response into text.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
+enum ContentType {
+ /// An HTML document; converted to Markdown before being returned.
+ Html,
+ /// Plain text; returned unchanged after UTF-8 validation.
+ Plaintext,
+ /// JSON; re-serialized pretty-printed inside a fenced `json` code block.
+ Json,
+}
+
+// Arguments accepted by the `fetch` tool, deserialized from the JSON value
+// passed to `run`.
+//
+// NOTE(review): these are deliberately plain `//` comments. This type derives
+// `JsonSchema`, and schemars copies `///` doc comments into the generated
+// schema, so adding or rewording doc comments here would change what
+// `input_schema` returns.
+#[derive(Debug, Serialize, Deserialize, JsonSchema)]
+pub struct FetchToolInput {
+ /// The URL to fetch.
+ url: String,
+}
+
+/// Assistant tool that fetches a URL and returns the response body as text.
+pub struct FetchTool {
+ /// HTTP client used to issue the request.
+ http_client: Arc<HttpClientWithUrl>,
+}
+
+impl FetchTool {
+    /// Creates a fetch tool that issues its requests through `http_client`.
+    pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
+        Self { http_client }
+    }
+
+    /// Classifies a raw `Content-Type` header value.
+    ///
+    /// Only the media type is considered: parameters such as `; charset=utf-8`
+    /// are ignored and the comparison is ASCII case-insensitive. Anything that
+    /// is not recognized as plain text or JSON is treated as HTML.
+    fn parse_content_type(header: &str) -> ContentType {
+        let media_type = header.split(';').next().unwrap_or(header).trim();
+        if media_type.eq_ignore_ascii_case("text/plain") {
+            ContentType::Plaintext
+        } else if media_type.eq_ignore_ascii_case("application/json") {
+            ContentType::Json
+        } else {
+            ContentType::Html
+        }
+    }
+
+    /// Fetches `url` and renders the response body as text.
+    ///
+    /// If `url` has no `http://` or `https://` scheme, `https://` is prepended.
+    /// HTML is converted to Markdown, plain text is returned unchanged, and
+    /// JSON is pretty-printed inside a fenced `json` code block.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the request fails, the body cannot be read, the
+    /// server answers with a 4xx or 5xx status, the `Content-Type` header is
+    /// missing or not valid text, or the body cannot be decoded for its
+    /// content type.
+    async fn build_message(http_client: Arc<HttpClientWithUrl>, url: &str) -> Result<String> {
+        let url = if url.starts_with("https://") || url.starts_with("http://") {
+            url.to_owned()
+        } else {
+            format!("https://{url}")
+        };
+
+        let mut response = http_client.get(&url, AsyncBody::default(), true).await?;
+
+        let mut body = Vec::new();
+        response
+            .body_mut()
+            .read_to_end(&mut body)
+            .await
+            .context("error reading response body")?;
+
+        // Reject server errors as well as client errors, so that an error page
+        // is never handed back as if it were the requested content.
+        let status = response.status();
+        if status.is_client_error() || status.is_server_error() {
+            let text = String::from_utf8_lossy(body.as_slice());
+            bail!("status error {}, response: {text:?}", status.as_u16());
+        }
+
+        let Some(content_type) = response.headers().get("content-type") else {
+            bail!("missing Content-Type header");
+        };
+        let content_type = content_type
+            .to_str()
+            .context("invalid Content-Type header")?;
+        let content_type = Self::parse_content_type(content_type);
+
+        match content_type {
+            ContentType::Html => {
+                let mut handlers: Vec<TagHandler> = vec![
+                    Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
+                    Rc::new(RefCell::new(markdown::ParagraphHandler)),
+                    Rc::new(RefCell::new(markdown::HeadingHandler)),
+                    Rc::new(RefCell::new(markdown::ListHandler)),
+                    Rc::new(RefCell::new(markdown::TableHandler::new())),
+                    Rc::new(RefCell::new(markdown::StyledTextHandler)),
+                ];
+                // Wikipedia URLs get Wikipedia-specific handlers in place of
+                // the generic code handler.
+                if url.contains("wikipedia.org") {
+                    use html_to_markdown::structure::wikipedia;
+
+                    handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaChromeRemover)));
+                    handlers.push(Rc::new(RefCell::new(wikipedia::WikipediaInfoboxHandler)));
+                    handlers.push(Rc::new(
+                        RefCell::new(wikipedia::WikipediaCodeHandler::new()),
+                    ));
+                } else {
+                    handlers.push(Rc::new(RefCell::new(markdown::CodeHandler)));
+                }
+
+                convert_html_to_markdown(&body[..], &mut handlers)
+            }
+            // Consume the buffer rather than validating and then copying it.
+            ContentType::Plaintext => Ok(String::from_utf8(body)?),
+            ContentType::Json => {
+                let json: serde_json::Value = serde_json::from_slice(&body)?;
+
+                Ok(format!(
+                    "```json\n{}\n```",
+                    serde_json::to_string_pretty(&json)?
+                ))
+            }
+        }
+    }
+}
+
+impl Tool for FetchTool {
+    /// Name of the tool: `fetch`.
+    fn name(&self) -> String {
+        String::from("fetch")
+    }
+
+    /// Tool description, embedded at compile time from
+    /// `fetch_tool/description.md`.
+    fn description(&self) -> String {
+        include_str!("./fetch_tool/description.md").to_owned()
+    }
+
+    /// JSON schema generated from [`FetchToolInput`].
+    fn input_schema(&self) -> serde_json::Value {
+        let root_schema = schemars::schema_for!(FetchToolInput);
+        serde_json::to_value(&root_schema).unwrap()
+    }
+
+    /// Parses `input` as a [`FetchToolInput`], fetches the URL on the
+    /// background executor, and resolves to the rendered text.
+    ///
+    /// The returned task fails if `input` does not deserialize, if the fetch
+    /// itself fails, or if the rendered text is empty after trimming.
+    fn run(
+        self: Arc<Self>,
+        input: serde_json::Value,
+        _messages: &[LanguageModelRequestMessage],
+        _project: Entity<Project>,
+        _action_log: Entity<ActionLog>,
+        cx: &mut App,
+    ) -> Task<Result<String>> {
+        let url = match serde_json::from_value::<FetchToolInput>(input) {
+            Ok(FetchToolInput { url }) => url,
+            Err(err) => return Task::ready(Err(anyhow!(err))),
+        };
+
+        let http_client = self.http_client.clone();
+        let fetch =
+            cx.background_spawn(async move { Self::build_message(http_client, &url).await });
+
+        cx.foreground_executor().spawn(async move {
+            let rendered = fetch.await?;
+            if rendered.trim().is_empty() {
+                return Err(anyhow!("no textual content found"));
+            }
+
+            Ok(rendered)
+        })
+    }
+}
@@ -0,0 +1 @@
+Fetches a URL and returns the content as Markdown.
@@ -472,7 +472,7 @@ fn main() {
prompt_builder.clone(),
cx,
);
- assistant_tools::init(cx);
+ assistant_tools::init(app_state.client.http_client(), cx);
repl::init(app_state.fs.clone(), cx);
extension_host::init(
extension_host_proxy,