From 06bdb28517d085d5c7635dacc15d4731a1b97d66 Mon Sep 17 00:00:00 2001
From: Agus Zubiaga
Date: Sat, 1 Nov 2025 16:35:04 -0300
Subject: [PATCH] zeta cli: Add convert-example command (#41608)

Adds a `convert-example` subcommand to the zeta cli that converts eval
examples from/to `json`, `toml`, and `md` formats.

Release Notes:

- N/A

---------

Co-authored-by: Max Brunsfeld
---
 Cargo.lock                                    |   2 +
 .../src/cloud_zeta2_prompt.rs                 |   2 +-
 crates/zeta2/src/related_excerpts.rs          |   2 +-
 crates/zeta_cli/Cargo.toml                    |   2 +
 crates/zeta_cli/src/example.rs                | 395 ++++++++++++++++++
 crates/zeta_cli/src/main.rs                   |  17 +
 6 files changed, 418 insertions(+), 2 deletions(-)
 create mode 100644 crates/zeta_cli/src/example.rs

diff --git a/Cargo.lock b/Cargo.lock
index ec55e4af77f78a9476b147744a9973d758d0e630..c0eea670a77f03c4dbb5afdb7d1197b6d9b76159 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -21757,6 +21757,7 @@ dependencies = [
  "polars",
  "project",
  "prompt_store",
+ "pulldown-cmark 0.12.2",
  "release_channel",
  "reqwest_client",
  "serde",
@@ -21766,6 +21767,7 @@ dependencies = [
  "smol",
  "soa-rs",
  "terminal_view",
+ "toml 0.8.23",
  "util",
  "watch",
  "zeta",
diff --git a/crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs b/crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs
index a0df39b50eb6753397f5afd37aa30b71b853b9c5..6caf9941845146dc0c30c4606f677e5ec816c137 100644
--- a/crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs
+++ b/crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs
@@ -212,7 +212,7 @@ pub fn write_codeblock<'a>(
     include_line_numbers: bool,
     output: &'a mut String,
 ) {
-    writeln!(output, "`````path={}", path.display()).unwrap();
+    writeln!(output, "`````{}", path.display()).unwrap();
     write_excerpts(
         excerpts,
         sorted_insertions,
diff --git a/crates/zeta2/src/related_excerpts.rs b/crates/zeta2/src/related_excerpts.rs
index dd27992274ae2b25ec07e2a47dc8a60b46f5f3f2..44388251e32678ff8d1b3ce594ab35996b235759 100644
--- a/crates/zeta2/src/related_excerpts.rs
+++ b/crates/zeta2/src/related_excerpts.rs
@@ -64,7 +64,7 @@ const SEARCH_PROMPT: &str = indoc! {r#"
 
     ## Current cursor context
 
-    `````path={current_file_path}
+    `````{current_file_path}
     {cursor_excerpt}
     `````
 
diff --git a/crates/zeta_cli/Cargo.toml b/crates/zeta_cli/Cargo.toml
index 19dafefbdcf8ed577a54e686b6b0c4ed90cf4512..a54298366614c3633cf527cc5746480e66c6caae 100644
--- a/crates/zeta_cli/Cargo.toml
+++ b/crates/zeta_cli/Cargo.toml
@@ -39,8 +39,10 @@ paths.workspace = true
 polars = { version = "0.51", features = ["lazy", "dtype-struct", "parquet"] }
 project.workspace = true
 prompt_store.workspace = true
+pulldown-cmark.workspace = true
 release_channel.workspace = true
 reqwest_client.workspace = true
+toml.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 settings.workspace = true
diff --git a/crates/zeta_cli/src/example.rs b/crates/zeta_cli/src/example.rs
new file mode 100644
index 0000000000000000000000000000000000000000..de95bbe8d0c97df7c12ce04f75de35ed41a660e4
--- /dev/null
+++ b/crates/zeta_cli/src/example.rs
@@ -0,0 +1,395 @@
+use std::{
+    borrow::Cow,
+    env,
+    fmt::{self, Display},
+    fs,
+    io::Write,
+    mem,
+    path::{Path, PathBuf},
+};
+
+use anyhow::{Context as _, Result};
+use clap::ValueEnum;
+use gpui::http_client::Url;
+use pulldown_cmark::CowStr;
+use serde::{Deserialize, Serialize};
+
+const CURSOR_POSITION_HEADING: &str = "Cursor Position";
+const EDIT_HISTORY_HEADING: &str = "Edit History";
+const EXPECTED_PATCH_HEADING: &str = "Expected Patch";
+const EXPECTED_EXCERPTS_HEADING: &str = "Expected Excerpts";
+const REPOSITORY_URL_FIELD: &str = "repository_url";
+const REVISION_FIELD: &str = "revision";
+
+/// An [`Example`] paired with the name it is known by.
+#[derive(Debug)]
+pub struct NamedExample {
+    pub name: String,
+    pub example: Example,
+}
+
+/// The serializable contents of an eval example.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Example {
+    pub repository_url: String,
+    pub revision: String,
+    pub cursor_path: PathBuf,
+    pub cursor_position: String,
+    pub edit_history: Vec<String>,
+    pub expected_patch: String,
+    pub expected_excerpts: Vec<ExpectedExcerpt>,
+}
+
+/// A file path and the excerpt text expected from that file.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ExpectedExcerpt {
+    path: PathBuf,
+    text: String,
+}
+
+/// On-disk formats an example can be converted to.
+#[derive(ValueEnum, Debug, Clone)]
+pub enum ExampleFormat {
+    Json,
+    Toml,
+    Md,
+}
+
+impl NamedExample {
+    /// Loads an example from `path`, picking the parser by file extension
+    /// (`json`, `toml`, or `md`). JSON and TOML examples are named after the
+    /// file; Markdown examples are named by their H1 heading.
+    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
+        let path = path.as_ref();
+        let content = std::fs::read_to_string(path)?;
+        let ext = path.extension();
+
+        match ext.and_then(|s| s.to_str()) {
+            Some("json") => Ok(Self {
+                name: path.file_name().unwrap_or_default().display().to_string(),
+                example: serde_json::from_str(&content)?,
+            }),
+            Some("toml") => Ok(Self {
+                name: path.file_name().unwrap_or_default().display().to_string(),
+                example: toml::from_str(&content)?,
+            }),
+            Some("md") => Self::parse_md(&content),
+            Some(_) => {
+                anyhow::bail!("Unrecognized example extension: {}", ext.unwrap().display());
+            }
+            None => {
+                anyhow::bail!(
+                    "Failed to determine example type since the file does not have an extension."
+                );
+            }
+        }
+    }
+
+    /// Parses the Markdown form of an example.
+    ///
+    /// The single H1 heading names the example, and `repository_url = ...` /
+    /// `revision = ...` lines under it set those fields. Fenced code blocks under
+    /// the H2 sections supply the cursor position, edit history, expected patch,
+    /// and expected excerpts; a missing cursor position block is an error.
+    pub fn parse_md(input: &str) -> Result<Self> {
+        use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
+
+        let parser = Parser::new(input);
+
+        let mut named = NamedExample {
+            name: String::new(),
+            example: Example {
+                repository_url: String::new(),
+                revision: String::new(),
+                cursor_path: PathBuf::new(),
+                cursor_position: String::new(),
+                edit_history: Vec::new(),
+                expected_patch: String::new(),
+                expected_excerpts: Vec::new(),
+            },
+        };
+
+        let mut text = String::new();
+        let mut current_section = String::new();
+        let mut block_info: CowStr = "".into();
+
+        for event in parser {
+            match event {
+                Event::Text(line) => {
+                    text.push_str(&line);
+
+                    if !named.name.is_empty()
+                        && current_section.is_empty()
+                        // in h1 section
+                        && let Some((field, value)) = line.split_once('=')
+                    {
+                        match field.trim() {
+                            REPOSITORY_URL_FIELD => {
+                                named.example.repository_url = value.trim().to_string();
+                            }
+                            REVISION_FIELD => {
+                                named.example.revision = value.trim().to_string();
+                            }
+                            _ => {
+                                eprintln!("Warning: Unrecognized field `{field}`");
+                            }
+                        }
+                    }
+                }
+                Event::End(TagEnd::Heading(HeadingLevel::H1)) => {
+                    if !named.name.is_empty() {
+                        anyhow::bail!(
+                            "Found multiple H1 headings. There should only be one with the name of the example."
+                        );
+                    }
+                    named.name = mem::take(&mut text);
+                }
+                Event::End(TagEnd::Heading(HeadingLevel::H2)) => {
+                    current_section = mem::take(&mut text);
+                }
+                Event::End(TagEnd::Heading(level)) => {
+                    anyhow::bail!("Unexpected heading level: {level}");
+                }
+                Event::Start(Tag::CodeBlock(kind)) => {
+                    match kind {
+                        CodeBlockKind::Fenced(info) => {
+                            block_info = info;
+                        }
+                        CodeBlockKind::Indented => {
+                            anyhow::bail!("Unexpected indented codeblock");
+                        }
+                    };
+                }
+                Event::Start(_) => {
+                    text.clear();
+                    block_info = "".into();
+                }
+                Event::End(TagEnd::CodeBlock) => {
+                    if current_section.eq_ignore_ascii_case(EDIT_HISTORY_HEADING) {
+                        named.example.edit_history.push(mem::take(&mut text));
+                    } else if current_section.eq_ignore_ascii_case(CURSOR_POSITION_HEADING) {
+                        let path = PathBuf::from(block_info.trim());
+                        named.example.cursor_path = path;
+                        named.example.cursor_position = mem::take(&mut text);
+                    } else if current_section.eq_ignore_ascii_case(EXPECTED_PATCH_HEADING) {
+                        named.example.expected_patch = mem::take(&mut text);
+                    } else if current_section.eq_ignore_ascii_case(EXPECTED_EXCERPTS_HEADING) {
+                        let path = excerpt_path_from_info(&block_info);
+                        named.example.expected_excerpts.push(ExpectedExcerpt {
+                            path,
+                            text: mem::take(&mut text),
+                        });
+                    } else {
+                        eprintln!("Warning: Unrecognized section `{current_section:?}`")
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        if named.example.cursor_path.as_path() == Path::new("")
+            || named.example.cursor_position.is_empty()
+        {
+            anyhow::bail!("Missing cursor position codeblock");
+        }
+
+        Ok(named)
+    }
+
+    /// Serializes this example to `out` in the requested `format`.
+    pub fn write(&self, format: ExampleFormat, mut out: impl Write) -> Result<()> {
+        match format {
+            ExampleFormat::Json => Ok(serde_json::to_writer(out, &self.example)?),
+            ExampleFormat::Toml => {
+                Ok(out.write_all(toml::to_string_pretty(&self.example)?.as_bytes())?)
+            }
+            ExampleFormat::Md => Ok(write!(out, "{}", self)?),
+        }
+    }
+
+    /// Fetches the example's revision into a repository under
+    /// `target/zeta-repos` and checks it out in a worktree under
+    /// `target/zeta-worktrees`, returning the worktree path.
+    #[allow(unused)]
+    pub async fn setup_worktree(&self) -> Result<PathBuf> {
+        let worktrees_dir = env::current_dir()?.join("target").join("zeta-worktrees");
+        let repos_dir = env::current_dir()?.join("target").join("zeta-repos");
+        fs::create_dir_all(&repos_dir)?;
+        fs::create_dir_all(&worktrees_dir)?;
+
+        let (repo_owner, repo_name) = self.repo_name()?;
+
+        let repo_dir = repos_dir.join(repo_owner.as_ref()).join(repo_name.as_ref());
+        if !repo_dir.is_dir() {
+            fs::create_dir_all(&repo_dir)?;
+            run_git(&repo_dir, &["init"]).await?;
+            run_git(
+                &repo_dir,
+                &["remote", "add", "origin", &self.example.repository_url],
+            )
+            .await?;
+        }
+
+        run_git(
+            &repo_dir,
+            &["fetch", "--depth", "1", "origin", &self.example.revision],
+        )
+        .await?;
+
+        let worktree_path = worktrees_dir.join(&self.name);
+
+        if worktree_path.is_dir() {
+            run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
+            run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
+            run_git(&worktree_path, &["checkout", &self.example.revision]).await?;
+        } else {
+            let worktree_path_string = worktree_path.to_string_lossy();
+            run_git(
+                &repo_dir,
+                &[
+                    "worktree",
+                    "add",
+                    "-f",
+                    &worktree_path_string,
+                    &self.example.revision,
+                ],
+            )
+            .await?;
+        }
+
+        Ok(worktree_path)
+    }
+
+    /// Splits the repository URL into `(owner, repo)`, accepting both
+    /// `git@host:owner/repo.git` and `http(s)://host/owner/repo.git` forms.
+    #[allow(unused)]
+    fn repo_name(&self) -> Result<(Cow<'_, str>, Cow<'_, str>)> {
+        // git@github.com:owner/repo.git
+        if self.example.repository_url.contains('@') {
+            let (owner, repo) = self
+                .example
+                .repository_url
+                .split_once(':')
+                .context("expected : in git url")?
+                .1
+                .split_once('/')
+                .context("expected / in git url")?;
+            Ok((
+                Cow::Borrowed(owner),
+                Cow::Borrowed(repo.trim_end_matches(".git")),
+            ))
+        // http://github.com/owner/repo.git
+        } else {
+            let url = Url::parse(&self.example.repository_url)?;
+            let mut segments = url.path_segments().context("empty http url")?;
+            let owner = segments
+                .next()
+                .context("expected owner path segment")?
+                .to_string();
+            let repo = segments
+                .next()
+                .context("expected repo path segment")?
+                .trim_end_matches(".git")
+                .to_string();
+            // The URL is user-supplied, so report extra segments as an error
+            // instead of panicking.
+            anyhow::ensure!(segments.next().is_none(), "unexpected extra path segment");
+
+            Ok((owner.into(), repo.into()))
+        }
+    }
+}
+
+/// Extracts the file path from the info string of an expected-excerpt fence.
+///
+/// The `Display` impl below writes these fences as `<extension> <path>`, so a
+/// leading word that equals the extension of the remainder is dropped. Any
+/// other info string is used as the path unchanged.
+fn excerpt_path_from_info(info: &str) -> PathBuf {
+    let info = info.trim();
+    if let Some((lang, rest)) = info.split_once(' ') {
+        let rest = rest.trim_start();
+        if Path::new(rest).extension().is_some_and(|ext| ext == lang) {
+            return PathBuf::from(rest);
+        }
+    }
+    PathBuf::from(info)
+}
+
+/// Runs `git` with `args` in `repo_path` and returns its trimmed stdout,
+/// failing with the captured output when the command exits unsuccessfully.
+async fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
+    let output = smol::process::Command::new("git")
+        .current_dir(repo_path)
+        .args(args)
+        .output()
+        .await?;
+
+    anyhow::ensure!(
+        output.status.success(),
+        "`git {}` within `{}` failed with status: {}\nstderr:\n{}\nstdout:\n{}",
+        args.join(" "),
+        repo_path.display(),
+        output.status,
+        String::from_utf8_lossy(&output.stderr),
+        String::from_utf8_lossy(&output.stdout),
+    );
+    Ok(String::from_utf8(output.stdout)?.trim().to_string())
+}
+
+/// Renders the example in its Markdown format.
+impl Display for NamedExample {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "# {}\n\n", self.name)?;
+        write!(
+            f,
+            "{REPOSITORY_URL_FIELD} = {}\n",
+            self.example.repository_url
+        )?;
+        write!(f, "{REVISION_FIELD} = {}\n\n", self.example.revision)?;
+
+        write!(
+            f,
+            "## {CURSOR_POSITION_HEADING}\n\n`````{}\n{}`````\n",
+            self.example.cursor_path.display(),
+            self.example.cursor_position
+        )?;
+        write!(f, "## {EDIT_HISTORY_HEADING}\n\n")?;
+
+        if !self.example.edit_history.is_empty() {
+            write!(f, "`````diff\n")?;
+            for item in &self.example.edit_history {
+                write!(f, "{item}")?;
+            }
+            write!(f, "`````\n")?;
+        }
+
+        if !self.example.expected_patch.is_empty() {
+            write!(
+                f,
+                "\n## {EXPECTED_PATCH_HEADING}\n\n`````diff\n{}`````\n",
+                self.example.expected_patch
+            )?;
+        }
+
+        if !self.example.expected_excerpts.is_empty() {
+            write!(f, "\n## {EXPECTED_EXCERPTS_HEADING}\n\n")?;
+
+            for excerpt in &self.example.expected_excerpts {
+                write!(
+                    f,
+                    "`````{}{}\n{}`````\n\n",
+                    excerpt
+                        .path
+                        .extension()
+                        .map(|ext| format!("{} ", ext.to_string_lossy()))
+                        .unwrap_or_default(),
+                    excerpt.path.display(),
+                    excerpt.text
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/crates/zeta_cli/src/main.rs b/crates/zeta_cli/src/main.rs
index 7a6d4b26dc87cd9db7d40fe2745520ee5f574ea6..8f19287744697e9f0d2ffd520be8a814790b8345 100644
--- a/crates/zeta_cli/src/main.rs
+++ b/crates/zeta_cli/src/main.rs
@@ -1,8 +1,10 @@
+mod example;
 mod headless;
 mod source_location;
 mod syntax_retrieval_stats;
 mod util;
 
+use crate::example::{ExampleFormat, NamedExample};
 use crate::syntax_retrieval_stats::retrieval_stats;
 use ::serde::Serialize;
 use ::util::paths::PathStyle;
@@ -22,6 +24,7 @@ use language_model::LanguageModelRegistry;
 use project::{Project, Worktree};
 use reqwest_client::ReqwestClient;
 use serde_json::json;
+use std::io;
 use std::{collections::HashSet, path::PathBuf, process::exit, str::FromStr, sync::Arc};
 use zeta2::{ContextMode, LlmContextOptions, SearchToolQuery};
 
@@ -48,6 +51,11 @@ enum Command {
         #[command(subcommand)]
         command: Zeta2Command,
     },
+    ConvertExample {
+        path: PathBuf,
+        #[arg(long, value_enum, default_value_t = ExampleFormat::Md)]
+        output_format: ExampleFormat,
+    },
 }
 
 #[derive(Subcommand, Debug)]
@@ -641,6 +649,15 @@ fn main() {
                     }
                 },
             },
+            Command::ConvertExample {
+                path,
+                output_format,
+            } => {
+                let example = NamedExample::load(path).unwrap();
+                example.write(output_format, io::stdout()).unwrap();
+                let _ = cx.update(|cx| cx.quit());
+                return;
+            }
         };
 
         match result {