zeta cli: Add convert-example command (#41608)

Agus Zubiaga and Max Brunsfeld created

Adds a `convert-example` subcommand to the zeta cli that converts eval
examples from/to `json`, `toml`, and `md` formats.

Release Notes:

- N/A

---------

Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>

Change summary

Cargo.lock                                          |   2 
crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs |   2 
crates/zeta2/src/related_excerpts.rs                |   2 
crates/zeta_cli/Cargo.toml                          |   2 
crates/zeta_cli/src/example.rs                      | 355 +++++++++++++++
crates/zeta_cli/src/main.rs                         |  17 
6 files changed, 378 insertions(+), 2 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -21757,6 +21757,7 @@ dependencies = [
  "polars",
  "project",
  "prompt_store",
+ "pulldown-cmark 0.12.2",
  "release_channel",
  "reqwest_client",
  "serde",
@@ -21766,6 +21767,7 @@ dependencies = [
  "smol",
  "soa-rs",
  "terminal_view",
+ "toml 0.8.23",
  "util",
  "watch",
  "zeta",

crates/cloud_zeta2_prompt/src/cloud_zeta2_prompt.rs 🔗

@@ -212,7 +212,7 @@ pub fn write_codeblock<'a>(
     include_line_numbers: bool,
     output: &'a mut String,
 ) {
-    writeln!(output, "`````path={}", path.display()).unwrap();
+    writeln!(output, "`````{}", path.display()).unwrap();
     write_excerpts(
         excerpts,
         sorted_insertions,

crates/zeta2/src/related_excerpts.rs 🔗

@@ -64,7 +64,7 @@ const SEARCH_PROMPT: &str = indoc! {r#"
 
     ## Current cursor context
 
-    `````path={current_file_path}
+    `````{current_file_path}
     {cursor_excerpt}
     `````
 

crates/zeta_cli/Cargo.toml 🔗

@@ -39,8 +39,10 @@ paths.workspace = true
 polars = { version = "0.51", features = ["lazy", "dtype-struct", "parquet"] }
 project.workspace = true
 prompt_store.workspace = true
+pulldown-cmark.workspace = true
 release_channel.workspace = true
 reqwest_client.workspace = true
+toml.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 settings.workspace = true

crates/zeta_cli/src/example.rs 🔗

@@ -0,0 +1,355 @@
+use std::{
+    borrow::Cow,
+    env,
+    fmt::{self, Display},
+    fs,
+    io::Write,
+    mem,
+    path::{Path, PathBuf},
+};
+
+use anyhow::{Context as _, Result};
+use clap::ValueEnum;
+use gpui::http_client::Url;
+use pulldown_cmark::CowStr;
+use serde::{Deserialize, Serialize};
+
+// H2 section titles of the markdown example format. The parser compares them
+// ASCII-case-insensitively; the writer emits them verbatim.
+const CURSOR_POSITION_HEADING: &str = "Cursor Position";
+const EDIT_HISTORY_HEADING: &str = "Edit History";
+const EXPECTED_PATCH_HEADING: &str = "Expected Patch";
+const EXPECTED_EXCERPTS_HEADING: &str = "Expected Excerpts";
+// Keys of the `key = value` lines that sit between the H1 title and the first H2 section.
+const REPOSITORY_URL_FIELD: &str = "repository_url";
+const REVISION_FIELD: &str = "revision";
+
+/// An [`Example`] together with a name.
+///
+/// When loaded from `json`/`toml` the name is the file name; when parsed from
+/// markdown it is the text of the single H1 heading.
+#[derive(Debug)]
+pub struct NamedExample {
+    /// Name of the example; also used as the worktree directory name.
+    pub name: String,
+    /// The example data itself (the only part serialized to `json`/`toml`).
+    pub example: Example,
+}
+
+/// The serializable payload of an eval example.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Example {
+    /// Git remote the example is taken from (added as `origin` when setting up a worktree).
+    pub repository_url: String,
+    /// Git revision that is fetched and checked out for the example.
+    pub revision: String,
+    /// Path taken from the info string of the "Cursor Position" code block.
+    pub cursor_path: PathBuf,
+    /// Contents of the "Cursor Position" code block.
+    pub cursor_position: String,
+    /// One entry per code block found under the "Edit History" section.
+    pub edit_history: Vec<String>,
+    /// Contents of the code block under the "Expected Patch" section.
+    pub expected_patch: String,
+    /// One entry per code block found under the "Expected Excerpts" section.
+    pub expected_excerpts: Vec<ExpectedExcerpt>,
+}
+
+/// A single code block from the "Expected Excerpts" section.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ExpectedExcerpt {
+    /// Path taken from the code block's info string.
+    path: PathBuf,
+    /// Contents of the code block.
+    text: String,
+}
+
+/// Output formats supported by [`NamedExample::write`]; selectable from the
+/// command line through clap's `ValueEnum`.
+#[derive(ValueEnum, Debug, Clone)]
+pub enum ExampleFormat {
+    /// Compact JSON produced by `serde_json`.
+    Json,
+    /// Pretty-printed TOML.
+    Toml,
+    /// Markdown, rendered through the `Display` implementation of `NamedExample`.
+    Md,
+}
+
+impl NamedExample {
+    /// Loads an example from `path`, choosing the parser by file extension.
+    ///
+    /// `json` and `toml` files are deserialized directly into an [`Example`] and
+    /// named after the file; `md` files are handed to [`Self::parse_md`], which
+    /// takes the name from the document's H1 heading.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file cannot be read, when it has no extension or an
+    /// unrecognized one, or when its contents do not parse. Read and parse
+    /// errors mention the offending path.
+    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
+        let path = path.as_ref();
+        let content = fs::read_to_string(path)
+            .with_context(|| format!("Failed to read example file `{}`", path.display()))?;
+
+        let Some(ext) = path.extension() else {
+            anyhow::bail!(
+                "Failed to determine example type since the file does not have an extension."
+            );
+        };
+        let file_name = || path.file_name().unwrap_or_default().display().to_string();
+
+        match ext.to_str() {
+            Some("json") => Ok(Self {
+                name: file_name(),
+                example: serde_json::from_str(&content)
+                    .with_context(|| format!("Failed to parse `{}` as JSON", path.display()))?,
+            }),
+            Some("toml") => Ok(Self {
+                name: file_name(),
+                example: toml::from_str(&content)
+                    .with_context(|| format!("Failed to parse `{}` as TOML", path.display()))?,
+            }),
+            Some("md") => Self::parse_md(&content).with_context(|| {
+                format!("Failed to parse `{}` as a markdown example", path.display())
+            }),
+            // Also reached for extensions that are not valid UTF-8, which used to be
+            // reported as "no extension".
+            _ => anyhow::bail!("Unrecognized example extension: {}", ext.display()),
+        }
+    }
+
+    /// Parses an example from its markdown representation.
+    ///
+    /// Expected layout:
+    /// - exactly one H1 heading, whose text becomes the example name;
+    /// - `repository_url = …` / `revision = …` lines between the H1 and the first H2;
+    /// - H2 sections named "Cursor Position", "Edit History", "Expected Patch" and
+    ///   "Expected Excerpts" (matched ASCII-case-insensitively), each containing
+    ///   fenced code blocks. For cursor position and excerpts the fence info
+    ///   string is used as the file path.
+    ///
+    /// # Errors
+    ///
+    /// Fails on a second H1, on any heading deeper than H2, on an indented code
+    /// block, and when no non-empty cursor position block with a path was found.
+    /// Unknown fields and code blocks in unknown sections only print a warning
+    /// to stderr.
+    pub fn parse_md(input: &str) -> Result<Self> {
+        use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
+
+        let parser = Parser::new(input);
+
+        let mut named = NamedExample {
+            name: String::new(),
+            example: Example {
+                repository_url: String::new(),
+                revision: String::new(),
+                cursor_path: PathBuf::new(),
+                cursor_position: String::new(),
+                edit_history: Vec::new(),
+                expected_patch: String::new(),
+                expected_excerpts: Vec::new(),
+            },
+        };
+
+        // Text accumulated since the last block start; consumed at heading/code block ends.
+        let mut text = String::new();
+        // Title of the H2 section currently being read; empty while still under the H1.
+        let mut current_section = String::new();
+        // Info string of the fenced code block currently being read.
+        let mut block_info: CowStr = "".into();
+
+        for event in parser {
+            match event {
+                Event::Text(line) => {
+                    text.push_str(&line);
+
+                    if !named.name.is_empty()
+                        && current_section.is_empty()
+                        // in h1 section
+                        && let Some((field, value)) = line.split_once('=')
+                    {
+                        match field.trim() {
+                            REPOSITORY_URL_FIELD => {
+                                named.example.repository_url = value.trim().to_string();
+                            }
+                            REVISION_FIELD => {
+                                named.example.revision = value.trim().to_string();
+                            }
+                            _ => {
+                                eprintln!("Warning: Unrecognized field `{field}`");
+                            }
+                        }
+                    }
+                }
+                Event::End(TagEnd::Heading(HeadingLevel::H1)) => {
+                    if !named.name.is_empty() {
+                        anyhow::bail!(
+                            "Found multiple H1 headings. There should only be one with the name of the example."
+                        );
+                    }
+                    named.name = mem::take(&mut text);
+                }
+                Event::End(TagEnd::Heading(HeadingLevel::H2)) => {
+                    current_section = mem::take(&mut text);
+                }
+                Event::End(TagEnd::Heading(level)) => {
+                    anyhow::bail!("Unexpected heading level: {level}");
+                }
+                Event::Start(Tag::CodeBlock(kind)) => {
+                    // Drop any text left over from a preceding paragraph (or from a
+                    // code block in an unrecognized section) so that it is not
+                    // prepended to this block's contents.
+                    text.clear();
+                    match kind {
+                        CodeBlockKind::Fenced(info) => {
+                            block_info = info;
+                        }
+                        CodeBlockKind::Indented => {
+                            anyhow::bail!("Unexpected indented codeblock");
+                        }
+                    };
+                }
+                Event::Start(_) => {
+                    text.clear();
+                    block_info = "".into();
+                }
+                Event::End(TagEnd::CodeBlock) => {
+                    if current_section.eq_ignore_ascii_case(EDIT_HISTORY_HEADING) {
+                        named.example.edit_history.push(mem::take(&mut text));
+                    } else if current_section.eq_ignore_ascii_case(CURSOR_POSITION_HEADING) {
+                        let path = PathBuf::from(block_info.trim());
+                        named.example.cursor_path = path;
+                        named.example.cursor_position = mem::take(&mut text);
+                    } else if current_section.eq_ignore_ascii_case(EXPECTED_PATCH_HEADING) {
+                        named.example.expected_patch = mem::take(&mut text);
+                    } else if current_section.eq_ignore_ascii_case(EXPECTED_EXCERPTS_HEADING) {
+                        let path = PathBuf::from(block_info.trim());
+                        named.example.expected_excerpts.push(ExpectedExcerpt {
+                            path,
+                            text: mem::take(&mut text),
+                        });
+                    } else {
+                        eprintln!("Warning: Unrecognized section `{current_section:?}`")
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        // The cursor position is the one piece of data every example must provide.
+        if named.example.cursor_path.as_path() == Path::new("")
+            || named.example.cursor_position.is_empty()
+        {
+            anyhow::bail!("Missing cursor position codeblock");
+        }
+
+        Ok(named)
+    }
+
+    /// Serializes this example to `out` in the requested `format`.
+    ///
+    /// JSON and TOML contain only the [`Example`] payload; markdown is produced
+    /// by the `Display` implementation and therefore also includes the name.
+    pub fn write(&self, format: ExampleFormat, mut out: impl Write) -> Result<()> {
+        match format {
+            ExampleFormat::Json => serde_json::to_writer(out, &self.example)?,
+            ExampleFormat::Toml => {
+                let rendered = toml::to_string_pretty(&self.example)?;
+                out.write_all(rendered.as_bytes())?;
+            }
+            ExampleFormat::Md => write!(out, "{}", self)?,
+        }
+        Ok(())
+    }
+
+    /// Prepares a git worktree containing the example's revision and returns its path.
+    ///
+    /// Repositories live under `target/zeta-repos/<owner>/<repo>` and worktrees
+    /// under `target/zeta-worktrees/<example name>`, both relative to the current
+    /// working directory. A missing repository is initialized with the example's
+    /// URL as `origin`; the revision is then fetched with depth 1. An existing
+    /// worktree is cleaned, hard-reset and switched to the revision, otherwise a
+    /// new worktree is added.
+    #[allow(unused)]
+    pub async fn setup_worktree(&self) -> Result<PathBuf> {
+        let example = &self.example;
+        let revision = example.revision.as_str();
+
+        let worktrees_root = env::current_dir()?.join("target").join("zeta-worktrees");
+        let repos_root = env::current_dir()?.join("target").join("zeta-repos");
+        fs::create_dir_all(&repos_root)?;
+        fs::create_dir_all(&worktrees_root)?;
+
+        let (owner, repo) = self.repo_name()?;
+        let repo_dir = repos_root.join(owner.as_ref()).join(repo.as_ref());
+
+        // First use of this repository: create it and point `origin` at the example's URL.
+        if !repo_dir.is_dir() {
+            fs::create_dir_all(&repo_dir)?;
+            run_git(&repo_dir, &["init"]).await?;
+            let add_remote = ["remote", "add", "origin", example.repository_url.as_str()];
+            run_git(&repo_dir, &add_remote).await?;
+        }
+
+        let fetch = ["fetch", "--depth", "1", "origin", revision];
+        run_git(&repo_dir, &fetch).await?;
+
+        let worktree_path = worktrees_root.join(&self.name);
+        if !worktree_path.is_dir() {
+            let worktree_arg = worktree_path.to_string_lossy();
+            let add_worktree = ["worktree", "add", "-f", &*worktree_arg, revision];
+            run_git(&repo_dir, &add_worktree).await?;
+        } else {
+            // Reuse the existing worktree: discard local state, then move to the revision.
+            run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
+            run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
+            run_git(&worktree_path, &["checkout", revision]).await?;
+        }
+
+        Ok(worktree_path)
+    }
+
+    /// Extracts `(owner, repo)` from the example's repository URL.
+    ///
+    /// Two shapes are understood:
+    /// - scp-like remotes such as `git@github.com:owner/repo.git`;
+    /// - regular URLs such as `http://github.com/owner/repo.git`, where the path
+    ///   must consist of exactly the owner and repo segments.
+    ///
+    /// A trailing `.git` is removed from the repo name.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the URL matches neither shape. Extra path segments are reported
+    /// as an error rather than a panic, since the URL comes from user-provided
+    /// example files.
+    #[allow(unused)]
+    fn repo_name(&self) -> Result<(Cow<'_, str>, Cow<'_, str>)> {
+        let repository_url = self.example.repository_url.as_str();
+
+        // git@github.com:owner/repo.git
+        //
+        // URLs with an explicit scheme (e.g. `ssh://git@host/owner/repo.git`) may
+        // also contain `@`; they are real URLs and must go through `Url::parse`.
+        if repository_url.contains('@') && !repository_url.contains("://") {
+            let (_, path) = repository_url
+                .split_once(':')
+                .context("expected : in git url")?;
+            let (owner, repo) = path.split_once('/').context("expected / in git url")?;
+            Ok((
+                Cow::Borrowed(owner),
+                Cow::Borrowed(repo.trim_end_matches(".git")),
+            ))
+        // http://github.com/owner/repo.git
+        } else {
+            let url = Url::parse(repository_url)?;
+            let mut segments = url.path_segments().context("empty http url")?;
+            let owner = segments
+                .next()
+                .context("expected owner path segment")?
+                .to_string();
+            let repo = segments
+                .next()
+                .context("expected repo path segment")?
+                .trim_end_matches(".git")
+                .to_string();
+            anyhow::ensure!(
+                segments.next().is_none(),
+                "expected only owner and repo path segments in `{repository_url}`"
+            );
+
+            Ok((owner.into(), repo.into()))
+        }
+    }
+}
+
+/// Runs `git` with `args` inside `repo_path` and returns its trimmed stdout.
+///
+/// # Errors
+///
+/// Fails when the process cannot be run, when it exits unsuccessfully (the
+/// error then includes the command, directory, status, stderr and stdout), or
+/// when stdout is not valid UTF-8.
+async fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
+    let output = smol::process::Command::new("git")
+        .current_dir(repo_path)
+        .args(args)
+        .output()
+        .await?;
+
+    if !output.status.success() {
+        let command_line = args.join(" ");
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        anyhow::bail!(
+            "`git {}` within `{}` failed with status: {}\nstderr:\n{}\nstdout:\n{}",
+            command_line,
+            repo_path.display(),
+            output.status,
+            stderr,
+            stdout,
+        );
+    }
+
+    let stdout = String::from_utf8(output.stdout)?;
+    Ok(stdout.trim().to_string())
+}
+
+/// Renders the example in the markdown layout that `NamedExample::parse_md` reads.
+///
+/// To keep the output parseable back into the same example:
+/// - every edit history entry gets its own fenced block (the parser creates one
+///   entry per block);
+/// - the info string of cursor position and excerpt blocks is exactly the file
+///   path (the parser uses the whole info string as the path);
+/// - the closing fence always starts on its own line, even when the stored text
+///   does not end with a newline.
+impl Display for NamedExample {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let example = &self.example;
+
+        writeln!(f, "# {}\n", self.name)?;
+        writeln!(f, "{REPOSITORY_URL_FIELD} = {}", example.repository_url)?;
+        writeln!(f, "{REVISION_FIELD} = {}\n", example.revision)?;
+
+        writeln!(f, "## {CURSOR_POSITION_HEADING}\n")?;
+        write_fenced_block(f, example.cursor_path.display(), &example.cursor_position)?;
+
+        // The heading is written even when there are no edits.
+        writeln!(f, "## {EDIT_HISTORY_HEADING}\n")?;
+        for item in &example.edit_history {
+            write_fenced_block(f, "diff", item)?;
+        }
+
+        if !example.expected_patch.is_empty() {
+            writeln!(f, "## {EXPECTED_PATCH_HEADING}\n")?;
+            write_fenced_block(f, "diff", &example.expected_patch)?;
+        }
+
+        if !example.expected_excerpts.is_empty() {
+            writeln!(f, "## {EXPECTED_EXCERPTS_HEADING}\n")?;
+            for excerpt in &example.expected_excerpts {
+                write_fenced_block(f, excerpt.path.display(), &excerpt.text)?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Writes `text` as a five-backtick fenced code block with the given info
+/// string, followed by a blank line.
+fn write_fenced_block(f: &mut fmt::Formatter<'_>, info: impl Display, text: &str) -> fmt::Result {
+    writeln!(f, "`````{info}")?;
+    f.write_str(text)?;
+    // Without this the closing fence would end up on the last content line.
+    if !text.is_empty() && !text.ends_with('\n') {
+        f.write_str("\n")?;
+    }
+    writeln!(f, "`````\n")
+}

crates/zeta_cli/src/main.rs 🔗

@@ -1,8 +1,10 @@
+mod example;
 mod headless;
 mod source_location;
 mod syntax_retrieval_stats;
 mod util;
 
+use crate::example::{ExampleFormat, NamedExample};
 use crate::syntax_retrieval_stats::retrieval_stats;
 use ::serde::Serialize;
 use ::util::paths::PathStyle;
@@ -22,6 +24,7 @@ use language_model::LanguageModelRegistry;
 use project::{Project, Worktree};
 use reqwest_client::ReqwestClient;
 use serde_json::json;
+use std::io;
 use std::{collections::HashSet, path::PathBuf, process::exit, str::FromStr, sync::Arc};
 use zeta2::{ContextMode, LlmContextOptions, SearchToolQuery};
 
@@ -48,6 +51,11 @@ enum Command {
         #[command(subcommand)]
         command: Zeta2Command,
     },
+    ConvertExample {
+        path: PathBuf,
+        #[arg(long, value_enum, default_value_t = ExampleFormat::Md)]
+        output_format: ExampleFormat,
+    },
 }
 
 #[derive(Subcommand, Debug)]
@@ -641,6 +649,15 @@ fn main() {
                         }
                     },
                 },
+                Command::ConvertExample {
+                    path,
+                    output_format,
+                } => {
+                    let example = NamedExample::load(path).unwrap();
+                    example.write(output_format, io::stdout()).unwrap();
+                    let _ = cx.update(|cx| cx.quit());
+                    return;
+                }
             };
 
             match result {