run_agent_evals.rs

  1use gh_workflow::{
  2    Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch,
  3};
  4use serde_json::json;
  5
  6use crate::tasks::workflows::{
  7    runners::{self, Platform},
  8    steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
  9    vars::{self, WorkflowInput},
 10};
 11
 12pub(crate) fn run_agent_evals() -> Workflow {
 13    let agent_evals = agent_evals();
 14    let model_name = WorkflowInput::string("model_name", None);
 15
 16    named::workflow()
 17        .on(Event::default().workflow_dispatch(
 18            WorkflowDispatch::default().add_input(model_name.name, model_name.input()),
 19        ))
 20        .concurrency(vars::one_workflow_per_non_main_branch())
 21        .add_env(("CARGO_TERM_COLOR", "always"))
 22        .add_env(("CARGO_INCREMENTAL", 0))
 23        .add_env(("RUST_BACKTRACE", 1))
 24        .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
 25        .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
 26        .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
 27        .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
 28        .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
 29        .add_env(("ZED_EVAL_TELEMETRY", 1))
 30        .add_env(("MODEL_NAME", model_name.to_string()))
 31        .add_job(agent_evals.name, agent_evals.job)
 32}
 33
 34pub(crate) fn run_unit_evals() -> Workflow {
 35    let model_name = WorkflowInput::string("model_name", None);
 36    let commit_sha = WorkflowInput::string("commit_sha", None);
 37
 38    let unit_evals = named::job(unit_evals(Some(&commit_sha)));
 39
 40    named::workflow()
 41        .name("run_unit_evals")
 42        .on(Event::default().workflow_dispatch(
 43            WorkflowDispatch::default()
 44                .add_input(model_name.name, model_name.input())
 45                .add_input(commit_sha.name, commit_sha.input()),
 46        ))
 47        .concurrency(vars::allow_concurrent_runs())
 48        .add_env(("CARGO_TERM_COLOR", "always"))
 49        .add_env(("CARGO_INCREMENTAL", 0))
 50        .add_env(("RUST_BACKTRACE", 1))
 51        .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
 52        .add_env(("ZED_EVAL_TELEMETRY", 1))
 53        .add_env(("MODEL_NAME", model_name.to_string()))
 54        .add_job(unit_evals.name, unit_evals.job)
 55}
 56
 57fn add_api_keys(step: Step<Run>) -> Step<Run> {
 58    step.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
 59        .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
 60        .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
 61        .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
 62}
 63
 64fn agent_evals() -> NamedJob {
 65    fn run_eval() -> Step<Run> {
 66        named::bash(
 67            "cargo run --package=eval -- --repetitions=8 --concurrency=1 --model \"${MODEL_NAME}\"",
 68        )
 69    }
 70
 71    named::job(
 72        Job::default()
 73            .runs_on(runners::LINUX_DEFAULT)
 74            .timeout_minutes(60_u32 * 10)
 75            .add_step(steps::checkout_repo())
 76            .add_step(steps::cache_rust_dependencies_namespace())
 77            .map(steps::install_linux_dependencies)
 78            .add_step(setup_cargo_config(Platform::Linux))
 79            .add_step(steps::script("cargo build --package=eval"))
 80            .add_step(add_api_keys(run_eval()))
 81            .add_step(steps::cleanup_cargo_config(Platform::Linux)),
 82    )
 83}
 84
 85pub(crate) fn run_cron_unit_evals() -> Workflow {
 86    let unit_evals = cron_unit_evals();
 87
 88    named::workflow()
 89        .name("run_cron_unit_evals")
 90        .on(Event::default()
 91            .schedule([
 92                // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
 93                Schedule::default().cron("47 1 * * 2"),
 94            ])
 95            .workflow_dispatch(WorkflowDispatch::default()))
 96        .concurrency(vars::one_workflow_per_non_main_branch())
 97        .add_env(("CARGO_TERM_COLOR", "always"))
 98        .add_env(("CARGO_INCREMENTAL", 0))
 99        .add_env(("RUST_BACKTRACE", 1))
100        .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
101        .add_job(unit_evals.name, unit_evals.job)
102}
103
104fn cron_unit_evals() -> NamedJob {
105    fn send_failure_to_slack() -> Step<Use> {
106        named::uses(
107            "slackapi",
108            "slack-github-action",
109            "b0fa283ad8fea605de13dc3f449259339835fc52",
110        )
111        .if_condition(Expression::new("${{ failure() }}"))
112        .add_with(("method", "chat.postMessage"))
113        .add_with(("token", vars::SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN))
114        .add_with(("payload", indoc::indoc!{r#"
115            channel: C04UDRNNJFQ
116            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
117        "#}))
118    }
119
120    named::job(cron_unit_evals_job().add_step(send_failure_to_slack()))
121}
122
/// Model identifiers, in `provider/model` form, that the scheduled unit-evals
/// job fans out over: each entry becomes one value of the job matrix's
/// `model` key.
const UNIT_EVAL_MODELS: &[&str] = &[
    "anthropic/claude-sonnet-4-5-latest",
    "anthropic/claude-opus-4-5-latest",
    "google/gemini-3-pro",
    "openai/gpt-5",
];
129
130fn cron_unit_evals_job() -> Job {
131    let script_step = add_api_keys(steps::script("./script/run-unit-evals"))
132        .add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}"));
133
134    Job::default()
135        .runs_on(runners::LINUX_DEFAULT)
136        .strategy(Strategy::default().fail_fast(false).matrix(json!({
137            "model": UNIT_EVAL_MODELS
138        })))
139        .add_step(steps::checkout_repo())
140        .add_step(steps::setup_cargo_config(Platform::Linux))
141        .add_step(steps::cache_rust_dependencies_namespace())
142        .map(steps::install_linux_dependencies)
143        .add_step(steps::cargo_install_nextest())
144        .add_step(steps::clear_target_dir_if_large(Platform::Linux))
145        .add_step(script_step)
146        .add_step(steps::cleanup_cargo_config(Platform::Linux))
147}
148
149fn unit_evals(commit: Option<&WorkflowInput>) -> Job {
150    let script_step = add_api_keys(steps::script("./script/run-unit-evals"));
151
152    Job::default()
153        .runs_on(runners::LINUX_DEFAULT)
154        .add_step(steps::checkout_repo())
155        .add_step(steps::setup_cargo_config(Platform::Linux))
156        .add_step(steps::cache_rust_dependencies_namespace())
157        .map(steps::install_linux_dependencies)
158        .add_step(steps::cargo_install_nextest())
159        .add_step(steps::clear_target_dir_if_large(Platform::Linux))
160        .add_step(match commit {
161            Some(commit) => script_step.add_env(("UNIT_EVAL_COMMIT", commit)),
162            None => script_step,
163        })
164        .add_step(steps::cleanup_cargo_config(Platform::Linux))
165}