diff --git a/.github/workflows/run_cron_unit_evals.yml b/.github/workflows/run_cron_unit_evals.yml index e7dcb04f77f3507aa861aee30e152850cb36b600..5f1d13b5b8311a67ebff207d1211acffbdff5d6e 100644 --- a/.github/workflows/run_cron_unit_evals.yml +++ b/.github/workflows/run_cron_unit_evals.yml @@ -13,6 +13,12 @@ on: jobs: cron_unit_evals: runs-on: namespace-profile-16x32-ubuntu-2204 + strategy: + matrix: + model: + - anthropic/claude-sonnet-4-5-latest + - anthropic/claude-opus-4-5-latest + fail-fast: false steps: - name: steps::checkout_repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 @@ -49,6 +55,7 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} + ZED_AGENT_MODEL: ${{ matrix.model }} - name: steps::cleanup_cargo_config if: always() run: | diff --git a/Cargo.lock b/Cargo.lock index 4b619bc4d1d90f817bba19cc42a4b43df46cce16..e398d20beaa0a8f7f381b4389bc5d17f310b12f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21017,6 +21017,7 @@ dependencies = [ "indexmap", "indoc", "serde", + "serde_json", "toml 0.8.23", "toml_edit 0.22.27", ] diff --git a/tooling/xtask/Cargo.toml b/tooling/xtask/Cargo.toml index 38c491dac668cf008aaee38acb30f870bcf09852..13179b2eb69ba9a63ba6be5784907b78bba1b9f2 100644 --- a/tooling/xtask/Cargo.toml +++ b/tooling/xtask/Cargo.toml @@ -18,5 +18,6 @@ toml.workspace = true indoc.workspace = true indexmap.workspace = true serde.workspace = true +serde_json.workspace = true toml_edit.workspace = true gh-workflow.workspace = true diff --git a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs index 220d3872f72326f42845622b5e3c61f4819f4550..34a7c6885db061191f5c3eac447838439708fbfe 100644 --- a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs +++ b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs @@ -1,4 +1,7 @@ -use gh_workflow::{Event, Expression, Job, Run, Schedule, Step, Use, Workflow, WorkflowDispatch}; +use gh_workflow::{ + Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch, +}; +use serde_json::json; use crate::tasks::workflows::{ runners::{self, Platform}, @@ -114,7 +117,31 @@ fn cron_unit_evals() -> NamedJob { "#})) } - named::job(unit_evals(None).add_step(send_failure_to_slack())) + named::job(cron_unit_evals_job().add_step(send_failure_to_slack())) +} + +const UNIT_EVAL_MODELS: &[&str] = &[ + "anthropic/claude-sonnet-4-5-latest", + "anthropic/claude-opus-4-5-latest", +]; + +fn cron_unit_evals_job() -> Job { + let script_step = add_api_keys(steps::script("./script/run-unit-evals")) + .add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}")); + + Job::default() + .runs_on(runners::LINUX_DEFAULT) + .strategy(Strategy::default().fail_fast(false).matrix(json!({ + "model": UNIT_EVAL_MODELS + }))) + .add_step(steps::checkout_repo()) + .add_step(steps::setup_cargo_config(Platform::Linux)) + .add_step(steps::cache_rust_dependencies_namespace()) + .map(steps::install_linux_dependencies) + .add_step(steps::cargo_install_nextest()) + .add_step(steps::clear_target_dir_if_large(Platform::Linux)) + .add_step(script_step) + .add_step(steps::cleanup_cargo_config(Platform::Linux)) } fn unit_evals(commit: Option<&WorkflowInput>) -> Job {