Run the unit evals cron in a matrix (#43907)

Richard Feldman created 1 month ago

For now, the matrix only runs Sonnet 4.5 and Opus 4.5; I'll make a separate PR
for non-Anthropic models, in case they introduce new failures.

Release Notes:

- N/A

Change summary

.github/workflows/run_cron_unit_evals.yml            |  7 +++
Cargo.lock                                           |  1 
tooling/xtask/Cargo.toml                             |  1 
tooling/xtask/src/tasks/workflows/run_agent_evals.rs | 31 +++++++++++++
4 files changed, 38 insertions(+), 2 deletions(-)

Detailed changes

.github/workflows/run_cron_unit_evals.yml 🔗

@@ -13,6 +13,12 @@ on:
 jobs:
   cron_unit_evals:
     runs-on: namespace-profile-16x32-ubuntu-2204
+    strategy:
+      matrix:
+        model:
+        - anthropic/claude-sonnet-4-5-latest
+        - anthropic/claude-opus-4-5-latest
+      fail-fast: false
     steps:
     - name: steps::checkout_repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -49,6 +55,7 @@ jobs:
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
         GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
+        ZED_AGENT_MODEL: ${{ matrix.model }}
     - name: steps::cleanup_cargo_config
       if: always()
       run: |

Cargo.lock 🔗

@@ -21017,6 +21017,7 @@ dependencies = [
  "indexmap",
  "indoc",
  "serde",
+ "serde_json",
  "toml 0.8.23",
  "toml_edit 0.22.27",
 ]

tooling/xtask/Cargo.toml 🔗

@@ -18,5 +18,6 @@ toml.workspace = true
 indoc.workspace = true
 indexmap.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 toml_edit.workspace = true
 gh-workflow.workspace = true

tooling/xtask/src/tasks/workflows/run_agent_evals.rs 🔗

@@ -1,4 +1,7 @@
-use gh_workflow::{Event, Expression, Job, Run, Schedule, Step, Use, Workflow, WorkflowDispatch};
+use gh_workflow::{
+    Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch,
+};
+use serde_json::json;
 
 use crate::tasks::workflows::{
     runners::{self, Platform},
@@ -114,7 +117,31 @@ fn cron_unit_evals() -> NamedJob {
         "#}))
     }
 
-    named::job(unit_evals(None).add_step(send_failure_to_slack()))
+    named::job(cron_unit_evals_job().add_step(send_failure_to_slack()))
+}
+
+const UNIT_EVAL_MODELS: &[&str] = &[
+    "anthropic/claude-sonnet-4-5-latest",
+    "anthropic/claude-opus-4-5-latest",
+];
+
+fn cron_unit_evals_job() -> Job {
+    let script_step = add_api_keys(steps::script("./script/run-unit-evals"))
+        .add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}"));
+
+    Job::default()
+        .runs_on(runners::LINUX_DEFAULT)
+        .strategy(Strategy::default().fail_fast(false).matrix(json!({
+            "model": UNIT_EVAL_MODELS
+        })))
+        .add_step(steps::checkout_repo())
+        .add_step(steps::setup_cargo_config(Platform::Linux))
+        .add_step(steps::cache_rust_dependencies_namespace())
+        .map(steps::install_linux_dependencies)
+        .add_step(steps::cargo_install_nextest())
+        .add_step(steps::clear_target_dir_if_large(Platform::Linux))
+        .add_step(script_step)
+        .add_step(steps::cleanup_cargo_config(Platform::Linux))
 }
 
 fn unit_evals(commit: Option<&WorkflowInput>) -> Job {