Split out cron and non-cron unit evals (#42472)

Created by Richard Feldman and Bennet Bo Fenner

Release Notes:

- N/A

---------

Co-authored-by: Bennet Bo Fenner <bennetbo@gmx.de>

Change summary

.github/workflows/run_agent_evals.yml                |  5 
.github/workflows/run_cron_unit_evals.yml            | 78 ++++++++++++
.github/workflows/run_unit_evals.yml                 | 21 ++
script/run-unit-evals                                |  4 
tooling/xtask/src/tasks/workflows.rs                 |  4 
tooling/xtask/src/tasks/workflows/run_agent_evals.rs | 91 ++++++++++---
6 files changed, 178 insertions(+), 25 deletions(-)

Detailed changes

.github/workflows/run_agent_evals.yml 🔗

@@ -51,6 +51,11 @@ jobs:
     - name: run_agent_evals::agent_evals::run_eval
       run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}"
       shell: bash -euxo pipefail {0}
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
+        GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
     - name: steps::cleanup_cargo_config
       if: always()
       run: |

.github/workflows/run_cron_unit_evals.yml 🔗

@@ -0,0 +1,78 @@
+# Generated from xtask::workflows::run_cron_unit_evals
+# Rebuild with `cargo xtask workflows`.
+name: run_cron_unit_evals
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: '0'
+  RUST_BACKTRACE: '1'
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+on:
+  schedule:
+  - cron: 47 1 * * 2
+  workflow_dispatch: {}
+jobs:
+  cron_unit_evals:
+    runs-on: namespace-profile-16x32-ubuntu-2204
+    steps:
+    - name: steps::checkout_repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        clean: false
+    - name: steps::setup_cargo_config
+      run: |
+        mkdir -p ./../.cargo
+        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+      shell: bash -euxo pipefail {0}
+    - name: steps::cache_rust_dependencies_namespace
+      uses: namespacelabs/nscloud-cache-action@v1
+      with:
+        cache: rust
+    - name: steps::setup_linux
+      run: ./script/linux
+      shell: bash -euxo pipefail {0}
+    - name: steps::install_mold
+      run: ./script/install-mold
+      shell: bash -euxo pipefail {0}
+    - name: steps::download_wasi_sdk
+      run: ./script/download-wasi-sdk
+      shell: bash -euxo pipefail {0}
+    - name: steps::cargo_install_nextest
+      run: cargo install cargo-nextest --locked
+      shell: bash -euxo pipefail {0}
+    - name: steps::clear_target_dir_if_large
+      run: ./script/clear-target-dir-if-larger-than 250
+      shell: bash -euxo pipefail {0}
+    - name: ./script/run-unit-evals
+      run: ./script/run-unit-evals
+      shell: bash -euxo pipefail {0}
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
+        GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
+    - name: run_agent_evals::unit_evals::send_failure_to_slack
+      if: ${{ failure() }}
+      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+      with:
+        method: chat.postMessage
+        token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+        payload: |
+          channel: C04UDRNNJFQ
+          text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+    - name: steps::cleanup_cargo_config
+      if: always()
+      run: |
+        rm -rf ./../.cargo
+      shell: bash -euxo pipefail {0}
+    - name: run_agent_evals::cron_unit_evals::send_failure_to_slack
+      if: ${{ failure() }}
+      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+      with:
+        method: chat.postMessage
+        token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+        payload: |
+          channel: C04UDRNNJFQ
+          text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

.github/workflows/run_unit_evals.yml 🔗

@@ -6,12 +6,21 @@ env:
   CARGO_INCREMENTAL: '0'
   RUST_BACKTRACE: '1'
   ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+  ZED_EVAL_TELEMETRY: '1'
+  MODEL_NAME: ${{ inputs.model_name }}
 on:
-  schedule:
-  - cron: 47 1 * * 2
-  workflow_dispatch: {}
+  workflow_dispatch:
+    inputs:
+      model_name:
+        description: model_name
+        required: true
+        type: string
+      commit_sha:
+        description: commit_sha
+        required: true
+        type: string
 jobs:
-  unit_evals:
+  run_unit_evals:
     runs-on: namespace-profile-16x32-ubuntu-2204
     steps:
     - name: steps::checkout_repo
@@ -47,6 +56,10 @@ jobs:
       shell: bash -euxo pipefail {0}
       env:
         ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
+        GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
+        UNIT_EVAL_COMMIT: ${{ inputs.commit_sha }}
     - name: run_agent_evals::unit_evals::send_failure_to_slack
       if: ${{ failure() }}
       uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52

script/run-unit-evals 🔗

@@ -2,4 +2,8 @@
 
 set -euxo pipefail
 
+if [ -n "${UNIT_EVAL_COMMIT:-}" ]; then
+  git checkout "$UNIT_EVAL_COMMIT"
+fi
+
 GPUI_TEST_TIMEOUT=1500 cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'

tooling/xtask/src/tasks/workflows.rs 🔗

@@ -33,6 +33,10 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
         ("cherry_pick.yml", cherry_pick::cherry_pick()),
         ("compare_perf.yml", compare_perf::compare_perf()),
         ("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
+        (
+            "run_cron_unit_evals.yml",
+            run_agent_evals::run_cron_unit_evals(),
+        ),
         ("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
         ("after_release.yml", after_release::after_release()),
     ];

tooling/xtask/src/tasks/workflows/run_agent_evals.rs 🔗

@@ -28,6 +28,36 @@ pub(crate) fn run_agent_evals() -> Workflow {
         .add_job(agent_evals.name, agent_evals.job)
 }
 
+pub(crate) fn run_unit_evals() -> Workflow {
+    let model_name = Input::string("model_name", None);
+    let commit_sha = Input::string("commit_sha", None);
+
+    let unit_evals = named::job(unit_evals(Some(&commit_sha)));
+
+    named::workflow()
+        .name("run_unit_evals")
+        .on(Event::default().workflow_dispatch(
+            WorkflowDispatch::default()
+                .add_input(model_name.name, model_name.input())
+                .add_input(commit_sha.name, commit_sha.input()),
+        ))
+        .concurrency(vars::one_workflow_per_non_main_branch())
+        .add_env(("CARGO_TERM_COLOR", "always"))
+        .add_env(("CARGO_INCREMENTAL", 0))
+        .add_env(("RUST_BACKTRACE", 1))
+        .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
+        .add_env(("ZED_EVAL_TELEMETRY", 1))
+        .add_env(("MODEL_NAME", model_name.to_string()))
+        .add_job(unit_evals.name, unit_evals.job)
+}
+
+fn add_api_keys(step: Step<Run>) -> Step<Run> {
+    step.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
+        .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
+        .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
+        .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
+}
+
 fn agent_evals() -> NamedJob {
     fn run_eval() -> Step<Run> {
         named::bash(
@@ -44,16 +74,16 @@ fn agent_evals() -> NamedJob {
             .map(steps::install_linux_dependencies)
             .add_step(setup_cargo_config(Platform::Linux))
             .add_step(steps::script("cargo build --package=eval"))
-            .add_step(run_eval())
+            .add_step(add_api_keys(run_eval()))
             .add_step(steps::cleanup_cargo_config(Platform::Linux)),
     )
 }
 
-pub(crate) fn run_unit_evals() -> Workflow {
-    let unit_evals = unit_evals();
+pub(crate) fn run_cron_unit_evals() -> Workflow {
+    let unit_evals = cron_unit_evals();
 
     named::workflow()
-        .name("run_unit_evals")
+        .name("run_cron_unit_evals")
         .on(Event::default()
             .schedule([
                 // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
@@ -68,7 +98,7 @@ pub(crate) fn run_unit_evals() -> Workflow {
         .add_job(unit_evals.name, unit_evals.job)
 }
 
-fn unit_evals() -> NamedJob {
+fn cron_unit_evals() -> NamedJob {
     fn send_failure_to_slack() -> Step<Use> {
         named::uses(
             "slackapi",
@@ -84,20 +114,39 @@ fn unit_evals() -> NamedJob {
         "#}))
     }
 
-    named::job(
-        Job::default()
-            .runs_on(runners::LINUX_DEFAULT)
-            .add_step(steps::checkout_repo())
-            .add_step(steps::setup_cargo_config(Platform::Linux))
-            .add_step(steps::cache_rust_dependencies_namespace())
-            .map(steps::install_linux_dependencies)
-            .add_step(steps::cargo_install_nextest(Platform::Linux))
-            .add_step(steps::clear_target_dir_if_large(Platform::Linux))
-            .add_step(
-                steps::script("./script/run-unit-evals")
-                    .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)),
-            )
-            .add_step(send_failure_to_slack())
-            .add_step(steps::cleanup_cargo_config(Platform::Linux)),
-    )
+    named::job(unit_evals(None).add_step(send_failure_to_slack()))
+}
+
+fn unit_evals(commit: Option<&Input>) -> Job {
+    fn send_failure_to_slack() -> Step<Use> {
+        named::uses(
+            "slackapi",
+            "slack-github-action",
+            "b0fa283ad8fea605de13dc3f449259339835fc52",
+        )
+        .if_condition(Expression::new("${{ failure() }}"))
+        .add_with(("method", "chat.postMessage"))
+        .add_with(("token", vars::SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN))
+        .add_with(("payload", indoc::indoc!{r#"
+            channel: C04UDRNNJFQ
+            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+        "#}))
+    }
+
+    let script_step = add_api_keys(steps::script("./script/run-unit-evals"));
+
+    Job::default()
+        .runs_on(runners::LINUX_DEFAULT)
+        .add_step(steps::checkout_repo())
+        .add_step(steps::setup_cargo_config(Platform::Linux))
+        .add_step(steps::cache_rust_dependencies_namespace())
+        .map(steps::install_linux_dependencies)
+        .add_step(steps::cargo_install_nextest(Platform::Linux))
+        .add_step(steps::clear_target_dir_if_large(Platform::Linux))
+        .add_step(match commit {
+            Some(commit) => script_step.add_env(("UNIT_EVAL_COMMIT", commit)),
+            None => script_step,
+        })
+        .add_step(send_failure_to_slack())
+        .add_step(steps::cleanup_cargo_config(Platform::Linux))
 }