gh-workflow unit evals (#41637)

Ben Kunkle created

Closes #ISSUE

Release Notes:

- N/A

Change summary

.github/workflows/eval.yml                           |  71 --------
.github/workflows/run_agent_evals.yml                |  62 +++++++
.github/workflows/run_unit_evals.yml                 |  63 +++++++
.github/workflows/unit_evals.yml                     |  86 ----------
script/run-unit-evals                                |   5 
tooling/xtask/src/tasks/workflows.rs                 |   3 
tooling/xtask/src/tasks/workflows/run_agent_evals.rs | 113 ++++++++++++++
7 files changed, 246 insertions(+), 157 deletions(-)

Detailed changes

.github/workflows/eval.yml 🔗

@@ -1,71 +0,0 @@
-name: Run Agent Eval
-
-on:
-  schedule:
-    - cron: "0 0 * * *"
-
-  pull_request:
-    branches:
-      - "**"
-    types: [synchronize, reopened, labeled]
-
-  workflow_dispatch:
-
-concurrency:
-  # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  CARGO_INCREMENTAL: 0
-  RUST_BACKTRACE: 1
-  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
-  ZED_EVAL_TELEMETRY: 1
-
-jobs:
-  run_eval:
-    timeout-minutes: 60
-    name: Run Agent Eval
-    if: >
-      github.repository_owner == 'zed-industries' &&
-      (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
-    runs-on:
-      - namespace-profile-16x32-ubuntu-2204
-    steps:
-      - name: Add Rust to the PATH
-        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
-
-      - name: Checkout repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
-        with:
-          clean: false
-
-      - name: Cache dependencies
-        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
-        with:
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-          # cache-provider: "buildjet"
-
-      - name: Install Linux dependencies
-        run: ./script/linux
-
-      - name: Configure CI
-        run: |
-          mkdir -p ./../.cargo
-          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
-
-      - name: Compile eval
-        run: cargo build --package=eval
-
-      - name: Run eval
-        run: cargo run --package=eval -- --repetitions=8 --concurrency=1
-
-      # Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
-      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
-      # to clean up the config file, I’ve included the cleanup code here as a precaution.
-      # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
-      - name: Clean CI config file
-        if: always()
-        run: rm -rf ./../.cargo

.github/workflows/run_agent_evals.yml 🔗

@@ -0,0 +1,62 @@
+# Generated from xtask::workflows::run_agent_evals
+# Rebuild with `cargo xtask workflows`.
+name: run_agent_evals
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: '0'
+  RUST_BACKTRACE: '1'
+  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+  ZED_EVAL_TELEMETRY: '1'
+on:
+  pull_request:
+    types:
+    - synchronize
+    - reopened
+    - labeled
+    branches:
+    - '**'
+  schedule:
+  - cron: 0 0 * * *
+  workflow_dispatch: {}
+jobs:
+  agent_evals:
+    if: |
+      github.repository_owner == 'zed-industries' &&
+      (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
+    runs-on: namespace-profile-16x32-ubuntu-2204
+    steps:
+    - name: steps::checkout_repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        clean: false
+    - name: steps::cache_rust_dependencies
+      uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
+      with:
+        save-if: ${{ github.ref == 'refs/heads/main' }}
+    - name: steps::setup_linux
+      run: ./script/linux
+      shell: bash -euxo pipefail {0}
+    - name: steps::install_mold
+      run: ./script/install-mold
+      shell: bash -euxo pipefail {0}
+    - name: steps::setup_cargo_config
+      run: |
+        mkdir -p ./../.cargo
+        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+      shell: bash -euxo pipefail {0}
+    - name: cargo build --package=eval
+      run: cargo build --package=eval
+      shell: bash -euxo pipefail {0}
+    - name: run_agent_evals::agent_evals::run_eval
+      run: cargo run --package=eval -- --repetitions=8 --concurrency=1
+      shell: bash -euxo pipefail {0}
+    - name: steps::cleanup_cargo_config
+      if: always()
+      run: |
+        rm -rf ./../.cargo
+      shell: bash -euxo pipefail {0}
+    timeout-minutes: 60
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

.github/workflows/run_unit_evals.yml 🔗

@@ -0,0 +1,63 @@
+# Generated from xtask::workflows::run_agent_evals
+# Rebuild with `cargo xtask workflows`.
+# Runs the unit evals (./script/run-unit-evals) on a weekly schedule or on
+# manual dispatch, and reports failures to Slack.
+#
+# The name must differ from the agent-eval workflow's name: the concurrency
+# group below is keyed on `github.workflow`, so two workflows sharing a name
+# would share a group and cancel each other's in-progress runs.
+name: run_unit_evals
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: '0'
+  RUST_BACKTRACE: '1'
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+on:
+  schedule:
+  # GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
+  - cron: 47 1 * * 2
+  workflow_dispatch: {}
+jobs:
+  unit_evals:
+    # Only run in the upstream organization; forks lack the required secrets.
+    if: github.repository_owner == 'zed-industries'
+    runs-on: namespace-profile-16x32-ubuntu-2204
+    steps:
+    - name: steps::checkout_repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        clean: false
+    - name: steps::setup_cargo_config
+      run: |
+        mkdir -p ./../.cargo
+        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+      shell: bash -euxo pipefail {0}
+    - name: steps::cache_rust_dependencies
+      uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
+      with:
+        save-if: ${{ github.ref == 'refs/heads/main' }}
+    - name: steps::setup_linux
+      run: ./script/linux
+      shell: bash -euxo pipefail {0}
+    - name: steps::install_mold
+      run: ./script/install-mold
+      shell: bash -euxo pipefail {0}
+    - name: steps::cargo_install_nextest
+      run: cargo install cargo-nextest --locked
+      shell: bash -euxo pipefail {0}
+    - name: steps::clear_target_dir_if_large
+      run: ./script/clear-target-dir-if-larger-than 100
+      shell: bash -euxo pipefail {0}
+    - name: ./script/run-unit-evals
+      run: ./script/run-unit-evals
+      shell: bash -euxo pipefail {0}
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+    - name: run_agent_evals::unit_evals::send_failure_to_slack
+      if: ${{ failure() }}
+      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+      with:
+        method: chat.postMessage
+        token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+        payload: |
+          channel: C04UDRNNJFQ
+          text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+    - name: steps::cleanup_cargo_config
+      if: always()
+      run: |
+        rm -rf ./../.cargo
+      shell: bash -euxo pipefail {0}
+    # Cap the job at one hour, as the hand-written workflow it replaces did.
+    timeout-minutes: 60
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true

.github/workflows/unit_evals.yml 🔗

@@ -1,86 +0,0 @@
-name: Run Unit Evals
-
-on:
-  schedule:
-    # GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
-    - cron: "47 1 * * 2"
-  workflow_dispatch:
-
-concurrency:
-  # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  CARGO_INCREMENTAL: 0
-  RUST_BACKTRACE: 1
-  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
-
-jobs:
-  unit_evals:
-    if: github.repository_owner == 'zed-industries'
-    timeout-minutes: 60
-    name: Run unit evals
-    runs-on:
-      - namespace-profile-16x32-ubuntu-2204
-    steps:
-      - name: Add Rust to the PATH
-        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
-
-      - name: Checkout repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
-        with:
-          clean: false
-
-      - name: Cache dependencies
-        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
-        with:
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-          # cache-provider: "buildjet"
-
-      - name: Install Linux dependencies
-        run: ./script/linux
-
-      - name: Configure CI
-        run: |
-          mkdir -p ./../.cargo
-          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
-
-      - name: Install Rust
-        shell: bash -euxo pipefail {0}
-        run: |
-          cargo install cargo-nextest --locked
-
-      - name: Install Node
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
-        with:
-          node-version: "18"
-
-      - name: Limit target directory size
-        shell: bash -euxo pipefail {0}
-        run: script/clear-target-dir-if-larger-than 100
-
-      - name: Run unit evals
-        shell: bash -euxo pipefail {0}
-        run: cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-
-      - name: Send failure message to Slack channel if needed
-        if: ${{ failure() }}
-        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
-        with:
-          method: chat.postMessage
-          token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
-          payload: |
-            channel: C04UDRNNJFQ
-            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
-
-      # Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
-      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
-      # to clean up the config file, I’ve included the cleanup code here as a precaution.
-      # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
-      - name: Clean CI config file
-        if: always()
-        run: rm -rf ./../.cargo

script/run-unit-evals 🔗

@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+set -euxo pipefail
+
+cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'

tooling/xtask/src/tasks/workflows.rs 🔗

@@ -10,6 +10,7 @@ mod release_nightly;
 mod run_bundling;
 
 mod release;
+mod run_agent_evals;
 mod run_tests;
 mod runners;
 mod steps;
@@ -28,6 +29,8 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
         ("run_tests.yml", run_tests::run_tests()),
         ("release.yml", release::release()),
         ("compare_perf.yml", compare_perf::compare_perf()),
+        ("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
+        ("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
     ];
     fs::create_dir_all(dir)
         .with_context(|| format!("Failed to create directory: {}", dir.display()))?;

tooling/xtask/src/tasks/workflows/run_agent_evals.rs 🔗

@@ -0,0 +1,113 @@
+use gh_workflow::{
+    Event, Expression, Job, PullRequest, PullRequestType, Run, Schedule, Step, Use, Workflow,
+    WorkflowDispatch,
+};
+
+use crate::tasks::workflows::{
+    runners::{self, Platform},
+    steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
+    vars,
+};
+
+/// Builds the agent-eval workflow definition.
+///
+/// Triggers: a daily schedule (cron `0 0 * * *`), pull-request
+/// `synchronize`/`reopened`/`labeled` events on any branch, and manual
+/// dispatch. The workflow-level environment carries the cargo settings plus
+/// the Anthropic API key, the client checksum seed and the eval telemetry
+/// flag. The single job comes from [`agent_evals`].
+pub(crate) fn run_agent_evals() -> Workflow {
+    let eval_job = agent_evals();
+
+    // Assemble the trigger set up front so the workflow chain below stays flat.
+    let daily = Schedule::default().cron("0 0 * * *");
+    let pull_requests = PullRequest::default().add_branch("**").types([
+        PullRequestType::Synchronize,
+        PullRequestType::Reopened,
+        PullRequestType::Labeled,
+    ]);
+    let triggers = Event::default()
+        .schedule([daily])
+        .pull_request(pull_requests)
+        .workflow_dispatch(WorkflowDispatch::default());
+
+    named::workflow()
+        .on(triggers)
+        .concurrency(vars::one_workflow_per_non_main_branch())
+        .add_env(("CARGO_TERM_COLOR", "always"))
+        .add_env(("CARGO_INCREMENTAL", 0))
+        .add_env(("RUST_BACKTRACE", 1))
+        .add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}"))
+        .add_env((
+            "ZED_CLIENT_CHECKSUM_SEED",
+            "${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
+        ))
+        .add_env(("ZED_EVAL_TELEMETRY", 1))
+        .add_job(eval_job.name, eval_job.job)
+}
+
+/// Defines the job that builds and runs the `eval` package on the default
+/// Linux runner, capped at 60 minutes.
+///
+/// The job runs only inside the `zed-industries` organization, and for
+/// pull-request events only when the PR carries the `run-eval` label.
+fn agent_evals() -> NamedJob {
+    // Kept as a nested function: the generated step name
+    // (`run_agent_evals::agent_evals::run_eval`) mirrors this path, so
+    // inlining it would presumably rename the step.
+    fn run_eval() -> Step<Run> {
+        named::bash("cargo run --package=eval -- --repetitions=8 --concurrency=1")
+    }
+
+    let should_run = Expression::new(indoc::indoc! {r#"
+        github.repository_owner == 'zed-industries' &&
+        (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
+    "#});
+
+    let job = Job::default()
+        .cond(should_run)
+        .runs_on(runners::LINUX_DEFAULT)
+        .timeout_minutes(60_u32)
+        .add_step(steps::checkout_repo())
+        .add_step(steps::cache_rust_dependencies())
+        .map(steps::install_linux_dependencies)
+        .add_step(setup_cargo_config(Platform::Linux))
+        .add_step(steps::script("cargo build --package=eval"))
+        .add_step(run_eval())
+        // Step carries `if: always()` in the generated YAML, so the copied
+        // cargo config is removed even when an earlier step fails.
+        .add_step(steps::cleanup_cargo_config(Platform::Linux));
+
+    named::job(job)
+}
+
+/// Builds the unit-eval workflow definition (written to `run_unit_evals.yml`).
+///
+/// Triggers: a weekly schedule (cron `47 1 * * 2`) and manual dispatch. The
+/// workflow-level environment carries the cargo settings and the client
+/// checksum seed. The single job comes from [`unit_evals`].
+pub(crate) fn run_unit_evals() -> Workflow {
+    let unit_evals = unit_evals();
+
+    named::workflow()
+        // Without this override the workflow inherits the name
+        // `run_agent_evals` (presumably derived from the enclosing module),
+        // the same name as the agent-eval workflow. The concurrency group is
+        // keyed on `github.workflow` with cancel-in-progress enabled, so two
+        // workflows sharing a name would share a group and could cancel each
+        // other's runs.
+        .name("run_unit_evals")
+        .on(Event::default()
+            .schedule([
+                // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
+                Schedule::default().cron("47 1 * * 2"),
+            ])
+            .workflow_dispatch(WorkflowDispatch::default()))
+        .concurrency(vars::one_workflow_per_non_main_branch())
+        .add_env(("CARGO_TERM_COLOR", "always"))
+        .add_env(("CARGO_INCREMENTAL", 0))
+        .add_env(("RUST_BACKTRACE", 1))
+        .add_env((
+            "ZED_CLIENT_CHECKSUM_SEED",
+            "${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
+        ))
+        .add_job(unit_evals.name, unit_evals.job)
+}
+
+/// Defines the job that runs `./script/run-unit-evals` on the default Linux
+/// runner and posts to Slack when a step fails.
+///
+/// The job is restricted to the `zed-industries` organization and capped at
+/// 60 minutes, matching the hand-written `unit_evals.yml` workflow that this
+/// generated job replaces.
+fn unit_evals() -> NamedJob {
+    /// Slack notification step, run only when an earlier step has failed.
+    /// Posts a link to the failing workflow run to channel `C04UDRNNJFQ`.
+    fn send_failure_to_slack() -> Step<Use> {
+        named::uses(
+            "slackapi",
+            "slack-github-action",
+            "b0fa283ad8fea605de13dc3f449259339835fc52",
+        )
+        .if_condition(Expression::new("${{ failure() }}"))
+        .add_with(("method", "chat.postMessage"))
+        .add_with(("token", "${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}"))
+        .add_with(("payload", indoc::indoc!{r#"
+            channel: C04UDRNNJFQ
+            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+        "#}))
+    }
+
+    named::job(
+        Job::default()
+            // Scheduled workflows can also fire in forks, which lack the API
+            // key and Slack token; only run in the upstream organization.
+            .cond(Expression::new(
+                "github.repository_owner == 'zed-industries'",
+            ))
+            .runs_on(runners::LINUX_DEFAULT)
+            // Bound the run so a hung eval cannot occupy the runner for
+            // GitHub's much longer default job timeout.
+            .timeout_minutes(60_u32)
+            .add_step(steps::checkout_repo())
+            .add_step(steps::setup_cargo_config(Platform::Linux))
+            .add_step(steps::cache_rust_dependencies())
+            .map(steps::install_linux_dependencies)
+            .add_step(steps::cargo_install_nextest(Platform::Linux))
+            .add_step(steps::clear_target_dir_if_large(Platform::Linux))
+            .add_step(
+                // The API key is scoped to this one step rather than the
+                // whole workflow.
+                steps::script("./script/run-unit-evals")
+                    .add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}")),
+            )
+            .add_step(send_failure_to_slack())
+            .add_step(steps::cleanup_cargo_config(Platform::Linux)),
+    )
+}