eval: Add eval unit tests as a CI job (#32152)

Ben Brandt and Oleksiy Syvokon created

We run the unit evals once a day in the middle of the night, and trigger
a Slack post if it fails.


Release Notes:

- N/A

---------

Co-authored-by: Oleksiy Syvokon <oleksiy.syvokon@gmail.com>

Change summary

.github/workflows/unit_evals.yml               | 85 ++++++++++++++++++++
crates/assistant_tools/src/edit_agent/evals.rs |  2 
2 files changed, 86 insertions(+), 1 deletion(-)

Detailed changes

.github/workflows/unit_evals.yml ๐Ÿ”—

@@ -0,0 +1,85 @@
+name: Run Unit Evals
+
+on:
+  schedule:
+    # GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
+    - cron: "47 1 * * *"
+  workflow_dispatch:
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  RUST_BACKTRACE: 1
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+
+jobs:
+  unit_evals:
+    timeout-minutes: 60
+    name: Run unit evals
+    runs-on:
+      - buildjet-16vcpu-ubuntu-2204
+    steps:
+      - name: Add Rust to the PATH
+        run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+      - name: Checkout repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+        with:
+          clean: false
+
+      - name: Cache dependencies
+        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
+        with:
+          save-if: ${{ github.ref == 'refs/heads/main' }}
+          cache-provider: "buildjet"
+
+      - name: Install Linux dependencies
+        run: ./script/linux
+
+      - name: Configure CI
+        run: |
+          mkdir -p ./../.cargo
+          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+
+      - name: Install Rust
+        shell: bash -euxo pipefail {0}
+        run: |
+          cargo install cargo-nextest --locked
+
+      - name: Install Node
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+        with:
+          node-version: "18"
+
+      - name: Limit target directory size
+        shell: bash -euxo pipefail {0}
+        run: script/clear-target-dir-if-larger-than 100
+
+      - name: Run unit evals
+        shell: bash -euxo pipefail {0}
+        run: cargo nextest run --workspace --no-fail-fast --features eval --no-capture -E 'test(::eval_)' --test-threads 1
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+      - name: Send the pull request link into the Slack channel
+        if: ${{ failure() }}
+        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+          payload: |
+            channel: C04UDRNNJFQ
+            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+
+      # Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
+      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
+      # to clean up the config file, Iโ€™ve included the cleanup code here as a precaution.
+      # While itโ€™s not strictly necessary at this moment, I believe itโ€™s better to err on the side of caution.
+      - name: Clean CI config file
+        if: always()
+        run: rm -rf ./../.cargo

crates/assistant_tools/src/edit_agent/evals.rs ๐Ÿ”—

@@ -1351,7 +1351,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
 
     let mismatched_tag_ratio =
         cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
-    if mismatched_tag_ratio > 0.05 {
+    if mismatched_tag_ratio > 0.10 {
         for eval_output in eval_outputs {
             println!("{}", eval_output);
         }