eval: Add eval unit tests as a CI job (#32152)
Ben Brandt
and
Oleksiy Syvokon
created 6 months ago
We run the unit evals once a day in the middle of the night, and trigger
a Slack post if they fail.
Release Notes:
- N/A
---------
Co-authored-by: Oleksiy Syvokon <oleksiy.syvokon@gmail.com>
Change summary
.github/workflows/unit_evals.yml | 85 ++++++++++++++++++++
crates/assistant_tools/src/edit_agent/evals.rs | 2
2 files changed, 86 insertions(+), 1 deletion(-)
Detailed changes
@@ -0,0 +1,85 @@
+name: Run Unit Evals
+
+on:
+ schedule:
+ # GitHub might drop scheduled jobs at busy times (such as the top of the hour), so we pick an arbitrary off-peak minute in the middle of the night (01:47 UTC).
+ - cron: "47 1 * * *"
+ workflow_dispatch:
+
+concurrency:
+ # Allow only one running workflow per non-`main` branch; runs on `main` are grouped per commit SHA, so they never cancel each other.
+ group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+ cancel-in-progress: true
+
+env:
+ CARGO_TERM_COLOR: always
+ CARGO_INCREMENTAL: 0
+ RUST_BACKTRACE: 1
+ ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+
+jobs:
+ unit_evals:
+ timeout-minutes: 60
+ name: Run unit evals
+ runs-on:
+ - buildjet-16vcpu-ubuntu-2204
+ steps:
+ - name: Add Rust to the PATH
+ run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+ - name: Checkout repo
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ with:
+ clean: false
+
+ - name: Cache dependencies
+ uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
+ with:
+ save-if: ${{ github.ref == 'refs/heads/main' }}
+ cache-provider: "buildjet"
+
+ - name: Install Linux dependencies
+ run: ./script/linux
+
+ - name: Configure CI
+ run: |
+ mkdir -p ./../.cargo
+ cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+
+ - name: Install Rust
+ shell: bash -euxo pipefail {0}
+ run: |
+ cargo install cargo-nextest --locked
+
+ - name: Install Node
+ uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+ with:
+ node-version: "18"
+
+ - name: Limit target directory size
+ shell: bash -euxo pipefail {0}
+ run: script/clear-target-dir-if-larger-than 100
+
+ - name: Run unit evals
+ shell: bash -euxo pipefail {0}
+ run: cargo nextest run --workspace --no-fail-fast --features eval --no-capture -E 'test(::eval_)' --test-threads 1
+ env:
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+ - name: Send the pull request link into the Slack channel
+ if: ${{ failure() }}
+ uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+ with:
+ method: chat.postMessage
+ token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+ payload: |
+ channel: C04UDRNNJFQ
+ text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+
+ # The Linux runner is not stateful, so in theory there is no need to do this cleanup.
+ # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
+ # to clean up the config file, I've included the cleanup code here as a precaution.
+ # While it's not strictly necessary at this moment, I believe it's better to err on the side of caution.
+ - name: Clean CI config file
+ if: always()
+ run: rm -rf ./../.cargo
@@ -1351,7 +1351,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
let mismatched_tag_ratio =
cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
- if mismatched_tag_ratio > 0.05 {
+ if mismatched_tag_ratio > 0.10 {
for eval_output in eval_outputs {
println!("{}", eval_output);
}