From 908ef035025335ebc94e04e807ee0a0ce5311ee6 Mon Sep 17 00:00:00 2001 From: Richard Feldman Date: Tue, 11 Nov 2025 13:45:48 -0500 Subject: [PATCH] Split out cron and non-cron unit evals (#42472) Release Notes: - N/A --------- Co-authored-by: Bennet Bo Fenner --- .github/workflows/run_agent_evals.yml | 5 + .github/workflows/run_cron_unit_evals.yml | 78 ++++++++++++++++ .github/workflows/run_unit_evals.yml | 21 ++++- script/run-unit-evals | 4 + tooling/xtask/src/tasks/workflows.rs | 4 + .../src/tasks/workflows/run_agent_evals.rs | 91 ++++++++++++++----- 6 files changed, 178 insertions(+), 25 deletions(-) create mode 100644 .github/workflows/run_cron_unit_evals.yml diff --git a/.github/workflows/run_agent_evals.yml b/.github/workflows/run_agent_evals.yml index 0ee8f3c5150589bc4565cd93326cf437d384c428..421d5a1c8003eaa42977339b4ab8e5e0df7ee014 100644 --- a/.github/workflows/run_agent_evals.yml +++ b/.github/workflows/run_agent_evals.yml @@ -51,6 +51,11 @@ jobs: - name: run_agent_evals::agent_evals::run_eval run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}" shell: bash -euxo pipefail {0} + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} + GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} - name: steps::cleanup_cargo_config if: always() run: | diff --git a/.github/workflows/run_cron_unit_evals.yml b/.github/workflows/run_cron_unit_evals.yml new file mode 100644 index 0000000000000000000000000000000000000000..9137d1599c920d5f3c72ba7c884bc76d9aed6f54 --- /dev/null +++ b/.github/workflows/run_cron_unit_evals.yml @@ -0,0 +1,78 @@ +# Generated from xtask::workflows::run_cron_unit_evals +# Rebuild with `cargo xtask workflows`. +name: run_cron_unit_evals +env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: '0' + RUST_BACKTRACE: '1' + ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} +on: + schedule: + - cron: 47 1 * * 2 + workflow_dispatch: {} +jobs: + cron_unit_evals: + runs-on: namespace-profile-16x32-ubuntu-2204 + steps: + - name: steps::checkout_repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + clean: false + - name: steps::setup_cargo_config + run: | + mkdir -p ./../.cargo + cp ./.cargo/ci-config.toml ./../.cargo/config.toml + shell: bash -euxo pipefail {0} + - name: steps::cache_rust_dependencies_namespace + uses: namespacelabs/nscloud-cache-action@v1 + with: + cache: rust + - name: steps::setup_linux + run: ./script/linux + shell: bash -euxo pipefail {0} + - name: steps::install_mold + run: ./script/install-mold + shell: bash -euxo pipefail {0} + - name: steps::download_wasi_sdk + run: ./script/download-wasi-sdk + shell: bash -euxo pipefail {0} + - name: steps::cargo_install_nextest + run: cargo install cargo-nextest --locked + shell: bash -euxo pipefail {0} + - name: steps::clear_target_dir_if_large + run: ./script/clear-target-dir-if-larger-than 250 + shell: bash -euxo pipefail {0} + - name: ./script/run-unit-evals + run: ./script/run-unit-evals + shell: bash -euxo pipefail {0} + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} + GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} + - name: run_agent_evals::unit_evals::send_failure_to_slack + if: ${{ failure() }} + uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }} + payload: | + channel: C04UDRNNJFQ + text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}" + - name: steps::cleanup_cargo_config + if: always() + run: | + rm -rf ./../.cargo + shell: bash -euxo pipefail {0} + - name: run_agent_evals::cron_unit_evals::send_failure_to_slack + if: ${{ failure() }} + uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }} + payload: | + channel: C04UDRNNJFQ + text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}" +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true diff --git a/.github/workflows/run_unit_evals.yml b/.github/workflows/run_unit_evals.yml index e3f03c2f9ecd7a48423939f315ce41e13b934d7d..9f2af35dca5429488e169fd1fe6d9ac098a5059a 100644 --- a/.github/workflows/run_unit_evals.yml +++ b/.github/workflows/run_unit_evals.yml @@ -6,12 +6,21 @@ env: CARGO_INCREMENTAL: '0' RUST_BACKTRACE: '1' ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} + ZED_EVAL_TELEMETRY: '1' + MODEL_NAME: ${{ inputs.model_name }} on: - schedule: - - cron: 47 1 * * 2 - workflow_dispatch: {} + workflow_dispatch: + inputs: + model_name: + description: model_name + required: true + type: string + commit_sha: + description: commit_sha + required: true + type: string jobs: - unit_evals: + run_unit_evals: runs-on: namespace-profile-16x32-ubuntu-2204 steps: - name: steps::checkout_repo @@ -47,6 +56,10 @@ jobs: shell: bash -euxo pipefail {0} env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} + GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} + UNIT_EVAL_COMMIT: ${{ inputs.commit_sha }} - name: run_agent_evals::unit_evals::send_failure_to_slack if: ${{ failure() }} uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 diff --git a/script/run-unit-evals b/script/run-unit-evals index 7a72d0b6a64b9ae9f3dcf340c16d7426d88d6a0b..c5178add7a1e4c76151b3907771abe81ba46aaaf 100755 --- a/script/run-unit-evals +++ b/script/run-unit-evals @@ -2,4 +2,8 @@ set -euxo pipefail +if [ -n "${UNIT_EVAL_COMMIT:-}" ]; then + git checkout "$UNIT_EVAL_COMMIT" +fi + GPUI_TEST_TIMEOUT=1500 cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)' diff --git a/tooling/xtask/src/tasks/workflows.rs b/tooling/xtask/src/tasks/workflows.rs index bf6a332075c52cd08dcc44d73fc37239bd60a740..374a22f3ea9c65dcfc9743f77448a5c29117cedf 100644 --- a/tooling/xtask/src/tasks/workflows.rs +++ b/tooling/xtask/src/tasks/workflows.rs @@ -33,6 +33,10 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> { ("cherry_pick.yml", cherry_pick::cherry_pick()), ("compare_perf.yml", compare_perf::compare_perf()), ("run_unit_evals.yml", run_agent_evals::run_unit_evals()), + ( + "run_cron_unit_evals.yml", + run_agent_evals::run_cron_unit_evals(), + ), ("run_agent_evals.yml", run_agent_evals::run_agent_evals()), ("after_release.yml", after_release::after_release()), ]; diff --git a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs index 846001201f62fd65bf9d05af53ace59646ea197c..b69216e5a00a61762625e92b2592fd4cbe0cef30 100644 --- a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs +++ b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs @@ -28,6 +28,36 @@ pub(crate) fn run_agent_evals() -> Workflow { .add_job(agent_evals.name, agent_evals.job) } +pub(crate) fn run_unit_evals() -> Workflow { + let model_name = Input::string("model_name", None); + let commit_sha = Input::string("commit_sha", None); + + let unit_evals = named::job(unit_evals(Some(&commit_sha))); + + named::workflow() + .name("run_unit_evals") + .on(Event::default().workflow_dispatch( + WorkflowDispatch::default() + .add_input(model_name.name, model_name.input()) + .add_input(commit_sha.name, commit_sha.input()), + )) + .concurrency(vars::one_workflow_per_non_main_branch()) + .add_env(("CARGO_TERM_COLOR", "always")) + .add_env(("CARGO_INCREMENTAL", 0)) + .add_env(("RUST_BACKTRACE", 1)) + .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED)) + .add_env(("ZED_EVAL_TELEMETRY", 1)) + .add_env(("MODEL_NAME", model_name.to_string())) + .add_job(unit_evals.name, unit_evals.job) +} + +fn add_api_keys(step: Step) -> Step { + step.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)) + .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY)) + .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY)) + .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT)) +} + fn agent_evals() -> NamedJob { fn run_eval() -> Step { named::bash( @@ -44,16 +74,16 @@ fn agent_evals() -> NamedJob { .map(steps::install_linux_dependencies) .add_step(setup_cargo_config(Platform::Linux)) .add_step(steps::script("cargo build --package=eval")) - .add_step(run_eval()) + .add_step(add_api_keys(run_eval())) .add_step(steps::cleanup_cargo_config(Platform::Linux)), ) } -pub(crate) fn run_unit_evals() -> Workflow { - let unit_evals = unit_evals(); +pub(crate) fn run_cron_unit_evals() -> Workflow { + let unit_evals = cron_unit_evals(); named::workflow() - .name("run_unit_evals") + .name("run_cron_unit_evals") .on(Event::default() .schedule([ // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night. @@ -68,7 +98,7 @@ pub(crate) fn run_unit_evals() -> Workflow { .add_job(unit_evals.name, unit_evals.job) } -fn unit_evals() -> NamedJob { +fn cron_unit_evals() -> NamedJob { fn send_failure_to_slack() -> Step { named::uses( "slackapi", @@ -84,20 +114,39 @@ fn unit_evals() -> NamedJob { "#})) } - named::job( - Job::default() - .runs_on(runners::LINUX_DEFAULT) - .add_step(steps::checkout_repo()) - .add_step(steps::setup_cargo_config(Platform::Linux)) - .add_step(steps::cache_rust_dependencies_namespace()) - .map(steps::install_linux_dependencies) - .add_step(steps::cargo_install_nextest(Platform::Linux)) - .add_step(steps::clear_target_dir_if_large(Platform::Linux)) - .add_step( - steps::script("./script/run-unit-evals") - .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)), - ) - .add_step(send_failure_to_slack()) - .add_step(steps::cleanup_cargo_config(Platform::Linux)), - ) + named::job(unit_evals(None).add_step(send_failure_to_slack())) +} + +fn unit_evals(commit: Option<&Input>) -> Job { + fn send_failure_to_slack() -> Step { + named::uses( + "slackapi", + "slack-github-action", + "b0fa283ad8fea605de13dc3f449259339835fc52", + ) + .if_condition(Expression::new("${{ failure() }}")) + .add_with(("method", "chat.postMessage")) + .add_with(("token", vars::SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN)) + .add_with(("payload", indoc::indoc!{r#" + channel: C04UDRNNJFQ + text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}" + "#})) + } + + let script_step = add_api_keys(steps::script("./script/run-unit-evals")); + + Job::default() + .runs_on(runners::LINUX_DEFAULT) + .add_step(steps::checkout_repo()) + .add_step(steps::setup_cargo_config(Platform::Linux)) + .add_step(steps::cache_rust_dependencies_namespace()) + .map(steps::install_linux_dependencies) + .add_step(steps::cargo_install_nextest(Platform::Linux)) + .add_step(steps::clear_target_dir_if_large(Platform::Linux)) + .add_step(match commit { + Some(commit) => script_step.add_env(("UNIT_EVAL_COMMIT", commit)), + None => script_step, + }) + .add_step(send_failure_to_slack()) + .add_step(steps::cleanup_cargo_config(Platform::Linux)) }