diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
deleted file mode 100644
index b5da9e7b7c8e293fb565f4de269a1ae266c19692..0000000000000000000000000000000000000000
--- a/.github/workflows/eval.yml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: Run Agent Eval
-
-on:
-  schedule:
-    - cron: "0 0 * * *"
-
-  pull_request:
-    branches:
-      - "**"
-    types: [synchronize, reopened, labeled]
-
-  workflow_dispatch:
-
-concurrency:
-  # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  CARGO_INCREMENTAL: 0
-  RUST_BACKTRACE: 1
-  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
-  ZED_EVAL_TELEMETRY: 1
-
-jobs:
-  run_eval:
-    timeout-minutes: 60
-    name: Run Agent Eval
-    if: >
-      github.repository_owner == 'zed-industries' &&
-      (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
-    runs-on:
-      - namespace-profile-16x32-ubuntu-2204
-    steps:
-      - name: Add Rust to the PATH
-        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
-
-      - name: Checkout repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
-        with:
-          clean: false
-
-      - name: Cache dependencies
-        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
-        with:
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-          # cache-provider: "buildjet"
-
-      - name: Install Linux dependencies
-        run: ./script/linux
-
-      - name: Configure CI
-        run: |
-          mkdir -p ./../.cargo
-          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
-
-      - name: Compile eval
-        run: cargo build --package=eval
-
-      - name: Run eval
-        run: cargo run --package=eval -- --repetitions=8 --concurrency=1
-
-      # Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
-      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
-      # to clean up the config file, I’ve included the cleanup code here as a precaution.
-      # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
-      - name: Clean CI config file
-        if: always()
-        run: rm -rf ./../.cargo
diff --git a/.github/workflows/run_agent_evals.yml b/.github/workflows/run_agent_evals.yml
new file mode 100644
index 0000000000000000000000000000000000000000..67a050cd59c973ecd674fc3f6fe7ea4da436428f
--- /dev/null
+++ b/.github/workflows/run_agent_evals.yml
@@ -0,0 +1,62 @@
+# Generated from xtask::workflows::run_agent_evals
+# Rebuild with `cargo xtask workflows`.
+name: run_agent_evals
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: '0'
+  RUST_BACKTRACE: '1'
+  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+  ZED_EVAL_TELEMETRY: '1'
+on:
+  pull_request:
+    types:
+    - synchronize
+    - reopened
+    - labeled
+    branches:
+    - '**'
+  schedule:
+  - cron: 0 0 * * *
+  workflow_dispatch: {}
+jobs:
+  agent_evals:
+    if: |
+      github.repository_owner == 'zed-industries' &&
+      (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
+    runs-on: namespace-profile-16x32-ubuntu-2204
+    steps:
+    - name: steps::checkout_repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        clean: false
+    - name: steps::cache_rust_dependencies
+      uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
+      with:
+        save-if: ${{ github.ref == 'refs/heads/main' }}
+    - name: steps::setup_linux
+      run: ./script/linux
+      shell: bash -euxo pipefail {0}
+    - name: steps::install_mold
+      run: ./script/install-mold
+      shell: bash -euxo pipefail {0}
+    - name: steps::setup_cargo_config
+      run: |
+        mkdir -p ./../.cargo
+        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+      shell: bash -euxo pipefail {0}
+    - name: cargo build --package=eval
+      run: cargo build --package=eval
+      shell: bash -euxo pipefail {0}
+    - name: run_agent_evals::agent_evals::run_eval
+      run: cargo run --package=eval -- --repetitions=8 --concurrency=1
+      shell: bash -euxo pipefail {0}
+    - name: steps::cleanup_cargo_config
+      if: always()
+      run: |
+        rm -rf ./../.cargo
+      shell: bash -euxo pipefail {0}
+    timeout-minutes: 60
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
diff --git a/.github/workflows/run_unit_evals.yml b/.github/workflows/run_unit_evals.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/run_unit_evals.yml
@@ -0,0 +1,65 @@
+# Generated from xtask::workflows::run_unit_evals
+# Rebuild with `cargo xtask workflows`.
+name: run_unit_evals
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: '0'
+  RUST_BACKTRACE: '1'
+  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
+on:
+  schedule:
+  - cron: 47 1 * * 2
+  workflow_dispatch: {}
+jobs:
+  unit_evals:
+    if: github.repository_owner == 'zed-industries'
+    runs-on: namespace-profile-16x32-ubuntu-2204
+    steps:
+    - name: steps::checkout_repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      with:
+        clean: false
+    - name: steps::setup_cargo_config
+      run: |
+        mkdir -p ./../.cargo
+        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
+      shell: bash -euxo pipefail {0}
+    - name: steps::cache_rust_dependencies
+      uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
+      with:
+        save-if: ${{ github.ref == 'refs/heads/main' }}
+    - name: steps::setup_linux
+      run: ./script/linux
+      shell: bash -euxo pipefail {0}
+    - name: steps::install_mold
+      run: ./script/install-mold
+      shell: bash -euxo pipefail {0}
+    - name: steps::cargo_install_nextest
+      run: cargo install cargo-nextest --locked
+      shell: bash -euxo pipefail {0}
+    - name: steps::clear_target_dir_if_large
+      run: ./script/clear-target-dir-if-larger-than 100
+      shell: bash -euxo pipefail {0}
+    - name: ./script/run-unit-evals
+      run: ./script/run-unit-evals
+      shell: bash -euxo pipefail {0}
+      env:
+        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+    - name: run_agent_evals::unit_evals::send_failure_to_slack
+      if: ${{ failure() }}
+      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
+      with:
+        method: chat.postMessage
+        token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
+        payload: |
+          channel: C04UDRNNJFQ
+          text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+    - name: steps::cleanup_cargo_config
+      if: always()
+      run: |
+        rm -rf ./../.cargo
+      shell: bash -euxo pipefail {0}
+    timeout-minutes: 60
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
diff --git a/.github/workflows/unit_evals.yml b/.github/workflows/unit_evals.yml
deleted file mode 100644
index 53ed33a1af300d6b641b3b9430de0bb6846b27cc..0000000000000000000000000000000000000000
--- a/.github/workflows/unit_evals.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-name: Run Unit Evals
-
-on:
-  schedule:
-    # GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
-    - cron: "47 1 * * 2"
-  workflow_dispatch:
-
-concurrency:
-  # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  CARGO_INCREMENTAL: 0
-  RUST_BACKTRACE: 1
-  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
-
-jobs:
-  unit_evals:
-    if: github.repository_owner == 'zed-industries'
-    timeout-minutes: 60
-    name: Run unit evals
-    runs-on:
-      - namespace-profile-16x32-ubuntu-2204
-    steps:
-      - name: Add Rust to the PATH
-        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
-
-      - name: Checkout repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
-        with:
-          clean: false
-
-      - name: Cache dependencies
-        uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
-        with:
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-          # cache-provider: "buildjet"
-
-      - name: Install Linux dependencies
-        run: ./script/linux
-
-      - name: Configure CI
-        run: |
-          mkdir -p ./../.cargo
-          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
-
-      - name: Install Rust
-        shell: bash -euxo pipefail {0}
-        run: |
-          cargo install cargo-nextest --locked
-
-      - name: Install Node
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
-        with:
-          node-version: "18"
-
-      - name: Limit target directory size
-        shell: bash -euxo pipefail {0}
-        run: script/clear-target-dir-if-larger-than 100
-
-      - name: Run unit evals
-        shell: bash -euxo pipefail {0}
-        run: cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-
-      - name: Send failure message to Slack channel if needed
-        if: ${{ failure() }}
-        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
-        with:
-          method: chat.postMessage
-          token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
-          payload: |
-            channel: C04UDRNNJFQ
-            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
-
-      # Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
-      # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
-      # to clean up the config file, I’ve included the cleanup code here as a precaution.
-      # While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
-      - name: Clean CI config file
-        if: always()
-        run: rm -rf ./../.cargo
diff --git a/script/run-unit-evals b/script/run-unit-evals
new file mode 100755
index 0000000000000000000000000000000000000000..02481e1ce9dde7d2cbde9603f663093bf7a2ee38
--- /dev/null
+++ b/script/run-unit-evals
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+set -euxo pipefail
+
+cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
diff --git a/tooling/xtask/src/tasks/workflows.rs b/tooling/xtask/src/tasks/workflows.rs
index a8472606ffd6aea48775f3fca28f9c30b2223cc5..538724bcd9648b89d303a6eff834d08ffb3bf18a 100644
--- a/tooling/xtask/src/tasks/workflows.rs
+++ b/tooling/xtask/src/tasks/workflows.rs
@@ -10,6 +10,7 @@ mod release_nightly;
 mod run_bundling;
 
 mod release;
+mod run_agent_evals;
 mod run_tests;
 mod runners;
 mod steps;
@@ -28,6 +29,8 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
         ("run_tests.yml", run_tests::run_tests()),
         ("release.yml", release::release()),
         ("compare_perf.yml", compare_perf::compare_perf()),
+        ("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
+        ("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
     ];
     fs::create_dir_all(dir)
         .with_context(|| format!("Failed to create directory: {}", dir.display()))?;
diff --git a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs
new file mode 100644
--- /dev/null
+++ b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs
@@ -0,0 +1,135 @@
+//! Generators for the agent-eval and unit-eval GitHub Actions workflows.
+
+use gh_workflow::{
+    Event, Expression, Job, PullRequest, PullRequestType, Run, Schedule, Step, Use, Workflow,
+    WorkflowDispatch,
+};
+
+use crate::tasks::workflows::{
+    runners::{self, Platform},
+    steps::{self, FluentBuilder as _, NamedJob, named},
+    vars,
+};
+
+/// Builds the `run_agent_evals.yml` workflow.
+///
+/// It is triggered by a daily cron schedule, by pull-request updates, and by manual dispatch.
+pub(crate) fn run_agent_evals() -> Workflow {
+    let agent_evals = agent_evals();
+
+    named::workflow()
+        .on(Event::default()
+            .schedule([Schedule::default().cron("0 0 * * *")])
+            .pull_request(PullRequest::default().add_branch("**").types([
+                PullRequestType::Synchronize,
+                PullRequestType::Reopened,
+                PullRequestType::Labeled,
+            ]))
+            .workflow_dispatch(WorkflowDispatch::default()))
+        .concurrency(vars::one_workflow_per_non_main_branch())
+        .add_env(("CARGO_TERM_COLOR", "always"))
+        .add_env(("CARGO_INCREMENTAL", 0))
+        .add_env(("RUST_BACKTRACE", 1))
+        .add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}"))
+        .add_env((
+            "ZED_CLIENT_CHECKSUM_SEED",
+            "${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
+        ))
+        .add_env(("ZED_EVAL_TELEMETRY", 1))
+        .add_job(agent_evals.name, agent_evals.job)
+}
+
+/// The job that builds the `eval` package and runs it.
+///
+/// Pull-request runs are skipped unless the pull request carries the `run-eval` label.
+fn agent_evals() -> NamedJob {
+    fn run_eval() -> Step<Run> {
+        named::bash("cargo run --package=eval -- --repetitions=8 --concurrency=1")
+    }
+
+    named::job(
+        Job::default()
+            .cond(Expression::new(indoc::indoc!{r#"
+                github.repository_owner == 'zed-industries' &&
+                (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
+            "#}))
+            .runs_on(runners::LINUX_DEFAULT)
+            .timeout_minutes(60_u32)
+            .add_step(steps::checkout_repo())
+            .add_step(steps::cache_rust_dependencies())
+            .map(steps::install_linux_dependencies)
+            .add_step(steps::setup_cargo_config(Platform::Linux))
+            .add_step(steps::script("cargo build --package=eval"))
+            .add_step(run_eval())
+            .add_step(steps::cleanup_cargo_config(Platform::Linux))
+    )
+}
+
+/// Builds the `run_unit_evals.yml` workflow.
+///
+/// It is triggered by a weekly cron schedule and by manual dispatch.
+pub(crate) fn run_unit_evals() -> Workflow {
+    let unit_evals = unit_evals();
+
+    named::workflow()
+        // `named::workflow()` names the workflow after this module, i.e. `run_agent_evals`.
+        // `github.workflow` is part of the concurrency group, so without its own name this
+        // workflow would share a group with `run_agent_evals()` and the two could cancel
+        // each other's runs.
+        .name("run_unit_evals")
+        .on(Event::default()
+            .schedule([
+                // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
+                Schedule::default().cron("47 1 * * 2"),
+            ])
+            .workflow_dispatch(WorkflowDispatch::default()))
+        .concurrency(vars::one_workflow_per_non_main_branch())
+        .add_env(("CARGO_TERM_COLOR", "always"))
+        .add_env(("CARGO_INCREMENTAL", 0))
+        .add_env(("RUST_BACKTRACE", 1))
+        .add_env((
+            "ZED_CLIENT_CHECKSUM_SEED",
+            "${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
+        ))
+        .add_job(unit_evals.name, unit_evals.job)
+}
+
+/// The job that runs `./script/run-unit-evals` and reports failures to Slack.
+fn unit_evals() -> NamedJob {
+    fn send_failure_to_slack() -> Step<Use> {
+        named::uses(
+            "slackapi",
+            "slack-github-action",
+            "b0fa283ad8fea605de13dc3f449259339835fc52",
+        )
+        .if_condition(Expression::new("${{ failure() }}"))
+        .add_with(("method", "chat.postMessage"))
+        .add_with(("token", "${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}"))
+        .add_with(("payload", indoc::indoc!{r#"
+            channel: C04UDRNNJFQ
+            text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
+        "#}))
+    }
+
+    named::job(
+        Job::default()
+            // Only run in the upstream organization, like `agent_evals`: the job needs
+            // repository secrets.
+            .cond(Expression::new("github.repository_owner == 'zed-industries'"))
+            .runs_on(runners::LINUX_DEFAULT)
+            // Cap the run time, like `agent_evals`, so a hung eval does not hold the runner.
+            .timeout_minutes(60_u32)
+            .add_step(steps::checkout_repo())
+            .add_step(steps::setup_cargo_config(Platform::Linux))
+            .add_step(steps::cache_rust_dependencies())
+            .map(steps::install_linux_dependencies)
+            .add_step(steps::cargo_install_nextest(Platform::Linux))
+            .add_step(steps::clear_target_dir_if_large(Platform::Linux))
+            .add_step(
+                steps::script("./script/run-unit-evals")
+                    .add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}")),
+            )
+            .add_step(send_failure_to_slack())
+            .add_step(steps::cleanup_cargo_config(Platform::Linux)),
+    )
+}