From 359521e91d2a42d8c863c03a26f99529047a81cb Mon Sep 17 00:00:00 2001 From: Conrad Irwin Date: Mon, 10 Nov 2025 16:00:52 -0700 Subject: [PATCH] Allow passing model_name to evals (#42395) Release Notes: - N/A --- .github/workflows/run_agent_evals.yml | 24 ++++++-------- .../src/tasks/workflows/run_agent_evals.rs | 32 +++++++------------ 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/.github/workflows/run_agent_evals.yml b/.github/workflows/run_agent_evals.yml index e13bae4031174f057e555db7f2d779208d55456e..1a875aa2c463d264002f14264993b9c99ae1f49c 100644 --- a/.github/workflows/run_agent_evals.yml +++ b/.github/workflows/run_agent_evals.yml @@ -8,22 +8,16 @@ env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} ZED_EVAL_TELEMETRY: '1' + MODEL_NAME: ${{ inputs.model_name }} on: - pull_request: - types: - - synchronize - - reopened - - labeled - branches: - - '**' - schedule: - - cron: 0 0 * * * - workflow_dispatch: {} + workflow_dispatch: + inputs: + model_name: + description: model_name + required: true + type: string jobs: agent_evals: - if: | - github.repository_owner == 'zed-industries' && - (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval')) runs-on: namespace-profile-16x32-ubuntu-2204 steps: - name: steps::checkout_repo @@ -52,14 +46,14 @@ jobs: run: cargo build --package=eval shell: bash -euxo pipefail {0} - name: run_agent_evals::agent_evals::run_eval - run: cargo run --package=eval -- --repetitions=8 --concurrency=1 + run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}" shell: bash -euxo pipefail {0} - name: steps::cleanup_cargo_config if: always() run: | rm -rf ./../.cargo shell: bash -euxo pipefail {0} - timeout-minutes: 60 + timeout-minutes: 600 concurrency: group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true diff --git a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs index 1af09f6ca8fa0bc24c99eda7a18904b1b8886bb3..4601d5a5bf6a60435a87edab9cd6d62b77ef52a8 100644 --- a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs +++ b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs @@ -1,26 +1,19 @@ -use gh_workflow::{ - Event, Expression, Job, PullRequest, PullRequestType, Run, Schedule, Step, Use, Workflow, - WorkflowDispatch, -}; +use gh_workflow::{Event, Expression, Job, Run, Schedule, Step, Use, Workflow, WorkflowDispatch}; use crate::tasks::workflows::{ runners::{self, Platform}, steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config}, - vars, + vars::{self, Input}, }; pub(crate) fn run_agent_evals() -> Workflow { let agent_evals = agent_evals(); + let model_name = Input::string("model_name", None); named::workflow() - .on(Event::default() - .schedule([Schedule::default().cron("0 0 * * *")]) - .pull_request(PullRequest::default().add_branch("**").types([ - PullRequestType::Synchronize, - PullRequestType::Reopened, - PullRequestType::Labeled, - ])) - .workflow_dispatch(WorkflowDispatch::default())) + .on(Event::default().workflow_dispatch( + WorkflowDispatch::default().add_input(model_name.name, model_name.input()), + )) .concurrency(vars::one_workflow_per_non_main_branch()) .add_env(("CARGO_TERM_COLOR", "always")) .add_env(("CARGO_INCREMENTAL", 0)) @@ -28,29 +21,28 @@ pub(crate) fn run_agent_evals() -> Workflow { .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)) .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED)) .add_env(("ZED_EVAL_TELEMETRY", 1)) + .add_env(("MODEL_NAME", model_name.to_string())) .add_job(agent_evals.name, agent_evals.job) } fn agent_evals() -> NamedJob { fn run_eval() -> Step { - named::bash("cargo run --package=eval -- --repetitions=8 --concurrency=1") + named::bash( + "cargo run --package=eval -- --repetitions=8 --concurrency=1 --model \"${MODEL_NAME}\"", + ) } named::job( Job::default() - .cond(Expression::new(indoc::indoc!{r#" - github.repository_owner == 'zed-industries' && - (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval')) - "#})) .runs_on(runners::LINUX_DEFAULT) - .timeout_minutes(60_u32) + .timeout_minutes(60_u32 * 10) .add_step(steps::checkout_repo()) .add_step(steps::cache_rust_dependencies_namespace()) .map(steps::install_linux_dependencies) .add_step(setup_cargo_config(Platform::Linux)) .add_step(steps::script("cargo build --package=eval")) .add_step(run_eval()) - .add_step(steps::cleanup_cargo_config(Platform::Linux)) + .add_step(steps::cleanup_cargo_config(Platform::Linux)), ) }