run_agent_evals.yml

 1# Generated from xtask::workflows::run_agent_evals
 2# Rebuild with `cargo xtask workflows`.
 3name: run_agent_evals
 4env:
 5  CARGO_TERM_COLOR: always
 6  CARGO_INCREMENTAL: '0'
 7  RUST_BACKTRACE: '1'
 8  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
 9  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
10  ZED_EVAL_TELEMETRY: '1'
11  MODEL_NAME: ${{ inputs.model_name }}
# Manual trigger only: the workflow is started through workflow_dispatch and
# takes a single required string input, `model_name`.
on:
  workflow_dispatch:
    inputs:
      model_name:
        type: string
        required: true
        description: model_name
jobs:
  # Single job: prepare the build environment, build the `eval` package, then
  # run it against the model named in ${MODEL_NAME}.
  agent_evals:
    runs-on: namespace-profile-16x32-ubuntu-2204
    # Upper bound for the whole job, in minutes (600 min = 10 h).
    timeout-minutes: 600
    # Every `run` step uses the same shell line: bash with -e, -u, -x and
    # -o pipefail, so failures abort the step and commands are traced.
    steps:
      - name: steps::checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          clean: false
      # NOTE(review): this action is referenced by the `v1` tag, whereas the
      # checkout action above is pinned to a full commit SHA — consider
      # pinning this one as well.
      - name: steps::cache_rust_dependencies_namespace
        uses: namespacelabs/nscloud-cache-action@v1
        with:
          cache: rust
      - name: steps::setup_linux
        shell: bash -euxo pipefail {0}
        run: ./script/linux
      - name: steps::install_mold
        shell: bash -euxo pipefail {0}
        run: ./script/install-mold
      - name: steps::download_wasi_sdk
        shell: bash -euxo pipefail {0}
        run: ./script/download-wasi-sdk
      # Copies the repository's CI cargo config to ../.cargo/config.toml,
      # i.e. into the parent of the working directory.
      - name: steps::setup_cargo_config
        shell: bash -euxo pipefail {0}
        run: |
          mkdir -p ./../.cargo
          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
      - name: cargo build --package=eval
        shell: bash -euxo pipefail {0}
        run: cargo build --package=eval
      # The model is passed through the MODEL_NAME environment variable and
      # expanded inside double quotes by the shell.
      - name: run_agent_evals::agent_evals::run_eval
        shell: bash -euxo pipefail {0}
        run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}"
      # `if: always()` keeps this step running after earlier failures so the
      # ../.cargo directory created by setup_cargo_config is removed.
      - name: steps::cleanup_cargo_config
        if: always()
        shell: bash -euxo pipefail {0}
        run: |
          rm -rf ./../.cargo
# Concurrency group = workflow + ref + (commit SHA on main, 'anysha' elsewhere)
# + requested model. With cancel-in-progress enabled, a new run cancels any
# in-progress run in the same group. Including the model name means that
# dispatching an eval for a different model on the same ref no longer cancels
# a long-running eval for another model; re-dispatching the same model still
# supersedes the earlier run.
# NOTE(review): this file is generated (see the header); apply the same change
# in the xtask workflow generator or it will be lost on the next rebuild.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}-${{ inputs.model_name }}
  cancel-in-progress: true