# Generated from xtask::workflows::run_agent_evals
# Rebuild with `cargo xtask workflows`.
name: run_agent_evals

# Workflow-level environment: these variables are set for every step of
# every job in this workflow.
env:
  # Cargo / Rust toolchain behaviour.
  CARGO_TERM_COLOR: always
  CARGO_INCREMENTAL: '0'
  RUST_BACKTRACE: '1'

  # Values read from the repository's `secrets` context.
  ANTHROPIC_API_KEY: '${{ secrets.ANTHROPIC_API_KEY }}'
  OPENAI_API_KEY: '${{ secrets.OPENAI_API_KEY }}'
  GOOGLE_AI_API_KEY: '${{ secrets.GOOGLE_AI_API_KEY }}'
  GOOGLE_CLOUD_PROJECT: '${{ secrets.GOOGLE_CLOUD_PROJECT }}'
  ZED_CLIENT_CHECKSUM_SEED: '${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}'

  # NOTE(review): presumably switches on telemetry in the eval binary —
  # confirm against the `eval` package.
  ZED_EVAL_TELEMETRY: '1'

  # The dispatch input is passed through the environment so the run step
  # can reference it as the shell variable "${MODEL_NAME}".
  MODEL_NAME: '${{ inputs.model_name }}'
# Trigger: this workflow only starts from a manual `workflow_dispatch` event.
on:
  workflow_dispatch:
    inputs:
      # Required string input; the workflow-level `env` forwards it to the
      # steps as MODEL_NAME.
      model_name:
        description: model_name
        required: true
        type: string
jobs:
  # Single job: prepare a Linux build environment, build the `eval` package,
  # then run it against the model selected at dispatch time.
  agent_evals:
    runs-on: namespace-profile-16x32-ubuntu-2204
    steps:
      - name: steps::checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          clean: false

      # NOTE(review): this action is referenced by the `v1` tag, whereas the
      # checkout action above is pinned to a full commit SHA — consider
      # pinning this one as well.
      - name: steps::cache_rust_dependencies_namespace
        uses: namespacelabs/nscloud-cache-action@v1
        with:
          cache: rust
          path: ~/.rustup

      - name: steps::setup_linux
        run: ./script/linux
        shell: bash -euxo pipefail {0}

      - name: steps::install_mold
        run: ./script/install-mold
        shell: bash -euxo pipefail {0}

      - name: steps::download_wasi_sdk
        run: ./script/download-wasi-sdk
        shell: bash -euxo pipefail {0}

      # Copies the repository's CI cargo config into a `.cargo` directory one
      # level above the checkout.
      - name: steps::setup_cargo_config
        run: |
          mkdir -p ./../.cargo
          cp ./.cargo/ci-config.toml ./../.cargo/config.toml
        shell: bash -euxo pipefail {0}

      - name: cargo build --package=eval
        run: cargo build --package=eval
        shell: bash -euxo pipefail {0}

      # The folded scalar below joins into a single command line. The model
      # name arrives via the MODEL_NAME environment variable rather than being
      # interpolated into the script text.
      - name: run_agent_evals::agent_evals::run_eval
        run: >-
          cargo run --package=eval --
          --repetitions=8 --concurrency=1
          --model "${MODEL_NAME}"
        shell: bash -euxo pipefail {0}
        # NOTE(review): these four entries repeat workflow-level `env` values
        # of the same names.
        env:
          ANTHROPIC_API_KEY: '${{ secrets.ANTHROPIC_API_KEY }}'
          OPENAI_API_KEY: '${{ secrets.OPENAI_API_KEY }}'
          GOOGLE_AI_API_KEY: '${{ secrets.GOOGLE_AI_API_KEY }}'
          GOOGLE_CLOUD_PROJECT: '${{ secrets.GOOGLE_CLOUD_PROJECT }}'

      # `if: always()` makes this run even when an earlier step failed; it
      # removes the directory created by steps::setup_cargo_config.
      - name: steps::cleanup_cargo_config
        if: always()
        run: |
          rm -rf ./../.cargo
        shell: bash -euxo pipefail {0}

    # Upper bound on the whole job's runtime, in minutes (10 hours).
    timeout-minutes: 600
# At most one in-progress run per concurrency group; a newer run in the same
# group cancels the older one.
#
# Group key = workflow name + ref name + (commit SHA on `main`, otherwise the
# literal 'anysha') + requested model name.
#
# The model name is part of the key so that dispatching an eval for a second
# model on the same ref does not cancel the eval still running for the first
# model. Re-dispatching the same model on the same ref still replaces the
# older run.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}-${{ inputs.model_name }}"
  cancel-in-progress: true