1use gh_workflow::{
2 Event, Expression, Job, Run, Schedule, Step, Strategy, Use, Workflow, WorkflowDispatch,
3};
4use serde_json::json;
5
6use crate::tasks::workflows::{
7 runners::{self, Platform},
8 steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
9 vars::{self, WorkflowInput},
10};
11
12pub(crate) fn run_agent_evals() -> Workflow {
13 let agent_evals = agent_evals();
14 let model_name = WorkflowInput::string("model_name", None);
15
16 named::workflow()
17 .on(Event::default().workflow_dispatch(
18 WorkflowDispatch::default().add_input(model_name.name, model_name.input()),
19 ))
20 .concurrency(vars::one_workflow_per_non_main_branch())
21 .add_env(("CARGO_TERM_COLOR", "always"))
22 .add_env(("CARGO_INCREMENTAL", 0))
23 .add_env(("RUST_BACKTRACE", 1))
24 .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
25 .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
26 .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
27 .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
28 .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
29 .add_env(("ZED_EVAL_TELEMETRY", 1))
30 .add_env(("MODEL_NAME", model_name.to_string()))
31 .add_job(agent_evals.name, agent_evals.job)
32}
33
34pub(crate) fn run_unit_evals() -> Workflow {
35 let model_name = WorkflowInput::string("model_name", None);
36 let commit_sha = WorkflowInput::string("commit_sha", None);
37
38 let unit_evals = named::job(unit_evals(Some(&commit_sha)));
39
40 named::workflow()
41 .name("run_unit_evals")
42 .on(Event::default().workflow_dispatch(
43 WorkflowDispatch::default()
44 .add_input(model_name.name, model_name.input())
45 .add_input(commit_sha.name, commit_sha.input()),
46 ))
47 .concurrency(vars::allow_concurrent_runs())
48 .add_env(("CARGO_TERM_COLOR", "always"))
49 .add_env(("CARGO_INCREMENTAL", 0))
50 .add_env(("RUST_BACKTRACE", 1))
51 .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
52 .add_env(("ZED_EVAL_TELEMETRY", 1))
53 .add_env(("MODEL_NAME", model_name.to_string()))
54 .add_job(unit_evals.name, unit_evals.job)
55}
56
57fn add_api_keys(step: Step<Run>) -> Step<Run> {
58 step.add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
59 .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
60 .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
61 .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
62}
63
64fn agent_evals() -> NamedJob {
65 fn run_eval() -> Step<Run> {
66 named::bash(
67 "cargo run --package=eval -- --repetitions=8 --concurrency=1 --model \"${MODEL_NAME}\"",
68 )
69 }
70
71 named::job(
72 Job::default()
73 .runs_on(runners::LINUX_DEFAULT)
74 .timeout_minutes(60_u32 * 10)
75 .add_step(steps::checkout_repo())
76 .add_step(steps::cache_rust_dependencies_namespace())
77 .map(steps::install_linux_dependencies)
78 .add_step(setup_cargo_config(Platform::Linux))
79 .add_step(steps::script("cargo build --package=eval"))
80 .add_step(add_api_keys(run_eval()))
81 .add_step(steps::cleanup_cargo_config(Platform::Linux)),
82 )
83}
84
85pub(crate) fn run_cron_unit_evals() -> Workflow {
86 let unit_evals = cron_unit_evals();
87
88 named::workflow()
89 .name("run_cron_unit_evals")
90 .on(Event::default()
91 .schedule([
92 // GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
93 Schedule::default().cron("47 1 * * 2"),
94 ])
95 .workflow_dispatch(WorkflowDispatch::default()))
96 .concurrency(vars::one_workflow_per_non_main_branch())
97 .add_env(("CARGO_TERM_COLOR", "always"))
98 .add_env(("CARGO_INCREMENTAL", 0))
99 .add_env(("RUST_BACKTRACE", 1))
100 .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
101 .add_job(unit_evals.name, unit_evals.job)
102}
103
104fn cron_unit_evals() -> NamedJob {
105 fn send_failure_to_slack() -> Step<Use> {
106 named::uses(
107 "slackapi",
108 "slack-github-action",
109 "b0fa283ad8fea605de13dc3f449259339835fc52",
110 )
111 .if_condition(Expression::new("${{ failure() }}"))
112 .add_with(("method", "chat.postMessage"))
113 .add_with(("token", vars::SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN))
114 .add_with(("payload", indoc::indoc!{r#"
115 channel: C04UDRNNJFQ
116 text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
117 "#}))
118 }
119
120 named::job(cron_unit_evals_job().add_step(send_failure_to_slack()))
121}
122
/// Model identifiers the scheduled unit-eval job fans out over.
///
/// `cron_unit_evals_job` places this list under the `model` key of its build
/// matrix, and each matrix entry reaches the eval script as `ZED_AGENT_MODEL`.
const UNIT_EVAL_MODELS: &[&str] = &[
    "anthropic/claude-sonnet-4-5-latest",
    "anthropic/claude-opus-4-5-latest",
    "google/gemini-3-pro",
    "openai/gpt-5",
];
129
130fn cron_unit_evals_job() -> Job {
131 let script_step = add_api_keys(steps::script("./script/run-unit-evals"))
132 .add_env(("ZED_AGENT_MODEL", "${{ matrix.model }}"));
133
134 Job::default()
135 .runs_on(runners::LINUX_DEFAULT)
136 .strategy(Strategy::default().fail_fast(false).matrix(json!({
137 "model": UNIT_EVAL_MODELS
138 })))
139 .add_step(steps::checkout_repo())
140 .add_step(steps::setup_cargo_config(Platform::Linux))
141 .add_step(steps::cache_rust_dependencies_namespace())
142 .map(steps::install_linux_dependencies)
143 .add_step(steps::cargo_install_nextest())
144 .add_step(steps::clear_target_dir_if_large(Platform::Linux))
145 .add_step(script_step)
146 .add_step(steps::cleanup_cargo_config(Platform::Linux))
147}
148
149fn unit_evals(commit: Option<&WorkflowInput>) -> Job {
150 let script_step = add_api_keys(steps::script("./script/run-unit-evals"));
151
152 Job::default()
153 .runs_on(runners::LINUX_DEFAULT)
154 .add_step(steps::checkout_repo())
155 .add_step(steps::setup_cargo_config(Platform::Linux))
156 .add_step(steps::cache_rust_dependencies_namespace())
157 .map(steps::install_linux_dependencies)
158 .add_step(steps::cargo_install_nextest())
159 .add_step(steps::clear_target_dir_if_large(Platform::Linux))
160 .add_step(match commit {
161 Some(commit) => script_step.add_env(("UNIT_EVAL_COMMIT", commit)),
162 None => script_step,
163 })
164 .add_step(steps::cleanup_cargo_config(Platform::Linux))
165}