eval: Count execution errors as failures (#30712)

Created by Oleksiy Syvokon

- Evals returning an error (e.g., LLM API format mismatch) were silently
skipped in the aggregated results. Now we count them as a failure (0%
success score).

- Setting the `VERBOSE` environment variable to any non-empty value now
disables string truncation.

Release Notes:

- N/A

Change summary

crates/eval/src/assertions.rs | 15 ++++++++++
crates/eval/src/eval.rs       | 48 +++++++++++++++++++-----------------
2 files changed, 39 insertions(+), 24 deletions(-)

Detailed changes

crates/eval/src/assertions.rs 🔗

@@ -28,6 +28,17 @@ impl AssertionsReport {
         }
     }
 
+    pub fn error(msg: String) -> Self {
+        let assert = RanAssertion {
+            id: "no-unhandled-errors".into(),
+            result: Err(msg),
+        };
+        AssertionsReport {
+            ran: vec![assert],
+            max: Some(1),
+        }
+    }
+
     pub fn is_empty(&self) -> bool {
         self.ran.is_empty()
     }
@@ -145,7 +156,9 @@ pub fn print_table_divider() {
 }
 
 fn truncate(assertion: &str, max_width: usize) -> String {
-    if assertion.len() <= max_width {
+    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
+
+    if assertion.len() <= max_width || is_verbose {
         assertion.to_string()
     } else {
         let mut end_ix = max_width - 1;

crates/eval/src/eval.rs 🔗

@@ -6,7 +6,7 @@ mod ids;
 mod instance;
 mod tool_metrics;
 
-use assertions::display_error_row;
+use assertions::{AssertionsReport, display_error_row};
 use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 pub(crate) use tool_metrics::*;
 
@@ -467,11 +467,12 @@ pub fn find_model(
     match matching_models.as_slice() {
         [model] => Ok(model.clone()),
         [] => Err(anyhow!(
-            "No language model with ID {} was available. Available models: {}",
+            "No language model with ID {}/{} was available. Available models: {}",
+            provider_id,
             model_id,
             model_registry
                 .available_models(cx)
-                .map(|model| model.id().0.clone())
+                .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
                 .collect::<Vec<_>>()
                 .join(", ")
         )),
@@ -581,12 +582,15 @@ fn print_report(
                 Err(err) => {
                     display_error_row(&mut table_rows, example.repetition, err.to_string())?;
                     error_count += 1;
+                    programmatic_scores.push(0.0);
+                    diff_scores.push(0.0);
+                    thread_scores.push(0.0);
                 }
                 Ok((run_output, judge_output)) => {
                     cumulative_tool_metrics.merge(&run_output.tool_metrics);
                     example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
 
-                    if !run_output.programmatic_assertions.total_count() > 0 {
+                    if run_output.programmatic_assertions.total_count() > 0 {
                         for assertion in &run_output.programmatic_assertions.ran {
                             assertions::display_table_row(
                                 &mut table_rows,
@@ -626,6 +630,8 @@ fn print_report(
             }
         }
 
+        let mut all_asserts = Vec::new();
+
         if !table_rows.is_empty() {
             assertions::print_table_header();
             print!("{}", table_rows);
@@ -634,33 +640,29 @@ fn print_report(
 
             for (example, result) in results.iter() {
                 if let Ok((run_output, judge_output)) = result {
+                    let asserts = [
+                        run_output.programmatic_assertions.clone(),
+                        judge_output.diff.clone(),
+                        judge_output.thread.clone(),
+                    ];
+                    all_asserts.extend_from_slice(&asserts);
+                    assertions::print_table_round_summary(
+                        &example.repetition.to_string(),
+                        asserts.iter(),
+                    )
+                } else if let Err(err) = result {
+                    let assert = AssertionsReport::error(err.to_string());
+                    all_asserts.push(assert.clone());
                     assertions::print_table_round_summary(
                         &example.repetition.to_string(),
-                        [
-                            &run_output.programmatic_assertions,
-                            &judge_output.diff,
-                            &judge_output.thread,
-                        ]
-                        .into_iter(),
+                        [assert].iter(),
                     )
                 }
             }
 
             assertions::print_table_divider();
 
-            assertions::print_table_round_summary(
-                "avg",
-                results.iter().flat_map(|(_, result)| {
-                    result.iter().flat_map(|(run_output, judge_output)| {
-                        [
-                            &run_output.programmatic_assertions,
-                            &judge_output.diff,
-                            &judge_output.thread,
-                        ]
-                        .into_iter()
-                    })
-                }),
-            );
+            assertions::print_table_round_summary("avg", all_asserts.iter());
 
             assertions::print_table_footer();
         }