evals: Allow threads explorer to search for JSON files recursively (#31509)

Oleksiy Syvokon created

It's just more convenient to call it from the CLI this way.

Also includes minor fixes in evals.

Release Notes:

- N/A

Change summary

crates/eval/src/examples/overwrite_file.rs |  10 +
crates/eval/src/explorer.rs                | 155 ++++++++++++++++++++---
2 files changed, 139 insertions(+), 26 deletions(-)

Detailed changes

crates/eval/src/examples/overwrite_file.rs 🔗

@@ -12,8 +12,10 @@ This eval tests a fix for a destructive behavior of the `edit_file` tool.
 Previously, it would rewrite existing files too aggressively, which often
 resulted in content loss.
 
-Pass rate before the fix: 10%
-Pass rate after the fix:  100%
+Model           | Pass rate
+----------------|----------
+Sonnet 3.7      | 100%
+Gemini 2.5 Pro  |  80%
 */
 
 #[async_trait(?Send)]
@@ -38,7 +40,9 @@ impl Example for FileOverwriteExample {
             let input = tool_use.parse_input::<EditFileToolInput>()?;
             match input.mode {
                 EditFileMode::Edit => false,
-                EditFileMode::Create | EditFileMode::Overwrite => true,
+                EditFileMode::Create | EditFileMode::Overwrite => {
+                    input.path.ends_with("src/language_model_selector.rs")
+                }
             }
         } else {
             false

crates/eval/src/explorer.rs 🔗

@@ -2,22 +2,65 @@ use anyhow::{Context as _, Result};
 use clap::Parser;
 use serde_json::{Value, json};
 use std::fs;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 #[derive(Parser, Debug)]
 #[clap(about = "Generate HTML explorer from JSON thread files")]
 struct Args {
-    /// Paths to JSON files containing thread data
+    /// Paths to JSON files or directories. If a directory is provided,
+    /// it will be searched for 'last.messages.json' files up to 2 levels deep.
     #[clap(long, required = true, num_args = 1..)]
     input: Vec<PathBuf>,
 
-    /// Path where the HTML explorer file will be written
+    /// Path where the output HTML file will be written
     #[clap(long)]
     output: PathBuf,
 }
 
-pub fn generate_explorer_html(inputs: &[PathBuf], output: &PathBuf) -> Result<String> {
-    if let Some(parent) = output.parent() {
+/// Recursively finds files with `target_filename` in `dir_path` up to `max_depth`.
+#[allow(dead_code)]
+fn find_target_files_recursive(
+    dir_path: &Path,
+    target_filename: &str,
+    current_depth: u8,
+    max_depth: u8,
+    found_files: &mut Vec<PathBuf>,
+) -> Result<()> {
+    if current_depth > max_depth {
+        return Ok(());
+    }
+
+    for entry_result in fs::read_dir(dir_path)
+        .with_context(|| format!("Failed to read directory: {}", dir_path.display()))?
+    {
+        let entry = entry_result.with_context(|| {
+            format!("Failed to read directory entry in: {}", dir_path.display())
+        })?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            find_target_files_recursive(
+                &path,
+                target_filename,
+                current_depth + 1,
+                max_depth,
+                found_files,
+            )?;
+        } else if path.is_file() {
+            if let Some(filename_osstr) = path.file_name() {
+                if let Some(filename_str) = filename_osstr.to_str() {
+                    if filename_str == target_filename {
+                        found_files.push(path);
+                    }
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+pub fn generate_explorer_html(input_paths: &[PathBuf], output_path: &PathBuf) -> Result<String> {
+    if let Some(parent) = output_path.parent() {
         if !parent.exists() {
             fs::create_dir_all(parent).context(format!(
                 "Failed to create output directory: {}",
@@ -27,41 +70,67 @@ pub fn generate_explorer_html(inputs: &[PathBuf], output: &PathBuf) -> Result<St
     }
 
     let template_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html");
-    let template = fs::read_to_string(&template_path).context(format!(
+    let template_content = fs::read_to_string(&template_path).context(format!(
         "Template file not found or couldn't be read: {}",
         template_path.display()
     ))?;
 
-    let threads = inputs
+    if input_paths.is_empty() {
+        println!(
+            "No input JSON files found to process. Explorer will be generated with template defaults or empty data."
+        );
+    }
+
+    let threads = input_paths
         .iter()
         .map(|input_path| {
-            let mut thread_data: Value = fs::read_to_string(input_path)
-                .context(format!("Failed to read file: {}", input_path.display()))?
+            let file_content = fs::read_to_string(input_path)
+                .context(format!("Failed to read file: {}", input_path.display()))?;
+            let mut thread_data: Value = file_content
                 .parse::<Value>()
-                .context(format!("Failed to parse JSON: {}", input_path.display()))?;
-            thread_data["filename"] = json!(input_path); // This will be shown in a thread heading
+                .context(format!("Failed to parse JSON from file: {}", input_path.display()))?;
+
+            if let Some(obj) = thread_data.as_object_mut() {
+                obj.insert("filename".to_string(), json!(input_path.display().to_string()));
+            } else {
+                eprintln!("Warning: JSON data in {} is not a root object. Wrapping it to include filename.", input_path.display());
+                thread_data = json!({
+                    "original_data": thread_data,
+                    "filename": input_path.display().to_string()
+                });
+            }
             Ok(thread_data)
         })
         .collect::<Result<Vec<_>>>()?;
 
-    let all_threads = json!({ "threads": threads });
-    let html_content = inject_thread_data(template, all_threads)?;
-    fs::write(&output, &html_content)
-        .context(format!("Failed to write output: {}", output.display()))?;
+    let all_threads_data = json!({ "threads": threads });
+    let html_content = inject_thread_data(template_content, all_threads_data)?;
+    fs::write(&output_path, &html_content)
+        .context(format!("Failed to write output: {}", output_path.display()))?;
 
-    println!("Saved {} thread(s) to {}", threads.len(), output.display());
+    println!(
+        "Saved data from {} resolved file(s) ({} threads) to {}",
+        input_paths.len(),
+        threads.len(),
+        output_path.display()
+    );
     Ok(html_content)
 }
 
 fn inject_thread_data(template: String, threads_data: Value) -> Result<String> {
     let injection_marker = "let threadsData = window.threadsData || { threads: [dummyThread] };";
-    template
-        .find(injection_marker)
-        .context("Could not find the thread injection point in the template")?;
+    if !template.contains(injection_marker) {
+        anyhow::bail!(
+            "Could not find the thread injection point in the template. Expected: '{}'",
+            injection_marker
+        );
+    }
 
-    let threads_json = serde_json::to_string_pretty(&threads_data)
-        .context("Failed to serialize threads data to JSON")?;
-    let script_injection = format!("let threadsData = {};", threads_json);
+    let threads_json_string = serde_json::to_string_pretty(&threads_data)
+        .context("Failed to serialize threads data to JSON")?
+        .replace("</script>", r"<\/script>");
+
+    let script_injection = format!("let threadsData = {};", threads_json_string);
     let final_html = template.replacen(injection_marker, &script_injection, 1);
 
     Ok(final_html)
@@ -71,5 +140,45 @@ fn inject_thread_data(template: String, threads_data: Value) -> Result<String> {
 #[allow(dead_code)]
 fn main() -> Result<()> {
     let args = Args::parse();
-    generate_explorer_html(&args.input, &args.output).map(|_| ())
+
+    const DEFAULT_FILENAME: &str = "last.messages.json";
+    const MAX_SEARCH_DEPTH: u8 = 2;
+
+    let mut resolved_input_files: Vec<PathBuf> = Vec::new();
+
+    for input_path_arg in &args.input {
+        if !input_path_arg.exists() {
+            eprintln!(
+                "Warning: Input path {} does not exist. Skipping.",
+                input_path_arg.display()
+            );
+            continue;
+        }
+
+        if input_path_arg.is_dir() {
+            find_target_files_recursive(
+                input_path_arg,
+                DEFAULT_FILENAME,
+                0, // starting depth
+                MAX_SEARCH_DEPTH,
+                &mut resolved_input_files,
+            )
+            .with_context(|| {
+                format!(
+                    "Error searching for '{}' files in directory: {}",
+                    DEFAULT_FILENAME,
+                    input_path_arg.display()
+                )
+            })?;
+        } else if input_path_arg.is_file() {
+            resolved_input_files.push(input_path_arg.clone());
+        }
+    }
+
+    resolved_input_files.sort_unstable();
+    resolved_input_files.dedup();
+
+    println!("No input paths provided/found.");
+
+    generate_explorer_html(&resolved_input_files, &args.output).map(|_| ())
 }