eval.rs

use crate::git_commands::{run_git, setup_temp_repo};
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
use crate::{get_exercise_language, get_exercise_name, templates_eval::Template};
use agent::RequestKind;
use anyhow::{Result, anyhow};
use collections::HashMap;
use gpui::{App, Task};
use language_model::{LanguageModel, TokenUsage};
use serde::{Deserialize, Serialize};
use std::{
    fs,
    io::Write,
    path::{Path, PathBuf},
    sync::Arc,
    time::{Duration, SystemTime},
};

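/// A single scored evaluation of one exercise/template pair, as persisted to
/// `evals.json`.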
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EvalResult {
    pub exercise_name: String,
    pub template_name: String,
    pub score: String,
    pub diff: String,
    pub assistant_response: String,
    pub elapsed_time_ms: u128,
    pub timestamp: u128,
    // Token usage fields
    pub input_tokens: usize,
    pub output_tokens: usize,
    pub total_tokens: usize,
    pub tool_use_counts: usize,
    /// Name of the model that judged this result.
    pub judge_model_name: String,
}

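/// Raw output of a single assistant run, captured before judging.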
pub struct EvalOutput {
    pub diff: String,
    pub last_message: String,
    pub elapsed_time: Duration,
    pub assistant_response_count: usize,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub token_usage: TokenUsage,
}

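/// Repository coordinates for an eval, loaded from `setup.json`.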
#[derive(Deserialize)]
pub struct EvalSetup {
    pub url: String,
    pub base_sha: String,
}

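/// A single eval: a repository checkout plus the user prompt to run against it.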
pub struct Eval {
    pub repo_path: PathBuf,
    pub eval_setup: EvalSetup,
    pub user_prompt: String,
}

impl Eval {
    // Kept for potential future use; intentionally unused for now.
    #[allow(dead_code)]
    pub async fn load(_name: String, path: PathBuf, repos_dir: &Path) -> Result<Self> {
        let prompt_path = path.join("prompt.txt");
        let user_prompt = smol::unblock(move || std::fs::read_to_string(prompt_path)).await?;
        let setup_path = path.join("setup.json");
        let setup_contents = smol::unblock(move || std::fs::read_to_string(setup_path)).await?;
        let eval_setup = serde_json_lenient::from_str_lenient::<EvalSetup>(&setup_contents)?;

        // Derive a filesystem-safe directory name from the repository URL.
        // Defined inside `load` because it is only used here.
        fn repo_dir_name(url: &str) -> String {
            url.trim_start_matches("https://")
                .replace(|c: char| !c.is_alphanumeric(), "_")
        }

        let repo_path = repos_dir.join(repo_dir_name(&eval_setup.url));

        Ok(Eval {
            repo_path,
            eval_setup,
            user_prompt,
        })
    }

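    /// Checks out the base SHA, runs the assistant against the repository, and
    /// collects the resulting diff plus conversation metadata.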
    pub fn run(
        self,
        app_state: Arc<HeadlessAppState>,
        model: Arc<dyn LanguageModel>,
        cx: &mut App,
    ) -> Task<Result<EvalOutput>> {
        cx.spawn(async move |cx| {
            run_git(&self.repo_path, &["checkout", &self.eval_setup.base_sha]).await?;

            let (assistant, done_rx) =
                cx.update(|cx| HeadlessAssistant::new(app_state.clone(), cx))??;

            let _worktree = assistant
                .update(cx, |assistant, cx| {
                    assistant.project.update(cx, |project, cx| {
                        project.create_worktree(&self.repo_path, true, cx)
                    })
                })?
                .await?;

            let start_time = SystemTime::now();

            let (system_prompt_context, load_error) = cx
                .update(|cx| {
                    assistant
                        .read(cx)
                        .thread
                        .read(cx)
                        .load_system_prompt_context(cx)
                })?
                .await;

            if let Some(load_error) = load_error {
                return Err(anyhow!("{:?}", load_error));
            }

            assistant.update(cx, |assistant, cx| {
                assistant.thread.update(cx, |thread, cx| {
                    let context = vec![];
                    thread.insert_user_message(self.user_prompt.clone(), context, None, cx);
                    thread.set_system_prompt_context(system_prompt_context);
                    thread.send_to_model(model, RequestKind::Chat, cx);
                });
            })?;

            done_rx.recv().await??;

            // Stage any untracked files so they appear in the diff below.
            println!("Checking for untracked files:");
            let untracked = run_git(
                &self.repo_path,
                &["ls-files", "--others", "--exclude-standard"],
            )
            .await?;
            if untracked.is_empty() {
                println!("No untracked files found");
            } else {
                println!("Adding untracked files to git");
                run_git(&self.repo_path, &["add", "."]).await?;
            }

            // Capture git status (result currently unused).
            let _status = run_git(&self.repo_path, &["status", "--short"]).await?;

            let elapsed_time = start_time.elapsed()?;

            // Get diff of staged changes (the files we just added)
            let staged_diff = run_git(&self.repo_path, &["diff", "--staged"]).await?;

            // Get diff of unstaged changes
            let unstaged_diff = run_git(&self.repo_path, &["diff"]).await?;

            // Combine both diffs
            let diff = if unstaged_diff.is_empty() {
                staged_diff
            } else if staged_diff.is_empty() {
                unstaged_diff
            } else {
                format!(
                    "# Staged changes\n{}\n\n# Unstaged changes\n{}",
                    staged_diff, unstaged_diff
                )
            };

            assistant.update(cx, |assistant, cx| {
                let thread = assistant.thread.read(cx);
                let last_message = thread
                    .messages()
                    .last()
                    .ok_or_else(|| anyhow!("Thread has no messages"))?;
                if last_message.role != language_model::Role::Assistant {
                    return Err(anyhow!("Last message is not from assistant"));
                }
                let assistant_response_count = thread
                    .messages()
                    .filter(|message| message.role == language_model::Role::Assistant)
                    .count();
                Ok(EvalOutput {
                    diff,
                    last_message: last_message.to_string(),
                    elapsed_time,
                    assistant_response_count,
                    tool_use_counts: assistant.tool_use_counts.clone(),
                    token_usage: thread.cumulative_token_usage(),
                })
            })?
        })
    }
}

impl EvalOutput {
    // Kept for potential future use; intentionally unused for now.
    #[allow(dead_code)]
    pub fn save_to_directory(&self, output_dir: &Path, eval_output_value: String) -> Result<()> {
        // Create the output directory if it doesn't exist
        fs::create_dir_all(output_dir)?;

        // Save the diff to a file
        let diff_path = output_dir.join("diff.patch");
        let mut diff_file = fs::File::create(&diff_path)?;
        diff_file.write_all(self.diff.as_bytes())?;

        // Save the last message to a file
        let message_path = output_dir.join("assistant_response.txt");
        let mut message_file = fs::File::create(&message_path)?;
        message_file.write_all(self.last_message.as_bytes())?;

        // Current metrics for this run
        let current_metrics = serde_json::json!({
            "elapsed_time_ms": self.elapsed_time.as_millis(),
            "assistant_response_count": self.assistant_response_count,
            "tool_use_counts": self.tool_use_counts,
            "token_usage": self.token_usage,
            "eval_output_value": eval_output_value,
        });

        // Get current timestamp in milliseconds
        let timestamp = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis()
            .to_string();

        // Path to metrics file
        let metrics_path = output_dir.join("metrics.json");

        // Load existing metrics if the file exists, or create a new object
        let mut historical_metrics = if metrics_path.exists() {
            let metrics_content = fs::read_to_string(&metrics_path)?;
            serde_json::from_str::<serde_json::Value>(&metrics_content)
                .unwrap_or_else(|_| serde_json::json!({}))
        } else {
            serde_json::json!({})
        };

        // Add new run with timestamp as key
        if let serde_json::Value::Object(ref mut map) = historical_metrics {
            map.insert(timestamp, current_metrics);
        }

        // Write updated metrics back to file
        let metrics_json = serde_json::to_string_pretty(&historical_metrics)?;
        let mut metrics_file = fs::File::create(&metrics_path)?;
        metrics_file.write_all(metrics_json.as_bytes())?;

        Ok(())
    }
}

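/// Reads the exercise instructions from `.docs/instructions.md`.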
pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
    let instructions_path = exercise_path.join(".docs").join("instructions.md");
    println!("Reading instructions from: {}", instructions_path.display());
    let instructions = smol::unblock(move || std::fs::read_to_string(&instructions_path)).await?;
    Ok(instructions)
}

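/// Reads the reference solution from `.meta/example.<ext>`, mapping the
/// exercise language to its file extension.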
pub async fn read_example_solution(exercise_path: &Path, language: &str) -> Result<String> {
    // Map the language to the file extension
    let language_extension = match language {
        "python" => "py",
        "go" => "go",
        "rust" => "rs",
        "typescript" => "ts",
        "javascript" => "js",
        "ruby" => "rb",
        "php" => "php",
        "bash" => "sh",
        "multi" => "diff",
        "internal" => "diff",
        _ => return Err(anyhow!("Unsupported language: {}", language)),
    };
    let example_path = exercise_path
        .join(".meta")
        .join(format!("example.{}", language_extension));
    println!("Reading example solution from: {}", example_path.display());
    let example = smol::unblock(move || std::fs::read_to_string(&example_path)).await?;
    Ok(example)
}

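/// Appends a batch of results to `evaluation/evals.json`, keyed by exercise
/// name, then timestamp, then template name.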
pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
    let eval_dir = exercise_path.join("evaluation");
    fs::create_dir_all(&eval_dir)?;

    let eval_file = eval_dir.join("evals.json");

    println!("Saving evaluation results to: {}", eval_file.display());
    println!(
        "Results to save: {} evaluations for exercise path: {}",
        results.len(),
        exercise_path.display()
    );

    // Check file existence before reading/writing
    if eval_file.exists() {
        println!("Existing evals.json file found, will update it");
    } else {
        println!("No existing evals.json file found, will create new one");
    }

    // Structure that organizes evaluations by exercise name and timestamp
    let mut eval_data: serde_json::Value = if eval_file.exists() {
        let content = fs::read_to_string(&eval_file)?;
        serde_json::from_str(&content).unwrap_or_else(|_| serde_json::json!({}))
    } else {
        serde_json::json!({})
    };

    // Get current timestamp for this batch of results
    let timestamp = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)?
        .as_millis()
        .to_string();

    // Group the new results by exercise name
    for result in results {
        let exercise_name = &result.exercise_name;
        let template_name = &result.template_name;

        println!(
            "Adding result: exercise={}, template={}",
            exercise_name, template_name
        );

        // Ensure the exercise entry exists
        if eval_data.get(exercise_name).is_none() {
            eval_data[exercise_name] = serde_json::json!({});
        }

        // Ensure the timestamp entry exists as an object
        if eval_data[exercise_name].get(&timestamp).is_none() {
            eval_data[exercise_name][&timestamp] = serde_json::json!({});
        }

        // Add this result under the timestamp with template name as key
        eval_data[exercise_name][&timestamp][template_name] = serde_json::to_value(&result)?;
    }

    // Write back to file with pretty formatting; surface any write error to the caller
    let json_content = serde_json::to_string_pretty(&eval_data)?;
    fs::write(&eval_file, json_content)
        .map_err(|e| anyhow!("Failed to write results file {}: {}", eval_file.display(), e))?;
    println!("✓ Successfully saved results to {}", eval_file.display());

    Ok(())
}

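/// Runs a single exercise against a template: sets up a temporary repo, runs
/// the assistant, judges the output with the judge model, and returns the
/// scored result.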
pub async fn run_exercise_eval(
    exercise_path: PathBuf,
    template: Template,
    model: Arc<dyn LanguageModel>,
    judge_model: Arc<dyn LanguageModel>,
    app_state: Arc<HeadlessAppState>,
    base_sha: String,
    _framework_path: PathBuf,
    cx: gpui::AsyncApp,
) -> Result<EvalResult> {
    let exercise_name = get_exercise_name(&exercise_path);
    let language = get_exercise_language(&exercise_path)?;
    let mut instructions = read_instructions(&exercise_path).await?;
    instructions.push_str(&format!(
        "\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
        language
    ));
    let example_solution = read_example_solution(&exercise_path, &language).await?;

    println!(
        "Running evaluation for exercise: {} with template: {}",
        exercise_name, template.name
    );

    // Create a temporary directory with the exercise files
    let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
    let temp_path = temp_dir.path().to_path_buf();

    if template.name == "ProjectCreation" {
        for entry in fs::read_dir(&temp_path)? {
            let entry = entry?;
            let path = entry.path();

            // Skip directories that start with a dot (like .docs, .meta, .git)
            if path.is_dir()
                && path
                    .file_name()
                    .and_then(|name| name.to_str())
                    .map(|name| name.starts_with("."))
                    .unwrap_or(false)
            {
                continue;
            }

            // Delete regular files
            if path.is_file() {
                println!("  Deleting file: {}", path.display());
                fs::remove_file(path)?;
            }
        }

        // Commit the deletion so it shows up in the diff
        run_git(&temp_path, &["add", "."]).await?;
        run_git(
            &temp_path,
            &["commit", "-m", "Remove root files for clean slate"],
        )
        .await?;
    }

    let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;

    // Prepare the prompt based on the template
    let prompt = match template.name {
        "ProjectCreation" => format!(
            "I need to create a new implementation for this exercise. Please create all the necessary files in the best location.\n\n{}",
            instructions
        ),
        "CodeModification" => format!(
            "I need help updating my code to meet these requirements. Please modify the appropriate files:\n\n{}",
            instructions
        ),
        "ConversationalGuidance" => format!(
            "I'm trying to solve this coding exercise but I'm not sure where to start. Can you help me understand the requirements and guide me through the solution process without writing code for me?\n\n{}",
            instructions
        ),
        _ => instructions.clone(),
    };

    let start_time = SystemTime::now();

    // Build an Eval so we can reuse the existing run machinery
    let eval = Eval {
        repo_path: temp_path.clone(),
        eval_setup: EvalSetup {
            url: format!("file://{}", temp_path.display()),
            base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
        },
        user_prompt: prompt,
    };

    // Run the evaluation
    let eval_output = cx
        .update(|cx| eval.run(app_state.clone(), model.clone(), cx))?
        .await?;

    // Grab the diff produced by the run
    let diff = eval_output.diff.clone();

    // Build the judge prompt by filling in the template placeholders. Each
    // template compares different artifacts: ProjectCreation checks the diff
    // against the requirements and the reference implementation,
    // CodeModification checks it against the reference implementation, and the
    // conversational template checks the transcript against the original query.
    let judge_prompt = match template.name {
        "ProjectCreation" => template
            .content
            .replace(
                "<!-- ```requirements go here``` -->",
                &format!("```\n{}\n```", instructions),
            )
            .replace(
                "<!-- ```reference code goes here``` -->",
                &format!("```{}\n{}\n```", language, example_solution),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            ),
        "CodeModification" => template
            .content
            .replace(
                "<!-- ```reference code goes here``` -->",
                &format!("```{}\n{}\n```", language, example_solution),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            ),
        // ConversationalGuidance and any other template
        _ => template
            .content
            .replace(
                "<!-- ```query goes here``` -->",
                &format!("```\n{}\n```", instructions),
            )
            .replace(
                "<!-- ```transcript goes here``` -->",
                &format!("```\n{}\n```", eval_output.last_message),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            ),
    };

    // Score the output with the prompt-based judge
    let judge = crate::judge::Judge {
        original_diff: None,
        original_message: Some(judge_prompt),
        model: judge_model.clone(),
    };
    let judge_output = cx.update(|cx| judge.run_with_prompt(cx))?.await?;

    let elapsed_time = start_time.elapsed()?;

    // Total tokens are the sum of input and output tokens
    let input_tokens = eval_output.token_usage.input_tokens;
    let output_tokens = eval_output.token_usage.output_tokens;
    let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
    let total_tokens = input_tokens + output_tokens;

    // Get the judge model name
    let judge_model_name = judge_model.id().0.to_string();

    // Assemble the result to be saved in the evaluation directory
    let result = EvalResult {
        exercise_name: exercise_name.clone(),
        template_name: template.name.to_string(),
        score: judge_output.trim().to_string(),
        diff,
        assistant_response: eval_output.last_message.clone(),
        elapsed_time_ms: elapsed_time.as_millis(),
        timestamp: SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis(),
        // Convert u32 token counts to usize
        input_tokens: input_tokens.try_into().unwrap(),
        output_tokens: output_tokens.try_into().unwrap(),
        total_tokens: total_tokens.try_into().unwrap(),
        tool_use_counts: tool_use_counts.try_into().unwrap(),
        judge_model_name,
    };

    Ok(result)
}