use crate::git_commands::{run_git, setup_temp_repo};
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
use crate::{get_exercise_language, get_exercise_name, templates_eval::Template};
use agent::RequestKind;
use anyhow::{Result, anyhow};
use collections::HashMap;
use gpui::{App, Task};
use language_model::{LanguageModel, TokenUsage};
use serde::{Deserialize, Serialize};
use std::{
    fs,
    io::Write,
    path::{Path, PathBuf},
    sync::Arc,
    time::{Duration, SystemTime},
};

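/// A single evaluation result as persisted to `evaluation/evals.json`.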
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EvalResult {
    pub exercise_name: String,
    pub template_name: String,
    pub score: String,
    pub diff: String,
    pub assistant_response: String,
    pub elapsed_time_ms: u128,
    pub timestamp: u128,
    // Token usage fields
    pub input_tokens: usize,
    pub output_tokens: usize,
    pub total_tokens: usize,
    pub tool_use_counts: usize,
    pub judge_model_name: String,
}

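/// Raw output of a single eval run: the resulting git diff, the assistant's
/// final message, and timing/token statistics.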
pub struct EvalOutput {
    pub diff: String,
    pub last_message: String,
    pub elapsed_time: Duration,
    pub assistant_response_count: usize,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub token_usage: TokenUsage,
}

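/// Repository coordinates for an eval, deserialized from `setup.json`.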
#[derive(Deserialize)]
pub struct EvalSetup {
    pub url: String,
    pub base_sha: String,
}

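/// A single eval: a repository checkout plus the user prompt to send to the model.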
pub struct Eval {
    pub repo_path: PathBuf,
    pub eval_setup: EvalSetup,
    pub user_prompt: String,
}

impl Eval {
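    /// Loads an eval from `path`, reading the prompt from `prompt.txt` and the
    /// repo coordinates from `setup.json` (parsed leniently).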
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub async fn load(_name: String, path: PathBuf, repos_dir: &Path) -> Result<Self> {
        let prompt_path = path.join("prompt.txt");
        let user_prompt = smol::unblock(move || std::fs::read_to_string(prompt_path)).await?;
        let setup_path = path.join("setup.json");
        let setup_contents = smol::unblock(move || std::fs::read_to_string(setup_path)).await?;
        let eval_setup = serde_json_lenient::from_str_lenient::<EvalSetup>(&setup_contents)?;

        // Local helper: derive a filesystem-safe directory name from the repo URL.
        fn repo_dir_name(url: &str) -> String {
            url.trim_start_matches("https://")
                .replace(|c: char| !c.is_alphanumeric(), "_")
        }

        let repo_path = repos_dir.join(repo_dir_name(&eval_setup.url));

        Ok(Eval {
            repo_path,
            eval_setup,
            user_prompt,
        })
    }

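    /// Checks out `base_sha`, opens the repo in a headless assistant, sends the
    /// user prompt to `model`, and collects the resulting diff and final response.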
    pub fn run(
        self,
        app_state: Arc<HeadlessAppState>,
        model: Arc<dyn LanguageModel>,
        cx: &mut App,
    ) -> Task<Result<EvalOutput>> {
        cx.spawn(async move |cx| {
            run_git(&self.repo_path, &["checkout", &self.eval_setup.base_sha]).await?;

            let (assistant, done_rx) =
                cx.update(|cx| HeadlessAssistant::new(app_state.clone(), cx))??;

            let _worktree = assistant
                .update(cx, |assistant, cx| {
                    assistant.project.update(cx, |project, cx| {
                        project.create_worktree(&self.repo_path, true, cx)
                    })
                })?
                .await?;

            let start_time = SystemTime::now();

            let (system_prompt_context, load_error) = cx
                .update(|cx| {
                    assistant
                        .read(cx)
                        .thread
                        .read(cx)
                        .load_system_prompt_context(cx)
                })?
                .await;

            if let Some(load_error) = load_error {
                return Err(anyhow!("{:?}", load_error));
            }

            assistant.update(cx, |assistant, cx| {
                assistant.thread.update(cx, |thread, cx| {
                    let context = vec![];
                    thread.insert_user_message(self.user_prompt.clone(), context, None, cx);
                    thread.set_system_prompt_context(system_prompt_context);
                    thread.send_to_model(model, RequestKind::Chat, cx);
                });
            })?;

            done_rx.recv().await??;

            // Stage any untracked files so they show up in the diff below
            println!("Checking for untracked files:");
            let untracked = run_git(
                &self.repo_path,
                &["ls-files", "--others", "--exclude-standard"],
            )
            .await?;
            if untracked.is_empty() {
                println!("No untracked files found");
            } else {
                println!("Adding untracked files to git");
                run_git(&self.repo_path, &["add", "."]).await?;
            }

            // Get git status
            let _status = run_git(&self.repo_path, &["status", "--short"]).await?;

            let elapsed_time = start_time.elapsed()?;

            // Get diff of staged changes (the files we just added)
            let staged_diff = run_git(&self.repo_path, &["diff", "--staged"]).await?;

            // Get diff of unstaged changes
            let unstaged_diff = run_git(&self.repo_path, &["diff"]).await?;

            // Combine both diffs
            let diff = if unstaged_diff.is_empty() {
                staged_diff
            } else if staged_diff.is_empty() {
                unstaged_diff
            } else {
                format!(
                    "# Staged changes\n{}\n\n# Unstaged changes\n{}",
                    staged_diff, unstaged_diff
                )
            };

            assistant.update(cx, |assistant, cx| {
                let thread = assistant.thread.read(cx);
                let last_message = thread
                    .messages()
                    .last()
                    .ok_or_else(|| anyhow!("Thread contains no messages"))?;
                if last_message.role != language_model::Role::Assistant {
                    return Err(anyhow!("Last message is not from assistant"));
                }
                let assistant_response_count = thread
                    .messages()
                    .filter(|message| message.role == language_model::Role::Assistant)
                    .count();
                Ok(EvalOutput {
                    diff,
                    last_message: last_message.to_string(),
                    elapsed_time,
                    assistant_response_count,
                    tool_use_counts: assistant.tool_use_counts.clone(),
                    token_usage: thread.cumulative_token_usage(),
                })
            })?
        })
    }
}

impl EvalOutput {
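    /// Writes the diff and assistant response to `output_dir` and appends this
    /// run's metrics to `metrics.json`, keyed by timestamp.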
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub fn save_to_directory(&self, output_dir: &Path, eval_output_value: String) -> Result<()> {
        // Create the output directory if it doesn't exist
        fs::create_dir_all(output_dir)?;

        // Save the diff to a file
        let diff_path = output_dir.join("diff.patch");
        let mut diff_file = fs::File::create(&diff_path)?;
        diff_file.write_all(self.diff.as_bytes())?;

        // Save the last message to a file
        let message_path = output_dir.join("assistant_response.txt");
        let mut message_file = fs::File::create(&message_path)?;
        message_file.write_all(self.last_message.as_bytes())?;

        // Current metrics for this run
        let current_metrics = serde_json::json!({
            "elapsed_time_ms": self.elapsed_time.as_millis(),
            "assistant_response_count": self.assistant_response_count,
            "tool_use_counts": self.tool_use_counts,
            "token_usage": self.token_usage,
            "eval_output_value": eval_output_value,
        });

        // Get current timestamp in milliseconds
        let timestamp = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis()
            .to_string();

        // Path to metrics file
        let metrics_path = output_dir.join("metrics.json");

        // Load existing metrics if the file exists, or create a new object
        let mut historical_metrics = if metrics_path.exists() {
            let metrics_content = fs::read_to_string(&metrics_path)?;
            serde_json::from_str::<serde_json::Value>(&metrics_content)
                .unwrap_or_else(|_| serde_json::json!({}))
        } else {
            serde_json::json!({})
        };

        // Add new run with timestamp as key
        if let serde_json::Value::Object(ref mut map) = historical_metrics {
            map.insert(timestamp, current_metrics);
        }

        // Write updated metrics back to file
        let metrics_json = serde_json::to_string_pretty(&historical_metrics)?;
        let mut metrics_file = fs::File::create(&metrics_path)?;
        metrics_file.write_all(metrics_json.as_bytes())?;

        Ok(())
    }
}

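/// Reads the exercise instructions from `.docs/instructions.md`.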
pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
    let instructions_path = exercise_path.join(".docs").join("instructions.md");
    println!("Reading instructions from: {}", instructions_path.display());
    let instructions = smol::unblock(move || std::fs::read_to_string(&instructions_path)).await?;
    Ok(instructions)
}

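/// Reads the reference solution from `.meta/example.<ext>`, mapping the
/// exercise language to a file extension.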
pub async fn read_example_solution(exercise_path: &Path, language: &str) -> Result<String> {
    // Map the language to the file extension
    let language_extension = match language {
        "python" => "py",
        "go" => "go",
        "rust" => "rs",
        "typescript" => "ts",
        "javascript" => "js",
        "ruby" => "rb",
        "php" => "php",
        "bash" => "sh",
        "multi" => "diff",
        "internal" => "diff",
        _ => return Err(anyhow!("Unsupported language: {}", language)),
    };
    let example_path = exercise_path
        .join(".meta")
        .join(format!("example.{}", language_extension));
    println!("Reading example solution from: {}", example_path.display());
    let example = smol::unblock(move || std::fs::read_to_string(&example_path)).await?;
    Ok(example)
}

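/// Appends a batch of results to `evaluation/evals.json`, grouped by exercise
/// name, then timestamp, then template name.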
pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
    let eval_dir = exercise_path.join("evaluation");
    fs::create_dir_all(&eval_dir)?;

    let eval_file = eval_dir.join("evals.json");

    println!("Saving evaluation results to: {}", eval_file.display());
    println!(
        "Results to save: {} evaluations for exercise path: {}",
        results.len(),
        exercise_path.display()
    );

    // Check file existence before reading/writing
    if eval_file.exists() {
        println!("Existing evals.json file found, will update it");
    } else {
        println!("No existing evals.json file found, will create new one");
    }

    // Structure to organize evaluations by exercise name and timestamp
    let mut eval_data: serde_json::Value = if eval_file.exists() {
        let content = fs::read_to_string(&eval_file)?;
        serde_json::from_str(&content).unwrap_or_else(|_| serde_json::json!({}))
    } else {
        serde_json::json!({})
    };

    // Get current timestamp for this batch of results
    let timestamp = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)?
        .as_millis()
        .to_string();

    // Group the new results by exercise name
    for result in results {
        let exercise_name = &result.exercise_name;
        let template_name = &result.template_name;

        println!(
            "Adding result: exercise={}, template={}",
            exercise_name, template_name
        );

        // Ensure the exercise entry exists
        if eval_data.get(exercise_name).is_none() {
            eval_data[exercise_name] = serde_json::json!({});
        }

        // Ensure the timestamp entry exists as an object
        if eval_data[exercise_name].get(&timestamp).is_none() {
            eval_data[exercise_name][&timestamp] = serde_json::json!({});
        }

        // Add this result under the timestamp with template name as key
        eval_data[exercise_name][&timestamp][template_name] = serde_json::to_value(&result)?;
    }

    // Write back to file with pretty formatting
    let json_content = serde_json::to_string_pretty(&eval_data)?;
    match fs::write(&eval_file, json_content) {
        Ok(_) => println!("✓ Successfully saved results to {}", eval_file.display()),
        Err(e) => println!("✗ Failed to write results file: {}", e),
    }

    Ok(())
}

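/// Runs one exercise end-to-end: sets up a temporary repo, sends the templated
/// prompt to `model`, has `judge_model` score the output, and returns the
/// collected `EvalResult`.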
pub async fn run_exercise_eval(
    exercise_path: PathBuf,
    template: Template,
    model: Arc<dyn LanguageModel>,
    judge_model: Arc<dyn LanguageModel>,
    app_state: Arc<HeadlessAppState>,
    base_sha: String,
    _framework_path: PathBuf,
    cx: gpui::AsyncApp,
) -> Result<EvalResult> {
    let exercise_name = get_exercise_name(&exercise_path);
    let language = get_exercise_language(&exercise_path)?;
    let mut instructions = read_instructions(&exercise_path).await?;
    instructions.push_str(&format!(
        "\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
        language
    ));
    let example_solution = read_example_solution(&exercise_path, &language).await?;

    println!(
        "Running evaluation for exercise: {} with template: {}",
        exercise_name, template.name
    );

    // Create temporary directory with exercise files
    let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
    let temp_path = temp_dir.path().to_path_buf();

    if template.name == "ProjectCreation" {
        for entry in fs::read_dir(&temp_path)? {
            let entry = entry?;
            let path = entry.path();

            // Skip directories that start with a dot (like .docs, .meta, .git)
            if path.is_dir()
                && path
                    .file_name()
                    .and_then(|name| name.to_str())
                    .map(|name| name.starts_with("."))
                    .unwrap_or(false)
            {
                continue;
            }

            // Delete regular files
            if path.is_file() {
                println!(" Deleting file: {}", path.display());
                fs::remove_file(path)?;
            }
        }

        // Commit the deletion so it shows up in the diff
        run_git(&temp_path, &["add", "."]).await?;
        run_git(
            &temp_path,
            &["commit", "-m", "Remove root files for clean slate"],
        )
        .await?;
    }

    let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;

    // Prepare prompt based on template
    let prompt = match template.name {
        "ProjectCreation" => format!(
            "I need to create a new implementation for this exercise. Please create all the necessary files in the best location.\n\n{}",
            instructions
        ),
        "CodeModification" => format!(
            "I need help updating my code to meet these requirements. Please modify the appropriate files:\n\n{}",
            instructions
        ),
        "ConversationalGuidance" => format!(
            "I'm trying to solve this coding exercise but I'm not sure where to start. Can you help me understand the requirements and guide me through the solution process without writing code for me?\n\n{}",
            instructions
        ),
        _ => instructions.clone(),
    };

    let start_time = SystemTime::now();

    // Create a basic eval struct to work with the existing system
    let eval = Eval {
        repo_path: temp_path.clone(),
        eval_setup: EvalSetup {
            url: format!("file://{}", temp_path.display()),
            base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
        },
        user_prompt: prompt,
    };

    // Run the evaluation
    let eval_output = cx
        .update(|cx| eval.run(app_state.clone(), model.clone(), cx))?
        .await?;

    // Get diff from git
    let diff = eval_output.diff.clone();

    // For the project creation template, compare against the reference implementation
    let judge_output = if template.name == "ProjectCreation" {
        let project_judge_prompt = template
            .content
            .replace(
                "<!-- ```requirements go here``` -->",
                &format!("```\n{}\n```", instructions),
            )
            .replace(
                "<!-- ```reference code goes here``` -->",
                &format!("```{}\n{}\n```", language, example_solution),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            );

        let judge = crate::judge::Judge {
            original_diff: None,
            original_message: Some(project_judge_prompt),
            model: judge_model.clone(),
        };

        cx.update(|cx| judge.run_with_prompt(cx))?.await?
    } else if template.name == "CodeModification" {
        // For CodeModification, compare the example solution with the LLM-generated solution
        let code_judge_prompt = template
            .content
            .replace(
                "<!-- ```reference code goes here``` -->",
                &format!("```{}\n{}\n```", language, example_solution),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            );

        let judge = crate::judge::Judge {
            original_diff: None,
            original_message: Some(code_judge_prompt),
            model: judge_model.clone(),
        };

        cx.update(|cx| judge.run_with_prompt(cx))?.await?
    } else {
        // Conversational template: judge the transcript rather than the diff
        let conv_judge_prompt = template
            .content
            .replace(
                "<!-- ```query goes here``` -->",
                &format!("```\n{}\n```", instructions),
            )
            .replace(
                "<!-- ```transcript goes here``` -->",
                &format!("```\n{}\n```", eval_output.last_message),
            )
            .replace(
                "<!-- ```git diff goes here``` -->",
                &format!("```\n{}\n```", diff),
            );

        let judge = crate::judge::Judge {
            original_diff: None,
            original_message: Some(conv_judge_prompt),
            model: judge_model.clone(),
        };

        cx.update(|cx| judge.run_with_prompt(cx))?.await?
    };

    let elapsed_time = start_time.elapsed()?;

    // Total tokens are the sum of input and output tokens
    let input_tokens = eval_output.token_usage.input_tokens;
    let output_tokens = eval_output.token_usage.output_tokens;
    let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
    let total_tokens = input_tokens + output_tokens;

    // Get judge model name
    let judge_model_name = judge_model.id().0.to_string();

    // Save results to the evaluation directory
    let result = EvalResult {
        exercise_name: exercise_name.clone(),
        template_name: template.name.to_string(),
        score: judge_output.trim().to_string(),
        diff,
        assistant_response: eval_output.last_message.clone(),
        elapsed_time_ms: elapsed_time.as_millis(),
        timestamp: SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis(),
        // Convert u32 token counts to usize
        input_tokens: input_tokens.try_into().unwrap(),
        output_tokens: output_tokens.try_into().unwrap(),
        total_tokens: total_tokens.try_into().unwrap(),
        tool_use_counts: tool_use_counts.try_into().unwrap(),
        judge_model_name,
    };

    Ok(result)
}