1use crate::eval::EvalOutput;
2use crate::headless_assistant::send_language_model_request;
3use anyhow::anyhow;
4use gpui::{App, Task};
5use language_model::{
6 LanguageModel, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role,
7};
8use std::{path::Path, sync::Arc};
9
10pub struct Judge {
11 pub original_diff: Option<String>,
12 #[allow(dead_code)]
13 pub original_message: Option<String>,
14 pub model: Arc<dyn LanguageModel>,
15}
16
17impl Judge {
18 pub async fn load(eval_path: &Path, model: Arc<dyn LanguageModel>) -> anyhow::Result<Judge> {
19 let original_diff_path = eval_path.join("original.diff");
20 let original_diff = smol::unblock(move || {
21 if std::fs::exists(&original_diff_path)? {
22 anyhow::Ok(Some(std::fs::read_to_string(&original_diff_path)?))
23 } else {
24 anyhow::Ok(None)
25 }
26 });
27
28 let original_message_path = eval_path.join("original_message.txt");
29 let original_message = smol::unblock(move || {
30 if std::fs::exists(&original_message_path)? {
31 anyhow::Ok(Some(std::fs::read_to_string(&original_message_path)?))
32 } else {
33 anyhow::Ok(None)
34 }
35 });
36
37 Ok(Self {
38 original_diff: original_diff.await?,
39 original_message: original_message.await?,
40 model,
41 })
42 }
43
44 pub fn run(&self, eval_output: &EvalOutput, cx: &mut App) -> Task<anyhow::Result<String>> {
45 let Some(original_diff) = self.original_diff.as_ref() else {
46 return Task::ready(Err(anyhow!("No original.diff found")));
47 };
48
49 // TODO: check for empty diff?
50 let prompt = diff_comparison_prompt(&original_diff, &eval_output.diff);
51
52 let request = LanguageModelRequest {
53 messages: vec![LanguageModelRequestMessage {
54 role: Role::User,
55 content: vec![MessageContent::Text(prompt)],
56 cache: false,
57 }],
58 temperature: Some(0.0),
59 tools: Vec::new(),
60 stop: Vec::new(),
61 };
62
63 let model = self.model.clone();
64 cx.spawn(async move |cx| send_language_model_request(model, request, cx).await)
65 }
66}
67
/// Renders the prompt that asks a model to score how functionally similar two
/// git diffs are.
///
/// The prompt consists of a fixed scoring rubric, the two diffs each wrapped in
/// a fenced `git` block (`original_diff` first, then `new_diff`), and a closing
/// instruction to reply with nothing but a score between 0.0 and 1.0. The diffs
/// are inserted verbatim, without any escaping.
pub fn diff_comparison_prompt(original_diff: &str, new_diff: &str) -> String {
    // Everything that precedes the first diff, including the "## Input" heading.
    const RUBRIC: &str = r#"# Git Diff Similarity Evaluation Template

## Instructions

Compare the two diffs and score them between 0.0 and 1.0 based on their functional similarity.
- 1.0 = Perfect functional match (achieves identical results)
- 0.0 = No functional similarity whatsoever

## Evaluation Criteria

Please consider the following aspects in order of importance:

1. **Functional Equivalence (60%)**
 - Do both diffs achieve the same end result?
 - Are the changes functionally equivalent despite possibly using different approaches?
 - Do the modifications address the same issues or implement the same features?

2. **Logical Structure (20%)**
 - Are the logical flows similar?
 - Do the modifications affect the same code paths?
 - Are control structures (if/else, loops, etc.) modified in similar ways?

3. **Code Content (15%)**
 - Are similar lines added/removed?
 - Are the same variables, functions, or methods being modified?
 - Are the same APIs or libraries being used?

4. **File Layout (5%)**
 - Are the same files being modified?
 - Are changes occurring in similar locations within files?

## Input

"#;

    // Everything that follows the second diff.
    const OUTPUT_FORMAT: &str = r#"## Output Format

THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.

Example output:
0.85"#;

    // Each diff is rendered as a labelled, fenced block followed by a blank line.
    let fenced_section = |label: &str, diff: &str| format!("{label}:\n```git\n{diff}\n```\n\n");

    let mut prompt = String::from(RUBRIC);
    prompt.push_str(&fenced_section("Original Diff", original_diff));
    prompt.push_str(&fenced_section("New Diff", new_diff));
    prompt.push_str(OUTPUT_FORMAT);
    prompt
}