judge.rs

  1use crate::eval::EvalOutput;
  2use crate::headless_assistant::send_language_model_request;
  3use anyhow::anyhow;
  4use gpui::{App, Task};
  5use language_model::{
  6    LanguageModel, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role,
  7};
  8use std::{path::Path, sync::Arc};
  9
 10pub struct Judge {
 11    pub original_diff: Option<String>,
 12    #[allow(dead_code)]
 13    pub original_message: Option<String>,
 14    pub model: Arc<dyn LanguageModel>,
 15}
 16
 17impl Judge {
 18    pub async fn load(eval_path: &Path, model: Arc<dyn LanguageModel>) -> anyhow::Result<Judge> {
 19        let original_diff_path = eval_path.join("original.diff");
 20        let original_diff = smol::unblock(move || {
 21            if std::fs::exists(&original_diff_path)? {
 22                anyhow::Ok(Some(std::fs::read_to_string(&original_diff_path)?))
 23            } else {
 24                anyhow::Ok(None)
 25            }
 26        });
 27
 28        let original_message_path = eval_path.join("original_message.txt");
 29        let original_message = smol::unblock(move || {
 30            if std::fs::exists(&original_message_path)? {
 31                anyhow::Ok(Some(std::fs::read_to_string(&original_message_path)?))
 32            } else {
 33                anyhow::Ok(None)
 34            }
 35        });
 36
 37        Ok(Self {
 38            original_diff: original_diff.await?,
 39            original_message: original_message.await?,
 40            model,
 41        })
 42    }
 43
 44    pub fn run(&self, eval_output: &EvalOutput, cx: &mut App) -> Task<anyhow::Result<String>> {
 45        let Some(original_diff) = self.original_diff.as_ref() else {
 46            return Task::ready(Err(anyhow!("No original.diff found")));
 47        };
 48
 49        // TODO: check for empty diff?
 50        let prompt = diff_comparison_prompt(&original_diff, &eval_output.diff);
 51
 52        let request = LanguageModelRequest {
 53            messages: vec![LanguageModelRequestMessage {
 54                role: Role::User,
 55                content: vec![MessageContent::Text(prompt)],
 56                cache: false,
 57            }],
 58            temperature: Some(0.0),
 59            tools: Vec::new(),
 60            stop: Vec::new(),
 61        };
 62
 63        let model = self.model.clone();
 64        cx.spawn(async move |cx| send_language_model_request(model, request, cx).await)
 65    }
 66}
 67
 68pub fn diff_comparison_prompt(original_diff: &str, new_diff: &str) -> String {
 69    format!(
 70        r#"# Git Diff Similarity Evaluation Template
 71
 72## Instructions
 73
 74Compare the two diffs and score them between 0.0 and 1.0 based on their functional similarity.
 75- 1.0 = Perfect functional match (achieves identical results)
 76- 0.0 = No functional similarity whatsoever
 77
 78## Evaluation Criteria
 79
 80Please consider the following aspects in order of importance:
 81
 821. **Functional Equivalence (60%)**
 83   - Do both diffs achieve the same end result?
 84   - Are the changes functionally equivalent despite possibly using different approaches?
 85   - Do the modifications address the same issues or implement the same features?
 86
 872. **Logical Structure (20%)**
 88   - Are the logical flows similar?
 89   - Do the modifications affect the same code paths?
 90   - Are control structures (if/else, loops, etc.) modified in similar ways?
 91
 923. **Code Content (15%)**
 93   - Are similar lines added/removed?
 94   - Are the same variables, functions, or methods being modified?
 95   - Are the same APIs or libraries being used?
 96
 974. **File Layout (5%)**
 98   - Are the same files being modified?
 99   - Are changes occurring in similar locations within files?
100
101## Input
102
103Original Diff:
104```git
105{}
106```
107
108New Diff:
109```git
110{}
111```
112
113## Output Format
114
115THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
116
117Example output:
1180.85"#,
119        original_diff, new_diff
120    )
121}