synthesize.rs

  1use crate::{
  2    anthropic_client::PlainLlmClient,
  3    git::{ensure_repo_cloned, run_git},
  4    paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
  5    progress::{InfoStyle, Progress, Step, StepProgress},
  6};
  7use anthropic::ResponseContent;
  8use anyhow::{Context as _, Result};
  9use chrono::Local;
 10use collections::{HashMap, HashSet};
 11use edit_prediction::{
 12    example_spec::ExampleSpec,
 13    udiff::{apply_diff_to_string, edits_for_diff},
 14};
 15use futures::stream::{FuturesUnordered, StreamExt};
 16use indoc::indoc;
 17use serde::{Deserialize, Serialize};
 18use std::{
 19    path::{Path, PathBuf},
 20    sync::Arc,
 21};
 22
/// Settings for one synthesis run, consumed by `run_synthesize`.
#[derive(Debug, Clone)]
pub struct SynthesizeConfig {
    /// URLs of the repositories to synthesize examples from.
    pub repo_urls: Vec<String>,
    /// Number of examples to generate per repository
    pub count: usize,
    /// Number of commits requested from `git log` per batch.
    pub max_commits: usize,
    /// Directory that receives the generated example files (created if missing).
    pub output_dir: PathBuf,
    /// When true, start from an empty state instead of loading the saved one,
    /// and do not skip commits recorded as processed.
    pub fresh: bool,
}
 32
/// Progress persisted between runs, serialized as JSON to `SYNTHESIZE_STATE_FILE`.
#[derive(Debug, Default, Serialize, Deserialize)]
struct SynthesizeState {
    /// Per-repository progress, keyed by repository URL.
    repositories: HashMap<String, RepoState>,
}
 37
/// Progress recorded for a single repository.
#[derive(Debug, Default, Serialize, Deserialize)]
struct RepoState {
    /// SHAs of commits that have already been handled and can be skipped on later runs.
    processed_commits: HashSet<String>,
    /// Running total of the example counts passed to `mark_processed`.
    examples_generated: usize,
}
 43
 44impl SynthesizeState {
 45    fn load() -> Self {
 46        if SYNTHESIZE_STATE_FILE.exists() {
 47            std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
 48                .ok()
 49                .and_then(|s| serde_json::from_str(&s).ok())
 50                .unwrap_or_default()
 51        } else {
 52            Self::default()
 53        }
 54    }
 55
 56    fn save(&self) -> Result<()> {
 57        let content = serde_json::to_string_pretty(self)?;
 58        std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
 59        Ok(())
 60    }
 61
 62    fn take_repo_state(&mut self, repo_url: &str) -> RepoState {
 63        self.repositories.remove(repo_url).unwrap_or_default()
 64    }
 65
 66    fn merge_repo_state(&mut self, repo_url: String, repo_state: RepoState) {
 67        self.repositories.insert(repo_url, repo_state);
 68    }
 69}
 70
 71impl RepoState {
 72    fn is_processed(&self, commit_sha: &str) -> bool {
 73        self.processed_commits.contains(commit_sha)
 74    }
 75
 76    fn mark_processed(&mut self, commit_sha: &str, examples_count: usize) {
 77        self.processed_commits.insert(commit_sha.to_string());
 78        self.examples_generated += examples_count;
 79    }
 80}
 81
/// One commit as collected by `list_commits`.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// First parent hash listed by `%P`.
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show --format= <sha>`; used for the skip heuristics.
    diff: String,
    /// Output of `git show -U30 --format= <sha>`; embedded in the prompt.
    expanded_diff: String,
}
 90
/// Claude's response parsed into structured form
#[derive(Debug)]
struct ClaudeResponse {
    /// Text after the `NAME:` marker, or "unnamed example" when the marker is absent.
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when the section is missing).
    reasoning: String,
    /// Hunks extracted from the diff blocks of the `EDIT_HISTORY:` section.
    edit_history_hunks: Vec<String>,
    /// Hunks extracted from the diff blocks of the `EXPECTED_PATCH:` section.
    expected_patch_hunks: Vec<String>,
}
 99
100pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
101    let mut state = if config.fresh {
102        SynthesizeState::default()
103    } else {
104        SynthesizeState::load()
105    };
106
107    std::fs::create_dir_all(&config.output_dir)?;
108    std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
109
110    // Create "latest_failed" symlink pointing to this run's failed directory
111    if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
112        std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
113    }
114    #[cfg(unix)]
115    std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
116    #[cfg(windows)]
117    std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
118
119    let progress = Progress::global();
120    let total_examples = config.count * config.repo_urls.len();
121    progress.set_total_examples(total_examples);
122
123    let client = Arc::new(PlainLlmClient::new()?);
124    let config = Arc::new(config);
125
126    let mut futures: FuturesUnordered<_> = config
127        .repo_urls
128        .iter()
129        .map(|repo_url| {
130            let client = client.clone();
131            let repo_state = state.take_repo_state(repo_url);
132            let config = config.clone();
133            let repo_url = repo_url.clone();
134            async move {
135                let result = synthesize_repo(&client, repo_state, &config, &repo_url).await;
136                (repo_url, result)
137            }
138        })
139        .collect();
140
141    let mut errors = Vec::new();
142    while let Some((repo_url, result)) = futures.next().await {
143        match result {
144            Ok(repo_state) => {
145                state.merge_repo_state(repo_url, repo_state);
146            }
147            Err(e) => {
148                errors.push(e);
149            }
150        }
151    }
152
153    state.save()?;
154
155    progress.finalize();
156
157    if let Some(first_error) = errors.into_iter().next() {
158        return Err(first_error);
159    }
160
161    Ok(())
162}
163
164async fn synthesize_repo(
165    client: &PlainLlmClient,
166    mut repo_state: RepoState,
167    config: &SynthesizeConfig,
168    repo_url: &str,
169) -> Result<RepoState> {
170    let progress = Progress::global();
171    let batch_size = config.max_commits;
172
173    let clone_progress = progress.start(Step::Synthesize, &format!("clone {}", repo_url));
174    let repo_path = ensure_repo_cloned(repo_url).await?;
175    drop(clone_progress);
176
177    let mut examples_generated = 0;
178    let mut commits_skipped = 0;
179
180    'outer: loop {
181        let list_progress = progress.start(
182            Step::Synthesize,
183            &format!("{}: list-commits", repo_name_from_url(repo_url)),
184        );
185        let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
186        drop(list_progress);
187
188        if commits.is_empty() {
189            break;
190        }
191
192        commits_skipped += commits.len();
193
194        for commit in commits {
195            if examples_generated >= config.count {
196                break 'outer;
197            }
198
199            if !config.fresh && repo_state.is_processed(&commit.sha) {
200                continue;
201            }
202
203            if should_skip_commit(&commit) {
204                continue;
205            }
206
207            let repo_name = repo_name_from_url(repo_url);
208            let commit_label = format!(
209                "{}: {} {}",
210                repo_name,
211                &commit.sha[..8],
212                truncate_message(&commit.message, 40)
213            );
214            let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
215
216            // Single Claude call to identify and copy hunks
217            step_progress.set_substatus("analyzing...");
218            let claude_response =
219                match analyze_commit(client, repo_url, &commit, step_progress.clone()).await {
220                    Ok(Some(response)) => response,
221                    Ok(None) => {
222                        step_progress.set_info("no pattern", InfoStyle::Normal);
223                        repo_state.mark_processed(&commit.sha, 0);
224                        continue;
225                    }
226                    Err(e) => {
227                        step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
228                        repo_state.mark_processed(&commit.sha, 0);
229                        continue;
230                    }
231                };
232
233            // Validate and build the example
234            step_progress.set_substatus("validating...");
235            match build_example(repo_url, &commit, &repo_path, &claude_response).await {
236                Ok(spec) => {
237                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
238                    let filename = format!("{}--{}.md", repo_name, timestamp);
239                    let path = config.output_dir.join(&filename);
240                    std::fs::write(&path, spec.to_markdown())?;
241                    examples_generated += 1;
242                    step_progress.set_info(filename, InfoStyle::Normal);
243                }
244                Err(rejection_reason) => {
245                    log::debug!("Example rejected: {}", rejection_reason);
246                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
247                    let filename = format!("{}--{}.md", repo_name, timestamp);
248                    let path = FAILED_EXAMPLES_DIR.join(&filename);
249                    let content = format_rejected_example(&claude_response, &rejection_reason);
250                    if let Err(e) = std::fs::write(&path, content) {
251                        log::warn!("Failed to write rejected example: {:?}", e);
252                    }
253                    step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
254                }
255            }
256
257            repo_state.mark_processed(&commit.sha, 1);
258        }
259    }
260
261    Ok(repo_state)
262}
263
/// Derives a short repository name from its URL: the last `/`-separated
/// segment with any trailing `.git` removed.
///
/// Trailing slashes are ignored, so `https://host/org/repo/` yields `repo`
/// rather than an empty name.
fn repo_name_from_url(url: &str) -> String {
    let trimmed = url.trim_end_matches('/');
    trimmed
        .rsplit('/')
        .next()
        .unwrap_or(trimmed)
        .trim_end_matches(".git")
        .to_string()
}
271
/// Returns the first line of `msg`, shortened to at most `max_len` bytes.
///
/// A line longer than `max_len` is cut and suffixed with `...`. The cut is
/// moved back to the nearest character boundary so multi-byte UTF-8 text never
/// causes a panic. For `max_len < 3` the result is just `...`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    // `cut` is below `first_line.len()`, and offset 0 is always a boundary,
    // so this loop terminates without underflowing.
    let mut cut = max_len.saturating_sub(3);
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
280
281fn should_skip_commit(commit: &CommitInfo) -> bool {
282    let lines_changed = commit
283        .diff
284        .lines()
285        .filter(|l| l.starts_with('+') || l.starts_with('-'))
286        .count();
287    lines_changed < 30
288        || lines_changed > 1000
289        || is_non_code_commit(commit)
290        || is_rename_commit(commit)
291}
292
293fn is_non_code_commit(commit: &CommitInfo) -> bool {
294    let non_code_extensions = [
295        ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
296        ".ico", ".woff", ".ttf", ".eot",
297    ];
298
299    let diff_files: Vec<&str> = commit
300        .diff
301        .lines()
302        .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
303        .filter_map(|l| {
304            l.strip_prefix("+++ b/")
305                .or_else(|| l.strip_prefix("--- a/"))
306        })
307        .collect();
308
309    if diff_files.is_empty() {
310        return false;
311    }
312
313    diff_files
314        .iter()
315        .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
316}
317
318fn is_rename_commit(commit: &CommitInfo) -> bool {
319    commit.diff.contains("similarity index")
320        || commit.diff.contains("rename from")
321        || commit.diff.contains("rename to")
322}
323
324async fn list_commits(
325    repo_path: &Path,
326    max_commits: usize,
327    skip: usize,
328) -> Result<Vec<CommitInfo>> {
329    let output = run_git(
330        repo_path,
331        &[
332            "log",
333            "--no-merges",
334            &format!("--skip={}", skip),
335            &format!("-{}", max_commits),
336            "--format=%H|%P|%s",
337        ],
338    )
339    .await?;
340
341    let mut commits = Vec::new();
342    for line in output.lines() {
343        let parts: Vec<&str> = line.splitn(3, '|').collect();
344        if parts.len() < 3 {
345            continue;
346        }
347        let sha = parts[0].to_string();
348        let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
349        if parent_sha.is_empty() {
350            continue;
351        }
352
353        // Get standard diff (for skip checks)
354        let diff = run_git(repo_path, &["show", "--format=", &sha])
355            .await
356            .unwrap_or_default();
357
358        // Get expanded diff with 30 lines of context
359        let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
360            .await
361            .unwrap_or_default();
362
363        commits.push(CommitInfo {
364            sha,
365            parent_sha,
366            message: parts[2].to_string(),
367            diff,
368            expanded_diff,
369        });
370    }
371
372    Ok(commits)
373}
374
/// Builds the prompt sent to the model for one commit.
///
/// The prompt consists of fixed instructions (task description, a worked
/// example of the expected response format, and requirements for the diffs)
/// with the repository URL, the commit SHA, the commit message and the
/// commit's 30-line-context diff substituted in.
fn build_prompt(repo_url: &str, commit: &CommitInfo) -> String {
    // NOTE(review): the prompt text states the edit-history size in different
    // ways ("2-6 hunks", "2 or more for edit history", "MUST have 3-6 hunks") —
    // confirm which one is intended.
    format!(
        indoc! {r#"
            You are analyzing a git commit to construct a realistic edit prediction example.

            Your goal is to tell the story of a programmer's editing session: what sequence
            of changes did they make, and what change logically comes next? We use these examples
            to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.

            An edit prediction example consists of:
            1. **Edit History**: 2-6 hunks showing what the programmer did BEFORE making the expected patch.
               This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
            2. **Expected Patch**: One small hunk that logically follows from the edit history.

            Both single-file and multi-file patterns are acceptable.

            ## What Makes a Good Example

            The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."

            GOOD examples (rich sequences with 3+ steps):
            - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
            - Adding a feature: type definition → first usage → second usage → (predict) third usage
            - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D

            BAD examples (respond NO_PATTERN):
            - Commits where all changes are independent (no narrative thread)
            - Simple find-and-replace (renaming, version bumps)
            - Documentation-only or config-only changes
            - Changes where you can only find 1-2 hunks for the edit history

            ## Commit Information

            Repository: {repo_url}
            Commit: {sha}
            Message: {message}

            ## Diff (30 lines context)

            ```diff
            {expanded_diff}
            ```

            ## Your Task

            First, THINK through whether this commit can support a good example:

            1. What is the high-level pattern in this commit?
            2. Can you identify at least 3 related hunks (2 or more for edit history + 1 for expected patch)?
            3. What would be the narrative? (First... then... then... finally predict...)
            4. Which specific hunk should be the expected patch (the "punchline")?

            If you cannot construct a coherent 3+ hunk story, respond with just:
            NO_PATTERN: <brief reason>

            If you CAN construct a good example, respond in this format:

            ANALYSIS:
            Pattern: <one sentence describing the pattern>
            Steps:
            1. <file:line-range> - <what this hunk does>
            2. <file:line-range> - <what this hunk does>
            3. <file:line-range> - <what this hunk does>
            4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>

            NAME: <short description, like a commit message, under 60 chars>

            EDIT_HISTORY:

            Hunk 1:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -15,7 +15,6 @@ class User:
                 """A user in the system.

                 Attributes:
            -        email: The user's email address.
                     name: The user's display name.
                 """
            ```

            Hunk 2:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -25,10 +24,9 @@ class User:
                 def __init__(
                     self,
                     name: str,
            -        email: str,
                     created_at: datetime,
                 ):
                     self.name = name
            -        self.email = email
                     self.created_at = created_at
            ```

            Hunk 3:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -42,7 +42,6 @@ def create_user(request):
                 data = request.json()
                 user = User(
                     name=data["name"],
            -        email=data["email"],
                     created_at=datetime.now(),
                 )
                 return user.save()
            ```

            EXPECTED_PATCH:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -58,7 +57,6 @@ def update_user(request, user_id):
                 user = User.get(user_id)
                 user.name = data.get("name", user.name)
            -    user.email = data.get("email", user.email)
                 user.save()
                 return user
            ```

            ## Requirements for the diffs

            Edit history:
            - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
            - Each hunk needs file headers (--- a/path and +++ b/path)
            - Hunks must be valid unified diffs that apply to the parent commit
            - Order hunks as a programmer would naturally make the changes

            Expected patch:
            - Must be a SINGLE hunk from a SINGLE file
            - Must be SMALL: 1-15 changed lines (not counting context)
            - Must be clearly predictable from the edit history narrative
        "#},
        repo_url = repo_url,
        sha = commit.sha,
        message = commit.message,
        expanded_diff = commit.expanded_diff,
    )
}
518
519async fn analyze_commit(
520    client: &PlainLlmClient,
521    repo_url: &str,
522    commit: &CommitInfo,
523    step_progress: Arc<StepProgress>,
524) -> Result<Option<ClaudeResponse>> {
525    use anthropic::{Message, RequestContent, Role};
526
527    let prompt = build_prompt(repo_url, commit);
528    let messages = vec![Message {
529        role: Role::User,
530        content: vec![RequestContent::Text {
531            text: prompt,
532            cache_control: None,
533        }],
534    }];
535
536    let response = client
537        .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
538            step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
539        })
540        .await?;
541
542    // Extract text content from response
543    let response_text: String = response
544        .content
545        .iter()
546        .filter_map(|block| {
547            if let ResponseContent::Text { text } = block {
548                Some(text.as_str())
549            } else {
550                None
551            }
552        })
553        .collect::<Vec<_>>()
554        .join("\n");
555
556    parse_claude_response(&response_text)
557}
558
559fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
560    // Check for NO_PATTERN
561    if response.contains("NO_PATTERN:") {
562        return Ok(None);
563    }
564
565    // Parse NAME
566    let name = response
567        .lines()
568        .find(|l| l.starts_with("NAME:"))
569        .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
570        .unwrap_or_else(|| "unnamed example".to_string());
571
572    // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
573    let reasoning = extract_section(
574        response,
575        "ANALYSIS:",
576        &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
577    )
578    .unwrap_or_default();
579
580    // Parse EDIT_HISTORY diff block
581    let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
582
583    // Parse EXPECTED_PATCH diff block
584    let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
585
586    if edit_history_hunks.is_empty() {
587        anyhow::bail!("No edit history hunks found in response");
588    }
589    if expected_patch_hunks.is_empty() {
590        anyhow::bail!("No expected patch hunks found in response");
591    }
592
593    Ok(Some(ClaudeResponse {
594        name,
595        reasoning,
596        edit_history_hunks,
597        expected_patch_hunks,
598    }))
599}
600
/// Returns the trimmed text between the first occurrence of `start_marker`
/// and the earliest following occurrence of any of `end_markers`.
///
/// The section runs to the end of `text` when no end marker follows.
/// Returns `None` when `start_marker` does not occur.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, rest) = text.split_once(start_marker)?;

    let mut section_len = rest.len();
    for marker in end_markers {
        if let Some(pos) = rest.find(marker) {
            section_len = section_len.min(pos);
        }
    }

    Some(rest[..section_len].trim().to_string())
}
614
615fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
616    let section_start = text
617        .find(section_marker)
618        .context(format!("Section {} not found", section_marker))?;
619
620    let after_marker = &text[section_start + section_marker.len()..];
621
622    // Find where the next major section starts (to bound our search)
623    let section_end = ["EXPECTED_PATCH:", "## "]
624        .iter()
625        .filter(|&&m| m != section_marker)
626        .filter_map(|marker| after_marker.find(marker))
627        .min()
628        .unwrap_or(after_marker.len());
629
630    let section_content = &after_marker[..section_end];
631
632    // Collect all ```diff blocks in this section
633    let mut hunks = Vec::new();
634    let mut search_start = 0;
635
636    while let Some(diff_start) = section_content[search_start..].find("```diff") {
637        let abs_diff_start = search_start + diff_start;
638        let block_content_start = section_content[abs_diff_start..]
639            .find('\n')
640            .map(|i| abs_diff_start + i + 1)
641            .unwrap_or(abs_diff_start);
642
643        if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
644            let block_end = block_content_start + block_end_rel;
645            let diff_content = section_content[block_content_start..block_end].trim();
646
647            // Split this block into hunks (in case multiple hunks in one block)
648            hunks.extend(split_into_hunks(diff_content));
649
650            search_start = block_end + 3;
651        } else {
652            break;
653        }
654    }
655
656    if hunks.is_empty() {
657        anyhow::bail!("No diff blocks found in section {}", section_marker);
658    }
659
660    Ok(hunks)
661}
662
/// Splits a diff into one string per hunk, each prefixed with the file header
/// that was in effect when the hunk was emitted.
///
/// Lines outside of a hunk that are neither a file header nor an `@@` line are
/// ignored. Hunks that appear before any file header are returned without one.
fn split_into_hunks(diff: &str) -> Vec<String> {
    /// Emits the buffered hunk lines (if any) as one hunk and empties the buffer.
    fn flush(hunks: &mut Vec<String>, header: Option<&str>, pending: &mut Vec<&str>) {
        if pending.is_empty() {
            return;
        }
        let body = pending.join("\n");
        hunks.push(match header {
            Some(header) => format!("{}\n{}", header, body),
            None => body,
        });
        pending.clear();
    }

    let mut hunks = Vec::new();
    let mut file_header: Option<String> = None;
    let mut pending: Vec<&str> = Vec::new();
    let mut in_hunk = false;

    for line in diff.lines() {
        let starts_old_header = line.starts_with("--- a/") || line.starts_with("--- /");
        let starts_new_header = line.starts_with("+++ b/") || line.starts_with("+++ /");

        if starts_old_header {
            // A new file begins: finish the hunk collected so far.
            flush(&mut hunks, file_header.as_deref(), &mut pending);
            file_header = Some(line.to_string());
            in_hunk = false;
        } else if starts_new_header {
            if let Some(header) = file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // A new hunk begins: finish the previous one first.
            flush(&mut hunks, file_header.as_deref(), &mut pending);
            pending.push(line);
            in_hunk = true;
        } else if in_hunk {
            pending.push(line);
        }
    }

    flush(&mut hunks, file_header.as_deref(), &mut pending);
    hunks
}
722
723/// Validate Claude's output by applying diffs and build the ExampleSpec
724async fn build_example(
725    repo_url: &str,
726    commit: &CommitInfo,
727    repo_path: &Path,
728    response: &ClaudeResponse,
729) -> Result<ExampleSpec, String> {
730    // Validate expected patch hunks
731    if response.expected_patch_hunks.len() != 1 {
732        return Err(format!(
733            "Expected exactly 1 expected patch hunk, got {}",
734            response.expected_patch_hunks.len()
735        ));
736    }
737
738    // Parse the expected patch to determine cursor file
739    let expected_patch = &response.expected_patch_hunks[0];
740    let cursor_file = extract_file_from_hunk(expected_patch)
741        .ok_or_else(|| "Could not determine file from expected patch".to_string())?;
742
743    // Get the file content before the commit
744    let before_content = run_git(
745        repo_path,
746        &["show", &format!("{}^:{}", commit.sha, cursor_file)],
747    )
748    .await
749    .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;
750
751    // Build edit history diff from Claude's hunks
752    let edit_history = response.edit_history_hunks.join("\n");
753
754    // Apply edit history to get intermediate state (validates edit history)
755    let intermediate_state =
756        apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;
757
758    // Validate expected patch applies to intermediate state
759    let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
760    apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
761        .map_err(|e| format!("Expected patch failed to apply: {}", e))?;
762
763    // Find where the expected patch edits would apply in the intermediate state
764    let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
765        .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
766    if edits.is_empty() {
767        return Err(
768            "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
769        );
770    }
771
772    // Use the start of the first edit for cursor positioning
773    let cursor_byte_offset = edits[0].0.start;
774
775    // Extract excerpt around the edit location
776    let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;
777
778    // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
779    let comment_prefix = line_comment_prefix(&cursor_file);
780    let reasoning_with_source = format!(
781        "Source commit: {} ({})\n\n{}",
782        commit.sha,
783        truncate_message(&commit.message, 60),
784        response.reasoning
785    );
786    let mut spec = ExampleSpec {
787        name: response.name.clone(),
788        repository_url: repo_url.to_string(),
789        revision: commit.parent_sha.clone(),
790        tags: Vec::new(),
791        reasoning: Some(reasoning_with_source),
792        uncommitted_diff: String::new(),
793        cursor_path: Arc::from(Path::new(&cursor_file)),
794        cursor_position: String::new(),
795        edit_history,
796        expected_patches: vec![expected_patch_with_header],
797        rejected_patch: None,
798
799        telemetry: None,
800        human_feedback: Vec::new(),
801        rating: None,
802    };
803    spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);
804
805    Ok(spec)
806}
807
/// Returns the path from the first `+++ b/<path>` or `--- a/<path>` line of
/// the hunk, or `None` when the hunk has no such line.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(|path| path.to_string())
}
820
/// Returns the hunk unchanged when it already contains a `--- a/` or `+++ b/`
/// header; otherwise prepends both header lines for `file_path`.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if has_header {
        hunk.to_string()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
828
829/// Apply edit history to file content, only if hunks affect this file
830fn apply_edit_history_to_content(
831    content: &str,
832    edit_history: &str,
833    cursor_file: &str,
834) -> Result<String, String> {
835    // Extract just the hunks for this file from the edit history
836    let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
837
838    if file_diff.is_empty() {
839        return Ok(content.to_string());
840    }
841
842    apply_diff_to_string(&file_diff, content)
843        .map_err(|e| format!("Failed to apply edit history: {}", e))
844}
845
/// Extracts the sections of a combined diff that belong to `target_file`.
///
/// A section starts at a `--- a/<path>` header (or at a `--- /…` header such as
/// `--- /dev/null`, whose path is taken from the following `+++ b/<path>`
/// line) and runs until the next header. Every kept line is terminated with
/// `\n`. Returns an empty string when no section matches.
///
/// A `--- /…` header always ends the current section, so a file-creation
/// section that follows the target file is not appended to the target's diff.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    /// Appends `line` and a newline to `out`.
    fn push_line(out: &mut String, line: &str) {
        out.push_str(line);
        out.push('\n');
    }

    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;
    // Old-side header (`--- /dev/null`) whose file is only known once the
    // matching `+++ b/<path>` line is seen.
    let mut pending_old_header: Option<&str> = None;

    for line in combined_diff.lines() {
        if let Some(file) = line.strip_prefix("--- a/") {
            pending_old_header = None;
            in_target_file = file == target_file;
            found_header = false;
            if in_target_file {
                push_line(&mut result, line);
            }
        } else if line.starts_with("--- /") {
            in_target_file = false;
            found_header = false;
            pending_old_header = Some(line);
        } else if let Some(file) = line.strip_prefix("+++ b/") {
            if let Some(old_header) = pending_old_header.take() {
                if file == target_file {
                    push_line(&mut result, old_header);
                    push_line(&mut result, line);
                    in_target_file = true;
                    found_header = true;
                }
            } else if in_target_file {
                push_line(&mut result, line);
                found_header = true;
            }
        } else if in_target_file && found_header {
            push_line(&mut result, line);
        }
    }

    result
}
876
/// Extract a cursor position excerpt from content around a byte offset.
///
/// The excerpt consists of up to three lines before the cursor line, the
/// cursor line itself and up to four lines after it, joined with `\n` and
/// without a trailing newline. Returns the excerpt and the cursor's byte
/// offset within it.
///
/// # Errors
///
/// Returns a message when `cursor_byte_offset` lies beyond the end of
/// `content` or does not fall on a character boundary (slicing at such an
/// offset would panic).
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    // `is_char_boundary` is also false for offsets past the end of the string.
    if !content.is_char_boundary(cursor_byte_offset) {
        return Err(format!(
            "Cursor offset {} is out of bounds or not on a character boundary (content length {})",
            cursor_byte_offset,
            content.len()
        ));
    }

    // Find the line containing the cursor
    let line_start = content[..cursor_byte_offset]
        .rfind('\n')
        .map(|pos| pos + 1)
        .unwrap_or(0);
    let line_end = content[cursor_byte_offset..]
        .find('\n')
        .map(|pos| cursor_byte_offset + pos)
        .unwrap_or(content.len());

    // Up to three lines of context before the cursor line
    let lines_before: Vec<&str> = content[..line_start].lines().collect();
    let context_before = &lines_before[lines_before.len().saturating_sub(3)..];

    // Up to four lines of context after the cursor line (skipping its newline)
    let after_line_end = if line_end < content.len() {
        line_end + 1
    } else {
        line_end
    };
    let context_after = content[after_line_end..].lines().take(4);

    // The line containing the cursor
    let cursor_line = &content[line_start..line_end];
    let cursor_column = cursor_byte_offset - line_start;

    // Build the excerpt
    let mut excerpt = String::new();
    for line in context_before {
        excerpt.push_str(line);
        excerpt.push('\n');
    }
    // The cursor lands after the leading context plus its column in its own line
    let cursor_offset_in_excerpt = excerpt.len() + cursor_column;
    excerpt.push_str(cursor_line);
    excerpt.push('\n');
    for line in context_after {
        excerpt.push_str(line);
        excerpt.push('\n');
    }

    // Trim trailing newline
    if excerpt.ends_with('\n') {
        excerpt.pop();
    }

    Ok((excerpt, cursor_offset_in_excerpt))
}
932
/// Get the line comment prefix for a file based on its extension
///
/// The extension is taken from the final path component and compared
/// case-insensitively, so `model.R` is recognised like `model.r`, and an
/// extension-less file such as `sh` or `Makefile` is not mistaken for one.
/// Unknown or missing extensions fall back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    let extension = Path::new(file_path)
        .extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| ext.to_ascii_lowercase())
        .unwrap_or_default();

    match extension.as_str() {
        "rs" | "c" | "cpp" | "cc" | "h" | "hpp" | "js" | "ts" | "tsx" | "jsx" | "go" | "java"
        | "swift" | "kt" | "kts" | "scala" | "cs" | "m" | "mm" | "zig" | "v" | "d" => "//",
        "py" | "rb" | "sh" | "bash" | "zsh" | "pl" | "pm" | "r" | "jl" | "yaml" | "yml"
        | "toml" | "coffee" | "cr" | "ex" | "exs" | "elixir" => "#",
        "lua" | "hs" | "sql" => "--",
        "lisp" | "clj" | "cljs" | "scm" | "rkt" | "el" => ";",
        "erl" | "hrl" => "%",
        _ => "//",
    }
}
947
948fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
949    let mut content = String::new();
950    content.push_str("# Rejected Example\n\n");
951    content.push_str(&format!("## Name\n\n{}\n\n", response.name));
952    content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
953    content.push_str("## Edit History Hunks\n\n```diff\n");
954    for hunk in &response.edit_history_hunks {
955        content.push_str(hunk);
956        content.push_str("\n\n");
957    }
958    content.push_str("```\n\n");
959    content.push_str("## Expected Patch Hunks\n\n```diff\n");
960    for hunk in &response.expected_patch_hunks {
961        content.push_str(hunk);
962        content.push_str("\n\n");
963    }
964    content.push_str("```\n\n");
965    content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
966    content
967}