synthesize.rs

  1use crate::{
  2    anthropic_client::PlainLlmClient,
  3    git::{ensure_repo_cloned, run_git},
  4    paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
  5    progress::{InfoStyle, Progress, Step, StepProgress},
  6};
  7use anthropic::ResponseContent;
  8use anyhow::{Context as _, Result};
  9use chrono::Local;
 10use collections::{HashMap, HashSet};
 11use edit_prediction::{
 12    example_spec::ExampleSpec,
 13    udiff::{apply_diff_to_string, edits_for_diff},
 14};
 15use futures::stream::{FuturesUnordered, StreamExt};
 16use indoc::indoc;
 17use serde::{Deserialize, Serialize};
 18use std::{
 19    path::{Path, PathBuf},
 20    sync::Arc,
 21};
 22
/// Settings for one invocation of `run_synthesize`.
#[derive(Debug, Clone)]
pub struct SynthesizeConfig {
    /// URLs of the repositories to mine; each URL is handled by its own `synthesize_repo` future.
    pub repo_urls: Vec<String>,
    /// Number of examples to generate per repository
    pub count: usize,
    /// Number of commits requested from `git log` per batch while walking a repository's history.
    pub max_commits: usize,
    /// Directory accepted examples are written to; created at startup if it does not exist.
    pub output_dir: PathBuf,
    /// When true, the saved state file is not loaded and previously processed commits are not skipped.
    pub fresh: bool,
}
 32
/// Progress persisted (as JSON) in `SYNTHESIZE_STATE_FILE` between runs.
#[derive(Debug, Default, Serialize, Deserialize)]
struct SynthesizeState {
    /// Per-repository progress, keyed by the repository URL.
    repositories: HashMap<String, RepoState>,
}
 37
/// Progress recorded for a single repository.
#[derive(Debug, Default, Serialize, Deserialize)]
struct RepoState {
    /// SHAs of commits that have already been handled and are skipped on non-fresh runs.
    processed_commits: HashSet<String>,
    /// Running total of the example counts passed to `mark_processed`.
    examples_generated: usize,
}
 43
 44impl SynthesizeState {
 45    fn load() -> Self {
 46        if SYNTHESIZE_STATE_FILE.exists() {
 47            std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
 48                .ok()
 49                .and_then(|s| serde_json::from_str(&s).ok())
 50                .unwrap_or_default()
 51        } else {
 52            Self::default()
 53        }
 54    }
 55
 56    fn save(&self) -> Result<()> {
 57        let content = serde_json::to_string_pretty(self)?;
 58        std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
 59        Ok(())
 60    }
 61
 62    fn take_repo_state(&mut self, repo_url: &str) -> RepoState {
 63        self.repositories.remove(repo_url).unwrap_or_default()
 64    }
 65
 66    fn merge_repo_state(&mut self, repo_url: String, repo_state: RepoState) {
 67        self.repositories.insert(repo_url, repo_state);
 68    }
 69}
 70
 71impl RepoState {
 72    fn is_processed(&self, commit_sha: &str) -> bool {
 73        self.processed_commits.contains(commit_sha)
 74    }
 75
 76    fn mark_processed(&mut self, commit_sha: &str, examples_count: usize) {
 77        self.processed_commits.insert(commit_sha.to_string());
 78        self.examples_generated += examples_count;
 79    }
 80}
 81
/// One commit from `git log --no-merges`, together with the diffs used to evaluate it.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// First parent hash taken from `%P`; `list_commits` skips commits that have none.
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show --format=` for the commit; used by the skip heuristics.
    diff: String,
    /// Output of `git show -U30 --format=` (30 lines of context); embedded in the LLM prompt.
    expanded_diff: String,
}
 90
/// Claude's response parsed into structured form
#[derive(Debug)]
struct ClaudeResponse {
    /// Text after the first `NAME:` line, or "unnamed example" when that line is missing.
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when the section is absent).
    reasoning: String,
    /// Hunks extracted from the diff blocks of the `EDIT_HISTORY:` section.
    edit_history_hunks: Vec<String>,
    /// Hunks extracted from the diff blocks of the `EXPECTED_PATCH:` section.
    expected_patch_hunks: Vec<String>,
}
 99
100pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
101    let mut state = if config.fresh {
102        SynthesizeState::default()
103    } else {
104        SynthesizeState::load()
105    };
106
107    std::fs::create_dir_all(&config.output_dir)?;
108    std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
109
110    // Create "latest_failed" symlink pointing to this run's failed directory
111    if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
112        std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
113    }
114    #[cfg(unix)]
115    std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
116    #[cfg(windows)]
117    std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
118
119    let progress = Progress::global();
120    let total_examples = config.count * config.repo_urls.len();
121    progress.set_total_examples(total_examples);
122
123    let client = Arc::new(PlainLlmClient::new()?);
124    let config = Arc::new(config);
125
126    let mut futures: FuturesUnordered<_> = config
127        .repo_urls
128        .iter()
129        .map(|repo_url| {
130            let client = client.clone();
131            let repo_state = state.take_repo_state(repo_url);
132            let config = config.clone();
133            let repo_url = repo_url.clone();
134            async move {
135                let result = synthesize_repo(&client, repo_state, &config, &repo_url).await;
136                (repo_url, result)
137            }
138        })
139        .collect();
140
141    let mut errors = Vec::new();
142    while let Some((repo_url, result)) = futures.next().await {
143        match result {
144            Ok(repo_state) => {
145                state.merge_repo_state(repo_url, repo_state);
146            }
147            Err(e) => {
148                errors.push(e);
149            }
150        }
151    }
152
153    state.save()?;
154
155    progress.finalize();
156
157    if let Some(first_error) = errors.into_iter().next() {
158        return Err(first_error);
159    }
160
161    Ok(())
162}
163
164async fn synthesize_repo(
165    client: &PlainLlmClient,
166    mut repo_state: RepoState,
167    config: &SynthesizeConfig,
168    repo_url: &str,
169) -> Result<RepoState> {
170    let progress = Progress::global();
171    let batch_size = config.max_commits;
172
173    let clone_progress = progress.start(Step::Synthesize, &format!("clone {}", repo_url));
174    let repo_path = ensure_repo_cloned(repo_url).await?;
175    drop(clone_progress);
176
177    let mut examples_generated = 0;
178    let mut commits_skipped = 0;
179
180    'outer: loop {
181        let list_progress = progress.start(
182            Step::Synthesize,
183            &format!("{}: list-commits", repo_name_from_url(repo_url)),
184        );
185        let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
186        drop(list_progress);
187
188        if commits.is_empty() {
189            break;
190        }
191
192        commits_skipped += commits.len();
193
194        for commit in commits {
195            if examples_generated >= config.count {
196                break 'outer;
197            }
198
199            if !config.fresh && repo_state.is_processed(&commit.sha) {
200                continue;
201            }
202
203            if should_skip_commit(&commit) {
204                continue;
205            }
206
207            let repo_name = repo_name_from_url(repo_url);
208            let commit_label = format!(
209                "{}: {} {}",
210                repo_name,
211                &commit.sha[..8],
212                truncate_message(&commit.message, 40)
213            );
214            let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
215
216            // Single Claude call to identify and copy hunks
217            step_progress.set_substatus("analyzing...");
218            let claude_response =
219                match analyze_commit(client, repo_url, &commit, step_progress.clone()).await {
220                    Ok(Some(response)) => response,
221                    Ok(None) => {
222                        step_progress.set_info("no pattern", InfoStyle::Normal);
223                        repo_state.mark_processed(&commit.sha, 0);
224                        continue;
225                    }
226                    Err(e) => {
227                        step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
228                        repo_state.mark_processed(&commit.sha, 0);
229                        continue;
230                    }
231                };
232
233            // Validate and build the example
234            step_progress.set_substatus("validating...");
235            match build_example(repo_url, &commit, &repo_path, &claude_response).await {
236                Ok(spec) => {
237                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
238                    let filename = format!("{}--{}.md", repo_name, timestamp);
239                    let path = config.output_dir.join(&filename);
240                    std::fs::write(&path, spec.to_markdown())?;
241                    examples_generated += 1;
242                    step_progress.set_info(filename, InfoStyle::Normal);
243                }
244                Err(rejection_reason) => {
245                    log::debug!("Example rejected: {}", rejection_reason);
246                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
247                    let filename = format!("{}--{}.md", repo_name, timestamp);
248                    let path = FAILED_EXAMPLES_DIR.join(&filename);
249                    let content = format_rejected_example(&claude_response, &rejection_reason);
250                    if let Err(e) = std::fs::write(&path, content) {
251                        log::warn!("Failed to write rejected example: {:?}", e);
252                    }
253                    step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
254                }
255            }
256
257            repo_state.mark_processed(&commit.sha, 1);
258        }
259    }
260
261    Ok(repo_state)
262}
263
/// Derives a short repository name from a clone URL.
///
/// Takes the last `/`-separated path segment and strips a trailing `.git`.
/// Trailing slashes are ignored so that `https://host/org/repo/` yields `repo`
/// rather than an empty name.
fn repo_name_from_url(url: &str) -> String {
    let trimmed = url.trim_end_matches('/');
    trimmed
        .rsplit('/')
        .next()
        .unwrap_or(trimmed)
        .trim_end_matches(".git")
        .to_string()
}
271
/// Returns the first line of `msg`, shortened to roughly `max_len` bytes.
///
/// Lines that fit are returned unchanged. Longer lines are cut to at most
/// `max_len - 3` bytes and suffixed with `...`. The cut point is moved back to
/// the nearest UTF-8 character boundary so multi-byte text never causes a
/// panic, and `max_len` values below 3 yield just `...`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    // `cut` is below `first_line.len()` here, and index 0 is always a boundary,
    // so this loop terminates without going out of range.
    let mut cut = max_len.saturating_sub(3);
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
280
281fn should_skip_commit(commit: &CommitInfo) -> bool {
282    let lines_changed = commit
283        .diff
284        .lines()
285        .filter(|l| l.starts_with('+') || l.starts_with('-'))
286        .count();
287    lines_changed < 10
288        || lines_changed > 1000
289        || is_non_code_commit(commit)
290        || is_rename_commit(commit)
291}
292
293fn is_non_code_commit(commit: &CommitInfo) -> bool {
294    let non_code_extensions = [
295        ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
296        ".ico", ".woff", ".ttf", ".eot",
297    ];
298
299    let diff_files: Vec<&str> = commit
300        .diff
301        .lines()
302        .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
303        .filter_map(|l| {
304            l.strip_prefix("+++ b/")
305                .or_else(|| l.strip_prefix("--- a/"))
306        })
307        .collect();
308
309    if diff_files.is_empty() {
310        return false;
311    }
312
313    diff_files
314        .iter()
315        .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
316}
317
318fn is_rename_commit(commit: &CommitInfo) -> bool {
319    commit.diff.contains("similarity index")
320        || commit.diff.contains("rename from")
321        || commit.diff.contains("rename to")
322}
323
324async fn list_commits(
325    repo_path: &Path,
326    max_commits: usize,
327    skip: usize,
328) -> Result<Vec<CommitInfo>> {
329    let output = run_git(
330        repo_path,
331        &[
332            "log",
333            "--no-merges",
334            &format!("--skip={}", skip),
335            &format!("-{}", max_commits),
336            "--format=%H|%P|%s",
337        ],
338    )
339    .await?;
340
341    let mut commits = Vec::new();
342    for line in output.lines() {
343        let parts: Vec<&str> = line.splitn(3, '|').collect();
344        if parts.len() < 3 {
345            continue;
346        }
347        let sha = parts[0].to_string();
348        let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
349        if parent_sha.is_empty() {
350            continue;
351        }
352
353        // Get standard diff (for skip checks)
354        let diff = run_git(repo_path, &["show", "--format=", &sha])
355            .await
356            .unwrap_or_default();
357
358        // Get expanded diff with 30 lines of context
359        let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
360            .await
361            .unwrap_or_default();
362
363        commits.push(CommitInfo {
364            sha,
365            parent_sha,
366            message: parts[2].to_string(),
367            diff,
368            expanded_diff,
369        });
370    }
371
372    Ok(commits)
373}
374
/// Renders the LLM prompt for a single commit.
///
/// Interpolates `repo_url` and the commit's `sha`, `message`, and
/// `expanded_diff` into a fixed template. The template explains the task,
/// shows a worked example, and specifies the response format
/// (`NO_PATTERN:` or `ANALYSIS:` / `NAME:` / `EDIT_HISTORY:` /
/// `EXPECTED_PATCH:`) that `parse_claude_response` looks for.
fn build_prompt(repo_url: &str, commit: &CommitInfo) -> String {
    format!(
        indoc! {r#"
            You are analyzing a git commit to construct a realistic edit prediction example.

            Your goal is to tell the story of a programmer's editing session: what sequence of changes did they make, and what change logically comes next? We use these examples to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.

            An edit prediction example consists of:
            1. **Edit History**: 3-6 hunks showing what the programmer did BEFORE making the expected patch. This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
            2. **Expected Patch**: One small hunk that logically follows from the edit history.

            Both single-file and multi-file patterns are acceptable.

            ## What Makes a Good Example

            The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."

            GOOD examples (rich sequences with 3+ steps):
            - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
            - Adding a feature: type definition → first usage → second usage → (predict) third usage
            - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D

            BAD examples (respond NO_PATTERN):
            - Commits where all changes are independent (no narrative thread)
            - Simple find-and-replace (renaming, version bumps)
            - Documentation-only or config-only changes
            - Changes where you can only find 1-2 hunks for the edit history

            ## Commit Information

            Repository: {repo_url}
            Commit: {sha}
            Message: {message}

            ## Diff (30 lines context)

            ```diff
            {expanded_diff}
            ```

            ## Your Task

            First, THINK through whether this commit can support a good example:

            1. What is the high-level pattern in this commit?
            2. Can you identify at least 4 related hunks (3 for edit history + 1 for expected patch)?
            3. What would be the narrative? (First... then... then... finally predict...)
            4. Which specific hunk should be the expected patch (the "punchline")?

            If you cannot construct a coherent 3+ hunk story, respond with just:
            NO_PATTERN: <brief reason>

            If you CAN construct a good example, respond in this format:

            ANALYSIS:
            Pattern: <one sentence describing the pattern>
            Steps:
            1. <file:line-range> - <what this hunk does>
            2. <file:line-range> - <what this hunk does>
            3. <file:line-range> - <what this hunk does>
            4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>

            NAME: <short description, like a commit message, under 60 chars>

            EDIT_HISTORY:

            Hunk 1:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -15,7 +15,6 @@ class User:
                 """A user in the system.

                 Attributes:
            -        email: The user's email address.
                     name: The user's display name.
                 """
            ```

            Hunk 2:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -25,10 +24,9 @@ class User:
                 def __init__(
                     self,
                     name: str,
            -        email: str,
                     created_at: datetime,
                 ):
                     self.name = name
            -        self.email = email
                     self.created_at = created_at
            ```

            Hunk 3:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -42,7 +42,6 @@ def create_user(request):
                 data = request.json()
                 user = User(
                     name=data["name"],
            -        email=data["email"],
                     created_at=datetime.now(),
                 )
                 return user.save()
            ```

            EXPECTED_PATCH:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -58,7 +57,6 @@ def update_user(request, user_id):
                 user = User.get(user_id)
                 user.name = data.get("name", user.name)
            -    user.email = data.get("email", user.email)
                 user.save()
                 return user
            ```

            ## Requirements for the diffs

            Edit history:
            - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
            - Each hunk needs file headers (--- a/path and +++ b/path)
            - Hunks must be valid unified diffs that apply to the parent commit
            - Order hunks as a programmer would naturally make the changes

            Expected patch:
            - Must be a SINGLE hunk from a SINGLE file
            - Must be SMALL: 1-15 changed lines (not counting context)
            - Must be clearly predictable from the edit history narrative
        "#},
        repo_url = repo_url,
        sha = commit.sha,
        message = commit.message,
        expanded_diff = commit.expanded_diff,
    )
}
515
516async fn analyze_commit(
517    client: &PlainLlmClient,
518    repo_url: &str,
519    commit: &CommitInfo,
520    step_progress: Arc<StepProgress>,
521) -> Result<Option<ClaudeResponse>> {
522    use anthropic::{Message, RequestContent, Role};
523
524    let prompt = build_prompt(repo_url, commit);
525    let messages = vec![Message {
526        role: Role::User,
527        content: vec![RequestContent::Text {
528            text: prompt,
529            cache_control: None,
530        }],
531    }];
532
533    let response = client
534        .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
535            step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
536        })
537        .await?;
538
539    // Extract text content from response
540    let response_text: String = response
541        .content
542        .iter()
543        .filter_map(|block| {
544            if let ResponseContent::Text { text } = block {
545                Some(text.as_str())
546            } else {
547                None
548            }
549        })
550        .collect::<Vec<_>>()
551        .join("\n");
552
553    parse_claude_response(&response_text)
554}
555
556fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
557    // Check for NO_PATTERN
558    if response.contains("NO_PATTERN:") {
559        return Ok(None);
560    }
561
562    // Parse NAME
563    let name = response
564        .lines()
565        .find(|l| l.starts_with("NAME:"))
566        .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
567        .unwrap_or_else(|| "unnamed example".to_string());
568
569    // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
570    let reasoning = extract_section(
571        response,
572        "ANALYSIS:",
573        &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
574    )
575    .unwrap_or_default();
576
577    // Parse EDIT_HISTORY diff block
578    let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
579
580    // Parse EXPECTED_PATCH diff block
581    let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
582
583    if edit_history_hunks.is_empty() {
584        anyhow::bail!("No edit history hunks found in response");
585    }
586    if expected_patch_hunks.is_empty() {
587        anyhow::bail!("No expected patch hunks found in response");
588    }
589
590    Ok(Some(ClaudeResponse {
591        name,
592        reasoning,
593        edit_history_hunks,
594        expected_patch_hunks,
595    }))
596}
597
/// Returns the trimmed text between `start_marker` and the earliest of
/// `end_markers` that follows it (or the end of `text` if none follows).
///
/// Returns `None` when `start_marker` does not occur in `text`.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, tail) = text.split_once(start_marker)?;

    let mut end = tail.len();
    for marker in end_markers {
        if let Some(pos) = tail.find(*marker) {
            end = end.min(pos);
        }
    }

    Some(tail[..end].trim().to_string())
}
611
612fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
613    let section_start = text
614        .find(section_marker)
615        .context(format!("Section {} not found", section_marker))?;
616
617    let after_marker = &text[section_start + section_marker.len()..];
618
619    // Find where the next major section starts (to bound our search)
620    let section_end = ["EXPECTED_PATCH:", "## "]
621        .iter()
622        .filter(|&&m| m != section_marker)
623        .filter_map(|marker| after_marker.find(marker))
624        .min()
625        .unwrap_or(after_marker.len());
626
627    let section_content = &after_marker[..section_end];
628
629    // Collect all ```diff blocks in this section
630    let mut hunks = Vec::new();
631    let mut search_start = 0;
632
633    while let Some(diff_start) = section_content[search_start..].find("```diff") {
634        let abs_diff_start = search_start + diff_start;
635        let block_content_start = section_content[abs_diff_start..]
636            .find('\n')
637            .map(|i| abs_diff_start + i + 1)
638            .unwrap_or(abs_diff_start);
639
640        if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
641            let block_end = block_content_start + block_end_rel;
642            let diff_content = section_content[block_content_start..block_end].trim();
643
644            // Split this block into hunks (in case multiple hunks in one block)
645            hunks.extend(split_into_hunks(diff_content));
646
647            search_start = block_end + 3;
648        } else {
649            break;
650        }
651    }
652
653    if hunks.is_empty() {
654        anyhow::bail!("No diff blocks found in section {}", section_marker);
655    }
656
657    Ok(hunks)
658}
659
/// Split a diff block into individual hunks, preserving file headers
///
/// Every `@@ ` line starts a new hunk. Each emitted hunk is prefixed with the
/// most recent `--- ` / `+++ ` file header, if one has been seen. Lines that
/// appear before the first hunk of a file are dropped. A `diff --git` line
/// ends the current hunk and clears the remembered header, so git's per-file
/// preamble (`diff --git`, `index ...`) is never appended to the preceding hunk.
fn split_into_hunks(diff: &str) -> Vec<String> {
    /// Emits the buffered hunk lines (prefixed with `header`, if any) and clears the buffer.
    fn flush(header: Option<&str>, hunk_lines: &mut Vec<&str>, hunks: &mut Vec<String>) {
        if hunk_lines.is_empty() {
            return;
        }
        let mut hunk_text = String::new();
        if let Some(header) = header {
            hunk_text.push_str(header);
            hunk_text.push('\n');
        }
        hunk_text.push_str(&hunk_lines.join("\n"));
        hunks.push(hunk_text);
        hunk_lines.clear();
    }

    let mut hunks = Vec::new();
    let mut current_file_header: Option<String> = None;
    let mut current_hunk: Vec<&str> = Vec::new();
    let mut in_hunk = false;

    for line in diff.lines() {
        if line.starts_with("diff --git ") {
            // Start of the next file's git preamble: it can never be hunk content.
            flush(current_file_header.as_deref(), &mut current_hunk, &mut hunks);
            current_file_header = None;
            in_hunk = false;
        } else if line.starts_with("--- a/") || line.starts_with("--- /") {
            // Start of file header - flush previous hunk
            flush(current_file_header.as_deref(), &mut current_hunk, &mut hunks);
            current_file_header = Some(line.to_string());
            in_hunk = false;
        } else if line.starts_with("+++ b/") || line.starts_with("+++ /") {
            if let Some(header) = current_file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // New hunk - flush previous
            flush(current_file_header.as_deref(), &mut current_hunk, &mut hunks);
            current_hunk.push(line);
            in_hunk = true;
        } else if in_hunk {
            current_hunk.push(line);
        }
    }

    // Flush final hunk
    flush(current_file_header.as_deref(), &mut current_hunk, &mut hunks);

    hunks
}
719
720/// Validate Claude's output by applying diffs and build the ExampleSpec
721async fn build_example(
722    repo_url: &str,
723    commit: &CommitInfo,
724    repo_path: &Path,
725    response: &ClaudeResponse,
726) -> Result<ExampleSpec, String> {
727    // Validate expected patch hunks
728    if response.expected_patch_hunks.len() != 1 {
729        return Err(format!(
730            "Expected exactly 1 expected patch hunk, got {}",
731            response.expected_patch_hunks.len()
732        ));
733    }
734
735    // Parse the expected patch to determine cursor file
736    let expected_patch = &response.expected_patch_hunks[0];
737    let cursor_file = extract_file_from_hunk(expected_patch)
738        .ok_or_else(|| "Could not determine file from expected patch".to_string())?;
739
740    // Get the file content before the commit
741    let before_content = run_git(
742        repo_path,
743        &["show", &format!("{}^:{}", commit.sha, cursor_file)],
744    )
745    .await
746    .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;
747
748    // Build edit history diff from Claude's hunks
749    let edit_history = response.edit_history_hunks.join("\n");
750
751    // Apply edit history to get intermediate state (validates edit history)
752    let intermediate_state =
753        apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;
754
755    // Validate expected patch applies to intermediate state
756    let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
757    apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
758        .map_err(|e| format!("Expected patch failed to apply: {}", e))?;
759
760    // Find where the expected patch edits would apply in the intermediate state
761    let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
762        .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
763    if edits.is_empty() {
764        return Err(
765            "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
766        );
767    }
768
769    // Use the start of the first edit for cursor positioning
770    let cursor_byte_offset = edits[0].0.start;
771
772    // Extract excerpt around the edit location
773    let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;
774
775    // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
776    let comment_prefix = line_comment_prefix(&cursor_file);
777    let reasoning_with_source = format!(
778        "Source commit: {} ({})\n\n{}",
779        commit.sha,
780        truncate_message(&commit.message, 60),
781        response.reasoning
782    );
783    let mut spec = ExampleSpec {
784        name: response.name.clone(),
785        repository_url: repo_url.to_string(),
786        revision: commit.parent_sha.clone(),
787        tags: Vec::new(),
788        reasoning: Some(reasoning_with_source),
789        uncommitted_diff: String::new(),
790        cursor_path: Arc::from(Path::new(&cursor_file)),
791        cursor_position: String::new(),
792        edit_history,
793        expected_patches: vec![expected_patch_with_header],
794        rejected_patch: None,
795    };
796    spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);
797
798    Ok(spec)
799}
800
/// Returns the path named by the first `+++ b/` or `--- a/` header line in `hunk`,
/// or `None` if the hunk has no such line.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(str::to_string)
}
813
/// Returns `hunk` unchanged if it already contains a `--- a/` or `+++ b/` marker;
/// otherwise prepends both file header lines for `file_path`.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let already_has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if already_has_header {
        hunk.to_string()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
821
822/// Apply edit history to file content, only if hunks affect this file
823fn apply_edit_history_to_content(
824    content: &str,
825    edit_history: &str,
826    cursor_file: &str,
827) -> Result<String, String> {
828    // Extract just the hunks for this file from the edit history
829    let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
830
831    if file_diff.is_empty() {
832        return Ok(content.to_string());
833    }
834
835    apply_diff_to_string(&file_diff, content)
836        .map_err(|e| format!("Failed to apply edit history: {}", e))
837}
838
/// Extract hunks for a specific file from a combined diff
///
/// Copies every section of `combined_diff` whose `--- a/<path>` header names
/// `target_file` (header lines included, each line terminated by `\n`).
/// Any `--- a/` or `--- /` header starts a new section, so hunks of other
/// files, including newly created files introduced by `--- /dev/null`, are
/// never attributed to the target file. Returns an empty string when the
/// target file does not occur.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;

    for line in combined_diff.lines() {
        if line.starts_with("--- a/") || line.starts_with("--- /") {
            // A new file section begins; it is the target only for an exact path match.
            in_target_file = line.strip_prefix("--- a/") == Some(target_file);
            found_header = false;
            if in_target_file {
                result.push_str(line);
                result.push('\n');
            }
        } else if line.starts_with("+++ b/") && in_target_file {
            result.push_str(line);
            result.push('\n');
            found_header = true;
        } else if in_target_file && found_header {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
869
/// Builds a small excerpt of `content` centred on the line containing `cursor_byte_offset`.
///
/// The excerpt consists of up to three lines before the cursor line, the
/// cursor line itself, and up to four lines after it, joined with `\n`.
/// Returns the excerpt together with the cursor's byte offset inside it.
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    // Bounds of the line the cursor is on.
    let (head, tail) = content.split_at(cursor_byte_offset);
    let line_start = head.rfind('\n').map_or(0, |newline| newline + 1);
    let line_end = tail
        .find('\n')
        .map_or(content.len(), |newline| cursor_byte_offset + newline);

    // Last three lines preceding the cursor line, restored to document order.
    let mut excerpt_lines: Vec<&str> = content[..line_start].lines().rev().take(3).collect();
    excerpt_lines.reverse();

    // Each preceding line contributes its length plus one newline to the cursor offset.
    let preceding_len: usize = excerpt_lines.iter().map(|line| line.len() + 1).sum();
    let cursor_offset_in_excerpt = preceding_len + (cursor_byte_offset - line_start);

    excerpt_lines.push(&content[line_start..line_end]);

    // Skip the newline that terminates the cursor line, if there is one.
    let rest_start = if line_end < content.len() {
        line_end + 1
    } else {
        line_end
    };
    excerpt_lines.extend(content[rest_start..].lines().take(4));

    Ok((excerpt_lines.join("\n"), cursor_offset_in_excerpt))
}
925
/// Maps a file path to the line-comment token of its language.
///
/// The text after the last `.` in `file_path` is looked up in a fixed table;
/// anything not listed falls back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    const PREFIX_TABLE: &[(&str, &[&str])] = &[
        (
            "//",
            &[
                "rs", "c", "cpp", "cc", "h", "hpp", "js", "ts", "tsx", "jsx", "go", "java",
                "swift", "kt", "kts", "scala", "cs", "m", "mm", "zig", "v", "d",
            ],
        ),
        (
            "#",
            &[
                "py", "rb", "sh", "bash", "zsh", "pl", "pm", "r", "jl", "yaml", "yml", "toml",
                "coffee", "cr", "ex", "exs", "elixir",
            ],
        ),
        ("--", &["lua", "hs", "sql"]),
        (";", &["lisp", "clj", "cljs", "scm", "rkt", "el"]),
        ("%", &["erl", "hrl"]),
    ];

    let extension = file_path.rsplit('.').next().unwrap_or("");
    for (prefix, extensions) in PREFIX_TABLE {
        if extensions.iter().any(|known| *known == extension) {
            return *prefix;
        }
    }
    "//"
}
940
941fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
942    let mut content = String::new();
943    content.push_str("# Rejected Example\n\n");
944    content.push_str(&format!("## Name\n\n{}\n\n", response.name));
945    content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
946    content.push_str("## Edit History Hunks\n\n```diff\n");
947    for hunk in &response.edit_history_hunks {
948        content.push_str(hunk);
949        content.push_str("\n\n");
950    }
951    content.push_str("```\n\n");
952    content.push_str("## Expected Patch Hunks\n\n```diff\n");
953    for hunk in &response.expected_patch_hunks {
954        content.push_str(hunk);
955        content.push_str("\n\n");
956    }
957    content.push_str("```\n\n");
958    content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
959    content
960}