synthesize.rs

  1use crate::{
  2    anthropic_client::PlainLlmClient,
  3    git::{ensure_repo_cloned, run_git},
  4    paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
  5    progress::{InfoStyle, Progress, Step, StepProgress},
  6};
  7use anthropic::ResponseContent;
  8use anyhow::{Context as _, Result};
  9use chrono::Local;
 10use collections::{HashMap, HashSet};
 11use edit_prediction::{
 12    example_spec::ExampleSpec,
 13    udiff::{apply_diff_to_string, edits_for_diff},
 14};
 15use indoc::indoc;
 16use serde::{Deserialize, Serialize};
 17use std::{
 18    path::{Path, PathBuf},
 19    sync::Arc,
 20};
 21
/// Settings for one `run_synthesize` invocation.
#[derive(Debug, Clone)]
pub struct SynthesizeConfig {
    /// Repository to clone; also the key under which progress is stored in the saved state.
    pub repo_url: String,
    /// Number of examples to write before the run stops.
    pub count: usize,
    /// Number of commits requested from `git log` per batch.
    pub max_commits: usize,
    /// Directory (created if missing) that receives the generated example files.
    pub output_dir: PathBuf,
    /// When set, start from an empty state and do not skip previously processed commits.
    pub fresh: bool,
}
 30
/// Progress persisted as JSON in `SYNTHESIZE_STATE_FILE` between runs.
#[derive(Debug, Default, Serialize, Deserialize)]
struct SynthesizeState {
    /// Per-repository progress, keyed by repository URL.
    repositories: HashMap<String, RepoState>,
}
 35
/// Progress recorded for a single repository.
#[derive(Debug, Default, Serialize, Deserialize)]
struct RepoState {
    /// SHAs of commits that have already been marked as processed.
    processed_commits: HashSet<String>,
    /// Running sum of the example counts passed to `mark_processed`.
    examples_generated: usize,
}
 41
 42impl SynthesizeState {
 43    fn load() -> Self {
 44        if SYNTHESIZE_STATE_FILE.exists() {
 45            std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
 46                .ok()
 47                .and_then(|s| serde_json::from_str(&s).ok())
 48                .unwrap_or_default()
 49        } else {
 50            Self::default()
 51        }
 52    }
 53
 54    fn save(&self) -> Result<()> {
 55        let content = serde_json::to_string_pretty(self)?;
 56        std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
 57        Ok(())
 58    }
 59
 60    fn is_processed(&self, repo_url: &str, commit_sha: &str) -> bool {
 61        self.repositories
 62            .get(repo_url)
 63            .is_some_and(|repo| repo.processed_commits.contains(commit_sha))
 64    }
 65
 66    fn mark_processed(&mut self, repo_url: &str, commit_sha: &str, examples_count: usize) {
 67        let repo = self.repositories.entry(repo_url.to_string()).or_default();
 68        repo.processed_commits.insert(commit_sha.to_string());
 69        repo.examples_generated += examples_count;
 70    }
 71}
 72
/// One non-merge commit together with the diffs fetched for it by `list_commits`.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// First parent hash taken from `%P`.
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show --format=` with default context; used for the skip heuristics.
    diff: String,
    /// Output of `git show -U30 --format=`; this is the diff embedded in the prompt.
    expanded_diff: String,
}
 81
/// Claude's response parsed into structured form
#[derive(Debug)]
struct ClaudeResponse {
    /// Text after the first `NAME:` line, or "unnamed example" when absent.
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when the section is missing).
    reasoning: String,
    /// Hunks collected from the diff blocks under `EDIT_HISTORY:`.
    edit_history_hunks: Vec<String>,
    /// Hunks collected from the diff blocks under `EXPECTED_PATCH:`.
    expected_patch_hunks: Vec<String>,
}
 90
 91pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
 92    let mut state = if config.fresh {
 93        SynthesizeState::default()
 94    } else {
 95        SynthesizeState::load()
 96    };
 97
 98    std::fs::create_dir_all(&config.output_dir)?;
 99    std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
100
101    // Create "latest_failed" symlink pointing to this run's failed directory
102    if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
103        std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
104    }
105    #[cfg(unix)]
106    std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
107    #[cfg(windows)]
108    std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
109
110    let progress = Progress::global();
111    progress.set_total_examples(config.count);
112
113    let clone_progress = progress.start(Step::Synthesize, "clone");
114    let repo_path = ensure_repo_cloned(&config.repo_url).await?;
115    drop(clone_progress);
116
117    let client = PlainLlmClient::new()?;
118    let mut examples_generated = 0;
119    let mut commits_skipped = 0;
120    let batch_size = config.max_commits;
121
122    'outer: loop {
123        let list_progress = progress.start(Step::Synthesize, "list-commits");
124        let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
125        drop(list_progress);
126
127        if commits.is_empty() {
128            break;
129        }
130
131        commits_skipped += commits.len();
132
133        for commit in commits {
134            if examples_generated >= config.count {
135                break 'outer;
136            }
137
138            if !config.fresh && state.is_processed(&config.repo_url, &commit.sha) {
139                continue;
140            }
141
142            if should_skip_commit(&commit) {
143                continue;
144            }
145
146            let commit_label = format!(
147                "{} {}",
148                &commit.sha[..8],
149                truncate_message(&commit.message, 40)
150            );
151            let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
152
153            // Single Claude call to identify and copy hunks
154            step_progress.set_substatus("analyzing...");
155            let claude_response =
156                match analyze_commit(&client, &config, &commit, step_progress.clone()).await {
157                    Ok(Some(response)) => response,
158                    Ok(None) => {
159                        step_progress.set_info("no pattern", InfoStyle::Normal);
160                        state.mark_processed(&config.repo_url, &commit.sha, 0);
161                        state.save()?;
162                        continue;
163                    }
164                    Err(e) => {
165                        step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
166                        state.mark_processed(&config.repo_url, &commit.sha, 0);
167                        state.save()?;
168                        continue;
169                    }
170                };
171
172            // Validate and build the example
173            step_progress.set_substatus("validating...");
174            match build_example(&config, &commit, &repo_path, &claude_response).await {
175                Ok(spec) => {
176                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
177                    let filename = format!("{}.md", timestamp);
178                    let path = config.output_dir.join(&filename);
179                    std::fs::write(&path, spec.to_markdown())?;
180                    examples_generated += 1;
181                    step_progress.set_info(filename, InfoStyle::Normal);
182                }
183                Err(rejection_reason) => {
184                    log::debug!("Example rejected: {}", rejection_reason);
185                    let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
186                    let filename = format!("{}.md", timestamp);
187                    let path = FAILED_EXAMPLES_DIR.join(&filename);
188                    let content = format_rejected_example(&claude_response, &rejection_reason);
189                    if let Err(e) = std::fs::write(&path, content) {
190                        log::warn!("Failed to write rejected example: {:?}", e);
191                    }
192                    step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
193                }
194            }
195
196            state.mark_processed(&config.repo_url, &commit.sha, 1);
197            state.save()?;
198        }
199    }
200
201    progress.finalize();
202    Ok(())
203}
204
/// Returns the first line of `msg`, shortened to at most `max_len` bytes.
///
/// Lines that fit are returned unchanged. Longer lines are cut so that the
/// kept prefix plus a trailing `...` fits in `max_len` bytes; the cut is moved
/// back to the nearest UTF-8 character boundary so multi-byte characters are
/// never split. When `max_len` is smaller than 3 the result is just `...`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    // `cut` is strictly below `first_line.len()` here, and offset 0 is always
    // a character boundary, so the loop terminates without underflow.
    let mut cut = max_len.saturating_sub(3);
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
213
214fn should_skip_commit(commit: &CommitInfo) -> bool {
215    let lines_changed = commit
216        .diff
217        .lines()
218        .filter(|l| l.starts_with('+') || l.starts_with('-'))
219        .count();
220    lines_changed < 10
221        || lines_changed > 1000
222        || is_non_code_commit(commit)
223        || is_rename_commit(commit)
224}
225
226fn is_non_code_commit(commit: &CommitInfo) -> bool {
227    let non_code_extensions = [
228        ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
229        ".ico", ".woff", ".ttf", ".eot",
230    ];
231
232    let diff_files: Vec<&str> = commit
233        .diff
234        .lines()
235        .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
236        .filter_map(|l| {
237            l.strip_prefix("+++ b/")
238                .or_else(|| l.strip_prefix("--- a/"))
239        })
240        .collect();
241
242    if diff_files.is_empty() {
243        return false;
244    }
245
246    diff_files
247        .iter()
248        .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
249}
250
251fn is_rename_commit(commit: &CommitInfo) -> bool {
252    commit.diff.contains("similarity index")
253        || commit.diff.contains("rename from")
254        || commit.diff.contains("rename to")
255}
256
257async fn list_commits(
258    repo_path: &Path,
259    max_commits: usize,
260    skip: usize,
261) -> Result<Vec<CommitInfo>> {
262    let output = run_git(
263        repo_path,
264        &[
265            "log",
266            "--no-merges",
267            &format!("--skip={}", skip),
268            &format!("-{}", max_commits),
269            "--format=%H|%P|%s",
270        ],
271    )
272    .await?;
273
274    let mut commits = Vec::new();
275    for line in output.lines() {
276        let parts: Vec<&str> = line.splitn(3, '|').collect();
277        if parts.len() < 3 {
278            continue;
279        }
280        let sha = parts[0].to_string();
281        let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
282        if parent_sha.is_empty() {
283            continue;
284        }
285
286        // Get standard diff (for skip checks)
287        let diff = run_git(repo_path, &["show", "--format=", &sha])
288            .await
289            .unwrap_or_default();
290
291        // Get expanded diff with 30 lines of context
292        let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
293            .await
294            .unwrap_or_default();
295
296        commits.push(CommitInfo {
297            sha,
298            parent_sha,
299            message: parts[2].to_string(),
300            diff,
301            expanded_diff,
302        });
303    }
304
305    Ok(commits)
306}
307
/// Builds the prompt sent to Claude for a single commit.
///
/// The template is de-indented by `indoc!` and filled with the repository
/// URL, the commit SHA and subject, and the commit's expanded diff. The
/// response format it requests (`NO_PATTERN:`, `ANALYSIS:`, `NAME:`,
/// `EDIT_HISTORY:`, `EXPECTED_PATCH:` with fenced `diff` blocks) is the format
/// `parse_claude_response` looks for.
fn build_prompt(config: &SynthesizeConfig, commit: &CommitInfo) -> String {
    format!(
        indoc! {r#"
            You are analyzing a git commit to construct a realistic edit prediction example.

            Your goal is to tell the story of a programmer's editing session: what sequence of changes did they make, and what change logically comes next? We use these examples to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.

            An edit prediction example consists of:
            1. **Edit History**: 3-6 hunks showing what the programmer did BEFORE making the expected patch. This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
            2. **Expected Patch**: One small hunk that logically follows from the edit history.

            Both single-file and multi-file patterns are acceptable.

            ## What Makes a Good Example

            The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."

            GOOD examples (rich sequences with 3+ steps):
            - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
            - Adding a feature: type definition → first usage → second usage → (predict) third usage
            - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D

            BAD examples (respond NO_PATTERN):
            - Commits where all changes are independent (no narrative thread)
            - Simple find-and-replace (renaming, version bumps)
            - Documentation-only or config-only changes
            - Changes where you can only find 1-2 hunks for the edit history

            ## Commit Information

            Repository: {repo_url}
            Commit: {sha}
            Message: {message}

            ## Diff (30 lines context)

            ```diff
            {expanded_diff}
            ```

            ## Your Task

            First, THINK through whether this commit can support a good example:

            1. What is the high-level pattern in this commit?
            2. Can you identify at least 4 related hunks (3 for edit history + 1 for expected patch)?
            3. What would be the narrative? (First... then... then... finally predict...)
            4. Which specific hunk should be the expected patch (the "punchline")?

            If you cannot construct a coherent 3+ hunk story, respond with just:
            NO_PATTERN: <brief reason>

            If you CAN construct a good example, respond in this format:

            ANALYSIS:
            Pattern: <one sentence describing the pattern>
            Steps:
            1. <file:line-range> - <what this hunk does>
            2. <file:line-range> - <what this hunk does>
            3. <file:line-range> - <what this hunk does>
            4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>

            NAME: <short description, like a commit message, under 60 chars>

            EDIT_HISTORY:

            Hunk 1:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -15,7 +15,6 @@ class User:
                 """A user in the system.

                 Attributes:
            -        email: The user's email address.
                     name: The user's display name.
                 """
            ```

            Hunk 2:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -25,10 +24,9 @@ class User:
                 def __init__(
                     self,
                     name: str,
            -        email: str,
                     created_at: datetime,
                 ):
                     self.name = name
            -        self.email = email
                     self.created_at = created_at
            ```

            Hunk 3:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -42,7 +42,6 @@ def create_user(request):
                 data = request.json()
                 user = User(
                     name=data["name"],
            -        email=data["email"],
                     created_at=datetime.now(),
                 )
                 return user.save()
            ```

            EXPECTED_PATCH:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -58,7 +57,6 @@ def update_user(request, user_id):
                 user = User.get(user_id)
                 user.name = data.get("name", user.name)
            -    user.email = data.get("email", user.email)
                 user.save()
                 return user
            ```

            ## Requirements for the diffs

            Edit history:
            - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
            - Each hunk needs file headers (--- a/path and +++ b/path)
            - Hunks must be valid unified diffs that apply to the parent commit
            - Order hunks as a programmer would naturally make the changes

            Expected patch:
            - Must be a SINGLE hunk from a SINGLE file
            - Must be SMALL: 1-15 changed lines (not counting context)
            - Must be clearly predictable from the edit history narrative
        "#},
        repo_url = config.repo_url,
        sha = commit.sha,
        message = commit.message,
        expanded_diff = commit.expanded_diff,
    )
}
448
449async fn analyze_commit(
450    client: &PlainLlmClient,
451    config: &SynthesizeConfig,
452    commit: &CommitInfo,
453    step_progress: Arc<StepProgress>,
454) -> Result<Option<ClaudeResponse>> {
455    use anthropic::{Message, RequestContent, Role};
456
457    let prompt = build_prompt(config, commit);
458    let messages = vec![Message {
459        role: Role::User,
460        content: vec![RequestContent::Text {
461            text: prompt,
462            cache_control: None,
463        }],
464    }];
465
466    let response = client
467        .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
468            step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
469        })
470        .await?;
471
472    // Extract text content from response
473    let response_text: String = response
474        .content
475        .iter()
476        .filter_map(|block| {
477            if let ResponseContent::Text { text } = block {
478                Some(text.as_str())
479            } else {
480                None
481            }
482        })
483        .collect::<Vec<_>>()
484        .join("\n");
485
486    parse_claude_response(&response_text)
487}
488
489fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
490    // Check for NO_PATTERN
491    if response.contains("NO_PATTERN:") {
492        return Ok(None);
493    }
494
495    // Parse NAME
496    let name = response
497        .lines()
498        .find(|l| l.starts_with("NAME:"))
499        .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
500        .unwrap_or_else(|| "unnamed example".to_string());
501
502    // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
503    let reasoning = extract_section(
504        response,
505        "ANALYSIS:",
506        &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
507    )
508    .unwrap_or_default();
509
510    // Parse EDIT_HISTORY diff block
511    let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
512
513    // Parse EXPECTED_PATCH diff block
514    let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
515
516    if edit_history_hunks.is_empty() {
517        anyhow::bail!("No edit history hunks found in response");
518    }
519    if expected_patch_hunks.is_empty() {
520        anyhow::bail!("No expected patch hunks found in response");
521    }
522
523    Ok(Some(ClaudeResponse {
524        name,
525        reasoning,
526        edit_history_hunks,
527        expected_patch_hunks,
528    }))
529}
530
/// Returns the trimmed text between the first occurrence of `start_marker`
/// and the earliest following occurrence of any of `end_markers` (or the end
/// of `text` when none follows). Returns `None` if `start_marker` is absent.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, tail) = text.split_once(start_marker)?;

    let mut cut = tail.len();
    for marker in end_markers {
        if let Some(position) = tail.find(*marker) {
            cut = cut.min(position);
        }
    }

    Some(tail[..cut].trim().to_string())
}
544
545fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
546    let section_start = text
547        .find(section_marker)
548        .context(format!("Section {} not found", section_marker))?;
549
550    let after_marker = &text[section_start + section_marker.len()..];
551
552    // Find where the next major section starts (to bound our search)
553    let section_end = ["EXPECTED_PATCH:", "## "]
554        .iter()
555        .filter(|&&m| m != section_marker)
556        .filter_map(|marker| after_marker.find(marker))
557        .min()
558        .unwrap_or(after_marker.len());
559
560    let section_content = &after_marker[..section_end];
561
562    // Collect all ```diff blocks in this section
563    let mut hunks = Vec::new();
564    let mut search_start = 0;
565
566    while let Some(diff_start) = section_content[search_start..].find("```diff") {
567        let abs_diff_start = search_start + diff_start;
568        let block_content_start = section_content[abs_diff_start..]
569            .find('\n')
570            .map(|i| abs_diff_start + i + 1)
571            .unwrap_or(abs_diff_start);
572
573        if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
574            let block_end = block_content_start + block_end_rel;
575            let diff_content = section_content[block_content_start..block_end].trim();
576
577            // Split this block into hunks (in case multiple hunks in one block)
578            hunks.extend(split_into_hunks(diff_content));
579
580            search_start = block_end + 3;
581        } else {
582            break;
583        }
584    }
585
586    if hunks.is_empty() {
587        anyhow::bail!("No diff blocks found in section {}", section_marker);
588    }
589
590    Ok(hunks)
591}
592
/// Split a diff block into individual hunks, preserving file headers.
///
/// Every `@@ ` line starts a new hunk. Each emitted hunk is prefixed with the
/// most recent `---`/`+++` file header seen before it was flushed, if any.
/// Lines that appear before the first `@@ ` line of a file are ignored.
fn split_into_hunks(diff: &str) -> Vec<String> {
    /// Emits the pending lines as one hunk (with `header` in front when
    /// present) and empties the buffer. Does nothing for an empty buffer.
    fn flush(header: Option<&str>, pending: &mut Vec<&str>, out: &mut Vec<String>) {
        if pending.is_empty() {
            return;
        }
        let body = pending.join("\n");
        out.push(match header {
            Some(header) => format!("{}\n{}", header, body),
            None => body,
        });
        pending.clear();
    }

    let mut hunks = Vec::new();
    let mut file_header: Option<String> = None;
    // Non-empty exactly while a hunk is being collected.
    let mut pending: Vec<&str> = Vec::new();

    for line in diff.lines() {
        if line.starts_with("--- a/") || line.starts_with("--- /") {
            // A new file header closes whatever hunk was in progress.
            flush(file_header.as_deref(), &mut pending, &mut hunks);
            file_header = Some(line.to_string());
        } else if line.starts_with("+++ b/") || line.starts_with("+++ /") {
            if let Some(header) = file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // A new hunk header closes the previous hunk of the same file.
            flush(file_header.as_deref(), &mut pending, &mut hunks);
            pending.push(line);
        } else if !pending.is_empty() {
            pending.push(line);
        }
    }

    flush(file_header.as_deref(), &mut pending, &mut hunks);
    hunks
}
652
/// Validate Claude's output by applying diffs and build the ExampleSpec
///
/// Steps, in order:
/// 1. Require exactly one expected-patch hunk and read the cursor file path
///    from its file header.
/// 2. Load that file as of `<commit.sha>^` via `git show`.
/// 3. Apply the edit-history hunks that target the cursor file to obtain the
///    intermediate state.
/// 4. Check that the expected patch applies to the intermediate state and
///    locate its edits there.
/// 5. Build the spec, with a cursor excerpt taken around the start of the
///    first located edit.
///
/// Returns `Err` carrying a human-readable rejection reason when any of these
/// steps fails.
async fn build_example(
    config: &SynthesizeConfig,
    commit: &CommitInfo,
    repo_path: &Path,
    response: &ClaudeResponse,
) -> Result<ExampleSpec, String> {
    // Validate expected patch hunks
    if response.expected_patch_hunks.len() != 1 {
        return Err(format!(
            "Expected exactly 1 expected patch hunk, got {}",
            response.expected_patch_hunks.len()
        ));
    }

    // Parse the expected patch to determine cursor file
    let expected_patch = &response.expected_patch_hunks[0];
    let cursor_file = extract_file_from_hunk(expected_patch)
        .ok_or_else(|| "Could not determine file from expected patch".to_string())?;

    // Get the file content before the commit
    let before_content = run_git(
        repo_path,
        &["show", &format!("{}^:{}", commit.sha, cursor_file)],
    )
    .await
    .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;

    // Build edit history diff from Claude's hunks
    let edit_history = response.edit_history_hunks.join("\n");

    // Apply edit history to get intermediate state (validates edit history)
    // Only the hunks that target the cursor file are applied here.
    let intermediate_state =
        apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;

    // Validate expected patch applies to intermediate state
    // The patched text itself is discarded; only success or failure matters.
    let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
    apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
        .map_err(|e| format!("Expected patch failed to apply: {}", e))?;

    // Find where the expected patch edits would apply in the intermediate state
    let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
        .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
    if edits.is_empty() {
        return Err(
            "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
        );
    }

    // Use the start of the first edit for cursor positioning
    let cursor_byte_offset = edits[0].0.start;

    // Extract excerpt around the edit location
    let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;

    // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
    let comment_prefix = line_comment_prefix(&cursor_file);
    // Prepend the source commit so the example can be traced back to it.
    let reasoning_with_source = format!(
        "Source commit: {} ({})\n\n{}",
        commit.sha,
        truncate_message(&commit.message, 60),
        response.reasoning
    );
    let mut spec = ExampleSpec {
        name: response.name.clone(),
        repository_url: config.repo_url.clone(),
        // The example is anchored at the parent commit, i.e. before the change.
        revision: commit.parent_sha.clone(),
        tags: Vec::new(),
        reasoning: Some(reasoning_with_source),
        uncommitted_diff: String::new(),
        cursor_path: Arc::from(Path::new(&cursor_file)),
        // Left empty here; `set_cursor_excerpt` below is called to fill it in.
        cursor_position: String::new(),
        edit_history,
        expected_patches: vec![expected_patch_with_header],
    };
    spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);

    Ok(spec)
}
732
/// Extract the file path from a hunk's header.
///
/// Scans the lines in order and returns the path of the first line that
/// starts with `+++ b/` or `--- a/`; `None` if no such line exists.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(str::to_string)
}
745
/// Ensure a hunk has file headers.
///
/// A hunk that already contains `--- a/` or `+++ b/` anywhere is returned
/// unchanged; otherwise headers naming `file_path` are put in front of it.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let already_has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if already_has_header {
        hunk.to_string()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
753
754/// Apply edit history to file content, only if hunks affect this file
755fn apply_edit_history_to_content(
756    content: &str,
757    edit_history: &str,
758    cursor_file: &str,
759) -> Result<String, String> {
760    // Extract just the hunks for this file from the edit history
761    let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
762
763    if file_diff.is_empty() {
764        return Ok(content.to_string());
765    }
766
767    apply_diff_to_string(&file_diff, content)
768        .map_err(|e| format!("Failed to apply edit history: {}", e))
769}
770
/// Extract hunks for a specific file from a combined diff.
///
/// Walks `combined_diff` line by line and copies every section whose
/// `--- a/<path>` header names `target_file`: the two header lines followed by
/// all lines up to the next file header. Sections for the same file that
/// appear more than once are all copied, in order. Each copied line is
/// terminated with `\n`; the result is empty when the file is not mentioned.
///
/// A `--- /dev/null` header (a newly created file) also ends the current
/// section. Without that, the created file's header and hunk would be
/// appended to the preceding target-file section.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;

    for line in combined_diff.lines() {
        if let Some(file) = line.strip_prefix("--- a/") {
            // Start of a section for an existing file.
            in_target_file = file == target_file;
            found_header = false;
            if in_target_file {
                result.push_str(line);
                result.push('\n');
            }
        } else if line.starts_with("--- /dev/null") {
            // Start of a section that creates a file: it has no `a/` side, so
            // it is never part of the target file's existing content.
            in_target_file = false;
            found_header = false;
        } else if in_target_file && line.starts_with("+++ b/") {
            result.push_str(line);
            result.push('\n');
            found_header = true;
        } else if in_target_file && found_header {
            // Hunk header or hunk body line of the target file.
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
801
/// Extract a cursor position excerpt from content around a byte offset.
/// Returns the excerpt and the cursor offset within the excerpt.
///
/// The excerpt consists of up to 3 lines before the cursor line, the cursor
/// line itself, and up to 4 lines after it, joined with `\n` and without a
/// trailing newline. The returned offset is a byte offset into the excerpt.
///
/// # Errors
///
/// Returns `Err` when `cursor_byte_offset` lies beyond the end of `content`
/// or does not fall on a UTF-8 character boundary.
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    // `is_char_boundary` is also false for offsets past the end, so this one
    // check guards every slice below against panicking.
    if !content.is_char_boundary(cursor_byte_offset) {
        return Err(format!(
            "Cursor offset {} is not a valid position in content of {} bytes",
            cursor_byte_offset,
            content.len()
        ));
    }

    // Find the line containing the cursor
    let line_start = content[..cursor_byte_offset]
        .rfind('\n')
        .map(|pos| pos + 1)
        .unwrap_or(0);
    let line_end = content[cursor_byte_offset..]
        .find('\n')
        .map(|pos| cursor_byte_offset + pos)
        .unwrap_or(content.len());

    // Up to 3 lines of context before the cursor line
    let lines_before: Vec<&str> = content[..line_start].lines().collect();
    let context_before = &lines_before[lines_before.len().saturating_sub(3)..];

    // Up to 4 lines of context after the cursor line (skipping its newline)
    let after_line_end = if line_end < content.len() {
        line_end + 1
    } else {
        line_end
    };
    let context_after = content[after_line_end..].lines().take(4);

    // The line containing the cursor
    let cursor_line = &content[line_start..line_end];
    let cursor_column = cursor_byte_offset - line_start;

    // Build the excerpt
    let mut excerpt = String::new();
    for line in context_before {
        excerpt.push_str(line);
        excerpt.push('\n');
    }
    // The cursor sits `cursor_column` bytes into the line appended next
    let cursor_offset_in_excerpt = excerpt.len() + cursor_column;
    excerpt.push_str(cursor_line);
    excerpt.push('\n');
    for line in context_after {
        excerpt.push_str(line);
        excerpt.push('\n');
    }

    // Trim trailing newline
    if excerpt.ends_with('\n') {
        excerpt.pop();
    }

    Ok((excerpt, cursor_offset_in_excerpt))
}
857
/// Get the line comment prefix for a file based on its extension.
///
/// The extension is whatever follows the last `.` in `file_path` (the whole
/// path when there is no `.`). Unknown extensions fall back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    const PREFIX_TABLE: &[(&str, &[&str])] = &[
        (
            "//",
            &[
                "rs", "c", "cpp", "cc", "h", "hpp", "js", "ts", "tsx", "jsx", "go", "java",
                "swift", "kt", "kts", "scala", "cs", "m", "mm", "zig", "v", "d",
            ],
        ),
        (
            "#",
            &[
                "py", "rb", "sh", "bash", "zsh", "pl", "pm", "r", "jl", "yaml", "yml", "toml",
                "coffee", "cr", "ex", "exs", "elixir",
            ],
        ),
        ("--", &["lua", "hs", "sql"]),
        (";", &["lisp", "clj", "cljs", "scm", "rkt", "el"]),
        ("%", &["erl", "hrl"]),
    ];

    let extension = file_path.rsplit('.').next().unwrap_or("");
    for (prefix, extensions) in PREFIX_TABLE {
        if extensions.iter().any(|known| *known == extension) {
            return *prefix;
        }
    }
    "//"
}
872
873fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
874    let mut content = String::new();
875    content.push_str("# Rejected Example\n\n");
876    content.push_str(&format!("## Name\n\n{}\n\n", response.name));
877    content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
878    content.push_str("## Edit History Hunks\n\n```diff\n");
879    for hunk in &response.edit_history_hunks {
880        content.push_str(hunk);
881        content.push_str("\n\n");
882    }
883    content.push_str("```\n\n");
884    content.push_str("## Expected Patch Hunks\n\n```diff\n");
885    for hunk in &response.expected_patch_hunks {
886        content.push_str(hunk);
887        content.push_str("\n\n");
888    }
889    content.push_str("```\n\n");
890    content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
891    content
892}