1use crate::{
2 anthropic_client::PlainLlmClient,
3 git::{ensure_repo_cloned, run_git},
4 paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
5 progress::{InfoStyle, Progress, Step, StepProgress},
6};
7use anthropic::ResponseContent;
8use anyhow::{Context as _, Result};
9use chrono::Local;
10use collections::{HashMap, HashSet};
11use edit_prediction::{
12 example_spec::ExampleSpec,
13 udiff::{apply_diff_to_string, edits_for_diff},
14};
15use futures::stream::{FuturesUnordered, StreamExt};
16use indoc::indoc;
17use serde::{Deserialize, Serialize};
18use std::{
19 path::{Path, PathBuf},
20 sync::Arc,
21};
22
/// Options controlling a synthesis run (see `run_synthesize`).
#[derive(Debug, Clone)]
pub struct SynthesizeConfig {
    /// Repositories to mine; each URL is processed by its own concurrent task.
    pub repo_urls: Vec<String>,
    /// Number of examples to generate per repository
    pub count: usize,
    /// Number of commits requested from `git log` per batch while walking a
    /// repository's history (this is a batch size, not a cap on the total).
    pub max_commits: usize,
    /// Directory that receives the generated example markdown files.
    pub output_dir: PathBuf,
    /// When true, previously saved state is ignored: the run starts from an
    /// empty state and already-processed commits are not skipped.
    pub fresh: bool,
}
32
/// Progress persisted between runs, serialized as JSON to `SYNTHESIZE_STATE_FILE`.
#[derive(Debug, Default, Serialize, Deserialize)]
struct SynthesizeState {
    /// Per-repository progress, keyed by repository URL.
    repositories: HashMap<String, RepoState>,
}
37
/// Progress recorded for a single repository.
#[derive(Debug, Default, Serialize, Deserialize)]
struct RepoState {
    /// SHAs of commits that have already been analyzed and should be skipped
    /// on later (non-fresh) runs.
    processed_commits: HashSet<String>,
    /// Running total of the example counts passed to `mark_processed`.
    examples_generated: usize,
}
43
44impl SynthesizeState {
45 fn load() -> Self {
46 if SYNTHESIZE_STATE_FILE.exists() {
47 std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
48 .ok()
49 .and_then(|s| serde_json::from_str(&s).ok())
50 .unwrap_or_default()
51 } else {
52 Self::default()
53 }
54 }
55
56 fn save(&self) -> Result<()> {
57 let content = serde_json::to_string_pretty(self)?;
58 std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
59 Ok(())
60 }
61
62 fn take_repo_state(&mut self, repo_url: &str) -> RepoState {
63 self.repositories.remove(repo_url).unwrap_or_default()
64 }
65
66 fn merge_repo_state(&mut self, repo_url: String, repo_state: RepoState) {
67 self.repositories.insert(repo_url, repo_state);
68 }
69}
70
71impl RepoState {
72 fn is_processed(&self, commit_sha: &str) -> bool {
73 self.processed_commits.contains(commit_sha)
74 }
75
76 fn mark_processed(&mut self, commit_sha: &str, examples_count: usize) {
77 self.processed_commits.insert(commit_sha.to_string());
78 self.examples_generated += examples_count;
79 }
80}
81
/// A commit read from `git log`, together with the diffs used to screen and analyze it.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// First parent hash (first entry of `%P`).
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show --format=` with default context; used for the skip checks.
    diff: String,
    /// Output of `git show -U30 --format=` (30 lines of context); embedded in the prompt.
    expanded_diff: String,
}
90
/// Claude's response parsed into structured form
#[derive(Debug)]
struct ClaudeResponse {
    /// Text following the `NAME:` marker, or "unnamed example" when absent.
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when the section is missing).
    reasoning: String,
    /// Individual hunks collected from the diff blocks under `EDIT_HISTORY:`.
    edit_history_hunks: Vec<String>,
    /// Individual hunks collected from the diff blocks under `EXPECTED_PATCH:`.
    expected_patch_hunks: Vec<String>,
}
99
100pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
101 let mut state = if config.fresh {
102 SynthesizeState::default()
103 } else {
104 SynthesizeState::load()
105 };
106
107 std::fs::create_dir_all(&config.output_dir)?;
108 std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
109
110 // Create "latest_failed" symlink pointing to this run's failed directory
111 if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
112 std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
113 }
114 #[cfg(unix)]
115 std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
116 #[cfg(windows)]
117 std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
118
119 let progress = Progress::global();
120 let total_examples = config.count * config.repo_urls.len();
121 progress.set_total_examples(total_examples);
122
123 let client = Arc::new(PlainLlmClient::new()?);
124 let config = Arc::new(config);
125
126 let mut futures: FuturesUnordered<_> = config
127 .repo_urls
128 .iter()
129 .map(|repo_url| {
130 let client = client.clone();
131 let repo_state = state.take_repo_state(repo_url);
132 let config = config.clone();
133 let repo_url = repo_url.clone();
134 async move {
135 let result = synthesize_repo(&client, repo_state, &config, &repo_url).await;
136 (repo_url, result)
137 }
138 })
139 .collect();
140
141 let mut errors = Vec::new();
142 while let Some((repo_url, result)) = futures.next().await {
143 match result {
144 Ok(repo_state) => {
145 state.merge_repo_state(repo_url, repo_state);
146 }
147 Err(e) => {
148 errors.push(e);
149 }
150 }
151 }
152
153 state.save()?;
154
155 progress.finalize();
156
157 if let Some(first_error) = errors.into_iter().next() {
158 return Err(first_error);
159 }
160
161 Ok(())
162}
163
164async fn synthesize_repo(
165 client: &PlainLlmClient,
166 mut repo_state: RepoState,
167 config: &SynthesizeConfig,
168 repo_url: &str,
169) -> Result<RepoState> {
170 let progress = Progress::global();
171 let batch_size = config.max_commits;
172
173 let clone_progress = progress.start(Step::Synthesize, &format!("clone {}", repo_url));
174 let repo_path = ensure_repo_cloned(repo_url).await?;
175 drop(clone_progress);
176
177 let mut examples_generated = 0;
178 let mut commits_skipped = 0;
179
180 'outer: loop {
181 let list_progress = progress.start(
182 Step::Synthesize,
183 &format!("{}: list-commits", repo_name_from_url(repo_url)),
184 );
185 let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
186 drop(list_progress);
187
188 if commits.is_empty() {
189 break;
190 }
191
192 commits_skipped += commits.len();
193
194 for commit in commits {
195 if examples_generated >= config.count {
196 break 'outer;
197 }
198
199 if !config.fresh && repo_state.is_processed(&commit.sha) {
200 continue;
201 }
202
203 if should_skip_commit(&commit) {
204 continue;
205 }
206
207 let repo_name = repo_name_from_url(repo_url);
208 let commit_label = format!(
209 "{}: {} {}",
210 repo_name,
211 &commit.sha[..8],
212 truncate_message(&commit.message, 40)
213 );
214 let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
215
216 // Single Claude call to identify and copy hunks
217 step_progress.set_substatus("analyzing...");
218 let claude_response =
219 match analyze_commit(client, repo_url, &commit, step_progress.clone()).await {
220 Ok(Some(response)) => response,
221 Ok(None) => {
222 step_progress.set_info("no pattern", InfoStyle::Normal);
223 repo_state.mark_processed(&commit.sha, 0);
224 continue;
225 }
226 Err(e) => {
227 step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
228 repo_state.mark_processed(&commit.sha, 0);
229 continue;
230 }
231 };
232
233 // Validate and build the example
234 step_progress.set_substatus("validating...");
235 match build_example(repo_url, &commit, &repo_path, &claude_response).await {
236 Ok(spec) => {
237 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
238 let filename = format!("{}--{}.md", repo_name, timestamp);
239 let path = config.output_dir.join(&filename);
240 std::fs::write(&path, spec.to_markdown())?;
241 examples_generated += 1;
242 step_progress.set_info(filename, InfoStyle::Normal);
243 }
244 Err(rejection_reason) => {
245 log::debug!("Example rejected: {}", rejection_reason);
246 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
247 let filename = format!("{}--{}.md", repo_name, timestamp);
248 let path = FAILED_EXAMPLES_DIR.join(&filename);
249 let content = format_rejected_example(&claude_response, &rejection_reason);
250 if let Err(e) = std::fs::write(&path, content) {
251 log::warn!("Failed to write rejected example: {:?}", e);
252 }
253 step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
254 }
255 }
256
257 repo_state.mark_processed(&commit.sha, 1);
258 }
259 }
260
261 Ok(repo_state)
262}
263
/// Derives a short repository name from a clone URL.
///
/// The name is the last `/`-separated path segment with a single trailing
/// `.git` removed. Trailing slashes are ignored, so `https://host/org/repo/`
/// yields `repo` rather than an empty string. A URL without any `/` is
/// returned as-is (minus a `.git` suffix).
fn repo_name_from_url(url: &str) -> String {
    let trimmed = url.trim_end_matches('/');
    let last_segment = trimmed.rsplit('/').next().unwrap_or(trimmed);
    last_segment
        .strip_suffix(".git")
        .unwrap_or(last_segment)
        .to_string()
}
271
/// Returns the first line of `msg`, shortened to at most `max_len` bytes.
///
/// When the first line is longer than `max_len`, it is cut and `...` is
/// appended. The cut is moved back to the nearest UTF-8 character boundary so
/// that multi-byte characters never cause a panic. For `max_len < 3` the
/// result is just `...`, which is longer than `max_len`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    // Reserve room for the ellipsis; `saturating_sub` guards against tiny `max_len`.
    let mut cut = max_len.saturating_sub(3);
    // Offset 0 is always a boundary, so this loop terminates.
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
280
281fn should_skip_commit(commit: &CommitInfo) -> bool {
282 let lines_changed = commit
283 .diff
284 .lines()
285 .filter(|l| l.starts_with('+') || l.starts_with('-'))
286 .count();
287 lines_changed < 10
288 || lines_changed > 1000
289 || is_non_code_commit(commit)
290 || is_rename_commit(commit)
291}
292
293fn is_non_code_commit(commit: &CommitInfo) -> bool {
294 let non_code_extensions = [
295 ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
296 ".ico", ".woff", ".ttf", ".eot",
297 ];
298
299 let diff_files: Vec<&str> = commit
300 .diff
301 .lines()
302 .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
303 .filter_map(|l| {
304 l.strip_prefix("+++ b/")
305 .or_else(|| l.strip_prefix("--- a/"))
306 })
307 .collect();
308
309 if diff_files.is_empty() {
310 return false;
311 }
312
313 diff_files
314 .iter()
315 .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
316}
317
318fn is_rename_commit(commit: &CommitInfo) -> bool {
319 commit.diff.contains("similarity index")
320 || commit.diff.contains("rename from")
321 || commit.diff.contains("rename to")
322}
323
324async fn list_commits(
325 repo_path: &Path,
326 max_commits: usize,
327 skip: usize,
328) -> Result<Vec<CommitInfo>> {
329 let output = run_git(
330 repo_path,
331 &[
332 "log",
333 "--no-merges",
334 &format!("--skip={}", skip),
335 &format!("-{}", max_commits),
336 "--format=%H|%P|%s",
337 ],
338 )
339 .await?;
340
341 let mut commits = Vec::new();
342 for line in output.lines() {
343 let parts: Vec<&str> = line.splitn(3, '|').collect();
344 if parts.len() < 3 {
345 continue;
346 }
347 let sha = parts[0].to_string();
348 let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
349 if parent_sha.is_empty() {
350 continue;
351 }
352
353 // Get standard diff (for skip checks)
354 let diff = run_git(repo_path, &["show", "--format=", &sha])
355 .await
356 .unwrap_or_default();
357
358 // Get expanded diff with 30 lines of context
359 let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
360 .await
361 .unwrap_or_default();
362
363 commits.push(CommitInfo {
364 sha,
365 parent_sha,
366 message: parts[2].to_string(),
367 diff,
368 expanded_diff,
369 });
370 }
371
372 Ok(commits)
373}
374
/// Builds the prompt sent to Claude for a single commit.
///
/// The prompt explains what an edit prediction example is, embeds the
/// repository URL, commit SHA, commit message, and the 30-line-context diff,
/// and specifies the response format (`NO_PATTERN:`, `ANALYSIS:`, `NAME:`,
/// `EDIT_HISTORY:`, `EXPECTED_PATCH:`) that `parse_claude_response` reads.
fn build_prompt(repo_url: &str, commit: &CommitInfo) -> String {
    format!(
        indoc! {r#"
            You are analyzing a git commit to construct a realistic edit prediction example.

            Your goal is to tell the story of a programmer's editing session: what sequence of changes did they make, and what change logically comes next? We use these examples to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.

            An edit prediction example consists of:
            1. **Edit History**: 3-6 hunks showing what the programmer did BEFORE making the expected patch. This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
            2. **Expected Patch**: One small hunk that logically follows from the edit history.

            Both single-file and multi-file patterns are acceptable.

            ## What Makes a Good Example

            The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."

            GOOD examples (rich sequences with 3+ steps):
            - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
            - Adding a feature: type definition → first usage → second usage → (predict) third usage
            - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D

            BAD examples (respond NO_PATTERN):
            - Commits where all changes are independent (no narrative thread)
            - Simple find-and-replace (renaming, version bumps)
            - Documentation-only or config-only changes
            - Changes where you can only find 1-2 hunks for the edit history

            ## Commit Information

            Repository: {repo_url}
            Commit: {sha}
            Message: {message}

            ## Diff (30 lines context)

            ```diff
            {expanded_diff}
            ```

            ## Your Task

            First, THINK through whether this commit can support a good example:

            1. What is the high-level pattern in this commit?
            2. Can you identify at least 4 related hunks (3 for edit history + 1 for expected patch)?
            3. What would be the narrative? (First... then... then... finally predict...)
            4. Which specific hunk should be the expected patch (the "punchline")?

            If you cannot construct a coherent 3+ hunk story, respond with just:
            NO_PATTERN: <brief reason>

            If you CAN construct a good example, respond in this format:

            ANALYSIS:
            Pattern: <one sentence describing the pattern>
            Steps:
            1. <file:line-range> - <what this hunk does>
            2. <file:line-range> - <what this hunk does>
            3. <file:line-range> - <what this hunk does>
            4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>

            NAME: <short description, like a commit message, under 60 chars>

            EDIT_HISTORY:

            Hunk 1:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -15,7 +15,6 @@ class User:
                 """A user in the system.

                 Attributes:
            -        email: The user's email address.
                     name: The user's display name.
                 """
            ```

            Hunk 2:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -25,10 +24,9 @@ class User:
                 def __init__(
                     self,
                     name: str,
            -        email: str,
                     created_at: datetime,
                 ):
                     self.name = name
            -        self.email = email
                     self.created_at = created_at
            ```

            Hunk 3:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -42,7 +42,6 @@ def create_user(request):
                 data = request.json()
                 user = User(
                     name=data["name"],
            -        email=data["email"],
                     created_at=datetime.now(),
                 )
                 return user.save()
            ```

            EXPECTED_PATCH:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -58,7 +57,6 @@ def update_user(request, user_id):
                 user = User.get(user_id)
                 user.name = data.get("name", user.name)
            -    user.email = data.get("email", user.email)
                 user.save()
                 return user
            ```

            ## Requirements for the diffs

            Edit history:
            - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
            - Each hunk needs file headers (--- a/path and +++ b/path)
            - Hunks must be valid unified diffs that apply to the parent commit
            - Order hunks as a programmer would naturally make the changes

            Expected patch:
            - Must be a SINGLE hunk from a SINGLE file
            - Must be SMALL: 1-15 changed lines (not counting context)
            - Must be clearly predictable from the edit history narrative
        "#},
        repo_url = repo_url,
        sha = commit.sha,
        message = commit.message,
        expanded_diff = commit.expanded_diff,
    )
}
515
516async fn analyze_commit(
517 client: &PlainLlmClient,
518 repo_url: &str,
519 commit: &CommitInfo,
520 step_progress: Arc<StepProgress>,
521) -> Result<Option<ClaudeResponse>> {
522 use anthropic::{Message, RequestContent, Role};
523
524 let prompt = build_prompt(repo_url, commit);
525 let messages = vec![Message {
526 role: Role::User,
527 content: vec![RequestContent::Text {
528 text: prompt,
529 cache_control: None,
530 }],
531 }];
532
533 let response = client
534 .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
535 step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
536 })
537 .await?;
538
539 // Extract text content from response
540 let response_text: String = response
541 .content
542 .iter()
543 .filter_map(|block| {
544 if let ResponseContent::Text { text } = block {
545 Some(text.as_str())
546 } else {
547 None
548 }
549 })
550 .collect::<Vec<_>>()
551 .join("\n");
552
553 parse_claude_response(&response_text)
554}
555
556fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
557 // Check for NO_PATTERN
558 if response.contains("NO_PATTERN:") {
559 return Ok(None);
560 }
561
562 // Parse NAME
563 let name = response
564 .lines()
565 .find(|l| l.starts_with("NAME:"))
566 .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
567 .unwrap_or_else(|| "unnamed example".to_string());
568
569 // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
570 let reasoning = extract_section(
571 response,
572 "ANALYSIS:",
573 &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
574 )
575 .unwrap_or_default();
576
577 // Parse EDIT_HISTORY diff block
578 let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
579
580 // Parse EXPECTED_PATCH diff block
581 let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
582
583 if edit_history_hunks.is_empty() {
584 anyhow::bail!("No edit history hunks found in response");
585 }
586 if expected_patch_hunks.is_empty() {
587 anyhow::bail!("No expected patch hunks found in response");
588 }
589
590 Ok(Some(ClaudeResponse {
591 name,
592 reasoning,
593 edit_history_hunks,
594 expected_patch_hunks,
595 }))
596}
597
/// Returns the trimmed text between the first occurrence of `start_marker`
/// and the earliest following occurrence of any of `end_markers`.
///
/// When none of the end markers follows, the section runs to the end of
/// `text`. Returns `None` when `start_marker` does not occur.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, remainder) = text.split_once(start_marker)?;

    let mut section_len = remainder.len();
    for marker in end_markers {
        if let Some(position) = remainder.find(marker) {
            section_len = section_len.min(position);
        }
    }

    Some(remainder[..section_len].trim().to_string())
}
611
612fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
613 let section_start = text
614 .find(section_marker)
615 .context(format!("Section {} not found", section_marker))?;
616
617 let after_marker = &text[section_start + section_marker.len()..];
618
619 // Find where the next major section starts (to bound our search)
620 let section_end = ["EXPECTED_PATCH:", "## "]
621 .iter()
622 .filter(|&&m| m != section_marker)
623 .filter_map(|marker| after_marker.find(marker))
624 .min()
625 .unwrap_or(after_marker.len());
626
627 let section_content = &after_marker[..section_end];
628
629 // Collect all ```diff blocks in this section
630 let mut hunks = Vec::new();
631 let mut search_start = 0;
632
633 while let Some(diff_start) = section_content[search_start..].find("```diff") {
634 let abs_diff_start = search_start + diff_start;
635 let block_content_start = section_content[abs_diff_start..]
636 .find('\n')
637 .map(|i| abs_diff_start + i + 1)
638 .unwrap_or(abs_diff_start);
639
640 if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
641 let block_end = block_content_start + block_end_rel;
642 let diff_content = section_content[block_content_start..block_end].trim();
643
644 // Split this block into hunks (in case multiple hunks in one block)
645 hunks.extend(split_into_hunks(diff_content));
646
647 search_start = block_end + 3;
648 } else {
649 break;
650 }
651 }
652
653 if hunks.is_empty() {
654 anyhow::bail!("No diff blocks found in section {}", section_marker);
655 }
656
657 Ok(hunks)
658}
659
/// Splits a unified diff into one string per hunk.
///
/// Each returned hunk starts with the file header (`---` / `+++` lines) that
/// was in effect when it began, followed by its `@@` line and body. Hunks
/// that appear before any file header are returned without one. Lines outside
/// of a hunk that are not file headers are dropped.
fn split_into_hunks(diff: &str) -> Vec<String> {
    /// Emits the pending hunk lines (prefixed by the file header, if any) and
    /// clears them. Does nothing when no lines are pending.
    fn flush(header: Option<&str>, pending: &mut Vec<&str>, out: &mut Vec<String>) {
        if pending.is_empty() {
            return;
        }
        let mut text = String::new();
        if let Some(header) = header {
            text.push_str(header);
            text.push('\n');
        }
        text.push_str(&pending.join("\n"));
        out.push(text);
        pending.clear();
    }

    let mut hunks = Vec::new();
    let mut file_header: Option<String> = None;
    let mut pending: Vec<&str> = Vec::new();
    let mut in_hunk = false;

    for line in diff.lines() {
        let is_old_file_line = line.starts_with("--- a/") || line.starts_with("--- /");
        let is_new_file_line = line.starts_with("+++ b/") || line.starts_with("+++ /");

        if is_old_file_line {
            // A new file header closes the hunk that was being collected.
            if in_hunk {
                flush(file_header.as_deref(), &mut pending, &mut hunks);
            }
            file_header = Some(line.to_string());
            in_hunk = false;
        } else if is_new_file_line {
            if let Some(header) = file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // A new hunk closes the previous one.
            if in_hunk {
                flush(file_header.as_deref(), &mut pending, &mut hunks);
            }
            pending.push(line);
            in_hunk = true;
        } else if in_hunk {
            pending.push(line);
        }
    }

    flush(file_header.as_deref(), &mut pending, &mut hunks);
    hunks
}
719
720/// Validate Claude's output by applying diffs and build the ExampleSpec
721async fn build_example(
722 repo_url: &str,
723 commit: &CommitInfo,
724 repo_path: &Path,
725 response: &ClaudeResponse,
726) -> Result<ExampleSpec, String> {
727 // Validate expected patch hunks
728 if response.expected_patch_hunks.len() != 1 {
729 return Err(format!(
730 "Expected exactly 1 expected patch hunk, got {}",
731 response.expected_patch_hunks.len()
732 ));
733 }
734
735 // Parse the expected patch to determine cursor file
736 let expected_patch = &response.expected_patch_hunks[0];
737 let cursor_file = extract_file_from_hunk(expected_patch)
738 .ok_or_else(|| "Could not determine file from expected patch".to_string())?;
739
740 // Get the file content before the commit
741 let before_content = run_git(
742 repo_path,
743 &["show", &format!("{}^:{}", commit.sha, cursor_file)],
744 )
745 .await
746 .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;
747
748 // Build edit history diff from Claude's hunks
749 let edit_history = response.edit_history_hunks.join("\n");
750
751 // Apply edit history to get intermediate state (validates edit history)
752 let intermediate_state =
753 apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;
754
755 // Validate expected patch applies to intermediate state
756 let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
757 apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
758 .map_err(|e| format!("Expected patch failed to apply: {}", e))?;
759
760 // Find where the expected patch edits would apply in the intermediate state
761 let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
762 .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
763 if edits.is_empty() {
764 return Err(
765 "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
766 );
767 }
768
769 // Use the start of the first edit for cursor positioning
770 let cursor_byte_offset = edits[0].0.start;
771
772 // Extract excerpt around the edit location
773 let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;
774
775 // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
776 let comment_prefix = line_comment_prefix(&cursor_file);
777 let reasoning_with_source = format!(
778 "Source commit: {} ({})\n\n{}",
779 commit.sha,
780 truncate_message(&commit.message, 60),
781 response.reasoning
782 );
783 let mut spec = ExampleSpec {
784 name: response.name.clone(),
785 repository_url: repo_url.to_string(),
786 revision: commit.parent_sha.clone(),
787 tags: Vec::new(),
788 reasoning: Some(reasoning_with_source),
789 uncommitted_diff: String::new(),
790 cursor_path: Arc::from(Path::new(&cursor_file)),
791 cursor_position: String::new(),
792 edit_history,
793 expected_patches: vec![expected_patch_with_header],
794 rejected_patch: None,
795 };
796 spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);
797
798 Ok(spec)
799}
800
/// Returns the path from the first `+++ b/<path>` or `--- a/<path>` line in
/// `hunk`, whichever appears first, or `None` when the hunk has neither.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(str::to_string)
}
813
/// Returns `hunk` unchanged when it already contains a `--- a/` or `+++ b/`
/// marker; otherwise prepends both file header lines for `file_path`.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if has_header {
        hunk.to_string()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
821
822/// Apply edit history to file content, only if hunks affect this file
823fn apply_edit_history_to_content(
824 content: &str,
825 edit_history: &str,
826 cursor_file: &str,
827) -> Result<String, String> {
828 // Extract just the hunks for this file from the edit history
829 let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
830
831 if file_diff.is_empty() {
832 return Ok(content.to_string());
833 }
834
835 apply_diff_to_string(&file_diff, content)
836 .map_err(|e| format!("Failed to apply edit history: {}", e))
837}
838
/// Extracts, from a diff that may cover several files, the file headers and
/// hunks that belong to `target_file`.
///
/// A section starts at a `--- a/<target_file>` line and is copied (each line
/// followed by `\n`) once its `+++ b/` line has been seen. It ends at the
/// next `--- a/` header for a different file or at a `--- /dev/null` header
/// (a newly created file), so hunks of a new file that follow the target's
/// hunks are not mixed into the result. Returns an empty string when the
/// target file does not appear.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;

    for line in combined_diff.lines() {
        if let Some(old_path) = line.strip_prefix("--- a/") {
            // Start of a file section: decide whether it is the one we want.
            in_target_file = old_path == target_file;
            found_header = false;
            if in_target_file {
                result.push_str(line);
                result.push('\n');
            }
        } else if line.starts_with("--- /dev/null") {
            // Header of a newly created file: it cannot be the target (which is
            // matched by its `--- a/` path), so the current section ends here.
            in_target_file = false;
            found_header = false;
        } else if in_target_file && line.starts_with("+++ b/") {
            result.push_str(line);
            result.push('\n');
            found_header = true;
        } else if in_target_file && found_header {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
869
/// Extract a cursor position excerpt from content around a byte offset.
///
/// The excerpt consists of up to 3 lines before the cursor line, the cursor
/// line itself, and up to 4 lines after it, joined with `\n` and without a
/// trailing newline. Returns the excerpt and the cursor's byte offset within
/// the excerpt.
///
/// # Errors
///
/// Returns an error (instead of panicking) when `cursor_byte_offset` lies
/// beyond the end of `content` or inside a multi-byte character.
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    const LINES_BEFORE: usize = 3;
    const LINES_AFTER: usize = 4;

    // `is_char_boundary` is false for offsets past the end as well.
    if !content.is_char_boundary(cursor_byte_offset) {
        return Err(format!(
            "Cursor offset {} is out of bounds or not on a character boundary (content length {})",
            cursor_byte_offset,
            content.len()
        ));
    }

    // Find the line containing the cursor
    let (before_cursor, from_cursor) = content.split_at(cursor_byte_offset);
    let line_start = before_cursor.rfind('\n').map_or(0, |pos| pos + 1);
    let line_end = from_cursor
        .find('\n')
        .map_or(content.len(), |pos| cursor_byte_offset + pos);

    // Context lines before the cursor line (the last few of everything preceding it)
    let preceding_lines: Vec<&str> = content[..line_start].lines().collect();
    let first_kept = preceding_lines.len().saturating_sub(LINES_BEFORE);
    let context_before = &preceding_lines[first_kept..];

    // Context lines after the cursor line; step over its newline when there is one
    let after_line_end = (line_end + 1).min(content.len());
    let context_after = content[after_line_end..].lines().take(LINES_AFTER);

    let cursor_line = &content[line_start..line_end];
    let cursor_column = cursor_byte_offset - line_start;

    // Build the excerpt
    let mut excerpt = String::new();
    for line in context_before {
        excerpt.push_str(line);
        excerpt.push('\n');
    }
    // The cursor sits `cursor_column` bytes into the line that is appended next.
    let cursor_offset_in_excerpt = excerpt.len() + cursor_column;
    excerpt.push_str(cursor_line);
    excerpt.push('\n');
    for line in context_after {
        excerpt.push_str(line);
        excerpt.push('\n');
    }

    // Trim trailing newline
    if excerpt.ends_with('\n') {
        excerpt.pop();
    }

    Ok((excerpt, cursor_offset_in_excerpt))
}
925
/// Get the line comment prefix for a file based on its extension.
///
/// The extension is taken from the final path component and compared
/// case-insensitively, so `analysis.R` is recognized the same way as
/// `analysis.r`. Files without an extension, or with an unknown one, fall
/// back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    let extension = Path::new(file_path)
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();

    match extension.as_str() {
        "rs" | "c" | "cpp" | "cc" | "h" | "hpp" | "js" | "ts" | "tsx" | "jsx" | "go" | "java"
        | "swift" | "kt" | "kts" | "scala" | "cs" | "m" | "mm" | "zig" | "v" | "d" => "//",
        "py" | "rb" | "sh" | "bash" | "zsh" | "pl" | "pm" | "r" | "jl" | "yaml" | "yml"
        | "toml" | "coffee" | "cr" | "ex" | "exs" | "elixir" => "#",
        "lua" | "hs" | "sql" => "--",
        "lisp" | "clj" | "cljs" | "scm" | "rkt" | "el" => ";",
        "erl" | "hrl" => "%",
        _ => "//",
    }
}
940
941fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
942 let mut content = String::new();
943 content.push_str("# Rejected Example\n\n");
944 content.push_str(&format!("## Name\n\n{}\n\n", response.name));
945 content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
946 content.push_str("## Edit History Hunks\n\n```diff\n");
947 for hunk in &response.edit_history_hunks {
948 content.push_str(hunk);
949 content.push_str("\n\n");
950 }
951 content.push_str("```\n\n");
952 content.push_str("## Expected Patch Hunks\n\n```diff\n");
953 for hunk in &response.expected_patch_hunks {
954 content.push_str(hunk);
955 content.push_str("\n\n");
956 }
957 content.push_str("```\n\n");
958 content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
959 content
960}