1use crate::{
2 anthropic_client::PlainLlmClient,
3 git::{ensure_repo_cloned, run_git},
4 paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
5 progress::{InfoStyle, Progress, Step, StepProgress},
6};
7use anthropic::ResponseContent;
8use anyhow::{Context as _, Result};
9use chrono::Local;
10use collections::{HashMap, HashSet};
11use edit_prediction::{
12 example_spec::ExampleSpec,
13 udiff::{apply_diff_to_string, edits_for_diff},
14};
15use futures::stream::{FuturesUnordered, StreamExt};
16use indoc::indoc;
17use serde::{Deserialize, Serialize};
18use std::{
19 path::{Path, PathBuf},
20 sync::Arc,
21};
22
/// Settings for a single synthesize run.
#[derive(Debug, Clone)]
pub struct SynthesizeConfig {
    /// URLs of the repositories to synthesize examples from.
    pub repo_urls: Vec<String>,
    /// Number of examples to generate per repository
    pub count: usize,
    /// Number of commits requested from `git log` per batch while scanning a repository.
    pub max_commits: usize,
    /// Directory that receives the generated example markdown files.
    pub output_dir: PathBuf,
    /// When true, the run starts from an empty state instead of loading the saved one.
    pub fresh: bool,
}
32
/// Progress persisted (as JSON) between synthesize runs.
#[derive(Debug, Default, Serialize, Deserialize)]
struct SynthesizeState {
    /// Per-repository progress, keyed by repository URL.
    repositories: HashMap<String, RepoState>,
}
37
/// Progress recorded for a single repository.
#[derive(Debug, Default, Serialize, Deserialize)]
struct RepoState {
    /// SHAs of commits that have already been handled and should not be revisited.
    processed_commits: HashSet<String>,
    /// Running total of the example counts passed to `mark_processed`.
    examples_generated: usize,
}
43
44impl SynthesizeState {
45 fn load() -> Self {
46 if SYNTHESIZE_STATE_FILE.exists() {
47 std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
48 .ok()
49 .and_then(|s| serde_json::from_str(&s).ok())
50 .unwrap_or_default()
51 } else {
52 Self::default()
53 }
54 }
55
56 fn save(&self) -> Result<()> {
57 let content = serde_json::to_string_pretty(self)?;
58 std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
59 Ok(())
60 }
61
62 fn take_repo_state(&mut self, repo_url: &str) -> RepoState {
63 self.repositories.remove(repo_url).unwrap_or_default()
64 }
65
66 fn merge_repo_state(&mut self, repo_url: String, repo_state: RepoState) {
67 self.repositories.insert(repo_url, repo_state);
68 }
69}
70
71impl RepoState {
72 fn is_processed(&self, commit_sha: &str) -> bool {
73 self.processed_commits.contains(commit_sha)
74 }
75
76 fn mark_processed(&mut self, commit_sha: &str, examples_count: usize) {
77 self.processed_commits.insert(commit_sha.to_string());
78 self.examples_generated += examples_count;
79 }
80}
81
/// A commit read from `git log`, together with its diffs from `git show`.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// Hash of the first parent listed by `%P`.
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show` with default context; used for the skip heuristics.
    diff: String,
    /// Output of `git show -U30`; this is the diff embedded in the prompt.
    expanded_diff: String,
}
90
/// Claude's response parsed into structured form
#[derive(Debug)]
struct ClaudeResponse {
    /// Text following `NAME:` in the response ("unnamed example" when absent).
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when absent).
    reasoning: String,
    /// Hunks extracted from the `EDIT_HISTORY:` section, each carrying its file header.
    edit_history_hunks: Vec<String>,
    /// Hunks extracted from the `EXPECTED_PATCH:` section.
    expected_patch_hunks: Vec<String>,
}
99
100pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
101 let mut state = if config.fresh {
102 SynthesizeState::default()
103 } else {
104 SynthesizeState::load()
105 };
106
107 std::fs::create_dir_all(&config.output_dir)?;
108 std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
109
110 // Create "latest_failed" symlink pointing to this run's failed directory
111 if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
112 std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
113 }
114 #[cfg(unix)]
115 std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
116 #[cfg(windows)]
117 std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
118
119 let progress = Progress::global();
120 let total_examples = config.count * config.repo_urls.len();
121 progress.set_total_examples(total_examples);
122
123 let client = Arc::new(PlainLlmClient::new()?);
124 let config = Arc::new(config);
125
126 let mut futures: FuturesUnordered<_> = config
127 .repo_urls
128 .iter()
129 .map(|repo_url| {
130 let client = client.clone();
131 let repo_state = state.take_repo_state(repo_url);
132 let config = config.clone();
133 let repo_url = repo_url.clone();
134 async move {
135 let result = synthesize_repo(&client, repo_state, &config, &repo_url).await;
136 (repo_url, result)
137 }
138 })
139 .collect();
140
141 let mut errors = Vec::new();
142 while let Some((repo_url, result)) = futures.next().await {
143 match result {
144 Ok(repo_state) => {
145 state.merge_repo_state(repo_url, repo_state);
146 }
147 Err(e) => {
148 errors.push(e);
149 }
150 }
151 }
152
153 state.save()?;
154
155 progress.finalize();
156
157 if let Some(first_error) = errors.into_iter().next() {
158 return Err(first_error);
159 }
160
161 Ok(())
162}
163
164async fn synthesize_repo(
165 client: &PlainLlmClient,
166 mut repo_state: RepoState,
167 config: &SynthesizeConfig,
168 repo_url: &str,
169) -> Result<RepoState> {
170 let progress = Progress::global();
171 let batch_size = config.max_commits;
172
173 let clone_progress = progress.start(Step::Synthesize, &format!("clone {}", repo_url));
174 let repo_path = ensure_repo_cloned(repo_url).await?;
175 drop(clone_progress);
176
177 let mut examples_generated = 0;
178 let mut commits_skipped = 0;
179
180 'outer: loop {
181 let list_progress = progress.start(
182 Step::Synthesize,
183 &format!("{}: list-commits", repo_name_from_url(repo_url)),
184 );
185 let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
186 drop(list_progress);
187
188 if commits.is_empty() {
189 break;
190 }
191
192 commits_skipped += commits.len();
193
194 for commit in commits {
195 if examples_generated >= config.count {
196 break 'outer;
197 }
198
199 if !config.fresh && repo_state.is_processed(&commit.sha) {
200 continue;
201 }
202
203 if should_skip_commit(&commit) {
204 continue;
205 }
206
207 let repo_name = repo_name_from_url(repo_url);
208 let commit_label = format!(
209 "{}: {} {}",
210 repo_name,
211 &commit.sha[..8],
212 truncate_message(&commit.message, 40)
213 );
214 let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
215
216 // Single Claude call to identify and copy hunks
217 step_progress.set_substatus("analyzing...");
218 let claude_response =
219 match analyze_commit(client, repo_url, &commit, step_progress.clone()).await {
220 Ok(Some(response)) => response,
221 Ok(None) => {
222 step_progress.set_info("no pattern", InfoStyle::Normal);
223 repo_state.mark_processed(&commit.sha, 0);
224 continue;
225 }
226 Err(e) => {
227 step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
228 repo_state.mark_processed(&commit.sha, 0);
229 continue;
230 }
231 };
232
233 // Validate and build the example
234 step_progress.set_substatus("validating...");
235 match build_example(repo_url, &commit, &repo_path, &claude_response).await {
236 Ok(spec) => {
237 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
238 let filename = format!("{}--{}.md", repo_name, timestamp);
239 let path = config.output_dir.join(&filename);
240 std::fs::write(&path, spec.to_markdown())?;
241 examples_generated += 1;
242 step_progress.set_info(filename, InfoStyle::Normal);
243 }
244 Err(rejection_reason) => {
245 log::debug!("Example rejected: {}", rejection_reason);
246 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
247 let filename = format!("{}--{}.md", repo_name, timestamp);
248 let path = FAILED_EXAMPLES_DIR.join(&filename);
249 let content = format_rejected_example(&claude_response, &rejection_reason);
250 if let Err(e) = std::fs::write(&path, content) {
251 log::warn!("Failed to write rejected example: {:?}", e);
252 }
253 step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
254 }
255 }
256
257 repo_state.mark_processed(&commit.sha, 1);
258 }
259 }
260
261 Ok(repo_state)
262}
263
/// Derives a short repository name from `url`: the last path segment with any
/// trailing `.git` removed.
///
/// Trailing slashes are ignored, so `https://host/org/repo/` yields `repo`
/// rather than an empty name.
fn repo_name_from_url(url: &str) -> String {
    let trimmed = url.trim_end_matches('/');
    trimmed
        .rsplit('/')
        .next()
        .unwrap_or(trimmed)
        .trim_end_matches(".git")
        .to_string()
}
271
/// Returns the first line of `msg`, shortened to roughly `max_len` bytes.
///
/// A first line that fits is returned unchanged. Otherwise it is cut to at most
/// `max_len - 3` bytes and `...` is appended. The cut backs off to the nearest
/// UTF-8 character boundary, so multi-byte text never causes a panic, and a
/// `max_len` below 3 yields just `...`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    // `cut` is below `first_line.len()` here, and offset 0 is always a boundary,
    // so the loop terminates and the slice is valid.
    let mut cut = max_len.saturating_sub(3);
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
280
281fn should_skip_commit(commit: &CommitInfo) -> bool {
282 let lines_changed = commit
283 .diff
284 .lines()
285 .filter(|l| l.starts_with('+') || l.starts_with('-'))
286 .count();
287 lines_changed < 10
288 || lines_changed > 1000
289 || is_non_code_commit(commit)
290 || is_rename_commit(commit)
291}
292
293fn is_non_code_commit(commit: &CommitInfo) -> bool {
294 let non_code_extensions = [
295 ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
296 ".ico", ".woff", ".ttf", ".eot",
297 ];
298
299 let diff_files: Vec<&str> = commit
300 .diff
301 .lines()
302 .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
303 .filter_map(|l| {
304 l.strip_prefix("+++ b/")
305 .or_else(|| l.strip_prefix("--- a/"))
306 })
307 .collect();
308
309 if diff_files.is_empty() {
310 return false;
311 }
312
313 diff_files
314 .iter()
315 .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
316}
317
318fn is_rename_commit(commit: &CommitInfo) -> bool {
319 commit.diff.contains("similarity index")
320 || commit.diff.contains("rename from")
321 || commit.diff.contains("rename to")
322}
323
324async fn list_commits(
325 repo_path: &Path,
326 max_commits: usize,
327 skip: usize,
328) -> Result<Vec<CommitInfo>> {
329 let output = run_git(
330 repo_path,
331 &[
332 "log",
333 "--no-merges",
334 &format!("--skip={}", skip),
335 &format!("-{}", max_commits),
336 "--format=%H|%P|%s",
337 ],
338 )
339 .await?;
340
341 let mut commits = Vec::new();
342 for line in output.lines() {
343 let parts: Vec<&str> = line.splitn(3, '|').collect();
344 if parts.len() < 3 {
345 continue;
346 }
347 let sha = parts[0].to_string();
348 let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
349 if parent_sha.is_empty() {
350 continue;
351 }
352
353 // Get standard diff (for skip checks)
354 let diff = run_git(repo_path, &["show", "--format=", &sha])
355 .await
356 .unwrap_or_default();
357
358 // Get expanded diff with 30 lines of context
359 let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
360 .await
361 .unwrap_or_default();
362
363 commits.push(CommitInfo {
364 sha,
365 parent_sha,
366 message: parts[2].to_string(),
367 diff,
368 expanded_diff,
369 });
370 }
371
372 Ok(commits)
373}
374
/// Builds the prompt sent to Claude for `commit`.
///
/// The prompt explains what an edit prediction example is, embeds the
/// repository URL, the commit's sha, message and expanded diff, and spells out
/// the response format (`NO_PATTERN:`, or `ANALYSIS:` / `NAME:` /
/// `EDIT_HISTORY:` / `EXPECTED_PATCH:`) that `parse_claude_response` reads back.
fn build_prompt(repo_url: &str, commit: &CommitInfo) -> String {
    format!(
        indoc! {r#"
            You are analyzing a git commit to construct a realistic edit prediction example.

            Your goal is to tell the story of a programmer's editing session: what sequence of changes did they make, and what change logically comes next? We use these examples to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.

            An edit prediction example consists of:
            1. **Edit History**: 3-6 hunks showing what the programmer did BEFORE making the expected patch. This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
            2. **Expected Patch**: One small hunk that logically follows from the edit history.

            Both single-file and multi-file patterns are acceptable.

            ## What Makes a Good Example

            The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."

            GOOD examples (rich sequences with 3+ steps):
            - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
            - Adding a feature: type definition → first usage → second usage → (predict) third usage
            - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D

            BAD examples (respond NO_PATTERN):
            - Commits where all changes are independent (no narrative thread)
            - Simple find-and-replace (renaming, version bumps)
            - Documentation-only or config-only changes
            - Changes where you can only find 1-2 hunks for the edit history

            ## Commit Information

            Repository: {repo_url}
            Commit: {sha}
            Message: {message}

            ## Diff (30 lines context)

            ```diff
            {expanded_diff}
            ```

            ## Your Task

            First, THINK through whether this commit can support a good example:

            1. What is the high-level pattern in this commit?
            2. Can you identify at least 4 related hunks (3 for edit history + 1 for expected patch)?
            3. What would be the narrative? (First... then... then... finally predict...)
            4. Which specific hunk should be the expected patch (the "punchline")?

            If you cannot construct a coherent 3+ hunk story, respond with just:
            NO_PATTERN: <brief reason>

            If you CAN construct a good example, respond in this format:

            ANALYSIS:
            Pattern: <one sentence describing the pattern>
            Steps:
            1. <file:line-range> - <what this hunk does>
            2. <file:line-range> - <what this hunk does>
            3. <file:line-range> - <what this hunk does>
            4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>

            NAME: <short description, like a commit message, under 60 chars>

            EDIT_HISTORY:

            Hunk 1:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -15,7 +15,6 @@ class User:
            """A user in the system.

            Attributes:
            - email: The user's email address.
            name: The user's display name.
            """
            ```

            Hunk 2:
            ```diff
            --- a/src/models/user.py
            +++ b/src/models/user.py
            @@ -25,10 +24,9 @@ class User:
            def __init__(
            self,
            name: str,
            - email: str,
            created_at: datetime,
            ):
            self.name = name
            - self.email = email
            self.created_at = created_at
            ```

            Hunk 3:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -42,7 +42,6 @@ def create_user(request):
            data = request.json()
            user = User(
            name=data["name"],
            - email=data["email"],
            created_at=datetime.now(),
            )
            return user.save()
            ```

            EXPECTED_PATCH:
            ```diff
            --- a/src/api/handlers.py
            +++ b/src/api/handlers.py
            @@ -58,7 +57,6 @@ def update_user(request, user_id):
            user = User.get(user_id)
            user.name = data.get("name", user.name)
            - user.email = data.get("email", user.email)
            user.save()
            return user
            ```

            ## Requirements for the diffs

            Edit history:
            - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
            - Each hunk needs file headers (--- a/path and +++ b/path)
            - Hunks must be valid unified diffs that apply to the parent commit
            - Order hunks as a programmer would naturally make the changes

            Expected patch:
            - Must be a SINGLE hunk from a SINGLE file
            - Must be SMALL: 1-15 changed lines (not counting context)
            - Must be clearly predictable from the edit history narrative
        "#},
        repo_url = repo_url,
        sha = commit.sha,
        message = commit.message,
        expanded_diff = commit.expanded_diff,
    )
}
515
516async fn analyze_commit(
517 client: &PlainLlmClient,
518 repo_url: &str,
519 commit: &CommitInfo,
520 step_progress: Arc<StepProgress>,
521) -> Result<Option<ClaudeResponse>> {
522 use anthropic::{Message, RequestContent, Role};
523
524 let prompt = build_prompt(repo_url, commit);
525 let messages = vec![Message {
526 role: Role::User,
527 content: vec![RequestContent::Text {
528 text: prompt,
529 cache_control: None,
530 }],
531 }];
532
533 let response = client
534 .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
535 step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
536 })
537 .await?;
538
539 // Extract text content from response
540 let response_text: String = response
541 .content
542 .iter()
543 .filter_map(|block| {
544 if let ResponseContent::Text { text } = block {
545 Some(text.as_str())
546 } else {
547 None
548 }
549 })
550 .collect::<Vec<_>>()
551 .join("\n");
552
553 parse_claude_response(&response_text)
554}
555
556fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
557 // Check for NO_PATTERN
558 if response.contains("NO_PATTERN:") {
559 return Ok(None);
560 }
561
562 // Parse NAME
563 let name = response
564 .lines()
565 .find(|l| l.starts_with("NAME:"))
566 .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
567 .unwrap_or_else(|| "unnamed example".to_string());
568
569 // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
570 let reasoning = extract_section(
571 response,
572 "ANALYSIS:",
573 &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
574 )
575 .unwrap_or_default();
576
577 // Parse EDIT_HISTORY diff block
578 let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
579
580 // Parse EXPECTED_PATCH diff block
581 let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
582
583 if edit_history_hunks.is_empty() {
584 anyhow::bail!("No edit history hunks found in response");
585 }
586 if expected_patch_hunks.is_empty() {
587 anyhow::bail!("No expected patch hunks found in response");
588 }
589
590 Ok(Some(ClaudeResponse {
591 name,
592 reasoning,
593 edit_history_hunks,
594 expected_patch_hunks,
595 }))
596}
597
/// Returns the trimmed text between the first `start_marker` and the nearest
/// following end marker (or the end of `text` when none follows).
///
/// Returns `None` when `start_marker` does not occur in `text`.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, after_marker) = text.split_once(start_marker)?;

    let section_len = end_markers
        .iter()
        .filter_map(|marker| after_marker.find(*marker))
        .min()
        .unwrap_or(after_marker.len());

    Some(after_marker[..section_len].trim().to_string())
}
611
612fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
613 let section_start = text
614 .find(section_marker)
615 .context(format!("Section {} not found", section_marker))?;
616
617 let after_marker = &text[section_start + section_marker.len()..];
618
619 // Find where the next major section starts (to bound our search)
620 let section_end = ["EXPECTED_PATCH:", "## "]
621 .iter()
622 .filter(|&&m| m != section_marker)
623 .filter_map(|marker| after_marker.find(marker))
624 .min()
625 .unwrap_or(after_marker.len());
626
627 let section_content = &after_marker[..section_end];
628
629 // Collect all ```diff blocks in this section
630 let mut hunks = Vec::new();
631 let mut search_start = 0;
632
633 while let Some(diff_start) = section_content[search_start..].find("```diff") {
634 let abs_diff_start = search_start + diff_start;
635 let block_content_start = section_content[abs_diff_start..]
636 .find('\n')
637 .map(|i| abs_diff_start + i + 1)
638 .unwrap_or(abs_diff_start);
639
640 if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
641 let block_end = block_content_start + block_end_rel;
642 let diff_content = section_content[block_content_start..block_end].trim();
643
644 // Split this block into hunks (in case multiple hunks in one block)
645 hunks.extend(split_into_hunks(diff_content));
646
647 search_start = block_end + 3;
648 } else {
649 break;
650 }
651 }
652
653 if hunks.is_empty() {
654 anyhow::bail!("No diff blocks found in section {}", section_marker);
655 }
656
657 Ok(hunks)
658}
659
/// Split a diff block into individual hunks, preserving file headers
///
/// Every returned hunk starts with the file header in effect when it was
/// flushed (the `---` line plus any `+++` lines seen for it), followed by its
/// `@@` line and body. Hunks that appear before any file header are returned
/// without one. Lines outside a hunk that are not headers are dropped.
fn split_into_hunks(diff: &str) -> Vec<String> {
    /// Appends the pending hunk (prefixed with `header`, if any) to `hunks` and
    /// clears `lines`; does nothing when no hunk is pending.
    fn flush_hunk(header: Option<&str>, lines: &mut Vec<&str>, hunks: &mut Vec<String>) {
        if lines.is_empty() {
            return;
        }
        let mut hunk_text = String::new();
        if let Some(header) = header {
            hunk_text.push_str(header);
            hunk_text.push('\n');
        }
        hunk_text.push_str(&lines.join("\n"));
        hunks.push(hunk_text);
        lines.clear();
    }

    let mut hunks = Vec::new();
    let mut current_file_header: Option<String> = None;
    // Non-empty exactly while inside a hunk: it always starts with the `@@` line.
    let mut current_hunk: Vec<&str> = Vec::new();

    for line in diff.lines() {
        if line.starts_with("--- a/") || line.starts_with("--- /") {
            // Start of a file header - the previous hunk belongs to the old header.
            flush_hunk(
                current_file_header.as_deref(),
                &mut current_hunk,
                &mut hunks,
            );
            current_file_header = Some(line.to_string());
        } else if line.starts_with("+++ b/") || line.starts_with("+++ /") {
            if let Some(header) = current_file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // New hunk - flush the previous one first.
            flush_hunk(
                current_file_header.as_deref(),
                &mut current_hunk,
                &mut hunks,
            );
            current_hunk.push(line);
        } else if !current_hunk.is_empty() {
            current_hunk.push(line);
        }
    }

    flush_hunk(
        current_file_header.as_deref(),
        &mut current_hunk,
        &mut hunks,
    );

    hunks
}
719
720/// Validate Claude's output by applying diffs and build the ExampleSpec
721async fn build_example(
722 repo_url: &str,
723 commit: &CommitInfo,
724 repo_path: &Path,
725 response: &ClaudeResponse,
726) -> Result<ExampleSpec, String> {
727 // Validate expected patch hunks
728 if response.expected_patch_hunks.len() != 1 {
729 return Err(format!(
730 "Expected exactly 1 expected patch hunk, got {}",
731 response.expected_patch_hunks.len()
732 ));
733 }
734
735 // Parse the expected patch to determine cursor file
736 let expected_patch = &response.expected_patch_hunks[0];
737 let cursor_file = extract_file_from_hunk(expected_patch)
738 .ok_or_else(|| "Could not determine file from expected patch".to_string())?;
739
740 // Get the file content before the commit
741 let before_content = run_git(
742 repo_path,
743 &["show", &format!("{}^:{}", commit.sha, cursor_file)],
744 )
745 .await
746 .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;
747
748 // Build edit history diff from Claude's hunks
749 let edit_history = response.edit_history_hunks.join("\n");
750
751 // Apply edit history to get intermediate state (validates edit history)
752 let intermediate_state =
753 apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;
754
755 // Validate expected patch applies to intermediate state
756 let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
757 apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
758 .map_err(|e| format!("Expected patch failed to apply: {}", e))?;
759
760 // Find where the expected patch edits would apply in the intermediate state
761 let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
762 .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
763 if edits.is_empty() {
764 return Err(
765 "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
766 );
767 }
768
769 // Use the start of the first edit for cursor positioning
770 let cursor_byte_offset = edits[0].0.start;
771
772 // Extract excerpt around the edit location
773 let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;
774
775 // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
776 let comment_prefix = line_comment_prefix(&cursor_file);
777 let reasoning_with_source = format!(
778 "Source commit: {} ({})\n\n{}",
779 commit.sha,
780 truncate_message(&commit.message, 60),
781 response.reasoning
782 );
783 let mut spec = ExampleSpec {
784 name: response.name.clone(),
785 repository_url: repo_url.to_string(),
786 revision: commit.parent_sha.clone(),
787 tags: Vec::new(),
788 reasoning: Some(reasoning_with_source),
789 uncommitted_diff: String::new(),
790 cursor_path: Arc::from(Path::new(&cursor_file)),
791 cursor_position: String::new(),
792 edit_history,
793 expected_patches: vec![expected_patch_with_header],
794 };
795 spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);
796
797 Ok(spec)
798}
799
/// Extract file path from a hunk (looks for --- a/path or +++ b/path)
///
/// The first line carrying either prefix wins; returns `None` when the hunk has
/// no such line.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(str::to_string)
}
812
/// Ensure a hunk has proper file headers
///
/// A hunk that already contains `--- a/` or `+++ b/` is returned unchanged;
/// otherwise headers for `file_path` are prepended.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let already_has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if already_has_header {
        hunk.to_owned()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
820
821/// Apply edit history to file content, only if hunks affect this file
822fn apply_edit_history_to_content(
823 content: &str,
824 edit_history: &str,
825 cursor_file: &str,
826) -> Result<String, String> {
827 // Extract just the hunks for this file from the edit history
828 let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
829
830 if file_diff.is_empty() {
831 return Ok(content.to_string());
832 }
833
834 apply_diff_to_string(&file_diff, content)
835 .map_err(|e| format!("Failed to apply edit history: {}", e))
836}
837
/// Extract hunks for a specific file from a combined diff
///
/// Returns every section whose `--- a/<path>` header names `target_file`,
/// headers included, with each line terminated by `\n`. Sections for other
/// files are dropped. That includes new-file sections starting with
/// `--- /dev/null`, which would otherwise be appended to a preceding target
/// section. Returns an empty string when nothing matches.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;

    for line in combined_diff.lines() {
        if let Some(path) = line.strip_prefix("--- a/") {
            // A new file section begins; its `+++` header has not been seen yet.
            in_target_file = path == target_file;
            found_header = false;
            if !in_target_file {
                continue;
            }
        } else if line.starts_with("--- /dev/null") {
            // A created file has no `--- a/` header, so it is recognized here
            // explicitly. It can never be the file whose existing content is
            // being patched, so it always ends the target section.
            in_target_file = false;
            found_header = false;
            continue;
        } else if !in_target_file {
            continue;
        } else if line.starts_with("+++ b/") {
            found_header = true;
        } else if !found_header {
            // Lines between `---` and `+++` of the target section are ignored.
            continue;
        }

        result.push_str(line);
        result.push('\n');
    }

    result
}
868
/// Extract a cursor position excerpt from content around a byte offset.
/// Returns the excerpt and the cursor offset within the excerpt.
///
/// The excerpt consists of up to 3 lines before the cursor line, the cursor
/// line itself, and up to 4 lines after it, joined with `\n` and without a
/// trailing newline.
///
/// Returns an error when `cursor_byte_offset` lies beyond the end of `content`
/// or inside a multi-byte character.
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    // `is_char_boundary` is false for offsets past the end as well, so this one
    // check makes every slice below safe.
    if !content.is_char_boundary(cursor_byte_offset) {
        return Err(format!(
            "Cursor offset {} is not a valid position in content of {} bytes",
            cursor_byte_offset,
            content.len()
        ));
    }

    // Find the line containing the cursor
    let line_start = content[..cursor_byte_offset]
        .rfind('\n')
        .map_or(0, |pos| pos + 1);
    let line_end = content[cursor_byte_offset..]
        .find('\n')
        .map_or(content.len(), |pos| cursor_byte_offset + pos);

    // Last 3 lines before the cursor line, walked from the end so the lines
    // further up are never collected.
    let mut context_before: Vec<&str> = content[..line_start].lines().rev().take(3).collect();
    context_before.reverse();

    // First 4 lines after the cursor line (skipping its terminating newline).
    let after_line_end = (line_end + 1).min(content.len());
    let context_after = content[after_line_end..].lines().take(4);

    let mut excerpt = String::new();
    for line in context_before {
        excerpt.push_str(line);
        excerpt.push('\n');
    }

    // The cursor keeps its column within the cursor line.
    let cursor_offset_in_excerpt = excerpt.len() + (cursor_byte_offset - line_start);
    excerpt.push_str(&content[line_start..line_end]);

    for line in context_after {
        excerpt.push('\n');
        excerpt.push_str(line);
    }

    Ok((excerpt, cursor_offset_in_excerpt))
}
924
/// Get the line comment prefix for a file based on its extension
///
/// The extension is the text after the last `.` in `file_path` (the whole path
/// when it contains no `.`). Unknown extensions fall back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    const PREFIXES_BY_EXTENSION: &[(&str, &[&str])] = &[
        (
            "//",
            &[
                "rs", "c", "cpp", "cc", "h", "hpp", "js", "ts", "tsx", "jsx", "go", "java",
                "swift", "kt", "kts", "scala", "cs", "m", "mm", "zig", "v", "d",
            ],
        ),
        (
            "#",
            &[
                "py", "rb", "sh", "bash", "zsh", "pl", "pm", "r", "jl", "yaml", "yml", "toml",
                "coffee", "cr", "ex", "exs", "elixir",
            ],
        ),
        ("--", &["lua", "hs", "sql"]),
        (";", &["lisp", "clj", "cljs", "scm", "rkt", "el"]),
        ("%", &["erl", "hrl"]),
    ];

    let extension = file_path
        .rsplit_once('.')
        .map_or(file_path, |(_, ext)| ext);

    PREFIXES_BY_EXTENSION
        .iter()
        .find(|(_, extensions)| extensions.contains(&extension))
        .map_or("//", |&(prefix, _)| prefix)
}
939
940fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
941 let mut content = String::new();
942 content.push_str("# Rejected Example\n\n");
943 content.push_str(&format!("## Name\n\n{}\n\n", response.name));
944 content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
945 content.push_str("## Edit History Hunks\n\n```diff\n");
946 for hunk in &response.edit_history_hunks {
947 content.push_str(hunk);
948 content.push_str("\n\n");
949 }
950 content.push_str("```\n\n");
951 content.push_str("## Expected Patch Hunks\n\n```diff\n");
952 for hunk in &response.expected_patch_hunks {
953 content.push_str(hunk);
954 content.push_str("\n\n");
955 }
956 content.push_str("```\n\n");
957 content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
958 content
959}