1use crate::{
2 anthropic_client::PlainLlmClient,
3 git::{ensure_repo_cloned, run_git},
4 paths::{FAILED_EXAMPLES_DIR, LATEST_FAILED_EXAMPLES_DIR, SYNTHESIZE_STATE_FILE},
5 progress::{InfoStyle, Progress, Step, StepProgress},
6};
7use anthropic::ResponseContent;
8use anyhow::{Context as _, Result};
9use chrono::Local;
10use collections::{HashMap, HashSet};
11use edit_prediction::{
12 example_spec::ExampleSpec,
13 udiff::{apply_diff_to_string, edits_for_diff},
14};
15use futures::stream::{FuturesUnordered, StreamExt};
16use indoc::indoc;
17use serde::{Deserialize, Serialize};
18use std::{
19 path::{Path, PathBuf},
20 sync::Arc,
21};
22
/// Settings for one synthesis run.
#[derive(Clone, Debug)]
pub struct SynthesizeConfig {
    /// URLs of the repositories to mine; each one is processed as its own task.
    pub repo_urls: Vec<String>,
    /// Number of examples to generate per repository
    pub count: usize,
    /// Number of commits requested from `git log` per batch.
    pub max_commits: usize,
    /// Directory that accepted examples are written to as Markdown files.
    pub output_dir: PathBuf,
    /// When set, start from an empty state instead of loading the saved one.
    pub fresh: bool,
}
32
33#[derive(Debug, Default, Serialize, Deserialize)]
34struct SynthesizeState {
35 repositories: HashMap<String, RepoState>,
36}
37
38#[derive(Debug, Default, Serialize, Deserialize)]
39struct RepoState {
40 processed_commits: HashSet<String>,
41 examples_generated: usize,
42}
43
44impl SynthesizeState {
45 fn load() -> Self {
46 if SYNTHESIZE_STATE_FILE.exists() {
47 std::fs::read_to_string(&*SYNTHESIZE_STATE_FILE)
48 .ok()
49 .and_then(|s| serde_json::from_str(&s).ok())
50 .unwrap_or_default()
51 } else {
52 Self::default()
53 }
54 }
55
56 fn save(&self) -> Result<()> {
57 let content = serde_json::to_string_pretty(self)?;
58 std::fs::write(&*SYNTHESIZE_STATE_FILE, content)?;
59 Ok(())
60 }
61
62 fn take_repo_state(&mut self, repo_url: &str) -> RepoState {
63 self.repositories.remove(repo_url).unwrap_or_default()
64 }
65
66 fn merge_repo_state(&mut self, repo_url: String, repo_state: RepoState) {
67 self.repositories.insert(repo_url, repo_state);
68 }
69}
70
71impl RepoState {
72 fn is_processed(&self, commit_sha: &str) -> bool {
73 self.processed_commits.contains(commit_sha)
74 }
75
76 fn mark_processed(&mut self, commit_sha: &str, examples_count: usize) {
77 self.processed_commits.insert(commit_sha.to_string());
78 self.examples_generated += examples_count;
79 }
80}
81
/// One non-merge commit together with the diffs used to evaluate it.
#[derive(Debug)]
struct CommitInfo {
    /// Full commit hash (`%H`).
    sha: String,
    /// Hash of the commit's first parent.
    parent_sha: String,
    /// Commit subject line (`%s`).
    message: String,
    /// Output of `git show` with default context; used for the skip heuristics.
    diff: String,
    /// Output of `git show -U30`; this is what gets embedded in the prompt.
    expanded_diff: String,
}
90
/// Structured form of the text Claude returned for one commit.
#[derive(Debug)]
struct ClaudeResponse {
    /// Text after the `NAME:` marker, or a placeholder when the marker is absent.
    name: String,
    /// Contents of the `ANALYSIS:` section (empty when the section is missing).
    reasoning: String,
    /// Hunks collected from the `EDIT_HISTORY:` section, in response order.
    edit_history_hunks: Vec<String>,
    /// Hunks collected from the `EXPECTED_PATCH:` section.
    expected_patch_hunks: Vec<String>,
}
99
100pub async fn run_synthesize(config: SynthesizeConfig) -> Result<()> {
101 let mut state = if config.fresh {
102 SynthesizeState::default()
103 } else {
104 SynthesizeState::load()
105 };
106
107 std::fs::create_dir_all(&config.output_dir)?;
108 std::fs::create_dir_all(&*FAILED_EXAMPLES_DIR)?;
109
110 // Create "latest_failed" symlink pointing to this run's failed directory
111 if LATEST_FAILED_EXAMPLES_DIR.is_symlink() {
112 std::fs::remove_file(&*LATEST_FAILED_EXAMPLES_DIR)?;
113 }
114 #[cfg(unix)]
115 std::os::unix::fs::symlink(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
116 #[cfg(windows)]
117 std::os::windows::fs::symlink_dir(&*FAILED_EXAMPLES_DIR, &*LATEST_FAILED_EXAMPLES_DIR)?;
118
119 let progress = Progress::global();
120 let total_examples = config.count * config.repo_urls.len();
121 progress.set_total_examples(total_examples);
122
123 let client = Arc::new(PlainLlmClient::new()?);
124 let config = Arc::new(config);
125
126 let mut futures: FuturesUnordered<_> = config
127 .repo_urls
128 .iter()
129 .map(|repo_url| {
130 let client = client.clone();
131 let repo_state = state.take_repo_state(repo_url);
132 let config = config.clone();
133 let repo_url = repo_url.clone();
134 async move {
135 let result = synthesize_repo(&client, repo_state, &config, &repo_url).await;
136 (repo_url, result)
137 }
138 })
139 .collect();
140
141 let mut errors = Vec::new();
142 while let Some((repo_url, result)) = futures.next().await {
143 match result {
144 Ok(repo_state) => {
145 state.merge_repo_state(repo_url, repo_state);
146 }
147 Err(e) => {
148 errors.push(e);
149 }
150 }
151 }
152
153 state.save()?;
154
155 progress.finalize();
156
157 if let Some(first_error) = errors.into_iter().next() {
158 return Err(first_error);
159 }
160
161 Ok(())
162}
163
164async fn synthesize_repo(
165 client: &PlainLlmClient,
166 mut repo_state: RepoState,
167 config: &SynthesizeConfig,
168 repo_url: &str,
169) -> Result<RepoState> {
170 let progress = Progress::global();
171 let batch_size = config.max_commits;
172
173 let clone_progress = progress.start(Step::Synthesize, &format!("clone {}", repo_url));
174 let repo_path = ensure_repo_cloned(repo_url).await?;
175 drop(clone_progress);
176
177 let mut examples_generated = 0;
178 let mut commits_skipped = 0;
179
180 'outer: loop {
181 let list_progress = progress.start(
182 Step::Synthesize,
183 &format!("{}: list-commits", repo_name_from_url(repo_url)),
184 );
185 let commits = list_commits(&repo_path, batch_size, commits_skipped).await?;
186 drop(list_progress);
187
188 if commits.is_empty() {
189 break;
190 }
191
192 commits_skipped += commits.len();
193
194 for commit in commits {
195 if examples_generated >= config.count {
196 break 'outer;
197 }
198
199 if !config.fresh && repo_state.is_processed(&commit.sha) {
200 continue;
201 }
202
203 if should_skip_commit(&commit) {
204 continue;
205 }
206
207 let repo_name = repo_name_from_url(repo_url);
208 let commit_label = format!(
209 "{}: {} {}",
210 repo_name,
211 &commit.sha[..8],
212 truncate_message(&commit.message, 40)
213 );
214 let step_progress = Arc::new(progress.start(Step::Synthesize, &commit_label));
215
216 // Single Claude call to identify and copy hunks
217 step_progress.set_substatus("analyzing...");
218 let claude_response =
219 match analyze_commit(client, repo_url, &commit, step_progress.clone()).await {
220 Ok(Some(response)) => response,
221 Ok(None) => {
222 step_progress.set_info("no pattern", InfoStyle::Normal);
223 repo_state.mark_processed(&commit.sha, 0);
224 continue;
225 }
226 Err(e) => {
227 step_progress.set_info(format!("error: {:?}", e), InfoStyle::Warning);
228 repo_state.mark_processed(&commit.sha, 0);
229 continue;
230 }
231 };
232
233 // Validate and build the example
234 step_progress.set_substatus("validating...");
235 match build_example(repo_url, &commit, &repo_path, &claude_response).await {
236 Ok(spec) => {
237 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S");
238 let filename = format!("{}--{}.md", repo_name, timestamp);
239 let path = config.output_dir.join(&filename);
240 std::fs::write(&path, spec.to_markdown())?;
241 examples_generated += 1;
242 step_progress.set_info(filename, InfoStyle::Normal);
243 }
244 Err(rejection_reason) => {
245 log::debug!("Example rejected: {}", rejection_reason);
246 let timestamp = Local::now().format("%Y-%m-%d--%H-%M-%S%.3f");
247 let filename = format!("{}--{}.md", repo_name, timestamp);
248 let path = FAILED_EXAMPLES_DIR.join(&filename);
249 let content = format_rejected_example(&claude_response, &rejection_reason);
250 if let Err(e) = std::fs::write(&path, content) {
251 log::warn!("Failed to write rejected example: {:?}", e);
252 }
253 step_progress.set_info(format!("rejected: {}", filename), InfoStyle::Warning);
254 }
255 }
256
257 repo_state.mark_processed(&commit.sha, 1);
258 }
259 }
260
261 Ok(repo_state)
262}
263
/// Derives a short repository name from a clone URL.
///
/// Takes the last `/`-separated segment, ignoring trailing slashes, and removes
/// a single `.git` suffix. A URL without any `/` is used as the segment itself.
fn repo_name_from_url(url: &str) -> String {
    // Without this, "https://host/org/repo/" would yield an empty name.
    let trimmed = url.trim_end_matches('/');
    let last_segment = trimmed.rsplit('/').next().unwrap_or(trimmed);
    last_segment
        .strip_suffix(".git")
        .unwrap_or(last_segment)
        .to_string()
}
271
/// Returns the first line of `msg`, shortened to at most `max_len` bytes.
///
/// A first line that already fits is returned unchanged. Otherwise it is cut
/// to `max_len - 3` bytes and `...` is appended. The cut is moved back to the
/// nearest character boundary so multi-byte text never causes a panic, and a
/// `max_len` below 3 yields just `...`.
fn truncate_message(msg: &str, max_len: usize) -> String {
    let first_line = msg.lines().next().unwrap_or("");
    if first_line.len() <= max_len {
        return first_line.to_string();
    }

    let mut cut = max_len.saturating_sub(3);
    // Index 0 is always a boundary, so this terminates.
    while !first_line.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &first_line[..cut])
}
280
281fn should_skip_commit(commit: &CommitInfo) -> bool {
282 let lines_changed = commit
283 .diff
284 .lines()
285 .filter(|l| l.starts_with('+') || l.starts_with('-'))
286 .count();
287 lines_changed < 30
288 || lines_changed > 1000
289 || is_non_code_commit(commit)
290 || is_rename_commit(commit)
291}
292
293fn is_non_code_commit(commit: &CommitInfo) -> bool {
294 let non_code_extensions = [
295 ".md", ".txt", ".json", ".yaml", ".yml", ".toml", ".lock", ".svg", ".png", ".jpg", ".gif",
296 ".ico", ".woff", ".ttf", ".eot",
297 ];
298
299 let diff_files: Vec<&str> = commit
300 .diff
301 .lines()
302 .filter(|l| l.starts_with("+++ b/") || l.starts_with("--- a/"))
303 .filter_map(|l| {
304 l.strip_prefix("+++ b/")
305 .or_else(|| l.strip_prefix("--- a/"))
306 })
307 .collect();
308
309 if diff_files.is_empty() {
310 return false;
311 }
312
313 diff_files
314 .iter()
315 .all(|f| non_code_extensions.iter().any(|ext| f.ends_with(ext)))
316}
317
318fn is_rename_commit(commit: &CommitInfo) -> bool {
319 commit.diff.contains("similarity index")
320 || commit.diff.contains("rename from")
321 || commit.diff.contains("rename to")
322}
323
324async fn list_commits(
325 repo_path: &Path,
326 max_commits: usize,
327 skip: usize,
328) -> Result<Vec<CommitInfo>> {
329 let output = run_git(
330 repo_path,
331 &[
332 "log",
333 "--no-merges",
334 &format!("--skip={}", skip),
335 &format!("-{}", max_commits),
336 "--format=%H|%P|%s",
337 ],
338 )
339 .await?;
340
341 let mut commits = Vec::new();
342 for line in output.lines() {
343 let parts: Vec<&str> = line.splitn(3, '|').collect();
344 if parts.len() < 3 {
345 continue;
346 }
347 let sha = parts[0].to_string();
348 let parent_sha = parts[1].split_whitespace().next().unwrap_or("").to_string();
349 if parent_sha.is_empty() {
350 continue;
351 }
352
353 // Get standard diff (for skip checks)
354 let diff = run_git(repo_path, &["show", "--format=", &sha])
355 .await
356 .unwrap_or_default();
357
358 // Get expanded diff with 30 lines of context
359 let expanded_diff = run_git(repo_path, &["show", "-U30", "--format=", &sha])
360 .await
361 .unwrap_or_default();
362
363 commits.push(CommitInfo {
364 sha,
365 parent_sha,
366 message: parts[2].to_string(),
367 diff,
368 expanded_diff,
369 });
370 }
371
372 Ok(commits)
373}
374
375fn build_prompt(repo_url: &str, commit: &CommitInfo) -> String {
376 format!(
377 indoc! {r#"
378 You are analyzing a git commit to construct a realistic edit prediction example.
379
380 Your goal is to tell the story of a programmer's editing session: what sequence
381 of changes did they make, and what change logically comes next? We use these examples
382 to train a model to predict edits, so the quality of the EDIT HISTORY is what matters most.
383
384 An edit prediction example consists of:
385 1. **Edit History**: 2-6 hunks showing what the programmer did BEFORE making the expected patch.
386 This is the most important part - it must tell a coherent story of the changes leading up to the prediction.
387 2. **Expected Patch**: One small hunk that logically follows from the edit history.
388
389 Both single-file and multi-file patterns are acceptable.
390
391 ## What Makes a Good Example
392
393 The edit history should read like a story: "First the programmer changed X, then Y, then Z, and now they need to change W."
394
395 GOOD examples (rich sequences with 3+ steps):
396 - Removing a parameter: docstring update → constructor change → field removal → (predict) usage site update
397 - Adding a feature: type definition → first usage → second usage → (predict) third usage
398 - Bug fix pattern: fix in file A → fix in file B → fix in file C → (predict) fix in file D
399
400 BAD examples (respond NO_PATTERN):
401 - Commits where all changes are independent (no narrative thread)
402 - Simple find-and-replace (renaming, version bumps)
403 - Documentation-only or config-only changes
404 - Changes where you can only find 1-2 hunks for the edit history
405
406 ## Commit Information
407
408 Repository: {repo_url}
409 Commit: {sha}
410 Message: {message}
411
412 ## Diff (30 lines context)
413
414 ```diff
415 {expanded_diff}
416 ```
417
418 ## Your Task
419
420 First, THINK through whether this commit can support a good example:
421
422 1. What is the high-level pattern in this commit?
423 2. Can you identify at least 3 related hunks (2 or more for edit history + 1 for expected patch)?
424 3. What would be the narrative? (First... then... then... finally predict...)
425 4. Which specific hunk should be the expected patch (the "punchline")?
426
427 If you cannot construct a coherent 3+ hunk story, respond with just:
428 NO_PATTERN: <brief reason>
429
430 If you CAN construct a good example, respond in this format:
431
432 ANALYSIS:
433 Pattern: <one sentence describing the pattern>
434 Steps:
435 1. <file:line-range> - <what this hunk does>
436 2. <file:line-range> - <what this hunk does>
437 3. <file:line-range> - <what this hunk does>
438 4. [EXPECTED PATCH] <file:line-range> - <what this hunk does>
439
440 NAME: <short description, like a commit message, under 60 chars>
441
442 EDIT_HISTORY:
443
444 Hunk 1:
445 ```diff
446 --- a/src/models/user.py
447 +++ b/src/models/user.py
448 @@ -15,7 +15,6 @@ class User:
449 """A user in the system.
450
451 Attributes:
452 - email: The user's email address.
453 name: The user's display name.
454 """
455 ```
456
457 Hunk 2:
458 ```diff
459 --- a/src/models/user.py
460 +++ b/src/models/user.py
461 @@ -25,10 +24,9 @@ class User:
462 def __init__(
463 self,
464 name: str,
465 - email: str,
466 created_at: datetime,
467 ):
468 self.name = name
469 - self.email = email
470 self.created_at = created_at
471 ```
472
473 Hunk 3:
474 ```diff
475 --- a/src/api/handlers.py
476 +++ b/src/api/handlers.py
477 @@ -42,7 +42,6 @@ def create_user(request):
478 data = request.json()
479 user = User(
480 name=data["name"],
481 - email=data["email"],
482 created_at=datetime.now(),
483 )
484 return user.save()
485 ```
486
487 EXPECTED_PATCH:
488 ```diff
489 --- a/src/api/handlers.py
490 +++ b/src/api/handlers.py
491 @@ -58,7 +57,6 @@ def update_user(request, user_id):
492 user = User.get(user_id)
493 user.name = data.get("name", user.name)
494 - user.email = data.get("email", user.email)
495 user.save()
496 return user
497 ```
498
499 ## Requirements for the diffs
500
501 Edit history:
502 - MUST have 3-6 hunks (if you cannot find 3+, respond NO_PATTERN instead)
503 - Each hunk needs file headers (--- a/path and +++ b/path)
504 - Hunks must be valid unified diffs that apply to the parent commit
505 - Order hunks as a programmer would naturally make the changes
506
507 Expected patch:
508 - Must be a SINGLE hunk from a SINGLE file
509 - Must be SMALL: 1-15 changed lines (not counting context)
510 - Must be clearly predictable from the edit history narrative
511 "#},
512 repo_url = repo_url,
513 sha = commit.sha,
514 message = commit.message,
515 expanded_diff = commit.expanded_diff,
516 )
517}
518
519async fn analyze_commit(
520 client: &PlainLlmClient,
521 repo_url: &str,
522 commit: &CommitInfo,
523 step_progress: Arc<StepProgress>,
524) -> Result<Option<ClaudeResponse>> {
525 use anthropic::{Message, RequestContent, Role};
526
527 let prompt = build_prompt(repo_url, commit);
528 let messages = vec![Message {
529 role: Role::User,
530 content: vec![RequestContent::Text {
531 text: prompt,
532 cache_control: None,
533 }],
534 }];
535
536 let response = client
537 .generate_streaming("claude-sonnet-4-5", 8192, messages, |chars, _text| {
538 step_progress.set_substatus(format!("analyzing: {:.1}K", chars as f64 / 1000.0));
539 })
540 .await?;
541
542 // Extract text content from response
543 let response_text: String = response
544 .content
545 .iter()
546 .filter_map(|block| {
547 if let ResponseContent::Text { text } = block {
548 Some(text.as_str())
549 } else {
550 None
551 }
552 })
553 .collect::<Vec<_>>()
554 .join("\n");
555
556 parse_claude_response(&response_text)
557}
558
559fn parse_claude_response(response: &str) -> Result<Option<ClaudeResponse>> {
560 // Check for NO_PATTERN
561 if response.contains("NO_PATTERN:") {
562 return Ok(None);
563 }
564
565 // Parse NAME
566 let name = response
567 .lines()
568 .find(|l| l.starts_with("NAME:"))
569 .map(|l| l.strip_prefix("NAME:").unwrap_or("").trim().to_string())
570 .unwrap_or_else(|| "unnamed example".to_string());
571
572 // Parse ANALYSIS section (Claude's planning) - this is the primary reasoning
573 let reasoning = extract_section(
574 response,
575 "ANALYSIS:",
576 &["NAME:", "REASONING:", "EDIT_HISTORY:", "EXPECTED_PATCH:"],
577 )
578 .unwrap_or_default();
579
580 // Parse EDIT_HISTORY diff block
581 let edit_history_hunks = extract_diff_block(response, "EDIT_HISTORY:")?;
582
583 // Parse EXPECTED_PATCH diff block
584 let expected_patch_hunks = extract_diff_block(response, "EXPECTED_PATCH:")?;
585
586 if edit_history_hunks.is_empty() {
587 anyhow::bail!("No edit history hunks found in response");
588 }
589 if expected_patch_hunks.is_empty() {
590 anyhow::bail!("No expected patch hunks found in response");
591 }
592
593 Ok(Some(ClaudeResponse {
594 name,
595 reasoning,
596 edit_history_hunks,
597 expected_patch_hunks,
598 }))
599}
600
/// Returns the trimmed text between the first `start_marker` and the earliest
/// following occurrence of any of `end_markers`.
///
/// The section runs to the end of `text` when no end marker follows. Returns
/// `None` when `start_marker` does not occur.
fn extract_section(text: &str, start_marker: &str, end_markers: &[&str]) -> Option<String> {
    let (_, after_start) = text.split_once(start_marker)?;

    let mut section_len = after_start.len();
    for marker in end_markers {
        if let Some(position) = after_start.find(*marker) {
            section_len = section_len.min(position);
        }
    }

    Some(after_start[..section_len].trim().to_string())
}
614
615fn extract_diff_block(text: &str, section_marker: &str) -> Result<Vec<String>> {
616 let section_start = text
617 .find(section_marker)
618 .context(format!("Section {} not found", section_marker))?;
619
620 let after_marker = &text[section_start + section_marker.len()..];
621
622 // Find where the next major section starts (to bound our search)
623 let section_end = ["EXPECTED_PATCH:", "## "]
624 .iter()
625 .filter(|&&m| m != section_marker)
626 .filter_map(|marker| after_marker.find(marker))
627 .min()
628 .unwrap_or(after_marker.len());
629
630 let section_content = &after_marker[..section_end];
631
632 // Collect all ```diff blocks in this section
633 let mut hunks = Vec::new();
634 let mut search_start = 0;
635
636 while let Some(diff_start) = section_content[search_start..].find("```diff") {
637 let abs_diff_start = search_start + diff_start;
638 let block_content_start = section_content[abs_diff_start..]
639 .find('\n')
640 .map(|i| abs_diff_start + i + 1)
641 .unwrap_or(abs_diff_start);
642
643 if let Some(block_end_rel) = section_content[block_content_start..].find("```") {
644 let block_end = block_content_start + block_end_rel;
645 let diff_content = section_content[block_content_start..block_end].trim();
646
647 // Split this block into hunks (in case multiple hunks in one block)
648 hunks.extend(split_into_hunks(diff_content));
649
650 search_start = block_end + 3;
651 } else {
652 break;
653 }
654 }
655
656 if hunks.is_empty() {
657 anyhow::bail!("No diff blocks found in section {}", section_marker);
658 }
659
660 Ok(hunks)
661}
662
/// Split a diff block into individual hunks, preserving file headers.
///
/// Each returned string is one `@@` hunk, preceded by the most recent file
/// header (`--- a/…` or `--- /…`, plus its `+++ b/…` or `+++ /…` line) when one
/// was seen. Lines that appear before the first `@@` line and are not header
/// lines are dropped. A header with no hunk after it produces nothing.
fn split_into_hunks(diff: &str) -> Vec<String> {
    // Emits the pending hunk, prefixed by the current file header, and resets it.
    fn flush(hunks: &mut Vec<String>, file_header: Option<&str>, pending: &mut Vec<&str>) {
        if pending.is_empty() {
            return;
        }
        let mut hunk_text = String::new();
        if let Some(header) = file_header {
            hunk_text.push_str(header);
            hunk_text.push('\n');
        }
        hunk_text.push_str(&pending.join("\n"));
        hunks.push(hunk_text);
        pending.clear();
    }

    let mut hunks = Vec::new();
    let mut current_file_header: Option<String> = None;
    // Non-empty exactly while we are inside a hunk: it always starts with the `@@` line.
    let mut current_hunk: Vec<&str> = Vec::new();

    for line in diff.lines() {
        if line.starts_with("--- a/") || line.starts_with("--- /") {
            // Start of a file header: the previous hunk belongs to the old header.
            flush(&mut hunks, current_file_header.as_deref(), &mut current_hunk);
            current_file_header = Some(line.to_string());
        } else if line.starts_with("+++ b/") || line.starts_with("+++ /") {
            if let Some(header) = current_file_header.as_mut() {
                header.push('\n');
                header.push_str(line);
            }
        } else if line.starts_with("@@ ") {
            // New hunk in the same file: emit the previous one first.
            flush(&mut hunks, current_file_header.as_deref(), &mut current_hunk);
            current_hunk.push(line);
        } else if !current_hunk.is_empty() {
            current_hunk.push(line);
        }
    }

    // Flush final hunk
    flush(&mut hunks, current_file_header.as_deref(), &mut current_hunk);

    hunks
}
722
723/// Validate Claude's output by applying diffs and build the ExampleSpec
724async fn build_example(
725 repo_url: &str,
726 commit: &CommitInfo,
727 repo_path: &Path,
728 response: &ClaudeResponse,
729) -> Result<ExampleSpec, String> {
730 // Validate expected patch hunks
731 if response.expected_patch_hunks.len() != 1 {
732 return Err(format!(
733 "Expected exactly 1 expected patch hunk, got {}",
734 response.expected_patch_hunks.len()
735 ));
736 }
737
738 // Parse the expected patch to determine cursor file
739 let expected_patch = &response.expected_patch_hunks[0];
740 let cursor_file = extract_file_from_hunk(expected_patch)
741 .ok_or_else(|| "Could not determine file from expected patch".to_string())?;
742
743 // Get the file content before the commit
744 let before_content = run_git(
745 repo_path,
746 &["show", &format!("{}^:{}", commit.sha, cursor_file)],
747 )
748 .await
749 .map_err(|e| format!("Failed to get file content for {}: {}", cursor_file, e))?;
750
751 // Build edit history diff from Claude's hunks
752 let edit_history = response.edit_history_hunks.join("\n");
753
754 // Apply edit history to get intermediate state (validates edit history)
755 let intermediate_state =
756 apply_edit_history_to_content(&before_content, &edit_history, &cursor_file)?;
757
758 // Validate expected patch applies to intermediate state
759 let expected_patch_with_header = ensure_diff_header(expected_patch, &cursor_file);
760 apply_diff_to_string(&expected_patch_with_header, &intermediate_state)
761 .map_err(|e| format!("Expected patch failed to apply: {}", e))?;
762
763 // Find where the expected patch edits would apply in the intermediate state
764 let edits = edits_for_diff(&intermediate_state, &expected_patch_with_header)
765 .map_err(|e| format!("Failed to parse expected patch: {}", e))?;
766 if edits.is_empty() {
767 return Err(
768 "Could not locate expected patch in file (context not found or ambiguous)".to_string(),
769 );
770 }
771
772 // Use the start of the first edit for cursor positioning
773 let cursor_byte_offset = edits[0].0.start;
774
775 // Extract excerpt around the edit location
776 let (excerpt, cursor_offset) = extract_cursor_excerpt(&intermediate_state, cursor_byte_offset)?;
777
778 // Build the ExampleSpec and use set_cursor_excerpt to format with comment marker
779 let comment_prefix = line_comment_prefix(&cursor_file);
780 let reasoning_with_source = format!(
781 "Source commit: {} ({})\n\n{}",
782 commit.sha,
783 truncate_message(&commit.message, 60),
784 response.reasoning
785 );
786 let mut spec = ExampleSpec {
787 name: response.name.clone(),
788 repository_url: repo_url.to_string(),
789 revision: commit.parent_sha.clone(),
790 tags: Vec::new(),
791 reasoning: Some(reasoning_with_source),
792 uncommitted_diff: String::new(),
793 cursor_path: Arc::from(Path::new(&cursor_file)),
794 cursor_position: String::new(),
795 edit_history,
796 expected_patches: vec![expected_patch_with_header],
797 rejected_patch: None,
798
799 telemetry: None,
800 human_feedback: Vec::new(),
801 rating: None,
802 };
803 spec.set_cursor_excerpt(&excerpt, cursor_offset, comment_prefix);
804
805 Ok(spec)
806}
807
/// Returns the path from the first `+++ b/<path>` or `--- a/<path>` line of a
/// hunk, or `None` when the hunk has no such line.
fn extract_file_from_hunk(hunk: &str) -> Option<String> {
    hunk.lines()
        .find_map(|line| {
            line.strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("--- a/"))
        })
        .map(str::to_string)
}
820
/// Returns the hunk with `--- a/` / `+++ b/` header lines for `file_path`
/// prepended, unless the hunk text already contains either header marker.
fn ensure_diff_header(hunk: &str, file_path: &str) -> String {
    let already_has_header = ["--- a/", "+++ b/"]
        .iter()
        .any(|marker| hunk.contains(*marker));

    if already_has_header {
        hunk.to_string()
    } else {
        format!("--- a/{}\n+++ b/{}\n{}", file_path, file_path, hunk)
    }
}
828
829/// Apply edit history to file content, only if hunks affect this file
830fn apply_edit_history_to_content(
831 content: &str,
832 edit_history: &str,
833 cursor_file: &str,
834) -> Result<String, String> {
835 // Extract just the hunks for this file from the edit history
836 let file_diff = extract_file_diff_from_combined(edit_history, cursor_file);
837
838 if file_diff.is_empty() {
839 return Ok(content.to_string());
840 }
841
842 apply_diff_to_string(&file_diff, content)
843 .map_err(|e| format!("Failed to apply edit history: {}", e))
844}
845
/// Extract hunks for a specific file from a combined diff.
///
/// Copies every section whose `--- a/<path>` header names `target_file`: the
/// header itself, its `+++ b/` line, and all lines after it up to the next
/// file header. Every copied line is terminated with `\n`. Returns an empty
/// string when the file does not occur.
///
/// A `--- /dev/null` header (a newly created file) also ends the current
/// section, so hunks of a new file that follows the target file are not
/// attributed to the target.
fn extract_file_diff_from_combined(combined_diff: &str, target_file: &str) -> String {
    let mut result = String::new();
    let mut in_target_file = false;
    let mut found_header = false;

    for line in combined_diff.lines() {
        if let Some(file) = line.strip_prefix("--- a/") {
            in_target_file = file == target_file;
            found_header = false;
            if in_target_file {
                result.push_str(line);
                result.push('\n');
            }
        } else if line.starts_with("--- /dev/null") {
            // Header of a created file; it cannot be the target, which is
            // matched through its `--- a/` header.
            in_target_file = false;
            found_header = false;
        } else if in_target_file && line.starts_with("+++ b/") {
            result.push_str(line);
            result.push('\n');
            found_header = true;
        } else if in_target_file && found_header {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
876
/// Extract a cursor position excerpt from content around a byte offset.
///
/// The excerpt consists of up to 3 lines before the cursor line, the cursor
/// line itself, and up to 4 lines after it, joined with `\n`.
///
/// Returns the excerpt and the cursor's byte offset within the excerpt.
///
/// # Errors
///
/// Returns a message when `cursor_byte_offset` lies beyond the end of
/// `content` or inside a multi-byte character, instead of panicking on the
/// slice.
fn extract_cursor_excerpt(
    content: &str,
    cursor_byte_offset: usize,
) -> Result<(String, usize), String> {
    const LINES_BEFORE: usize = 3;
    const LINES_AFTER: usize = 4;

    // `is_char_boundary` is also false for offsets past the end.
    if !content.is_char_boundary(cursor_byte_offset) {
        return Err(format!(
            "Cursor offset {} is not a valid position in content of {} bytes",
            cursor_byte_offset,
            content.len()
        ));
    }

    // Find the line containing the cursor
    let (before_cursor, from_cursor) = content.split_at(cursor_byte_offset);
    let line_start = before_cursor.rfind('\n').map_or(0, |pos| pos + 1);
    let line_end = from_cursor
        .find('\n')
        .map_or(content.len(), |pos| cursor_byte_offset + pos);

    // Context lines before the cursor line (the last few only).
    let preceding_lines: Vec<&str> = content[..line_start].lines().collect();
    let first_kept = preceding_lines.len().saturating_sub(LINES_BEFORE);

    let mut excerpt = String::new();
    for line in &preceding_lines[first_kept..] {
        excerpt.push_str(line);
        excerpt.push('\n');
    }

    // The cursor keeps its column; only the start of its line has moved.
    let cursor_offset_in_excerpt = excerpt.len() + (cursor_byte_offset - line_start);
    excerpt.push_str(&content[line_start..line_end]);

    // Context lines after the cursor line, skipping its terminating newline.
    let after_line_end = (line_end + 1).min(content.len());
    for line in content[after_line_end..].lines().take(LINES_AFTER) {
        excerpt.push('\n');
        excerpt.push_str(line);
    }

    Ok((excerpt, cursor_offset_in_excerpt))
}
932
/// Get the line comment prefix for a file based on its extension.
///
/// The extension is matched case-insensitively, so `analysis.R` is treated
/// like `analysis.r`. Files without an extension, and extensions that are not
/// listed, fall back to `//`.
fn line_comment_prefix(file_path: &str) -> &'static str {
    let extension = Path::new(file_path)
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    match extension.as_str() {
        "rs" | "c" | "cpp" | "cc" | "h" | "hpp" | "js" | "ts" | "tsx" | "jsx" | "go" | "java"
        | "swift" | "kt" | "kts" | "scala" | "cs" | "m" | "mm" | "zig" | "v" | "d" => "//",
        "py" | "rb" | "sh" | "bash" | "zsh" | "pl" | "pm" | "r" | "jl" | "yaml" | "yml"
        | "toml" | "coffee" | "cr" | "ex" | "exs" | "elixir" => "#",
        "lua" | "hs" | "sql" => "--",
        "lisp" | "clj" | "cljs" | "scm" | "rkt" | "el" => ";",
        "erl" | "hrl" => "%",
        _ => "//",
    }
}
947
948fn format_rejected_example(response: &ClaudeResponse, rejection_reason: &str) -> String {
949 let mut content = String::new();
950 content.push_str("# Rejected Example\n\n");
951 content.push_str(&format!("## Name\n\n{}\n\n", response.name));
952 content.push_str(&format!("## Reasoning\n\n{}\n\n", response.reasoning));
953 content.push_str("## Edit History Hunks\n\n```diff\n");
954 for hunk in &response.edit_history_hunks {
955 content.push_str(hunk);
956 content.push_str("\n\n");
957 }
958 content.push_str("```\n\n");
959 content.push_str("## Expected Patch Hunks\n\n```diff\n");
960 for hunk in &response.expected_patch_hunks {
961 content.push_str(hunk);
962 content.push_str("\n\n");
963 }
964 content.push_str("```\n\n");
965 content.push_str(&format!("## Rejection Reason\n\n{}\n", rejection_reason));
966 content
967}