split_commit.rs

//! `ep split-commit` implementation.
//!
//! This command reads annotated commits as JSON Lines and, for each one, generates one or
//! more evaluation example JSON objects by splitting its chronologically-ordered unified
//! diff (the "commit") into an edit history and an expected patch.
//!
//! TODO: Port Python code to generate chronologically-ordered commits
use crate::FailedHandling;
use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
use anyhow::{Context as _, Result};
use clap::Args;
use edit_prediction::example_spec::ExampleSpec;
use rand::Rng;
use rand::SeedableRng;
use serde::{Deserialize, Serialize};
use similar::{DiffTag, TextDiff};
use std::collections::BTreeSet;
use std::fs;
use std::io::{self, Write};
use std::path::Path;
use std::path::PathBuf;

/// Find the largest valid UTF-8 char boundary at or before `index` in `s`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        s.len()
    } else if s.is_char_boundary(index) {
        index
    } else {
        // Find the nearest valid character boundary at or before index
        (0..index)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(0)
    }
}
  36
  37/// `ep split-commit` CLI args.
  38#[derive(Debug, Args, Clone)]
  39pub struct SplitCommitArgs {
  40    /// Split point (float 0.0-1.0 for fraction, or integer for index)
  41    #[arg(long, short = 's')]
  42    pub split_point: Option<String>,
  43
  44    /// Random seed for reproducibility
  45    #[arg(long)]
  46    pub seed: Option<u64>,
  47
  48    /// Pretty-print JSON output
  49    #[arg(long, short = 'p')]
  50    pub pretty: bool,
  51
  52    /// Number of samples to generate per commit (samples random split points)
  53    #[arg(long, short = 'n')]
  54    pub num_samples: Option<usize>,
  55}
  56
  57/// Input format for annotated commits (JSON Lines).
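///
/// Each input line is a JSON object; roughly (illustrative, abbreviated values):
///
/// ```json
/// {"repo": "repos/zed",
///  "repo_url": "https://github.com/zed-industries/zed",
///  "commit_sha": "abc123",
///  "reordered_commit": "// group header\n--- a/src/lib.rs\n+++ b/src/lib.rs\n@@ ... @@\n...",
///  "original_commit": "--- a/src/lib.rs\n+++ b/src/lib.rs\n@@ ... @@\n...",
///  "diff_stats_match": true}
/// ```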
  58#[derive(Debug, Clone, Deserialize)]
  59#[allow(dead_code)]
  60pub struct AnnotatedCommit {
  61    /// Repository path (e.g., "repos/zed")
  62    pub repo: String,
  63    /// Repository URL (e.g., "https://github.com/zed-industries/zed")
  64    pub repo_url: String,
  65    /// Commit SHA
  66    pub commit_sha: String,
  67    /// Chronologically reordered commit diff
  68    pub reordered_commit: String,
  69    /// Original commit diff
  70    pub original_commit: String,
  71    /// Whether diff stats match between original and reordered
  72    pub diff_stats_match: bool,
  73}
  74
  75/// Cursor position in a file.
  76#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  77pub struct CursorPosition {
  78    pub file: String,
  79    pub line: usize,
  80    pub column: usize,
  81}
  82
  83impl std::fmt::Display for CursorPosition {
  84    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  85        write!(f, "{}:{}:{}", self.file, self.line, self.column)
  86    }
  87}
  88
  89/// Represents a split commit with source and target patches.
  90#[derive(Debug, Clone)]
  91pub struct SplitCommit {
  92    pub source_patch: String,
  93    pub target_patch: String,
  94}
  95
  96/// Split point specification for evaluation generation.
  97#[derive(Debug, Clone)]
  98pub enum SplitPoint {
  99    /// Fraction of total edits (0.0 to 1.0)
 100    Fraction(f64),
 101    /// Absolute index
 102    Index(usize),
 103}
 104
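/// Parse a `--split-point` value: a string containing `.` becomes [`SplitPoint::Fraction`],
/// anything else is parsed as an absolute [`SplitPoint::Index`].
///
/// Rough sketch of the expected behavior (illustrative, not exhaustive):
///
/// ```ignore
/// assert!(matches!(parse_split_point("0.5"), Some(SplitPoint::Fraction(f)) if f == 0.5));
/// assert!(matches!(parse_split_point("3"), Some(SplitPoint::Index(3))));
/// assert!(parse_split_point("three").is_none());
/// ```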
 105fn parse_split_point(value: &str) -> Option<SplitPoint> {
 106    if value.contains('.') {
 107        value.parse::<f64>().ok().map(SplitPoint::Fraction)
 108    } else {
 109        value.parse::<usize>().ok().map(SplitPoint::Index)
 110    }
 111}
 112
 113/// Entry point for the `ep split-commit` subcommand.
 114///
/// This runs synchronously and writes JSON Lines output (one or more evaluation examples
/// per input commit).
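///
/// An input path of `-` (or an empty input list) reads from stdin; when no output path is
/// given, results are written to stdout.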
 116pub fn run_split_commit(
 117    args: &SplitCommitArgs,
 118    inputs: &[PathBuf],
 119    output_path: Option<&PathBuf>,
 120    failed: FailedHandling,
 121) -> Result<()> {
 122    use std::collections::HashSet;
 123    use std::io::BufRead;
 124
 125    let stdin_path = PathBuf::from("-");
 126    let inputs = if inputs.is_empty() {
 127        std::slice::from_ref(&stdin_path)
 128    } else {
 129        inputs
 130    };
 131
 132    let split_point = args.split_point.as_deref().and_then(parse_split_point);
 133    let mut output_lines = Vec::new();
 134
 135    for input_path in inputs {
 136        let input: Box<dyn BufRead> = if input_path.as_os_str() == "-" {
 137            Box::new(io::BufReader::new(io::stdin()))
 138        } else {
 139            let file = fs::File::open(input_path)
 140                .with_context(|| format!("failed to open input file {}", input_path.display()))?;
 141            Box::new(io::BufReader::new(file))
 142        };
 143
 144        for (line_num, line_result) in input.lines().enumerate() {
 145            let line =
 146                line_result.with_context(|| format!("failed to read line {}", line_num + 1))?;
 147
 148            if line.trim().is_empty() {
 149                continue;
 150            }
 151
 152            let annotated: AnnotatedCommit = serde_json::from_str(&line)
 153                .with_context(|| format!("failed to parse JSON at line {}", line_num + 1))?;
 154
 155            // Generate multiple samples if num_samples is set
 156            if let Some(num_samples) = args.num_samples {
 157                let mut seen_samples: HashSet<String> = HashSet::new();
 158                let base_seed = args.seed.unwrap_or_else(|| rand::random());
 159
 160                for sample_idx in 0..num_samples {
 161                    let sample_seed = base_seed.wrapping_add(sample_idx as u64);
 162
 163                    let case = match generate_evaluation_example_from_ordered_commit(
 164                        &annotated.reordered_commit,
 165                        &annotated.repo_url,
 166                        &annotated.commit_sha,
 167                        None, // Use random split point for multi-sample mode
 168                        Some(sample_seed),
 169                        Some(sample_idx),
 170                    ) {
 171                        Ok(case) => case,
 172                        Err(e) => {
 173                            let err_msg = format!(
 174                                "failed to generate evaluation example for commit {} at line {} (sample {}): {}",
 175                                annotated.commit_sha,
 176                                line_num + 1,
 177                                sample_idx,
 178                                e
 179                            );
 180                            match failed {
 181                                FailedHandling::Skip | FailedHandling::SkipNoFiles => {
 182                                    eprintln!("{}", err_msg);
 183                                    continue;
 184                                }
 185                                FailedHandling::Keep => {
 186                                    anyhow::bail!(err_msg);
 187                                }
 188                            }
 189                        }
 190                    };
 191
 192                    let json = if args.pretty {
 193                        serde_json::to_string_pretty(&case)
 194                    } else {
 195                        serde_json::to_string(&case)
 196                    }
 197                    .context("failed to serialize evaluation case as JSON")?;
 198
 199                    // Only add unique samples (different split points may produce same result)
 200                    if seen_samples.insert(json.clone()) {
 201                        output_lines.push(json);
 202                    }
 203                }
 204            } else {
 205                let case = match generate_evaluation_example_from_ordered_commit(
 206                    &annotated.reordered_commit,
 207                    &annotated.repo_url,
 208                    &annotated.commit_sha,
 209                    split_point.clone(),
 210                    args.seed,
 211                    None,
 212                ) {
 213                    Ok(case) => case,
 214                    Err(e) => {
 215                        let err_msg = format!(
 216                            "failed to generate evaluation example for commit {} at line {}: {}",
 217                            annotated.commit_sha,
 218                            line_num + 1,
 219                            e
 220                        );
 221                        match failed {
 222                            FailedHandling::Skip | FailedHandling::SkipNoFiles => {
 223                                eprintln!("{}", err_msg);
 224                                continue;
 225                            }
 226                            FailedHandling::Keep => {
 227                                anyhow::bail!(err_msg);
 228                            }
 229                        }
 230                    }
 231                };
 232
 233                let json = if args.pretty {
 234                    serde_json::to_string_pretty(&case)
 235                } else {
 236                    serde_json::to_string(&case)
 237                }
 238                .context("failed to serialize evaluation case as JSON")?;
 239
 240                output_lines.push(json);
 241            }
 242        }
 243    }
 244
 245    let output_content = output_lines.join("\n") + if output_lines.is_empty() { "" } else { "\n" };
 246
 247    if let Some(path) = output_path {
 248        fs::write(path, &output_content)
 249            .with_context(|| format!("failed to write output to {}", path.display()))?;
 250    } else {
 251        io::stdout()
 252            .write_all(output_content.as_bytes())
 253            .context("failed to write to stdout")?;
 254    }
 255
 256    Ok(())
 257}
 258
 259/// Main function to generate an evaluation example from an ordered commit.
 260///
 261/// # Arguments
 262/// * `commit` - Chronologically ordered unified diff of the commit
 263/// * `repository_url` - URL of the repository
 264/// * `commit_hash` - Hash of the commit
 265/// * `split_point` - Point at which the commit will be split (None for random)
 266/// * `seed` - Optional seed for randomness
 267/// * `sample_num` - Optional sample number for generating unique names
 268pub fn generate_evaluation_example_from_ordered_commit(
 269    commit: &str,
 270    repository_url: &str,
 271    commit_hash: &str,
 272    split_point: Option<SplitPoint>,
 273    seed: Option<u64>,
 274    sample_num: Option<usize>,
 275) -> Result<ExampleSpec> {
 276    let mut rng: Box<dyn rand::RngCore> = match seed {
 277        Some(seed) => Box::new(rand::rngs::StdRng::seed_from_u64(seed)),
 278        None => Box::new(rand::rngs::ThreadRng::default()),
 279    };
 280
 281    // Parse and normalize the commit
 282    let mut patch = Patch::parse_unified_diff(commit);
 283
 284    // Filter header to only keep lines starting with "//"
 285    let header_lines: Vec<&str> = patch
 286        .header
 287        .lines()
 288        .filter(|line| line.starts_with("//"))
 289        .collect();
 290    patch.header = if header_lines.is_empty() {
 291        String::new()
 292    } else {
 293        header_lines.join("\n") + "\n"
 294    };
 295    let commit_normalized = patch.to_string();
 296
 297    // Compute the split point
 298    let stats = patch.stats();
 299    let num_edits = stats.added + stats.removed;
 300
 301    anyhow::ensure!(num_edits != 0, "no edits found in commit");
 302
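    // Example: with 5 total edits, `Fraction(0.4)` keeps floor(0.4 * 5) = 2 edits in the
    // source patch, `Index(2)` keeps the first 2, and `None` samples uniformly from 1..=5.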
 303    let split = match split_point {
 304        None => rng.random_range(1..=num_edits),
 305        Some(SplitPoint::Fraction(f)) => {
 306            let v = (f * num_edits as f64).floor() as usize;
 307            v.min(num_edits)
 308        }
 309        Some(SplitPoint::Index(i)) => i.min(num_edits),
 310    };
 311
 312    // Split the commit into source and target patches
 313    let (prefix, suffix) = split_ordered_commit(&commit_normalized, split);
 314
 315    let mut split_commit = SplitCommit {
 316        source_patch: prefix,
 317        target_patch: suffix,
 318    };
 319
 320    // Imitate human edits
 321    let human_edit_seed = rng.random_range(1..=10000u64);
 322    let (src_patch, tgt_patch, cursor_opt) = imitate_human_edits(
 323        &split_commit.source_patch,
 324        &split_commit.target_patch,
 325        human_edit_seed,
 326    );
 327    split_commit.source_patch = src_patch;
 328    split_commit.target_patch = tgt_patch;
 329
 330    // Sample cursor position
 331    let cursor = match cursor_opt {
 332        Some(c) => c,
 333        None => sample_cursor_position(&patch, &split_commit)
 334            .context("failed to sample cursor position")?,
 335    };
 336
 337    // Get cursor excerpt
 338    let cursor_excerpt = get_cursor_excerpt(
 339        &cursor,
 340        &split_commit.source_patch,
 341        &split_commit.target_patch,
 342    )
 343    .context("failed to generate cursor excerpt")?;
 344
 345    // Handle edge case where split_point == 0
 346    if split == 0 {
 347        split_commit.target_patch = String::new();
 348    }
 349
 350    let repo_name = repository_url
 351        .trim_end_matches('/')
 352        .rsplit('/')
 353        .next()
 354        .unwrap_or("unknown");
 355    let short_sha = &commit_hash[..commit_hash.len().min(8)];
 356    let name = match sample_num {
 357        Some(n) => format!("{}-{}-{}", repo_name, short_sha, n),
 358        None => format!("{}-{}", repo_name, short_sha),
 359    };
 360
 361    Ok(ExampleSpec {
 362        name,
 363        repository_url: repository_url.to_string(),
 364        revision: format!("{}~1", commit_hash),
 365        edit_history: split_commit.source_patch.clone(),
 366        // cursor_position: cursor.to_string(),
 367        cursor_path: Path::new(&cursor.file).into(),
 368        cursor_position: cursor_excerpt,
 369        expected_patches: vec![split_commit.target_patch],
 370        tags: vec![],
 371        reasoning: None,
 372        uncommitted_diff: String::new(),
 373        rejected_patch: None,
 374        captured_prompt_input: None,
 375    })
 376}
 377
 378/// Split an ordered commit into source and target commits.
 379///
 380/// # Arguments
 381/// * `commit` - Ordered commit string
 382/// * `split_pos` - Position to split the commit (number of edited lines)
 383///
 384/// # Returns
 385/// A tuple of (source_diff, target_diff)
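///
/// For example, with `split_pos = 2`, the first two edited lines of the ordered commit go
/// into the source diff and the remaining edits go into the target diff.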
 386pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String) {
 387    let patch = Patch::parse_unified_diff(commit);
 388    let source_edits: BTreeSet<usize> = (0..split_pos).collect();
 389    let (source, target) = extract_edits(&patch, &source_edits);
 390
 391    let mut source_str = source.to_string();
 392    let target_str = target.to_string();
 393
 394    // Strip last group header from the source (lines starting with "//" at the end)
 395    let source_lines: Vec<&str> = source_str.lines().collect();
 396    let mut end_idx = source_lines.len();
 397    for i in (0..source_lines.len()).rev() {
 398        if source_lines[i].starts_with("//") {
 399            end_idx = i;
 400        } else {
 401            break;
 402        }
 403    }
 404    if end_idx < source_lines.len() {
 405        source_str = source_lines[..end_idx].join("\n");
 406        if !source_str.is_empty() {
 407            source_str.push('\n');
 408        }
 409    }
 410
 411    (source_str, target_str)
 412}
 413
/// Tokenize text into alphanumeric runs and single punctuation/whitespace characters.
/// An underscore is kept with the preceding run and also ends it, so `foo_bar`
/// tokenizes to `foo_`, `bar`.
 415fn tokenize(text: &str) -> Vec<String> {
 416    let mut tokens = Vec::new();
 417    let mut current = String::new();
 418
 419    for ch in text.chars() {
 420        if ch.is_alphanumeric() {
 421            current.push(ch);
 422        } else if ch == '_' {
 423            // Include underscore with the current word, then flush
 424            current.push(ch);
 425            if !current.is_empty() {
 426                tokens.push(std::mem::take(&mut current));
 427            }
 428        } else {
 429            // Punctuation or whitespace - flush current word first
 430            if !current.is_empty() {
 431                tokens.push(std::mem::take(&mut current));
 432            }
 433            // Each punctuation/whitespace is its own token
 434            tokens.push(ch.to_string());
 435        }
 436    }
 437
 438    if !current.is_empty() {
 439        tokens.push(current);
 440    }
 441
 442    tokens
 443}
 444
/// Calculate the weight for a split position based on the character just before it.
 446///
 447/// Higher weights indicate more natural pause points (e.g., after punctuation,
 448/// at identifier boundaries). Lower weights indicate less natural points
 449/// (e.g., mid-identifier).
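///
/// For example, in `foo(bar`, the position just after `(` has weight 10, while a position
/// inside `bar` has weight 1.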
 450fn position_weight(text: &str, pos: usize) -> u32 {
 451    if pos == 0 || pos > text.len() {
 452        return 1;
 453    }
 454
 455    let chars: Vec<char> = text.chars().collect();
 456    if pos > chars.len() {
 457        return 1;
 458    }
 459
 460    // Get the character just before this position (what we just "typed")
 461    let prev_char = chars[pos - 1];
 462
 463    // High weight: natural pause points (end of statement/argument, opening brackets)
 464    if matches!(prev_char, ',' | ';' | ':' | '(' | '[' | '{') {
 465        return 10;
 466    }
 467
 468    // High weight: closing brackets (finished a group)
 469    if matches!(prev_char, ')' | ']' | '}') {
 470        return 8;
 471    }
 472
 473    // Medium weight: operators and method chains
 474    if matches!(
 475        prev_char,
 476        '.' | '+' | '-' | '*' | '/' | '=' | '<' | '>' | '&' | '|' | '!'
 477    ) {
 478        return 5;
 479    }
 480
 481    // Check if we're at the end of an identifier (word char followed by non-word char)
 482    let is_prev_word_char = prev_char.is_alphanumeric() || prev_char == '_';
 483    let is_next_word_char =
 484        pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_');
 485
 486    if is_prev_word_char && !is_next_word_char {
 487        // End of identifier - high weight
 488        return 8;
 489    }
 490
 491    // Whitespace is a natural pause
 492    if prev_char.is_whitespace() {
 493        return 6;
 494    }
 495
 496    // Mid-identifier: low weight (rare autocomplete scenarios)
 497    if is_prev_word_char && is_next_word_char {
 498        return 1;
 499    }
 500
 501    // Default medium-low weight
 502    3
 503}
 504
 505/// Select a weighted random index from a list of weights.
 506///
 507/// Returns an index based on the weights, using the provided seed for
 508/// deterministic selection.
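///
/// For example, with weights `[1, 10, 1]` (total 12), any seed whose value modulo 12 falls
/// in 1..=10 selects index 1, so the heavy middle weight dominates.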
 509fn weighted_select(weights: &[u32], seed: u64) -> usize {
 510    if weights.is_empty() {
 511        return 0;
 512    }
 513
 514    let total_weight: u64 = weights.iter().map(|&w| w as u64).sum();
 515    if total_weight == 0 {
 516        // Fallback to uniform selection if all weights are zero
 517        return seed as usize % weights.len();
 518    }
 519
 520    // Use seed to select a value in [0, total_weight)
 521    let target = seed % total_weight;
 522    let mut cumulative: u64 = 0;
 523
 524    for (idx, &weight) in weights.iter().enumerate() {
 525        cumulative += weight as u64;
 526        if target < cumulative {
 527            return idx;
 528        }
 529    }
 530
 531    // Fallback to last index
 532    weights.len() - 1
 533}
 534
 535/// Calculate similarity ratio between two strings (0-100).
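///
/// Computed as `2 * matching / (len(s1) + len(s2))` scaled to 0-100 over a character-level
/// diff (the classic sequence-matcher ratio).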
 536fn fuzzy_ratio(s1: &str, s2: &str) -> u32 {
 537    if s1.is_empty() && s2.is_empty() {
 538        return 100;
 539    }
 540    if s1.is_empty() || s2.is_empty() {
 541        return 0;
 542    }
 543
 544    let diff = TextDiff::from_chars(s1, s2);
 545    let matching: usize = diff
 546        .ops()
 547        .iter()
 548        .filter_map(|op| {
 549            if matches!(op.tag(), DiffTag::Equal) {
 550                Some(op.new_range().len())
 551            } else {
 552                None
 553            }
 554        })
 555        .sum();
 556
 557    let total = s1.len() + s2.len();
 558    ((2 * matching * 100) / total) as u32
 559}
 560
 561/// Imitate human edits by introducing partial line edits.
 562///
 563/// This function simulates how a human might incrementally type code,
 564/// rather than making complete line replacements.
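///
/// Illustrative outcome (the exact split point depends on `seed`): if the source patch ends
/// by deleting `let x = old();` and the target patch adds `let x = new_value();`, the source
/// may gain a partial addition such as `let x = new_` while the target gains a matching
/// deletion of that partial line, with the returned cursor placed just after the typed text.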
 565pub fn imitate_human_edits(
 566    source_patch: &str,
 567    target_patch: &str,
 568    seed: u64,
 569) -> (String, String, Option<CursorPosition>) {
 570    let no_change = (source_patch.to_string(), target_patch.to_string(), None);
 571
 572    let src_patch = Patch::parse_unified_diff(source_patch);
 573    let tgt_patch = Patch::parse_unified_diff(target_patch);
 574
 575    if tgt_patch.hunks.is_empty() {
 576        return no_change;
 577    }
 578
 579    // Try to locate the first edit in target
 580    let tgt_edit_loc = match locate_edited_line(&tgt_patch, 0) {
 581        Some(loc) => loc,
 582        None => return no_change,
 583    };
 584
 585    let tgt_is_addition = matches!(tgt_edit_loc.patch_line, PatchLine::Addition(_));
 586    if !tgt_is_addition {
 587        return no_change;
 588    }
 589
 590    let tgt_line = match &tgt_edit_loc.patch_line {
 591        PatchLine::Addition(s) => s.clone(),
 592        _ => return no_change,
 593    };
 594
 595    // Try to locate the last edit in source
 596    let src_edit_loc = locate_edited_line(&src_patch, -1);
 597
 598    // Check if source has ANY edit at the same line as target's first edit
 599    // We need to iterate through all edits to check this
 600    let src_has_edit_at_target_line = {
 601        let mut found = false;
 602        let mut idx = 0isize;
 603        while let Some(loc) = locate_edited_line(&src_patch, idx) {
 604            if loc.filename == tgt_edit_loc.filename
 605                && loc.target_line_number == tgt_edit_loc.target_line_number
 606            {
 607                found = true;
 608                break;
 609            }
 610            idx += 1;
 611        }
 612        found
 613    };
 614
 615    // Check if this is a replacement (deletion followed by insertion on the same line)
 616    // or a pure insertion (no corresponding deletion in source)
 617    let is_replacement = src_edit_loc.as_ref().map_or(false, |loc| {
 618        matches!(loc.patch_line, PatchLine::Deletion(_))
 619            && loc.filename == tgt_edit_loc.filename
 620            && loc.target_line_number == tgt_edit_loc.target_line_number
 621    });
 622
 623    // If source has an edit at the same line but it's not a replacement (i.e., it's an addition),
 624    // we shouldn't process this as a pure insertion either
 625    if !is_replacement && src_has_edit_at_target_line {
 626        return no_change;
 627    }
 628
 629    let src_line = if is_replacement {
 630        match &src_edit_loc.as_ref().unwrap().patch_line {
 631            PatchLine::Deletion(s) => s.clone(),
 632            _ => return no_change,
 633        }
 634    } else {
 635        // Pure insertion: source line is empty
 636        String::new()
 637    };
 638
 639    // Don't process if source and target are the same
 640    if src_line == tgt_line {
 641        return no_change;
 642    }
 643
 644    // Tokenize both lines
 645    let src_tokens = tokenize(&src_line);
 646    let tgt_tokens = tokenize(&tgt_line);
 647
 648    // Convert to slices for similar
 649    let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect();
 650    let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect();
 651
 652    // Use similar to get diff operations
 653    let diff = TextDiff::from_slices(&src_refs, &tgt_refs);
 654
 655    // Build weights for each possible split position
 656    let mut position_weights: Vec<u32> = Vec::new();
 657
 658    // Simulate the edit process to collect weights for all possible split positions
 659    {
 660        let mut current_text = String::new();
 661
 662        for op in diff.ops() {
 663            match op.tag() {
 664                DiffTag::Equal => {
 665                    for i in op.old_range() {
 666                        current_text.push_str(&src_tokens[i]);
 667                    }
 668                }
 669                DiffTag::Replace => {
 670                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 671                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 672
 673                    // For insertion part
 674                    for ch in ins.chars() {
 675                        current_text.push(ch);
 676                        let weight = position_weight(&current_text, current_text.len());
 677                        position_weights.push(weight);
 678                    }
 679
 680                    // For deletion part (we're "untyping" from source)
 681                    for _ in del.chars() {
 682                        // Weight deletions lower as they represent removing text
 683                        position_weights.push(2);
 684                    }
 685                }
 686                DiffTag::Insert => {
 687                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 688                    for ch in ins.chars() {
 689                        current_text.push(ch);
 690                        let weight = position_weight(&current_text, current_text.len());
 691                        position_weights.push(weight);
 692                    }
 693                }
 694                DiffTag::Delete => {
 695                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 696                    for _ in del.chars() {
 697                        // Weight deletions lower
 698                        position_weights.push(2);
 699                    }
 700                }
 701            }
 702        }
 703    }
 704
 705    // Use weighted selection to choose split index
 706    if position_weights.is_empty() {
 707        return no_change;
 708    }
 709    let split_index = weighted_select(&position_weights, seed);
 710
 711    let mut edit_index = 0usize;
 712    let mut new_src = String::new();
 713    let mut split_found = false;
 714    let mut last_old_end = 0usize;
 715
 716    for op in diff.ops() {
 717        match op.tag() {
 718            DiffTag::Equal => {
 719                for i in op.old_range() {
 720                    new_src.push_str(&src_tokens[i]);
 721                }
 722                last_old_end = op.old_range().end;
 723            }
 724            DiffTag::Replace => {
 725                // Handle replace as delete + insert
 726                let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 727                let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 728                let repl_len = del.len() + ins.len();
 729                if edit_index + repl_len >= split_index {
 730                    // Split within this replace operation
 731                    let offset = split_index - edit_index;
 732                    if offset < ins.len() {
 733                        let safe_offset = floor_char_boundary(&ins, offset);
 734                        new_src.push_str(&ins[..safe_offset]);
 735                    } else {
 736                        new_src.push_str(&ins);
 737                        let del_offset = offset - ins.len();
 738                        let safe_del_offset = floor_char_boundary(&del, del_offset.min(del.len()));
 739                        new_src.push_str(&del[..safe_del_offset]);
 740                    }
 741                    split_found = true;
 742                    last_old_end = op.old_range().end;
 743                    break;
 744                } else {
 745                    edit_index += repl_len;
 746                    new_src.push_str(&ins);
 747                    last_old_end = op.old_range().end;
 748                }
 749            }
 750            DiffTag::Insert => {
 751                let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 752                if edit_index + repl.len() >= split_index {
 753                    let offset = split_index - edit_index;
 754                    let safe_offset = floor_char_boundary(&repl, offset);
 755                    new_src.push_str(&repl[..safe_offset]);
 756                    split_found = true;
 757                    break;
 758                } else {
 759                    edit_index += repl.len();
 760                    new_src.push_str(&repl);
 761                }
 762            }
 763            DiffTag::Delete => {
 764                let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 765                if edit_index + repl.len() >= split_index {
 766                    let offset = split_index - edit_index;
 767                    let safe_offset = floor_char_boundary(&repl, offset);
 768                    new_src.push_str(&repl[..safe_offset]);
 769                    split_found = true;
 770                    last_old_end = op.old_range().start + safe_offset.min(op.old_range().len());
 771                    break;
 772                } else {
 773                    edit_index += repl.len();
 774                    new_src.push_str(&repl);
 775                    last_old_end = op.old_range().end;
 776                }
 777            }
 778        }
 779    }
 780
 781    if !split_found {
 782        return no_change;
 783    }
 784
 785    // Calculate cursor position
 786    let cursor = CursorPosition {
 787        file: tgt_edit_loc.filename.clone(),
 788        line: if is_replacement {
 789            src_edit_loc.as_ref().unwrap().source_line_number
 790        } else {
 791            tgt_edit_loc.target_line_number
 792        },
 793        column: new_src.len() + 1,
 794    };
 795
 796    // Add remainder of source if similar enough to target remainder
 797    let remainder_src: String = (last_old_end..src_tokens.len())
 798        .map(|i| src_tokens[i].as_str())
 799        .collect();
 800    let remainder_tgt: String = (last_old_end..tgt_tokens.len())
 801        .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str()))
 802        .collect();
 803
 804    let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt);
 805    if ratio > 35 {
 806        new_src.push_str(&remainder_src);
 807    }
 808
 809    if new_src.trim().is_empty() {
 810        return no_change;
 811    }
 812
 813    if new_src == src_line {
 814        return no_change;
 815    }
 816
 817    // Build new source patch with the intermediate line
 818    let mut new_src_patch = src_patch;
 819    if is_replacement {
 820        // For replacements, insert after the deletion line
 821        let src_loc = src_edit_loc.as_ref().unwrap();
 822        if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 823            hunk.lines.insert(
 824                src_loc.line_index_within_hunk + 1,
 825                PatchLine::Addition(new_src.clone()),
 826            );
 827            hunk.new_count += 1;
 828        }
 829    } else {
 830        // For pure insertions, insert after the last edit in source patch
 831        // This imitates human typing - the intermediate content is what the user is currently typing
 832        let last_src_edit = locate_edited_line(&new_src_patch, -1);
 833
 834        if let Some(src_loc) = last_src_edit {
 835            // Insert after the last edit in source
 836            if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 837                hunk.lines.insert(
 838                    src_loc.line_index_within_hunk + 1,
 839                    PatchLine::Addition(new_src.clone()),
 840                );
 841                hunk.new_count += 1;
 842            }
 843        } else {
 844            // Source patch is empty or has incompatible hunk structure, create a new hunk based on target
 845            if let Some(tgt_hunk) = tgt_patch.hunks.get(tgt_edit_loc.hunk_index) {
 846                let mut new_hunk = tgt_hunk.clone();
 847                // Replace the full addition with the partial one
 848                new_hunk.lines.clear();
 849                for (i, line) in tgt_hunk.lines.iter().enumerate() {
 850                    if i == tgt_edit_loc.line_index_within_hunk {
 851                        new_hunk.lines.push(PatchLine::Addition(new_src.clone()));
 852                    } else {
 853                        match line {
 854                            PatchLine::Addition(_) => {
 855                                // Skip other additions from target
 856                            }
 857                            _ => new_hunk.lines.push(line.clone()),
 858                        }
 859                    }
 860                }
 861                new_hunk.new_count = new_hunk.old_count + 1;
 862                new_src_patch.hunks.push(new_hunk);
 863                // Copy header from target if source doesn't have one
 864                if new_src_patch.header.is_empty() {
 865                    new_src_patch.header = tgt_patch.header.clone();
 866                }
 867            }
 868        }
 869    }
 870
 871    // Build new target patch with the intermediate line as deletion
 872    let mut new_tgt_patch = tgt_patch;
 873    if let Some(hunk) = new_tgt_patch.hunks.get_mut(tgt_edit_loc.hunk_index) {
 874        hunk.lines.insert(
 875            tgt_edit_loc.line_index_within_hunk,
 876            PatchLine::Deletion(new_src),
 877        );
 878        hunk.old_count += 1;
 879    }
 880
 881    (
 882        new_src_patch.to_string(),
 883        new_tgt_patch.to_string(),
 884        Some(cursor),
 885    )
 886}
 887
 888/// Locate the end of the last edit in a patch.
 889fn locate_end_of_last_edit(patch: &Patch) -> Option<CursorPosition> {
 890    let loc = locate_edited_line(patch, -1)?;
 891
 892    let (line, col) = match &loc.patch_line {
 893        PatchLine::Addition(content) => (loc.target_line_number, content.len()),
 894        PatchLine::Deletion(_) => (loc.target_line_number, 1),
 895        _ => return None,
 896    };
 897
 898    Some(CursorPosition {
 899        file: loc.filename,
 900        line,
 901        column: col,
 902    })
 903}
 904
 905/// Locate the beginning of the first edit in a patch.
 906fn locate_beginning_of_first_edit(patch: &Patch) -> Option<CursorPosition> {
 907    let loc = locate_edited_line(patch, 0)?;
 908
 909    let hunk = patch.hunks.get(loc.hunk_index)?;
 910    let column = if loc.line_index_within_hunk > 0 {
 911        if let Some(prev_line) = hunk.lines.get(loc.line_index_within_hunk - 1) {
 912            let content = match prev_line {
 913                PatchLine::Context(s) | PatchLine::Addition(s) | PatchLine::Deletion(s) => s,
 914                _ => return None,
 915            };
 916            content.len().max(1) - 1
 917        } else {
 918            0
 919        }
 920    } else {
 921        0
 922    };
 923
 924    let line = loc.target_line_number.saturating_sub(1).max(1);
 925
 926    Some(CursorPosition {
 927        file: loc.filename,
 928        line,
 929        column,
 930    })
 931}
 932
/// Sample a cursor position for the example, trying in order:
/// 1. the end of the last edit in the source patch,
/// 2. the beginning of the first edit in the target patch,
/// 3. as a fallback, the end of the last edit in the original patch.
 936pub fn sample_cursor_position(patch: &Patch, split_commit: &SplitCommit) -> Option<CursorPosition> {
 937    // Try end of history first
 938    let src_patch = Patch::parse_unified_diff(&split_commit.source_patch);
 939    if let Some(cursor) = locate_end_of_last_edit(&src_patch) {
 940        return Some(cursor);
 941    }
 942
 943    // Try beginning of target
 944    let tgt_patch = Patch::parse_unified_diff(&split_commit.target_patch);
 945    if let Some(cursor) = locate_beginning_of_first_edit(&tgt_patch) {
 946        return Some(cursor);
 947    }
 948
 949    // Fallback: use the original patch
 950    locate_end_of_last_edit(patch)
 951}
 952
 953/// Get cursor excerpt from the patches.
 954///
 955/// This extracts the lines around the cursor position with a cursor marker.
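///
/// Illustrative result (assuming the cursor falls mid-line inside the excerpt's hunk):
///
/// ```text
/// fn main() {
///     println!("he<|user_cursor|>llo");
/// }
/// ```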
 956pub fn get_cursor_excerpt(
 957    cursor: &CursorPosition,
 958    source_patch: &str,
 959    target_patch: &str,
 960) -> Option<String> {
 961    let mut excerpt_lines: Vec<String> = Vec::new();
 962    let mut excerpt_first_line: usize = 0;
 963
 964    // Search in the last hunk of source patch
 965    let src = Patch::parse_unified_diff(source_patch);
 966    if let Some(loc) = locate_edited_line(&src, -1) {
 967        if loc.filename == cursor.file && loc.target_line_number == cursor.line {
 968            if let Some(hunk) = src.hunks.get(loc.hunk_index) {
 969                excerpt_first_line = hunk.new_start as usize;
 970                for line in &hunk.lines {
 971                    match line {
 972                        PatchLine::Addition(s) | PatchLine::Context(s) => {
 973                            excerpt_lines.push(s.clone());
 974                        }
 975                        _ => {}
 976                    }
 977                }
 978                // If hunk only has deletions (file deletion), include deletion lines
 979                if excerpt_lines.is_empty() {
 980                    excerpt_first_line = hunk.old_start as usize;
 981                    for line in &hunk.lines {
 982                        match line {
 983                            PatchLine::Deletion(s) => {
 984                                excerpt_lines.push(s.clone());
 985                            }
 986                            _ => {}
 987                        }
 988                    }
 989                }
 990            }
 991        }
 992    }
 993
 994    // Search in target patch if not found
 995    if excerpt_lines.is_empty() {
 996        let tgt = Patch::parse_unified_diff(target_patch);
 997        // Search all hunks for the cursor file, not just the first edit's hunk
 998        for hunk in &tgt.hunks {
 999            if hunk.filename == cursor.file {
1000                excerpt_first_line = hunk.new_start as usize;
1001                // First try to collect deletions and context (what exists before edits)
1002                for line in &hunk.lines {
1003                    match line {
1004                        PatchLine::Deletion(s) | PatchLine::Context(s) => {
1005                            excerpt_lines.push(s.clone());
1006                        }
1007                        _ => {}
1008                    }
1009                }
1010                // If hunk only has additions (no deletions/context), include all lines
1011                // This handles cases like adding to an empty file or section
1012                if excerpt_lines.is_empty() {
1013                    for line in &hunk.lines {
1014                        match line {
1015                            PatchLine::Addition(s)
1016                            | PatchLine::Deletion(s)
1017                            | PatchLine::Context(s) => {
1018                                excerpt_lines.push(s.clone());
1019                            }
1020                            _ => {}
1021                        }
1022                    }
1023                }
1024                if !excerpt_lines.is_empty() {
1025                    break;
1026                }
1027            }
1028        }
1029    }
1030
1031    // Also search source patch hunks if still not found (for fallback cursor case)
1032    if excerpt_lines.is_empty() {
1033        for hunk in &src.hunks {
1034            if hunk.filename == cursor.file {
1035                excerpt_first_line = hunk.new_start as usize;
1036                for line in &hunk.lines {
1037                    match line {
1038                        PatchLine::Addition(s) | PatchLine::Context(s) => {
1039                            excerpt_lines.push(s.clone());
1040                        }
1041                        _ => {}
1042                    }
1043                }
1044                // If hunk only has deletions, include deletion lines
1045                if excerpt_lines.is_empty() {
1046                    excerpt_first_line = hunk.old_start as usize;
1047                    for line in &hunk.lines {
1048                        match line {
1049                            PatchLine::Deletion(s) => {
1050                                excerpt_lines.push(s.clone());
1051                            }
1052                            _ => {}
1053                        }
1054                    }
1055                }
1056                if !excerpt_lines.is_empty() {
1057                    break;
1058                }
1059            }
1060        }
1061    }
1062
1063    if excerpt_lines.is_empty() {
1064        return None;
1065    }
1066
1067    // Add cursor marker
1068    for (i, line) in excerpt_lines.iter_mut().enumerate() {
1069        let line_num = excerpt_first_line + i;
1070        if line_num == cursor.line {
            // Ensure we split at a valid UTF-8 character boundary
            let col = floor_char_boundary(line, cursor.column.min(line.len()));
1082            let (before, after) = line.split_at(col);
1083            *line = format!("{}<|user_cursor|>{}", before, after);
1084            break;
1085        }
1086    }
1087
1088    Some(excerpt_lines.join("\n"))
1089}
1090
1091#[cfg(test)]
1092mod tests {
1093    use std::path::Path;
1094
1095    use edit_prediction::example_spec::ExampleSpec;
1096
1097    use super::*;
1098
1099    #[test]
1100    fn test_tokenize() {
1101        let tokens = tokenize("hello world");
1102        assert_eq!(tokens, vec!["hello", " ", "world"]);
1103
1104        let tokens = tokenize("foo_bar123 + baz");
1105        assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]);
1106
1107        let tokens = tokenize("print(\"hello\")");
1108        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
1109
1110        let tokens = tokenize("hello_world");
1111        assert_eq!(tokens, vec!["hello_", "world"]);
1112
1113        let tokens = tokenize("fn();");
1114        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
1115    }
1116
1117    #[test]
1118    fn test_fuzzy_ratio() {
1119        assert_eq!(fuzzy_ratio("hello", "hello"), 100);
1120        assert_eq!(fuzzy_ratio("", ""), 100);
1121        assert!(fuzzy_ratio("hello", "world") < 50);
1122        assert!(fuzzy_ratio("hello world", "hello worl") > 80);
1123    }
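
    // Minimal sanity check: `floor_char_boundary` clamps to the string length and backs up
    // to the previous boundary inside a multi-byte character.
    #[test]
    fn test_floor_char_boundary() {
        assert_eq!(floor_char_boundary("abc", 10), 3);
        assert_eq!(floor_char_boundary("abc", 2), 2);
        // "é" occupies bytes 1..3, so byte index 2 is not a char boundary.
        assert_eq!(floor_char_boundary("aé", 2), 1);
    }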
1124
1125    #[test]
1126    fn test_split_ordered_commit() {
1127        let commit = r#"// First change
1128--- a/test.rs
1129+++ b/test.rs
1130@@ -1,3 +1,4 @@
1131 fn main() {
1132+    println!("hello");
1133+    println!("world");
1134 }
1135"#;
1136        let patch = Patch::parse_unified_diff(commit);
1137        let stats = patch.stats();
1138        assert_eq!(stats.added, 2);
1139
1140        let (source, target) = split_ordered_commit(commit, 1);
1141
1142        // Source should have 1 addition
1143        let src_patch = Patch::parse_unified_diff(&source);
1144        assert_eq!(src_patch.stats().added, 1);
1145
1146        // Target should have 1 addition
1147        let tgt_patch = Patch::parse_unified_diff(&target);
1148        assert_eq!(tgt_patch.stats().added, 1);
1149    }
1150
1151    #[test]
1152    fn test_split_ordered_commit_with_deletions() {
1153        let commit = r#"// Change
1154--- a/test.rs
1155+++ b/test.rs
1156@@ -1,3 +1,3 @@
1157 fn main() {
1158-    println!("old");
1159+    println!("new");
1160 }
1161"#;
1162        let patch = Patch::parse_unified_diff(commit);
1163        let stats = patch.stats();
1164        assert_eq!(stats.added, 1);
1165        assert_eq!(stats.removed, 1);
1166
1167        // Split at position 1 (after the deletion)
1168        let (source, target) = split_ordered_commit(commit, 1);
1169
1170        let src_patch = Patch::parse_unified_diff(&source);
1171        let tgt_patch = Patch::parse_unified_diff(&target);
1172
1173        // Source should have the deletion
1174        assert_eq!(src_patch.stats().removed, 1);
1175        // Target should have the addition
1176        assert_eq!(tgt_patch.stats().added, 1);
1177    }
1178
1179    #[test]
1180    fn test_generate_evaluation_example() {
1181        let commit = r#"commit abc123
1182Author: Test <test@example.com>
1183Date: Mon Jan 1 00:00:00 2024
1184
1185    Test commit
1186
1187////////////////////////////////////////////////////////////////////////////////
1188// Add greeting
1189////////////////////////////////////////////////////////////////////////////////
1190--- a/test.rs
1191+++ b/test.rs
1192@@ -1,3 +1,5 @@
1193 fn main() {
1194+    println!("hello");
1195+    println!("world");
1196 }
1197"#;
1198
1199        let result = generate_evaluation_example_from_ordered_commit(
1200            commit,
1201            "https://github.com/test/repo",
1202            "abc123",
1203            Some(SplitPoint::Fraction(0.5)),
1204            Some(42),
1205            None,
1206        );
1207
1208        assert!(result.is_ok());
1209        let case = result.unwrap();
1210        assert_eq!(case.repository_url, "https://github.com/test/repo");
1211        assert_eq!(case.revision, "abc123~1");
1212        assert!(!case.edit_history.is_empty());
1213    }
1214
1215    #[test]
1216    fn test_generate_evaluation_example_reproducible() {
1217        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1218// Add greeting
1219////////////////////////////////////////////////////////////////////////////////
1220--- a/test.rs
1221+++ b/test.rs
1222@@ -1,3 +1,5 @@
1223 fn main() {
1224+    println!("hello");
1225+    println!("world");
1226 }
1227"#;
1228
1229        // Run twice with the same seed
1230        let result1 = generate_evaluation_example_from_ordered_commit(
1231            commit,
1232            "https://github.com/test/repo",
1233            "abc123",
1234            Some(SplitPoint::Fraction(0.5)),
1235            Some(12345),
1236            None,
1237        )
1238        .unwrap();
1239
1240        let result2 = generate_evaluation_example_from_ordered_commit(
1241            commit,
1242            "https://github.com/test/repo",
1243            "abc123",
1244            Some(SplitPoint::Fraction(0.5)),
1245            Some(12345),
1246            None,
1247        )
1248        .unwrap();
1249
1250        // Results should be identical
1251        assert_eq!(result1.edit_history, result2.edit_history);
1252        assert_eq!(result1.expected_patches, result2.expected_patches);
1253        assert_eq!(result1.cursor_position, result2.cursor_position);
1254    }
1255
1256    #[test]
1257    fn test_cursor_position_display() {
1258        let cursor = CursorPosition {
1259            file: "src/main.rs".to_string(),
1260            line: 42,
1261            column: 10,
1262        };
1263        assert_eq!(cursor.to_string(), "src/main.rs:42:10");
1264    }
1265
1266    #[test]
1267    fn test_imitate_human_edits_no_change_when_no_replacement() {
1268        // Source and target patches that don't form a replacement pattern
1269        let source = r#"--- a/test.rs
1270+++ b/test.rs
1271@@ -1,3 +1,4 @@
1272 fn main() {
1273+    println!("hello");
1274 }
1275"#;
1276        let target = r#"--- a/test.rs
1277+++ b/test.rs
1278@@ -1,3 +1,4 @@
1279 fn main() {
1280+    println!("world");
1281 }
1282"#;
1283
1284        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);
1285
1286        // Should return unchanged when not a replacement pattern
1287        assert_eq!(new_src, source);
1288        assert_eq!(new_tgt, target);
1289        assert!(cursor.is_none());
1290    }
1291
1292    #[test]
1293    fn test_split_point_fraction() {
1294        let commit = r#"// Change
1295--- a/test.rs
1296+++ b/test.rs
1297@@ -1,5 +1,10 @@
1298 fn main() {
1299+    line1();
1300+    line2();
1301+    line3();
1302+    line4();
1303+    line5();
1304 }
1305"#;
1306
1307        // Split at 20% should give first edit in source
1308        let result = generate_evaluation_example_from_ordered_commit(
1309            commit,
1310            "",
1311            "hash",
1312            Some(SplitPoint::Fraction(0.2)),
1313            Some(1),
1314            None,
1315        );
1316
1317        assert!(result.is_ok());
1318        let case = result.unwrap();
1319
1320        // Source should have some edits
1321        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1322        assert!(src_patch.stats().added > 0);
1323    }
1324
1325    #[test]
1326    fn test_split_point_index() {
1327        let commit = r#"// Change
1328--- a/test.rs
1329+++ b/test.rs
1330@@ -1,5 +1,10 @@
1331 fn main() {
1332+    line1();
1333+    line2();
1334+    line3();
1335+    line4();
1336+    line5();
1337 }
1338"#;
1339
1340        // Split at index 2 should give first 2 edits in source
1341        // With pure insertion handling, source gets 2 original + 1 partial = 3 additions
1342        let result = generate_evaluation_example_from_ordered_commit(
1343            commit,
1344            "",
1345            "hash",
1346            Some(SplitPoint::Index(2)),
1347            Some(1),
1348            None,
1349        );
1350
1351        assert!(result.is_ok());
1352        let case = result.unwrap();
1353
1354        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1355        // Pure insertion adds a partial line, so we expect 3 (2 original + 1 partial)
1356        assert_eq!(src_patch.stats().added, 3);
1357    }
1358
1359    #[test]
1360    fn test_cursor_excerpt_contains_marker() {
1361        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1362// Add code
1363////////////////////////////////////////////////////////////////////////////////
1364--- a/test.rs
1365+++ b/test.rs
1366@@ -1,3 +1,5 @@
1367 fn main() {
1368+    println!("hello");
1369+    println!("world");
1370 }
1371"#;
1372
1373        let result = generate_evaluation_example_from_ordered_commit(
1374            commit,
1375            "",
1376            "hash",
1377            Some(SplitPoint::Fraction(0.5)),
1378            Some(42),
1379            None,
1380        )
1381        .unwrap();
1382
1383        // Cursor excerpt should contain the cursor marker
1384        assert!(
1385            result.cursor_position.contains("<|user_cursor|>"),
1386            "Cursor excerpt should contain marker: {}",
1387            result.cursor_position
1388        );
1389    }
1390
1391    #[test]
1392    fn test_evaluation_case_json_serialization() {
1393        let case = ExampleSpec {
1394            name: "test-abc123".to_string(),
1395            repository_url: "https://github.com/test/repo".to_string(),
1396            revision: "abc123~1".to_string(),
1397            edit_history: "patch1".to_string(),
1398            // cursor_position: "file.rs:10:5".to_string(),
1399            cursor_path: Path::new("file.rs").into(),
1400            cursor_position: "some code<|user_cursor|>".to_string(),
1401            expected_patches: vec!["patch".to_string()],
1402            tags: vec![],
1403            reasoning: None,
1404            uncommitted_diff: String::new(),
1405            rejected_patch: None,
1406            captured_prompt_input: None,
1407        };
1408
1409        let json = serde_json::to_string(&case).unwrap();
1410        let deserialized: ExampleSpec = serde_json::from_str(&json).unwrap();
1411
1412        assert_eq!(case.repository_url, deserialized.repository_url);
1413        assert_eq!(case.revision, deserialized.revision);
1414        assert_eq!(case.cursor_position, deserialized.cursor_position);
1415    }
1416
1417    #[test]
1418    fn test_empty_commit_returns_error() {
1419        let commit = "";
1420
1421        let result = generate_evaluation_example_from_ordered_commit(
1422            commit,
1423            "",
1424            "hash",
1425            Some(SplitPoint::Fraction(0.5)),
1426            Some(1),
1427            None,
1428        );
1429
1430        assert!(result.is_err());
1431    }
1432
1433    #[test]
1434    fn test_header_filtering() {
1435        let commit = r#"commit abc123
1436Author: Test
1437Date: Today
1438
1439    Message
1440
1441diff --git a/test.rs b/test.rs
1442index 123..456 789
1443////////////////////////////////////////////////////////////////////////////////
1444// First group
1445////////////////////////////////////////////////////////////////////////////////
1446--- a/test.rs
1447+++ b/test.rs
1448@@ -1,3 +1,4 @@
1449 fn main() {
1450+    code();
1451 }
1452"#;
1453
1454        let result = generate_evaluation_example_from_ordered_commit(
1455            commit,
1456            "",
1457            "hash",
1458            Some(SplitPoint::Index(1)),
1459            Some(1),
1460            None,
1461        );
1462
1463        assert!(result.is_ok());
1464        let case = result.unwrap();
1465
1466        // The edit history should contain the group header (// lines)
1467        // but not the commit metadata
1468        assert!(!case.edit_history.contains("Author:"));
1469        assert!(!case.edit_history.contains("Date:"));
1470    }
1471
1472    #[test]
1473    fn test_position_weight() {
1474        // High weight positions (natural pause points)
1475        assert_eq!(position_weight("foo(", 4), 10); // After '('
1476        assert_eq!(position_weight("a, b", 2), 10); // After ','
1477        assert_eq!(position_weight("x;", 2), 10); // After ';'
1478        assert_eq!(position_weight("a: b", 2), 10); // After ':'
1479        assert_eq!(position_weight("[", 1), 10); // After '['
1480        assert_eq!(position_weight("{", 1), 10); // After '{'
1481
1482        // High weight for closing brackets
1483        assert_eq!(position_weight("foo)", 4), 8); // After ')'
1484        assert_eq!(position_weight("]", 1), 8); // After ']'
1485        assert_eq!(position_weight("}", 1), 8); // After '}'
1486
1487        // High weight at end of identifier
1488        assert_eq!(position_weight("foo ", 3), 8); // End of 'foo' before space
1489        assert_eq!(position_weight("bar(", 3), 8); // End of 'bar' before '('
1490
1491        // Medium weight for operators
1492        assert_eq!(position_weight("a + b", 3), 5); // After '+'
1493        assert_eq!(position_weight("x.", 2), 5); // After '.'
1494        assert_eq!(position_weight("a=b", 2), 5); // After '='
1495
1496        // Medium weight for whitespace
1497        assert_eq!(position_weight("a ", 2), 6); // After space
1498
1499        // Low weight mid-identifier
1500        assert_eq!(position_weight("foobar", 3), 1); // Mid-identifier 'foo|bar'
1501
1502        // Edge cases
1503        assert_eq!(position_weight("", 0), 1); // Empty string
1504        assert_eq!(position_weight("a", 0), 1); // Position 0
1505    }

    #[test]
    fn test_weighted_select() {
        // Test that weighted selection returns correct indices
        let weights = vec![1, 10, 1];

        // With total weight 12, seed 0 should select index 0
        // seed 0 % 12 = 0, cumulative: 1 at idx 0, so returns 0
        assert_eq!(weighted_select(&weights, 0), 0);

        // seed 1 % 12 = 1, cumulative: 1 at idx 0 (1 < 1 is false), 11 at idx 1 (1 < 11 is true)
        assert_eq!(weighted_select(&weights, 1), 1);

        // seed 10 % 12 = 10, cumulative: 1, 11 at idx 1 (10 < 11 is true)
        assert_eq!(weighted_select(&weights, 10), 1);

        // seed 11 % 12 = 11, cumulative: 1, 11 at idx 1 (11 < 11 is false), 12 at idx 2 (11 < 12 is true)
        assert_eq!(weighted_select(&weights, 11), 2);

        // Empty weights should return 0
        let empty: Vec<u32> = vec![];
        assert_eq!(weighted_select(&empty, 42), 0);

        // Single weight should always return index 0
        let single = vec![10];
        assert_eq!(weighted_select(&single, 0), 0);
        assert_eq!(weighted_select(&single, 100), 0);
    }
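
    // A hedged property-style companion to test_weighted_select: assuming the
    // cumulative (roulette-wheel) scan documented above, any seed should map
    // to an in-bounds index for a non-empty, non-zero weight vector, and the
    // same seed should always map to the same index. The weight values and
    // seed range here are illustrative only.
    #[test]
    fn test_weighted_select_deterministic_and_in_bounds() {
        let weights: Vec<u32> = vec![3, 5, 2];

        for seed in 0..30 {
            let index = weighted_select(&weights, seed);
            assert!(
                index < weights.len(),
                "Selected index {} out of bounds for seed {}",
                index,
                seed
            );
            assert_eq!(
                index,
                weighted_select(&weights, seed),
                "Selection should be deterministic for seed {}",
                seed
            );
        }
    }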

    #[test]
    fn test_weighted_split_prefers_natural_boundaries() {
        // Verify that positions after punctuation are weighted higher than
        // mid-identifier positions, so weighted selection will tend to pick
        // them as split points (only the weights themselves are compared here)
        let text_with_punctuation = "foo(bar, baz)";
        let text_mid_identifier = "foobar";

        // Position after '(' should have high weight
        let weight_after_paren = position_weight(text_with_punctuation, 4);
        // Position after ',' should have high weight
        let weight_after_comma = position_weight(text_with_punctuation, 8);
        // Position mid-identifier should have low weight
        let weight_mid_ident = position_weight(text_mid_identifier, 3);

        assert!(
            weight_after_paren > weight_mid_ident,
            "After '(' ({}) should be weighted higher than mid-identifier ({})",
            weight_after_paren,
            weight_mid_ident
        );
        assert!(
            weight_after_comma > weight_mid_ident,
            "After ',' ({}) should be weighted higher than mid-identifier ({})",
            weight_after_comma,
            weight_mid_ident
        );
    }

    #[test]
    fn test_imitate_human_edits_pure_insertion() {
        // Source patch has no edits yet (its hunk contains only context lines)
        // Target patch has a pure insertion (adding a new line)
        let source = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,2 @@
 fn main() {
 }
"#;
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("debug");
 }
"#;

        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);

        // Should have transformed the patches
        assert_ne!(
            new_src, source,
            "Source should be modified for pure insertion"
        );
        assert_ne!(
            new_tgt, target,
            "Target should be modified for pure insertion"
        );
        assert!(cursor.is_some(), "Cursor should be set");

        // Source should now have a partial addition
        let src_patch = Patch::parse_unified_diff(&new_src);
        assert!(
            src_patch.stats().added > 0,
            "Source should have added lines"
        );

        // Target should have both a deletion (of partial) and addition (of full)
        let tgt_patch = Patch::parse_unified_diff(&new_tgt);
        assert!(
            tgt_patch.stats().removed > 0,
            "Target should have removed lines (partial)"
        );
        assert!(
            tgt_patch.stats().added > 0,
            "Target should have added lines (full)"
        );

        // The cursor should be in test.rs
        let cursor = cursor.unwrap();
        assert_eq!(cursor.file, "test.rs");
    }
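
    // A hedged reproducibility check: the seed parameter exists so that split
    // generation is reproducible, so calling imitate_human_edits twice with
    // identical inputs and the same seed is assumed to yield identical output.
    // The patches below reuse the shape of the test above.
    #[test]
    fn test_imitate_human_edits_deterministic_for_fixed_seed() {
        let source = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,2 @@
 fn main() {
 }
"#;
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("debug");
 }
"#;

        let first = imitate_human_edits(source, target, 42);
        let second = imitate_human_edits(source, target, 42);
        assert_eq!(
            first, second,
            "Same seed and inputs should reproduce the same split"
        );
    }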

    #[test]
    fn test_imitate_human_edits_pure_insertion_empty_source() {
        // Source patch has no hunks at all
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello");
 }
"#;

        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 123);

        // Should have created a source patch with partial insertion
        assert!(!new_src.is_empty(), "Source should not be empty");
        assert!(cursor.is_some(), "Cursor should be set");

        let src_patch = Patch::parse_unified_diff(&new_src);
        assert!(
            src_patch.stats().added > 0,
            "Source should have added lines"
        );
    }

    #[test]
    fn test_imitate_human_edits_pure_insertion_intermediate_content() {
        // Verify the actual intermediate content is a realistic partial typing state
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello world");
 }
"#;

        // Test with multiple seeds to see different split points
        let mut found_partial = false;
        for seed in 1..=50 {
            let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, seed);

            if cursor.is_some() {
                let src_patch = Patch::parse_unified_diff(&new_src);
                let tgt_patch = Patch::parse_unified_diff(&new_tgt);

                // Find the added line in source
                for hunk in &src_patch.hunks {
                    for line in &hunk.lines {
                        if let PatchLine::Addition(content) = line {
                            // The partial line should be a prefix of the full line
                            let full_line = "    println!(\"hello world\");";
                            if content != full_line && full_line.starts_with(content) {
                                found_partial = true;

                                // Verify target has the partial as deletion
                                let mut has_deletion = false;
                                for tgt_hunk in &tgt_patch.hunks {
                                    for tgt_line in &tgt_hunk.lines {
                                        if let PatchLine::Deletion(del_content) = tgt_line {
                                            if del_content == content {
                                                has_deletion = true;
                                            }
                                        }
                                    }
                                }
                                assert!(
                                    has_deletion,
                                    "Target should have deletion of partial line"
                                );
                            }
                        }
                    }
                }
            }
        }

        assert!(
            found_partial,
            "At least one seed should produce a partial intermediate state"
        );
    }

    #[test]
    fn test_imitate_human_edits_inserts_after_last_source_edit() {
        // Regression test: intermediate content should appear after the last edit
        // in the source patch, not at the position of the first target edit.
        // This ensures the diff output correctly imitates human typing order.
        //
        // The bug was: when source has edits and target has a pure insertion,
        // the intermediate content was inserted at tgt_edit_loc.line_index_within_hunk
        // (position of first target edit) instead of after the last source edit.
        //
        // Source patch has edits at lines 1-4, target has a new edit at line 10
        // (different location to avoid the "same line" early return)
        let source = r#"--- a/test.py
+++ b/test.py
@@ -1,4 +1,5 @@
+import foo
 import bar
-import old
 import baz
+import qux
"#;
        // Target has a pure insertion at a different line (line 10, not overlapping with source)
        let target = r#"--- a/test.py
+++ b/test.py
@@ -10,3 +10,4 @@
 def main():
+    print("hello world")
     pass
"#;

        // Use a seed that produces a partial result
        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 42);

        // The function should produce a modified patch
        assert!(cursor.is_some(), "Should produce intermediate state");

        let src_patch = Patch::parse_unified_diff(&new_src);
        let all_additions: Vec<_> = src_patch
            .hunks
            .iter()
            .flat_map(|h| h.lines.iter())
            .filter_map(|l| match l {
                PatchLine::Addition(s) => Some(s.as_str()),
                _ => None,
            })
            .collect();

        // The intermediate content (partial 'print("hello world")') should be
        // the LAST addition, appearing after "+import qux" (the last source edit)
        let last_addition = all_additions.last().expect("Should have additions");
        assert!(
            last_addition.trim_start().starts_with("pr"),
            "Intermediate content should be the last addition (partial 'print'), but last was: {:?}",
            last_addition
        );

        // Verify the original source edits are still in order before the intermediate
        let foo_pos = all_additions.iter().position(|s| *s == "import foo");
        let qux_pos = all_additions.iter().position(|s| *s == "import qux");
        let intermediate_pos = all_additions
            .iter()
            .position(|s| s.trim_start().starts_with("pr"));

        assert!(foo_pos.is_some(), "Should have 'import foo'");
        assert!(qux_pos.is_some(), "Should have 'import qux'");
        assert!(
            intermediate_pos.is_some(),
            "Should have intermediate content"
        );

        assert!(
            foo_pos < qux_pos && qux_pos < intermediate_pos,
            "Order should be: foo < qux < intermediate. Got foo={:?}, qux={:?}, intermediate={:?}",
            foo_pos,
            qux_pos,
            intermediate_pos
        );
    }

    #[test]
    fn test_cursor_excerpt_with_multibyte_utf8() {
        // Test that cursor excerpt handles multi-byte UTF-8 characters correctly
        // The Chinese character '第' is 3 bytes (0..3)
        let cursor = CursorPosition {
            file: "test.md".to_string(),
            line: 1,
            column: 1, // Byte index 1 is inside '第' (bytes 0..3)
        };

        let source_patch = r#"--- a/test.md
+++ b/test.md
@@ -1,1 +1,1 @@
+第 14 章 Flask 工作原理与机制解析**
"#;

        let target_patch = "";

        // This should not panic even though column=1 is not a char boundary
        let result = get_cursor_excerpt(&cursor, source_patch, target_patch);

        // The function should handle the invalid byte index gracefully
        if let Some(excerpt) = result {
            assert!(
                excerpt.contains("<|user_cursor|>"),
                "Cursor excerpt should contain marker"
            );
            // The marker should be placed at a valid character boundary
            // (either at the start or after '第')
        }
    }
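
    // A hedged companion to the test above: with the column already on a valid
    // character boundary (byte 3, immediately after '第'), the excerpt should
    // also be produced without panicking. The marker text is the one asserted
    // in the previous test; the patch content is reused from there.
    #[test]
    fn test_cursor_excerpt_with_multibyte_utf8_valid_boundary() {
        let cursor = CursorPosition {
            file: "test.md".to_string(),
            line: 1,
            column: 3, // Byte index 3 is the boundary right after '第'
        };

        let source_patch = r#"--- a/test.md
+++ b/test.md
@@ -1,1 +1,1 @@
+第 14 章 Flask 工作原理与机制解析**
"#;

        // Only assert on the excerpt when one is returned, mirroring the
        // previous test; the key property is that this call does not panic.
        if let Some(excerpt) = get_cursor_excerpt(&cursor, source_patch, "") {
            assert!(
                excerpt.contains("<|user_cursor|>"),
                "Cursor excerpt should contain marker"
            );
        }
    }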
}