split_commit.rs

   1//! `ep split-commit` implementation.
   2//!
   3//! This command generates a single evaluation example JSON object from a
   4//! chronologically-ordered unified diff (a "commit").
   5//!
   6//! TODO: Port Python code to generate chronologically-ordered commits
   7use crate::FailedHandling;
   8use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
   9
  10/// Find the largest valid UTF-8 char boundary at or before `index` in `s`.
  11fn floor_char_boundary(s: &str, index: usize) -> usize {
  12    if index >= s.len() {
  13        s.len()
  14    } else if s.is_char_boundary(index) {
  15        index
  16    } else {
  17        // Find the nearest valid character boundary at or before index
  18        (0..index)
  19            .rev()
  20            .find(|&i| s.is_char_boundary(i))
  21            .unwrap_or(0)
  22    }
  23}
  24use anyhow::{Context as _, Result};
  25use clap::Args;
  26use edit_prediction::example_spec::ExampleSpec;
  27use rand::Rng;
  28use rand::SeedableRng;
  29use serde::{Deserialize, Serialize};
  30use similar::{DiffTag, TextDiff};
  31use std::collections::BTreeSet;
  32use std::fs;
  33use std::io::{self, Write};
  34use std::path::Path;
  35use std::path::PathBuf;
  36
/// `ep split-commit` CLI args.
// NOTE: clap's derive uses the `///` comments on the fields below as the
// `--help` text, so rewording them changes user-visible output. Maintainer
// notes therefore live in plain `//` comments.
#[derive(Debug, Args, Clone)]
pub struct SplitCommitArgs {
    /// Split point (float 0.0-1.0 for fraction, or integer for index)
    // Kept as the raw string. `parse_split_point` treats a value containing
    // '.' as a fraction and anything else as an index. Not used when
    // `num_samples` is set: that mode always samples random split points.
    #[arg(long, short = 's')]
    pub split_point: Option<String>,

    /// Random seed for reproducibility
    // In multi-sample mode sample `i` is generated with `seed.wrapping_add(i)`;
    // when no seed is given there, a random base seed is drawn per input line.
    #[arg(long)]
    pub seed: Option<u64>,

    /// Pretty-print JSON output
    // Selects `serde_json::to_string_pretty` over `serde_json::to_string`.
    #[arg(long, short = 'p')]
    pub pretty: bool,

    /// Number of samples to generate per commit (samples random split points)
    // Samples whose serialized JSON is identical are emitted only once, so a
    // commit may yield fewer than this many output entries.
    #[arg(long, short = 'n')]
    pub num_samples: Option<usize>,
}
  56
/// Input format for annotated commits (JSON Lines).
///
/// Each non-blank input line is deserialized into one `AnnotatedCommit`.
/// No field is optional or has a serde default, so a line missing any of
/// them fails to parse.
#[derive(Debug, Clone, Deserialize)]
// Only `repo_url`, `commit_sha` and `reordered_commit` are read by
// `run_split_commit`; the remaining fields appear to exist to mirror the input
// schema, which is presumably why dead-code warnings are silenced here.
#[allow(dead_code)]
pub struct AnnotatedCommit {
    /// Repository path (e.g., "repos/zed")
    pub repo: String,
    /// Repository URL (e.g., "https://github.com/zed-industries/zed")
    pub repo_url: String,
    /// Commit SHA
    pub commit_sha: String,
    /// Chronologically reordered commit diff (the diff that gets split)
    pub reordered_commit: String,
    /// Original commit diff
    pub original_commit: String,
    /// Whether diff stats match between original and reordered
    pub diff_stats_match: bool,
}
  74
/// Cursor position in a file.
///
/// Rendered as `file:line:column` by its `Display` implementation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CursorPosition {
    /// Path of the file containing the cursor, as named in the patch.
    pub file: String,
    /// Line number of the cursor.
    // NOTE(review): taken from patch line numbers; presumably 1-based like
    // unified-diff hunk headers — confirm against `locate_edited_line`.
    pub line: usize,
    /// Column of the cursor.
    // `imitate_human_edits` sets this to the byte length of the partially
    // typed line plus one, i.e. a 1-based byte column just past the typed text.
    // Other producers are not visible here — confirm they agree.
    pub column: usize,
}
  82
  83impl std::fmt::Display for CursorPosition {
  84    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  85        write!(f, "{}:{}:{}", self.file, self.line, self.column)
  86    }
  87}
  88
/// Represents a split commit with source and target patches.
///
/// Produced by cutting a chronologically-ordered commit at a split position.
#[derive(Debug, Clone)]
pub struct SplitCommit {
    /// Unified diff holding the edits before the split; it becomes the
    /// generated example's `edit_history`.
    pub source_patch: String,
    /// Unified diff holding the edits from the split onwards; it becomes the
    /// generated example's single expected patch.
    pub target_patch: String,
}
  95
/// Split point specification for evaluation generation.
///
/// Either form is converted to a number of edited lines (added + removed)
/// and capped at the total number of edits in the commit.
#[derive(Debug, Clone)]
pub enum SplitPoint {
    /// Fraction of total edits (0.0 to 1.0)
    // Multiplied by the edit count and floored; values above 1.0 are capped at
    // the edit count rather than rejected.
    Fraction(f64),
    /// Absolute index
    // Number of leading edited lines that go to the source patch.
    Index(usize),
}
 104
 105fn parse_split_point(value: &str) -> Option<SplitPoint> {
 106    if value.contains('.') {
 107        value.parse::<f64>().ok().map(SplitPoint::Fraction)
 108    } else {
 109        value.parse::<usize>().ok().map(SplitPoint::Index)
 110    }
 111}
 112
 113/// Entry point for the `ep split-commit` subcommand.
 114///
 115/// This runs synchronously and outputs JSON Lines (one output per input line).
 116pub fn run_split_commit(
 117    args: &SplitCommitArgs,
 118    inputs: &[PathBuf],
 119    output_path: Option<&PathBuf>,
 120    failed: FailedHandling,
 121) -> Result<()> {
 122    use std::collections::HashSet;
 123    use std::io::BufRead;
 124
 125    let stdin_path = PathBuf::from("-");
 126    let inputs = if inputs.is_empty() {
 127        std::slice::from_ref(&stdin_path)
 128    } else {
 129        inputs
 130    };
 131
 132    let split_point = args.split_point.as_deref().and_then(parse_split_point);
 133    let mut output_lines = Vec::new();
 134
 135    for input_path in inputs {
 136        let input: Box<dyn BufRead> = if input_path.as_os_str() == "-" {
 137            Box::new(io::BufReader::new(io::stdin()))
 138        } else {
 139            let file = fs::File::open(input_path)
 140                .with_context(|| format!("failed to open input file {}", input_path.display()))?;
 141            Box::new(io::BufReader::new(file))
 142        };
 143
 144        for (line_num, line_result) in input.lines().enumerate() {
 145            let line =
 146                line_result.with_context(|| format!("failed to read line {}", line_num + 1))?;
 147
 148            if line.trim().is_empty() {
 149                continue;
 150            }
 151
 152            let annotated: AnnotatedCommit = serde_json::from_str(&line)
 153                .with_context(|| format!("failed to parse JSON at line {}", line_num + 1))?;
 154
 155            // Generate multiple samples if num_samples is set
 156            if let Some(num_samples) = args.num_samples {
 157                let mut seen_samples: HashSet<String> = HashSet::new();
 158                let base_seed = args.seed.unwrap_or_else(|| rand::random());
 159
 160                for sample_idx in 0..num_samples {
 161                    let sample_seed = base_seed.wrapping_add(sample_idx as u64);
 162
 163                    let case = match generate_evaluation_example_from_ordered_commit(
 164                        &annotated.reordered_commit,
 165                        &annotated.repo_url,
 166                        &annotated.commit_sha,
 167                        None, // Use random split point for multi-sample mode
 168                        Some(sample_seed),
 169                        Some(sample_idx),
 170                    ) {
 171                        Ok(case) => case,
 172                        Err(e) => {
 173                            let err_msg = format!(
 174                                "failed to generate evaluation example for commit {} at line {} (sample {}): {}",
 175                                annotated.commit_sha,
 176                                line_num + 1,
 177                                sample_idx,
 178                                e
 179                            );
 180                            match failed {
 181                                FailedHandling::Skip => {
 182                                    eprintln!("{}", err_msg);
 183                                    continue;
 184                                }
 185                                FailedHandling::Keep => {
 186                                    anyhow::bail!(err_msg);
 187                                }
 188                            }
 189                        }
 190                    };
 191
 192                    let json = if args.pretty {
 193                        serde_json::to_string_pretty(&case)
 194                    } else {
 195                        serde_json::to_string(&case)
 196                    }
 197                    .context("failed to serialize evaluation case as JSON")?;
 198
 199                    // Only add unique samples (different split points may produce same result)
 200                    if seen_samples.insert(json.clone()) {
 201                        output_lines.push(json);
 202                    }
 203                }
 204            } else {
 205                let case = match generate_evaluation_example_from_ordered_commit(
 206                    &annotated.reordered_commit,
 207                    &annotated.repo_url,
 208                    &annotated.commit_sha,
 209                    split_point.clone(),
 210                    args.seed,
 211                    None,
 212                ) {
 213                    Ok(case) => case,
 214                    Err(e) => {
 215                        let err_msg = format!(
 216                            "failed to generate evaluation example for commit {} at line {}: {}",
 217                            annotated.commit_sha,
 218                            line_num + 1,
 219                            e
 220                        );
 221                        match failed {
 222                            FailedHandling::Skip => {
 223                                eprintln!("{}", err_msg);
 224                                continue;
 225                            }
 226                            FailedHandling::Keep => {
 227                                anyhow::bail!(err_msg);
 228                            }
 229                        }
 230                    }
 231                };
 232
 233                let json = if args.pretty {
 234                    serde_json::to_string_pretty(&case)
 235                } else {
 236                    serde_json::to_string(&case)
 237                }
 238                .context("failed to serialize evaluation case as JSON")?;
 239
 240                output_lines.push(json);
 241            }
 242        }
 243    }
 244
 245    let output_content = output_lines.join("\n") + if output_lines.is_empty() { "" } else { "\n" };
 246
 247    if let Some(path) = output_path {
 248        fs::write(path, &output_content)
 249            .with_context(|| format!("failed to write output to {}", path.display()))?;
 250    } else {
 251        io::stdout()
 252            .write_all(output_content.as_bytes())
 253            .context("failed to write to stdout")?;
 254    }
 255
 256    Ok(())
 257}
 258
 259/// Main function to generate an evaluation example from an ordered commit.
 260///
 261/// # Arguments
 262/// * `commit` - Chronologically ordered unified diff of the commit
 263/// * `repository_url` - URL of the repository
 264/// * `commit_hash` - Hash of the commit
 265/// * `split_point` - Point at which the commit will be split (None for random)
 266/// * `seed` - Optional seed for randomness
 267/// * `sample_num` - Optional sample number for generating unique names
 268pub fn generate_evaluation_example_from_ordered_commit(
 269    commit: &str,
 270    repository_url: &str,
 271    commit_hash: &str,
 272    split_point: Option<SplitPoint>,
 273    seed: Option<u64>,
 274    sample_num: Option<usize>,
 275) -> Result<ExampleSpec> {
 276    let mut rng: Box<dyn rand::RngCore> = match seed {
 277        Some(seed) => Box::new(rand::rngs::StdRng::seed_from_u64(seed)),
 278        None => Box::new(rand::rngs::ThreadRng::default()),
 279    };
 280
 281    // Parse and normalize the commit
 282    let mut patch = Patch::parse_unified_diff(commit);
 283
 284    // Filter header to only keep lines starting with "//"
 285    let header_lines: Vec<&str> = patch
 286        .header
 287        .lines()
 288        .filter(|line| line.starts_with("//"))
 289        .collect();
 290    patch.header = if header_lines.is_empty() {
 291        String::new()
 292    } else {
 293        header_lines.join("\n") + "\n"
 294    };
 295    let commit_normalized = patch.to_string();
 296
 297    // Compute the split point
 298    let stats = patch.stats();
 299    let num_edits = stats.added + stats.removed;
 300
 301    anyhow::ensure!(num_edits != 0, "no edits found in commit");
 302
 303    let split = match split_point {
 304        None => rng.random_range(1..=num_edits),
 305        Some(SplitPoint::Fraction(f)) => {
 306            let v = (f * num_edits as f64).floor() as usize;
 307            v.min(num_edits)
 308        }
 309        Some(SplitPoint::Index(i)) => i.min(num_edits),
 310    };
 311
 312    // Split the commit into source and target patches
 313    let (prefix, suffix) = split_ordered_commit(&commit_normalized, split);
 314
 315    let mut split_commit = SplitCommit {
 316        source_patch: prefix,
 317        target_patch: suffix,
 318    };
 319
 320    // Imitate human edits
 321    let human_edit_seed = rng.random_range(1..=10000u64);
 322    let (src_patch, tgt_patch, cursor_opt) = imitate_human_edits(
 323        &split_commit.source_patch,
 324        &split_commit.target_patch,
 325        human_edit_seed,
 326    );
 327    split_commit.source_patch = src_patch;
 328    split_commit.target_patch = tgt_patch;
 329
 330    // Sample cursor position
 331    let cursor = match cursor_opt {
 332        Some(c) => c,
 333        None => sample_cursor_position(&patch, &split_commit)
 334            .context("failed to sample cursor position")?,
 335    };
 336
 337    // Get cursor excerpt
 338    let cursor_excerpt = get_cursor_excerpt(
 339        &cursor,
 340        &split_commit.source_patch,
 341        &split_commit.target_patch,
 342    )
 343    .context("failed to generate cursor excerpt")?;
 344
 345    // Handle edge case where split_point == 0
 346    if split == 0 {
 347        split_commit.target_patch = String::new();
 348    }
 349
 350    let repo_name = repository_url
 351        .trim_end_matches('/')
 352        .rsplit('/')
 353        .next()
 354        .unwrap_or("unknown");
 355    let short_sha = &commit_hash[..commit_hash.len().min(8)];
 356    let name = match sample_num {
 357        Some(n) => format!("{}-{}-{}", repo_name, short_sha, n),
 358        None => format!("{}-{}", repo_name, short_sha),
 359    };
 360
 361    Ok(ExampleSpec {
 362        name,
 363        repository_url: repository_url.to_string(),
 364        revision: format!("{}~1", commit_hash),
 365        edit_history: split_commit.source_patch.clone(),
 366        // cursor_position: cursor.to_string(),
 367        cursor_path: Path::new(&cursor.file).into(),
 368        cursor_position: cursor_excerpt,
 369        expected_patches: vec![split_commit.target_patch],
 370        tags: vec![],
 371        reasoning: None,
 372        uncommitted_diff: String::new(),
 373        rejected_patch: None,
 374    })
 375}
 376
 377/// Split an ordered commit into source and target commits.
 378///
 379/// # Arguments
 380/// * `commit` - Ordered commit string
 381/// * `split_pos` - Position to split the commit (number of edited lines)
 382///
 383/// # Returns
 384/// A tuple of (source_diff, target_diff)
 385pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String) {
 386    let patch = Patch::parse_unified_diff(commit);
 387    let source_edits: BTreeSet<usize> = (0..split_pos).collect();
 388    let (source, target) = extract_edits(&patch, &source_edits);
 389
 390    let mut source_str = source.to_string();
 391    let target_str = target.to_string();
 392
 393    // Strip last group header from the source (lines starting with "//" at the end)
 394    let source_lines: Vec<&str> = source_str.lines().collect();
 395    let mut end_idx = source_lines.len();
 396    for i in (0..source_lines.len()).rev() {
 397        if source_lines[i].starts_with("//") {
 398            end_idx = i;
 399        } else {
 400            break;
 401        }
 402    }
 403    if end_idx < source_lines.len() {
 404        source_str = source_lines[..end_idx].join("\n");
 405        if !source_str.is_empty() {
 406            source_str.push('\n');
 407        }
 408    }
 409
 410    (source_str, target_str)
 411}
 412
 413/// Tokenize text into words and non-word characters.
 414fn tokenize(text: &str) -> Vec<String> {
 415    let mut tokens = Vec::new();
 416    let mut current = String::new();
 417
 418    for ch in text.chars() {
 419        if ch.is_alphanumeric() {
 420            current.push(ch);
 421        } else if ch == '_' {
 422            // Include underscore with the current word, then flush
 423            current.push(ch);
 424            if !current.is_empty() {
 425                tokens.push(std::mem::take(&mut current));
 426            }
 427        } else {
 428            // Punctuation or whitespace - flush current word first
 429            if !current.is_empty() {
 430                tokens.push(std::mem::take(&mut current));
 431            }
 432            // Each punctuation/whitespace is its own token
 433            tokens.push(ch.to_string());
 434        }
 435    }
 436
 437    if !current.is_empty() {
 438        tokens.push(current);
 439    }
 440
 441    tokens
 442}
 443
 444/// Calculate the weight for a split position based on the character at that position.
 445///
 446/// Higher weights indicate more natural pause points (e.g., after punctuation,
 447/// at identifier boundaries). Lower weights indicate less natural points
 448/// (e.g., mid-identifier).
 449fn position_weight(text: &str, pos: usize) -> u32 {
 450    if pos == 0 || pos > text.len() {
 451        return 1;
 452    }
 453
 454    let chars: Vec<char> = text.chars().collect();
 455    if pos > chars.len() {
 456        return 1;
 457    }
 458
 459    // Get the character just before this position (what we just "typed")
 460    let prev_char = chars[pos - 1];
 461
 462    // High weight: natural pause points (end of statement/argument, opening brackets)
 463    if matches!(prev_char, ',' | ';' | ':' | '(' | '[' | '{') {
 464        return 10;
 465    }
 466
 467    // High weight: closing brackets (finished a group)
 468    if matches!(prev_char, ')' | ']' | '}') {
 469        return 8;
 470    }
 471
 472    // Medium weight: operators and method chains
 473    if matches!(
 474        prev_char,
 475        '.' | '+' | '-' | '*' | '/' | '=' | '<' | '>' | '&' | '|' | '!'
 476    ) {
 477        return 5;
 478    }
 479
 480    // Check if we're at the end of an identifier (word char followed by non-word char)
 481    let is_prev_word_char = prev_char.is_alphanumeric() || prev_char == '_';
 482    let is_next_word_char =
 483        pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_');
 484
 485    if is_prev_word_char && !is_next_word_char {
 486        // End of identifier - high weight
 487        return 8;
 488    }
 489
 490    // Whitespace is a natural pause
 491    if prev_char.is_whitespace() {
 492        return 6;
 493    }
 494
 495    // Mid-identifier: low weight (rare autocomplete scenarios)
 496    if is_prev_word_char && is_next_word_char {
 497        return 1;
 498    }
 499
 500    // Default medium-low weight
 501    3
 502}
 503
 504/// Select a weighted random index from a list of weights.
 505///
 506/// Returns an index based on the weights, using the provided seed for
 507/// deterministic selection.
 508fn weighted_select(weights: &[u32], seed: u64) -> usize {
 509    if weights.is_empty() {
 510        return 0;
 511    }
 512
 513    let total_weight: u64 = weights.iter().map(|&w| w as u64).sum();
 514    if total_weight == 0 {
 515        // Fallback to uniform selection if all weights are zero
 516        return seed as usize % weights.len();
 517    }
 518
 519    // Use seed to select a value in [0, total_weight)
 520    let target = seed % total_weight;
 521    let mut cumulative: u64 = 0;
 522
 523    for (idx, &weight) in weights.iter().enumerate() {
 524        cumulative += weight as u64;
 525        if target < cumulative {
 526            return idx;
 527        }
 528    }
 529
 530    // Fallback to last index
 531    weights.len() - 1
 532}
 533
 534/// Calculate similarity ratio between two strings (0-100).
 535fn fuzzy_ratio(s1: &str, s2: &str) -> u32 {
 536    if s1.is_empty() && s2.is_empty() {
 537        return 100;
 538    }
 539    if s1.is_empty() || s2.is_empty() {
 540        return 0;
 541    }
 542
 543    let diff = TextDiff::from_chars(s1, s2);
 544    let matching: usize = diff
 545        .ops()
 546        .iter()
 547        .filter_map(|op| {
 548            if matches!(op.tag(), DiffTag::Equal) {
 549                Some(op.new_range().len())
 550            } else {
 551                None
 552            }
 553        })
 554        .sum();
 555
 556    let total = s1.len() + s2.len();
 557    ((2 * matching * 100) / total) as u32
 558}
 559
 560/// Imitate human edits by introducing partial line edits.
 561///
 562/// This function simulates how a human might incrementally type code,
 563/// rather than making complete line replacements.
 564pub fn imitate_human_edits(
 565    source_patch: &str,
 566    target_patch: &str,
 567    seed: u64,
 568) -> (String, String, Option<CursorPosition>) {
 569    let no_change = (source_patch.to_string(), target_patch.to_string(), None);
 570
 571    let src_patch = Patch::parse_unified_diff(source_patch);
 572    let tgt_patch = Patch::parse_unified_diff(target_patch);
 573
 574    if tgt_patch.hunks.is_empty() {
 575        return no_change;
 576    }
 577
 578    // Try to locate the first edit in target
 579    let tgt_edit_loc = match locate_edited_line(&tgt_patch, 0) {
 580        Some(loc) => loc,
 581        None => return no_change,
 582    };
 583
 584    let tgt_is_addition = matches!(tgt_edit_loc.patch_line, PatchLine::Addition(_));
 585    if !tgt_is_addition {
 586        return no_change;
 587    }
 588
 589    let tgt_line = match &tgt_edit_loc.patch_line {
 590        PatchLine::Addition(s) => s.clone(),
 591        _ => return no_change,
 592    };
 593
 594    // Try to locate the last edit in source
 595    let src_edit_loc = locate_edited_line(&src_patch, -1);
 596
 597    // Check if source has ANY edit at the same line as target's first edit
 598    // We need to iterate through all edits to check this
 599    let src_has_edit_at_target_line = {
 600        let mut found = false;
 601        let mut idx = 0isize;
 602        while let Some(loc) = locate_edited_line(&src_patch, idx) {
 603            if loc.filename == tgt_edit_loc.filename
 604                && loc.target_line_number == tgt_edit_loc.target_line_number
 605            {
 606                found = true;
 607                break;
 608            }
 609            idx += 1;
 610        }
 611        found
 612    };
 613
 614    // Check if this is a replacement (deletion followed by insertion on the same line)
 615    // or a pure insertion (no corresponding deletion in source)
 616    let is_replacement = src_edit_loc.as_ref().map_or(false, |loc| {
 617        matches!(loc.patch_line, PatchLine::Deletion(_))
 618            && loc.filename == tgt_edit_loc.filename
 619            && loc.target_line_number == tgt_edit_loc.target_line_number
 620    });
 621
 622    // If source has an edit at the same line but it's not a replacement (i.e., it's an addition),
 623    // we shouldn't process this as a pure insertion either
 624    if !is_replacement && src_has_edit_at_target_line {
 625        return no_change;
 626    }
 627
 628    let src_line = if is_replacement {
 629        match &src_edit_loc.as_ref().unwrap().patch_line {
 630            PatchLine::Deletion(s) => s.clone(),
 631            _ => return no_change,
 632        }
 633    } else {
 634        // Pure insertion: source line is empty
 635        String::new()
 636    };
 637
 638    // Don't process if source and target are the same
 639    if src_line == tgt_line {
 640        return no_change;
 641    }
 642
 643    // Tokenize both lines
 644    let src_tokens = tokenize(&src_line);
 645    let tgt_tokens = tokenize(&tgt_line);
 646
 647    // Convert to slices for similar
 648    let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect();
 649    let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect();
 650
 651    // Use similar to get diff operations
 652    let diff = TextDiff::from_slices(&src_refs, &tgt_refs);
 653
 654    // Build weights for each possible split position
 655    let mut position_weights: Vec<u32> = Vec::new();
 656
 657    // Simulate the edit process to collect weights for all possible split positions
 658    {
 659        let mut current_text = String::new();
 660
 661        for op in diff.ops() {
 662            match op.tag() {
 663                DiffTag::Equal => {
 664                    for i in op.old_range() {
 665                        current_text.push_str(&src_tokens[i]);
 666                    }
 667                }
 668                DiffTag::Replace => {
 669                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 670                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 671
 672                    // For insertion part
 673                    for ch in ins.chars() {
 674                        current_text.push(ch);
 675                        let weight = position_weight(&current_text, current_text.len());
 676                        position_weights.push(weight);
 677                    }
 678
 679                    // For deletion part (we're "untyping" from source)
 680                    for _ in del.chars() {
 681                        // Weight deletions lower as they represent removing text
 682                        position_weights.push(2);
 683                    }
 684                }
 685                DiffTag::Insert => {
 686                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 687                    for ch in ins.chars() {
 688                        current_text.push(ch);
 689                        let weight = position_weight(&current_text, current_text.len());
 690                        position_weights.push(weight);
 691                    }
 692                }
 693                DiffTag::Delete => {
 694                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 695                    for _ in del.chars() {
 696                        // Weight deletions lower
 697                        position_weights.push(2);
 698                    }
 699                }
 700            }
 701        }
 702    }
 703
 704    // Use weighted selection to choose split index
 705    if position_weights.is_empty() {
 706        return no_change;
 707    }
 708    let split_index = weighted_select(&position_weights, seed);
 709
 710    let mut edit_index = 0usize;
 711    let mut new_src = String::new();
 712    let mut split_found = false;
 713    let mut last_old_end = 0usize;
 714
 715    for op in diff.ops() {
 716        match op.tag() {
 717            DiffTag::Equal => {
 718                for i in op.old_range() {
 719                    new_src.push_str(&src_tokens[i]);
 720                }
 721                last_old_end = op.old_range().end;
 722            }
 723            DiffTag::Replace => {
 724                // Handle replace as delete + insert
 725                let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 726                let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 727                let repl_len = del.len() + ins.len();
 728                if edit_index + repl_len >= split_index {
 729                    // Split within this replace operation
 730                    let offset = split_index - edit_index;
 731                    if offset < ins.len() {
 732                        let safe_offset = floor_char_boundary(&ins, offset);
 733                        new_src.push_str(&ins[..safe_offset]);
 734                    } else {
 735                        new_src.push_str(&ins);
 736                        let del_offset = offset - ins.len();
 737                        let safe_del_offset = floor_char_boundary(&del, del_offset.min(del.len()));
 738                        new_src.push_str(&del[..safe_del_offset]);
 739                    }
 740                    split_found = true;
 741                    last_old_end = op.old_range().end;
 742                    break;
 743                } else {
 744                    edit_index += repl_len;
 745                    new_src.push_str(&ins);
 746                    last_old_end = op.old_range().end;
 747                }
 748            }
 749            DiffTag::Insert => {
 750                let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 751                if edit_index + repl.len() >= split_index {
 752                    let offset = split_index - edit_index;
 753                    let safe_offset = floor_char_boundary(&repl, offset);
 754                    new_src.push_str(&repl[..safe_offset]);
 755                    split_found = true;
 756                    break;
 757                } else {
 758                    edit_index += repl.len();
 759                    new_src.push_str(&repl);
 760                }
 761            }
 762            DiffTag::Delete => {
 763                let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 764                if edit_index + repl.len() >= split_index {
 765                    let offset = split_index - edit_index;
 766                    let safe_offset = floor_char_boundary(&repl, offset);
 767                    new_src.push_str(&repl[..safe_offset]);
 768                    split_found = true;
 769                    last_old_end = op.old_range().start + safe_offset.min(op.old_range().len());
 770                    break;
 771                } else {
 772                    edit_index += repl.len();
 773                    new_src.push_str(&repl);
 774                    last_old_end = op.old_range().end;
 775                }
 776            }
 777        }
 778    }
 779
 780    if !split_found {
 781        return no_change;
 782    }
 783
 784    // Calculate cursor position
 785    let cursor = CursorPosition {
 786        file: tgt_edit_loc.filename.clone(),
 787        line: if is_replacement {
 788            src_edit_loc.as_ref().unwrap().source_line_number
 789        } else {
 790            tgt_edit_loc.target_line_number
 791        },
 792        column: new_src.len() + 1,
 793    };
 794
 795    // Add remainder of source if similar enough to target remainder
 796    let remainder_src: String = (last_old_end..src_tokens.len())
 797        .map(|i| src_tokens[i].as_str())
 798        .collect();
 799    let remainder_tgt: String = (last_old_end..tgt_tokens.len())
 800        .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str()))
 801        .collect();
 802
 803    let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt);
 804    if ratio > 35 {
 805        new_src.push_str(&remainder_src);
 806    }
 807
 808    if new_src.trim().is_empty() {
 809        return no_change;
 810    }
 811
 812    if new_src == src_line {
 813        return no_change;
 814    }
 815
 816    // Build new source patch with the intermediate line
 817    let mut new_src_patch = src_patch;
 818    if is_replacement {
 819        // For replacements, insert after the deletion line
 820        let src_loc = src_edit_loc.as_ref().unwrap();
 821        if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 822            hunk.lines.insert(
 823                src_loc.line_index_within_hunk + 1,
 824                PatchLine::Addition(new_src.clone()),
 825            );
 826            hunk.new_count += 1;
 827        }
 828    } else {
 829        // For pure insertions, insert after the last edit in source patch
 830        // This imitates human typing - the intermediate content is what the user is currently typing
 831        let last_src_edit = locate_edited_line(&new_src_patch, -1);
 832
 833        if let Some(src_loc) = last_src_edit {
 834            // Insert after the last edit in source
 835            if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 836                hunk.lines.insert(
 837                    src_loc.line_index_within_hunk + 1,
 838                    PatchLine::Addition(new_src.clone()),
 839                );
 840                hunk.new_count += 1;
 841            }
 842        } else {
 843            // Source patch is empty or has incompatible hunk structure, create a new hunk based on target
 844            if let Some(tgt_hunk) = tgt_patch.hunks.get(tgt_edit_loc.hunk_index) {
 845                let mut new_hunk = tgt_hunk.clone();
 846                // Replace the full addition with the partial one
 847                new_hunk.lines.clear();
 848                for (i, line) in tgt_hunk.lines.iter().enumerate() {
 849                    if i == tgt_edit_loc.line_index_within_hunk {
 850                        new_hunk.lines.push(PatchLine::Addition(new_src.clone()));
 851                    } else {
 852                        match line {
 853                            PatchLine::Addition(_) => {
 854                                // Skip other additions from target
 855                            }
 856                            _ => new_hunk.lines.push(line.clone()),
 857                        }
 858                    }
 859                }
 860                new_hunk.new_count = new_hunk.old_count + 1;
 861                new_src_patch.hunks.push(new_hunk);
 862                // Copy header from target if source doesn't have one
 863                if new_src_patch.header.is_empty() {
 864                    new_src_patch.header = tgt_patch.header.clone();
 865                }
 866            }
 867        }
 868    }
 869
 870    // Build new target patch with the intermediate line as deletion
 871    let mut new_tgt_patch = tgt_patch;
 872    if let Some(hunk) = new_tgt_patch.hunks.get_mut(tgt_edit_loc.hunk_index) {
 873        hunk.lines.insert(
 874            tgt_edit_loc.line_index_within_hunk,
 875            PatchLine::Deletion(new_src),
 876        );
 877        hunk.old_count += 1;
 878    }
 879
 880    (
 881        new_src_patch.to_string(),
 882        new_tgt_patch.to_string(),
 883        Some(cursor),
 884    )
 885}
 886
 887/// Locate the end of the last edit in a patch.
 888fn locate_end_of_last_edit(patch: &Patch) -> Option<CursorPosition> {
 889    let loc = locate_edited_line(patch, -1)?;
 890
 891    let (line, col) = match &loc.patch_line {
 892        PatchLine::Addition(content) => (loc.target_line_number, content.len()),
 893        PatchLine::Deletion(_) => (loc.target_line_number, 1),
 894        _ => return None,
 895    };
 896
 897    Some(CursorPosition {
 898        file: loc.filename,
 899        line,
 900        column: col,
 901    })
 902}
 903
 904/// Locate the beginning of the first edit in a patch.
 905fn locate_beginning_of_first_edit(patch: &Patch) -> Option<CursorPosition> {
 906    let loc = locate_edited_line(patch, 0)?;
 907
 908    let hunk = patch.hunks.get(loc.hunk_index)?;
 909    let column = if loc.line_index_within_hunk > 0 {
 910        if let Some(prev_line) = hunk.lines.get(loc.line_index_within_hunk - 1) {
 911            let content = match prev_line {
 912                PatchLine::Context(s) | PatchLine::Addition(s) | PatchLine::Deletion(s) => s,
 913                _ => return None,
 914            };
 915            content.len().max(1) - 1
 916        } else {
 917            0
 918        }
 919    } else {
 920        0
 921    };
 922
 923    let line = loc.target_line_number.saturating_sub(1).max(1);
 924
 925    Some(CursorPosition {
 926        file: loc.filename,
 927        line,
 928        column,
 929    })
 930}
 931
 932/// Sample cursor position according to the following rules:
 933/// 1. 50% chance of cursor being at the end of the source patch
 934/// 2. 50% chance of cursor being at the beginning of the target patch
 935pub fn sample_cursor_position(patch: &Patch, split_commit: &SplitCommit) -> Option<CursorPosition> {
 936    // Try end of history first
 937    let src_patch = Patch::parse_unified_diff(&split_commit.source_patch);
 938    if let Some(cursor) = locate_end_of_last_edit(&src_patch) {
 939        return Some(cursor);
 940    }
 941
 942    // Try beginning of target
 943    let tgt_patch = Patch::parse_unified_diff(&split_commit.target_patch);
 944    if let Some(cursor) = locate_beginning_of_first_edit(&tgt_patch) {
 945        return Some(cursor);
 946    }
 947
 948    // Fallback: use the original patch
 949    locate_end_of_last_edit(patch)
 950}
 951
 952/// Get cursor excerpt from the patches.
 953///
 954/// This extracts the lines around the cursor position with a cursor marker.
 955pub fn get_cursor_excerpt(
 956    cursor: &CursorPosition,
 957    source_patch: &str,
 958    target_patch: &str,
 959) -> Option<String> {
 960    let mut excerpt_lines: Vec<String> = Vec::new();
 961    let mut excerpt_first_line: usize = 0;
 962
 963    // Search in the last hunk of source patch
 964    let src = Patch::parse_unified_diff(source_patch);
 965    if let Some(loc) = locate_edited_line(&src, -1) {
 966        if loc.filename == cursor.file && loc.target_line_number == cursor.line {
 967            if let Some(hunk) = src.hunks.get(loc.hunk_index) {
 968                excerpt_first_line = hunk.new_start as usize;
 969                for line in &hunk.lines {
 970                    match line {
 971                        PatchLine::Addition(s) | PatchLine::Context(s) => {
 972                            excerpt_lines.push(s.clone());
 973                        }
 974                        _ => {}
 975                    }
 976                }
 977                // If hunk only has deletions (file deletion), include deletion lines
 978                if excerpt_lines.is_empty() {
 979                    excerpt_first_line = hunk.old_start as usize;
 980                    for line in &hunk.lines {
 981                        match line {
 982                            PatchLine::Deletion(s) => {
 983                                excerpt_lines.push(s.clone());
 984                            }
 985                            _ => {}
 986                        }
 987                    }
 988                }
 989            }
 990        }
 991    }
 992
 993    // Search in target patch if not found
 994    if excerpt_lines.is_empty() {
 995        let tgt = Patch::parse_unified_diff(target_patch);
 996        // Search all hunks for the cursor file, not just the first edit's hunk
 997        for hunk in &tgt.hunks {
 998            if hunk.filename == cursor.file {
 999                excerpt_first_line = hunk.new_start as usize;
1000                // First try to collect deletions and context (what exists before edits)
1001                for line in &hunk.lines {
1002                    match line {
1003                        PatchLine::Deletion(s) | PatchLine::Context(s) => {
1004                            excerpt_lines.push(s.clone());
1005                        }
1006                        _ => {}
1007                    }
1008                }
1009                // If hunk only has additions (no deletions/context), include all lines
1010                // This handles cases like adding to an empty file or section
1011                if excerpt_lines.is_empty() {
1012                    for line in &hunk.lines {
1013                        match line {
1014                            PatchLine::Addition(s)
1015                            | PatchLine::Deletion(s)
1016                            | PatchLine::Context(s) => {
1017                                excerpt_lines.push(s.clone());
1018                            }
1019                            _ => {}
1020                        }
1021                    }
1022                }
1023                if !excerpt_lines.is_empty() {
1024                    break;
1025                }
1026            }
1027        }
1028    }
1029
1030    // Also search source patch hunks if still not found (for fallback cursor case)
1031    if excerpt_lines.is_empty() {
1032        for hunk in &src.hunks {
1033            if hunk.filename == cursor.file {
1034                excerpt_first_line = hunk.new_start as usize;
1035                for line in &hunk.lines {
1036                    match line {
1037                        PatchLine::Addition(s) | PatchLine::Context(s) => {
1038                            excerpt_lines.push(s.clone());
1039                        }
1040                        _ => {}
1041                    }
1042                }
1043                // If hunk only has deletions, include deletion lines
1044                if excerpt_lines.is_empty() {
1045                    excerpt_first_line = hunk.old_start as usize;
1046                    for line in &hunk.lines {
1047                        match line {
1048                            PatchLine::Deletion(s) => {
1049                                excerpt_lines.push(s.clone());
1050                            }
1051                            _ => {}
1052                        }
1053                    }
1054                }
1055                if !excerpt_lines.is_empty() {
1056                    break;
1057                }
1058            }
1059        }
1060    }
1061
1062    if excerpt_lines.is_empty() {
1063        return None;
1064    }
1065
1066    // Add cursor marker
1067    for (i, line) in excerpt_lines.iter_mut().enumerate() {
1068        let line_num = excerpt_first_line + i;
1069        if line_num == cursor.line {
1070            let col = cursor.column.min(line.len());
1071            // Ensure we split at a valid UTF-8 character boundary
1072            let col = if line.is_char_boundary(col) {
1073                col
1074            } else {
1075                // Find the nearest valid character boundary
1076                (0..=col)
1077                    .rev()
1078                    .find(|&i| line.is_char_boundary(i))
1079                    .unwrap_or(0)
1080            };
1081            let (before, after) = line.split_at(col);
1082            *line = format!("{}<|user_cursor|>{}", before, after);
1083            break;
1084        }
1085    }
1086
1087    Some(excerpt_lines.join("\n"))
1088}
1089
1090#[cfg(test)]
1091mod tests {
1092    use std::path::Path;
1093
1094    use edit_prediction::example_spec::ExampleSpec;
1095
1096    use super::*;
1097
1098    #[test]
1099    fn test_tokenize() {
1100        let tokens = tokenize("hello world");
1101        assert_eq!(tokens, vec!["hello", " ", "world"]);
1102
1103        let tokens = tokenize("foo_bar123 + baz");
1104        assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]);
1105
1106        let tokens = tokenize("print(\"hello\")");
1107        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
1108
1109        let tokens = tokenize("hello_world");
1110        assert_eq!(tokens, vec!["hello_", "world"]);
1111
1112        let tokens = tokenize("fn();");
1113        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
1114    }
1115
1116    #[test]
1117    fn test_fuzzy_ratio() {
1118        assert_eq!(fuzzy_ratio("hello", "hello"), 100);
1119        assert_eq!(fuzzy_ratio("", ""), 100);
1120        assert!(fuzzy_ratio("hello", "world") < 50);
1121        assert!(fuzzy_ratio("hello world", "hello worl") > 80);
1122    }
1123
1124    #[test]
1125    fn test_split_ordered_commit() {
1126        let commit = r#"// First change
1127--- a/test.rs
1128+++ b/test.rs
1129@@ -1,3 +1,4 @@
1130 fn main() {
1131+    println!("hello");
1132+    println!("world");
1133 }
1134"#;
1135        let patch = Patch::parse_unified_diff(commit);
1136        let stats = patch.stats();
1137        assert_eq!(stats.added, 2);
1138
1139        let (source, target) = split_ordered_commit(commit, 1);
1140
1141        // Source should have 1 addition
1142        let src_patch = Patch::parse_unified_diff(&source);
1143        assert_eq!(src_patch.stats().added, 1);
1144
1145        // Target should have 1 addition
1146        let tgt_patch = Patch::parse_unified_diff(&target);
1147        assert_eq!(tgt_patch.stats().added, 1);
1148    }
1149
1150    #[test]
1151    fn test_split_ordered_commit_with_deletions() {
1152        let commit = r#"// Change
1153--- a/test.rs
1154+++ b/test.rs
1155@@ -1,3 +1,3 @@
1156 fn main() {
1157-    println!("old");
1158+    println!("new");
1159 }
1160"#;
1161        let patch = Patch::parse_unified_diff(commit);
1162        let stats = patch.stats();
1163        assert_eq!(stats.added, 1);
1164        assert_eq!(stats.removed, 1);
1165
1166        // Split at position 1 (after the deletion)
1167        let (source, target) = split_ordered_commit(commit, 1);
1168
1169        let src_patch = Patch::parse_unified_diff(&source);
1170        let tgt_patch = Patch::parse_unified_diff(&target);
1171
1172        // Source should have the deletion
1173        assert_eq!(src_patch.stats().removed, 1);
1174        // Target should have the addition
1175        assert_eq!(tgt_patch.stats().added, 1);
1176    }
1177
1178    #[test]
1179    fn test_generate_evaluation_example() {
1180        let commit = r#"commit abc123
1181Author: Test <test@example.com>
1182Date: Mon Jan 1 00:00:00 2024
1183
1184    Test commit
1185
1186////////////////////////////////////////////////////////////////////////////////
1187// Add greeting
1188////////////////////////////////////////////////////////////////////////////////
1189--- a/test.rs
1190+++ b/test.rs
1191@@ -1,3 +1,5 @@
1192 fn main() {
1193+    println!("hello");
1194+    println!("world");
1195 }
1196"#;
1197
1198        let result = generate_evaluation_example_from_ordered_commit(
1199            commit,
1200            "https://github.com/test/repo",
1201            "abc123",
1202            Some(SplitPoint::Fraction(0.5)),
1203            Some(42),
1204            None,
1205        );
1206
1207        assert!(result.is_ok());
1208        let case = result.unwrap();
1209        assert_eq!(case.repository_url, "https://github.com/test/repo");
1210        assert_eq!(case.revision, "abc123~1");
1211        assert!(!case.edit_history.is_empty());
1212    }
1213
1214    #[test]
1215    fn test_generate_evaluation_example_reproducible() {
1216        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1217// Add greeting
1218////////////////////////////////////////////////////////////////////////////////
1219--- a/test.rs
1220+++ b/test.rs
1221@@ -1,3 +1,5 @@
1222 fn main() {
1223+    println!("hello");
1224+    println!("world");
1225 }
1226"#;
1227
1228        // Run twice with the same seed
1229        let result1 = generate_evaluation_example_from_ordered_commit(
1230            commit,
1231            "https://github.com/test/repo",
1232            "abc123",
1233            Some(SplitPoint::Fraction(0.5)),
1234            Some(12345),
1235            None,
1236        )
1237        .unwrap();
1238
1239        let result2 = generate_evaluation_example_from_ordered_commit(
1240            commit,
1241            "https://github.com/test/repo",
1242            "abc123",
1243            Some(SplitPoint::Fraction(0.5)),
1244            Some(12345),
1245            None,
1246        )
1247        .unwrap();
1248
1249        // Results should be identical
1250        assert_eq!(result1.edit_history, result2.edit_history);
1251        assert_eq!(result1.expected_patches, result2.expected_patches);
1252        assert_eq!(result1.cursor_position, result2.cursor_position);
1253    }
1254
1255    #[test]
1256    fn test_cursor_position_display() {
1257        let cursor = CursorPosition {
1258            file: "src/main.rs".to_string(),
1259            line: 42,
1260            column: 10,
1261        };
1262        assert_eq!(cursor.to_string(), "src/main.rs:42:10");
1263    }
1264
1265    #[test]
1266    fn test_imitate_human_edits_no_change_when_no_replacement() {
1267        // Source and target patches that don't form a replacement pattern
1268        let source = r#"--- a/test.rs
1269+++ b/test.rs
1270@@ -1,3 +1,4 @@
1271 fn main() {
1272+    println!("hello");
1273 }
1274"#;
1275        let target = r#"--- a/test.rs
1276+++ b/test.rs
1277@@ -1,3 +1,4 @@
1278 fn main() {
1279+    println!("world");
1280 }
1281"#;
1282
1283        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);
1284
1285        // Should return unchanged when not a replacement pattern
1286        assert_eq!(new_src, source);
1287        assert_eq!(new_tgt, target);
1288        assert!(cursor.is_none());
1289    }
1290
1291    #[test]
1292    fn test_split_point_fraction() {
1293        let commit = r#"// Change
1294--- a/test.rs
1295+++ b/test.rs
1296@@ -1,5 +1,10 @@
1297 fn main() {
1298+    line1();
1299+    line2();
1300+    line3();
1301+    line4();
1302+    line5();
1303 }
1304"#;
1305
1306        // Split at 20% should give first edit in source
1307        let result = generate_evaluation_example_from_ordered_commit(
1308            commit,
1309            "",
1310            "hash",
1311            Some(SplitPoint::Fraction(0.2)),
1312            Some(1),
1313            None,
1314        );
1315
1316        assert!(result.is_ok());
1317        let case = result.unwrap();
1318
1319        // Source should have some edits
1320        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1321        assert!(src_patch.stats().added > 0);
1322    }
1323
1324    #[test]
1325    fn test_split_point_index() {
1326        let commit = r#"// Change
1327--- a/test.rs
1328+++ b/test.rs
1329@@ -1,5 +1,10 @@
1330 fn main() {
1331+    line1();
1332+    line2();
1333+    line3();
1334+    line4();
1335+    line5();
1336 }
1337"#;
1338
1339        // Split at index 2 should give first 2 edits in source
1340        // With pure insertion handling, source gets 2 original + 1 partial = 3 additions
1341        let result = generate_evaluation_example_from_ordered_commit(
1342            commit,
1343            "",
1344            "hash",
1345            Some(SplitPoint::Index(2)),
1346            Some(1),
1347            None,
1348        );
1349
1350        assert!(result.is_ok());
1351        let case = result.unwrap();
1352
1353        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1354        // Pure insertion adds a partial line, so we expect 3 (2 original + 1 partial)
1355        assert_eq!(src_patch.stats().added, 3);
1356    }
1357
1358    #[test]
1359    fn test_cursor_excerpt_contains_marker() {
1360        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1361// Add code
1362////////////////////////////////////////////////////////////////////////////////
1363--- a/test.rs
1364+++ b/test.rs
1365@@ -1,3 +1,5 @@
1366 fn main() {
1367+    println!("hello");
1368+    println!("world");
1369 }
1370"#;
1371
1372        let result = generate_evaluation_example_from_ordered_commit(
1373            commit,
1374            "",
1375            "hash",
1376            Some(SplitPoint::Fraction(0.5)),
1377            Some(42),
1378            None,
1379        )
1380        .unwrap();
1381
1382        // Cursor excerpt should contain the cursor marker
1383        assert!(
1384            result.cursor_position.contains("<|user_cursor|>"),
1385            "Cursor excerpt should contain marker: {}",
1386            result.cursor_position
1387        );
1388    }
1389
1390    #[test]
1391    fn test_evaluation_case_json_serialization() {
1392        let case = ExampleSpec {
1393            name: "test-abc123".to_string(),
1394            repository_url: "https://github.com/test/repo".to_string(),
1395            revision: "abc123~1".to_string(),
1396            edit_history: "patch1".to_string(),
1397            // cursor_position: "file.rs:10:5".to_string(),
1398            cursor_path: Path::new("file.rs").into(),
1399            cursor_position: "some code<|user_cursor|>".to_string(),
1400            expected_patches: vec!["patch".to_string()],
1401            tags: vec![],
1402            reasoning: None,
1403            uncommitted_diff: String::new(),
1404            rejected_patch: None,
1405        };
1406
1407        let json = serde_json::to_string(&case).unwrap();
1408        let deserialized: ExampleSpec = serde_json::from_str(&json).unwrap();
1409
1410        assert_eq!(case.repository_url, deserialized.repository_url);
1411        assert_eq!(case.revision, deserialized.revision);
1412        assert_eq!(case.cursor_position, deserialized.cursor_position);
1413    }
1414
1415    #[test]
1416    fn test_empty_commit_returns_error() {
1417        let commit = "";
1418
1419        let result = generate_evaluation_example_from_ordered_commit(
1420            commit,
1421            "",
1422            "hash",
1423            Some(SplitPoint::Fraction(0.5)),
1424            Some(1),
1425            None,
1426        );
1427
1428        assert!(result.is_err());
1429    }
1430
1431    #[test]
1432    fn test_header_filtering() {
1433        let commit = r#"commit abc123
1434Author: Test
1435Date: Today
1436
1437    Message
1438
1439diff --git a/test.rs b/test.rs
1440index 123..456 789
1441////////////////////////////////////////////////////////////////////////////////
1442// First group
1443////////////////////////////////////////////////////////////////////////////////
1444--- a/test.rs
1445+++ b/test.rs
1446@@ -1,3 +1,4 @@
1447 fn main() {
1448+    code();
1449 }
1450"#;
1451
1452        let result = generate_evaluation_example_from_ordered_commit(
1453            commit,
1454            "",
1455            "hash",
1456            Some(SplitPoint::Index(1)),
1457            Some(1),
1458            None,
1459        );
1460
1461        assert!(result.is_ok());
1462        let case = result.unwrap();
1463
1464        // The edit history should contain the group header (// lines)
1465        // but not the commit metadata
1466        assert!(!case.edit_history.contains("Author:"));
1467        assert!(!case.edit_history.contains("Date:"));
1468    }
1469
1470    #[test]
1471    fn test_position_weight() {
1472        // High weight positions (natural pause points)
1473        assert_eq!(position_weight("foo(", 4), 10); // After '('
1474        assert_eq!(position_weight("a, b", 2), 10); // After ','
1475        assert_eq!(position_weight("x;", 2), 10); // After ';'
1476        assert_eq!(position_weight("a: b", 2), 10); // After ':'
1477        assert_eq!(position_weight("[", 1), 10); // After '['
1478        assert_eq!(position_weight("{", 1), 10); // After '{'
1479
1480        // High weight for closing brackets
1481        assert_eq!(position_weight("foo)", 4), 8); // After ')'
1482        assert_eq!(position_weight("]", 1), 8); // After ']'
1483        assert_eq!(position_weight("}", 1), 8); // After '}'
1484
1485        // High weight at end of identifier
1486        assert_eq!(position_weight("foo ", 3), 8); // End of 'foo' before space
1487        assert_eq!(position_weight("bar(", 3), 8); // End of 'bar' before '('
1488
1489        // Medium weight for operators
1490        assert_eq!(position_weight("a + b", 3), 5); // After '+'
1491        assert_eq!(position_weight("x.", 2), 5); // After '.'
1492        assert_eq!(position_weight("a=b", 2), 5); // After '='
1493
1494        // Medium weight for whitespace
1495        assert_eq!(position_weight("a ", 2), 6); // After space
1496
1497        // Low weight mid-identifier
1498        assert_eq!(position_weight("foobar", 3), 1); // Mid-identifier 'foo|bar'
1499
1500        // Edge cases
1501        assert_eq!(position_weight("", 0), 1); // Empty string
1502        assert_eq!(position_weight("a", 0), 1); // Position 0
1503    }
1504
1505    #[test]
1506    fn test_weighted_select() {
1507        // Test that weighted selection returns correct indices
1508        let weights = vec![1, 10, 1];
1509
1510        // With total weight 12, seed 0 should select index 0
1511        // seed 0 % 12 = 0, cumulative: 1 at idx 0, so returns 0
1512        assert_eq!(weighted_select(&weights, 0), 0);
1513
1514        // seed 1 % 12 = 1, cumulative: 1 at idx 0 (1 < 1 is false), 11 at idx 1 (1 < 11 is true)
1515        assert_eq!(weighted_select(&weights, 1), 1);
1516
1517        // seed 10 % 12 = 10, cumulative: 1, 11 at idx 1 (10 < 11 is true)
1518        assert_eq!(weighted_select(&weights, 10), 1);
1519
1520        // seed 11 % 12 = 11, cumulative: 1, 11 at idx 1 (11 < 11 is false), 12 at idx 2 (11 < 12 is true)
1521        assert_eq!(weighted_select(&weights, 11), 2);
1522
1523        // Empty weights should return 0
1524        let empty: Vec<u32> = vec![];
1525        assert_eq!(weighted_select(&empty, 42), 0);
1526
1527        // Single weight should always return index 0
1528        let single = vec![10];
1529        assert_eq!(weighted_select(&single, 0), 0);
1530        assert_eq!(weighted_select(&single, 100), 0);
1531    }
1532
1533    #[test]
1534    fn test_weighted_split_prefers_natural_boundaries() {
1535        // Test that with different seeds, weighted selection tends to prefer
1536        // positions after punctuation over mid-identifier positions
1537        let text_with_punctuation = "foo(bar, baz)";
1538        let text_mid_identifier = "foobar";
1539
1540        // Position after '(' should have high weight
1541        let weight_after_paren = position_weight(text_with_punctuation, 4);
1542        // Position after ',' should have high weight
1543        let weight_after_comma = position_weight(text_with_punctuation, 8);
1544        // Position mid-identifier should have low weight
1545        let weight_mid_ident = position_weight(text_mid_identifier, 3);
1546
1547        assert!(
1548            weight_after_paren > weight_mid_ident,
1549            "After '(' ({}) should be weighted higher than mid-identifier ({})",
1550            weight_after_paren,
1551            weight_mid_ident
1552        );
1553        assert!(
1554            weight_after_comma > weight_mid_ident,
1555            "After ',' ({}) should be weighted higher than mid-identifier ({})",
1556            weight_after_comma,
1557            weight_mid_ident
1558        );
1559    }
1560
1561    #[test]
1562    fn test_imitate_human_edits_pure_insertion() {
1563        // Source patch is empty (no edits yet)
1564        // Target patch has a pure insertion (adding a new line)
1565        let source = r#"--- a/test.rs
1566+++ b/test.rs
1567@@ -1,2 +1,2 @@
1568 fn main() {
1569 }
1570"#;
1571        let target = r#"--- a/test.rs
1572+++ b/test.rs
1573@@ -1,2 +1,3 @@
1574 fn main() {
1575+    println!("debug");
1576 }
1577"#;
1578
1579        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);
1580
1581        // Should have transformed the patches
1582        assert_ne!(
1583            new_src, source,
1584            "Source should be modified for pure insertion"
1585        );
1586        assert_ne!(
1587            new_tgt, target,
1588            "Target should be modified for pure insertion"
1589        );
1590        assert!(cursor.is_some(), "Cursor should be set");
1591
1592        // Source should now have a partial addition
1593        let src_patch = Patch::parse_unified_diff(&new_src);
1594        assert!(
1595            src_patch.stats().added > 0,
1596            "Source should have added lines"
1597        );
1598
1599        // Target should have both a deletion (of partial) and addition (of full)
1600        let tgt_patch = Patch::parse_unified_diff(&new_tgt);
1601        assert!(
1602            tgt_patch.stats().removed > 0,
1603            "Target should have removed lines (partial)"
1604        );
1605        assert!(
1606            tgt_patch.stats().added > 0,
1607            "Target should have added lines (full)"
1608        );
1609
1610        // The cursor should be in test.rs
1611        let cursor = cursor.unwrap();
1612        assert_eq!(cursor.file, "test.rs");
1613    }
1614
1615    #[test]
1616    fn test_imitate_human_edits_pure_insertion_empty_source() {
1617        // Source patch has no hunks at all
1618        let source = "";
1619        let target = r#"--- a/test.rs
1620+++ b/test.rs
1621@@ -1,2 +1,3 @@
1622 fn main() {
1623+    println!("hello");
1624 }
1625"#;
1626
1627        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 123);
1628
1629        // Should have created a source patch with partial insertion
1630        assert!(!new_src.is_empty(), "Source should not be empty");
1631        assert!(cursor.is_some(), "Cursor should be set");
1632
1633        let src_patch = Patch::parse_unified_diff(&new_src);
1634        assert!(
1635            src_patch.stats().added > 0,
1636            "Source should have added lines"
1637        );
1638    }
1639
1640    #[test]
1641    fn test_imitate_human_edits_pure_insertion_intermediate_content() {
1642        // Verify the actual intermediate content is a realistic partial typing state
1643        let source = "";
1644        let target = r#"--- a/test.rs
1645+++ b/test.rs
1646@@ -1,2 +1,3 @@
1647 fn main() {
1648+    println!("hello world");
1649 }
1650"#;
1651
1652        // Test with multiple seeds to see different split points
1653        let mut found_partial = false;
1654        for seed in 1..=50 {
1655            let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, seed);
1656
1657            if cursor.is_some() {
1658                let src_patch = Patch::parse_unified_diff(&new_src);
1659                let tgt_patch = Patch::parse_unified_diff(&new_tgt);
1660
1661                // Find the added line in source
1662                for hunk in &src_patch.hunks {
1663                    for line in &hunk.lines {
1664                        if let PatchLine::Addition(content) = line {
1665                            // The partial line should be a prefix of the full line
1666                            let full_line = "    println!(\"hello world\");";
1667                            if content != full_line && full_line.starts_with(content) {
1668                                found_partial = true;
1669
1670                                // Verify target has the partial as deletion
1671                                let mut has_deletion = false;
1672                                for tgt_hunk in &tgt_patch.hunks {
1673                                    for tgt_line in &tgt_hunk.lines {
1674                                        if let PatchLine::Deletion(del_content) = tgt_line {
1675                                            if del_content == content {
1676                                                has_deletion = true;
1677                                            }
1678                                        }
1679                                    }
1680                                }
1681                                assert!(
1682                                    has_deletion,
1683                                    "Target should have deletion of partial line"
1684                                );
1685                            }
1686                        }
1687                    }
1688                }
1689            }
1690        }
1691
1692        assert!(
1693            found_partial,
1694            "At least one seed should produce a partial intermediate state"
1695        );
1696    }
1697
1698    #[test]
1699    fn test_imitate_human_edits_inserts_after_last_source_edit() {
1700        // Regression test: intermediate content should appear after the last edit
1701        // in the source patch, not at the position of the first target edit.
1702        // This ensures the diff output correctly imitates human typing order.
1703        //
1704        // The bug was: when source has edits and target has a pure insertion,
1705        // the intermediate content was inserted at tgt_edit_loc.line_index_within_hunk
1706        // (position of first target edit) instead of after the last source edit.
1707        //
1708        // Source patch has edits at lines 1-4, target has a new edit at line 10
1709        // (different location to avoid the "same line" early return)
1710        let source = r#"--- a/test.py
1711+++ b/test.py
1712@@ -1,4 +1,5 @@
1713+import foo
1714 import bar
1715-import old
1716 import baz
1717+import qux
1718"#;
1719        // Target has a pure insertion at a different line (line 10, not overlapping with source)
1720        let target = r#"--- a/test.py
1721+++ b/test.py
1722@@ -10,3 +10,4 @@
1723 def main():
1724+    print("hello world")
1725     pass
1726"#;
1727
1728        // Use a seed that produces a partial result
1729        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 42);
1730
1731        // The function should produce a modified patch
1732        assert!(cursor.is_some(), "Should produce intermediate state");
1733
1734        let src_patch = Patch::parse_unified_diff(&new_src);
1735        let all_additions: Vec<_> = src_patch
1736            .hunks
1737            .iter()
1738            .flat_map(|h| h.lines.iter())
1739            .filter_map(|l| match l {
1740                PatchLine::Addition(s) => Some(s.as_str()),
1741                _ => None,
1742            })
1743            .collect();
1744
1745        // The intermediate content (partial 'print("hello world")') should be
1746        // the LAST addition, appearing after "+import qux" (the last source edit)
1747        let last_addition = all_additions.last().expect("Should have additions");
1748        assert!(
1749            last_addition.trim_start().starts_with("pr"),
1750            "Intermediate content should be the last addition (partial 'print'), but last was: {:?}",
1751            last_addition
1752        );
1753
1754        // Verify the original source edits are still in order before the intermediate
1755        let foo_pos = all_additions.iter().position(|s| *s == "import foo");
1756        let qux_pos = all_additions.iter().position(|s| *s == "import qux");
1757        let intermediate_pos = all_additions
1758            .iter()
1759            .position(|s| s.trim_start().starts_with("pr"));
1760
1761        assert!(foo_pos.is_some(), "Should have 'import foo'");
1762        assert!(qux_pos.is_some(), "Should have 'import qux'");
1763        assert!(
1764            intermediate_pos.is_some(),
1765            "Should have intermediate content"
1766        );
1767
1768        assert!(
1769            foo_pos < qux_pos && qux_pos < intermediate_pos,
1770            "Order should be: foo < qux < intermediate. Got foo={:?}, qux={:?}, intermediate={:?}",
1771            foo_pos,
1772            qux_pos,
1773            intermediate_pos
1774        );
1775    }
1776
1777    #[test]
1778    fn test_cursor_excerpt_with_multibyte_utf8() {
1779        // Test that cursor excerpt handles multi-byte UTF-8 characters correctly
1780        // The Chinese character '第' is 3 bytes (0..3)
1781        let cursor = CursorPosition {
1782            file: "test.md".to_string(),
1783            line: 1,
1784            column: 1, // Byte index 1 is inside '第' (bytes 0..3)
1785        };
1786
1787        let source_patch = r#"--- a/test.md
1788+++ b/test.md
1789@@ -1,1 +1,1 @@
1790+第 14 章 Flask 工作原理与机制解析**
1791"#;
1792
1793        let target_patch = "";
1794
1795        // This should not panic even though column=1 is not a char boundary
1796        let result = get_cursor_excerpt(&cursor, source_patch, target_patch);
1797
1798        // The function should handle the invalid byte index gracefully
1799        if let Some(excerpt) = result {
1800            assert!(
1801                excerpt.contains("<|user_cursor|>"),
1802                "Cursor excerpt should contain marker"
1803            );
1804            // The marker should be placed at a valid character boundary
1805            // (either at the start or after '第')
1806        }
1807    }
1808}