split_commit.rs

   1//! `ep split-commit` implementation.
   2//!
   3//! This command generates evaluation example JSON objects (emitted as JSON Lines)
   4//! from chronologically-ordered unified diffs ("commits").
   5//!
   6//! TODO: Port Python code to generate chronologically-ordered commits
   7use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
   8
   9/// Find the largest valid UTF-8 char boundary at or before `index` in `s`.
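    /// For example, in `"aé"` the byte index 2 falls inside the two-byte `'é'`, so the
    /// result floors to 1; an index past the end of `s` clamps to `s.len()`.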
  10fn floor_char_boundary(s: &str, index: usize) -> usize {
  11    if index >= s.len() {
  12        s.len()
  13    } else if s.is_char_boundary(index) {
  14        index
  15    } else {
  16        // Find the nearest valid character boundary at or before index
  17        (0..index)
  18            .rev()
  19            .find(|&i| s.is_char_boundary(i))
  20            .unwrap_or(0)
  21    }
  22}
  23use anyhow::{Context as _, Result};
  24use clap::Args;
  25use edit_prediction::example_spec::ExampleSpec;
  26use rand::Rng;
  27use rand::SeedableRng;
  28use serde::{Deserialize, Serialize};
  29use similar::{DiffTag, TextDiff};
  30use std::collections::BTreeSet;
  31use std::fs;
  32use std::io::{self, Write};
  33use std::path::Path;
  34use std::path::PathBuf;
  35
  36/// `ep split-commit` CLI args.
  37#[derive(Debug, Args, Clone)]
  38pub struct SplitCommitArgs {
  39    /// Split point (float 0.0-1.0 for fraction, or integer for index)
  40    #[arg(long, short = 's')]
  41    pub split_point: Option<String>,
  42
  43    /// Random seed for reproducibility
  44    #[arg(long)]
  45    pub seed: Option<u64>,
  46
  47    /// Pretty-print JSON output
  48    #[arg(long, short = 'p')]
  49    pub pretty: bool,
  50
  51    /// Number of samples to generate per commit (samples random split points)
  52    #[arg(long, short = 'n')]
  53    pub num_samples: Option<usize>,
  54}
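
    // A typical invocation might look like the sketch below. The subcommand and
    // flag names come from the module doc and the struct above; how the input and
    // output paths are wired to the CLI is decided by the caller of
    // `run_split_commit`, so the positional argument here is an assumption.
    //
    //   ep split-commit --num-samples 4 --seed 7 annotated_commits.jsonl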
  55
  56/// Input format for annotated commits (JSON Lines).
  57#[derive(Debug, Clone, Deserialize)]
  58#[allow(dead_code)]
  59pub struct AnnotatedCommit {
  60    /// Repository path (e.g., "repos/zed")
  61    pub repo: String,
  62    /// Repository URL (e.g., "https://github.com/zed-industries/zed")
  63    pub repo_url: String,
  64    /// Commit SHA
  65    pub commit_sha: String,
  66    /// Chronologically reordered commit diff
  67    pub reordered_commit: String,
  68    /// Original commit diff
  69    pub original_commit: String,
  70    /// Whether diff stats match between original and reordered
  71    pub diff_stats_match: bool,
  72}
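
    // With serde's default field naming, each input line is a JSON object shaped
    // roughly like this sketch (values are placeholders):
    //
    //   {"repo": "repos/zed", "repo_url": "https://github.com/zed-industries/zed",
    //    "commit_sha": "<sha>", "reordered_commit": "<unified diff>",
    //    "original_commit": "<unified diff>", "diff_stats_match": true}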
  73
  74/// Cursor position in a file.
  75#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  76pub struct CursorPosition {
  77    pub file: String,
  78    pub line: usize,
  79    pub column: usize,
  80}
  81
  82impl std::fmt::Display for CursorPosition {
  83    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  84        write!(f, "{}:{}:{}", self.file, self.line, self.column)
  85    }
  86}
  87
  88/// Represents a split commit with source and target patches.
  89#[derive(Debug, Clone)]
  90pub struct SplitCommit {
  91    pub source_patch: String,
  92    pub target_patch: String,
  93}
  94
  95/// Split point specification for evaluation generation.
  96#[derive(Debug, Clone)]
  97pub enum SplitPoint {
  98    /// Fraction of total edits (0.0 to 1.0)
  99    Fraction(f64),
 100    /// Absolute index
 101    Index(usize),
 102}
 103
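    /// Parse a `--split-point` argument: a value containing `.` is read as a fraction
    /// of the total edits, anything else as an absolute edit index.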
 104fn parse_split_point(value: &str) -> Option<SplitPoint> {
 105    if value.contains('.') {
 106        value.parse::<f64>().ok().map(SplitPoint::Fraction)
 107    } else {
 108        value.parse::<usize>().ok().map(SplitPoint::Index)
 109    }
 110}
 111
 112/// Entry point for the `ep split-commit` subcommand.
 113///
 114/// This runs synchronously and outputs JSON Lines (one output per input line).
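    ///
    /// A minimal programmatic sketch (marked `ignore` because it is illustrative;
    /// `annotated.jsonl` is a hypothetical input file):
    ///
    /// ```ignore
    /// let args = SplitCommitArgs {
    ///     split_point: Some("0.5".to_string()),
    ///     seed: Some(42),
    ///     pretty: false,
    ///     num_samples: None,
    /// };
    /// run_split_commit(&args, &[PathBuf::from("annotated.jsonl")], None)?;
    /// ```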
 115pub fn run_split_commit(
 116    args: &SplitCommitArgs,
 117    inputs: &[PathBuf],
 118    output_path: Option<&PathBuf>,
 119) -> Result<()> {
 120    use std::collections::HashSet;
 121    use std::io::BufRead;
 122
 123    let stdin_path = PathBuf::from("-");
 124    let inputs = if inputs.is_empty() {
 125        std::slice::from_ref(&stdin_path)
 126    } else {
 127        inputs
 128    };
 129
 130    let split_point = args.split_point.as_deref().and_then(parse_split_point);
 131    let mut output_lines = Vec::new();
 132
 133    for input_path in inputs {
 134        let input: Box<dyn BufRead> = if input_path.as_os_str() == "-" {
 135            Box::new(io::BufReader::new(io::stdin()))
 136        } else {
 137            let file = fs::File::open(input_path)
 138                .with_context(|| format!("failed to open input file {}", input_path.display()))?;
 139            Box::new(io::BufReader::new(file))
 140        };
 141
 142        for (line_num, line_result) in input.lines().enumerate() {
 143            let line =
 144                line_result.with_context(|| format!("failed to read line {}", line_num + 1))?;
 145
 146            if line.trim().is_empty() {
 147                continue;
 148            }
 149
 150            let annotated: AnnotatedCommit = serde_json::from_str(&line)
 151                .with_context(|| format!("failed to parse JSON at line {}", line_num + 1))?;
 152
 153            // Generate multiple samples if num_samples is set
 154            if let Some(num_samples) = args.num_samples {
 155                let mut seen_samples: HashSet<String> = HashSet::new();
 156                let base_seed = args.seed.unwrap_or_else(|| rand::random());
 157
 158                for sample_idx in 0..num_samples {
 159                    let sample_seed = base_seed.wrapping_add(sample_idx as u64);
 160
 161                    let case = generate_evaluation_example_from_ordered_commit(
 162                        &annotated.reordered_commit,
 163                        &annotated.repo_url,
 164                        &annotated.commit_sha,
 165                        None, // Use random split point for multi-sample mode
 166                        Some(sample_seed),
 167                        Some(sample_idx),
 168                    )
 169                    .with_context(|| {
 170                        format!(
 171                            "failed to generate evaluation example for commit {} at line {} (sample {})",
 172                            annotated.commit_sha,
 173                            line_num + 1,
 174                            sample_idx
 175                        )
 176                    })?;
 177
 178                    let json = if args.pretty {
 179                        serde_json::to_string_pretty(&case)
 180                    } else {
 181                        serde_json::to_string(&case)
 182                    }
 183                    .context("failed to serialize evaluation case as JSON")?;
 184
 185                    // Only add unique samples (different split points may produce same result)
 186                    if seen_samples.insert(json.clone()) {
 187                        output_lines.push(json);
 188                    }
 189                }
 190            } else {
 191                let case = generate_evaluation_example_from_ordered_commit(
 192                    &annotated.reordered_commit,
 193                    &annotated.repo_url,
 194                    &annotated.commit_sha,
 195                    split_point.clone(),
 196                    args.seed,
 197                    None,
 198                )
 199                .with_context(|| {
 200                    format!(
 201                        "failed to generate evaluation example for commit {} at line {}",
 202                        annotated.commit_sha,
 203                        line_num + 1
 204                    )
 205                })?;
 206
 207                let json = if args.pretty {
 208                    serde_json::to_string_pretty(&case)
 209                } else {
 210                    serde_json::to_string(&case)
 211                }
 212                .context("failed to serialize evaluation case as JSON")?;
 213
 214                output_lines.push(json);
 215            }
 216        }
 217    }
 218
 219    let output_content = output_lines.join("\n") + if output_lines.is_empty() { "" } else { "\n" };
 220
 221    if let Some(path) = output_path {
 222        fs::write(path, &output_content)
 223            .with_context(|| format!("failed to write output to {}", path.display()))?;
 224    } else {
 225        io::stdout()
 226            .write_all(output_content.as_bytes())
 227            .context("failed to write to stdout")?;
 228    }
 229
 230    Ok(())
 231}
 232
 233/// Main function to generate an evaluation example from an ordered commit.
 234///
 235/// # Arguments
 236/// * `commit` - Chronologically ordered unified diff of the commit
 237/// * `repository_url` - URL of the repository
 238/// * `commit_hash` - Hash of the commit
 239/// * `split_point` - Point at which the commit will be split (None for random)
 240/// * `seed` - Optional seed for randomness
 241/// * `sample_num` - Optional sample number for generating unique names
 242pub fn generate_evaluation_example_from_ordered_commit(
 243    commit: &str,
 244    repository_url: &str,
 245    commit_hash: &str,
 246    split_point: Option<SplitPoint>,
 247    seed: Option<u64>,
 248    sample_num: Option<usize>,
 249) -> Result<ExampleSpec> {
 250    let mut rng: Box<dyn rand::RngCore> = match seed {
 251        Some(seed) => Box::new(rand::rngs::StdRng::seed_from_u64(seed)),
 252        None => Box::new(rand::rngs::ThreadRng::default()),
 253    };
 254
 255    // Parse and normalize the commit
 256    let mut patch = Patch::parse_unified_diff(commit);
 257
 258    // Filter header to only keep lines starting with "//"
 259    let header_lines: Vec<&str> = patch
 260        .header
 261        .lines()
 262        .filter(|line| line.starts_with("//"))
 263        .collect();
 264    patch.header = if header_lines.is_empty() {
 265        String::new()
 266    } else {
 267        header_lines.join("\n") + "\n"
 268    };
 269    let commit_normalized = patch.to_string();
 270
 271    // Compute the split point
 272    let stats = patch.stats();
 273    let num_edits = stats.added + stats.removed;
 274
 275    anyhow::ensure!(num_edits != 0, "no edits found in commit");
 276
 277    let split = match split_point {
 278        None => rng.random_range(1..=num_edits),
 279        Some(SplitPoint::Fraction(f)) => {
 280            let v = (f * num_edits as f64).floor() as usize;
 281            v.min(num_edits)
 282        }
 283        Some(SplitPoint::Index(i)) => i.min(num_edits),
 284    };
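        // For example, with 10 edited lines a split point of Fraction(0.5) puts the
        // first 5 edits into the source patch and leaves the remaining 5 for the
        // target patch.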
 285
 286    // Split the commit into source and target patches
 287    let (prefix, suffix) = split_ordered_commit(&commit_normalized, split);
 288
 289    let mut split_commit = SplitCommit {
 290        source_patch: prefix,
 291        target_patch: suffix,
 292    };
 293
 294    // Imitate human edits
 295    let human_edit_seed = rng.random_range(1..=10000u64);
 296    let (src_patch, tgt_patch, cursor_opt) = imitate_human_edits(
 297        &split_commit.source_patch,
 298        &split_commit.target_patch,
 299        human_edit_seed,
 300    );
 301    split_commit.source_patch = src_patch;
 302    split_commit.target_patch = tgt_patch;
 303
 304    // Sample cursor position
 305    let cursor = match cursor_opt {
 306        Some(c) => c,
 307        None => sample_cursor_position(&patch, &split_commit)
 308            .context("failed to sample cursor position")?,
 309    };
 310
 311    // Get cursor excerpt
 312    let cursor_excerpt = get_cursor_excerpt(
 313        &cursor,
 314        &split_commit.source_patch,
 315        &split_commit.target_patch,
 316    )
 317    .context("failed to generate cursor excerpt")?;
 318
 319    // Handle the edge case where an explicit split point of 0 was requested
 320    if split == 0 {
 321        split_commit.target_patch = String::new();
 322    }
 323
 324    let repo_name = repository_url
 325        .trim_end_matches('/')
 326        .rsplit('/')
 327        .next()
 328        .unwrap_or("unknown");
 329    let short_sha = &commit_hash[..commit_hash.len().min(8)];
 330    let name = match sample_num {
 331        Some(n) => format!("{}-{}-{}", repo_name, short_sha, n),
 332        None => format!("{}-{}", repo_name, short_sha),
 333    };
 334
 335    Ok(ExampleSpec {
 336        name,
 337        repository_url: repository_url.to_string(),
 338        revision: format!("{}~1", commit_hash),
 339        edit_history: split_commit.source_patch.clone(),
 340        // cursor_position: cursor.to_string(),
 341        cursor_path: Path::new(&cursor.file).into(),
 342        cursor_position: cursor_excerpt,
 343        expected_patches: vec![split_commit.target_patch],
 344        tags: vec![],
 345        reasoning: None,
 346        uncommitted_diff: String::new(),
 347    })
 348}
 349
 350/// Split an ordered commit into source and target patches.
 351///
 352/// # Arguments
 353/// * `commit` - Ordered commit string
 354/// * `split_pos` - Position to split the commit (number of edited lines)
 355///
 356/// # Returns
 357/// A tuple of (source_diff, target_diff)
 358pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String) {
 359    let patch = Patch::parse_unified_diff(commit);
 360    let source_edits: BTreeSet<usize> = (0..split_pos).collect();
 361    let (source, target) = extract_edits(&patch, &source_edits);
 362
 363    let mut source_str = source.to_string();
 364    let target_str = target.to_string();
 365
 366    // Strip last group header from the source (lines starting with "//" at the end)
 367    let source_lines: Vec<&str> = source_str.lines().collect();
 368    let mut end_idx = source_lines.len();
 369    for i in (0..source_lines.len()).rev() {
 370        if source_lines[i].starts_with("//") {
 371            end_idx = i;
 372        } else {
 373            break;
 374        }
 375    }
 376    if end_idx < source_lines.len() {
 377        source_str = source_lines[..end_idx].join("\n");
 378        if !source_str.is_empty() {
 379            source_str.push('\n');
 380        }
 381    }
 382
 383    (source_str, target_str)
 384}
 385
 386/// Tokenize text into words and non-word characters.
 387fn tokenize(text: &str) -> Vec<String> {
 388    let mut tokens = Vec::new();
 389    let mut current = String::new();
 390
 391    for ch in text.chars() {
 392        if ch.is_alphanumeric() {
 393            current.push(ch);
 394        } else if ch == '_' {
 395            // An underscore ends the current word token, so identifiers
 396            // split at underscores (e.g. "foo_bar" -> ["foo_", "bar"]).
 397            current.push(ch);
 398            // `current` is never empty here: we just pushed the underscore.
 399            tokens.push(std::mem::take(&mut current));
 400        } else {
 401            // Punctuation or whitespace - flush current word first
 402            if !current.is_empty() {
 403                tokens.push(std::mem::take(&mut current));
 404            }
 405            // Each punctuation/whitespace is its own token
 406            tokens.push(ch.to_string());
 407        }
 408    }
 409
 410    if !current.is_empty() {
 411        tokens.push(current);
 412    }
 413
 414    tokens
 415}
 416
 417/// Calculate the weight for a split position based on the character just before it (what was just "typed").
 418///
 419/// Higher weights indicate more natural pause points (e.g., after punctuation,
 420/// at identifier boundaries). Lower weights indicate less natural points
 421/// (e.g., mid-identifier).
 422fn position_weight(text: &str, pos: usize) -> u32 {
 423    if pos == 0 || pos > text.len() {
 424        return 1;
 425    }
 426
 427    let chars: Vec<char> = text.chars().collect();
 428    if pos > chars.len() {
 429        return 1;
 430    }
 431
 432    // Get the character just before this position (what we just "typed")
 433    let prev_char = chars[pos - 1];
 434
 435    // High weight: natural pause points (end of statement/argument, opening brackets)
 436    if matches!(prev_char, ',' | ';' | ':' | '(' | '[' | '{') {
 437        return 10;
 438    }
 439
 440    // High weight: closing brackets (finished a group)
 441    if matches!(prev_char, ')' | ']' | '}') {
 442        return 8;
 443    }
 444
 445    // Medium weight: operators and method chains
 446    if matches!(
 447        prev_char,
 448        '.' | '+' | '-' | '*' | '/' | '=' | '<' | '>' | '&' | '|' | '!'
 449    ) {
 450        return 5;
 451    }
 452
 453    // Check if we're at the end of an identifier (word char followed by non-word char)
 454    let is_prev_word_char = prev_char.is_alphanumeric() || prev_char == '_';
 455    let is_next_word_char =
 456        pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_');
 457
 458    if is_prev_word_char && !is_next_word_char {
 459        // End of identifier - high weight
 460        return 8;
 461    }
 462
 463    // Whitespace is a natural pause
 464    if prev_char.is_whitespace() {
 465        return 6;
 466    }
 467
 468    // Mid-identifier: low weight (rare autocomplete scenarios)
 469    if is_prev_word_char && is_next_word_char {
 470        return 1;
 471    }
 472
 473    // Default medium-low weight
 474    3
 475}
 476
 477/// Select a weighted random index from a list of weights.
 478///
 479/// Returns an index based on the weights, using the provided seed for
 480/// deterministic selection.
 481fn weighted_select(weights: &[u32], seed: u64) -> usize {
 482    if weights.is_empty() {
 483        return 0;
 484    }
 485
 486    let total_weight: u64 = weights.iter().map(|&w| w as u64).sum();
 487    if total_weight == 0 {
 488        // Fallback to uniform selection if all weights are zero
 489        return seed as usize % weights.len();
 490    }
 491
 492    // Use seed to select a value in [0, total_weight)
 493    let target = seed % total_weight;
 494    let mut cumulative: u64 = 0;
 495
 496    for (idx, &weight) in weights.iter().enumerate() {
 497        cumulative += weight as u64;
 498        if target < cumulative {
 499            return idx;
 500        }
 501    }
 502
 503    // Fallback to last index
 504    weights.len() - 1
 505}
 506
 507/// Calculate similarity ratio between two strings (0-100).
 508fn fuzzy_ratio(s1: &str, s2: &str) -> u32 {
 509    if s1.is_empty() && s2.is_empty() {
 510        return 100;
 511    }
 512    if s1.is_empty() || s2.is_empty() {
 513        return 0;
 514    }
 515
 516    let diff = TextDiff::from_chars(s1, s2);
 517    let matching: usize = diff
 518        .ops()
 519        .iter()
 520        .filter_map(|op| {
 521            if matches!(op.tag(), DiffTag::Equal) {
 522                Some(op.new_range().len())
 523            } else {
 524                None
 525            }
 526        })
 527        .sum();
 528
 529    let total = s1.len() + s2.len();
 530    ((2 * matching * 100) / total) as u32
 531}
 532
 533/// Imitate human edits by introducing partial line edits.
 534///
 535/// This function simulates how a human might incrementally type code,
 536/// rather than making complete line replacements.
 537pub fn imitate_human_edits(
 538    source_patch: &str,
 539    target_patch: &str,
 540    seed: u64,
 541) -> (String, String, Option<CursorPosition>) {
 542    let no_change = (source_patch.to_string(), target_patch.to_string(), None);
 543
 544    let src_patch = Patch::parse_unified_diff(source_patch);
 545    let tgt_patch = Patch::parse_unified_diff(target_patch);
 546
 547    if tgt_patch.hunks.is_empty() {
 548        return no_change;
 549    }
 550
 551    // Try to locate the first edit in target
 552    let tgt_edit_loc = match locate_edited_line(&tgt_patch, 0) {
 553        Some(loc) => loc,
 554        None => return no_change,
 555    };
 556
 557    let tgt_is_addition = matches!(tgt_edit_loc.patch_line, PatchLine::Addition(_));
 558    if !tgt_is_addition {
 559        return no_change;
 560    }
 561
 562    let tgt_line = match &tgt_edit_loc.patch_line {
 563        PatchLine::Addition(s) => s.clone(),
 564        _ => return no_change,
 565    };
 566
 567    // Try to locate the last edit in source
 568    let src_edit_loc = locate_edited_line(&src_patch, -1);
 569
 570    // Check if source has ANY edit at the same line as target's first edit
 571    // We need to iterate through all edits to check this
 572    let src_has_edit_at_target_line = {
 573        let mut found = false;
 574        let mut idx = 0isize;
 575        while let Some(loc) = locate_edited_line(&src_patch, idx) {
 576            if loc.filename == tgt_edit_loc.filename
 577                && loc.target_line_number == tgt_edit_loc.target_line_number
 578            {
 579                found = true;
 580                break;
 581            }
 582            idx += 1;
 583        }
 584        found
 585    };
 586
 587    // Check if this is a replacement (deletion followed by insertion on the same line)
 588    // or a pure insertion (no corresponding deletion in source)
 589    let is_replacement = src_edit_loc.as_ref().map_or(false, |loc| {
 590        matches!(loc.patch_line, PatchLine::Deletion(_))
 591            && loc.filename == tgt_edit_loc.filename
 592            && loc.target_line_number == tgt_edit_loc.target_line_number
 593    });
 594
 595    // If source has an edit at the same line but it's not a replacement (i.e., it's an addition),
 596    // we shouldn't process this as a pure insertion either
 597    if !is_replacement && src_has_edit_at_target_line {
 598        return no_change;
 599    }
 600
 601    let src_line = if is_replacement {
 602        match &src_edit_loc.as_ref().unwrap().patch_line {
 603            PatchLine::Deletion(s) => s.clone(),
 604            _ => return no_change,
 605        }
 606    } else {
 607        // Pure insertion: source line is empty
 608        String::new()
 609    };
 610
 611    // Don't process if source and target are the same
 612    if src_line == tgt_line {
 613        return no_change;
 614    }
 615
 616    // Tokenize both lines
 617    let src_tokens = tokenize(&src_line);
 618    let tgt_tokens = tokenize(&tgt_line);
 619
 620    // Convert to slices for similar
 621    let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect();
 622    let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect();
 623
 624    // Use similar to get diff operations
 625    let diff = TextDiff::from_slices(&src_refs, &tgt_refs);
 626
 627    // Build weights for each possible split position
 628    let mut position_weights: Vec<u32> = Vec::new();
 629
 630    // Simulate the edit process to collect weights for all possible split positions
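        // For instance, replacing `old` with `new_value` pushes one weight per
        // character of the inserted text (higher after `,`, `(`, and at identifier
        // boundaries, per `position_weight`) plus a small constant weight for each
        // deleted character, so the weighted choice of `split_index` below tends to
        // land at natural pause points. (The token values here are illustrative.)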
 631    {
 632        let mut current_text = String::new();
 633
 634        for op in diff.ops() {
 635            match op.tag() {
 636                DiffTag::Equal => {
 637                    for i in op.old_range() {
 638                        current_text.push_str(&src_tokens[i]);
 639                    }
 640                }
 641                DiffTag::Replace => {
 642                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 643                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 644
 645                    // For insertion part
 646                    for ch in ins.chars() {
 647                        current_text.push(ch);
 648                        let weight = position_weight(&current_text, current_text.len());
 649                        position_weights.push(weight);
 650                    }
 651
 652                    // For deletion part (we're "untyping" from source)
 653                    for _ in del.chars() {
 654                        // Weight deletions lower as they represent removing text
 655                        position_weights.push(2);
 656                    }
 657                }
 658                DiffTag::Insert => {
 659                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 660                    for ch in ins.chars() {
 661                        current_text.push(ch);
 662                        let weight = position_weight(&current_text, current_text.len());
 663                        position_weights.push(weight);
 664                    }
 665                }
 666                DiffTag::Delete => {
 667                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 668                    for _ in del.chars() {
 669                        // Weight deletions lower
 670                        position_weights.push(2);
 671                    }
 672                }
 673            }
 674        }
 675    }
 676
 677    // Use weighted selection to choose split index
 678    if position_weights.is_empty() {
 679        return no_change;
 680    }
 681    let split_index = weighted_select(&position_weights, seed);
 682
 683    let mut edit_index = 0usize;
 684    let mut new_src = String::new();
 685    let mut split_found = false;
 686    let mut last_old_end = 0usize;
 687
 688    for op in diff.ops() {
 689        match op.tag() {
 690            DiffTag::Equal => {
 691                for i in op.old_range() {
 692                    new_src.push_str(&src_tokens[i]);
 693                }
 694                last_old_end = op.old_range().end;
 695            }
 696            DiffTag::Replace => {
 697                // Handle replace as delete + insert
 698                let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 699                let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 700                let repl_len = del.len() + ins.len();
 701                if edit_index + repl_len >= split_index {
 702                    // Split within this replace operation
 703                    let offset = split_index - edit_index;
 704                    if offset < ins.len() {
 705                        let safe_offset = floor_char_boundary(&ins, offset);
 706                        new_src.push_str(&ins[..safe_offset]);
 707                    } else {
 708                        new_src.push_str(&ins);
 709                        let del_offset = offset - ins.len();
 710                        let safe_del_offset = floor_char_boundary(&del, del_offset.min(del.len()));
 711                        new_src.push_str(&del[..safe_del_offset]);
 712                    }
 713                    split_found = true;
 714                    last_old_end = op.old_range().end;
 715                    break;
 716                } else {
 717                    edit_index += repl_len;
 718                    new_src.push_str(&ins);
 719                    last_old_end = op.old_range().end;
 720                }
 721            }
 722            DiffTag::Insert => {
 723                let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
 724                if edit_index + repl.len() >= split_index {
 725                    let offset = split_index - edit_index;
 726                    let safe_offset = floor_char_boundary(&repl, offset);
 727                    new_src.push_str(&repl[..safe_offset]);
 728                    split_found = true;
 729                    break;
 730                } else {
 731                    edit_index += repl.len();
 732                    new_src.push_str(&repl);
 733                }
 734            }
 735            DiffTag::Delete => {
 736                let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
 737                if edit_index + repl.len() >= split_index {
 738                    let offset = split_index - edit_index;
 739                    let safe_offset = floor_char_boundary(&repl, offset);
 740                    new_src.push_str(&repl[..safe_offset]);
 741                    split_found = true;
 742                    last_old_end = op.old_range().start + safe_offset.min(op.old_range().len());
 743                    break;
 744                } else {
 745                    edit_index += repl.len();
 746                    new_src.push_str(&repl);
 747                    last_old_end = op.old_range().end;
 748                }
 749            }
 750        }
 751    }
 752
 753    if !split_found {
 754        return no_change;
 755    }
 756
 757    // Calculate cursor position
 758    let cursor = CursorPosition {
 759        file: tgt_edit_loc.filename.clone(),
 760        line: if is_replacement {
 761            src_edit_loc.as_ref().unwrap().source_line_number
 762        } else {
 763            tgt_edit_loc.target_line_number
 764        },
 765        column: new_src.len() + 1,
 766    };
 767
 768    // Add remainder of source if similar enough to target remainder
 769    let remainder_src: String = (last_old_end..src_tokens.len())
 770        .map(|i| src_tokens[i].as_str())
 771        .collect();
 772    let remainder_tgt: String = (last_old_end..tgt_tokens.len())
 773        .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str()))
 774        .collect();
 775
 776    let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt);
 777    if ratio > 35 {
 778        new_src.push_str(&remainder_src);
 779    }
 780
 781    if new_src.trim().is_empty() {
 782        return no_change;
 783    }
 784
 785    if new_src == src_line {
 786        return no_change;
 787    }
 788
 789    // Build new source patch with the intermediate line
 790    let mut new_src_patch = src_patch;
 791    if is_replacement {
 792        // For replacements, insert after the deletion line
 793        let src_loc = src_edit_loc.as_ref().unwrap();
 794        if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 795            hunk.lines.insert(
 796                src_loc.line_index_within_hunk + 1,
 797                PatchLine::Addition(new_src.clone()),
 798            );
 799            hunk.new_count += 1;
 800        }
 801    } else {
 802        // For pure insertions, insert after the last edit in source patch
 803        // This imitates human typing: the intermediate content is what the user is currently typing.
 804        let last_src_edit = locate_edited_line(&new_src_patch, -1);
 805
 806        if let Some(src_loc) = last_src_edit {
 807            // Insert after the last edit in source
 808            if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
 809                hunk.lines.insert(
 810                    src_loc.line_index_within_hunk + 1,
 811                    PatchLine::Addition(new_src.clone()),
 812                );
 813                hunk.new_count += 1;
 814            }
 815        } else {
 816            // Source patch is empty or has an incompatible hunk structure, so create a new hunk based on the target
 817            if let Some(tgt_hunk) = tgt_patch.hunks.get(tgt_edit_loc.hunk_index) {
 818                let mut new_hunk = tgt_hunk.clone();
 819                // Replace the full addition with the partial one
 820                new_hunk.lines.clear();
 821                for (i, line) in tgt_hunk.lines.iter().enumerate() {
 822                    if i == tgt_edit_loc.line_index_within_hunk {
 823                        new_hunk.lines.push(PatchLine::Addition(new_src.clone()));
 824                    } else {
 825                        match line {
 826                            PatchLine::Addition(_) => {
 827                                // Skip other additions from target
 828                            }
 829                            _ => new_hunk.lines.push(line.clone()),
 830                        }
 831                    }
 832                }
 833                new_hunk.new_count = new_hunk.old_count + 1;
 834                new_src_patch.hunks.push(new_hunk);
 835                // Copy header from target if source doesn't have one
 836                if new_src_patch.header.is_empty() {
 837                    new_src_patch.header = tgt_patch.header.clone();
 838                }
 839            }
 840        }
 841    }
 842
 843    // Build new target patch with the intermediate line as deletion
 844    let mut new_tgt_patch = tgt_patch;
 845    if let Some(hunk) = new_tgt_patch.hunks.get_mut(tgt_edit_loc.hunk_index) {
 846        hunk.lines.insert(
 847            tgt_edit_loc.line_index_within_hunk,
 848            PatchLine::Deletion(new_src),
 849        );
 850        hunk.old_count += 1;
 851    }
 852
 853    (
 854        new_src_patch.to_string(),
 855        new_tgt_patch.to_string(),
 856        Some(cursor),
 857    )
 858}
 859
 860/// Locate the end of the last edit in a patch.
 861fn locate_end_of_last_edit(patch: &Patch) -> Option<CursorPosition> {
 862    let loc = locate_edited_line(patch, -1)?;
 863
 864    let (line, col) = match &loc.patch_line {
 865        PatchLine::Addition(content) => (loc.target_line_number, content.len()),
 866        PatchLine::Deletion(_) => (loc.target_line_number, 1),
 867        _ => return None,
 868    };
 869
 870    Some(CursorPosition {
 871        file: loc.filename,
 872        line,
 873        column: col,
 874    })
 875}
 876
 877/// Locate the beginning of the first edit in a patch.
 878fn locate_beginning_of_first_edit(patch: &Patch) -> Option<CursorPosition> {
 879    let loc = locate_edited_line(patch, 0)?;
 880
 881    let hunk = patch.hunks.get(loc.hunk_index)?;
 882    let column = if loc.line_index_within_hunk > 0 {
 883        if let Some(prev_line) = hunk.lines.get(loc.line_index_within_hunk - 1) {
 884            let content = match prev_line {
 885                PatchLine::Context(s) | PatchLine::Addition(s) | PatchLine::Deletion(s) => s,
 886                _ => return None,
 887            };
 888            content.len().max(1) - 1
 889        } else {
 890            0
 891        }
 892    } else {
 893        0
 894    };
 895
 896    let line = loc.target_line_number.saturating_sub(1).max(1);
 897
 898    Some(CursorPosition {
 899        file: loc.filename,
 900        line,
 901        column,
 902    })
 903}
 904
 905/// Sample a cursor position for the split commit.
 906/// Prefers the end of the last edit in the source patch, then the beginning of the first
 907/// edit in the target patch, and finally falls back to the end of the last edit in the original patch.
 908pub fn sample_cursor_position(patch: &Patch, split_commit: &SplitCommit) -> Option<CursorPosition> {
 909    // Try end of history first
 910    let src_patch = Patch::parse_unified_diff(&split_commit.source_patch);
 911    if let Some(cursor) = locate_end_of_last_edit(&src_patch) {
 912        return Some(cursor);
 913    }
 914
 915    // Try beginning of target
 916    let tgt_patch = Patch::parse_unified_diff(&split_commit.target_patch);
 917    if let Some(cursor) = locate_beginning_of_first_edit(&tgt_patch) {
 918        return Some(cursor);
 919    }
 920
 921    // Fallback: use the original patch
 922    locate_end_of_last_edit(patch)
 923}
 924
 925/// Get cursor excerpt from the patches.
 926///
 927/// This extracts the lines around the cursor position with a cursor marker.
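    /// The cursor is marked inline with `<|user_cursor|>` on the line matching
    /// `cursor.line`, and the excerpt is returned as newline-joined lines.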
 928pub fn get_cursor_excerpt(
 929    cursor: &CursorPosition,
 930    source_patch: &str,
 931    target_patch: &str,
 932) -> Option<String> {
 933    let mut excerpt_lines: Vec<String> = Vec::new();
 934    let mut excerpt_first_line: usize = 0;
 935
 936    // Search in the last hunk of source patch
 937    let src = Patch::parse_unified_diff(source_patch);
 938    if let Some(loc) = locate_edited_line(&src, -1) {
 939        if loc.filename == cursor.file && loc.target_line_number == cursor.line {
 940            if let Some(hunk) = src.hunks.get(loc.hunk_index) {
 941                excerpt_first_line = hunk.new_start as usize;
 942                for line in &hunk.lines {
 943                    match line {
 944                        PatchLine::Addition(s) | PatchLine::Context(s) => {
 945                            excerpt_lines.push(s.clone());
 946                        }
 947                        _ => {}
 948                    }
 949                }
 950                // If hunk only has deletions (file deletion), include deletion lines
 951                if excerpt_lines.is_empty() {
 952                    excerpt_first_line = hunk.old_start as usize;
 953                    for line in &hunk.lines {
 954                        match line {
 955                            PatchLine::Deletion(s) => {
 956                                excerpt_lines.push(s.clone());
 957                            }
 958                            _ => {}
 959                        }
 960                    }
 961                }
 962            }
 963        }
 964    }
 965
 966    // Search in target patch if not found
 967    if excerpt_lines.is_empty() {
 968        let tgt = Patch::parse_unified_diff(target_patch);
 969        // Search all hunks for the cursor file, not just the first edit's hunk
 970        for hunk in &tgt.hunks {
 971            if hunk.filename == cursor.file {
 972                excerpt_first_line = hunk.new_start as usize;
 973                // First try to collect deletions and context (what exists before edits)
 974                for line in &hunk.lines {
 975                    match line {
 976                        PatchLine::Deletion(s) | PatchLine::Context(s) => {
 977                            excerpt_lines.push(s.clone());
 978                        }
 979                        _ => {}
 980                    }
 981                }
 982                // If hunk only has additions (no deletions/context), include all lines
 983                // This handles cases like adding to an empty file or section
 984                if excerpt_lines.is_empty() {
 985                    for line in &hunk.lines {
 986                        match line {
 987                            PatchLine::Addition(s)
 988                            | PatchLine::Deletion(s)
 989                            | PatchLine::Context(s) => {
 990                                excerpt_lines.push(s.clone());
 991                            }
 992                            _ => {}
 993                        }
 994                    }
 995                }
 996                if !excerpt_lines.is_empty() {
 997                    break;
 998                }
 999            }
1000        }
1001    }
1002
1003    // Also search source patch hunks if still not found (for fallback cursor case)
1004    if excerpt_lines.is_empty() {
1005        for hunk in &src.hunks {
1006            if hunk.filename == cursor.file {
1007                excerpt_first_line = hunk.new_start as usize;
1008                for line in &hunk.lines {
1009                    match line {
1010                        PatchLine::Addition(s) | PatchLine::Context(s) => {
1011                            excerpt_lines.push(s.clone());
1012                        }
1013                        _ => {}
1014                    }
1015                }
1016                // If hunk only has deletions, include deletion lines
1017                if excerpt_lines.is_empty() {
1018                    excerpt_first_line = hunk.old_start as usize;
1019                    for line in &hunk.lines {
1020                        match line {
1021                            PatchLine::Deletion(s) => {
1022                                excerpt_lines.push(s.clone());
1023                            }
1024                            _ => {}
1025                        }
1026                    }
1027                }
1028                if !excerpt_lines.is_empty() {
1029                    break;
1030                }
1031            }
1032        }
1033    }
1034
1035    if excerpt_lines.is_empty() {
1036        return None;
1037    }
1038
1039    // Add cursor marker
1040    for (i, line) in excerpt_lines.iter_mut().enumerate() {
1041        let line_num = excerpt_first_line + i;
1042        if line_num == cursor.line {
1043            // Clamp the cursor column to the line and snap it to a valid UTF-8 char boundary.
1044            let col = floor_char_boundary(line, cursor.column.min(line.len()));
1054            let (before, after) = line.split_at(col);
1055            *line = format!("{}<|user_cursor|>{}", before, after);
1056            break;
1057        }
1058    }
1059
1060    Some(excerpt_lines.join("\n"))
1061}
1062
1063#[cfg(test)]
1064mod tests {
1065    use std::path::Path;
1066
1067    use edit_prediction::example_spec::ExampleSpec;
1068
1069    use super::*;
1070
1071    #[test]
1072    fn test_tokenize() {
1073        let tokens = tokenize("hello world");
1074        assert_eq!(tokens, vec!["hello", " ", "world"]);
1075
1076        let tokens = tokenize("foo_bar123 + baz");
1077        assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]);
1078
1079        let tokens = tokenize("print(\"hello\")");
1080        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
1081
1082        let tokens = tokenize("hello_world");
1083        assert_eq!(tokens, vec!["hello_", "world"]);
1084
1085        let tokens = tokenize("fn();");
1086        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
1087    }
1088
1089    #[test]
1090    fn test_fuzzy_ratio() {
1091        assert_eq!(fuzzy_ratio("hello", "hello"), 100);
1092        assert_eq!(fuzzy_ratio("", ""), 100);
1093        assert!(fuzzy_ratio("hello", "world") < 50);
1094        assert!(fuzzy_ratio("hello world", "hello worl") > 80);
1095    }
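
        // Sketch-level tests for two small helpers not covered directly above. The
        // expected values simply restate the current behavior of `parse_split_point`
        // and `floor_char_boundary` in this module.
        #[test]
        fn test_parse_split_point() {
            assert!(matches!(
                parse_split_point("0.5"),
                Some(SplitPoint::Fraction(f)) if (f - 0.5).abs() < f64::EPSILON
            ));
            assert!(matches!(parse_split_point("3"), Some(SplitPoint::Index(3))));
            assert!(parse_split_point("three").is_none());
        }

        #[test]
        fn test_floor_char_boundary() {
            // Byte index 2 falls inside the two-byte 'é', so the boundary floors to 1.
            assert_eq!(floor_char_boundary("aé", 2), 1);
            // An index past the end clamps to the string length.
            assert_eq!(floor_char_boundary("abc", 10), 3);
            // Index 0 is always a valid boundary.
            assert_eq!(floor_char_boundary("abc", 0), 0);
        }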
1096
1097    #[test]
1098    fn test_split_ordered_commit() {
1099        let commit = r#"// First change
1100--- a/test.rs
1101+++ b/test.rs
1102@@ -1,3 +1,4 @@
1103 fn main() {
1104+    println!("hello");
1105+    println!("world");
1106 }
1107"#;
1108        let patch = Patch::parse_unified_diff(commit);
1109        let stats = patch.stats();
1110        assert_eq!(stats.added, 2);
1111
1112        let (source, target) = split_ordered_commit(commit, 1);
1113
1114        // Source should have 1 addition
1115        let src_patch = Patch::parse_unified_diff(&source);
1116        assert_eq!(src_patch.stats().added, 1);
1117
1118        // Target should have 1 addition
1119        let tgt_patch = Patch::parse_unified_diff(&target);
1120        assert_eq!(tgt_patch.stats().added, 1);
1121    }
1122
1123    #[test]
1124    fn test_split_ordered_commit_with_deletions() {
1125        let commit = r#"// Change
1126--- a/test.rs
1127+++ b/test.rs
1128@@ -1,3 +1,3 @@
1129 fn main() {
1130-    println!("old");
1131+    println!("new");
1132 }
1133"#;
1134        let patch = Patch::parse_unified_diff(commit);
1135        let stats = patch.stats();
1136        assert_eq!(stats.added, 1);
1137        assert_eq!(stats.removed, 1);
1138
1139        // Split at position 1 (after the deletion)
1140        let (source, target) = split_ordered_commit(commit, 1);
1141
1142        let src_patch = Patch::parse_unified_diff(&source);
1143        let tgt_patch = Patch::parse_unified_diff(&target);
1144
1145        // Source should have the deletion
1146        assert_eq!(src_patch.stats().removed, 1);
1147        // Target should have the addition
1148        assert_eq!(tgt_patch.stats().added, 1);
1149    }
1150
1151    #[test]
1152    fn test_generate_evaluation_example() {
1153        let commit = r#"commit abc123
1154Author: Test <test@example.com>
1155Date: Mon Jan 1 00:00:00 2024
1156
1157    Test commit
1158
1159////////////////////////////////////////////////////////////////////////////////
1160// Add greeting
1161////////////////////////////////////////////////////////////////////////////////
1162--- a/test.rs
1163+++ b/test.rs
1164@@ -1,3 +1,5 @@
1165 fn main() {
1166+    println!("hello");
1167+    println!("world");
1168 }
1169"#;
1170
1171        let result = generate_evaluation_example_from_ordered_commit(
1172            commit,
1173            "https://github.com/test/repo",
1174            "abc123",
1175            Some(SplitPoint::Fraction(0.5)),
1176            Some(42),
1177            None,
1178        );
1179
1180        assert!(result.is_ok());
1181        let case = result.unwrap();
1182        assert_eq!(case.repository_url, "https://github.com/test/repo");
1183        assert_eq!(case.revision, "abc123~1");
1184        assert!(!case.edit_history.is_empty());
1185    }
1186
1187    #[test]
1188    fn test_generate_evaluation_example_reproducible() {
1189        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1190// Add greeting
1191////////////////////////////////////////////////////////////////////////////////
1192--- a/test.rs
1193+++ b/test.rs
1194@@ -1,3 +1,5 @@
1195 fn main() {
1196+    println!("hello");
1197+    println!("world");
1198 }
1199"#;
1200
1201        // Run twice with the same seed
1202        let result1 = generate_evaluation_example_from_ordered_commit(
1203            commit,
1204            "https://github.com/test/repo",
1205            "abc123",
1206            Some(SplitPoint::Fraction(0.5)),
1207            Some(12345),
1208            None,
1209        )
1210        .unwrap();
1211
1212        let result2 = generate_evaluation_example_from_ordered_commit(
1213            commit,
1214            "https://github.com/test/repo",
1215            "abc123",
1216            Some(SplitPoint::Fraction(0.5)),
1217            Some(12345),
1218            None,
1219        )
1220        .unwrap();
1221
1222        // Results should be identical
1223        assert_eq!(result1.edit_history, result2.edit_history);
1224        assert_eq!(result1.expected_patches, result2.expected_patches);
1225        assert_eq!(result1.cursor_position, result2.cursor_position);
1226    }
1227
1228    #[test]
1229    fn test_cursor_position_display() {
1230        let cursor = CursorPosition {
1231            file: "src/main.rs".to_string(),
1232            line: 42,
1233            column: 10,
1234        };
1235        assert_eq!(cursor.to_string(), "src/main.rs:42:10");
1236    }
1237
1238    #[test]
1239    fn test_imitate_human_edits_no_change_when_no_replacement() {
1240        // Source and target patches that don't form a replacement pattern
1241        let source = r#"--- a/test.rs
1242+++ b/test.rs
1243@@ -1,3 +1,4 @@
1244 fn main() {
1245+    println!("hello");
1246 }
1247"#;
1248        let target = r#"--- a/test.rs
1249+++ b/test.rs
1250@@ -1,3 +1,4 @@
1251 fn main() {
1252+    println!("world");
1253 }
1254"#;
1255
1256        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);
1257
1258        // Should return unchanged when not a replacement pattern
1259        assert_eq!(new_src, source);
1260        assert_eq!(new_tgt, target);
1261        assert!(cursor.is_none());
1262    }
1263
1264    #[test]
1265    fn test_split_point_fraction() {
1266        let commit = r#"// Change
1267--- a/test.rs
1268+++ b/test.rs
1269@@ -1,5 +1,10 @@
1270 fn main() {
1271+    line1();
1272+    line2();
1273+    line3();
1274+    line4();
1275+    line5();
1276 }
1277"#;
1278
1279        // Split at 20% should give first edit in source
1280        let result = generate_evaluation_example_from_ordered_commit(
1281            commit,
1282            "",
1283            "hash",
1284            Some(SplitPoint::Fraction(0.2)),
1285            Some(1),
1286            None,
1287        );
1288
1289        assert!(result.is_ok());
1290        let case = result.unwrap();
1291
1292        // Source should have some edits
1293        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1294        assert!(src_patch.stats().added > 0);
1295    }
1296
1297    #[test]
1298    fn test_split_point_index() {
1299        let commit = r#"// Change
1300--- a/test.rs
1301+++ b/test.rs
1302@@ -1,5 +1,10 @@
1303 fn main() {
1304+    line1();
1305+    line2();
1306+    line3();
1307+    line4();
1308+    line5();
1309 }
1310"#;
1311
1312        // Split at index 2 should give first 2 edits in source
1313        // With pure insertion handling, source gets 2 original + 1 partial = 3 additions
1314        let result = generate_evaluation_example_from_ordered_commit(
1315            commit,
1316            "",
1317            "hash",
1318            Some(SplitPoint::Index(2)),
1319            Some(1),
1320            None,
1321        );
1322
1323        assert!(result.is_ok());
1324        let case = result.unwrap();
1325
1326        let src_patch = Patch::parse_unified_diff(&case.edit_history);
1327        // Pure insertion adds a partial line, so we expect 3 (2 original + 1 partial)
1328        assert_eq!(src_patch.stats().added, 3);
1329    }
1330
1331    #[test]
1332    fn test_cursor_excerpt_contains_marker() {
1333        let commit = r#"////////////////////////////////////////////////////////////////////////////////
1334// Add code
1335////////////////////////////////////////////////////////////////////////////////
1336--- a/test.rs
1337+++ b/test.rs
1338@@ -1,3 +1,5 @@
1339 fn main() {
1340+    println!("hello");
1341+    println!("world");
1342 }
1343"#;
1344
1345        let result = generate_evaluation_example_from_ordered_commit(
1346            commit,
1347            "",
1348            "hash",
1349            Some(SplitPoint::Fraction(0.5)),
1350            Some(42),
1351            None,
1352        )
1353        .unwrap();
1354
1355        // Cursor excerpt should contain the cursor marker
1356        assert!(
1357            result.cursor_position.contains("<|user_cursor|>"),
1358            "Cursor excerpt should contain marker: {}",
1359            result.cursor_position
1360        );
1361    }
1362
1363    #[test]
1364    fn test_evaluation_case_json_serialization() {
1365        let case = ExampleSpec {
1366            name: "test-abc123".to_string(),
1367            repository_url: "https://github.com/test/repo".to_string(),
1368            revision: "abc123~1".to_string(),
1369            edit_history: "patch1".to_string(),
1370            // cursor_position: "file.rs:10:5".to_string(),
1371            cursor_path: Path::new("file.rs").into(),
1372            cursor_position: "some code<|user_cursor|>".to_string(),
1373            expected_patches: vec!["patch".to_string()],
1374            tags: vec![],
1375            reasoning: None,
1376            uncommitted_diff: String::new(),
1377        };
1378
1379        let json = serde_json::to_string(&case).unwrap();
1380        let deserialized: ExampleSpec = serde_json::from_str(&json).unwrap();
1381
1382        assert_eq!(case.repository_url, deserialized.repository_url);
1383        assert_eq!(case.revision, deserialized.revision);
1384        assert_eq!(case.cursor_position, deserialized.cursor_position);
1385    }
1386
1387    #[test]
1388    fn test_empty_commit_returns_error() {
1389        let commit = "";
1390
1391        let result = generate_evaluation_example_from_ordered_commit(
1392            commit,
1393            "",
1394            "hash",
1395            Some(SplitPoint::Fraction(0.5)),
1396            Some(1),
1397            None,
1398        );
1399
1400        assert!(result.is_err());
1401    }
1402
1403    #[test]
1404    fn test_header_filtering() {
1405        let commit = r#"commit abc123
1406Author: Test
1407Date: Today
1408
1409    Message
1410
1411diff --git a/test.rs b/test.rs
1412index 123..456 789
1413////////////////////////////////////////////////////////////////////////////////
1414// First group
1415////////////////////////////////////////////////////////////////////////////////
1416--- a/test.rs
1417+++ b/test.rs
1418@@ -1,3 +1,4 @@
1419 fn main() {
1420+    code();
1421 }
1422"#;
1423
1424        let result = generate_evaluation_example_from_ordered_commit(
1425            commit,
1426            "",
1427            "hash",
1428            Some(SplitPoint::Index(1)),
1429            Some(1),
1430            None,
1431        );
1432
1433        assert!(result.is_ok());
1434        let case = result.unwrap();
1435
1436        // The edit history should contain the group header (// lines)
1437        // but not the commit metadata
1438        assert!(!case.edit_history.contains("Author:"));
1439        assert!(!case.edit_history.contains("Date:"));
1440    }
1441
1442    #[test]
1443    fn test_position_weight() {
1444        // High weight positions (natural pause points)
1445        assert_eq!(position_weight("foo(", 4), 10); // After '('
1446        assert_eq!(position_weight("a, b", 2), 10); // After ','
1447        assert_eq!(position_weight("x;", 2), 10); // After ';'
1448        assert_eq!(position_weight("a: b", 2), 10); // After ':'
1449        assert_eq!(position_weight("[", 1), 10); // After '['
1450        assert_eq!(position_weight("{", 1), 10); // After '{'
1451
1452        // High weight for closing brackets
1453        assert_eq!(position_weight("foo)", 4), 8); // After ')'
1454        assert_eq!(position_weight("]", 1), 8); // After ']'
1455        assert_eq!(position_weight("}", 1), 8); // After '}'
1456
1457        // High weight at end of identifier
1458        assert_eq!(position_weight("foo ", 3), 8); // End of 'foo' before space
1459        assert_eq!(position_weight("bar(", 3), 8); // End of 'bar' before '('
1460
1461        // Medium weight for operators
1462        assert_eq!(position_weight("a + b", 3), 5); // After '+'
1463        assert_eq!(position_weight("x.", 2), 5); // After '.'
1464        assert_eq!(position_weight("a=b", 2), 5); // After '='
1465
1466        // Medium weight for whitespace
1467        assert_eq!(position_weight("a ", 2), 6); // After space
1468
1469        // Low weight mid-identifier
1470        assert_eq!(position_weight("foobar", 3), 1); // Mid-identifier 'foo|bar'
1471
1472        // Edge cases
1473        assert_eq!(position_weight("", 0), 1); // Empty string
1474        assert_eq!(position_weight("a", 0), 1); // Position 0
1475    }
1476
1477    #[test]
1478    fn test_weighted_select() {
1479        // Test that weighted selection returns correct indices
1480        let weights = vec![1, 10, 1];
1481
1482        // With total weight 12, seed 0 should select index 0
1483        // seed 0 % 12 = 0, cumulative: 1 at idx 0, so returns 0
1484        assert_eq!(weighted_select(&weights, 0), 0);
1485
1486        // seed 1 % 12 = 1, cumulative: 1 at idx 0 (1 < 1 is false), 11 at idx 1 (1 < 11 is true)
1487        assert_eq!(weighted_select(&weights, 1), 1);
1488
1489        // seed 10 % 12 = 10, cumulative: 1, 11 at idx 1 (10 < 11 is true)
1490        assert_eq!(weighted_select(&weights, 10), 1);
1491
1492        // seed 11 % 12 = 11, cumulative: 1, 11 at idx 1 (11 < 11 is false), 12 at idx 2 (11 < 12 is true)
1493        assert_eq!(weighted_select(&weights, 11), 2);
1494
1495        // Empty weights should return 0
1496        let empty: Vec<u32> = vec![];
1497        assert_eq!(weighted_select(&empty, 42), 0);
1498
1499        // Single weight should always return index 0
1500        let single = vec![10];
1501        assert_eq!(weighted_select(&single, 0), 0);
1502        assert_eq!(weighted_select(&single, 100), 0);
1503    }
1504
1505    #[test]
1506    fn test_weighted_split_prefers_natural_boundaries() {
1507        // Test that with different seeds, weighted selection tends to prefer
1508        // positions after punctuation over mid-identifier positions
1509        let text_with_punctuation = "foo(bar, baz)";
1510        let text_mid_identifier = "foobar";
1511
1512        // Position after '(' should have high weight
1513        let weight_after_paren = position_weight(text_with_punctuation, 4);
1514        // Position after ',' should have high weight
1515        let weight_after_comma = position_weight(text_with_punctuation, 8);
1516        // Position mid-identifier should have low weight
1517        let weight_mid_ident = position_weight(text_mid_identifier, 3);
1518
1519        assert!(
1520            weight_after_paren > weight_mid_ident,
1521            "After '(' ({}) should be weighted higher than mid-identifier ({})",
1522            weight_after_paren,
1523            weight_mid_ident
1524        );
1525        assert!(
1526            weight_after_comma > weight_mid_ident,
1527            "After ',' ({}) should be weighted higher than mid-identifier ({})",
1528            weight_after_comma,
1529            weight_mid_ident
1530        );
1531    }
1532
1533    #[test]
1534    fn test_imitate_human_edits_pure_insertion() {
1535        // Source patch is empty (no edits yet)
1536        // Target patch has a pure insertion (adding a new line)
1537        let source = r#"--- a/test.rs
1538+++ b/test.rs
1539@@ -1,2 +1,2 @@
1540 fn main() {
1541 }
1542"#;
1543        let target = r#"--- a/test.rs
1544+++ b/test.rs
1545@@ -1,2 +1,3 @@
1546 fn main() {
1547+    println!("debug");
1548 }
1549"#;
1550
1551        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);
1552
1553        // Should have transformed the patches
1554        assert_ne!(
1555            new_src, source,
1556            "Source should be modified for pure insertion"
1557        );
1558        assert_ne!(
1559            new_tgt, target,
1560            "Target should be modified for pure insertion"
1561        );
1562        assert!(cursor.is_some(), "Cursor should be set");
1563
1564        // Source should now have a partial addition
1565        let src_patch = Patch::parse_unified_diff(&new_src);
1566        assert!(
1567            src_patch.stats().added > 0,
1568            "Source should have added lines"
1569        );
1570
1571        // Target should have both a deletion (of partial) and addition (of full)
1572        let tgt_patch = Patch::parse_unified_diff(&new_tgt);
1573        assert!(
1574            tgt_patch.stats().removed > 0,
1575            "Target should have removed lines (partial)"
1576        );
1577        assert!(
1578            tgt_patch.stats().added > 0,
1579            "Target should have added lines (full)"
1580        );
1581
1582        // The cursor should be in test.rs
1583        let cursor = cursor.unwrap();
1584        assert_eq!(cursor.file, "test.rs");
1585    }
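
    #[test]
    fn test_imitate_human_edits_deterministic_for_fixed_seed() {
        // The fixed-seed assertions in these tests implicitly assume that all
        // randomness in imitate_human_edits is derived from the seed argument.
        // This sketch makes that assumption explicit: identical inputs and an
        // identical seed should yield identical output.
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello");
 }
"#;

        let first = imitate_human_edits(source, target, 42);
        let second = imitate_human_edits(source, target, 42);

        assert_eq!(first.0, second.0, "Source patch should be reproducible");
        assert_eq!(first.1, second.1, "Target patch should be reproducible");
        assert_eq!(first.2, second.2, "Cursor position should be reproducible");
    }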
1586
1587    #[test]
1588    fn test_imitate_human_edits_pure_insertion_empty_source() {
1589        // Source patch has no hunks at all
1590        let source = "";
1591        let target = r#"--- a/test.rs
1592+++ b/test.rs
1593@@ -1,2 +1,3 @@
1594 fn main() {
1595+    println!("hello");
1596 }
1597"#;
1598
1599        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 123);
1600
1601        // Should have created a source patch with partial insertion
1602        assert!(!new_src.is_empty(), "Source should not be empty");
1603        assert!(cursor.is_some(), "Cursor should be set");
1604
1605        let src_patch = Patch::parse_unified_diff(&new_src);
1606        assert!(
1607            src_patch.stats().added > 0,
1608            "Source should have added lines"
1609        );
1610    }
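
    #[test]
    fn test_imitate_human_edits_cursor_points_into_edited_file() {
        // Hedged property check: this fixture touches only test.rs, so any
        // cursor produced for an intermediate state can only refer to that
        // file. Sweeps a handful of seeds, like the test below.
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello");
 }
"#;

        for seed in 0..20 {
            let (_new_src, _new_tgt, cursor) = imitate_human_edits(source, target, seed);
            if let Some(cursor) = cursor {
                assert_eq!(cursor.file, "test.rs", "Cursor should be in the edited file");
            }
        }
    }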
1611
1612    #[test]
1613    fn test_imitate_human_edits_pure_insertion_intermediate_content() {
1614        // Verify the actual intermediate content is a realistic partial typing state
1615        let source = "";
1616        let target = r#"--- a/test.rs
1617+++ b/test.rs
1618@@ -1,2 +1,3 @@
1619 fn main() {
1620+    println!("hello world");
1621 }
1622"#;
1623
1624        // Test with multiple seeds to see different split points
1625        let mut found_partial = false;
1626        for seed in 1..=50 {
1627            let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, seed);
1628
1629            if cursor.is_some() {
1630                let src_patch = Patch::parse_unified_diff(&new_src);
1631                let tgt_patch = Patch::parse_unified_diff(&new_tgt);
1632
1633                // Find the added line in source
1634                for hunk in &src_patch.hunks {
1635                    for line in &hunk.lines {
1636                        if let PatchLine::Addition(content) = line {
1637                            // The partial line should be a prefix of the full line
1638                            let full_line = "    println!(\"hello world\");";
1639                            if content != full_line && full_line.starts_with(content) {
1640                                found_partial = true;
1641
1642                                // Verify target has the partial as deletion
1643                                let mut has_deletion = false;
1644                                for tgt_hunk in &tgt_patch.hunks {
1645                                    for tgt_line in &tgt_hunk.lines {
1646                                        if let PatchLine::Deletion(del_content) = tgt_line {
1647                                            if del_content == content {
1648                                                has_deletion = true;
1649                                            }
1650                                        }
1651                                    }
1652                                }
1653                                assert!(
1654                                    has_deletion,
1655                                    "Target should have deletion of partial line"
1656                                );
1657                            }
1658                        }
1659                    }
1660                }
1661            }
1662        }
1663
1664        assert!(
1665            found_partial,
1666            "At least one seed should produce a partial intermediate state"
1667        );
1668    }
1669
1670    #[test]
1671    fn test_imitate_human_edits_inserts_after_last_source_edit() {
1672        // Regression test: intermediate content should appear after the last edit
1673        // in the source patch, not at the position of the first target edit.
1674        // This ensures the diff output correctly imitates human typing order.
1675        //
1676        // The bug was: when source has edits and target has a pure insertion,
1677        // the intermediate content was inserted at tgt_edit_loc.line_index_within_hunk
1678        // (position of first target edit) instead of after the last source edit.
1679        //
1680        // Source patch has edits at lines 1-4, target has a new edit at line 10
1681        // (different location to avoid the "same line" early return)
1682        let source = r#"--- a/test.py
1683+++ b/test.py
1684@@ -1,4 +1,5 @@
1685+import foo
1686 import bar
1687-import old
1688 import baz
1689+import qux
1690"#;
1691        // Target has a pure insertion in a separate hunk starting at line 10 (no overlap with the source hunk)
1692        let target = r#"--- a/test.py
1693+++ b/test.py
1694@@ -10,3 +10,4 @@
1695 def main():
1696+    print("hello world")
1697     pass
1698"#;
1699
1700        // Use a seed that produces a partial result
1701        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 42);
1702
1703        // The function should produce a modified patch
1704        assert!(cursor.is_some(), "Should produce intermediate state");
1705
1706        let src_patch = Patch::parse_unified_diff(&new_src);
1707        let all_additions: Vec<_> = src_patch
1708            .hunks
1709            .iter()
1710            .flat_map(|h| h.lines.iter())
1711            .filter_map(|l| match l {
1712                PatchLine::Addition(s) => Some(s.as_str()),
1713                _ => None,
1714            })
1715            .collect();
1716
1717        // The intermediate content (partial 'print("hello world")') should be
1718        // the LAST addition, appearing after "+import qux" (the last source edit)
1719        let last_addition = all_additions.last().expect("Should have additions");
1720        assert!(
1721            last_addition.trim_start().starts_with("pr"),
1722            "Intermediate content should be the last addition (partial 'print'), but last was: {:?}",
1723            last_addition
1724        );
1725
1726        // Verify the original source edits are still in order before the intermediate
1727        let foo_pos = all_additions.iter().position(|s| *s == "import foo");
1728        let qux_pos = all_additions.iter().position(|s| *s == "import qux");
1729        let intermediate_pos = all_additions
1730            .iter()
1731            .position(|s| s.trim_start().starts_with("pr"));
1732
1733        assert!(foo_pos.is_some(), "Should have 'import foo'");
1734        assert!(qux_pos.is_some(), "Should have 'import qux'");
1735        assert!(
1736            intermediate_pos.is_some(),
1737            "Should have intermediate content"
1738        );
1739
1740        assert!(
1741            foo_pos < qux_pos && qux_pos < intermediate_pos,
1742            "Order should be: foo < qux < intermediate. Got foo={:?}, qux={:?}, intermediate={:?}",
1743            foo_pos,
1744            qux_pos,
1745            intermediate_pos
1746        );
1747    }
1748
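    #[test]
    fn test_floor_char_boundary_multibyte() {
        // Directly exercises the UTF-8 boundary fallback that the excerpt
        // test below depends on. '第' occupies bytes 0..3, so any index that
        // lands inside it should be floored to the previous boundary.
        assert_eq!(floor_char_boundary("第", 0), 0); // Already a boundary
        assert_eq!(floor_char_boundary("第", 1), 0); // Inside '第', floored to 0
        assert_eq!(floor_char_boundary("第", 2), 0); // Inside '第', floored to 0
        assert_eq!(floor_char_boundary("第", 3), 3); // End of string
        assert_eq!(floor_char_boundary("a第", 2), 1); // Floored to the boundary after 'a'
        assert_eq!(floor_char_boundary("abc", 10), 3); // Past the end clamps to len
    }
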
1749    #[test]
1750    fn test_cursor_excerpt_with_multibyte_utf8() {
1751        // Test that cursor excerpt handles multi-byte UTF-8 characters correctly
1752        // The Chinese character '第' is 3 bytes (0..3)
1753        let cursor = CursorPosition {
1754            file: "test.md".to_string(),
1755            line: 1,
1756            column: 1, // Byte index 1 is inside '第' (bytes 0..3)
1757        };
1758
1759        let source_patch = r#"--- a/test.md
1760+++ b/test.md
1761@@ -1,1 +1,1 @@
1762+第 14 章 Flask 工作原理与机制解析**
1763"#;
1764
1765        let target_patch = "";
1766
1767        // This should not panic even though column=1 is not a char boundary
1768        let result = get_cursor_excerpt(&cursor, source_patch, target_patch);
1769
1770        // The function should handle the invalid byte index gracefully
1771        if let Some(excerpt) = result {
1772            assert!(
1773                excerpt.contains("<|user_cursor|>"),
1774                "Cursor excerpt should contain marker"
1775            );
1776            // The marker should be placed at a valid character boundary
1777            // (either at the start or after '第')
1778        }
1779    }
    }
1780}