Add `ep split-commit` command (#46067)

Oleksiy Syvokon created

Generates a training or evaluation example from a
chronologically-ordered commit. This is a port from the Python codebase
(except for reorder_patch.rs, which was originally written in Rust).


Release Notes:

- N/A

Change summary

Cargo.lock                                      |    8 
crates/edit_prediction_cli/Cargo.toml           |    4 
crates/edit_prediction_cli/src/main.rs          |   17 
crates/edit_prediction_cli/src/reorder_patch.rs | 1462 ++++++++++++++++++
crates/edit_prediction_cli/src/split_commit.rs  | 1465 +++++++++++++++++++
typos.toml                                      |    2 
6 files changed, 2,956 insertions(+), 2 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -5295,12 +5295,14 @@ dependencies = [
  "pretty_assertions",
  "project",
  "prompt_store",
+ "rand 0.9.2",
  "release_channel",
  "reqwest_client",
  "serde",
  "serde_json",
  "settings",
  "shellexpand 2.1.2",
+ "similar",
  "smol",
  "sqlez",
  "sqlez_macros",
@@ -15077,6 +15079,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
 
+[[package]]
+name = "similar"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa"
+
 [[package]]
 name = "simple_asn1"
 version = "0.6.3"

crates/edit_prediction_cli/Cargo.toml 🔗

@@ -17,7 +17,7 @@ anyhow.workspace = true
 anthropic.workspace = true
 http_client.workspace = true
 chrono.workspace = true
-clap.workspace = true
+clap = "4"
 client.workspace = true
 cloud_llm_client.workspace= true
 collections.workspace = true
@@ -55,6 +55,8 @@ watch.workspace = true
 edit_prediction = { workspace = true, features = ["cli-support"] }
 wasmtime.workspace = true
 zeta_prompt.workspace = true
+rand.workspace = true
+similar = "2.7.0"
 
 # Wasmtime is included as a dependency in order to enable the same
 # features that are enabled in Zed.

crates/edit_prediction_cli/src/main.rs 🔗

@@ -9,8 +9,10 @@ mod metrics;
 mod paths;
 mod predict;
 mod progress;
+mod reorder_patch;
 mod retrieve_context;
 mod score;
+mod split_commit;
 mod synthesize;
 
 use clap::{Args, CommandFactory, Parser, Subcommand, ValueEnum};
@@ -30,6 +32,7 @@ use crate::predict::run_prediction;
 use crate::progress::Progress;
 use crate::retrieve_context::run_context_retrieval;
 use crate::score::run_scoring;
+use crate::split_commit::SplitCommitArgs;
 use crate::synthesize::{SynthesizeConfig, run_synthesize};
 
 #[derive(Parser, Debug)]
@@ -74,6 +77,8 @@ enum Command {
     Synthesize(SynthesizeArgs),
     /// Remove git repositories and worktrees
     Clean,
+    /// Generate an evaluation example by splitting a chronologically-ordered commit
+    SplitCommit(SplitCommitArgs),
 }
 
 impl Display for Command {
@@ -127,6 +132,7 @@ impl Display for Command {
                 write!(f, "synthesize --repo={}", args.repo)
             }
             Command::Clean => write!(f, "clean"),
+            Command::SplitCommit(_) => write!(f, "split-commit"),
         }
     }
 }
@@ -235,6 +241,13 @@ fn main() {
             });
             return;
         }
+        Command::SplitCommit(split_commit_args) => {
+            if let Err(error) = split_commit::run_split_commit(split_commit_args) {
+                eprintln!("{error:#}");
+                std::process::exit(1);
+            }
+            return;
+        }
         _ => {}
     }
 
@@ -302,7 +315,9 @@ fn main() {
                                         run_scoring(example, &args, app_state.clone(), cx.clone())
                                             .await?;
                                     }
-                                    Command::Clean | Command::Synthesize(_) => {
+                                    Command::Clean
+                                    | Command::Synthesize(_)
+                                    | Command::SplitCommit(_) => {
                                         unreachable!()
                                     }
                                 }

crates/edit_prediction_cli/src/reorder_patch.rs 🔗

@@ -0,0 +1,1462 @@
+#![allow(unused)]
+
+use std::collections::{BTreeMap, BTreeSet, HashMap};
+
+/// Reorder selected groups of edits (additions & deletions) into a new patch.
+///
+/// Intuition:
+/// Think of the original patch as a timeline of atomic edit indices (0..N),
+/// where one edit is one deleted or inserted line.
+/// This function recombines these edits into a new patch which can be thought
+/// of as a sequence of patches.
+///
+/// You provide `edits_order` describing logical chunks (e.g., "write a feature",
+/// "refactor", "add tests"). For each group the function:
+///  1. Extracts those edits
+///  2. Appends them to the output patch
+///  3. Removes them from an internal remainder so subsequent original indices
+///     still point to the right (yet-to-be-extracted) edits.
+///
+/// The returned `Patch` contains only the edits you listed, emitted group by
+/// group. The leftover remainder is discarded.
+///
+/// Parameters:
+/// * `patch` - Source patch
+/// * `edits_order` - Vector of sets of original (0-based) edit indexes
+///
+/// Returns:
+/// * A new `Patch` containing the grouped edits in the requested order.
+///
+/// Example:
+/// ```rust
+/// use std::collections::BTreeSet;
+/// use reorder_patch::{Patch, reorder_edits};
+///
+/// // Edits (indexes): 0:-old, 1:+new, 2:-old2, 3:+new2, 4:+added
+/// let diff = "\
+/// --- a/a.txt
+/// +++ b/a.txt
+/// @@ -1,3 +1,3 @@
+///  one
+/// -old
+/// +new
+///  end
+/// @@ -5,3 +5,4 @@
+///  tail
+/// -old2
+/// +new2
+/// +added
+///  fin
+/// ";
+/// let patch = Patch::parse_unified_diff(diff);
+///
+/// // First take the part of the second hunk's edits (2),
+/// // then the first hunk (0,1), then the rest of the second hunk (3,4)
+/// let order = vec![BTreeSet::from([2]), BTreeSet::from([0, 1]), BTreeSet::from([3, 4])];
+/// let reordered = reorder_edits(&patch, order);
+/// println!("{}", reordered.to_string());
+/// ```
+pub fn reorder_edits(patch: &Patch, edits_order: Vec<BTreeSet<usize>>) -> Patch {
+    let mut result = Patch {
+        header: patch.header.clone(),
+        hunks: Vec::new(),
+    };
+
+    let mut remainder = patch.clone();
+
+    // Indexes in `edits_order` will shift as we apply edits.
+    // This structure maps the original index to the actual index.
+    let stats = patch.stats();
+    let total_edits = stats.added + stats.removed;
+    let mut indexes_map = BTreeMap::from_iter((0..total_edits).map(|i| (i, Some(i))));
+
+    for patch_edits_order in edits_order {
+        // Skip duplicated indexes that were already processed
+        let patch_edits_order = patch_edits_order
+            .into_iter()
+            .filter(|&i| indexes_map[&i].is_some()) // skip duplicated indexes
+            .collect::<BTreeSet<_>>();
+
+        if patch_edits_order.is_empty() {
+            continue;
+        }
+
+        let order = patch_edits_order
+            .iter()
+            .map(|&i| {
+                indexes_map[&i].unwrap_or_else(|| panic!("Edit index {i} has been already used. Perhaps your spec contains duplicates"))
+            })
+            .collect::<BTreeSet<_>>();
+
+        let extracted;
+        (extracted, remainder) = extract_edits(&remainder, &order);
+
+        result.hunks.extend(extracted.hunks);
+
+        // Update indexes_map to reflect applied edits. For example:
+        //
+        // Original_index | Removed?  | Mapped_value
+        //       0        | false     | 0
+        //       1        | true      | None
+        //       2        | true      | None
+        //       3        | false     | 1
+
+        for index in patch_edits_order {
+            indexes_map.insert(index, None);
+            for j in (index + 1)..total_edits {
+                if let Some(val) = indexes_map[&j] {
+                    indexes_map.insert(j, Some(val - 1));
+                }
+            }
+        }
+    }
+
+    result
+}
+
+/// Split a patch into (extracted, remainder) based on a set of edit indexes.
+/// The first returned patch contains only the chosen edits; the second contains
+/// everything else with those edits applied (converted into context).
+pub fn extract_edits(patch: &Patch, edit_indexes: &BTreeSet<usize>) -> (Patch, Patch) {
+    let mut extracted = patch.clone();
+    let mut remainder = patch.clone();
+
+    let stats = patch.stats();
+    let num_edits = stats.added + stats.removed;
+    let this_edits = edit_indexes.iter().cloned().collect::<Vec<_>>();
+    let other_edits = (0..num_edits)
+        .filter(|i| !edit_indexes.contains(i))
+        .collect();
+
+    remove_edits(&mut extracted, other_edits);
+    apply_edits(&mut remainder, this_edits);
+
+    (extracted, remainder)
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct Patch {
+    pub header: String,
+    pub hunks: Vec<Hunk>,
+}
+
+pub struct DiffStats {
+    pub added: usize,
+    pub removed: usize,
+}
+
+impl ToString for Patch {
+    fn to_string(&self) -> String {
+        let mut result = self.header.clone();
+        for hunk in &self.hunks {
+            let current_file = hunk.filename.clone();
+            result.push_str(&format!("--- a/{}\n", current_file));
+            result.push_str(&format!("+++ b/{}\n", current_file));
+            result.push_str(&hunk.to_string());
+        }
+
+        result
+    }
+}
+
+impl Patch {
+    /// Parse a unified diff (git style) string into a `Patch`.
+    pub fn parse_unified_diff(unified_diff: &str) -> Patch {
+        let mut current_file = String::new();
+        let mut is_filename_inherited = false;
+        let mut hunk = Hunk::default();
+        let mut patch = Patch::default();
+        let mut in_header = true;
+
+        for line in unified_diff.lines() {
+            if line.starts_with("--- ") || line.starts_with("+++ ") || line.starts_with("@@") {
+                in_header = false;
+            }
+
+            if in_header {
+                patch.header.push_str(format!("{}\n", &line).as_ref());
+                continue;
+            }
+
+            if line.starts_with("@@") {
+                if !hunk.lines.is_empty() {
+                    patch.hunks.push(hunk);
+                }
+                hunk = Hunk::from_header(line, &current_file, is_filename_inherited);
+                is_filename_inherited = true;
+            } else if let Some(path) = line.strip_prefix("--- ") {
+                is_filename_inherited = false;
+                current_file = path.trim().strip_prefix("a/").unwrap_or(path).into();
+            } else if let Some(path) = line.strip_prefix("+++ ") {
+                is_filename_inherited = false;
+                current_file = path.trim().strip_prefix("b/").unwrap_or(path).into();
+            } else if let Some(line) = line.strip_prefix("+") {
+                hunk.lines.push(PatchLine::Addition(line.to_string()));
+            } else if let Some(line) = line.strip_prefix("-") {
+                hunk.lines.push(PatchLine::Deletion(line.to_string()));
+            } else if let Some(line) = line.strip_prefix(" ") {
+                hunk.lines.push(PatchLine::Context(line.to_string()));
+            } else {
+                hunk.lines.push(PatchLine::Garbage(line.to_string()));
+            }
+        }
+
+        if !hunk.lines.is_empty() {
+            patch.hunks.push(hunk);
+        }
+
+        let header_lines = patch.header.lines().collect::<Vec<&str>>();
+        let len = header_lines.len();
+        if len >= 2 {
+            if header_lines[len - 2].starts_with("diff --git")
+                && header_lines[len - 1].starts_with("index ")
+            {
+                patch.header = header_lines[..len - 2].join("\n") + "\n";
+            }
+        }
+        if patch.header.trim().is_empty() {
+            patch.header = String::new();
+        }
+
+        patch
+    }
+
+    /// Drop hunks that contain no additions or deletions.
+    pub fn remove_empty_hunks(&mut self) {
+        self.hunks.retain(|hunk| {
+            hunk.lines
+                .iter()
+                .any(|line| matches!(line, PatchLine::Addition(_) | PatchLine::Deletion(_)))
+        });
+    }
+
+    /// Make sure there are no more than `context_lines` lines of context around each change.
+    pub fn normalize_hunks(&mut self, context_lines: usize) {
+        for hunk in &mut self.hunks {
+            // Find indices of all changes (additions and deletions)
+            let change_indices: Vec<usize> = hunk
+                .lines
+                .iter()
+                .enumerate()
+                .filter_map(|(i, line)| match line {
+                    PatchLine::Addition(_) | PatchLine::Deletion(_) => Some(i),
+                    _ => None,
+                })
+                .collect();
+
+            // If there are no changes, clear the hunk (it's all context)
+            if change_indices.is_empty() {
+                hunk.lines.clear();
+                hunk.old_count = 0;
+                hunk.new_count = 0;
+                continue;
+            }
+
+            // Determine the range to keep
+            let first_change = change_indices[0];
+            let last_change = change_indices[change_indices.len() - 1];
+
+            let start = first_change.saturating_sub(context_lines);
+            let end = (last_change + context_lines + 1).min(hunk.lines.len());
+
+            // Count lines trimmed from the beginning
+            let (old_lines_before, new_lines_before) = count_lines(&hunk.lines[0..start]);
+
+            // Keep only the lines in range + garbage
+            let garbage_before = hunk.lines[..start]
+                .iter()
+                .filter(|line| matches!(line, PatchLine::Garbage(_)));
+            let garbage_after = hunk.lines[end..]
+                .iter()
+                .filter(|line| matches!(line, PatchLine::Garbage(_)));
+
+            hunk.lines = garbage_before
+                .chain(hunk.lines[start..end].iter())
+                .chain(garbage_after)
+                .cloned()
+                .collect();
+
+            // Update hunk header
+            let (old_count, new_count) = count_lines(&hunk.lines);
+            hunk.old_start += old_lines_before as isize;
+            hunk.new_start += new_lines_before as isize;
+            hunk.old_count = old_count as isize;
+            hunk.new_count = new_count as isize;
+        }
+    }
+
+    /// Count total added and removed lines
+    pub fn stats(&self) -> DiffStats {
+        let mut added = 0;
+        let mut removed = 0;
+
+        for hunk in &self.hunks {
+            for line in &hunk.lines {
+                match line {
+                    PatchLine::Addition(_) => added += 1,
+                    PatchLine::Deletion(_) => removed += 1,
+                    _ => {}
+                }
+            }
+        }
+
+        DiffStats { added, removed }
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct Hunk {
+    pub old_start: isize,
+    pub old_count: isize,
+    pub new_start: isize,
+    pub new_count: isize,
+    pub comment: String,
+    pub filename: String,
+    pub is_filename_inherited: bool,
+    pub lines: Vec<PatchLine>,
+}
+
+impl ToString for Hunk {
+    fn to_string(&self) -> String {
+        let header = self.header_string();
+        let lines = self
+            .lines
+            .iter()
+            .map(|line| line.to_string() + "\n")
+            .collect::<Vec<String>>()
+            .join("");
+        format!("{header}\n{lines}")
+    }
+}
+
+impl Hunk {
+    /// Render the hunk header
+    pub fn header_string(&self) -> String {
+        format!(
+            "@@ -{},{} +{},{} @@ {}",
+            self.old_start,
+            self.old_count,
+            self.new_start,
+            self.new_count,
+            self.comment.clone()
+        )
+        .trim_end()
+        .into()
+    }
+
+    /// Create a `Hunk` from a raw header line and associated filename.
+    pub fn from_header(header: &str, filename: &str, is_filename_inherited: bool) -> Self {
+        let (old_start, old_count, new_start, new_count, comment) = Self::parse_hunk_header(header);
+        Self {
+            old_start,
+            old_count,
+            new_start,
+            new_count,
+            comment,
+            filename: filename.to_string(),
+            is_filename_inherited,
+            lines: Vec::new(),
+        }
+    }
+
+    /// Parse hunk headers like `@@ -3,2 +3,2 @@ some garbage`
+    fn parse_hunk_header(line: &str) -> (isize, isize, isize, isize, String) {
+        let header_part = line.trim_start_matches("@@").trim();
+        let parts: Vec<&str> = header_part.split_whitespace().collect();
+
+        if parts.len() < 2 {
+            return (0, 0, 0, 0, String::new());
+        }
+
+        let old_part = parts[0].trim_start_matches('-');
+        let new_part = parts[1].trim_start_matches('+');
+
+        let (old_start, old_count) = Hunk::parse_hunk_header_range(old_part);
+        let (new_start, new_count) = Hunk::parse_hunk_header_range(new_part);
+
+        let comment = if parts.len() > 2 {
+            parts[2..]
+                .join(" ")
+                .trim_start_matches("@@")
+                .trim()
+                .to_string()
+        } else {
+            String::new()
+        };
+
+        (
+            old_start as isize,
+            old_count as isize,
+            new_start as isize,
+            new_count as isize,
+            comment,
+        )
+    }
+
+    fn parse_hunk_header_range(part: &str) -> (usize, usize) {
+        let (old_start, old_count) = if part.contains(',') {
+            let old_parts: Vec<&str> = part.split(',').collect();
+            (
+                old_parts[0].parse().unwrap_or(0),
+                old_parts[1].parse().unwrap_or(0),
+            )
+        } else {
+            (part.parse().unwrap_or(0), 1)
+        };
+        (old_start, old_count)
+    }
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum PatchLine {
+    Context(String),
+    Addition(String),
+    Deletion(String),
+    HunkHeader(usize, usize, usize, usize, String),
+    FileStartMinus(String),
+    FileStartPlus(String),
+    Garbage(String),
+}
+
+impl PatchLine {
+    pub fn parse(line: &str) -> Self {
+        if let Some(line) = line.strip_prefix("+") {
+            Self::Addition(line.to_string())
+        } else if let Some(line) = line.strip_prefix("-") {
+            Self::Deletion(line.to_string())
+        } else if let Some(line) = line.strip_prefix(" ") {
+            Self::Context(line.to_string())
+        } else {
+            Self::Garbage(line.to_string())
+        }
+    }
+}
+
+impl ToString for PatchLine {
+    fn to_string(&self) -> String {
+        match self {
+            PatchLine::Context(line) => format!(" {}", line),
+            PatchLine::Addition(line) => format!("+{}", line),
+            PatchLine::Deletion(line) => format!("-{}", line),
+            PatchLine::HunkHeader(old_start, old_end, new_start, new_end, comment) => format!(
+                "@@ -{},{} +{},{} @@ {}",
+                old_start, old_end, new_start, new_end, comment
+            )
+            .trim_end()
+            .into(),
+            PatchLine::FileStartMinus(filename) => format!("--- {}", filename),
+            PatchLine::FileStartPlus(filename) => format!("+++ {}", filename),
+            PatchLine::Garbage(line) => line.to_string(),
+        }
+    }
+}
+
+///
+/// Removes specified edits from a patch by their indexes and adjusts line numbers accordingly.
+///
+/// This function removes edits (additions and deletions) from the patch as they never were made.
+/// The resulting patch is adjusted to maintain correctness.
+///
+/// # Arguments
+///
+/// * `patch` - A patch to modify
+/// * `edit_indexes` - A vector of edit indexes to remove (0-based, counting only additions and deletions)
+///
+pub fn remove_edits(patch: &mut Patch, edit_indexes: Vec<usize>) {
+    let mut current_edit_index: isize = -1;
+    let mut new_start_delta_by_file: HashMap<String, isize> = HashMap::new();
+
+    for hunk in &mut patch.hunks {
+        if !hunk.is_filename_inherited {
+            new_start_delta_by_file.insert(hunk.filename.clone(), 0);
+        }
+        let delta = new_start_delta_by_file
+            .entry(hunk.filename.clone())
+            .or_insert(0);
+        hunk.new_start += *delta;
+
+        hunk.lines = hunk
+            .lines
+            .drain(..)
+            .filter_map(|line| {
+                let is_edit = matches!(line, PatchLine::Addition(_) | PatchLine::Deletion(_));
+                if is_edit {
+                    current_edit_index += 1;
+                    if !edit_indexes.contains(&(current_edit_index as usize)) {
+                        return Some(line);
+                    }
+                }
+                match line {
+                    PatchLine::Addition(_) => {
+                        hunk.new_count -= 1;
+                        *delta -= 1;
+                        None
+                    }
+                    PatchLine::Deletion(content) => {
+                        hunk.new_count += 1;
+                        *delta += 1;
+                        Some(PatchLine::Context(content))
+                    }
+                    _ => Some(line),
+                }
+            })
+            .collect();
+    }
+
+    patch.normalize_hunks(3);
+    patch.remove_empty_hunks();
+}
+
+///
+/// Apply specified edits in the patch.
+///
+/// This generates another patch that looks like selected edits are already made
+/// and became part of the context
+///
+/// See also: `remove_edits()`
+///
+pub fn apply_edits(patch: &mut Patch, edit_indexes: Vec<usize>) {
+    let mut current_edit_index: isize = -1;
+    let mut delta_by_file: HashMap<String, isize> = HashMap::new();
+
+    for hunk in &mut patch.hunks {
+        if !hunk.is_filename_inherited {
+            delta_by_file.insert(hunk.filename.clone(), 0);
+        }
+        let delta = delta_by_file.entry(hunk.filename.clone()).or_insert(0);
+        hunk.old_start += *delta;
+
+        hunk.lines = hunk
+            .lines
+            .drain(..)
+            .filter_map(|line| {
+                let is_edit = matches!(line, PatchLine::Addition(_) | PatchLine::Deletion(_));
+                if is_edit {
+                    current_edit_index += 1;
+                    if !edit_indexes.contains(&(current_edit_index as usize)) {
+                        return Some(line);
+                    }
+                }
+                match line {
+                    PatchLine::Addition(content) => {
+                        hunk.old_count += 1;
+                        *delta += 1;
+                        Some(PatchLine::Context(content))
+                    }
+                    PatchLine::Deletion(_) => {
+                        hunk.old_count -= 1;
+                        *delta -= 1;
+                        None
+                    }
+                    _ => Some(line),
+                }
+            })
+            .collect();
+    }
+
+    patch.normalize_hunks(3);
+    patch.remove_empty_hunks();
+}
+
+/// Parse an order specification text into groups of edit indexes.
+/// Supports numbers, ranges (a-b), commas, comments starting with `//`, and blank lines.
+///
+/// # Example spec
+///
+/// // Add new dependency
+/// 1, 49
+///
+/// // Add new imports and types
+/// 8-9, 51
+///
+/// // Add new struct and methods
+/// 10-47
+///
+/// // Update tests
+/// 48, 50
+///
+pub fn parse_order_spec(spec: &str) -> Vec<BTreeSet<usize>> {
+    let mut order = Vec::new();
+
+    for line in spec.lines() {
+        let line = line.trim();
+
+        // Skip empty lines and comments
+        if line.is_empty() || line.starts_with("//") {
+            continue;
+        }
+
+        // Parse the line into a BTreeSet
+        let mut set = BTreeSet::new();
+
+        for part in line.split(',') {
+            let part = part.trim();
+
+            if part.contains('-') {
+                // Handle ranges like "8-9" or "10-47"
+                let range_parts: Vec<&str> = part.split('-').collect();
+                if range_parts.len() == 2 {
+                    if let (Ok(start), Ok(end)) = (
+                        range_parts[0].parse::<usize>(),
+                        range_parts[1].parse::<usize>(),
+                    ) {
+                        for i in start..=end {
+                            set.insert(i);
+                        }
+                    } else {
+                        eprintln!("Warning: Invalid range format '{}'", part);
+                    }
+                } else {
+                    eprintln!("Warning: Invalid range format '{}'", part);
+                }
+            } else {
+                // Handle single numbers
+                if let Ok(num) = part.parse::<usize>() {
+                    set.insert(num);
+                } else {
+                    eprintln!("Warning: Invalid number format '{}'", part);
+                }
+            }
+        }
+
+        if !set.is_empty() {
+            order.push(set);
+        }
+    }
+
+    order
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct EditLocation {
+    pub filename: String,
+    pub source_line_number: usize,
+    pub target_line_number: usize,
+    pub patch_line: PatchLine,
+    pub hunk_index: usize,
+    pub line_index_within_hunk: usize,
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum EditType {
+    Deletion,
+    Insertion,
+}
+
+pub fn locate_edited_line(patch: &Patch, mut edit_index: isize) -> Option<EditLocation> {
+    let mut edit_locations = vec![];
+
+    for (hunk_index, hunk) in patch.hunks.iter().enumerate() {
+        let mut old_line_number = hunk.old_start;
+        let mut new_line_number = hunk.new_start;
+        for (line_index, line) in hunk.lines.iter().enumerate() {
+            if matches!(line, PatchLine::Context(_)) {
+                old_line_number += 1;
+                new_line_number += 1;
+                continue;
+            }
+
+            if !matches!(line, PatchLine::Addition(_) | PatchLine::Deletion(_)) {
+                continue;
+            }
+
+            // old  new
+            //  1    1       context
+            //  2    2       context
+            //  3    3      -deleted
+            //  4    3      +insert
+            //  4    4       more context
+            //
+            // old   new
+            //  1     1      context
+            //  2     2      context
+            //  3     3     +inserted
+            //  3     4      more context
+            //
+            // old  new
+            //  1    1      -deleted
+            //
+            // old  new
+            //  1    1       context
+            //  2    2       context
+            //  3    3      -deleted
+            //  4    3       more context
+
+            edit_locations.push(EditLocation {
+                filename: hunk.filename.clone(),
+                source_line_number: old_line_number as usize,
+                target_line_number: new_line_number as usize,
+                patch_line: line.clone(),
+                hunk_index,
+                line_index_within_hunk: line_index,
+            });
+
+            match line {
+                PatchLine::Addition(_) => new_line_number += 1,
+                PatchLine::Deletion(_) => old_line_number += 1,
+                PatchLine::Context(_) => (),
+                _ => (),
+            };
+        }
+    }
+
+    if edit_index < 0 {
+        edit_index += edit_locations.len() as isize; // take from end
+    }
+    (0..edit_locations.len())
+        .contains(&(edit_index as usize))
+        .then(|| edit_locations.swap_remove(edit_index as usize)) // remove to take ownership
+}
+//
+// Helper function to count old and new lines
+fn count_lines(lines: &[PatchLine]) -> (usize, usize) {
+    lines.iter().fold((0, 0), |(old, new), line| match line {
+        PatchLine::Context(_) => (old + 1, new + 1),
+        PatchLine::Deletion(_) => (old + 1, new),
+        PatchLine::Addition(_) => (old, new + 1),
+        _ => (old, new),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use indoc::indoc;
+
+    #[test]
+    fn test_parse_unified_diff() {
+        let patch_str = indoc! {"
+            Patch header
+            ============
+
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+
+            Some garbage
+
+            diff --git a/second.txt b/second.txt
+            index 86c770d..a1fd855 100644
+            --- a/second.txt
+            +++ b/second.txt
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+        "};
+        let patch = Patch::parse_unified_diff(patch_str);
+
+        assert_eq!(patch.header, "Patch header\n============\n\n");
+        assert_eq!(patch.hunks.len(), 3);
+        assert_eq!(patch.hunks[0].header_string(), "@@ -1,7 +1,7 @@");
+        assert_eq!(patch.hunks[1].header_string(), "@@ -9,6 +9,7 @@ gray");
+        assert_eq!(patch.hunks[2].header_string(), "@@ -16,4 +17,3 @@ red");
+        assert_eq!(patch.hunks[0].is_filename_inherited, false);
+        assert_eq!(patch.hunks[1].is_filename_inherited, false);
+        assert_eq!(patch.hunks[2].is_filename_inherited, false);
+    }
+
+    #[test]
+    fn test_locate_edited_line() {
+        // Edits are numbered globally across all hunks, in patch order:
+        // 0 = `-blue`, 1 = `+dark blue`, 2 = `+silver`, 3 = `-zinc`.
+        let patch_str = indoc! {"
+            Patch header
+            ============
+
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+            diff --git a/second.txt b/second.txt
+            index 86c770d..a1fd855 100644
+            --- a/second.txt
+            +++ b/second.txt
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+        "};
+        let patch = Patch::parse_unified_diff(patch_str);
+
+        // Each location resolves the edit to its file, source/target line
+        // numbers, owning hunk, and position within that hunk.
+        assert_eq!(
+            locate_edited_line(&patch, 0), // -blue
+            Some(EditLocation {
+                filename: "text.txt".to_string(),
+                source_line_number: 4,
+                target_line_number: 4,
+                patch_line: PatchLine::Deletion("blue".to_string()),
+                hunk_index: 0,
+                line_index_within_hunk: 3
+            })
+        );
+        assert_eq!(
+            locate_edited_line(&patch, 1), // +dark blue
+            Some(EditLocation {
+                filename: "text.txt".to_string(),
+                source_line_number: 5,
+                target_line_number: 4,
+                patch_line: PatchLine::Addition("dark blue".to_string()),
+                hunk_index: 0,
+                line_index_within_hunk: 4
+            })
+        );
+        assert_eq!(
+            locate_edited_line(&patch, 2), // +silver
+            Some(EditLocation {
+                filename: "second.txt".to_string(),
+                source_line_number: 12,
+                target_line_number: 12,
+                patch_line: PatchLine::Addition("silver".to_string()),
+                hunk_index: 1,
+                line_index_within_hunk: 3
+            })
+        );
+    }
+
+    // Tests for `remove_edits`: dropping selected edits from a patch and
+    // renumbering the hunks that follow them.
+    mod remove_edits {
+        use super::*;
+        use indoc::indoc;
+        use pretty_assertions::assert_eq;
+
+        static PATCH: &'static str = indoc! {"
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+        "};
+
+        #[test]
+        fn test_removes_hunks_without_edits() {
+            // Remove the first two edits:
+            // -blue
+            // +dark blue
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            remove_edits(&mut patch, vec![0, 1]);
+
+            // The whole hunk should be removed since there are no other edits in it
+            let actual = patch.to_string();
+            let expected = indoc! {"
+                --- a/text.txt
+                +++ b/text.txt
+                @@ -9,6 +9,7 @@ gray
+                 green
+                 indigo
+                 magenta
+                +silver
+                 orange
+                 pink
+                 purple
+                --- a/text.txt
+                +++ b/text.txt
+                @@ -16,4 +17,3 @@ red
+                 violet
+                 white
+                 yellow
+                -zinc
+            "};
+            assert_eq!(actual, String::from(expected));
+        }
+
+        #[test]
+        fn test_adjust_line_numbers_after_deletion() {
+            // Remove the first deletion (`-blue`)
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            remove_edits(&mut patch, vec![0]);
+
+            // The line numbers should be adjusted in the subsequent hunks
+            println!("{}", &patch.to_string());
+            assert_eq!(patch.hunks[0].header_string(), "@@ -2,6 +2,7 @@");
+            assert_eq!(patch.hunks[1].header_string(), "@@ -9,6 +10,7 @@ gray");
+            assert_eq!(patch.hunks[2].header_string(), "@@ -16,4 +18,3 @@ red");
+        }
+        #[test]
+        fn test_adjust_line_numbers_after_insertion() {
+            // Remove the first insertion (`+dark blue`)
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            remove_edits(&mut patch, vec![1]);
+
+            // The line numbers should be adjusted in the subsequent hunks
+            assert_eq!(patch.hunks[0].header_string(), "@@ -1,7 +1,6 @@");
+            assert_eq!(patch.hunks[1].header_string(), "@@ -9,6 +8,7 @@ gray");
+            assert_eq!(patch.hunks[2].header_string(), "@@ -16,4 +16,3 @@ red");
+        }
+        #[test]
+        fn test_adjust_line_numbers_multifile_case() {
+            // Given a patch that spans multiple files
+            let patch_str = indoc! {"
+                --- a/first.txt
+                +++ b/first.txt
+                @@ -1,7 +1,7 @@
+                 azuere
+                 beige
+                 black
+                -blue
+                +dark blue
+                 brown
+                 cyan
+                 gold
+                @@ -16,4 +17,3 @@ red
+                 violet
+                 white
+                 yellow
+                -zinc
+                --- a/second.txt
+                +++ b/second.txt
+                @@ -9,6 +9,7 @@ gray
+                 green
+                 indigo
+                 magenta
+                +silver
+                 orange
+                 pink
+                 purple
+            "};
+
+            // When removing edit from one of the files (`+dark blue`)
+            let mut patch = Patch::parse_unified_diff(patch_str);
+            remove_edits(&mut patch, vec![1]);
+
+            // Then the line numbers should only be adjusted in subsequent hunks from that file
+            assert_eq!(patch.hunks[0].header_string(), "@@ -1,7 +1,6 @@"); // edited hunk
+            assert_eq!(patch.hunks[1].header_string(), "@@ -16,4 +16,3 @@ red"); // hunk from edited file again
+            assert_eq!(patch.hunks[2].header_string(), "@@ -9,6 +9,7 @@ gray"); // hunk from another file
+
+            // When removing hunk from `second.txt`
+            let mut patch = Patch::parse_unified_diff(patch_str);
+            remove_edits(&mut patch, vec![3]);
+
+            // Then patch serialization should list `first.txt` only once
+            // (because hunks from that file become adjacent)
+            let expected = indoc! {"
+                --- a/first.txt
+                +++ b/first.txt
+                @@ -1,7 +1,7 @@
+                 azuere
+                 beige
+                 black
+                -blue
+                +dark blue
+                 brown
+                 cyan
+                 gold
+                --- a/first.txt
+                +++ b/first.txt
+                @@ -16,4 +17,3 @@ red
+                 violet
+                 white
+                 yellow
+                -zinc
+            "};
+            assert_eq!(patch.to_string(), expected);
+        }
+
+        #[test]
+        fn test_dont_adjust_line_numbers_samefile_case() {
+            // Given a patch that has hunks in the same file, but with a file header
+            // (which makes `git apply` flush edits so far and start counting line numbers afresh)
+            let patch_str = indoc! {"
+                diff --git a/text.txt b/text.txt
+                index 86c770d..a1fd855 100644
+                --- a/text.txt
+                +++ b/text.txt
+                @@ -1,7 +1,7 @@
+                 azuere
+                 beige
+                 black
+                -blue
+                +dark blue
+                 brown
+                 cyan
+                 gold
+                --- a/text.txt
+                +++ b/text.txt
+                @@ -16,4 +16,3 @@ red
+                 violet
+                 white
+                 yellow
+                -zinc
+        "};
+
+            // When removing edit from one of the files (`+dark blue`)
+            let mut patch = Patch::parse_unified_diff(patch_str);
+            remove_edits(&mut patch, vec![1]);
+
+            // Then the line numbers should **not** be adjusted in a subsequent hunk,
+            // because it starts with a file header
+            assert_eq!(patch.hunks[0].header_string(), "@@ -1,7 +1,6 @@"); // edited hunk
+            assert_eq!(patch.hunks[1].header_string(), "@@ -16,4 +16,3 @@ red"); // subsequent hunk
+        }
+    }
+
+    // Tests for `apply_edits`: treating selected edits as already applied and
+    // adjusting the line numbers of the hunks that remain.
+    mod apply_edits {
+        use super::*;
+        use indoc::indoc;
+        use pretty_assertions::assert_eq;
+
+        // NOTE(review): unlike the other fixtures, the inner `--- a/…` and
+        // `+++ b/…` lines below are indented by one extra space, so the parser
+        // sees them as ordinary context lines rather than file headers —
+        // confirm that this is intentional.
+        static PATCH: &'static str = indoc! {"
+            diff --git a/text.txt b/text.txt
+            index 86c770d..a1fd855 100644
+            --- a/text.txt
+            +++ b/text.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+             --- a/text.txt
+             +++ b/text.txt
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+             --- a/text.txt
+             +++ b/text.txt
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+        "};
+
+        #[test]
+        fn test_removes_hunks_without_edits() {
+            // When applying the first two edits (`-blue`, `+dark blue`)
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            apply_edits(&mut patch, vec![0, 1]);
+
+            // Then the whole hunk should be removed since there are no other edits in it,
+            // and the line numbers should be adjusted in the subsequent hunks
+            assert_eq!(patch.hunks[0].header_string(), "@@ -9,6 +9,7 @@ gray");
+            assert_eq!(patch.hunks[1].header_string(), "@@ -16,4 +17,3 @@ red");
+            assert_eq!(patch.hunks.len(), 2);
+        }
+
+        #[test]
+        fn test_adjust_line_numbers_after_applying_deletion() {
+            // Apply the first deletion (`-blue`)
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            apply_edits(&mut patch, vec![0]);
+
+            // The line numbers should be adjusted
+            assert_eq!(patch.hunks[0].header_string(), "@@ -1,6 +1,7 @@");
+            assert_eq!(patch.hunks[1].header_string(), "@@ -8,6 +9,7 @@ gray");
+            assert_eq!(patch.hunks[2].header_string(), "@@ -15,4 +17,3 @@ red");
+        }
+        #[test]
+        fn test_adjust_line_numbers_after_applying_insertion() {
+            // Apply the first insertion (`+dark blue`)
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            apply_edits(&mut patch, vec![1]);
+
+            // The line numbers should be adjusted in the subsequent hunks
+            println!("{}", &patch.to_string());
+            assert_eq!(patch.hunks[0].header_string(), "@@ -1,7 +1,6 @@");
+            assert_eq!(patch.hunks[1].header_string(), "@@ -10,6 +9,7 @@ gray");
+            assert_eq!(patch.hunks[2].header_string(), "@@ -17,4 +17,3 @@ red");
+        }
+    }
+
+    // Tests for `reorder_edits`: re-emitting a patch's hunks in the order
+    // given by groups of edit indices.
+    mod reorder_edits {
+        use super::*;
+        use indoc::indoc;
+        use pretty_assertions::assert_eq;
+
+        static PATCH: &'static str = indoc! {"
+            Some header.
+
+            diff --git a/first.txt b/first.txt
+            index 86c770d..a1fd855 100644
+            --- a/first.txt
+            +++ b/first.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+            --- a/second.txt
+            +++ b/second.txt
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+            --- a/first.txt
+            +++ b/first.txt
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+        "};
+
+        #[test]
+        fn test_reorder_1() {
+            let edits_order = vec![
+                BTreeSet::from([2]),    // +silver
+                BTreeSet::from([3]),    // -zinc
+                BTreeSet::from([0, 1]), // -blue +dark blue
+            ];
+
+            let patch = Patch::parse_unified_diff(PATCH);
+            let reordered_patch = reorder_edits(&patch, edits_order);
+
+            // The hunks should be re-emitted following the requested edit order
+            let actual = reordered_patch.to_string();
+
+            println!("{}", actual);
+
+            let expected = indoc! {"
+               Some header.
+
+               --- a/second.txt
+               +++ b/second.txt
+               @@ -9,6 +9,7 @@ gray
+                green
+                indigo
+                magenta
+               +silver
+                orange
+                pink
+                purple
+               --- a/first.txt
+               +++ b/first.txt
+               @@ -16,4 +17,3 @@ red
+                violet
+                white
+                yellow
+               -zinc
+               --- a/first.txt
+               +++ b/first.txt
+               @@ -1,7 +1,7 @@
+                azuere
+                beige
+                black
+               -blue
+               +dark blue
+                brown
+                cyan
+                gold
+            "};
+            assert_eq!(actual, String::from(expected));
+        }
+
+        #[test]
+        fn test_reorder_duplicates() {
+            let edits_order = vec![
+                BTreeSet::from([2]), // +silver
+                BTreeSet::from([2]), // +silver again
+                BTreeSet::from([3]), // -zinc
+            ];
+
+            let patch = Patch::parse_unified_diff(PATCH);
+            let reordered_patch = reorder_edits(&patch, edits_order);
+
+            // A duplicate reference to an already-emitted edit should be
+            // ignored (and edits never referenced are dropped entirely)
+            let actual = reordered_patch.to_string();
+
+            println!("{}", actual);
+
+            let expected = indoc! {"
+                       Some header.
+
+                       --- a/second.txt
+                       +++ b/second.txt
+                       @@ -9,6 +9,7 @@ gray
+                        green
+                        indigo
+                        magenta
+                       +silver
+                        orange
+                        pink
+                        purple
+                       --- a/first.txt
+                       +++ b/first.txt
+                       @@ -16,4 +17,3 @@ red
+                        violet
+                        white
+                        yellow
+                       -zinc
+                    "};
+            assert_eq!(actual, String::from(expected));
+        }
+    }
+
+    // Tests for `extract_edits`: splitting a patch into the selected edits
+    // and the remainder.
+    mod extract_edits {
+
+        use super::*;
+        use indoc::indoc;
+        use pretty_assertions::assert_eq;
+
+        static PATCH: &'static str = indoc! {"
+            Some header.
+
+            diff --git a/first.txt b/first.txt
+            index 86c770d..a1fd855 100644
+            --- a/first.txt
+            +++ b/first.txt
+            @@ -1,7 +1,7 @@
+             azuere
+             beige
+             black
+            -blue
+            +dark blue
+             brown
+             cyan
+             gold
+            @@ -16,4 +17,3 @@ red
+             violet
+             white
+             yellow
+            -zinc
+            --- a/second.txt
+            +++ b/second.txt
+            @@ -9,6 +9,7 @@ gray
+             green
+             indigo
+             magenta
+            +silver
+             orange
+             pink
+             purple
+        "};
+
+        #[test]
+        fn test_extract_edits() {
+            let to_extract = BTreeSet::from([
+                3, // +silver
+                0, // -blue
+            ]);
+
+            let mut patch = Patch::parse_unified_diff(PATCH);
+            let (extracted, remainder) = extract_edits(&mut patch, &to_extract);
+
+            // Edits will be extracted in the sorted order, so [0, 3]
+            let expected_extracted = indoc! {"
+               Some header.
+
+               --- a/first.txt
+               +++ b/first.txt
+               @@ -1,7 +1,6 @@
+                azuere
+                beige
+                black
+               -blue
+                brown
+                cyan
+                gold
+               --- a/second.txt
+               +++ b/second.txt
+               @@ -9,6 +9,7 @@ gray
+                green
+                indigo
+                magenta
+               +silver
+                orange
+                pink
+                purple
+            "};
+
+            // The remainder's line numbers account for the extracted edits
+            // being applied first (e.g. `-blue` shifts `@@ -16` to `@@ -15`)
+            let expected_remainder = indoc! {"
+                Some header.
+
+                --- a/first.txt
+                +++ b/first.txt
+                @@ -1,6 +1,7 @@
+                 azuere
+                 beige
+                 black
+                +dark blue
+                 brown
+                 cyan
+                 gold
+                --- a/first.txt
+                +++ b/first.txt
+                @@ -15,4 +17,3 @@ red
+                 violet
+                 white
+                 yellow
+                -zinc
+            "};
+            assert_eq!(extracted.to_string(), String::from(expected_extracted));
+            assert_eq!(remainder.to_string(), String::from(expected_remainder));
+        }
+    }
+
+    #[test]
+    fn test_parse_order_file() {
+        // An order spec lists edit groups line by line: `//` lines act as
+        // group-header comments, blank lines are separators, and each
+        // remaining line is one group of comma-separated indices and/or
+        // inclusive `a-b` ranges.
+        let content = r#"
+// Add new dependency
+1, 49
+
+// Add new imports and types
+8-9, 51
+
+// Add new struct and login command method
+10-47
+
+// Modify AgentServerDelegate to make status_tx optional
+2-3
+
+// Update status_tx usage to handle optional value
+4
+5-7
+
+// Update all existing callers to use None for status_tx
+48, 50
+
+// Update the main login implementation to use custom command
+52-55
+56-95
+"#;
+
+        let order = parse_order_spec(content);
+
+        assert_eq!(order.len(), 9);
+
+        // First group: 1, 49
+        assert_eq!(order[0], BTreeSet::from([1, 49]));
+
+        // Second group: 8-9, 51
+        assert_eq!(order[1], BTreeSet::from([8, 9, 51]));
+
+        // Third group: 10-47
+        let expected_range: BTreeSet<usize> = (10..=47).collect();
+        assert_eq!(order[2], expected_range);
+
+        // Fourth group: 2-3
+        assert_eq!(order[3], BTreeSet::from([2, 3]));
+
+        // Fifth group: 4
+        assert_eq!(order[4], BTreeSet::from([4]));
+
+        // Sixth group: 5-7
+        assert_eq!(order[5], BTreeSet::from([5, 6, 7]));
+
+        // Seventh group: 48, 50
+        assert_eq!(order[6], BTreeSet::from([48, 50]));
+
+        // Eighth group: 52-55
+        assert_eq!(order[7], BTreeSet::from([52, 53, 54, 55]));
+
+        // Ninth group: 56-95
+        let expected_range_2: BTreeSet<usize> = (56..=95).collect();
+        assert_eq!(order[8], expected_range_2);
+    }
+
+    #[test]
+    fn test_normalize_hunk() {
+        // `normalize_hunks(1)` should shrink the context around each edit to
+        // at most one line on either side and rewrite the hunk header to
+        // match, while leaving non-hunk trailer lines alone.
+        let mut patch = Patch::parse_unified_diff(indoc! {"
+            This patch has too many lines of context.
+
+            --- a/first.txt
+            +++ b/first.txt
+            @@ -1,7 +1,6 @@
+             azuere
+             beige
+             black
+            -blue
+             brown
+             cyan
+             gold
+            // Some garbage
+        "});
+
+        patch.normalize_hunks(1);
+        let actual = patch.to_string();
+        assert_eq!(
+            actual,
+            indoc! {"
+            This patch has too many lines of context.
+
+            --- a/first.txt
+            +++ b/first.txt
+            @@ -3,3 +3,2 @@
+             black
+            -blue
+             brown
+            // Some garbage
+        "}
+        );
+    }
+}

crates/edit_prediction_cli/src/split_commit.rs 🔗

@@ -0,0 +1,1465 @@
+//! `ep split-commit` implementation.
+//!
+//! This command generates a single evaluation example JSON object from a
+//! chronologically-ordered unified diff (a "commit").
+//!
+//! TODO: Port Python code to generate chronologically-ordered commits
+use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
+use anyhow::{Context as _, Result};
+use clap::Args;
+use rand::Rng;
+use rand::SeedableRng;
+use serde::{Deserialize, Serialize};
+use similar::{DiffTag, TextDiff};
+use std::collections::BTreeSet;
+use std::fs;
+use std::io::{self, Read};
+
+/// `ep split-commit` CLI args.
+// NOTE: clap derives the `--help` text from each field's doc comment below,
+// so those strings are user-facing; edit them with care.
+#[derive(Debug, Args)]
+pub struct SplitCommitArgs {
+    /// Path to the commit file (use "-" for stdin)
+    #[arg(long, short = 'c')]
+    pub commit: String,
+
+    /// Repository URL
+    #[arg(long, short = 'r', default_value_t = String::new())]
+    pub repository_url: String,
+
+    /// Commit hash
+    #[arg(long, default_value_t = String::new())]
+    pub commit_hash: String,
+
+    /// Split point (float 0.0-1.0 for fraction, or integer for index)
+    #[arg(long, short = 's')]
+    pub split_point: Option<String>,
+
+    /// Random seed for reproducibility
+    #[arg(long)]
+    pub seed: Option<u64>,
+
+    /// Pretty-print JSON output
+    #[arg(long, short = 'p')]
+    pub pretty: bool,
+}
+
+/// Cursor position in a file.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct CursorPosition {
+    // File path as named in the patch (presumably repo-relative).
+    pub file: String,
+    // NOTE(review): whether line/column are 0- or 1-based is not established
+    // here — confirm against `sample_cursor_position` and downstream readers.
+    pub line: usize,
+    pub column: usize,
+}
+
+impl std::fmt::Display for CursorPosition {
+    // Rendered as "file:line:column"; this exact string becomes
+    // `EvaluationCase::cursor_position`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}:{}", self.file, self.line, self.column)
+    }
+}
+
+/// Represents a split commit with source and target patches.
+#[derive(Debug, Clone)]
+pub struct SplitCommit {
+    // Edits up to the split point — becomes the example's edit history.
+    pub source_patch: String,
+    // Remaining edits — becomes the expected prediction.
+    pub target_patch: String,
+}
+
+/// The evaluation case structure that will be serialized to JSON.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvaluationCase {
+    pub repository_url: String,
+    // Formatted as "<hash>~1", i.e. the parent of the split commit.
+    pub commit: String,
+    // Unified diff(s) of the edits made so far (the source half of the split).
+    pub edit_history: Vec<String>,
+    // "file:line:column" (see `CursorPosition`'s `Display` impl).
+    pub cursor_position: String,
+    pub cursor_excerpt: String,
+    // The target half of the split; currently these three fields all carry
+    // the same patch text.
+    pub expected_hunks: Vec<String>,
+    pub expected_patch: String,
+    pub allowed_patch: String,
+    // Currently always empty / `{}` respectively.
+    pub expected_context_excerpts: Vec<String>,
+    pub extra: serde_json::Value,
+}
+
+/// Split point specification for evaluation generation.
+///
+/// Both variants are measured in edited lines (added + removed), not hunks.
+#[derive(Debug, Clone)]
+pub enum SplitPoint {
+    /// Fraction of total edits (0.0 to 1.0)
+    Fraction(f64),
+    /// Absolute index
+    Index(usize),
+}
+
+/// Parse a `--split-point` value.
+///
+/// A value containing `.` parses as a fraction (so "1.0" is `Fraction(1.0)`
+/// while "1" is `Index(1)`); anything else parses as an absolute index.
+/// Returns `None` when the value parses as neither.
+// NOTE(review): fractions are not range-checked here; out-of-range values are
+// only clamped later via `min(num_edits)` — confirm that is sufficient.
+fn parse_split_point(value: &str) -> Option<SplitPoint> {
+    if value.contains('.') {
+        value.parse::<f64>().ok().map(SplitPoint::Fraction)
+    } else {
+        value.parse::<usize>().ok().map(SplitPoint::Index)
+    }
+}
+
+/// Entry point for the `ep split-commit` subcommand.
+///
+/// This runs synchronously and prints a single JSON object to stdout.
+pub fn run_split_commit(args: &SplitCommitArgs) -> Result<()> {
+    // Read the chronologically-ordered commit diff from stdin ("-") or a file.
+    let commit = if args.commit == "-" {
+        let mut content = String::new();
+        io::stdin()
+            .read_to_string(&mut content)
+            .context("failed to read commit diff from stdin")?;
+        content
+    } else {
+        fs::read_to_string(&args.commit)
+            .with_context(|| format!("failed to read commit diff from {}", args.commit))?
+    };
+
+    // An unparseable --split-point silently falls back to `None`,
+    // i.e. a random split.
+    let split_point = args.split_point.as_deref().and_then(parse_split_point);
+
+    let case = generate_evaluation_example_from_ordered_commit(
+        &commit,
+        &args.repository_url,
+        &args.commit_hash,
+        split_point,
+        args.seed,
+    )
+    .context("failed to generate evaluation example")?;
+
+    let json = if args.pretty {
+        serde_json::to_string_pretty(&case)
+    } else {
+        serde_json::to_string(&case)
+    }
+    .context("failed to serialize evaluation case as JSON")?;
+
+    println!("{json}");
+    Ok(())
+}
+
+/// Main function to generate an evaluation example from an ordered commit.
+///
+/// # Arguments
+/// * `commit` - Chronologically ordered unified diff of the commit
+/// * `repository_url` - URL of the repository
+/// * `commit_hash` - Hash of the commit
+/// * `split_point` - Point at which the commit will be split (None for random)
+/// * `seed` - Optional seed for randomness
+pub fn generate_evaluation_example_from_ordered_commit(
+    commit: &str,
+    repository_url: &str,
+    commit_hash: &str,
+    split_point: Option<SplitPoint>,
+    seed: Option<u64>,
+) -> Result<EvaluationCase> {
+    // Seeded runs use StdRng for reproducibility; otherwise the thread RNG.
+    let mut rng: Box<dyn rand::RngCore> = match seed {
+        Some(seed) => Box::new(rand::rngs::StdRng::seed_from_u64(seed)),
+        None => Box::new(rand::rngs::ThreadRng::default()),
+    };
+
+    // Parse and normalize the commit
+    let mut patch = Patch::parse_unified_diff(commit);
+
+    // Filter header to only keep lines starting with "//"
+    // (the group-header comments of the ordered-commit format).
+    let header_lines: Vec<&str> = patch
+        .header
+        .lines()
+        .filter(|line| line.starts_with("//"))
+        .collect();
+    patch.header = if header_lines.is_empty() {
+        String::new()
+    } else {
+        header_lines.join("\n") + "\n"
+    };
+    let commit_normalized = patch.to_string();
+
+    // Compute the split point. Edits are counted as individual added or
+    // removed lines, not hunks.
+    let stats = patch.stats();
+    let num_edits = stats.added + stats.removed;
+
+    anyhow::ensure!(num_edits != 0, "no edits found in commit");
+
+    // A random split is drawn from 1..=num_edits, so it never yields an empty
+    // edit history; explicit split points are clamped to num_edits.
+    let split = match split_point {
+        None => rng.random_range(1..=num_edits),
+        Some(SplitPoint::Fraction(f)) => {
+            let v = (f * num_edits as f64).floor() as usize;
+            v.min(num_edits)
+        }
+        Some(SplitPoint::Index(i)) => i.min(num_edits),
+    };
+
+    // Split the commit into source and target patches
+    let (prefix, suffix) = split_ordered_commit(&commit_normalized, split);
+
+    let mut split_commit = SplitCommit {
+        source_patch: prefix,
+        target_patch: suffix,
+    };
+
+    // Imitate human edits
+    let human_edit_seed = rng.random_range(1..=10000u64);
+    let (src_patch, tgt_patch, cursor_opt) = imitate_human_edits(
+        &split_commit.source_patch,
+        &split_commit.target_patch,
+        human_edit_seed,
+    );
+    split_commit.source_patch = src_patch;
+    split_commit.target_patch = tgt_patch;
+
+    // Sample cursor position unless `imitate_human_edits` already picked one
+    let cursor = match cursor_opt {
+        Some(c) => c,
+        None => sample_cursor_position(&patch, &split_commit)
+            .context("failed to sample cursor position")?,
+    };
+
+    // Get cursor excerpt
+    let cursor_excerpt = get_cursor_excerpt(
+        &cursor,
+        &split_commit.source_patch,
+        &split_commit.target_patch,
+    )
+    .context("failed to generate cursor excerpt")?;
+
+    // Handle edge case where split_point == 0 (only reachable via an explicit
+    // split point). NOTE(review): this clears target_patch *after* the cursor
+    // and excerpt were computed from it — confirm the ordering is intended.
+    if split == 0 {
+        split_commit.target_patch = String::new();
+    }
+
+    Ok(EvaluationCase {
+        repository_url: repository_url.to_string(),
+        // `{hash}~1` is git syntax for the commit's parent.
+        commit: format!("{}~1", commit_hash),
+        edit_history: vec![split_commit.source_patch.clone()],
+        cursor_position: cursor.to_string(),
+        cursor_excerpt,
+        expected_hunks: vec![split_commit.target_patch.clone()],
+        expected_patch: split_commit.target_patch.clone(),
+        allowed_patch: split_commit.target_patch,
+        expected_context_excerpts: vec![],
+        extra: serde_json::json!({}),
+    })
+}
+
+/// Split an ordered commit into source and target commits.
+///
+/// # Arguments
+/// * `commit` - Ordered commit string
+/// * `split_pos` - Position to split the commit (number of edited lines)
+///
+/// # Returns
+/// A tuple of (source_diff, target_diff)
+pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String) {
+    // The first `split_pos` edits (by global edit index) form the source.
+    let patch = Patch::parse_unified_diff(commit);
+    let source_edits: BTreeSet<usize> = (0..split_pos).collect();
+    let (source, target) = extract_edits(&patch, &source_edits);
+
+    let mut source_str = source.to_string();
+    let target_str = target.to_string();
+
+    // Strip last group header from the source (lines starting with "//" at the end)
+    // Walking backwards, `end_idx` lands on the first line of the trailing
+    // run of "//" lines, if any.
+    let source_lines: Vec<&str> = source_str.lines().collect();
+    let mut end_idx = source_lines.len();
+    for i in (0..source_lines.len()).rev() {
+        if source_lines[i].starts_with("//") {
+            end_idx = i;
+        } else {
+            break;
+        }
+    }
+    if end_idx < source_lines.len() {
+        source_str = source_lines[..end_idx].join("\n");
+        // Keep the trailing newline unless the source became empty.
+        if !source_str.is_empty() {
+            source_str.push('\n');
+        }
+    }
+
+    (source_str, target_str)
+}
+
+/// Tokenize text into words and non-word characters.
+///
+/// Alphanumeric runs form word tokens; every punctuation or whitespace
+/// character becomes its own single-character token.
+fn tokenize(text: &str) -> Vec<String> {
+    let mut tokens = Vec::new();
+    let mut current = String::new();
+
+    for ch in text.chars() {
+        if ch.is_alphanumeric() {
+            current.push(ch);
+        } else if ch == '_' {
+            // Include underscore with the current word, then flush.
+            // NOTE(review): this splits snake_case — "foo_bar" tokenizes as
+            // ["foo_", "bar"], not one identifier — confirm this matches the
+            // Python original.
+            current.push(ch);
+            if !current.is_empty() {
+                // (always true here, since `ch` was just pushed)
+                tokens.push(std::mem::take(&mut current));
+            }
+        } else {
+            // Punctuation or whitespace - flush current word first
+            if !current.is_empty() {
+                tokens.push(std::mem::take(&mut current));
+            }
+            // Each punctuation/whitespace is its own token
+            tokens.push(ch.to_string());
+        }
+    }
+
+    // Flush a trailing word token, if any.
+    if !current.is_empty() {
+        tokens.push(current);
+    }
+
+    tokens
+}
+
+/// Calculate the weight for a split position based on the character at that position.
+///
+/// Higher weights indicate more natural pause points (e.g., after punctuation,
+/// at identifier boundaries). Lower weights indicate less natural points
+/// (e.g., mid-identifier).
+///
+/// NOTE(review): `pos` is bounds-checked against the *byte* length of `text`
+/// but then used as a *char* index; the two only agree for ASCII input —
+/// confirm callers never pass multi-byte text.
+fn position_weight(text: &str, pos: usize) -> u32 {
+    if pos == 0 || pos > text.len() {
+        return 1;
+    }
+
+    let chars: Vec<char> = text.chars().collect();
+    if pos > chars.len() {
+        return 1;
+    }
+
+    // Get the character just before this position (what we just "typed")
+    let prev_char = chars[pos - 1];
+
+    // High weight: natural pause points (end of statement/argument, opening brackets)
+    if matches!(prev_char, ',' | ';' | ':' | '(' | '[' | '{') {
+        return 10;
+    }
+
+    // High weight: closing brackets (finished a group)
+    if matches!(prev_char, ')' | ']' | '}') {
+        return 8;
+    }
+
+    // Medium weight: operators and method chains
+    if matches!(
+        prev_char,
+        '.' | '+' | '-' | '*' | '/' | '=' | '<' | '>' | '&' | '|' | '!'
+    ) {
+        return 5;
+    }
+
+    // Check if we're at the end of an identifier (word char followed by non-word char)
+    let is_prev_word_char = prev_char.is_alphanumeric() || prev_char == '_';
+    let is_next_word_char =
+        pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_');
+
+    if is_prev_word_char && !is_next_word_char {
+        // End of identifier - high weight
+        return 8;
+    }
+
+    // Whitespace is a natural pause
+    if prev_char.is_whitespace() {
+        return 6;
+    }
+
+    // Mid-identifier: low weight (rare autocomplete scenarios)
+    if is_prev_word_char && is_next_word_char {
+        return 1;
+    }
+
+    // Default medium-low weight
+    3
+}
+
+/// Select a weighted random index from a list of weights.
+///
+/// Returns an index based on the weights, using the provided seed for
+/// deterministic selection.
+///
+/// Note: the seed itself is reduced modulo the total weight, so this is a
+/// deterministic weighted pick keyed on `seed`, not a fresh random draw.
+fn weighted_select(weights: &[u32], seed: u64) -> usize {
+    if weights.is_empty() {
+        return 0;
+    }
+
+    let total_weight: u64 = weights.iter().map(|&w| w as u64).sum();
+    if total_weight == 0 {
+        // Fallback to uniform selection if all weights are zero
+        return seed as usize % weights.len();
+    }
+
+    // Use seed to select a value in [0, total_weight)
+    let target = seed % total_weight;
+    let mut cumulative: u64 = 0;
+
+    // Walk the cumulative distribution until it passes `target`.
+    for (idx, &weight) in weights.iter().enumerate() {
+        cumulative += weight as u64;
+        if target < cumulative {
+            return idx;
+        }
+    }
+
+    // Fallback to last index
+    weights.len() - 1
+}
+
+/// Calculate similarity ratio between two strings (0-100).
+///
+/// Computes `2 * matching * 100 / (len(s1) + len(s2))` over a char-level diff.
+///
+/// NOTE(review): `matching` is counted in chars (`TextDiff::from_chars`) while
+/// the denominator sums *byte* lengths, so the ratio is only exact for ASCII
+/// input — confirm multi-byte text cannot reach this.
+fn fuzzy_ratio(s1: &str, s2: &str) -> u32 {
+    // Two empty strings are identical; one empty string matches nothing.
+    if s1.is_empty() && s2.is_empty() {
+        return 100;
+    }
+    if s1.is_empty() || s2.is_empty() {
+        return 0;
+    }
+
+    // Sum the lengths of all `Equal` spans reported by the char-level diff.
+    let diff = TextDiff::from_chars(s1, s2);
+    let matching: usize = diff
+        .ops()
+        .iter()
+        .filter_map(|op| {
+            if matches!(op.tag(), DiffTag::Equal) {
+                Some(op.new_range().len())
+            } else {
+                None
+            }
+        })
+        .sum();
+
+    let total = s1.len() + s2.len();
+    ((2 * matching * 100) / total) as u32
+}
+
/// Imitate human edits by introducing partial line edits.
///
/// This function simulates how a human might incrementally type code,
/// rather than making complete line replacements: the first line added by
/// `target_patch` is split at a seeded, weighted position, and the resulting
/// prefix is inserted into `source_patch` as an already-typed partial line.
/// The same partial line is recorded in `target_patch` as a deletion, so the
/// target still transforms the partial line into the final content.
///
/// Returns `(new_source_patch, new_target_patch, cursor)`, where `cursor`
/// points just past the partially typed text. On any bail-out path (no
/// additions in target, conflicting edits on the same line, empty or trivial
/// partial line, ...) the inputs are returned unchanged with a `None` cursor.
pub fn imitate_human_edits(
    source_patch: &str,
    target_patch: &str,
    seed: u64,
) -> (String, String, Option<CursorPosition>) {
    // Value returned verbatim on every bail-out path.
    let no_change = (source_patch.to_string(), target_patch.to_string(), None);

    let src_patch = Patch::parse_unified_diff(source_patch);
    let tgt_patch = Patch::parse_unified_diff(target_patch);

    if tgt_patch.hunks.is_empty() {
        return no_change;
    }

    // Try to locate the first edit in target
    let tgt_edit_loc = match locate_edited_line(&tgt_patch, 0) {
        Some(loc) => loc,
        None => return no_change,
    };

    // Only additions can be "partially typed"; bail out otherwise.
    let tgt_is_addition = matches!(tgt_edit_loc.patch_line, PatchLine::Addition(_));
    if !tgt_is_addition {
        return no_change;
    }

    let tgt_line = match &tgt_edit_loc.patch_line {
        PatchLine::Addition(s) => s.clone(),
        _ => return no_change,
    };

    // Try to locate the last edit in source
    let src_edit_loc = locate_edited_line(&src_patch, -1);

    // Check if source has ANY edit at the same line as target's first edit
    // We need to iterate through all edits to check this
    let src_has_edit_at_target_line = {
        let mut found = false;
        let mut idx = 0isize;
        // `locate_edited_line` returning None for an out-of-range index is
        // what terminates this scan.
        while let Some(loc) = locate_edited_line(&src_patch, idx) {
            if loc.filename == tgt_edit_loc.filename
                && loc.target_line_number == tgt_edit_loc.target_line_number
            {
                found = true;
                break;
            }
            idx += 1;
        }
        found
    };

    // Check if this is a replacement (deletion followed by insertion on the same line)
    // or a pure insertion (no corresponding deletion in source)
    let is_replacement = src_edit_loc.as_ref().map_or(false, |loc| {
        matches!(loc.patch_line, PatchLine::Deletion(_))
            && loc.filename == tgt_edit_loc.filename
            && loc.target_line_number == tgt_edit_loc.target_line_number
    });

    // If source has an edit at the same line but it's not a replacement (i.e., it's an addition),
    // we shouldn't process this as a pure insertion either
    if !is_replacement && src_has_edit_at_target_line {
        return no_change;
    }

    let src_line = if is_replacement {
        match &src_edit_loc.as_ref().unwrap().patch_line {
            PatchLine::Deletion(s) => s.clone(),
            _ => return no_change,
        }
    } else {
        // Pure insertion: source line is empty
        String::new()
    };

    // Don't process if source and target are the same
    if src_line == tgt_line {
        return no_change;
    }

    // Tokenize both lines
    let src_tokens = tokenize(&src_line);
    let tgt_tokens = tokenize(&tgt_line);

    // Convert to slices for similar
    let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect();
    let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect();

    // Use similar to get diff operations
    let diff = TextDiff::from_slices(&src_refs, &tgt_refs);

    // Build weights for each possible split position
    let mut position_weights: Vec<u32> = Vec::new();

    // Simulate the edit process to collect weights for all possible split positions.
    // Each weight corresponds to one typed (or "untyped") character, in the
    // order the diff ops replay them.
    {
        let mut current_text = String::new();

        for op in diff.ops() {
            match op.tag() {
                DiffTag::Equal => {
                    for i in op.old_range() {
                        current_text.push_str(&src_tokens[i]);
                    }
                }
                DiffTag::Replace => {
                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();

                    // For insertion part
                    for ch in ins.chars() {
                        current_text.push(ch);
                        // NOTE(review): `current_text.len()` is a byte index;
                        // for multi-byte text it exceeds the char count —
                        // confirm `position_weight` expects byte positions.
                        let weight = position_weight(&current_text, current_text.len());
                        position_weights.push(weight);
                    }

                    // For deletion part (we're "untyping" from source)
                    for _ in del.chars() {
                        // Weight deletions lower as they represent removing text
                        position_weights.push(2);
                    }
                }
                DiffTag::Insert => {
                    let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
                    for ch in ins.chars() {
                        current_text.push(ch);
                        let weight = position_weight(&current_text, current_text.len());
                        position_weights.push(weight);
                    }
                }
                DiffTag::Delete => {
                    let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
                    for _ in del.chars() {
                        // Weight deletions lower
                        position_weights.push(2);
                    }
                }
            }
        }
    }

    // Use weighted selection to choose split index
    if position_weights.is_empty() {
        return no_change;
    }
    let split_index = weighted_select(&position_weights, seed);

    // Replay the diff ops a second time, accumulating the "typed so far" text
    // into `new_src` until `split_index` characters of edits have been spent.
    let mut edit_index = 0usize;
    let mut new_src = String::new();
    let mut split_found = false;
    // Index (in src_tokens) just past the last source token consumed.
    let mut last_old_end = 0usize;

    for op in diff.ops() {
        match op.tag() {
            DiffTag::Equal => {
                for i in op.old_range() {
                    new_src.push_str(&src_tokens[i]);
                }
                last_old_end = op.old_range().end;
            }
            DiffTag::Replace => {
                // Handle replace as delete + insert
                let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
                let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
                let repl_len = del.len() + ins.len();
                if edit_index + repl_len >= split_index {
                    // Split within this replace operation
                    let offset = split_index - edit_index;
                    if offset < ins.len() {
                        new_src.push_str(&ins[..offset]);
                    } else {
                        new_src.push_str(&ins);
                        let del_offset = offset - ins.len();
                        new_src.push_str(&del[..del_offset.min(del.len())]);
                    }
                    split_found = true;
                    last_old_end = op.old_range().end;
                    break;
                } else {
                    edit_index += repl_len;
                    new_src.push_str(&ins);
                    last_old_end = op.old_range().end;
                }
            }
            DiffTag::Insert => {
                let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
                if edit_index + repl.len() >= split_index {
                    let offset = split_index - edit_index;
                    new_src.push_str(&repl[..offset]);
                    split_found = true;
                    break;
                } else {
                    edit_index += repl.len();
                    new_src.push_str(&repl);
                }
            }
            DiffTag::Delete => {
                let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
                if edit_index + repl.len() >= split_index {
                    let offset = split_index - edit_index;
                    new_src.push_str(&repl[..offset]);
                    split_found = true;
                    last_old_end = op.old_range().start + offset.min(op.old_range().len());
                    break;
                } else {
                    edit_index += repl.len();
                    new_src.push_str(&repl);
                    last_old_end = op.old_range().end;
                }
            }
        }
    }

    if !split_found {
        return no_change;
    }

    // Calculate cursor position
    let cursor = CursorPosition {
        file: tgt_edit_loc.filename.clone(),
        line: if is_replacement {
            src_edit_loc.as_ref().unwrap().source_line_number
        } else {
            tgt_edit_loc.target_line_number
        },
        // NOTE(review): 1-based column derived from byte length; multi-byte
        // characters in the partial line would skew this — confirm consumers
        // expect byte columns.
        column: new_src.len() + 1,
    };

    // Add remainder of source if similar enough to target remainder
    let remainder_src: String = (last_old_end..src_tokens.len())
        .map(|i| src_tokens[i].as_str())
        .collect();
    // NOTE(review): `last_old_end` indexes source tokens; reusing it on the
    // target token list approximates "the rest of the target line".
    let remainder_tgt: String = (last_old_end..tgt_tokens.len())
        .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str()))
        .collect();

    let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt);
    if ratio > 35 {
        new_src.push_str(&remainder_src);
    }

    if new_src.trim().is_empty() {
        return no_change;
    }

    if new_src == src_line {
        return no_change;
    }

    // Build new source patch with the intermediate line
    let mut new_src_patch = src_patch;
    if is_replacement {
        // For replacements, insert after the deletion line
        let src_loc = src_edit_loc.as_ref().unwrap();
        if let Some(hunk) = new_src_patch.hunks.get_mut(src_loc.hunk_index) {
            hunk.lines.insert(
                src_loc.line_index_within_hunk + 1,
                PatchLine::Addition(new_src.clone()),
            );
            hunk.new_count += 1;
        }
    } else {
        // For pure insertions, we need to add or modify a hunk
        if let Some(hunk) = new_src_patch.hunks.get_mut(tgt_edit_loc.hunk_index) {
            // Insert the partial line at the same position as target
            hunk.lines.insert(
                tgt_edit_loc.line_index_within_hunk,
                PatchLine::Addition(new_src.clone()),
            );
            hunk.new_count += 1;
        } else if new_src_patch.hunks.is_empty() {
            // Source patch is empty, create a new hunk based on target
            if let Some(tgt_hunk) = tgt_patch.hunks.get(tgt_edit_loc.hunk_index) {
                let mut new_hunk = tgt_hunk.clone();
                // Replace the full addition with the partial one
                new_hunk.lines.clear();
                for (i, line) in tgt_hunk.lines.iter().enumerate() {
                    if i == tgt_edit_loc.line_index_within_hunk {
                        new_hunk.lines.push(PatchLine::Addition(new_src.clone()));
                    } else {
                        match line {
                            PatchLine::Addition(_) => {
                                // Skip other additions from target
                            }
                            _ => new_hunk.lines.push(line.clone()),
                        }
                    }
                }
                new_hunk.new_count = new_hunk.old_count + 1;
                new_src_patch.hunks.push(new_hunk);
                // Copy header from target if source doesn't have one
                if new_src_patch.header.is_empty() {
                    new_src_patch.header = tgt_patch.header.clone();
                }
            }
        }
    }

    // Build new target patch with the intermediate line as deletion
    let mut new_tgt_patch = tgt_patch;
    if let Some(hunk) = new_tgt_patch.hunks.get_mut(tgt_edit_loc.hunk_index) {
        hunk.lines.insert(
            tgt_edit_loc.line_index_within_hunk,
            PatchLine::Deletion(new_src),
        );
        hunk.old_count += 1;
    }

    (
        new_src_patch.to_string(),
        new_tgt_patch.to_string(),
        Some(cursor),
    )
}
+
+/// Locate the end of the last edit in a patch.
+fn locate_end_of_last_edit(patch: &Patch) -> Option<CursorPosition> {
+    let loc = locate_edited_line(patch, -1)?;
+
+    let (line, col) = match &loc.patch_line {
+        PatchLine::Addition(content) => (loc.target_line_number, content.len()),
+        PatchLine::Deletion(_) => (loc.target_line_number, 1),
+        _ => return None,
+    };
+
+    Some(CursorPosition {
+        file: loc.filename,
+        line,
+        column: col,
+    })
+}
+
+/// Locate the beginning of the first edit in a patch.
+fn locate_beginning_of_first_edit(patch: &Patch) -> Option<CursorPosition> {
+    let loc = locate_edited_line(patch, 0)?;
+
+    let hunk = patch.hunks.get(loc.hunk_index)?;
+    let column = if loc.line_index_within_hunk > 0 {
+        if let Some(prev_line) = hunk.lines.get(loc.line_index_within_hunk - 1) {
+            let content = match prev_line {
+                PatchLine::Context(s) | PatchLine::Addition(s) | PatchLine::Deletion(s) => s,
+                _ => return None,
+            };
+            content.len().max(1) - 1
+        } else {
+            0
+        }
+    } else {
+        0
+    };
+
+    let line = loc.target_line_number.saturating_sub(1).max(1);
+
+    Some(CursorPosition {
+        file: loc.filename,
+        line,
+        column,
+    })
+}
+
/// Choose a cursor position for a split commit.
///
/// Despite the name there is no randomness here: candidates are tried in a
/// fixed priority order and the first hit wins:
/// 1. the end of the last edit in the split's source patch (edit history),
/// 2. the beginning of the first edit in the split's target patch,
/// 3. fallback: the end of the last edit in the original, unsplit `patch`.
///
/// Returns `None` only when none of the three patches contains an edit.
pub fn sample_cursor_position(patch: &Patch, split_commit: &SplitCommit) -> Option<CursorPosition> {
    // Try end of history first
    let src_patch = Patch::parse_unified_diff(&split_commit.source_patch);
    if let Some(cursor) = locate_end_of_last_edit(&src_patch) {
        return Some(cursor);
    }

    // Try beginning of target
    let tgt_patch = Patch::parse_unified_diff(&split_commit.target_patch);
    if let Some(cursor) = locate_beginning_of_first_edit(&tgt_patch) {
        return Some(cursor);
    }

    // Fallback: use the original patch
    locate_end_of_last_edit(patch)
}
+
+/// Get cursor excerpt from the patches.
+///
+/// This extracts the lines around the cursor position with a cursor marker.
+pub fn get_cursor_excerpt(
+    cursor: &CursorPosition,
+    source_patch: &str,
+    target_patch: &str,
+) -> Option<String> {
+    let mut excerpt_lines: Vec<String> = Vec::new();
+    let mut excerpt_first_line: usize = 0;
+
+    // Search in the last hunk of source patch
+    let src = Patch::parse_unified_diff(source_patch);
+    if let Some(loc) = locate_edited_line(&src, -1) {
+        if loc.filename == cursor.file && loc.target_line_number == cursor.line {
+            if let Some(hunk) = src.hunks.get(loc.hunk_index) {
+                excerpt_first_line = hunk.new_start as usize;
+                for line in &hunk.lines {
+                    match line {
+                        PatchLine::Addition(s) | PatchLine::Context(s) => {
+                            excerpt_lines.push(s.clone());
+                        }
+                        _ => {}
+                    }
+                }
+            }
+        }
+    }
+
+    // Search in target patch if not found
+    if excerpt_lines.is_empty() {
+        let tgt = Patch::parse_unified_diff(target_patch);
+        if let Some(loc) = locate_edited_line(&tgt, 0) {
+            if loc.filename == cursor.file {
+                if let Some(hunk) = tgt.hunks.get(loc.hunk_index) {
+                    excerpt_first_line = hunk.new_start as usize;
+                    for line in &hunk.lines {
+                        match line {
+                            PatchLine::Deletion(s) | PatchLine::Context(s) => {
+                                excerpt_lines.push(s.clone());
+                            }
+                            _ => {}
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if excerpt_lines.is_empty() {
+        return None;
+    }
+
+    // Add cursor marker
+    for (i, line) in excerpt_lines.iter_mut().enumerate() {
+        let line_num = excerpt_first_line + i;
+        if line_num == cursor.line {
+            let col = cursor.column.min(line.len());
+            let (before, after) = line.split_at(col);
+            *line = format!("{}<|user_cursor|>{}", before, after);
+            break;
+        }
+    }
+
+    Some(excerpt_lines.join("\n"))
+}
+
#[cfg(test)]
mod tests {
    //! Unit tests for tokenization, fuzzy matching, commit splitting,
    //! weighted split-point selection, and human-edit imitation.
    //! The inline unified diffs in raw strings are the test fixtures.

    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        // Underscores stay attached to the preceding word segment.
        let tokens = tokenize("foo_bar123 + baz");
        assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]);

        let tokens = tokenize("print(\"hello\")");
        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);

        let tokens = tokenize("hello_world");
        assert_eq!(tokens, vec!["hello_", "world"]);

        let tokens = tokenize("fn();");
        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
    }

    #[test]
    fn test_fuzzy_ratio() {
        assert_eq!(fuzzy_ratio("hello", "hello"), 100);
        assert_eq!(fuzzy_ratio("", ""), 100);
        assert!(fuzzy_ratio("hello", "world") < 50);
        assert!(fuzzy_ratio("hello world", "hello worl") > 80);
    }

    #[test]
    fn test_split_ordered_commit() {
        let commit = r#"// First change
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,4 @@
 fn main() {
+    println!("hello");
+    println!("world");
 }
"#;
        let patch = Patch::parse_unified_diff(commit);
        let stats = patch.stats();
        assert_eq!(stats.added, 2);

        let (source, target) = split_ordered_commit(commit, 1);

        // Source should have 1 addition
        let src_patch = Patch::parse_unified_diff(&source);
        assert_eq!(src_patch.stats().added, 1);

        // Target should have 1 addition
        let tgt_patch = Patch::parse_unified_diff(&target);
        assert_eq!(tgt_patch.stats().added, 1);
    }

    #[test]
    fn test_split_ordered_commit_with_deletions() {
        let commit = r#"// Change
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,3 @@
 fn main() {
-    println!("old");
+    println!("new");
 }
"#;
        let patch = Patch::parse_unified_diff(commit);
        let stats = patch.stats();
        assert_eq!(stats.added, 1);
        assert_eq!(stats.removed, 1);

        // Split at position 1 (after the deletion)
        let (source, target) = split_ordered_commit(commit, 1);

        let src_patch = Patch::parse_unified_diff(&source);
        let tgt_patch = Patch::parse_unified_diff(&target);

        // Source should have the deletion
        assert_eq!(src_patch.stats().removed, 1);
        // Target should have the addition
        assert_eq!(tgt_patch.stats().added, 1);
    }

    #[test]
    fn test_generate_evaluation_example() {
        let commit = r#"commit abc123
Author: Test <test@example.com>
Date: Mon Jan 1 00:00:00 2024

    Test commit

////////////////////////////////////////////////////////////////////////////////
// Add greeting
////////////////////////////////////////////////////////////////////////////////
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,5 @@
 fn main() {
+    println!("hello");
+    println!("world");
 }
"#;

        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "https://github.com/test/repo",
            "abc123",
            Some(SplitPoint::Fraction(0.5)),
            Some(42),
        );

        assert!(result.is_ok());
        let case = result.unwrap();
        assert_eq!(case.repository_url, "https://github.com/test/repo");
        // The evaluation case starts from the parent commit (~1).
        assert_eq!(case.commit, "abc123~1");
        assert!(!case.edit_history.is_empty());
    }

    #[test]
    fn test_generate_evaluation_example_reproducible() {
        let commit = r#"////////////////////////////////////////////////////////////////////////////////
// Add greeting
////////////////////////////////////////////////////////////////////////////////
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,5 @@
 fn main() {
+    println!("hello");
+    println!("world");
 }
"#;

        // Run twice with the same seed
        let result1 = generate_evaluation_example_from_ordered_commit(
            commit,
            "https://github.com/test/repo",
            "abc123",
            Some(SplitPoint::Fraction(0.5)),
            Some(12345),
        )
        .unwrap();

        let result2 = generate_evaluation_example_from_ordered_commit(
            commit,
            "https://github.com/test/repo",
            "abc123",
            Some(SplitPoint::Fraction(0.5)),
            Some(12345),
        )
        .unwrap();

        // Results should be identical
        assert_eq!(result1.edit_history, result2.edit_history);
        assert_eq!(result1.expected_patch, result2.expected_patch);
        assert_eq!(result1.cursor_position, result2.cursor_position);
    }

    #[test]
    fn test_cursor_position_display() {
        let cursor = CursorPosition {
            file: "src/main.rs".to_string(),
            line: 42,
            column: 10,
        };
        assert_eq!(cursor.to_string(), "src/main.rs:42:10");
    }

    #[test]
    fn test_imitate_human_edits_no_change_when_no_replacement() {
        // Source and target patches that don't form a replacement pattern
        let source = r#"--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,4 @@
 fn main() {
+    println!("hello");
 }
"#;
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,4 @@
 fn main() {
+    println!("world");
 }
"#;

        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);

        // Should return unchanged when not a replacement pattern
        assert_eq!(new_src, source);
        assert_eq!(new_tgt, target);
        assert!(cursor.is_none());
    }

    #[test]
    fn test_split_point_fraction() {
        let commit = r#"// Change
--- a/test.rs
+++ b/test.rs
@@ -1,5 +1,10 @@
 fn main() {
+    line1();
+    line2();
+    line3();
+    line4();
+    line5();
 }
"#;

        // Split at 20% should give first edit in source
        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "",
            "hash",
            Some(SplitPoint::Fraction(0.2)),
            Some(1),
        );

        assert!(result.is_ok());
        let case = result.unwrap();

        // Source should have some edits
        let src_patch = Patch::parse_unified_diff(&case.edit_history[0]);
        assert!(src_patch.stats().added > 0);
    }

    #[test]
    fn test_split_point_index() {
        let commit = r#"// Change
--- a/test.rs
+++ b/test.rs
@@ -1,5 +1,10 @@
 fn main() {
+    line1();
+    line2();
+    line3();
+    line4();
+    line5();
 }
"#;

        // Split at index 2 should give first 2 edits in source
        // With pure insertion handling, source gets 2 original + 1 partial = 3 additions
        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "",
            "hash",
            Some(SplitPoint::Index(2)),
            Some(1),
        );

        assert!(result.is_ok());
        let case = result.unwrap();

        let src_patch = Patch::parse_unified_diff(&case.edit_history[0]);
        // Pure insertion adds a partial line, so we expect 3 (2 original + 1 partial)
        assert_eq!(src_patch.stats().added, 3);
    }

    #[test]
    fn test_cursor_excerpt_contains_marker() {
        let commit = r#"////////////////////////////////////////////////////////////////////////////////
// Add code
////////////////////////////////////////////////////////////////////////////////
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,5 @@
 fn main() {
+    println!("hello");
+    println!("world");
 }
"#;

        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "",
            "hash",
            Some(SplitPoint::Fraction(0.5)),
            Some(42),
        )
        .unwrap();

        // Cursor excerpt should contain the cursor marker
        assert!(
            result.cursor_excerpt.contains("<|user_cursor|>"),
            "Cursor excerpt should contain marker: {}",
            result.cursor_excerpt
        );
    }

    #[test]
    fn test_evaluation_case_json_serialization() {
        let case = EvaluationCase {
            repository_url: "https://github.com/test/repo".to_string(),
            commit: "abc123~1".to_string(),
            edit_history: vec!["patch1".to_string()],
            cursor_position: "file.rs:10:5".to_string(),
            cursor_excerpt: "some code<|user_cursor|>".to_string(),
            expected_hunks: vec!["hunk1".to_string()],
            expected_patch: "patch".to_string(),
            allowed_patch: "patch".to_string(),
            expected_context_excerpts: vec![],
            extra: serde_json::json!({}),
        };

        // Round-trip through JSON and spot-check key fields.
        let json = serde_json::to_string(&case).unwrap();
        let deserialized: EvaluationCase = serde_json::from_str(&json).unwrap();

        assert_eq!(case.repository_url, deserialized.repository_url);
        assert_eq!(case.commit, deserialized.commit);
        assert_eq!(case.cursor_position, deserialized.cursor_position);
    }

    #[test]
    fn test_empty_commit_returns_error() {
        let commit = "";

        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "",
            "hash",
            Some(SplitPoint::Fraction(0.5)),
            Some(1),
        );

        assert!(result.is_err());
    }

    #[test]
    fn test_header_filtering() {
        let commit = r#"commit abc123
Author: Test
Date: Today

    Message

diff --git a/test.rs b/test.rs
index 123..456 789
////////////////////////////////////////////////////////////////////////////////
// First group
////////////////////////////////////////////////////////////////////////////////
--- a/test.rs
+++ b/test.rs
@@ -1,3 +1,4 @@
 fn main() {
+    code();
 }
"#;

        let result = generate_evaluation_example_from_ordered_commit(
            commit,
            "",
            "hash",
            Some(SplitPoint::Index(1)),
            Some(1),
        );

        assert!(result.is_ok());
        let case = result.unwrap();

        // The edit history should contain the group header (// lines)
        // but not the commit metadata
        assert!(!case.edit_history[0].contains("Author:"));
        assert!(!case.edit_history[0].contains("Date:"));
    }

    #[test]
    fn test_position_weight() {
        // High weight positions (natural pause points)
        assert_eq!(position_weight("foo(", 4), 10); // After '('
        assert_eq!(position_weight("a, b", 2), 10); // After ','
        assert_eq!(position_weight("x;", 2), 10); // After ';'
        assert_eq!(position_weight("a: b", 2), 10); // After ':'
        assert_eq!(position_weight("[", 1), 10); // After '['
        assert_eq!(position_weight("{", 1), 10); // After '{'

        // High weight for closing brackets
        assert_eq!(position_weight("foo)", 4), 8); // After ')'
        assert_eq!(position_weight("]", 1), 8); // After ']'
        assert_eq!(position_weight("}", 1), 8); // After '}'

        // High weight at end of identifier
        assert_eq!(position_weight("foo ", 3), 8); // End of 'foo' before space
        assert_eq!(position_weight("bar(", 3), 8); // End of 'bar' before '('

        // Medium weight for operators
        assert_eq!(position_weight("a + b", 3), 5); // After '+'
        assert_eq!(position_weight("x.", 2), 5); // After '.'
        assert_eq!(position_weight("a=b", 2), 5); // After '='

        // Medium weight for whitespace
        assert_eq!(position_weight("a ", 2), 6); // After space

        // Low weight mid-identifier
        assert_eq!(position_weight("foobar", 3), 1); // Mid-identifier 'foo|bar'

        // Edge cases
        assert_eq!(position_weight("", 0), 1); // Empty string
        assert_eq!(position_weight("a", 0), 1); // Position 0
    }

    #[test]
    fn test_weighted_select() {
        // Test that weighted selection returns correct indices
        let weights = vec![1, 10, 1];

        // With total weight 12, seed 0 should select index 0
        // seed 0 % 12 = 0, cumulative: 1 at idx 0, so returns 0
        assert_eq!(weighted_select(&weights, 0), 0);

        // seed 1 % 12 = 1, cumulative: 1 at idx 0 (1 < 1 is false), 11 at idx 1 (1 < 11 is true)
        assert_eq!(weighted_select(&weights, 1), 1);

        // seed 10 % 12 = 10, cumulative: 1, 11 at idx 1 (10 < 11 is true)
        assert_eq!(weighted_select(&weights, 10), 1);

        // seed 11 % 12 = 11, cumulative: 1, 11 at idx 1 (11 < 11 is false), 12 at idx 2 (11 < 12 is true)
        assert_eq!(weighted_select(&weights, 11), 2);

        // Empty weights should return 0
        let empty: Vec<u32> = vec![];
        assert_eq!(weighted_select(&empty, 42), 0);

        // Single weight should always return index 0
        let single = vec![10];
        assert_eq!(weighted_select(&single, 0), 0);
        assert_eq!(weighted_select(&single, 100), 0);
    }

    #[test]
    fn test_weighted_split_prefers_natural_boundaries() {
        // Test that with different seeds, weighted selection tends to prefer
        // positions after punctuation over mid-identifier positions
        let text_with_punctuation = "foo(bar, baz)";
        let text_mid_identifier = "foobar";

        // Position after '(' should have high weight
        let weight_after_paren = position_weight(text_with_punctuation, 4);
        // Position after ',' should have high weight
        let weight_after_comma = position_weight(text_with_punctuation, 8);
        // Position mid-identifier should have low weight
        let weight_mid_ident = position_weight(text_mid_identifier, 3);

        assert!(
            weight_after_paren > weight_mid_ident,
            "After '(' ({}) should be weighted higher than mid-identifier ({})",
            weight_after_paren,
            weight_mid_ident
        );
        assert!(
            weight_after_comma > weight_mid_ident,
            "After ',' ({}) should be weighted higher than mid-identifier ({})",
            weight_after_comma,
            weight_mid_ident
        );
    }

    #[test]
    fn test_imitate_human_edits_pure_insertion() {
        // Source patch is empty (no edits yet)
        // Target patch has a pure insertion (adding a new line)
        let source = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,2 @@
 fn main() {
 }
"#;
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("debug");
 }
"#;

        let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, 42);

        // Should have transformed the patches
        assert_ne!(
            new_src, source,
            "Source should be modified for pure insertion"
        );
        assert_ne!(
            new_tgt, target,
            "Target should be modified for pure insertion"
        );
        assert!(cursor.is_some(), "Cursor should be set");

        // Source should now have a partial addition
        let src_patch = Patch::parse_unified_diff(&new_src);
        assert!(
            src_patch.stats().added > 0,
            "Source should have added lines"
        );

        // Target should have both a deletion (of partial) and addition (of full)
        let tgt_patch = Patch::parse_unified_diff(&new_tgt);
        assert!(
            tgt_patch.stats().removed > 0,
            "Target should have removed lines (partial)"
        );
        assert!(
            tgt_patch.stats().added > 0,
            "Target should have added lines (full)"
        );

        // The cursor should be in test.rs
        let cursor = cursor.unwrap();
        assert_eq!(cursor.file, "test.rs");
    }

    #[test]
    fn test_imitate_human_edits_pure_insertion_empty_source() {
        // Source patch has no hunks at all
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello");
 }
"#;

        let (new_src, _new_tgt, cursor) = imitate_human_edits(source, target, 123);

        // Should have created a source patch with partial insertion
        assert!(!new_src.is_empty(), "Source should not be empty");
        assert!(cursor.is_some(), "Cursor should be set");

        let src_patch = Patch::parse_unified_diff(&new_src);
        assert!(
            src_patch.stats().added > 0,
            "Source should have added lines"
        );
    }

    #[test]
    fn test_imitate_human_edits_pure_insertion_intermediate_content() {
        // Verify the actual intermediate content is a realistic partial typing state
        let source = "";
        let target = r#"--- a/test.rs
+++ b/test.rs
@@ -1,2 +1,3 @@
 fn main() {
+    println!("hello world");
 }
"#;

        // Test with multiple seeds to see different split points
        let mut found_partial = false;
        for seed in 1..=50 {
            let (new_src, new_tgt, cursor) = imitate_human_edits(source, target, seed);

            if cursor.is_some() {
                let src_patch = Patch::parse_unified_diff(&new_src);
                let tgt_patch = Patch::parse_unified_diff(&new_tgt);

                // Find the added line in source
                for hunk in &src_patch.hunks {
                    for line in &hunk.lines {
                        if let PatchLine::Addition(content) = line {
                            // The partial line should be a prefix of the full line
                            let full_line = "    println!(\"hello world\");";
                            if content != full_line && full_line.starts_with(content) {
                                found_partial = true;

                                // Verify target has the partial as deletion
                                let mut has_deletion = false;
                                for tgt_hunk in &tgt_patch.hunks {
                                    for tgt_line in &tgt_hunk.lines {
                                        if let PatchLine::Deletion(del_content) = tgt_line {
                                            if del_content == content {
                                                has_deletion = true;
                                            }
                                        }
                                    }
                                }
                                assert!(
                                    has_deletion,
                                    "Target should have deletion of partial line"
                                );
                            }
                        }
                    }
                }
            }
        }

        assert!(
            found_partial,
            "At least one seed should produce a partial intermediate state"
        );
    }
}

typos.toml 🔗

@@ -57,6 +57,8 @@ extend-exclude = [
     "crates/multi_buffer/src/multi_buffer_tests.rs",
     # Macos apis
     "crates/gpui/src/platform/mac/dispatcher.rs",
+    # Tests contain partially incomplete words (by design)
+    "crates/edit_prediction_cli/src/split_commit.rs",
 ]
 
 [default]