ep: Parse "user accepted prediction" markers in evals (#49598)

Oleksiy Syvokon created

Also:
- Add two evals
- Remove duplicated Example 6 from the teacher prompt 

Release Notes:

- N/A

Change summary

crates/edit_prediction/src/example_spec.rs                                | 90 
crates/edit_prediction_cli/evals/flask--rename-accepted-prediction.md     | 98 
crates/edit_prediction_cli/evals/hello-world--rename-accepted-group-by.md | 81 
crates/edit_prediction_cli/src/prompts/teacher.md                         | 59 
typos.toml                                                                |  2 
5 files changed, 271 insertions(+), 59 deletions(-)

Detailed changes

crates/edit_prediction/src/example_spec.rs 🔗

@@ -181,6 +181,7 @@ const EDIT_HISTORY_HEADING: &str = "Edit History";
 const CURSOR_POSITION_HEADING: &str = "Cursor Position";
 const EXPECTED_PATCH_HEADING: &str = "Expected Patch";
 const REJECTED_PATCH_HEADING: &str = "Rejected Patch";
+const ACCEPTED_PREDICTION_MARKER: &str = "// User accepted prediction:";
 
 #[derive(Serialize, Deserialize)]
 struct FrontMatter<'a> {
@@ -352,6 +353,7 @@ impl ExampleSpec {
         }
 
         let mut current_section = Section::Start;
+        let mut next_edit_predicted = false;
 
         for event in parser {
             match event {
@@ -387,6 +389,12 @@ impl ExampleSpec {
                     anyhow::bail!("Unexpected heading level: {level}");
                 }
                 Event::Start(Tag::CodeBlock(kind)) => {
+                    if current_section == Section::EditHistory
+                        && text.trim() == ACCEPTED_PREDICTION_MARKER
+                    {
+                        next_edit_predicted = true;
+                    }
+                    text.clear();
                     match kind {
                         CodeBlockKind::Fenced(info) => {
                             block_info = info;
@@ -407,6 +415,11 @@ impl ExampleSpec {
                             spec.uncommitted_diff = mem::take(&mut text);
                         }
                         Section::EditHistory => {
+                            if next_edit_predicted {
+                                spec.edit_history
+                                    .push_str(&format!("{}\n", ACCEPTED_PREDICTION_MARKER));
+                                next_edit_predicted = false;
+                            }
                             spec.edit_history.push_str(&mem::take(&mut text));
                         }
                         Section::CursorPosition => {
@@ -908,4 +921,81 @@ mod tests {
         let results = spec.expected_patches_with_cursor_positions();
         assert_eq!(results, vec![(clean_patch, None)]);
     }
+
+    #[test]
+    fn test_from_markdown_accepted_prediction_marker() {
+        let markdown = indoc! {r#"
+            +++
+            repository_url = "https://github.com/example/repo"
+            revision = "abc123"
+            +++
+
+            ## Edit History
+
+            ```diff
+            --- a/src/main.rs
+            +++ b/src/main.rs
+            @@ -1,3 +1,3 @@
+            -fn hello() {}
+            +fn hello_world() {}
+            ```
+
+            // User accepted prediction:
+            ```diff
+            --- a/src/main.rs
+            +++ b/src/main.rs
+            @@ -1,3 +1,3 @@
+            -fn hello_world() {}
+            +fn hello_world() { println!("hi"); }
+            ```
+
+            ```diff
+            --- a/src/main.rs
+            +++ b/src/main.rs
+            @@ -1,3 +1,3 @@
+            -fn hello_world() { println!("hi"); }
+            +fn hello_world() { println!("hello"); }
+            ```
+
+            ## Cursor Position
+
+            ```src/main.rs
+            fn hello_world() { println!("hello"); }
+            #                                    ^[CURSOR_POSITION]
+            ```
+
+            ## Expected Patch
+
+            ```diff
+            --- a/src/main.rs
+            +++ b/src/main.rs
+            @@ -1,3 +1,3 @@
+            -fn hello_world() { println!("hello"); }
+            +fn hello_world() { println!("hello, world!"); }
+            ```
+        "#};
+
+        let spec = ExampleSpec::from_markdown(markdown).unwrap();
+
+        // The first diff should NOT have the marker
+        assert!(spec.edit_history.starts_with("--- a/src/main.rs"));
+
+        // The second diff should be preceded by the accepted prediction marker
+        assert!(
+            spec.edit_history
+                .contains("// User accepted prediction:\n--- a/src/main.rs")
+        );
+
+        // Count occurrences of the marker - should be exactly one
+        let marker_count = spec
+            .edit_history
+            .matches("// User accepted prediction:")
+            .count();
+        assert_eq!(marker_count, 1);
+
+        // The third diff should NOT have the marker
+        // Verify all three diffs are present
+        let diff_count = spec.edit_history.matches("--- a/src/main.rs").count();
+        assert_eq!(diff_count, 3);
+    }
 }

crates/edit_prediction_cli/evals/flask--rename-accepted-prediction.md 🔗

@@ -0,0 +1,98 @@
++++
+repository_url = "https://github.com/pallets/flask"
+revision = "2fec0b206c6e83ea813ab26597e15c96fab08be7"
++++
+
+## Edit History
+
+```diff
+--- a/tests/test_basic.py
++++ b/tests/test_basic.py
+@@ -356,3 +356,6 @@
+     cookie = rv.headers["set-cookie"].lower()
+     assert "samesite=lax" in cookie
+
+
++de
++
++
+ def test_missing_session(app):
+```
+
+// User accepted prediction:
+```diff
+--- a/tests/test_basic.py
++++ b/tests/test_basic.py
+@@ -358,6 +358,14 @@
+
+
+-de
++def test_session_cookie_httponly(app, client):
++    app.config["SESSION_COOKIE_HTTPONLY"] = True
++
++    @app.route("/")
++    def index():
++        flask.session["testing"] = 42
++        return "Hello World"
++
++    rv = client.get("/")
++    assert "httponly" in rv.headers["set-cookie"].lower()
+
+
+ def test_missing_session(app):
+```
+
+```diff
+--- a/tests/test_basic.py
++++ b/tests/test_basic.py
+@@ -358,14 +358,14 @@
+
+
+-def test_session_cookie_httponly(app, client):
++def test_session_cookie_secur(app, client):
+     app.config["SESSION_COOKIE_HTTPONLY"] = True
+```
+
+## Cursor Position
+
+```tests/test_basic.py
+    cookie = rv.headers["set-cookie"].lower()
+    assert "samesite=lax" in cookie
+
+
+def test_session_cookie_secur(app, client):
+#                            ^[CURSOR_POSITION]
+    app.config["SESSION_COOKIE_HTTPONLY"] = True
+
+    @app.route("/")
+    def index():
+        flask.session["testing"] = 42
+        return "Hello World"
+
+    rv = client.get("/")
+    assert "httponly" in rv.headers["set-cookie"].lower()
+
+
+def test_missing_session(app):
+```
+
+## Expected Patch
+
+```diff
+--- a/tests/test_basic.py
++++ b/tests/test_basic.py
+@@ -358,14 +358,14 @@
+-def test_session_cookie_secur(app, client):
+-    app.config["SESSION_COOKIE_HTTPONLY"] = True
++def test_session_cookie_secure(app, client):
++    app.config["SESSION_COOKIE_SECURE"] = True
+
+     @app.route("/")
+     def index():
+         flask.session["testing"] = 42
+         return "Hello World"
+
+     rv = client.get("/")
+-    assert "httponly" in rv.headers["set-cookie"].lower()
++    assert "secure" in rv.headers["set-cookie"].lower()
+```

crates/edit_prediction_cli/evals/hello-world--rename-accepted-group-by.md 🔗

@@ -0,0 +1,81 @@
++++
+repository_url = "https://github.com/octocat/hello-world"
+revision = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d"
++++
+
+## Edit History
+
+```diff
+--- a/README
++++ b/README
+@@ -1,1 +1,6 @@
+-Hello World!
++function filterByStatus(items, status) {
++    return items.filter(item => item.status === status);
++}
++
++function groupBy
++
+```
+
+// User accepted prediction:
+```diff
+--- a/README
++++ b/README
+@@ -4,3 +4,9 @@
+ 
+-function groupBy
++function groupByStatus(items) {
++    return items.reduce((groups, item) => {
++        const key = item.status;
++        (groups[key] = groups[key] || []).push(item);
++        return groups;
++    }, {});
++}
+ 
+```
+
+```diff
+--- a/README
++++ b/README
+@@ -4,4 +4,4 @@
+ 
+-function groupByStatus(items) {
++function groupByCat(items) {
+     return items.reduce((groups, item) => {
+```
+
+## Cursor Position
+
+```README
+function filterByStatus(items, status) {
+    return items.filter(item => item.status === status);
+}
+
+function groupByCat(items) {
+#                  ^[CURSOR_POSITION]
+    return items.reduce((groups, item) => {
+        const key = item.status;
+        (groups[key] = groups[key] || []).push(item);
+        return groups;
+    }, {});
+}
+
+```
+
+## Expected Patch
+
+```diff
+--- a/README
++++ b/README
+@@ -5,7 +5,7 @@
+-function groupByCat(items) {
++function groupByCategory(items) {
+#                        ^[CURSOR_POSITION]
+     return items.reduce((groups, item) => {
+-        const key = item.status;
++        const key = item.category;
+         (groups[key] = groups[key] || []).push(item);
+         return groups;
+     }, {});
+```

crates/edit_prediction_cli/src/prompts/teacher.md 🔗

@@ -238,65 +238,6 @@ The user just fixed a bug in the `add` function, changing subtraction to additio
 NO_EDITS
 `````
 
-## Example 6
-
-The user accepted a prediction for a function, then started renaming it. The original arguments were auto-generated (marked with `// User accepted prediction:`), so they CAN be updated to match the new function name. This is NOT reverting user input—it's improving auto-generated scaffolding.
-
-### User Edit History
-
-`````
---- a/math_utils.py
-+++ b/math_utils.py
-@@ -3,3 +3,5 @@
- def calculate_rectangle_area(width, height):
-     return width * height
-
-+de
-
-// User accepted prediction:
---- a/math_utils.py
-+++ b/math_utils.py
-@@ -3,5 +3,7 @@
- def calculate_rectangle_area(width, height):
-     return width * height
-
--de
-+def calculate_rectangle_perimeter(width, height):
-+
-
---- a/math_utils.py
-+++ b/math_utils.py
-@@ -5,5 +5,5 @@
-     return width * height
-
--def calculate_rectangle_perimeter(width, height):
-+def calculate_sq_perimeter(width, height):
-
-`````
-
-### Current File
-
-`````math_utils.py
-def calculate_rectangle_area(width, height):
-    return width * height
-
-<|editable_region_start|>
-def calculate_sq<|user_cursor|>_perimeter(width, height):
-
-<|editable_region_end|>
-`````
-
-### Output
-
-The user accepted a prediction for `calculate_rectangle_perimeter(width, height)`, then started renaming `rectangle` to `square`. Since squares have equal sides, the arguments should change from `(width, height)` to `(side)`. The arguments were auto-generated (from an accepted prediction), so modifying them is appropriate.
-
-`````
-<|editable_region_start|>
-def calculate_square_perimeter(side):
-    <|user_cursor|>
-<|editable_region_end|>
-`````
-
 ## Example 5
 
 The user just deleted code, leaving behind what looks incomplete. You must NOT "complete" it by restoring deleted content—that would undo their edit. Output NO_EDITS. **This is the correct response even though the code appears broken.**

typos.toml 🔗

@@ -60,6 +60,8 @@ extend-exclude = [
     "crates/gpui/src/platform/mac/dispatcher.rs",
     # Tests contain partially incomplete words (by design)
     "crates/edit_prediction_cli/src/split_commit.rs",
+    # Eval examples contain intentionally partial words (e.g. "secur" for "secure")
+    "crates/edit_prediction_cli/evals/",
     # Tests contain `baˇr` that cause `"ba" should be "by" or "be".`-like false-positives
     "crates/editor/src/document_symbols.rs",
 ]