ep: Make `extract_last_codeblock` more robust (#49347)

Oleksiy Syvokon created

Release Notes:

- N/A

Change summary

crates/edit_prediction_cli/src/format_prompt.rs | 111 +++++++++++++-----
crates/edit_prediction_cli/src/repair.rs        |  19 +-
2 files changed, 90 insertions(+), 40 deletions(-)

Detailed changes

crates/edit_prediction_cli/src/format_prompt.rs 🔗

@@ -210,9 +210,10 @@ impl TeacherPrompt {
 
     pub fn parse(example: &Example, response: &str) -> Result<(String, Option<ActualCursor>)> {
         // Check if the model indicated no edits are needed
-        let last_codeblock = extract_last_codeblock(&response);
-        if last_codeblock.trim() == Self::NO_EDITS {
-            return Ok((String::new(), None));
+        if let Some(last_codeblock) = extract_last_codeblock(&response) {
+            if last_codeblock.trim() == Self::NO_EDITS {
+                return Ok((String::new(), None));
+            }
         }
 
         // Extract updated (new) editable region from the model response.
@@ -431,36 +432,49 @@ pub fn extract_cursor_excerpt_from_example(example: &Example) -> Option<String>
     Some(result)
 }
 
-pub(crate) fn extract_last_codeblock(text: &str) -> String {
-    let mut last_block = None;
-    let mut search_start = 0;
-
-    while let Some(start) = text[search_start..].find("```") {
-        let start = start + search_start;
-        let bytes = text.as_bytes();
-        let mut backtick_end = start;
-
-        while backtick_end < bytes.len() && bytes[backtick_end] == b'`' {
-            backtick_end += 1;
-        }
+pub(crate) fn extract_last_codeblock(text: &str) -> Option<String> {
+    let lines: Vec<&str> = text.lines().collect();
 
-        let backtick_count = backtick_end - start;
-        let closing_pattern = format!("\n{}", "`".repeat(backtick_count));
+    // Search from the end for a closing fence (line containing only backticks, 3+)
+    let mut closing_line_idx = None;
+    let mut backtick_count = 0;
 
-        while backtick_end < bytes.len() && bytes[backtick_end] != b'\n' {
-            backtick_end += 1;
+    for i in (0..lines.len()).rev() {
+        let line = lines[i].trim();
+        if line.len() >= 3 && line.chars().all(|c| c == '`') {
+            closing_line_idx = Some(i);
+            backtick_count = line.len();
+            break;
         }
+    }
 
-        if let Some(end_pos) = text[backtick_end..].find(&closing_pattern) {
-            let code_block = &text[backtick_end + 1..backtick_end + end_pos + 1];
-            last_block = Some(code_block.to_string());
-            search_start = backtick_end + end_pos + closing_pattern.len();
-        } else {
-            break;
+    let closing_idx = closing_line_idx?;
+
+    // Search backwards for matching opening fence
+    // Opening fence starts with same backtick count, possibly followed by language/metadata
+    let opening_pattern = "`".repeat(backtick_count);
+
+    for i in (0..closing_idx).rev() {
+        let line = lines[i];
+        if line.starts_with(&opening_pattern) {
+            // Ensure it's exactly the right number of backticks (not more)
+            let rest = &line[backtick_count..];
+            if rest.is_empty() || !rest.starts_with('`') {
+                // Found matching opening fence
+                // Extract content between opening and closing (exclusive)
+                if closing_idx > i + 1 {
+                    let content = lines[i + 1..closing_idx].join("\n");
+                    // Preserve trailing newline to match previous behavior
+                    return Some(format!("{}\n", content));
+                } else {
+                    // Empty block
+                    return Some(String::new());
+                }
+            }
         }
     }
 
-    last_block.unwrap_or_else(|| text.to_string())
+    None
 }
 
 #[cfg(test)]
@@ -480,7 +494,7 @@ mod tests {
             last block
             `````
             "};
-        let last_block = extract_last_codeblock(text);
+        let last_block = extract_last_codeblock(text).unwrap();
         assert_eq!(last_block, "last block\n");
     }
 
@@ -493,7 +507,7 @@ mod tests {
             more content
             `````
             "};
-        let last_block = extract_last_codeblock(text);
+        let last_block = extract_last_codeblock(text).unwrap();
         assert_eq!(
             last_block,
             "content with ``` inline\nand ```python nested\nmore content\n"
@@ -508,7 +522,7 @@ mod tests {
             and here```more```stuff
             `````
             "};
-        let last_block = extract_last_codeblock(text);
+        let last_block = extract_last_codeblock(text).unwrap();
         assert_eq!(
             last_block,
             "here is some `code` with inline backticks\nand here```more```stuff\n"
@@ -557,7 +571,7 @@ mod tests {
             ```
             `````
             "#};
-        let last_block = extract_last_codeblock(text);
+        let last_block = extract_last_codeblock(text).unwrap();
         assert_eq!(
             last_block,
             indoc::indoc! {r#"
@@ -599,7 +613,42 @@ mod tests {
             NO_EDITS
             `````
         "};
-        let codeblock = extract_last_codeblock(response);
+        let codeblock = extract_last_codeblock(response).unwrap();
         assert_eq!(codeblock.trim(), TeacherPrompt::NO_EDITS);
     }
+
+    #[test]
+    fn test_extract_codeblock_no_valid_block() {
+        // Text with no code blocks should return None
+        let text = "Just some plain text without any code blocks";
+        assert!(extract_last_codeblock(text).is_none());
+
+        // Unclosed code block should return None
+        let text = indoc::indoc! {"
+            ```
+            unclosed block
+        "};
+        assert!(extract_last_codeblock(text).is_none());
+
+        // Analysis text with nested markdown but no proper outer block
+        let text = indoc::indoc! {"
+            # Analysis
+            Looking at this:
+            ```
+            some code
+            ```
+            But then more analysis without wrapping block
+        "};
+        // This should find the inner block
+        let result = extract_last_codeblock(text).unwrap();
+        assert_eq!(result, "some code\n");
+    }
+
+    #[test]
+    fn test_extract_codeblock_no_trailing_newline() {
+        // Text ending without trailing newline after closing fence
+        let text = "`````\ncontent here\n`````";
+        let result = extract_last_codeblock(text).unwrap();
+        assert_eq!(result, "content here\n");
+    }
 }

crates/edit_prediction_cli/src/repair.rs 🔗

@@ -227,15 +227,16 @@ pub fn needs_repair(example: &Example, confidence_threshold: u8) -> bool {
 /// Handles the `KEEP_PREVIOUS` sentinel by copying the teacher's prediction,
 /// and delegates normal output to `TeacherPrompt::parse`.
 pub fn parse(example: &Example, actual_output: &str) -> Result<(String, Option<ActualCursor>)> {
-    let last_codeblock = extract_last_codeblock(actual_output);
-    if last_codeblock.trim() == KEEP_PREVIOUS {
-        let original = example
-            .predictions
-            .first()
-            .context("no original prediction to keep")?;
-        let patch = original.actual_patch.clone().unwrap_or_default();
-        let cursor = original.actual_cursor.clone();
-        return Ok((patch, cursor));
+    if let Some(last_codeblock) = extract_last_codeblock(actual_output) {
+        if last_codeblock.trim() == KEEP_PREVIOUS {
+            let original = example
+                .predictions
+                .first()
+                .context("no original prediction to keep")?;
+            let patch = original.actual_patch.clone().unwrap_or_default();
+            let cursor = original.actual_cursor.clone();
+            return Ok((patch, cursor));
+        }
     }
 
     TeacherPrompt::parse(example, actual_output)