From 08e9b2e8481bb87f3ae76843d1f24f4d5a29f0bc Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 30 Oct 2023 13:32:47 -0400 Subject: [PATCH] added parsing support for <|S| |E|> spans --- crates/assistant/src/codegen.rs | 108 +++++++++++++++++++++++++------- crates/assistant/src/prompts.rs | 18 +++--- 2 files changed, 95 insertions(+), 31 deletions(-) diff --git a/crates/assistant/src/codegen.rs b/crates/assistant/src/codegen.rs index b6ef6b5cfa7fef58936828e0f121946290bc8b48..b26b1713b2fa18013e6aeedd13b4c9ddb735fe0d 100644 --- a/crates/assistant/src/codegen.rs +++ b/crates/assistant/src/codegen.rs @@ -117,7 +117,7 @@ impl Codegen { let (mut hunks_tx, mut hunks_rx) = mpsc::channel(1); let diff = cx.background().spawn(async move { - let chunks = strip_markdown_codeblock(response.await?); + let chunks = strip_invalid_spans_from_codeblock(response.await?); futures::pin_mut!(chunks); let mut diff = StreamingDiff::new(selected_text.to_string()); @@ -278,12 +278,13 @@ impl Codegen { } } -fn strip_markdown_codeblock( +fn strip_invalid_spans_from_codeblock( stream: impl Stream>, ) -> impl Stream> { let mut first_line = true; let mut buffer = String::new(); - let mut starts_with_fenced_code_block = false; + let mut starts_with_markdown_codeblock = false; + let mut includes_start_or_end_span = false; stream.filter_map(move |chunk| { let chunk = match chunk { Ok(chunk) => chunk, @@ -291,11 +292,31 @@ fn strip_markdown_codeblock( }; buffer.push_str(&chunk); + if buffer.len() > "<|S|".len() && buffer.starts_with("<|S|") { + includes_start_or_end_span = true; + + buffer = buffer + .strip_prefix("<|S|>") + .or_else(|| buffer.strip_prefix("<|S|")) + .unwrap_or(&buffer) + .to_string(); + } else if buffer.ends_with("|E|>") { + includes_start_or_end_span = true; + } else if buffer.starts_with("<|") + || buffer.starts_with("<|S") + || buffer.starts_with("<|S|") + || buffer.ends_with("|") + || buffer.ends_with("|E") + || buffer.ends_with("|E|") + { + return future::ready(None); + } + if first_line { if buffer == "" || buffer == "`" || buffer == "``" { return future::ready(None); } else if buffer.starts_with("```") { - starts_with_fenced_code_block = true; + starts_with_markdown_codeblock = true; if let Some(newline_ix) = buffer.find('\n') { buffer.replace_range(..newline_ix + 1, ""); first_line = false; @@ -305,16 +326,26 @@ fn strip_markdown_codeblock( } } - let text = if starts_with_fenced_code_block { - buffer + let mut text = buffer.to_string(); + if starts_with_markdown_codeblock { + text = text .strip_suffix("\n```\n") - .or_else(|| buffer.strip_suffix("\n```")) - .or_else(|| buffer.strip_suffix("\n``")) - .or_else(|| buffer.strip_suffix("\n`")) - .or_else(|| buffer.strip_suffix('\n')) - .unwrap_or(&buffer) - } else { - &buffer + .or_else(|| text.strip_suffix("\n```")) + .or_else(|| text.strip_suffix("\n``")) + .or_else(|| text.strip_suffix("\n`")) + .or_else(|| text.strip_suffix('\n')) + .unwrap_or(&text) + .to_string(); + } + + if includes_start_or_end_span { + text = text + .strip_suffix("|E|>") + .or_else(|| text.strip_suffix("E|>")) + .or_else(|| text.strip_prefix("|>")) + .or_else(|| text.strip_prefix(">")) + .unwrap_or(&text) + .to_string(); }; if text.contains('\n') { @@ -327,6 +358,7 @@ fn strip_markdown_codeblock( } else { Some(Ok(buffer.clone())) }; + buffer = remainder; future::ready(result) }) @@ -537,50 +569,82 @@ mod tests { } #[gpui::test] - async fn test_strip_markdown_codeblock() { + async fn test_strip_invalid_spans_from_codeblock() { assert_eq!( - strip_markdown_codeblock(chunks("Lorem ipsum dolor", 2)) + strip_invalid_spans_from_codeblock(chunks("Lorem ipsum dolor", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, "Lorem ipsum dolor" ); assert_eq!( - strip_markdown_codeblock(chunks("```\nLorem ipsum dolor", 2)) + strip_invalid_spans_from_codeblock(chunks("```\nLorem ipsum dolor", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, "Lorem ipsum dolor" ); assert_eq!( - strip_markdown_codeblock(chunks("```\nLorem ipsum dolor\n```", 2)) + strip_invalid_spans_from_codeblock(chunks("```\nLorem ipsum dolor\n```", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, "Lorem ipsum dolor" ); assert_eq!( - strip_markdown_codeblock(chunks("```\nLorem ipsum dolor\n```\n", 2)) + strip_invalid_spans_from_codeblock(chunks("```\nLorem ipsum dolor\n```\n", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, "Lorem ipsum dolor" ); assert_eq!( - strip_markdown_codeblock(chunks("```html\n```js\nLorem ipsum dolor\n```\n```", 2)) + strip_invalid_spans_from_codeblock(chunks( + "```html\n```js\nLorem ipsum dolor\n```\n```", + 2 + )) + .map(|chunk| chunk.unwrap()) + .collect::() + .await, + "```js\nLorem ipsum dolor\n```" + ); + assert_eq!( + strip_invalid_spans_from_codeblock(chunks("``\nLorem ipsum dolor\n```", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, - "```js\nLorem ipsum dolor\n```" + "``\nLorem ipsum dolor\n```" ); assert_eq!( - strip_markdown_codeblock(chunks("``\nLorem ipsum dolor\n```", 2)) + strip_invalid_spans_from_codeblock(chunks("<|S|Lorem ipsum|E|>", 2)) .map(|chunk| chunk.unwrap()) .collect::() .await, - "``\nLorem ipsum dolor\n```" + "Lorem ipsum" ); + assert_eq!( + strip_invalid_spans_from_codeblock(chunks("<|S|>Lorem ipsum", 2)) + .map(|chunk| chunk.unwrap()) + .collect::() + .await, + "Lorem ipsum" + ); + + assert_eq!( + strip_invalid_spans_from_codeblock(chunks("```\n<|S|>Lorem ipsum\n```", 2)) + .map(|chunk| chunk.unwrap()) + .collect::() + .await, + "Lorem ipsum" + ); + assert_eq!( + strip_invalid_spans_from_codeblock(chunks("```\n<|S|Lorem ipsum|E|>\n```", 2)) + .map(|chunk| chunk.unwrap()) + .collect::() + .await, + "Lorem ipsum" + ); fn chunks(text: &str, size: usize) -> impl Stream> { stream::iter( text.chars() diff --git a/crates/assistant/src/prompts.rs b/crates/assistant/src/prompts.rs index dffcbc29234d3f24174d1d9a6610045105eae890..66425d31f7877f5742b251505f91ae0ef86511b2 100644 --- a/crates/assistant/src/prompts.rs +++ b/crates/assistant/src/prompts.rs @@ -79,12 +79,12 @@ fn summarize(buffer: &BufferSnapshot, selected_range: Range) -> S if !flushed_selection { // The collapsed node ends after the selection starts, so we'll flush the selection first. summary.extend(buffer.text_for_range(offset..selected_range.start)); - summary.push_str("<|START|"); + summary.push_str("<|S|"); if selected_range.end == selected_range.start { summary.push_str(">"); } else { summary.extend(buffer.text_for_range(selected_range.clone())); - summary.push_str("|END|>"); + summary.push_str("|E|>"); } offset = selected_range.end; flushed_selection = true; @@ -106,12 +106,12 @@ fn summarize(buffer: &BufferSnapshot, selected_range: Range) -> S // Flush selection if we haven't already done so. if !flushed_selection && offset <= selected_range.start { summary.extend(buffer.text_for_range(offset..selected_range.start)); - summary.push_str("<|START|"); + summary.push_str("<|S|"); if selected_range.end == selected_range.start { summary.push_str(">"); } else { summary.extend(buffer.text_for_range(selected_range.clone())); - summary.push_str("|END|>"); + summary.push_str("|E|>"); } offset = selected_range.end; } @@ -259,7 +259,7 @@ pub(crate) mod tests { summarize(&snapshot, Point::new(1, 4)..Point::new(1, 4)), indoc! {" struct X { - <|START|>a: usize, + <|S|>a: usize, b: usize, } @@ -285,7 +285,7 @@ pub(crate) mod tests { impl X { fn new() -> Self { - let <|START|a |END|>= 1; + let <|S|a |E|>= 1; let b = 2; Self { a, b } } @@ -306,7 +306,7 @@ pub(crate) mod tests { } impl X { - <|START|> + <|S|> fn new() -> Self {} pub fn a(&self, param: bool) -> usize {} @@ -332,7 +332,7 @@ pub(crate) mod tests { pub fn b(&self) -> usize {} } - <|START|>"} + <|S|>"} ); // Ensure nested functions get collapsed properly. @@ -368,7 +368,7 @@ pub(crate) mod tests { assert_eq!( summarize(&snapshot, Point::new(0, 0)..Point::new(0, 0)), indoc! {" - <|START|>struct X { + <|S|>struct X { a: usize, b: usize, }