editor: Improve rewrap of markdown lists, todos, and block quotes (#33702)

Smit Barmase created 8 months ago

Closes #19644 #18151

Rewrapping markdown lists (unordered, ordered, and to-do lists) and block
quotes now wraps each item separately instead of merging them together.
Additionally, it correctly indents subsequent lines.

With this input: 

```md
1. This is a list item that is short.
2. This list item is a bit longer because I want to see if it wraps correctly after a rewrap operation in Zed. What do you think?
3. another short item
```

Output would be:

```md
1. This is a list item that is short.
2. This list item is a bit longer because I want to see if it wraps correctly
   after a rewrap operation in Zed. What do you think?
3. another short item
```

Instead of:

```md
1. This is a list item that is short. 2. This list item is a bit longer because 
I want to see if it wraps correctly after a rewrap operation in Zed. What 
do you think? 3. another short item
```

Release Notes:

- Improved rewrap for markdown lists, todos, and block quotes.

Change summary

Cargo.lock                                |   1 
crates/editor/Cargo.toml                  |   1 
crates/editor/src/editor.rs               | 194 +++++++++++++++++-------
crates/editor/src/editor_tests.rs         |  81 ++++++++++
crates/language/src/language.rs           |  34 ++++
crates/languages/src/markdown/config.toml |   6 
6 files changed, 256 insertions(+), 61 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -4813,6 +4813,7 @@ dependencies = [
  "pretty_assertions",
  "project",
  "rand 0.8.5",
+ "regex",
  "release_channel",
  "rpc",
  "schemars",

crates/editor/Cargo.toml 🔗

@@ -61,6 +61,7 @@ parking_lot.workspace = true
 pretty_assertions.workspace = true
 project.workspace = true
 rand.workspace = true
+regex.workspace = true
 rpc.workspace = true
 schemars.workspace = true
 serde.workspace = true

crates/editor/src/editor.rs 🔗

@@ -11541,66 +11541,90 @@ impl Editor {
             let language_settings = buffer.language_settings_at(selection.head(), cx);
             let language_scope = buffer.language_scope_at(selection.head());
 
+            let indent_and_prefix_for_row =
+                |row: u32| -> (IndentSize, Option<String>, Option<String>) {
+                    let indent = buffer.indent_size_for_line(MultiBufferRow(row));
+                    let (comment_prefix, rewrap_prefix) =
+                        if let Some(language_scope) = &language_scope {
+                            let indent_end = Point::new(row, indent.len);
+                            let comment_prefix = language_scope
+                                .line_comment_prefixes()
+                                .iter()
+                                .find(|prefix| buffer.contains_str_at(indent_end, prefix))
+                                .map(|prefix| prefix.to_string());
+                            let line_end = Point::new(row, buffer.line_len(MultiBufferRow(row)));
+                            let line_text_after_indent = buffer
+                                .text_for_range(indent_end..line_end)
+                                .collect::<String>();
+                            let rewrap_prefix = language_scope
+                                .rewrap_prefixes()
+                                .iter()
+                                .find_map(|prefix_regex| {
+                                    prefix_regex.find(&line_text_after_indent).map(|mat| {
+                                        if mat.start() == 0 {
+                                            Some(mat.as_str().to_string())
+                                        } else {
+                                            None
+                                        }
+                                    })
+                                })
+                                .flatten();
+                            (comment_prefix, rewrap_prefix)
+                        } else {
+                            (None, None)
+                        };
+                    (indent, comment_prefix, rewrap_prefix)
+                };
+
             let mut ranges = Vec::new();
-            let mut current_range_start = first_row;
             let from_empty_selection = selection.is_empty();
 
+            let mut current_range_start = first_row;
             let mut prev_row = first_row;
-            let mut prev_indent = buffer.indent_size_for_line(MultiBufferRow(first_row));
-            let mut prev_comment_prefix = if let Some(language_scope) = &language_scope {
-                let indent = buffer.indent_size_for_line(MultiBufferRow(first_row));
-                let indent_end = Point::new(first_row, indent.len);
-                language_scope
-                    .line_comment_prefixes()
-                    .iter()
-                    .find(|prefix| buffer.contains_str_at(indent_end, prefix))
-                    .cloned()
-            } else {
-                None
-            };
+            let (
+                mut current_range_indent,
+                mut current_range_comment_prefix,
+                mut current_range_rewrap_prefix,
+            ) = indent_and_prefix_for_row(first_row);
 
             for row in non_blank_rows_iter.skip(1) {
                 let has_paragraph_break = row > prev_row + 1;
 
-                let row_indent = buffer.indent_size_for_line(MultiBufferRow(row));
-                let row_comment_prefix = if let Some(language_scope) = &language_scope {
-                    let indent = buffer.indent_size_for_line(MultiBufferRow(row));
-                    let indent_end = Point::new(row, indent.len);
-                    language_scope
-                        .line_comment_prefixes()
-                        .iter()
-                        .find(|prefix| buffer.contains_str_at(indent_end, prefix))
-                        .cloned()
-                } else {
-                    None
-                };
+                let (row_indent, row_comment_prefix, row_rewrap_prefix) =
+                    indent_and_prefix_for_row(row);
+
+                let has_indent_change = row_indent != current_range_indent;
+                let has_comment_change = row_comment_prefix != current_range_comment_prefix;
 
-                let has_boundary_change =
-                    row_indent != prev_indent || row_comment_prefix != prev_comment_prefix;
+                let has_boundary_change = has_comment_change
+                    || row_rewrap_prefix.is_some()
+                    || (has_indent_change && current_range_comment_prefix.is_some());
 
                 if has_paragraph_break || has_boundary_change {
                     ranges.push((
                         language_settings.clone(),
                         Point::new(current_range_start, 0)
                             ..Point::new(prev_row, buffer.line_len(MultiBufferRow(prev_row))),
-                        prev_indent,
-                        prev_comment_prefix.clone(),
+                        current_range_indent,
+                        current_range_comment_prefix.clone(),
+                        current_range_rewrap_prefix.clone(),
                         from_empty_selection,
                     ));
                     current_range_start = row;
+                    current_range_indent = row_indent;
+                    current_range_comment_prefix = row_comment_prefix;
+                    current_range_rewrap_prefix = row_rewrap_prefix;
                 }
-
                 prev_row = row;
-                prev_indent = row_indent;
-                prev_comment_prefix = row_comment_prefix;
             }
 
             ranges.push((
                 language_settings.clone(),
                 Point::new(current_range_start, 0)
                     ..Point::new(prev_row, buffer.line_len(MultiBufferRow(prev_row))),
-                prev_indent,
-                prev_comment_prefix,
+                current_range_indent,
+                current_range_comment_prefix,
+                current_range_rewrap_prefix,
                 from_empty_selection,
             ));
 
@@ -11610,8 +11634,14 @@ impl Editor {
         let mut edits = Vec::new();
         let mut rewrapped_row_ranges = Vec::<RangeInclusive<u32>>::new();
 
-        for (language_settings, wrap_range, indent_size, comment_prefix, from_empty_selection) in
-            wrap_ranges
+        for (
+            language_settings,
+            wrap_range,
+            indent_size,
+            comment_prefix,
+            rewrap_prefix,
+            from_empty_selection,
+        ) in wrap_ranges
         {
             let mut start_row = wrap_range.start.row;
             let mut end_row = wrap_range.end.row;
@@ -11627,12 +11657,16 @@ impl Editor {
 
             let tab_size = language_settings.tab_size;
 
-            let mut line_prefix = indent_size.chars().collect::<String>();
+            let indent_prefix = indent_size.chars().collect::<String>();
+            let mut line_prefix = indent_prefix.clone();
             let mut inside_comment = false;
             if let Some(prefix) = &comment_prefix {
                 line_prefix.push_str(prefix);
                 inside_comment = true;
             }
+            if let Some(prefix) = &rewrap_prefix {
+                line_prefix.push_str(prefix);
+            }
 
             let allow_rewrap_based_on_language = match language_settings.allow_rewrap {
                 RewrapBehavior::InComments => inside_comment,
@@ -11679,12 +11713,18 @@ impl Editor {
             let selection_text = buffer.text_for_range(start..end).collect::<String>();
             let Some(lines_without_prefixes) = selection_text
                 .lines()
-                .map(|line| {
-                    line.strip_prefix(&line_prefix)
-                        .or_else(|| line.trim_start().strip_prefix(&line_prefix.trim_start()))
-                        .with_context(|| {
-                            format!("line did not start with prefix {line_prefix:?}: {line:?}")
-                        })
+                .enumerate()
+                .map(|(ix, line)| {
+                    let line_trimmed = line.trim_start();
+                    if rewrap_prefix.is_some() && ix > 0 {
+                        Ok(line_trimmed)
+                    } else {
+                        line_trimmed
+                            .strip_prefix(&line_prefix.trim_start())
+                            .with_context(|| {
+                                format!("line did not start with prefix {line_prefix:?}: {line:?}")
+                            })
+                    }
                 })
                 .collect::<Result<Vec<_>, _>>()
                 .log_err()
@@ -11697,8 +11737,16 @@ impl Editor {
                     .language_settings_at(Point::new(start_row, 0), cx)
                     .preferred_line_length as usize
             });
+
+            let subsequent_lines_prefix = if let Some(rewrap_prefix_str) = &rewrap_prefix {
+                format!("{}{}", indent_prefix, " ".repeat(rewrap_prefix_str.len()))
+            } else {
+                line_prefix.clone()
+            };
+
             let wrapped_text = wrap_with_prefix(
                 line_prefix,
+                subsequent_lines_prefix,
                 lines_without_prefixes.join("\n"),
                 wrap_column,
                 tab_size,
@@ -21200,18 +21248,22 @@ fn test_word_breaking_tokenizer() {
 }
 
 fn wrap_with_prefix(
-    line_prefix: String,
+    first_line_prefix: String,
+    subsequent_lines_prefix: String,
     unwrapped_text: String,
     wrap_column: usize,
     tab_size: NonZeroU32,
     preserve_existing_whitespace: bool,
 ) -> String {
-    let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size);
+    let first_line_prefix_len = char_len_with_expanded_tabs(0, &first_line_prefix, tab_size);
+    let subsequent_lines_prefix_len =
+        char_len_with_expanded_tabs(0, &subsequent_lines_prefix, tab_size);
     let mut wrapped_text = String::new();
-    let mut current_line = line_prefix.clone();
+    let mut current_line = first_line_prefix.clone();
+    let mut is_first_line = true;
 
     let tokenizer = WordBreakingTokenizer::new(&unwrapped_text);
-    let mut current_line_len = line_prefix_len;
+    let mut current_line_len = first_line_prefix_len;
     let mut in_whitespace = false;
     for token in tokenizer {
         let have_preceding_whitespace = in_whitespace;
@@ -21221,13 +21273,19 @@ fn wrap_with_prefix(
                 grapheme_len,
             } => {
                 in_whitespace = false;
+                let current_prefix_len = if is_first_line {
+                    first_line_prefix_len
+                } else {
+                    subsequent_lines_prefix_len
+                };
                 if current_line_len + grapheme_len > wrap_column
-                    && current_line_len != line_prefix_len
+                    && current_line_len != current_prefix_len
                 {
                     wrapped_text.push_str(current_line.trim_end());
                     wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
+                    is_first_line = false;
+                    current_line = subsequent_lines_prefix.clone();
+                    current_line_len = subsequent_lines_prefix_len;
                 }
                 current_line.push_str(token);
                 current_line_len += grapheme_len;
@@ -21244,32 +21302,46 @@ fn wrap_with_prefix(
                     token = " ";
                     grapheme_len = 1;
                 }
+                let current_prefix_len = if is_first_line {
+                    first_line_prefix_len
+                } else {
+                    subsequent_lines_prefix_len
+                };
                 if current_line_len + grapheme_len > wrap_column {
                     wrapped_text.push_str(current_line.trim_end());
                     wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                } else if current_line_len != line_prefix_len || preserve_existing_whitespace {
+                    is_first_line = false;
+                    current_line = subsequent_lines_prefix.clone();
+                    current_line_len = subsequent_lines_prefix_len;
+                } else if current_line_len != current_prefix_len || preserve_existing_whitespace {
                     current_line.push_str(token);
                     current_line_len += grapheme_len;
                 }
             }
             WordBreakToken::Newline => {
                 in_whitespace = true;
+                let current_prefix_len = if is_first_line {
+                    first_line_prefix_len
+                } else {
+                    subsequent_lines_prefix_len
+                };
                 if preserve_existing_whitespace {
                     wrapped_text.push_str(current_line.trim_end());
                     wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
+                    is_first_line = false;
+                    current_line = subsequent_lines_prefix.clone();
+                    current_line_len = subsequent_lines_prefix_len;
                 } else if have_preceding_whitespace {
                     continue;
-                } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len
+                } else if current_line_len + 1 > wrap_column
+                    && current_line_len != current_prefix_len
                 {
                     wrapped_text.push_str(current_line.trim_end());
                     wrapped_text.push('\n');
-                    current_line.truncate(line_prefix.len());
-                    current_line_len = line_prefix_len;
-                } else if current_line_len != line_prefix_len {
+                    is_first_line = false;
+                    current_line = subsequent_lines_prefix.clone();
+                    current_line_len = subsequent_lines_prefix_len;
+                } else if current_line_len != current_prefix_len {
                     current_line.push(' ');
                     current_line_len += 1;
                 }
@@ -21287,6 +21359,7 @@ fn wrap_with_prefix(
 fn test_wrap_with_prefix() {
     assert_eq!(
         wrap_with_prefix(
+            "# ".to_string(),
             "# ".to_string(),
             "abcdefg".to_string(),
             4,
@@ -21297,6 +21370,7 @@ fn test_wrap_with_prefix() {
     );
     assert_eq!(
         wrap_with_prefix(
+            "".to_string(),
             "".to_string(),
             "\thello world".to_string(),
             8,
@@ -21307,6 +21381,7 @@ fn test_wrap_with_prefix() {
     );
     assert_eq!(
         wrap_with_prefix(
+            "// ".to_string(),
             "// ".to_string(),
             "xx \nyy zz aa bb cc".to_string(),
             12,
@@ -21317,6 +21392,7 @@ fn test_wrap_with_prefix() {
     );
     assert_eq!(
         wrap_with_prefix(
+            String::new(),
             String::new(),
             "这是什么 \n 钢笔".to_string(),
             3,

crates/editor/src/editor_tests.rs 🔗

@@ -5210,6 +5210,10 @@ async fn test_rewrap(cx: &mut TestAppContext) {
     let markdown_language = Arc::new(Language::new(
         LanguageConfig {
             name: "Markdown".into(),
+            rewrap_prefixes: vec![
+                regex::Regex::new("\\d+\\.\\s+").unwrap(),
+                regex::Regex::new("[-*+]\\s+").unwrap(),
+            ],
             ..LanguageConfig::default()
         },
         None,
@@ -5372,7 +5376,82 @@ async fn test_rewrap(cx: &mut TestAppContext) {
             A long long long line of markdown text
             to wrap.ˇ
          "},
-        markdown_language,
+        markdown_language.clone(),
+        &mut cx,
+    );
+
+    // Test that rewrap boundaries work and preserve relative indent for Markdown documents
+    assert_rewrap(
+        indoc! {"
+            «1. This is a numbered list item that is very long and needs to be wrapped properly.
+            2. This is a numbered list item that is very long and needs to be wrapped properly.
+            - This is an unordered list item that is also very long and should not merge with the numbered item.ˇ»
+        "},
+        indoc! {"
+            «1. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            2. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            - This is an unordered list item that is
+              also very long and should not merge
+              with the numbered item.ˇ»
+        "},
+        markdown_language.clone(),
+        &mut cx,
+    );
+
+    // Test that rewrapping adds indentation to continuation lines when it is not already present.
+    assert_rewrap(
+        indoc! {"
+            «1. This is a numbered list item that is
+            very long and needs to be wrapped
+            properly.
+            2. This is a numbered list item that is
+            very long and needs to be wrapped
+            properly.
+            - This is an unordered list item that is
+            also very long and should not merge with
+            the numbered item.ˇ»
+        "},
+        indoc! {"
+            «1. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            2. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            - This is an unordered list item that is
+              also very long and should not merge
+              with the numbered item.ˇ»
+        "},
+        markdown_language.clone(),
+        &mut cx,
+    );
+
+    // Test that rewrapping maintains indentation when it already exists.
+    assert_rewrap(
+        indoc! {"
+            «1. This is a numbered list
+               item that is very long and needs to be wrapped properly.
+            2. This is a numbered list
+               item that is very long and needs to be wrapped properly.
+            - This is an unordered list item that is also very long and
+              should not merge with the numbered item.ˇ»
+        "},
+        indoc! {"
+            «1. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            2. This is a numbered list item that is
+               very long and needs to be wrapped
+               properly.
+            - This is an unordered list item that is
+              also very long and should not merge
+              with the numbered item.ˇ»
+        "},
+        markdown_language.clone(),
         &mut cx,
     );

crates/language/src/language.rs 🔗

@@ -39,7 +39,7 @@ use lsp::{CodeActionKind, InitializeParams, LanguageServerBinary, LanguageServer
 pub use manifest::{ManifestDelegate, ManifestName, ManifestProvider, ManifestQuery};
 use parking_lot::Mutex;
 use regex::Regex;
-use schemars::{JsonSchema, json_schema};
+use schemars::{JsonSchema, SchemaGenerator, json_schema};
 use serde::{Deserialize, Deserializer, Serialize, Serializer, de};
 use serde_json::Value;
 use settings::WorktreeId;
@@ -730,6 +730,13 @@ pub struct LanguageConfig {
     /// Starting and closing characters of a block comment.
     #[serde(default)]
     pub block_comment: Option<(Arc<str>, Arc<str>)>,
+    /// A list of additional regex patterns that should be treated as prefixes
+    /// for creating boundaries during rewrapping, ensuring content from one
+    /// prefixed section doesn't merge with another (e.g., markdown list items).
+    /// By default, Zed treats paragraph breaks and comment prefixes as boundaries.
+    #[serde(default, deserialize_with = "deserialize_regex_vec")]
+    #[schemars(schema_with = "regex_vec_json_schema")]
+    pub rewrap_prefixes: Vec<Regex>,
     /// A list of language servers that are allowed to run on subranges of a given language.
     #[serde(default)]
     pub scope_opt_in_language_servers: Vec<LanguageServerName>,
@@ -909,6 +916,7 @@ impl Default for LanguageConfig {
             autoclose_before: Default::default(),
             line_comments: Default::default(),
             block_comment: Default::default(),
+            rewrap_prefixes: Default::default(),
             scope_opt_in_language_servers: Default::default(),
             overrides: Default::default(),
             word_characters: Default::default(),
@@ -955,6 +963,22 @@ where
     }
 }
 
+fn deserialize_regex_vec<'de, D: Deserializer<'de>>(d: D) -> Result<Vec<Regex>, D::Error> {
+    let sources = Vec::<String>::deserialize(d)?;
+    let mut regexes = Vec::new();
+    for source in sources {
+        regexes.push(regex::Regex::new(&source).map_err(de::Error::custom)?);
+    }
+    Ok(regexes)
+}
+
+fn regex_vec_json_schema(_: &mut SchemaGenerator) -> schemars::Schema {
+    json_schema!({
+        "type": "array",
+        "items": { "type": "string" }
+    })
+}
+
 #[doc(hidden)]
 #[cfg(any(test, feature = "test-support"))]
 pub struct FakeLspAdapter {
@@ -1831,6 +1855,14 @@ impl LanguageScope {
         .map(|e| (&e.0, &e.1))
     }
 
+    /// Returns additional regex patterns that act as prefix markers for creating
+    /// boundaries during rewrapping.
+    ///
+    /// By default, Zed treats paragraph breaks and comment prefixes as boundaries.
+    pub fn rewrap_prefixes(&self) -> &[Regex] {
+        &self.language.config.rewrap_prefixes
+    }
+
     /// Returns a list of language-specific word characters.
     ///
     /// By default, Zed treats alphanumeric characters (and '_') as word characters for

crates/languages/src/markdown/config.toml 🔗

@@ -13,6 +13,12 @@ brackets = [
     { start = "'", end = "'", close = false, newline = false },
     { start = "`", end = "`", close = false, newline = false },
 ]
+rewrap_prefixes = [
+    "[-*+]\\s+",
+    "\\d+\\.\\s+",
+    ">\\s*",
+    "[-*+]\\s+\\[[\\sx]\\]\\s+"
+]
 
 auto_indent_on_paste = false
 auto_indent_using_last_non_empty_line = false