From f2601ce52ce82eb201799ae6c4f1f92f42ccf7c8 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 26 May 2025 09:11:56 -0400 Subject: [PATCH] Fix text wrapping in commit message editors (#31030) Don't hard wrap interactively; instead, soft wrap in `Bounded` mode (editor width or 72 chars, whichever is smaller), and then hard wrap before sending the commit message to git. This also makes the soft wrap mode and width for commit messages configurable in language settings. Previously we didn't support soft wrap modes other than `EditorWidth` in auto-height editors; I tried to add support for this by analogy with code that was already there, and it seems to work pretty well. Closes #27508 Release Notes: - Fixed confusing wrapping behavior in commit message editors. --- Cargo.lock | 3 +- assets/settings/default.json | 4 +- crates/editor/Cargo.toml | 1 - crates/editor/src/editor.rs | 343 +---------------------------- crates/editor/src/element.rs | 22 +- crates/git_ui/src/git_panel.rs | 18 +- crates/language/src/language.rs | 6 +- crates/util/Cargo.toml | 2 + crates/util/src/util.rs | 373 ++++++++++++++++++++++++++++++-- 9 files changed, 396 insertions(+), 376 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 310c9d1fdd4b65f4f11d8024cadec574ad8ed98b..17518e119ad1412779583d245be7f38c02e37570 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4729,7 +4729,6 @@ dependencies = [ "tree-sitter-rust", "tree-sitter-typescript", "ui", - "unicode-script", "unicode-segmentation", "unindent", "url", @@ -17106,6 +17105,8 @@ dependencies = [ "tempfile", "tendril", "unicase", + "unicode-script", + "unicode-segmentation", "util_macros", "walkdir", "workspace-hack", diff --git a/assets/settings/default.json b/assets/settings/default.json index e9032e9c19b456a26d79a25878a13e0a2dc934d5..22cc6a753e3a07927001c32bd1a6cc27a6f546e7 100644 --- a/assets/settings/default.json +++ b/assets/settings/default.json @@ -1431,7 +1431,9 @@ "language_servers": ["erlang-ls", "!elp", "..."] }, "Git Commit": 
{ - "allow_rewrap": "anywhere" + "allow_rewrap": "anywhere", + "preferred_line_length": 72, + "soft_wrap": "bounded" }, "Go": { "code_actions_on_format": { diff --git a/crates/editor/Cargo.toml b/crates/editor/Cargo.toml index cd1db877e700b92b231e231d5c010b66189dcdee..ccaeee9dd6269e4fab6c7a1a7c4c68ae1cab09b8 100644 --- a/crates/editor/Cargo.toml +++ b/crates/editor/Cargo.toml @@ -82,7 +82,6 @@ tree-sitter-rust = { workspace = true, optional = true } tree-sitter-typescript = { workspace = true, optional = true } tree-sitter-python = { workspace = true, optional = true } unicode-segmentation.workspace = true -unicode-script.workspace = true unindent = { workspace = true, optional = true } ui.workspace = true url.workspace = true diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 355bcb0bd62747a28a34a290b102adeca189017b..366df3b97d2fe619386a99eb879406443a5beca1 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -201,7 +201,7 @@ use ui::{ ButtonSize, ButtonStyle, ContextMenu, Disclosure, IconButton, IconButtonShape, IconName, IconSize, Indicator, Key, Tooltip, h_flex, prelude::*, }; -use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc}; +use util::{RangeExt, ResultExt, TryFutureExt, maybe, post_inc, wrap_with_prefix}; use workspace::{ CollaboratorId, Item as WorkspaceItem, ItemId, ItemNavHistory, OpenInTerminal, OpenTerminal, RestoreOnStartupBehavior, SERIALIZATION_THROTTLE_TIME, SplitDirection, TabBarSettings, Toast, @@ -19440,347 +19440,6 @@ fn update_uncommitted_diff_for_buffer( }) } -fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize { - let tab_size = tab_size.get() as usize; - let mut width = offset; - - for ch in text.chars() { - width += if ch == '\t' { - tab_size - (width % tab_size) - } else { - 1 - }; - } - - width - offset -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_string_size_with_expanded_tabs() { - let nz = |val| 
NonZeroU32::new(val).unwrap(); - assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0); - assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5); - assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9); - assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6); - assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8); - assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16); - assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8); - assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9); - } -} - -/// Tokenizes a string into runs of text that should stick together, or that is whitespace. -struct WordBreakingTokenizer<'a> { - input: &'a str, -} - -impl<'a> WordBreakingTokenizer<'a> { - fn new(input: &'a str) -> Self { - Self { input } - } -} - -fn is_char_ideographic(ch: char) -> bool { - use unicode_script::Script::*; - use unicode_script::UnicodeScript; - matches!(ch.script(), Han | Tangut | Yi) -} - -fn is_grapheme_ideographic(text: &str) -> bool { - text.chars().any(is_char_ideographic) -} - -fn is_grapheme_whitespace(text: &str) -> bool { - text.chars().any(|x| x.is_whitespace()) -} - -fn should_stay_with_preceding_ideograph(text: &str) -> bool { - text.chars().next().map_or(false, |ch| { - matches!(ch, '。' | '、' | ',' | '?' | '!' | ':' | ';' | '…') - }) -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy)] -enum WordBreakToken<'a> { - Word { token: &'a str, grapheme_len: usize }, - InlineWhitespace { token: &'a str, grapheme_len: usize }, - Newline, -} - -impl<'a> Iterator for WordBreakingTokenizer<'a> { - /// Yields a span, the count of graphemes in the token, and whether it was - /// whitespace. Note that it also breaks at word boundaries. 
- type Item = WordBreakToken<'a>; - - fn next(&mut self) -> Option<Self::Item> { - use unicode_segmentation::UnicodeSegmentation; - if self.input.is_empty() { - return None; - } - - let mut iter = self.input.graphemes(true).peekable(); - let mut offset = 0; - let mut grapheme_len = 0; - if let Some(first_grapheme) = iter.next() { - let is_newline = first_grapheme == "\n"; - let is_whitespace = is_grapheme_whitespace(first_grapheme); - offset += first_grapheme.len(); - grapheme_len += 1; - if is_grapheme_ideographic(first_grapheme) && !is_whitespace { - if let Some(grapheme) = iter.peek().copied() { - if should_stay_with_preceding_ideograph(grapheme) { - offset += grapheme.len(); - grapheme_len += 1; - } - } - } else { - let mut words = self.input[offset..].split_word_bound_indices().peekable(); - let mut next_word_bound = words.peek().copied(); - if next_word_bound.map_or(false, |(i, _)| i == 0) { - next_word_bound = words.next(); - } - while let Some(grapheme) = iter.peek().copied() { - if next_word_bound.map_or(false, |(i, _)| i == offset) { - break; - }; - if is_grapheme_whitespace(grapheme) != is_whitespace - || (grapheme == "\n") != is_newline - { - break; - }; - offset += grapheme.len(); - grapheme_len += 1; - iter.next(); - } - } - let token = &self.input[..offset]; - self.input = &self.input[offset..]; - if token == "\n" { - Some(WordBreakToken::Newline) - } else if is_whitespace { - Some(WordBreakToken::InlineWhitespace { - token, - grapheme_len, - }) - } else { - Some(WordBreakToken::Word { - token, - grapheme_len, - }) - } - } else { - None - } - } -} - -#[test] -fn test_word_breaking_tokenizer() { - let tests: &[(&str, &[WordBreakToken<'static>])] = &[ - ("", &[]), - (" ", &[whitespace(" ", 2)]), - ("Ʒ", &[word("Ʒ", 1)]), - ("Ǽ", &[word("Ǽ", 1)]), - ("⋑", &[word("⋑", 1)]), - ("⋑⋑", &[word("⋑⋑", 2)]), - ( - "原理,进而", - &[word("原", 1), word("理,", 2), word("进", 1), word("而", 1)], - ), - ( - "hello world", - &[word("hello", 5), whitespace(" ", 1), word("world", 5)], -
), - ( - "hello, world", - &[word("hello,", 6), whitespace(" ", 1), word("world", 5)], - ), - ( - " hello world", - &[ - whitespace(" ", 2), - word("hello", 5), - whitespace(" ", 1), - word("world", 5), - ], - ), - ( - "这是什么 \n 钢笔", - &[ - word("这", 1), - word("是", 1), - word("什", 1), - word("么", 1), - whitespace(" ", 1), - newline(), - whitespace(" ", 1), - word("钢", 1), - word("笔", 1), - ], - ), - (" mutton", &[whitespace(" ", 1), word("mutton", 6)]), - ]; - - fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> { - WordBreakToken::Word { - token, - grapheme_len, - } - } - - fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> { - WordBreakToken::InlineWhitespace { - token, - grapheme_len, - } - } - - fn newline() -> WordBreakToken<'static> { - WordBreakToken::Newline - } - - for (input, result) in tests { - assert_eq!( - WordBreakingTokenizer::new(input) - .collect::>() - .as_slice(), - *result, - ); - } -} - -fn wrap_with_prefix( - line_prefix: String, - unwrapped_text: String, - wrap_column: usize, - tab_size: NonZeroU32, - preserve_existing_whitespace: bool, -) -> String { - let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size); - let mut wrapped_text = String::new(); - let mut current_line = line_prefix.clone(); - - let tokenizer = WordBreakingTokenizer::new(&unwrapped_text); - let mut current_line_len = line_prefix_len; - let mut in_whitespace = false; - for token in tokenizer { - let have_preceding_whitespace = in_whitespace; - match token { - WordBreakToken::Word { - token, - grapheme_len, - } => { - in_whitespace = false; - if current_line_len + grapheme_len > wrap_column - && current_line_len != line_prefix_len - { - wrapped_text.push_str(current_line.trim_end()); - wrapped_text.push('\n'); - current_line.truncate(line_prefix.len()); - current_line_len = line_prefix_len; - } - current_line.push_str(token); - current_line_len += grapheme_len; - } - 
WordBreakToken::InlineWhitespace { - mut token, - mut grapheme_len, - } => { - in_whitespace = true; - if have_preceding_whitespace && !preserve_existing_whitespace { - continue; - } - if !preserve_existing_whitespace { - token = " "; - grapheme_len = 1; - } - if current_line_len + grapheme_len > wrap_column { - wrapped_text.push_str(current_line.trim_end()); - wrapped_text.push('\n'); - current_line.truncate(line_prefix.len()); - current_line_len = line_prefix_len; - } else if current_line_len != line_prefix_len || preserve_existing_whitespace { - current_line.push_str(token); - current_line_len += grapheme_len; - } - } - WordBreakToken::Newline => { - in_whitespace = true; - if preserve_existing_whitespace { - wrapped_text.push_str(current_line.trim_end()); - wrapped_text.push('\n'); - current_line.truncate(line_prefix.len()); - current_line_len = line_prefix_len; - } else if have_preceding_whitespace { - continue; - } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len - { - wrapped_text.push_str(current_line.trim_end()); - wrapped_text.push('\n'); - current_line.truncate(line_prefix.len()); - current_line_len = line_prefix_len; - } else if current_line_len != line_prefix_len { - current_line.push(' '); - current_line_len += 1; - } - } - } - } - - if !current_line.is_empty() { - wrapped_text.push_str(&current_line); - } - wrapped_text -} - -#[test] -fn test_wrap_with_prefix() { - assert_eq!( - wrap_with_prefix( - "# ".to_string(), - "abcdefg".to_string(), - 4, - NonZeroU32::new(4).unwrap(), - false, - ), - "# abcdefg" - ); - assert_eq!( - wrap_with_prefix( - "".to_string(), - "\thello world".to_string(), - 8, - NonZeroU32::new(4).unwrap(), - false, - ), - "hello\nworld" - ); - assert_eq!( - wrap_with_prefix( - "// ".to_string(), - "xx \nyy zz aa bb cc".to_string(), - 12, - NonZeroU32::new(4).unwrap(), - false, - ), - "// xx yy zz\n// aa bb cc" - ); - assert_eq!( - wrap_with_prefix( - String::new(), - "这是什么 \n 钢笔".to_string(), - 3, -
NonZeroU32::new(4).unwrap(), - false, - ), - "这是什\n么 钢\n笔" - ); -} - pub trait CollaborationHub { fn collaborators<'a>(&self, cx: &'a App) -> &'a HashMap; fn user_participant_indices<'a>(&self, cx: &'a App) -> &'a HashMap; diff --git a/crates/editor/src/element.rs b/crates/editor/src/element.rs index 368b79dbc75cda7e5ff7c9f2f3b52142f40020ac..be29ff624c4eaaab3b39483b2e7ea3c0e8ba3290 100644 --- a/crates/editor/src/element.rs +++ b/crates/editor/src/element.rs @@ -7396,10 +7396,7 @@ impl Element for EditorElement { editor.gutter_dimensions = gutter_dimensions; editor.set_visible_line_count(bounds.size.height / line_height, window, cx); - if matches!( - editor.mode, - EditorMode::AutoHeight { .. } | EditorMode::Minimap { .. } - ) { + if matches!(editor.mode, EditorMode::Minimap { .. }) { snapshot } else { let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil(); @@ -9390,6 +9387,7 @@ fn compute_auto_height_layout( let font_size = style.text.font_size.to_pixels(window.rem_size()); let line_height = style.text.line_height_in_pixels(window.rem_size()); let em_width = window.text_system().em_width(font_id, font_size).unwrap(); + let em_advance = window.text_system().em_advance(font_id, font_size).unwrap(); let mut snapshot = editor.snapshot(window, cx); let gutter_dimensions = snapshot @@ -9406,10 +9404,18 @@ fn compute_auto_height_layout( let overscroll = size(em_width, px(0.)); let editor_width = text_width - gutter_dimensions.margin - overscroll.width - em_width; - if !matches!(editor.soft_wrap_mode(cx), SoftWrap::None) { - if editor.set_wrap_width(Some(editor_width), cx) { - snapshot = editor.snapshot(window, cx); - } + let content_offset = point(gutter_dimensions.margin, Pixels::ZERO); + let editor_content_width = editor_width - content_offset.x; + let wrap_width_for = |column: u32| (column as f32 * em_advance).ceil(); + let wrap_width = match editor.soft_wrap_mode(cx) { + SoftWrap::GitDiff => None, + SoftWrap::None => Some(wrap_width_for(MAX_LINE_LEN as 
u32 / 2)), + SoftWrap::EditorWidth => Some(editor_content_width), + SoftWrap::Column(column) => Some(wrap_width_for(column)), + SoftWrap::Bounded(column) => Some(editor_content_width.min(wrap_width_for(column))), + }; + if editor.set_wrap_width(wrap_width, cx) { + snapshot = editor.snapshot(window, cx); } let scroll_height = (snapshot.max_point().row().next_row().0 as f32) * line_height; diff --git a/crates/git_ui/src/git_panel.rs b/crates/git_ui/src/git_panel.rs index 4946bd0ecd02bfb6a6b6a43ff41962b36325db70..dad4c9647827bda17bfde772e32a39a310ea09ec 100644 --- a/crates/git_ui/src/git_panel.rs +++ b/crates/git_ui/src/git_panel.rs @@ -54,6 +54,7 @@ use project::{ use serde::{Deserialize, Serialize}; use settings::{Settings as _, SettingsStore}; use std::future::Future; +use std::num::NonZeroU32; use std::path::{Path, PathBuf}; use std::{collections::HashSet, sync::Arc, time::Duration, usize}; use strum::{IntoEnumIterator, VariantNames}; @@ -62,7 +63,7 @@ use ui::{ Checkbox, ContextMenu, ElevationIndex, PopoverMenu, Scrollbar, ScrollbarState, SplitButton, Tooltip, prelude::*, }; -use util::{ResultExt, TryFutureExt, maybe}; +use util::{ResultExt, TryFutureExt, maybe, wrap_with_prefix}; use workspace::AppState; use notifications::status_toast::{StatusToast, ToastIcon}; @@ -382,7 +383,6 @@ pub(crate) fn commit_message_editor( commit_editor.set_show_gutter(false, cx); commit_editor.set_show_wrap_guides(false, cx); commit_editor.set_show_indent_guides(false, cx); - commit_editor.set_hard_wrap(Some(72), cx); let placeholder = placeholder.unwrap_or("Enter commit message".into()); commit_editor.set_placeholder_text(placeholder, cx); commit_editor @@ -1484,8 +1484,22 @@ impl GitPanel { fn custom_or_suggested_commit_message(&self, cx: &mut Context) -> Option { let message = self.commit_editor.read(cx).text(cx); + let width = self + .commit_editor + .read(cx) + .buffer() + .read(cx) + .language_settings(cx) + .preferred_line_length as usize; if !message.trim().is_empty() { + 
let message = wrap_with_prefix( + String::new(), + message, + width, + NonZeroU32::new(8).unwrap(), // tab size doesn't matter when prefix is empty + false, + ); return Some(message); } diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index 77884634fc78bf7f6f309227d0ce1f55451d403b..3da6101f4199e51c291259914222b0f87e7cfd38 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -666,7 +666,7 @@ pub struct CodeLabel { pub filter_range: Range, } -#[derive(Clone, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Deserialize, JsonSchema)] pub struct LanguageConfig { /// Human-readable name of the language. pub name: LanguageName, @@ -777,7 +777,7 @@ pub struct LanguageMatcher { } /// The configuration for JSX tag auto-closing. -#[derive(Clone, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Deserialize, JsonSchema)] pub struct JsxTagAutoCloseConfig { /// The name of the node for a opening tag pub open_tag_node_name: String, @@ -810,7 +810,7 @@ pub struct JsxTagAutoCloseConfig { } /// The configuration for documentation block for this language. -#[derive(Clone, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Deserialize, JsonSchema)] pub struct DocumentationConfig { /// A start tag of documentation block. 
pub start: Arc, diff --git a/crates/util/Cargo.toml b/crates/util/Cargo.toml index f6fc4b5164722f8051d846ce50605b73cd1ac8fa..3b5ffcb24cbe5625c7aa091afee45bdb4568d4c4 100644 --- a/crates/util/Cargo.toml +++ b/crates/util/Cargo.toml @@ -37,6 +37,8 @@ smol.workspace = true take-until.workspace = true tempfile.workspace = true unicase.workspace = true +unicode-script.workspace = true +unicode-segmentation.workspace = true util_macros = { workspace = true, optional = true } walkdir.workspace = true workspace-hack.workspace = true diff --git a/crates/util/src/util.rs b/crates/util/src/util.rs index d726b5aae8f35f41d59f505b151e748e8f1ccdd4..40f67cd62e164adada2649034260bc4c20c1cd8d 100644 --- a/crates/util/src/util.rs +++ b/crates/util/src/util.rs @@ -14,6 +14,7 @@ use anyhow::Result; use futures::Future; use itertools::Either; use regex::Regex; +use std::num::NonZeroU32; use std::sync::{LazyLock, OnceLock}; use std::{ borrow::Cow, @@ -183,29 +184,208 @@ pub fn truncate_lines_to_byte_limit(s: &str, max_bytes: usize) -> &str { truncate_to_byte_limit(s, max_bytes) } -#[test] -fn test_truncate_lines_to_byte_limit() { - let text = "Line 1\nLine 2\nLine 3\nLine 4"; +fn char_len_with_expanded_tabs(offset: usize, text: &str, tab_size: NonZeroU32) -> usize { + let tab_size = tab_size.get() as usize; + let mut width = offset; - // Limit that includes all lines - assert_eq!(truncate_lines_to_byte_limit(text, 100), text); + for ch in text.chars() { + width += if ch == '\t' { + tab_size - (width % tab_size) + } else { + 1 + }; + } - // Exactly the first line - assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n"); + width - offset +} - // Limit between lines - assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n"); - assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n"); +/// Tokenizes a string into runs of text that should stick together, or that is whitespace. 
+struct WordBreakingTokenizer<'a> { + input: &'a str, +} - // Limit before first newline - assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line "); +impl<'a> WordBreakingTokenizer<'a> { + fn new(input: &'a str) -> Self { + Self { input } + } +} - // Test with non-ASCII characters - let text_utf8 = "Line 1\nLíne 2\nLine 3"; - assert_eq!( - truncate_lines_to_byte_limit(text_utf8, 15), - "Line 1\nLíne 2\n" - ); +fn is_char_ideographic(ch: char) -> bool { + use unicode_script::Script::*; + use unicode_script::UnicodeScript; + matches!(ch.script(), Han | Tangut | Yi) +} + +fn is_grapheme_ideographic(text: &str) -> bool { + text.chars().any(is_char_ideographic) +} + +fn is_grapheme_whitespace(text: &str) -> bool { + text.chars().any(|x| x.is_whitespace()) +} + +fn should_stay_with_preceding_ideograph(text: &str) -> bool { + text.chars().next().map_or(false, |ch| { + matches!(ch, '。' | '、' | ',' | '?' | '!' | ':' | ';' | '…') + }) +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +enum WordBreakToken<'a> { + Word { token: &'a str, grapheme_len: usize }, + InlineWhitespace { token: &'a str, grapheme_len: usize }, + Newline, +} + +impl<'a> Iterator for WordBreakingTokenizer<'a> { + /// Yields a span, the count of graphemes in the token, and whether it was + /// whitespace. Note that it also breaks at word boundaries. 
+ type Item = WordBreakToken<'a>; + + fn next(&mut self) -> Option<Self::Item> { + use unicode_segmentation::UnicodeSegmentation; + if self.input.is_empty() { + return None; + } + + let mut iter = self.input.graphemes(true).peekable(); + let mut offset = 0; + let mut grapheme_len = 0; + if let Some(first_grapheme) = iter.next() { + let is_newline = first_grapheme == "\n"; + let is_whitespace = is_grapheme_whitespace(first_grapheme); + offset += first_grapheme.len(); + grapheme_len += 1; + if is_grapheme_ideographic(first_grapheme) && !is_whitespace { + if let Some(grapheme) = iter.peek().copied() { + if should_stay_with_preceding_ideograph(grapheme) { + offset += grapheme.len(); + grapheme_len += 1; + } + } + } else { + let mut words = self.input[offset..].split_word_bound_indices().peekable(); + let mut next_word_bound = words.peek().copied(); + if next_word_bound.map_or(false, |(i, _)| i == 0) { + next_word_bound = words.next(); + } + while let Some(grapheme) = iter.peek().copied() { + if next_word_bound.map_or(false, |(i, _)| i == offset) { + break; + }; + if is_grapheme_whitespace(grapheme) != is_whitespace + || (grapheme == "\n") != is_newline + { + break; + }; + offset += grapheme.len(); + grapheme_len += 1; + iter.next(); + } + } + let token = &self.input[..offset]; + self.input = &self.input[offset..]; + if token == "\n" { + Some(WordBreakToken::Newline) + } else if is_whitespace { + Some(WordBreakToken::InlineWhitespace { + token, + grapheme_len, + }) + } else { + Some(WordBreakToken::Word { + token, + grapheme_len, + }) + } + } else { + None + } + } +} + +pub fn wrap_with_prefix( + line_prefix: String, + unwrapped_text: String, + wrap_column: usize, + tab_size: NonZeroU32, + preserve_existing_whitespace: bool, +) -> String { + let line_prefix_len = char_len_with_expanded_tabs(0, &line_prefix, tab_size); + let mut wrapped_text = String::new(); + let mut current_line = line_prefix.clone(); + + let tokenizer = WordBreakingTokenizer::new(&unwrapped_text); + let mut
current_line_len = line_prefix_len; + let mut in_whitespace = false; + for token in tokenizer { + let have_preceding_whitespace = in_whitespace; + match token { + WordBreakToken::Word { + token, + grapheme_len, + } => { + in_whitespace = false; + if current_line_len + grapheme_len > wrap_column + && current_line_len != line_prefix_len + { + wrapped_text.push_str(current_line.trim_end()); + wrapped_text.push('\n'); + current_line.truncate(line_prefix.len()); + current_line_len = line_prefix_len; + } + current_line.push_str(token); + current_line_len += grapheme_len; + } + WordBreakToken::InlineWhitespace { + mut token, + mut grapheme_len, + } => { + in_whitespace = true; + if have_preceding_whitespace && !preserve_existing_whitespace { + continue; + } + if !preserve_existing_whitespace { + token = " "; + grapheme_len = 1; + } + if current_line_len + grapheme_len > wrap_column { + wrapped_text.push_str(current_line.trim_end()); + wrapped_text.push('\n'); + current_line.truncate(line_prefix.len()); + current_line_len = line_prefix_len; + } else if current_line_len != line_prefix_len || preserve_existing_whitespace { + current_line.push_str(token); + current_line_len += grapheme_len; + } + } + WordBreakToken::Newline => { + in_whitespace = true; + if preserve_existing_whitespace { + wrapped_text.push_str(current_line.trim_end()); + wrapped_text.push('\n'); + current_line.truncate(line_prefix.len()); + current_line_len = line_prefix_len; + } else if have_preceding_whitespace { + continue; + } else if current_line_len + 1 > wrap_column && current_line_len != line_prefix_len + { + wrapped_text.push_str(current_line.trim_end()); + wrapped_text.push('\n'); + current_line.truncate(line_prefix.len()); + current_line_len = line_prefix_len; + } else if current_line_len != line_prefix_len { + current_line.push(' '); + current_line_len += 1; + } + } + } + } + + if !current_line.is_empty() { + wrapped_text.push_str(&current_line); + } + wrapped_text } pub fn post_inc + AddAssign +
Copy>(value: &mut T) -> T { @@ -1302,4 +1482,161 @@ Line 3"# (0..8).collect::>() ); } + + #[test] + fn test_truncate_lines_to_byte_limit() { + let text = "Line 1\nLine 2\nLine 3\nLine 4"; + + // Limit that includes all lines + assert_eq!(truncate_lines_to_byte_limit(text, 100), text); + + // Exactly the first line + assert_eq!(truncate_lines_to_byte_limit(text, 7), "Line 1\n"); + + // Limit between lines + assert_eq!(truncate_lines_to_byte_limit(text, 13), "Line 1\n"); + assert_eq!(truncate_lines_to_byte_limit(text, 20), "Line 1\nLine 2\n"); + + // Limit before first newline + assert_eq!(truncate_lines_to_byte_limit(text, 6), "Line "); + + // Test with non-ASCII characters + let text_utf8 = "Line 1\nLíne 2\nLine 3"; + assert_eq!( + truncate_lines_to_byte_limit(text_utf8, 15), + "Line 1\nLíne 2\n" + ); + } + + #[test] + fn test_string_size_with_expanded_tabs() { + let nz = |val| NonZeroU32::new(val).unwrap(); + assert_eq!(char_len_with_expanded_tabs(0, "", nz(4)), 0); + assert_eq!(char_len_with_expanded_tabs(0, "hello", nz(4)), 5); + assert_eq!(char_len_with_expanded_tabs(0, "\thello", nz(4)), 9); + assert_eq!(char_len_with_expanded_tabs(0, "abc\tab", nz(4)), 6); + assert_eq!(char_len_with_expanded_tabs(0, "hello\t", nz(4)), 8); + assert_eq!(char_len_with_expanded_tabs(0, "\t\t", nz(8)), 16); + assert_eq!(char_len_with_expanded_tabs(0, "x\t", nz(8)), 8); + assert_eq!(char_len_with_expanded_tabs(7, "x\t", nz(8)), 9); + } + + #[test] + fn test_word_breaking_tokenizer() { + let tests: &[(&str, &[WordBreakToken<'static>])] = &[ + ("", &[]), + (" ", &[whitespace(" ", 2)]), + ("Ʒ", &[word("Ʒ", 1)]), + ("Ǽ", &[word("Ǽ", 1)]), + ("⋑", &[word("⋑", 1)]), + ("⋑⋑", &[word("⋑⋑", 2)]), + ( + "原理,进而", + &[word("原", 1), word("理,", 2), word("进", 1), word("而", 1)], + ), + ( + "hello world", + &[word("hello", 5), whitespace(" ", 1), word("world", 5)], + ), + ( + "hello, world", + &[word("hello,", 6), whitespace(" ", 1), word("world", 5)], + ), + ( + " hello world", + &[ + whitespace(" 
", 2), + word("hello", 5), + whitespace(" ", 1), + word("world", 5), + ], + ), + ( + "这是什么 \n 钢笔", + &[ + word("这", 1), + word("是", 1), + word("什", 1), + word("么", 1), + whitespace(" ", 1), + newline(), + whitespace(" ", 1), + word("钢", 1), + word("笔", 1), + ], + ), + (" mutton", &[whitespace(" ", 1), word("mutton", 6)]), + ]; + + fn word(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> { + WordBreakToken::Word { + token, + grapheme_len, + } + } + + fn whitespace(token: &'static str, grapheme_len: usize) -> WordBreakToken<'static> { + WordBreakToken::InlineWhitespace { + token, + grapheme_len, + } + } + + fn newline() -> WordBreakToken<'static> { + WordBreakToken::Newline + } + + for (input, result) in tests { + assert_eq!( + WordBreakingTokenizer::new(input) + .collect::>() + .as_slice(), + *result, + ); + } + } + + #[test] + fn test_wrap_with_prefix() { + assert_eq!( + wrap_with_prefix( + "# ".to_string(), + "abcdefg".to_string(), + 4, + NonZeroU32::new(4).unwrap(), + false, + ), + "# abcdefg" + ); + assert_eq!( + wrap_with_prefix( + "".to_string(), + "\thello world".to_string(), + 8, + NonZeroU32::new(4).unwrap(), + false, + ), + "hello\nworld" + ); + assert_eq!( + wrap_with_prefix( + "// ".to_string(), + "xx \nyy zz aa bb cc".to_string(), + 12, + NonZeroU32::new(4).unwrap(), + false, + ), + "// xx yy zz\n// aa bb cc" + ); + assert_eq!( + wrap_with_prefix( + String::new(), + "这是什么 \n 钢笔".to_string(), + 3, + NonZeroU32::new(4).unwrap(), + false, + ), + "这是什\n么 钢\n笔" + ); + } }