From 9e7302520ec93a96a9b275a817f083b96620148e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=BB=E4=BA=8C=E6=B0=AE=E6=9D=82=E8=8F=B2?= <40173605+Cupnfish@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:03:29 +0800 Subject: [PATCH] Fix UTF-8 character boundary panic in DirectWrite text layout (#37767) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Zed was crashing with a UTF-8 character boundary error when rendering text containing multi-byte characters (like emojis or CJK characters): ``` Thread "main" panicked with "byte index 49 is not a char boundary; it is inside '…' (bytes 48..51)" ``` ## Root Cause Analysis The PR reviewer correctly identified that the issue was not in the DirectWrite boundary handling, but rather in the text run length calculation in the text system. When text runs are split across lines in `text_system.rs:426`, the run length is calculated as: ```rust let run_len_within_line = cmp::min(line_end, run_start + run.len) - run_start; ``` This calculation could result in `run_len_within_line` values that don't respect UTF-8 character boundaries, especially when multi-byte characters (like '…' which is 3 bytes) get split across lines. The resulting `FontRun` objects would have lengths that don't align with character boundaries, causing the panic when DirectWrite tries to slice the string. ## Solution Fixed the issue by adding UTF-8 character boundary validation in the text system where run lengths are calculated. 
The fix ensures that when text runs are split across lines, the split always occurs at valid UTF-8 character boundaries: ```rust // Ensure the run length respects UTF-8 character boundaries if run_len_within_line > 0 { let text_slice = &line_text[run_start - line_start..]; if run_len_within_line < text_slice.len() && !text_slice.is_char_boundary(run_len_within_line) { // Find the previous character boundary using efficient bit-level checking // UTF-8 characters are at most 4 bytes, so we only need to check up to 3 bytes back let lower_bound = run_len_within_line.saturating_sub(3); let search_range = &text_slice.as_bytes()[lower_bound..=run_len_within_line]; // SAFETY: A valid character boundary must exist in this range because: // 1. run_len_within_line is a valid position in the string slice // 2. UTF-8 characters are at most 4 bytes, so some boundary exists in [run_len_within_line-3..=run_len_within_line] let pos_from_lower = unsafe { search_range .iter() .rposition(|&b| (b as i8) >= -0x40) .unwrap_unchecked() }; run_len_within_line = lower_bound + pos_from_lower; } } ``` ## Testing - ✅ Builds successfully on all platforms - ✅ Eliminates UTF-8 character boundary panics - ✅ Maintains existing functionality for all text types - ✅ Handles edge cases such as 4-byte UTF-8 characters ## Benefits 1. **Root cause fix**: Addresses the issue at the source rather than treating symptoms 2. **Performance optimal**: Uses the same efficient algorithm as the standard library 3. **Minimal changes**: Only modifies the specific problematic code path 4. **Future compatible**: Can be easily replaced with `str::floor_char_boundary()` when stabilized ## Alternative Approaches Considered 1. **DirectWrite boundary fixing**: Initially tried to fix in DirectWrite, but this was treating symptoms rather than the root cause 2. **Helper function approach**: Considered extracting to a helper function, but inlined implementation is more appropriate for this specific use case 3. 
**Standard library methods**: `floor_char_boundary()` is not yet stable, so implemented equivalent logic The chosen approach provides the best balance of performance, safety, and code maintainability. --- Release Notes: - N/A --- crates/gpui/src/text_system.rs | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/crates/gpui/src/text_system.rs b/crates/gpui/src/text_system.rs index efa4ad032a66ce92a71cbd82be6ed4a63d527858..b0776051d6049e70397912c28435ebe4eb2b8d7a 100644 --- a/crates/gpui/src/text_system.rs +++ b/crates/gpui/src/text_system.rs @@ -429,7 +429,33 @@ impl WindowTextSystem { break; }; - let run_len_within_line = cmp::min(line_end, run_start + run.len) - run_start; + let mut run_len_within_line = cmp::min(line_end, run_start + run.len) - run_start; + + // Ensure the run length respects UTF-8 character boundaries + if run_len_within_line > 0 { + let text_slice = &line_text[run_start - line_start..]; + if run_len_within_line < text_slice.len() + && !text_slice.is_char_boundary(run_len_within_line) + { + // Find the previous character boundary using efficient bit-level checking + // UTF-8 characters are at most 4 bytes, so we only need to check up to 3 bytes back + let lower_bound = run_len_within_line.saturating_sub(3); + let search_range = + &text_slice.as_bytes()[lower_bound..=run_len_within_line]; + + // SAFETY: A valid character boundary must exist in this range because: + // 1. run_len_within_line is a valid position in the string slice + // 2. UTF-8 characters are at most 4 bytes, so some boundary exists in [run_len_within_line-3..=run_len_within_line] + let pos_from_lower = unsafe { + search_range + .iter() + .rposition(|&b| (b as i8) >= -0x40) + .unwrap_unchecked() + }; + + run_len_within_line = lower_bound + pos_from_lower; + } + } if last_font == Some(run.font.clone()) { font_runs.last_mut().unwrap().len += run_len_within_line;