From 90fcf8539eacfefd604e1c95ffe242b149129f3d Mon Sep 17 00:00:00 2001 From: David Alecrim <35930364+davidalecrim1@users.noreply.github.com> Date: Fri, 3 Apr 2026 08:00:13 -0300 Subject: [PATCH] fuzzy: Fix crash with Unicode chars whose lowercase expands to multiple codepoints (#52989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-Review Checklist: - [x] I've reviewed my own diff for quality, security, and reliability - [x] Unsafe blocks (if any) have justifying comments - [x] The content is consistent with the [UI/UX checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist) - [x] Tests cover the new/changed behavior - [x] Performance impact has been considered and is acceptable Closes #52973 ## Problem The file picker crashes with `highlight index N is not a valid UTF-8 boundary` when file paths contain Unicode characters whose lowercase form expands to multiple codepoints. Turkish `İ` (U+0130) is the trigger here: Rust's `char::to_lowercase()` turns it into `i` + combining dot above (two codepoints). That expansion breaks the fuzzy matcher in two ways: 1. The `j_regular` index mapping mixes the expanded lowercase index space with the original character index space, so highlight positions land on invalid byte boundaries. 2. The scoring matrices are allocated with the expanded length but indexed with the original length as stride, so rows alias each other and corrupt stored values. Users with Turkish locale filenames were hitting this on v0.229.0 and v0.230.0 stable. ## Fix I went with simple 1:1 case mapping: a `simple_lowercase` helper in `char_bag.rs` that takes only the first codepoint from `to_lowercase()` and drops any trailing combining characters. For `İ` this gives `i`, which is what anyone would actually type in a search query. The same function is used in the matcher, the char bag pre-filter, and both query-lowercasing call sites (`paths.rs` and `strings.rs`). This gets rid of the `extra_lowercase_chars` BTreeMap, the `j_regular` adjustment, and the matrix sizing discrepancy. The matcher now works with a flat character array where `lowercase_candidate_chars.len() == candidate_chars.len()`, so there's no expanded-vs-original index space to get wrong. I also fixed `CharBag::insert`, which used `to_ascii_lowercase()` and silently ignored non-ASCII characters. A file like `aİbİcdef.txt` wouldn't show up when searching `ai` because `İ` was never registered as `i` in the bag. It now goes through `simple_lowercase` too. The alternative was keeping full case folding and fixing the index tracking with a `Vec` mapping expanded positions back to originals. That would work but keeps the dual-index-space complexity that caused these bugs, plus adds a per-candidate allocation for the mapping vector. ## Prior art fzf uses Go's `unicode.To(unicode.LowerCase, r)`, which is simple case mapping -- always one rune in, one rune out. `İ` maps to `i`, no expansion. VS Code's `String.toLowerCase()` does produce the expanded form, but the scorer compares UTF-16 code units independently and sidesteps the problem in practice. Neither tool maintains a mapping between expanded and original index spaces. ## Trade-off Searching for the combining dot above (U+0307) won't match `İ` in a path anymore. Nobody types combining characters in a file picker, and fzf doesn't support it either. ## Screenshot Screenshot 2026-04-02 at 09 56 34 Release Notes: - Fixed a crash and improved matching and highlighting in the file picker for paths with non-ASCII characters (e.g., Turkish İ, ß, fi). --------- Co-authored-by: Oleksiy Syvokon --- crates/fuzzy/src/char_bag.rs | 6 ++- crates/fuzzy/src/matcher.rs | 93 +++++++++++++++++++++++------------- crates/fuzzy/src/paths.rs | 9 ++-- crates/fuzzy/src/strings.rs | 3 +- 4 files changed, 73 insertions(+), 38 deletions(-) diff --git a/crates/fuzzy/src/char_bag.rs b/crates/fuzzy/src/char_bag.rs index 13b00816ed0141117fb6d5ac9265e4b82c7aa57d..1821a63793337862d9d6ad01a6a42072588d7be5 100644 --- a/crates/fuzzy/src/char_bag.rs +++ b/crates/fuzzy/src/char_bag.rs @@ -1,5 +1,9 @@ use std::iter::FromIterator; +pub fn simple_lowercase(c: char) -> char { + c.to_lowercase().next().unwrap_or(c) +} + #[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)] pub struct CharBag(u64); @@ -9,7 +13,7 @@ impl CharBag { } fn insert(&mut self, c: char) { - let c = c.to_ascii_lowercase(); + let c = simple_lowercase(c); if c.is_ascii_lowercase() { let mut count = self.0; let idx = c as u8 - b'a'; diff --git a/crates/fuzzy/src/matcher.rs b/crates/fuzzy/src/matcher.rs index 782c9caca832d81fb6e4bce8f49b4f310664b292..102708d2fad6b560b1a606c34246033587affdda 100644 --- a/crates/fuzzy/src/matcher.rs +++ b/crates/fuzzy/src/matcher.rs @@ -1,10 +1,9 @@ use std::{ borrow::Borrow, - collections::BTreeMap, sync::atomic::{self, AtomicBool}, }; -use crate::CharBag; +use crate::{CharBag, char_bag::simple_lowercase}; const BASE_DISTANCE_PENALTY: f64 = 0.6; const ADDITIONAL_DISTANCE_PENALTY: f64 = 0.05; @@ -69,7 +68,6 @@ impl<'a> Matcher<'a> { { let mut candidate_chars = Vec::new(); let mut lowercase_candidate_chars = Vec::new(); - let mut extra_lowercase_chars = BTreeMap::new(); for candidate in candidates { if !candidate.borrow().has_chars(self.query_char_bag) { @@ -82,14 +80,9 @@ impl<'a> Matcher<'a> { candidate_chars.clear(); lowercase_candidate_chars.clear(); - extra_lowercase_chars.clear(); - for (i, c) in candidate.borrow().candidate_chars().enumerate() { + for c in candidate.borrow().candidate_chars() { candidate_chars.push(c); - let mut char_lowercased = c.to_lowercase().collect::>(); - if char_lowercased.len() > 1 { - extra_lowercase_chars.insert(i, char_lowercased.len() - 1); - } - lowercase_candidate_chars.append(&mut char_lowercased); + lowercase_candidate_chars.push(simple_lowercase(c)); } if !self.find_last_positions(lowercase_prefix, &lowercase_candidate_chars) { @@ -108,7 +101,6 @@ impl<'a> Matcher<'a> { &lowercase_candidate_chars, prefix, lowercase_prefix, - &extra_lowercase_chars, ); if score > 0.0 { @@ -146,7 +138,6 @@ impl<'a> Matcher<'a> { path_lowercased: &[char], prefix: &[char], lowercase_prefix: &[char], - extra_lowercase_chars: &BTreeMap, ) -> f64 { let score = self.recursive_score_match( path, @@ -156,7 +147,6 @@ impl<'a> Matcher<'a> { 0, 0, self.query.len() as f64, - extra_lowercase_chars, ) * self.query.len() as f64; if score <= 0.0 { @@ -201,7 +191,6 @@ impl<'a> Matcher<'a> { query_idx: usize, path_idx: usize, cur_score: f64, - extra_lowercase_chars: &BTreeMap, ) -> f64 { if query_idx == self.query.len() { return 1.0; @@ -228,13 +217,6 @@ impl<'a> Matcher<'a> { let mut last_slash = 0; for j in path_idx..=safe_limit { - let extra_lowercase_chars_count = extra_lowercase_chars - .iter() - .take_while(|&(&i, _)| i < j) - .map(|(_, increment)| increment) - .sum::(); - let j_regular = j - extra_lowercase_chars_count; - let path_char = if j < prefix.len() { lowercase_prefix[j] } else { @@ -247,20 +229,20 @@ impl<'a> Matcher<'a> { let is_path_sep = path_char == '/'; if query_idx == 0 && is_path_sep { - last_slash = j_regular; + last_slash = j; } let need_to_score = query_char == path_char || (is_path_sep && query_char == '_'); if need_to_score { - let curr = match prefix.get(j_regular) { + let curr = match prefix.get(j) { Some(&curr) => curr, - None => path[j_regular - prefix.len()], + None => path[j - prefix.len()], }; let mut char_score = 1.0; if j > path_idx { - let last = match prefix.get(j_regular - 1) { + let last = match prefix.get(j - 1) { Some(&last) => last, - None => path[j_regular - 1 - prefix.len()], + None => path[j - 1 - prefix.len()], }; if last == '/' { @@ -316,12 +298,11 @@ impl<'a> Matcher<'a> { query_idx + 1, j + 1, next_score, - extra_lowercase_chars, ) * multiplier; if new_score > score { score = new_score; - best_position = j_regular; + best_position = j; // Optimization: can't score better than 1. if new_score == 1.0 { break; @@ -469,12 +450,12 @@ mod tests { assert_eq!( match_single_path_query("İo/oluş", false, &mixed_unicode_paths), - vec![("İolu/oluş", vec![0, 2, 4, 6, 8, 10, 12])] + vec![("İolu/oluş", vec![0, 2, 5, 6, 7, 8, 9])] ); assert_eq!( match_single_path_query("İst/code", false, &mixed_unicode_paths), - vec![("İstanbul/code", vec![0, 2, 4, 6, 8, 10, 12, 14])] + vec![("İstanbul/code", vec![0, 2, 3, 9, 10, 11, 12, 13])] ); assert_eq!( @@ -536,12 +517,60 @@ mod tests { ); } + #[test] + fn test_positions_are_valid_char_boundaries_with_expanding_lowercase() { + // İ (U+0130) lowercases to "i\u{307}" (2 chars) under full case folding. + // With simple case mapping (used by this matcher), İ → 'i' (1 char), + // so positions remain valid byte boundaries. + let paths = vec!["İstanbul/code.rs", "aİbİc/dİeİf.txt", "src/İmport/İndex.ts"]; + + for query in &["code", "İst", "dİe", "İndex", "İmport", "abcdef"] { + let results = match_single_path_query(query, false, &paths); + for (path, positions) in &results { + for &pos in positions { + assert!( + path.is_char_boundary(pos), + "Position {pos} is not a valid char boundary in path {path:?} \ + (query: {query:?}, all positions: {positions:?})" + ); + } + } + } + } + + #[test] + fn test_positions_valid_with_various_multibyte_chars() { + // German ß uppercases to SS but lowercases to itself — no expansion. + // Armenian ligatures and other characters that could expand under full + // case folding should still produce valid byte boundaries. + let paths = vec![ + "straße/config.rs", + "Straße/München/file.txt", + "file/path.rs", // fi (U+FB01, fi ligature) + "ffoo/bar.txt", // ff (U+FB00, ff ligature) + "aÇbŞc/dÖeÜf.txt", // Turkish chars that don't expand + ]; + + for query in &["config", "Mün", "file", "bar", "abcdef", "straße", "ÇŞ"] { + let results = match_single_path_query(query, false, &paths); + for (path, positions) in &results { + for &pos in positions { + assert!( + path.is_char_boundary(pos), + "Position {pos} is not a valid char boundary in path {path:?} \ + (query: {query:?}, all positions: {positions:?})" + ); + } + } + } + } + fn match_single_path_query<'a>( query: &str, smart_case: bool, paths: &[&'a str], ) -> Vec<(&'a str, Vec)> { - let lowercase_query = query.to_lowercase().chars().collect::>(); + let lowercase_query = query.chars().map(simple_lowercase).collect::>(); let query = query.chars().collect::>(); let query_chars = CharBag::from(&lowercase_query[..]); @@ -551,7 +580,7 @@ mod tests { .collect::>(); let mut path_entries = Vec::new(); for (i, path) in paths.iter().enumerate() { - let lowercase_path = path.to_lowercase().chars().collect::>(); + let lowercase_path: Vec = path.chars().map(simple_lowercase).collect(); let char_bag = CharBag::from(lowercase_path.as_slice()); path_entries.push(PathMatchCandidate { is_dir: false, diff --git a/crates/fuzzy/src/paths.rs b/crates/fuzzy/src/paths.rs index cce0e082840c4cd05d6e2b21eac0073d3eb7700f..2f92f05b96a3be2da7053365d8a7c53722db6ab8 100644 --- a/crates/fuzzy/src/paths.rs +++ b/crates/fuzzy/src/paths.rs @@ -10,6 +10,7 @@ use util::{paths::PathStyle, rel_path::RelPath}; use crate::{ CharBag, + char_bag::simple_lowercase, matcher::{MatchCandidate, Matcher}, }; @@ -94,7 +95,7 @@ pub fn match_fixed_path_set( max_results: usize, path_style: PathStyle, ) -> Vec { - let lowercase_query = query.to_lowercase().chars().collect::>(); + let lowercase_query = query.chars().map(simple_lowercase).collect::>(); let query = query.chars().collect::>(); let query_char_bag = CharBag::from(&lowercase_query[..]); @@ -110,7 +111,7 @@ pub fn match_fixed_path_set( path_prefix_chars.extend(path_style.primary_separator().chars()); let lowercase_pfx = path_prefix_chars .iter() - .map(|c| c.to_ascii_lowercase()) + .map(|c| simple_lowercase(*c)) .collect::>(); (worktree_root_name, path_prefix_chars, lowercase_pfx) @@ -171,7 +172,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( let lowercase_query = query .iter() - .map(|query| query.to_ascii_lowercase()) + .map(|query| simple_lowercase(*query)) .collect::>(); let query = &query; @@ -217,7 +218,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( } let lowercase_prefix = prefix .iter() - .map(|c| c.to_ascii_lowercase()) + .map(|c| simple_lowercase(*c)) .collect::>(); matcher.match_candidates( &prefix, diff --git a/crates/fuzzy/src/strings.rs b/crates/fuzzy/src/strings.rs index 54539840cfb0ca251428d9f78d5d134f16afdf4c..fb191bd9dcadd81a5a9890032ef8b185cdf7342e 100644 --- a/crates/fuzzy/src/strings.rs +++ b/crates/fuzzy/src/strings.rs @@ -1,5 +1,6 @@ use crate::{ CharBag, + char_bag::simple_lowercase, matcher::{MatchCandidate, Matcher}, }; use gpui::BackgroundExecutor; @@ -141,7 +142,7 @@ where .collect(); } - let lowercase_query = query.to_lowercase().chars().collect::>(); + let lowercase_query = query.chars().map(simple_lowercase).collect::>(); let query = query.chars().collect::>(); let lowercase_query = &lowercase_query;