# Reviewer notes (free text ahead of the first `diff --git` header; not patch content)
#
# What the patch does
#   * char_bag.rs adds `simple_lowercase(c)`: it takes the first char produced by
#     `c.to_lowercase()` and falls back to `c` itself, so every input char maps to
#     exactly one output char. `CharBag::insert` now calls it in place of
#     `to_ascii_lowercase`.
#   * matcher.rs lowercases each candidate char with `simple_lowercase`, so
#     `lowercase_candidate_chars` stays index-aligned with `candidate_chars`. The
#     `extra_lowercase_chars` BTreeMap and the `j_regular` index correction, which
#     compensated for lowercase mappings longer than one char, are removed.
#   * The expected positions in the two existing `İ` tests change, and two new
#     tests assert that every reported position is a char boundary of the path.
#   * paths.rs and strings.rs lowercase queries and prefixes with the same helper.
#
# About this copy
#   * The text was recovered from a whitespace-collapsed rendering. Line breaks and
#     blank context lines were placed so that every hunk matches the line counts
#     in its `@@` header; indentation is conventional rustfmt style, not the
#     original bytes.
#   * Angle-bracketed generic arguments had been stripped (`collect::>()`,
#     `sum::()`, `&BTreeMap,`, `Vec)>`, `Vec =`, `-> Vec {`). They are restored from
#     how the values are used in the visible code; `collect` targets are spelled
#     `Vec<_>`, and the original may have named the element type instead.
#     `-> Vec<PathMatch>` on `match_fixed_path_set` is NOT derivable from this
#     patch - presumed, confirm against paths.rs.
#   * The two ligature test paths had been flattened to ASCII ("file", "ffoo")
#     although their comments name U+FB01 and U+FB00; the ligature characters are
#     restored.
#
# NOTE(review), left unchanged in the patch:
#   * The new test comments say "full case folding", but the operation involved is
#     `char::to_lowercase`, i.e. lowercase mapping - consider rewording.
#   * The second new test mentions "Armenian ligatures", while the listed paths
#     contain only the Latin ligatures U+FB01 and U+FB00.
diff --git a/crates/fuzzy/src/char_bag.rs b/crates/fuzzy/src/char_bag.rs
index 13b00816ed0141117fb6d5ac9265e4b82c7aa57d..1821a63793337862d9d6ad01a6a42072588d7be5 100644
--- a/crates/fuzzy/src/char_bag.rs
+++ b/crates/fuzzy/src/char_bag.rs
@@ -1,5 +1,9 @@
 use std::iter::FromIterator;
 
+pub fn simple_lowercase(c: char) -> char {
+    c.to_lowercase().next().unwrap_or(c)
+}
+
 #[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
 pub struct CharBag(u64);
 
@@ -9,7 +13,7 @@ impl CharBag {
     }
 
     fn insert(&mut self, c: char) {
-        let c = c.to_ascii_lowercase();
+        let c = simple_lowercase(c);
         if c.is_ascii_lowercase() {
             let mut count = self.0;
             let idx = c as u8 - b'a';
diff --git a/crates/fuzzy/src/matcher.rs b/crates/fuzzy/src/matcher.rs
index 782c9caca832d81fb6e4bce8f49b4f310664b292..102708d2fad6b560b1a606c34246033587affdda 100644
--- a/crates/fuzzy/src/matcher.rs
+++ b/crates/fuzzy/src/matcher.rs
@@ -1,10 +1,9 @@
 use std::{
     borrow::Borrow,
-    collections::BTreeMap,
     sync::atomic::{self, AtomicBool},
 };
 
-use crate::CharBag;
+use crate::{CharBag, char_bag::simple_lowercase};
 
 const BASE_DISTANCE_PENALTY: f64 = 0.6;
 const ADDITIONAL_DISTANCE_PENALTY: f64 = 0.05;
@@ -69,7 +68,6 @@ impl<'a> Matcher<'a> {
     {
         let mut candidate_chars = Vec::new();
         let mut lowercase_candidate_chars = Vec::new();
-        let mut extra_lowercase_chars = BTreeMap::new();
 
         for candidate in candidates {
             if !candidate.borrow().has_chars(self.query_char_bag) {
@@ -82,14 +80,9 @@ impl<'a> Matcher<'a> {
 
             candidate_chars.clear();
             lowercase_candidate_chars.clear();
-            extra_lowercase_chars.clear();
-            for (i, c) in candidate.borrow().candidate_chars().enumerate() {
+            for c in candidate.borrow().candidate_chars() {
                 candidate_chars.push(c);
-                let mut char_lowercased = c.to_lowercase().collect::<Vec<_>>();
-                if char_lowercased.len() > 1 {
-                    extra_lowercase_chars.insert(i, char_lowercased.len() - 1);
-                }
-                lowercase_candidate_chars.append(&mut char_lowercased);
+                lowercase_candidate_chars.push(simple_lowercase(c));
             }
 
             if !self.find_last_positions(lowercase_prefix, &lowercase_candidate_chars) {
@@ -108,7 +101,6 @@ impl<'a> Matcher<'a> {
                 &lowercase_candidate_chars,
                 prefix,
                 lowercase_prefix,
-                &extra_lowercase_chars,
             );
 
             if score > 0.0 {
@@ -146,7 +138,6 @@ impl<'a> Matcher<'a> {
         path_lowercased: &[char],
         prefix: &[char],
         lowercase_prefix: &[char],
-        extra_lowercase_chars: &BTreeMap<usize, usize>,
     ) -> f64 {
         let score = self.recursive_score_match(
             path,
@@ -156,7 +147,6 @@ impl<'a> Matcher<'a> {
             0,
             0,
             self.query.len() as f64,
-            extra_lowercase_chars,
         ) * self.query.len() as f64;
 
         if score <= 0.0 {
@@ -201,7 +191,6 @@ impl<'a> Matcher<'a> {
         query_idx: usize,
         path_idx: usize,
         cur_score: f64,
-        extra_lowercase_chars: &BTreeMap<usize, usize>,
     ) -> f64 {
         if query_idx == self.query.len() {
             return 1.0;
@@ -228,13 +217,6 @@ impl<'a> Matcher<'a> {
 
         let mut last_slash = 0;
         for j in path_idx..=safe_limit {
-            let extra_lowercase_chars_count = extra_lowercase_chars
-                .iter()
-                .take_while(|&(&i, _)| i < j)
-                .map(|(_, increment)| increment)
-                .sum::<usize>();
-            let j_regular = j - extra_lowercase_chars_count;
-
             let path_char = if j < prefix.len() {
                 lowercase_prefix[j]
             } else {
@@ -247,20 +229,20 @@ impl<'a> Matcher<'a> {
             let is_path_sep = path_char == '/';
 
             if query_idx == 0 && is_path_sep {
-                last_slash = j_regular;
+                last_slash = j;
             }
             let need_to_score = query_char == path_char || (is_path_sep && query_char == '_');
             if need_to_score {
-                let curr = match prefix.get(j_regular) {
+                let curr = match prefix.get(j) {
                     Some(&curr) => curr,
-                    None => path[j_regular - prefix.len()],
+                    None => path[j - prefix.len()],
                 };
 
                 let mut char_score = 1.0;
                 if j > path_idx {
-                    let last = match prefix.get(j_regular - 1) {
+                    let last = match prefix.get(j - 1) {
                         Some(&last) => last,
-                        None => path[j_regular - 1 - prefix.len()],
+                        None => path[j - 1 - prefix.len()],
                     };
 
                     if last == '/' {
@@ -316,12 +298,11 @@ impl<'a> Matcher<'a> {
                     query_idx + 1,
                     j + 1,
                     next_score,
-                    extra_lowercase_chars,
                 ) * multiplier;
 
                 if new_score > score {
                     score = new_score;
-                    best_position = j_regular;
+                    best_position = j;
                     // Optimization: can't score better than 1.
                     if new_score == 1.0 {
                         break;
@@ -469,12 +450,12 @@ mod tests {
 
         assert_eq!(
             match_single_path_query("İo/oluş", false, &mixed_unicode_paths),
-            vec![("İolu/oluş", vec![0, 2, 4, 6, 8, 10, 12])]
+            vec![("İolu/oluş", vec![0, 2, 5, 6, 7, 8, 9])]
         );
 
         assert_eq!(
             match_single_path_query("İst/code", false, &mixed_unicode_paths),
-            vec![("İstanbul/code", vec![0, 2, 4, 6, 8, 10, 12, 14])]
+            vec![("İstanbul/code", vec![0, 2, 3, 9, 10, 11, 12, 13])]
         );
 
         assert_eq!(
@@ -536,12 +517,60 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_positions_are_valid_char_boundaries_with_expanding_lowercase() {
+        // İ (U+0130) lowercases to "i\u{307}" (2 chars) under full case folding.
+        // With simple case mapping (used by this matcher), İ → 'i' (1 char),
+        // so positions remain valid byte boundaries.
+        let paths = vec!["İstanbul/code.rs", "aİbİc/dİeİf.txt", "src/İmport/İndex.ts"];
+
+        for query in &["code", "İst", "dİe", "İndex", "İmport", "abcdef"] {
+            let results = match_single_path_query(query, false, &paths);
+            for (path, positions) in &results {
+                for &pos in positions {
+                    assert!(
+                        path.is_char_boundary(pos),
+                        "Position {pos} is not a valid char boundary in path {path:?} \
+                         (query: {query:?}, all positions: {positions:?})"
+                    );
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_positions_valid_with_various_multibyte_chars() {
+        // German ß uppercases to SS but lowercases to itself — no expansion.
+        // Armenian ligatures and other characters that could expand under full
+        // case folding should still produce valid byte boundaries.
+        let paths = vec![
+            "straße/config.rs",
+            "Straße/München/file.txt",
+            "ﬁle/path.rs", // ﬁ (U+FB01, fi ligature)
+            "ﬀoo/bar.txt", // ﬀ (U+FB00, ff ligature)
+            "aÇbŞc/dÖeÜf.txt", // Turkish chars that don't expand
+        ];
+
+        for query in &["config", "Mün", "file", "bar", "abcdef", "straße", "ÇŞ"] {
+            let results = match_single_path_query(query, false, &paths);
+            for (path, positions) in &results {
+                for &pos in positions {
+                    assert!(
+                        path.is_char_boundary(pos),
+                        "Position {pos} is not a valid char boundary in path {path:?} \
+                         (query: {query:?}, all positions: {positions:?})"
+                    );
+                }
+            }
+        }
+    }
+
     fn match_single_path_query<'a>(
         query: &str,
         smart_case: bool,
         paths: &[&'a str],
     ) -> Vec<(&'a str, Vec<usize>)> {
-        let lowercase_query = query.to_lowercase().chars().collect::<Vec<_>>();
+        let lowercase_query = query.chars().map(simple_lowercase).collect::<Vec<_>>();
         let query = query.chars().collect::<Vec<_>>();
         let query_chars = CharBag::from(&lowercase_query[..]);
 
@@ -551,7 +580,7 @@ mod tests {
             .collect::<Vec<_>>();
         let mut path_entries = Vec::new();
         for (i, path) in paths.iter().enumerate() {
-            let lowercase_path = path.to_lowercase().chars().collect::<Vec<_>>();
+            let lowercase_path: Vec<char> = path.chars().map(simple_lowercase).collect();
             let char_bag = CharBag::from(lowercase_path.as_slice());
             path_entries.push(PathMatchCandidate {
                 is_dir: false,
diff --git a/crates/fuzzy/src/paths.rs b/crates/fuzzy/src/paths.rs
index cce0e082840c4cd05d6e2b21eac0073d3eb7700f..2f92f05b96a3be2da7053365d8a7c53722db6ab8 100644
--- a/crates/fuzzy/src/paths.rs
+++ b/crates/fuzzy/src/paths.rs
@@ -10,6 +10,7 @@ use util::{paths::PathStyle, rel_path::RelPath};
 
 use crate::{
     CharBag,
+    char_bag::simple_lowercase,
     matcher::{MatchCandidate, Matcher},
 };
 
@@ -94,7 +95,7 @@ pub fn match_fixed_path_set(
     max_results: usize,
     path_style: PathStyle,
 ) -> Vec<PathMatch> {
-    let lowercase_query = query.to_lowercase().chars().collect::<Vec<_>>();
+    let lowercase_query = query.chars().map(simple_lowercase).collect::<Vec<_>>();
     let query = query.chars().collect::<Vec<_>>();
     let query_char_bag = CharBag::from(&lowercase_query[..]);
 
@@ -110,7 +111,7 @@ pub fn match_fixed_path_set(
             path_prefix_chars.extend(path_style.primary_separator().chars());
             let lowercase_pfx = path_prefix_chars
                 .iter()
-                .map(|c| c.to_ascii_lowercase())
+                .map(|c| simple_lowercase(*c))
                 .collect::<Vec<_>>();
 
             (worktree_root_name, path_prefix_chars, lowercase_pfx)
@@ -171,7 +172,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
 
     let lowercase_query = query
         .iter()
-        .map(|query| query.to_ascii_lowercase())
+        .map(|query| simple_lowercase(*query))
         .collect::<Vec<_>>();
 
     let query = &query;
@@ -217,7 +218,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
                                 }
                                 let lowercase_prefix = prefix
                                     .iter()
-                                    .map(|c| c.to_ascii_lowercase())
+                                    .map(|c| simple_lowercase(*c))
                                     .collect::<Vec<_>>();
                                 matcher.match_candidates(
                                     &prefix,
diff --git a/crates/fuzzy/src/strings.rs b/crates/fuzzy/src/strings.rs
index 54539840cfb0ca251428d9f78d5d134f16afdf4c..fb191bd9dcadd81a5a9890032ef8b185cdf7342e 100644
--- a/crates/fuzzy/src/strings.rs
+++ b/crates/fuzzy/src/strings.rs
@@ -1,5 +1,6 @@
 use crate::{
     CharBag,
+    char_bag::simple_lowercase,
     matcher::{MatchCandidate, Matcher},
 };
 use gpui::BackgroundExecutor;
@@ -141,7 +142,7 @@ where
             .collect();
     }
 
-    let lowercase_query = query.to_lowercase().chars().collect::<Vec<_>>();
+    let lowercase_query = query.chars().map(simple_lowercase).collect::<Vec<_>>();
     let query = query.chars().collect::<Vec<_>>();
 
     let lowercase_query = &lowercase_query;