Properly score fuzzy match queries with multiple chars in lower case (#29794)

Kirill Bulatov created

Closes https://github.com/zed-industries/zed/issues/29526

Release Notes:

- Fixed the file finder crashing on file names that contain characters
whose lowercase form is more than one char (e.g. `İ`)

Change summary

crates/file_finder/src/file_finder_tests.rs | 32 +++++++++++++
crates/fuzzy/src/matcher.rs                 | 53 +++++++++++++++-------
2 files changed, 68 insertions(+), 17 deletions(-)

Detailed changes

crates/file_finder/src/file_finder_tests.rs 🔗

@@ -242,6 +242,38 @@ async fn test_matching_paths(cx: &mut TestAppContext) {
     }
 }
 
+#[gpui::test]
+async fn test_unicode_paths(cx: &mut TestAppContext) {
+    let app_state = init_test(cx);
+    app_state
+        .fs
+        .as_fake()
+        .insert_tree(
+            path!("/root"),
+            json!({
+                "a": {
+                    "İg": " ",
+                }
+            }),
+        )
+        .await;
+
+    let project = Project::test(app_state.fs.clone(), [path!("/root").as_ref()], cx).await;
+
+    let (picker, workspace, cx) = build_find_picker(project, cx);
+
+    cx.simulate_input("g");
+    picker.update(cx, |picker, _| {
+        assert_eq!(picker.delegate.matches.len(), 1);
+    });
+    cx.dispatch_action(SelectNext);
+    cx.dispatch_action(Confirm);
+    cx.read(|cx| {
+        let active_editor = workspace.read(cx).active_item_as::<Editor>(cx).unwrap();
+        assert_eq!(active_editor.read(cx).title(cx), "İg");
+    });
+}
+
 #[gpui::test]
 async fn test_absolute_paths(cx: &mut TestAppContext) {
     let app_state = init_test(cx);

crates/fuzzy/src/matcher.rs 🔗

@@ -1,5 +1,6 @@
 use std::{
     borrow::{Borrow, Cow},
+    collections::BTreeMap,
     sync::atomic::{self, AtomicBool},
 };
 
@@ -50,7 +51,7 @@ impl<'a> Matcher<'a> {
 
     /// Filter and score fuzzy match candidates. Results are returned unsorted, in the same order as
     /// the input candidates.
-    pub fn match_candidates<C, R, F, T>(
+    pub(crate) fn match_candidates<C, R, F, T>(
         &mut self,
         prefix: &[char],
         lowercase_prefix: &[char],
@@ -65,6 +66,7 @@ impl<'a> Matcher<'a> {
     {
         let mut candidate_chars = Vec::new();
         let mut lowercase_candidate_chars = Vec::new();
+        let mut extra_lowercase_chars = BTreeMap::new();
 
         for candidate in candidates {
             if !candidate.borrow().has_chars(self.query_char_bag) {
@@ -77,9 +79,14 @@ impl<'a> Matcher<'a> {
 
             candidate_chars.clear();
             lowercase_candidate_chars.clear();
-            for c in candidate.borrow().to_string().chars() {
+            extra_lowercase_chars.clear();
+            for (i, c) in candidate.borrow().to_string().chars().enumerate() {
                 candidate_chars.push(c);
-                lowercase_candidate_chars.append(&mut c.to_lowercase().collect::<Vec<_>>());
+                let mut char_lowercased = c.to_lowercase().collect::<Vec<_>>();
+                if char_lowercased.len() > 1 {
+                    extra_lowercase_chars.insert(i, char_lowercased.len() - 1);
+                }
+                lowercase_candidate_chars.append(&mut char_lowercased);
             }
 
             if !self.find_last_positions(lowercase_prefix, &lowercase_candidate_chars) {
@@ -97,6 +104,7 @@ impl<'a> Matcher<'a> {
                 &lowercase_candidate_chars,
                 prefix,
                 lowercase_prefix,
+                &extra_lowercase_chars,
             );
 
             if score > 0.0 {
@@ -131,18 +139,20 @@ impl<'a> Matcher<'a> {
     fn score_match(
         &mut self,
         path: &[char],
-        path_cased: &[char],
+        path_lowercased: &[char],
         prefix: &[char],
         lowercase_prefix: &[char],
+        extra_lowercase_chars: &BTreeMap<usize, usize>,
     ) -> f64 {
         let score = self.recursive_score_match(
             path,
-            path_cased,
+            path_lowercased,
             prefix,
             lowercase_prefix,
             0,
             0,
             self.query.len() as f64,
+            extra_lowercase_chars,
         ) * self.query.len() as f64;
 
         if score <= 0.0 {
@@ -173,12 +183,13 @@ impl<'a> Matcher<'a> {
     fn recursive_score_match(
         &mut self,
         path: &[char],
-        path_cased: &[char],
+        path_lowercased: &[char],
         prefix: &[char],
         lowercase_prefix: &[char],
         query_idx: usize,
         path_idx: usize,
         cur_score: f64,
+        extra_lowercase_chars: &BTreeMap<usize, usize>,
     ) -> f64 {
         use std::path::MAIN_SEPARATOR;
 
@@ -200,15 +211,22 @@ impl<'a> Matcher<'a> {
 
         let mut last_slash = 0;
         for j in path_idx..=limit {
-            let path_char = if j < prefix.len() {
+            let extra_lowercase_chars_count = extra_lowercase_chars
+                .iter()
+                .take_while(|(i, _)| i < &&j)
+                .map(|(_, increment)| increment)
+                .sum::<usize>();
+            let j_regular = j - extra_lowercase_chars_count;
+
+            let path_char = if j_regular < prefix.len() {
                 lowercase_prefix[j]
             } else {
-                path_cased[j - prefix.len()]
+                path_lowercased[j - prefix.len()]
             };
             let is_path_sep = path_char == MAIN_SEPARATOR;
 
             if query_idx == 0 && is_path_sep {
-                last_slash = j;
+                last_slash = j_regular;
             }
 
             #[cfg(not(target_os = "windows"))]
@@ -218,18 +236,18 @@ impl<'a> Matcher<'a> {
             #[cfg(target_os = "windows")]
             let need_to_score = query_char == path_char || (is_path_sep && query_char == '_');
             if need_to_score {
-                let curr = if j < prefix.len() {
-                    prefix[j]
+                let curr = if j_regular < prefix.len() {
+                    prefix[j_regular]
                 } else {
-                    path[j - prefix.len()]
+                    path[j_regular - prefix.len()]
                 };
 
                 let mut char_score = 1.0;
                 if j > path_idx {
-                    let last = if j - 1 < prefix.len() {
-                        prefix[j - 1]
+                    let last = if j_regular - 1 < prefix.len() {
+                        prefix[j_regular - 1]
                     } else {
-                        path[j - 1 - prefix.len()]
+                        path[j_regular - 1 - prefix.len()]
                     };
 
                     if last == MAIN_SEPARATOR {
@@ -279,17 +297,18 @@ impl<'a> Matcher<'a> {
 
                 let new_score = self.recursive_score_match(
                     path,
-                    path_cased,
+                    path_lowercased,
                     prefix,
                     lowercase_prefix,
                     query_idx + 1,
                     j + 1,
                     next_score,
+                    extra_lowercase_chars,
                 ) * multiplier;
 
                 if new_score > score {
                     score = new_score;
-                    best_position = j;
+                    best_position = j_regular;
                     // Optimization: can't score better than 1.
                     if new_score == 1.0 {
                         break;