Fuzzy match performance improvements redo (#22561)

Michael Sloan created

Release Notes:

- N/A

Change summary

crates/fuzzy/src/matcher.rs | 41 +++++++-------------------
crates/fuzzy/src/paths.rs   | 59 ++++++++++++++------------------------
crates/fuzzy/src/strings.rs | 39 +++++++------------------
crates/util/src/util.rs     | 46 +++++++++++++++++++++++++++++
4 files changed, 89 insertions(+), 96 deletions(-)

Detailed changes

crates/fuzzy/src/matcher.rs 🔗

@@ -14,7 +14,6 @@ pub struct Matcher<'a> {
     lowercase_query: &'a [char],
     query_char_bag: CharBag,
     smart_case: bool,
-    max_results: usize,
     min_score: f64,
     match_positions: Vec<usize>,
     last_positions: Vec<usize>,
@@ -22,11 +21,6 @@ pub struct Matcher<'a> {
     best_position_matrix: Vec<usize>,
 }
 
-pub trait Match: Ord {
-    fn score(&self) -> f64;
-    fn set_positions(&mut self, positions: Vec<usize>);
-}
-
 pub trait MatchCandidate {
     fn has_chars(&self, bag: CharBag) -> bool;
     fn to_string(&self) -> Cow<'_, str>;
@@ -38,7 +32,6 @@ impl<'a> Matcher<'a> {
         lowercase_query: &'a [char],
         query_char_bag: CharBag,
         smart_case: bool,
-        max_results: usize,
     ) -> Self {
         Self {
             query,
@@ -50,10 +43,11 @@ impl<'a> Matcher<'a> {
             score_matrix: Vec::new(),
             best_position_matrix: Vec::new(),
             smart_case,
-            max_results,
         }
     }
 
+    /// Filter and score fuzzy match candidates. Results are returned unsorted, in the same order as
+    /// the input candidates.
     pub fn match_candidates<C: MatchCandidate, R, F>(
         &mut self,
         prefix: &[char],
@@ -63,8 +57,7 @@ impl<'a> Matcher<'a> {
         cancel_flag: &AtomicBool,
         build_match: F,
     ) where
-        R: Match,
-        F: Fn(&C, f64) -> R,
+        F: Fn(&C, f64, &Vec<usize>) -> R,
     {
         let mut candidate_chars = Vec::new();
         let mut lowercase_candidate_chars = Vec::new();
@@ -103,20 +96,7 @@ impl<'a> Matcher<'a> {
             );
 
             if score > 0.0 {
-                let mut mat = build_match(&candidate, score);
-                if let Err(i) = results.binary_search_by(|m| mat.cmp(m)) {
-                    if results.len() < self.max_results {
-                        mat.set_positions(self.match_positions.clone());
-                        results.insert(i, mat);
-                    } else if i < results.len() {
-                        results.pop();
-                        mat.set_positions(self.match_positions.clone());
-                        results.insert(i, mat);
-                    }
-                    if results.len() == self.max_results {
-                        self.min_score = results.last().unwrap().score();
-                    }
-                }
+                results.push(build_match(&candidate, score, &self.match_positions));
             }
         }
     }
@@ -325,18 +305,18 @@ mod tests {
     #[test]
     fn test_get_last_positions() {
         let mut query: &[char] = &['d', 'c'];
-        let mut matcher = Matcher::new(query, query, query.into(), false, 10);
+        let mut matcher = Matcher::new(query, query, query.into(), false);
         let result = matcher.find_last_positions(&['a', 'b', 'c'], &['b', 'd', 'e', 'f']);
         assert!(!result);
 
         query = &['c', 'd'];
-        let mut matcher = Matcher::new(query, query, query.into(), false, 10);
+        let mut matcher = Matcher::new(query, query, query.into(), false);
         let result = matcher.find_last_positions(&['a', 'b', 'c'], &['b', 'd', 'e', 'f']);
         assert!(result);
         assert_eq!(matcher.last_positions, vec![2, 4]);
 
         query = &['z', '/', 'z', 'f'];
-        let mut matcher = Matcher::new(query, query, query.into(), false, 10);
+        let mut matcher = Matcher::new(query, query, query.into(), false);
         let result = matcher.find_last_positions(&['z', 'e', 'd', '/'], &['z', 'e', 'd', '/', 'f']);
         assert!(result);
         assert_eq!(matcher.last_positions, vec![0, 3, 4, 8]);
@@ -451,7 +431,7 @@ mod tests {
             });
         }
 
-        let mut matcher = Matcher::new(&query, &lowercase_query, query_chars, smart_case, 100);
+        let mut matcher = Matcher::new(&query, &lowercase_query, query_chars, smart_case);
 
         let cancel_flag = AtomicBool::new(false);
         let mut results = Vec::new();
@@ -462,16 +442,17 @@ mod tests {
             path_entries.into_iter(),
             &mut results,
             &cancel_flag,
-            |candidate, score| PathMatch {
+            |candidate, score, positions| PathMatch {
                 score,
                 worktree_id: 0,
-                positions: Vec::new(),
+                positions: positions.clone(),
                 path: Arc::from(candidate.path),
                 path_prefix: "".into(),
                 distance_to_relative_ancestor: usize::MAX,
                 is_dir: false,
             },
         );
+        results.sort_by(|a, b| b.cmp(a));
 
         results
             .into_iter()

crates/fuzzy/src/paths.rs 🔗

@@ -3,11 +3,14 @@ use std::{
     borrow::Cow,
     cmp::{self, Ordering},
     path::Path,
-    sync::{atomic::AtomicBool, Arc},
+    sync::{
+        atomic::{self, AtomicBool},
+        Arc,
+    },
 };
 
 use crate::{
-    matcher::{Match, MatchCandidate, Matcher},
+    matcher::{MatchCandidate, Matcher},
     CharBag,
 };
 
@@ -42,16 +45,6 @@ pub trait PathMatchCandidateSet<'a>: Send + Sync {
     fn candidates(&'a self, start: usize) -> Self::Candidates;
 }
 
-impl Match for PathMatch {
-    fn score(&self) -> f64 {
-        self.score
-    }
-
-    fn set_positions(&mut self, positions: Vec<usize>) {
-        self.positions = positions;
-    }
-}
-
 impl<'a> MatchCandidate for PathMatchCandidate<'a> {
     fn has_chars(&self, bag: CharBag) -> bool {
         self.char_bag.is_superset(bag)
@@ -102,13 +95,7 @@ pub fn match_fixed_path_set(
     let query = query.chars().collect::<Vec<_>>();
     let query_char_bag = CharBag::from(&lowercase_query[..]);
 
-    let mut matcher = Matcher::new(
-        &query,
-        &lowercase_query,
-        query_char_bag,
-        smart_case,
-        max_results,
-    );
+    let mut matcher = Matcher::new(&query, &lowercase_query, query_char_bag, smart_case);
 
     let mut results = Vec::new();
     matcher.match_candidates(
@@ -117,16 +104,17 @@ pub fn match_fixed_path_set(
         candidates.into_iter(),
         &mut results,
         &AtomicBool::new(false),
-        |candidate, score| PathMatch {
+        |candidate, score, positions| PathMatch {
             score,
             worktree_id,
-            positions: Vec::new(),
+            positions: positions.clone(),
             is_dir: candidate.is_dir,
             path: Arc::from(candidate.path),
             path_prefix: Arc::default(),
             distance_to_relative_ancestor: usize::MAX,
         },
     );
+    util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
     results
 }
 
@@ -164,16 +152,15 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
                 scope.spawn(async move {
                     let segment_start = segment_idx * segment_size;
                     let segment_end = segment_start + segment_size;
-                    let mut matcher = Matcher::new(
-                        query,
-                        lowercase_query,
-                        query_char_bag,
-                        smart_case,
-                        max_results,
-                    );
+                    let mut matcher =
+                        Matcher::new(query, lowercase_query, query_char_bag, smart_case);
 
                     let mut tree_start = 0;
                     for candidate_set in candidate_sets {
+                        if cancel_flag.load(atomic::Ordering::Relaxed) {
+                            break;
+                        }
+
                         let tree_end = tree_start + candidate_set.len();
 
                         if tree_start < segment_end && segment_start < tree_end {
@@ -193,10 +180,10 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
                                 candidates,
                                 results,
                                 cancel_flag,
-                                |candidate, score| PathMatch {
+                                |candidate, score, positions| PathMatch {
                                     score,
                                     worktree_id,
-                                    positions: Vec::new(),
+                                    positions: positions.clone(),
                                     path: Arc::from(candidate.path),
                                     is_dir: candidate.is_dir,
                                     path_prefix: candidate_set.prefix(),
@@ -222,14 +209,12 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
         })
         .await;
 
-    let mut results = Vec::new();
-    for segment_result in segment_results {
-        if results.is_empty() {
-            results = segment_result;
-        } else {
-            util::extend_sorted(&mut results, segment_result, max_results, |a, b| b.cmp(a));
-        }
+    if cancel_flag.load(atomic::Ordering::Relaxed) {
+        return Vec::new();
     }
+
+    let mut results = segment_results.concat();
+    util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
     results
 }
 

crates/fuzzy/src/strings.rs 🔗

@@ -1,5 +1,5 @@
 use crate::{
-    matcher::{Match, MatchCandidate, Matcher},
+    matcher::{MatchCandidate, Matcher},
     CharBag,
 };
 use gpui::BackgroundExecutor;
@@ -8,7 +8,7 @@ use std::{
     cmp::{self, Ordering},
     iter,
     ops::Range,
-    sync::atomic::AtomicBool,
+    sync::atomic::{self, AtomicBool},
 };
 
 #[derive(Clone, Debug)]
@@ -46,16 +46,6 @@ pub struct StringMatch {
     pub string: String,
 }
 
-impl Match for StringMatch {
-    fn score(&self) -> f64 {
-        self.score
-    }
-
-    fn set_positions(&mut self, positions: Vec<usize>) {
-        self.positions = positions;
-    }
-}
-
 impl StringMatch {
     pub fn ranges(&self) -> impl '_ + Iterator<Item = Range<usize>> {
         let mut positions = self.positions.iter().peekable();
@@ -167,13 +157,8 @@ pub async fn match_strings(
                 scope.spawn(async move {
                     let segment_start = cmp::min(segment_idx * segment_size, candidates.len());
                     let segment_end = cmp::min(segment_start + segment_size, candidates.len());
-                    let mut matcher = Matcher::new(
-                        query,
-                        lowercase_query,
-                        query_char_bag,
-                        smart_case,
-                        max_results,
-                    );
+                    let mut matcher =
+                        Matcher::new(query, lowercase_query, query_char_bag, smart_case);
 
                     matcher.match_candidates(
                         &[],
@@ -181,10 +166,10 @@ pub async fn match_strings(
                         candidates[segment_start..segment_end].iter(),
                         results,
                         cancel_flag,
-                        |candidate, score| StringMatch {
+                        |candidate, score, positions| StringMatch {
                             candidate_id: candidate.id,
                             score,
-                            positions: Vec::new(),
+                            positions: positions.clone(),
                             string: candidate.string.to_string(),
                         },
                     );
@@ -193,13 +178,11 @@ pub async fn match_strings(
         })
         .await;
 
-    let mut results = Vec::new();
-    for segment_result in segment_results {
-        if results.is_empty() {
-            results = segment_result;
-        } else {
-            util::extend_sorted(&mut results, segment_result, max_results, |a, b| b.cmp(a));
-        }
+    if cancel_flag.load(atomic::Ordering::Relaxed) {
+        return Vec::new();
     }
+
+    let mut results = segment_results.concat();
+    util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
     results
 }

crates/util/src/util.rs 🔗

@@ -8,7 +8,6 @@ pub mod test;
 
 use anyhow::{anyhow, Context as _, Result};
 use futures::Future;
-
 use itertools::Either;
 use regex::Regex;
 use std::sync::{LazyLock, OnceLock};
@@ -111,6 +110,28 @@ where
     }
 }
 
+pub fn truncate_to_bottom_n_sorted_by<T, F>(items: &mut Vec<T>, limit: usize, compare: &F)
+where
+    F: Fn(&T, &T) -> Ordering,
+{
+    if limit == 0 {
+        items.truncate(0);
+    }
+    if items.len() <= limit {
+        items.sort_by(compare);
+        return;
+    }
+    // When limit is near to items.len() it may be more efficient to sort the whole list and
+    // truncate, rather than always doing selection first as is done below. It's hard to analyze
+    // where the threshold for this should be since the quickselect style algorithm used by
+    // `select_nth_unstable_by` makes the prefix partially sorted, and so its work is not wasted -
+    // the expected number of comparisons needed by `sort_by` is less than it is for some arbitrary
+    // unsorted input.
+    items.select_nth_unstable_by(limit, compare);
+    items.truncate(limit);
+    items.sort_by(compare);
+}
+
 #[cfg(unix)]
 pub fn load_shell_from_passwd() -> Result<()> {
     let buflen = match unsafe { libc::sysconf(libc::_SC_GETPW_R_SIZE_MAX) } {
@@ -734,6 +755,29 @@ mod tests {
         assert_eq!(vec, &[1000, 101, 21, 19, 17, 13, 9, 8]);
     }
 
+    #[test]
+    fn test_truncate_to_bottom_n_sorted_by() {
+        let mut vec: Vec<u32> = vec![5, 2, 3, 4, 1];
+        truncate_to_bottom_n_sorted_by(&mut vec, 10, &u32::cmp);
+        assert_eq!(vec, &[1, 2, 3, 4, 5]);
+
+        vec = vec![5, 2, 3, 4, 1];
+        truncate_to_bottom_n_sorted_by(&mut vec, 5, &u32::cmp);
+        assert_eq!(vec, &[1, 2, 3, 4, 5]);
+
+        vec = vec![5, 2, 3, 4, 1];
+        truncate_to_bottom_n_sorted_by(&mut vec, 4, &u32::cmp);
+        assert_eq!(vec, &[1, 2, 3, 4]);
+
+        vec = vec![5, 2, 3, 4, 1];
+        truncate_to_bottom_n_sorted_by(&mut vec, 1, &u32::cmp);
+        assert_eq!(vec, &[1]);
+
+        vec = vec![5, 2, 3, 4, 1];
+        truncate_to_bottom_n_sorted_by(&mut vec, 0, &u32::cmp);
+        assert!(vec.is_empty());
+    }
+
     #[test]
     fn test_iife() {
         fn option_returning_function() -> Option<()> {