diff --git a/Cargo.lock b/Cargo.lock index b93f6ddb22d0bcfb6516ef7e8933ba8ba7505e35..9cc4011ff11502c9a992f3a1a00d7325bca6e74d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6772,6 +6772,8 @@ dependencies = [ name = "fuzzy_nucleo" version = "0.1.0" dependencies = [ + "criterion", + "fuzzy", "gpui", "nucleo", "util", diff --git a/crates/file_finder/src/file_finder.rs b/crates/file_finder/src/file_finder.rs index a4d9ea042dea898b9dd9db7d40354cf960d210d5..ddba89c9c744f80cdddd84c35ae5856d7b2464b9 100644 --- a/crates/file_finder/src/file_finder.rs +++ b/crates/file_finder/src/file_finder.rs @@ -698,13 +698,18 @@ fn matching_history_items<'a>( .into_iter() .chain(currently_opened) .map(|found_path| { - let candidate = PathMatchCandidate { - is_dir: false, // You can't open directories as project items - path: &found_path.project.path, - // Only match history items names, otherwise their paths may match too many queries, producing false positives. - // E.g. `foo` would match both `something/foo/bar.rs` and `something/foo/foo.rs` and if the former is a history item, - // it would be shown first always, despite the latter being a better match. - }; + // Only match history items names, otherwise their paths may match too many queries, + // producing false positives. E.g. `foo` would match both `something/foo/bar.rs` and + // `something/foo/foo.rs` and if the former is a history item, it would be shown first + // always, despite the latter being a better match. 
+ let candidate = PathMatchCandidate::new( + &found_path.project.path, + false, + worktree_name_by_id + .as_ref() + .and_then(|m| m.get(&found_path.project.worktree_id)) + .map(|prefix| prefix.as_ref()), + ); candidates_paths.insert(&found_path.project, found_path); (found_path.project.worktree_id, candidate) }) @@ -731,7 +736,7 @@ fn matching_history_items<'a>( worktree.to_usize(), worktree_root_name, query.path_query(), - false, + fuzzy_nucleo::Case::Ignore, max_results, path_style, ) @@ -914,7 +919,7 @@ impl FileFinderDelegate { candidate_sets.as_slice(), query.path_query(), &relative_to, - false, + fuzzy_nucleo::Case::Ignore, 100, &cancel_flag, cx.background_executor().clone(), diff --git a/crates/fuzzy_nucleo/Cargo.toml b/crates/fuzzy_nucleo/Cargo.toml index 59e8b642524777f449f79edba85093eef069ebff..b2152035ff317aeee5a675e07db1b923213db2f5 100644 --- a/crates/fuzzy_nucleo/Cargo.toml +++ b/crates/fuzzy_nucleo/Cargo.toml @@ -13,9 +13,15 @@ path = "src/fuzzy_nucleo.rs" doctest = false [dependencies] +fuzzy.workspace = true nucleo.workspace = true gpui.workspace = true util.workspace = true [dev-dependencies] -util = {workspace = true, features = ["test-support"]} +criterion.workspace = true +util = { workspace = true, features = ["test-support"] } + +[[bench]] +name = "match_benchmark" +harness = false diff --git a/crates/fuzzy_nucleo/benches/match_benchmark.rs b/crates/fuzzy_nucleo/benches/match_benchmark.rs new file mode 100644 index 0000000000000000000000000000000000000000..3aab6e756fcb944e04e218bc286fe59cc70496a9 --- /dev/null +++ b/crates/fuzzy_nucleo/benches/match_benchmark.rs @@ -0,0 +1,253 @@ +use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main}; +use fuzzy::CharBag; +use util::{paths::PathStyle, rel_path::RelPath}; + +const DIRS: &[&str] = &[ + "src", + "crates/gpui/src", + "crates/editor/src", + "crates/fuzzy_nucleo/src", + "crates/workspace/src", + "crates/project/src", + "crates/language/src", + "crates/terminal/src", + 
"crates/assistant/src", + "crates/theme/src", + "tests/integration", + "tests/unit", + "docs/architecture", + "scripts", + "assets/icons", + "assets/fonts", + "crates/git/src", + "crates/rpc/src", + "crates/settings/src", + "crates/diagnostics/src", + "crates/search/src", + "crates/collab/src", + "crates/db/src", + "crates/lsp/src", +]; + +const FILENAMES: &[&str] = &[ + "parser.rs", + "main.rs", + "executor.rs", + "editor.rs", + "strings.rs", + "workspace.rs", + "project.rs", + "buffer.rs", + "colors.rs", + "panel.rs", + "renderer.rs", + "dispatcher.rs", + "matcher.rs", + "paths.rs", + "context.rs", + "toolbar.rs", + "statusbar.rs", + "keymap.rs", + "config.rs", + "settings.rs", + "diagnostics.rs", + "completion.rs", + "hover.rs", + "references.rs", + "inlay_hints.rs", + "git_blame.rs", + "terminal.rs", + "search.rs", + "replace.rs", + "outline.rs", + "breadcrumbs.rs", + "tab_bar.rs", + "Cargo.toml", + "README.md", + "build.sh", + "LICENSE", + "overview.md", + "string_helpers.rs", + "test_helpers.rs", + "fixtures.json", + "schema.sql", +]; + +const QUERY_WORDS: &[&str] = &[ + "par", + "edi", + "buf", + "set", + "mat", + "con", + "ren", + "dis", + "sea", + "ter", + "col", + "hov", + "out", + "rep", + "key", + "too", + "pan", + "str", + "dia", + "com", + "executor", + "workspace", + "settings", + "terminal", + "breadcrumbs", + "git_blame", + "fixtures", + "schema", + "config", + "toolbar", +]; + +/// Deterministic query generation from QUERY_WORDS using a simple LCG. +/// Returns `count` queries of each arity: 1, 2, and 4 space-separated words. 
+fn generate_queries(count: usize) -> (Vec<String>, Vec<String>, Vec<String>) { + let mut state: u64 = 0xDEAD_BEEF; + let mut next = || -> usize { + // LCG: simple, fast, deterministic + state = state.wrapping_mul(6364136223846793005).wrapping_add(1); + (state >> 33) as usize + }; + let mut n_word = |n: usize| -> Vec<String> { + (0..count) + .map(|_| { + (0..n) + .map(|_| QUERY_WORDS[next() % QUERY_WORDS.len()]) + .collect::<Vec<&str>>() + .join(" ") + }) + .collect() + }; + + (n_word(1), n_word(2), n_word(4)) +} + +fn generate_path_strings(count: usize) -> &'static [String] { + let paths: Box<[String]> = (0..count) + .map(|id| { + let dir = DIRS[id % DIRS.len()]; + let file = FILENAMES[id / DIRS.len() % FILENAMES.len()]; + format!("{dir}/{file}") + }) + .collect(); + Box::leak(paths) +} + +fn generate_nucleo_path_candidates( + paths: &'static [String], +) -> Vec<fuzzy_nucleo::PathMatchCandidate<'static>> { + paths + .iter() + .map(|path| { + fuzzy_nucleo::PathMatchCandidate::new(RelPath::unix(path).unwrap(), false, None) + }) + .collect() +} + +fn generate_fuzzy_path_candidates( + paths: &'static [String], +) -> Vec<fuzzy::PathMatchCandidate<'static>> { + paths + .iter() + .map(|path| fuzzy::PathMatchCandidate { + is_dir: false, + path: RelPath::unix(path).unwrap(), + char_bag: CharBag::from(path.as_str()), + }) + .collect() +} + +fn capitalize_each_word(query: &str) -> String { + query + .split_whitespace() + .map(|w| { + let mut chars = w.chars(); + match chars.next() { + Some(c) => c.to_ascii_uppercase().to_string() + chars.as_str(), + None => String::new(), + } + }) + .collect::<Vec<String>>() + .join(" ") +} + +fn bench_path_matching(criterion: &mut Criterion) { + let sizes = [100, 1000, 10_000]; + let all_path_strings = sizes.map(generate_path_strings); + let query_count = 200; + let (q1, q2, q4) = generate_queries(query_count); + let q1_upper: Vec<String> = q1.iter().map(|q| capitalize_each_word(q)).collect(); + let q2_upper: Vec<String> = q2.iter().map(|q| capitalize_each_word(q)).collect(); + let q4_upper: Vec<String> = q4.iter().map(|q| capitalize_each_word(q)).collect(); + + for (label, queries, case) in 
[ + ("path/1-word", &q1, fuzzy_nucleo::Case::Ignore), + ("path/2-word", &q2, fuzzy_nucleo::Case::Ignore), + ("path/4-word", &q4, fuzzy_nucleo::Case::Ignore), + ("path_smart/1-word", &q1_upper, fuzzy_nucleo::Case::Smart), + ("path_smart/2-word", &q2_upper, fuzzy_nucleo::Case::Smart), + ("path_smart/4-word", &q4_upper, fuzzy_nucleo::Case::Smart), + ] { + let mut group = criterion.benchmark_group(label); + for (size_index, &size) in sizes.iter().enumerate() { + let path_strings = all_path_strings[size_index]; + + let mut query_idx = 0usize; + group.bench_function(BenchmarkId::new("nucleo", size), |b| { + b.iter_batched( + || { + let query = queries[query_idx % queries.len()].as_str(); + query_idx += 1; + (generate_nucleo_path_candidates(path_strings), query) + }, + |(candidates, query)| { + fuzzy_nucleo::match_fixed_path_set( + candidates, + 0, + None, + query, + case, + size, + PathStyle::Posix, + ) + }, + BatchSize::SmallInput, + ) + }); + + let mut query_idx = 0usize; + group.bench_function(BenchmarkId::new("fuzzy", size), |b| { + b.iter_batched( + || { + let query = queries[query_idx % queries.len()].as_str(); + query_idx += 1; + (generate_fuzzy_path_candidates(path_strings), query) + }, + |(candidates, query)| { + fuzzy::match_fixed_path_set( + candidates, + 0, + None, + query, + false, + size, + PathStyle::Posix, + ) + }, + BatchSize::SmallInput, + ) + }); + } + group.finish(); + } +} + +criterion_group!(benches, bench_path_matching); +criterion_main!(benches); diff --git a/crates/fuzzy_nucleo/src/fuzzy_nucleo.rs b/crates/fuzzy_nucleo/src/fuzzy_nucleo.rs index ddaa5c3489cf55d41d31440f037214b1dce0358c..dcc9edf37d4bf3575dd95cb78a57aa7eb14e0ede 100644 --- a/crates/fuzzy_nucleo/src/fuzzy_nucleo.rs +++ b/crates/fuzzy_nucleo/src/fuzzy_nucleo.rs @@ -3,3 +3,53 @@ mod paths; pub use paths::{ PathMatch, PathMatchCandidate, PathMatchCandidateSet, match_fixed_path_set, match_path_sets, }; + +pub(crate) struct Cancelled; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub 
enum Case { + Smart, + Ignore, +} + +impl Case { + pub fn from_smart(smart: bool) -> Self { + if smart { Self::Smart } else { Self::Ignore } + } + + pub fn is_smart(self) -> bool { + matches!(self, Self::Smart) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum LengthPenalty { + On, + Off, +} + +impl LengthPenalty { + pub fn from_bool(on: bool) -> Self { + if on { Self::On } else { Self::Off } + } + + pub fn is_on(self) -> bool { + matches!(self, Self::On) + } +} + +/// Reconstruct byte-offset match positions from a list of matched char offsets +/// that is already sorted ascending and deduplicated. +pub(crate) fn positions_from_sorted(s: &str, sorted_char_indices: &[u32]) -> Vec<usize> { + let mut iter = sorted_char_indices.iter().copied().peekable(); + let mut out = Vec::with_capacity(sorted_char_indices.len()); + for (char_offset, (byte_offset, _)) in s.char_indices().enumerate() { + if iter.peek().is_none() { + break; + } + if iter.next_if(|&m| m == char_offset as u32).is_some() { + out.push(byte_offset); + } + } + out +} diff --git a/crates/fuzzy_nucleo/src/matcher.rs b/crates/fuzzy_nucleo/src/matcher.rs index b31da011106341420095bcffbfd012f40014ad6c..21142b517e6678056a8e9d23df26ab8e41fafe84 100644 --- a/crates/fuzzy_nucleo/src/matcher.rs +++ b/crates/fuzzy_nucleo/src/matcher.rs @@ -4,8 +4,15 @@ static MATCHERS: Mutex<Vec<nucleo::Matcher>> = Mutex::new(Vec::new()); pub const LENGTH_PENALTY: f64 = 0.01; +fn pool_cap() -> usize { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(8) + .max(1) +} + pub fn get_matcher(config: nucleo::Config) -> nucleo::Matcher { - let mut matchers = MATCHERS.lock().unwrap(); + let mut matchers = MATCHERS.lock().unwrap_or_else(|e| e.into_inner()); match matchers.pop() { Some(mut matcher) => { matcher.config = config; @@ -16,12 +23,15 @@ pub fn get_matcher(config: nucleo::Config) -> nucleo::Matcher { } pub fn return_matcher(matcher: nucleo::Matcher) { - MATCHERS.lock().unwrap().push(matcher); + let mut pool = 
MATCHERS.lock().unwrap_or_else(|e| e.into_inner()); + if pool.len() < pool_cap() { + pool.push(matcher); + } } pub fn get_matchers(n: usize, config: nucleo::Config) -> Vec<nucleo::Matcher> { let mut matchers: Vec<_> = { - let mut pool = MATCHERS.lock().unwrap(); + let mut pool = MATCHERS.lock().unwrap_or_else(|e| e.into_inner()); let available = pool.len().min(n); pool.drain(..available) .map(|mut matcher| { @@ -34,6 +44,9 @@ pub fn get_matchers(n: usize, config: nucleo::Config) -> Vec<nucleo::Matcher> { matchers } -pub fn return_matchers(mut matchers: Vec<nucleo::Matcher>) { - MATCHERS.lock().unwrap().append(&mut matchers); +pub fn return_matchers(matchers: Vec<nucleo::Matcher>) { + let cap = pool_cap(); + let mut pool = MATCHERS.lock().unwrap_or_else(|e| e.into_inner()); + let space = cap.saturating_sub(pool.len()); + pool.extend(matchers.into_iter().take(space)); } diff --git a/crates/fuzzy_nucleo/src/paths.rs b/crates/fuzzy_nucleo/src/paths.rs index ac766622c9d12c6e2a119fbcd7dd7fe7a3b5a90d..dd4594ce37e52270221b264985f7bee2ae28f172 100644 --- a/crates/fuzzy_nucleo/src/paths.rs +++ b/crates/fuzzy_nucleo/src/paths.rs @@ -11,12 +11,35 @@ use util::{paths::PathStyle, rel_path::RelPath}; use nucleo::Utf32Str; use nucleo::pattern::{Atom, AtomKind, CaseMatching, Normalization}; +use fuzzy::CharBag; + use crate::matcher::{self, LENGTH_PENALTY}; +use crate::{Cancelled, Case, positions_from_sorted}; #[derive(Clone, Debug)] pub struct PathMatchCandidate<'a> { pub is_dir: bool, pub path: &'a RelPath, + pub char_bag: CharBag, +} + +impl<'a> PathMatchCandidate<'a> { + /// Build a candidate whose prefilter bag covers both the worktree prefix and the path. + /// Pass `None` when matching against paths that have no worktree prefix. 
+ pub fn new(path: &'a RelPath, is_dir: bool, path_prefix: Option<&RelPath>) -> Self { + let mut char_bag = CharBag::default(); + if let Some(prefix) = path_prefix + && !prefix.is_empty() + { + char_bag.extend(prefix.as_unix_str().chars().map(|c| c.to_ascii_lowercase())); + } + char_bag.extend(path.as_unix_str().chars().map(|c| c.to_ascii_lowercase())); + Self { + is_dir, + path, + char_bag, + } + } } #[derive(Clone, Debug)] @@ -62,8 +85,7 @@ impl PartialOrd for PathMatch { impl Ord for PathMatch { fn cmp(&self, other: &Self) -> Ordering { self.score - .partial_cmp(&other.score) - .unwrap_or(Ordering::Equal) + .total_cmp(&other.score) .then_with(|| self.worktree_id.cmp(&other.worktree_id)) .then_with(|| { other @@ -74,18 +96,47 @@ impl Ord for PathMatch { } } -fn make_atoms(query: &str, smart_case: bool) -> Vec<Atom> { - let case = if smart_case { - CaseMatching::Smart - } else { - CaseMatching::Ignore - }; +// Path matching is always case-insensitive at the nucleo level. `Case::Smart` +// is honored as a *scoring hint*: when the query contains uppercase, candidates +// whose matched characters disagree in case are downranked by a factor per +// mismatch rather than dropped. This keeps `"Editor: Backspace"` matching +// `"editor: backspace"` while still preferring exact-case hits. +const SMART_CASE_PENALTY_PER_MISMATCH: f64 = 0.9; + +pub(crate) fn make_atoms(query: &str) -> Vec<Atom> { query .split_whitespace() - .map(|word| Atom::new(word, case, Normalization::Smart, AtomKind::Fuzzy, false)) + .map(|word| { + Atom::new( + word, + CaseMatching::Ignore, + Normalization::Smart, + AtomKind::Fuzzy, + false, + ) + }) .collect() } +// Only populated when we will actually charge a smart-case penalty, so the hot +// path can iterate a plain `&[Atom]` and ignore this slice entirely. 
+fn make_source_words(query: &str, case: Case) -> Option>> { + (case.is_smart() && query.chars().any(|c| c.is_uppercase())).then(|| { + query + .split_whitespace() + .map(|word| word.chars().collect()) + .collect() + }) +} + +fn case_penalty(mismatches: u32) -> f64 { + if mismatches == 0 { + 1.0 + } else { + SMART_CASE_PENALTY_PER_MISMATCH.powi(mismatches as i32) + } +} + pub(crate) fn distance_between_paths(path: &RelPath, relative_to: &RelPath) -> usize { let mut path_components = path.components(); let mut relative_components = relative_to.components(); @@ -121,11 +172,12 @@ fn get_filename_match_bonus( } total_score as f64 / filename.len().max(1) as f64 } -struct Cancelled; fn path_match_helper<'a>( matcher: &mut nucleo::Matcher, atoms: &[Atom], + source_words: Option<&[Vec]>, + query_bag: CharBag, candidates: impl Iterator>, results: &mut Vec, worktree_id: usize, @@ -146,6 +198,7 @@ fn path_match_helper<'a>( let mut buf = Vec::new(); let mut matched_chars: Vec = Vec::new(); let mut atom_matched_chars = Vec::new(); + let mut candidate_chars: Vec = Vec::new(); for candidate in candidates { buf.clear(); matched_chars.clear(); @@ -153,6 +206,10 @@ fn path_match_helper<'a>( return Err(Cancelled); } + if !candidate.char_bag.is_superset(query_bag) { + continue; + } + candidate_buf.truncate(path_prefix_len); if root_is_file { candidate_buf.push_str(path_prefix.as_unix_str()); @@ -162,18 +219,36 @@ fn path_match_helper<'a>( let haystack = Utf32Str::new(&candidate_buf, &mut buf); + if source_words.is_some() { + candidate_chars.clear(); + candidate_chars.extend(candidate_buf.chars()); + } + let mut total_score: u32 = 0; + let mut case_mismatches: u32 = 0; let mut all_matched = true; - for atom in atoms { + for (atom_idx, atom) in atoms.iter().enumerate() { atom_matched_chars.clear(); - if let Some(score) = atom.indices(haystack, matcher, &mut atom_matched_chars) { - total_score = total_score.saturating_add(score as u32); - 
matched_chars.extend_from_slice(&atom_matched_chars); - } else { + let Some(score) = atom.indices(haystack, matcher, &mut atom_matched_chars) else { all_matched = false; break; + }; + total_score = total_score.saturating_add(score as u32); + if let Some(source_words) = source_words { + let query_chars = &source_words[atom_idx]; + if query_chars.len() == atom_matched_chars.len() { + for (&query_char, &pos) in query_chars.iter().zip(&atom_matched_chars) { + if let Some(&candidate_char) = candidate_chars.get(pos as usize) + && candidate_char != query_char + && candidate_char.eq_ignore_ascii_case(&query_char) + { + case_mismatches += 1; + } + } + } } + matched_chars.extend_from_slice(&atom_matched_chars); } if all_matched && !atoms.is_empty() { @@ -182,17 +257,9 @@ fn path_match_helper<'a>( let length_penalty = candidate_buf.len() as f64 * LENGTH_PENALTY; let filename_bonus = get_filename_match_bonus(&candidate_buf, atoms, matcher); - let adjusted_score = total_score as f64 + filename_bonus - length_penalty; - let mut positions: Vec = candidate_buf - .char_indices() - .enumerate() - .filter_map(|(char_offset, (byte_offset, _))| { - matched_chars - .contains(&(char_offset as u32)) - .then_some(byte_offset) - }) - .collect(); - positions.sort_unstable(); + let positive = (total_score as f64 + filename_bonus) * case_penalty(case_mismatches); + let adjusted_score = positive - length_penalty; + let positions = positions_from_sorted(&candidate_buf, &matched_chars); results.push(PathMatch { score: adjusted_score, @@ -225,7 +292,7 @@ pub fn match_fixed_path_set( worktree_id: usize, worktree_root_name: Option>, query: &str, - smart_case: bool, + case: Case, max_results: usize, path_style: PathStyle, ) -> Vec { @@ -233,7 +300,9 @@ pub fn match_fixed_path_set( config.set_match_paths(); let mut matcher = matcher::get_matcher(config); - let atoms = make_atoms(query, smart_case); + let atoms = make_atoms(query); + let source_words = make_source_words(query, case); + let query_bag = 
CharBag::from(query); let root_is_file = worktree_root_name.is_some() && candidates.iter().all(|c| c.path.is_empty()); @@ -244,6 +313,8 @@ pub fn match_fixed_path_set( path_match_helper( &mut matcher, &atoms, + source_words.as_deref(), + query_bag, candidates.into_iter(), &mut results, worktree_id, @@ -263,7 +334,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( candidate_sets: &'a [Set], query: &str, relative_to: &Option>, - smart_case: bool, + case: Case, max_results: usize, cancel_flag: &AtomicBool, executor: BackgroundExecutor, @@ -281,7 +352,9 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( query.to_owned() }; - let atoms = make_atoms(&query, smart_case); + let atoms = make_atoms(&query); + let source_words = make_source_words(&query, case); + let query_bag = CharBag::from(query.as_str()); let num_cpus = executor.num_cpus().min(path_count); let segment_size = path_count.div_ceil(num_cpus); @@ -299,6 +372,7 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( .enumerate() { let atoms = atoms.clone(); + let source_words = source_words.clone(); let relative_to = relative_to.clone(); scope.spawn(async move { let segment_start = segment_idx * segment_size; @@ -316,6 +390,8 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>( if path_match_helper( matcher, &atoms, + source_words.as_deref(), + query_bag, candidates, results, candidate_set.id(), diff --git a/crates/project/src/project.rs b/crates/project/src/project.rs index a8bd461d3d94839d5222164ef88d536abc1bcaf4..f45a6632f40f945de9043ab246e891595618d324 100644 --- a/crates/project/src/project.rs +++ b/crates/project/src/project.rs @@ -6439,6 +6439,7 @@ impl<'a> Iterator for PathMatchCandidateSetNucleoIter<'a> { .map(|entry| fuzzy_nucleo::PathMatchCandidate { is_dir: entry.kind.is_dir(), path: &entry.path, + char_bag: entry.char_bag, }) } }