Remove dependency on bm25. Use exact prefix match in settings UI search instead (#49231)

Ben Kunkle and Zed Zippy created

Closes #ISSUE

Before you mark this PR as ready for review, make sure that you have:
- [ ] Added solid test coverage and/or screenshots from manual
testing
- [ ] Done a self-review taking into account security and performance
aspects
- [ ] Aligned any UI changes with the [UI
checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist)

Release Notes:

- N/A *or* Added/Fixed/Improved ...

---------

Co-authored-by: Zed Zippy <234243425+zed-zippy[bot]@users.noreply.github.com>

Change summary

Cargo.lock                            |  49 ------------
crates/settings_ui/Cargo.toml         |   1 
crates/settings_ui/src/settings_ui.rs | 112 +++++++++++++++-------------
3 files changed, 60 insertions(+), 102 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -2186,20 +2186,6 @@ dependencies = [
  "piper",
 ]
 
-[[package]]
-name = "bm25"
-version = "2.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cbd8ffdfb7b4c2ff038726178a780a94f90525ed0ad264c0afaa75dd8c18a64"
-dependencies = [
- "cached",
- "deunicode",
- "fxhash",
- "rust-stemmers",
- "stop-words",
- "unicode-segmentation",
-]
-
 [[package]]
 name = "bon"
 version = "3.8.2"
@@ -4831,12 +4817,6 @@ dependencies = [
  "syn 2.0.106",
 ]
 
-[[package]]
-name = "deunicode"
-version = "1.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04"
-
 [[package]]
 name = "dev_container"
 version = "0.1.0"
@@ -6730,15 +6710,6 @@ dependencies = [
  "thread_local",
 ]
 
-[[package]]
-name = "fxhash"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
-dependencies = [
- "byteorder",
-]
-
 [[package]]
 name = "gemm"
 version = "0.17.1"
@@ -14286,16 +14257,6 @@ dependencies = [
  "walkdir",
 ]
 
-[[package]]
-name = "rust-stemmers"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
-dependencies = [
- "serde",
- "serde_derive",
-]
-
 [[package]]
 name = "rust_decimal"
 version = "1.39.0"
@@ -15222,7 +15183,6 @@ dependencies = [
  "anyhow",
  "assets",
  "audio",
- "bm25",
  "client",
  "codestral",
  "component",
@@ -15986,15 +15946,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
-[[package]]
-name = "stop-words"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645a3d441ccf4bf47f2e4b7681461986681a6eeea9937d4c3bc9febd61d17c71"
-dependencies = [
- "serde_json",
-]
-
 [[package]]
 name = "story"
 version = "0.1.0"

crates/settings_ui/Cargo.toml 🔗

@@ -20,7 +20,6 @@ agent.workspace = true
 agent_settings.workspace = true
 anyhow.workspace = true
 audio.workspace = true
-bm25 = "2.3.2"
 component.workspace = true
 codestral.workspace = true
 copilot.workspace = true

crates/settings_ui/src/settings_ui.rs 🔗

@@ -758,8 +758,13 @@ pub struct SettingsWindow {
     pub(crate) regex_validation_error: Option<String>,
 }
 
+struct SearchDocument {
+    id: usize,
+    words: Vec<String>,
+}
+
 struct SearchIndex {
-    bm25_engine: bm25::SearchEngine<usize>,
+    documents: Vec<SearchDocument>,
     fuzzy_match_candidates: Vec<StringMatchCandidate>,
     key_lut: Vec<SearchKeyLUTEntry>,
 }
@@ -1919,11 +1924,25 @@ impl SettingsWindow {
         let search_index = self.search_index.as_ref().unwrap().clone();
 
         self.search_task = Some(cx.spawn(async move |this, cx| {
-            let bm25_task = cx.background_spawn({
+            let exact_match_task = cx.background_spawn({
                 let search_index = search_index.clone();
-                let max_results = search_index.key_lut.len();
                 let query = query.clone();
-                async move { search_index.bm25_engine.search(&query, max_results) }
+                async move {
+                    let query_lower = query.to_lowercase();
+                    let query_words: Vec<&str> = query_lower.split_whitespace().collect();
+                    search_index
+                        .documents
+                        .iter()
+                        .filter(|doc| {
+                            query_words.iter().any(|query_word| {
+                                doc.words
+                                    .iter()
+                                    .any(|doc_word| doc_word.starts_with(query_word))
+                            })
+                        })
+                        .map(|doc| doc.id)
+                        .collect::<Vec<usize>>()
+                }
             });
             let cancel_flag = std::sync::atomic::AtomicBool::new(false);
             let fuzzy_search_task = fuzzy::match_strings(
@@ -1937,46 +1956,16 @@ impl SettingsWindow {
             );
 
             let fuzzy_matches = fuzzy_search_task.await;
-            // PERF:
-            // If results are slow to appear, we should:
-            // - return to the structure we had previously where we wait on fuzzy matches first (they resolve quickly) with a min match score of 0.3
-            // - wait on bm25 and replace fuzzy matches with bm25 matches
-            // - to deal with lack of fuzzyness with bm25 searches however, we should keep the fuzzy matches around, and merge fuzzy matches with high score (>0.75?) into bm25 results
-            let bm25_matches = bm25_task.await;
+            let exact_matches = exact_match_task.await;
 
             _ = this
                 .update(cx, |this, cx| {
-                    // For tuning the score threshold
-                    // for fuzzy_match in &fuzzy_matches {
-                    //     let SearchItemKey {
-                    //         page_index,
-                    //         header_index,
-                    //         item_index,
-                    //     } = search_index.key_lut[fuzzy_match.candidate_id];
-                    //     let SettingsPageItem::SectionHeader(header) =
-                    //         this.pages[page_index].items[header_index]
-                    //     else {
-                    //         continue;
-                    //     };
-                    //     let SettingsPageItem::SettingItem(SettingItem {
-                    //         title, description, ..
-                    //     }) = this.pages[page_index].items[item_index]
-                    //     else {
-                    //         continue;
-                    //     };
-                    //     let score = fuzzy_match.score;
-                    //     eprint!("# {header} :: QUERY = {query} :: SCORE = {score}\n{title}\n{description}\n\n");
-                    // }
+                    let exact_indices = exact_matches.into_iter();
                     let fuzzy_indices = fuzzy_matches
                         .into_iter()
-                        // MAGIC NUMBER: Was found to have right balance between not too many weird matches, but also
-                        // flexible enough to catch misspellings and <4 letter queries
                         .take_while(|fuzzy_match| fuzzy_match.score >= 0.5)
                         .map(|fuzzy_match| fuzzy_match.candidate_id);
-                    let bm25_indices = bm25_matches
-                        .into_iter()
-                        .map(|bm25_match| bm25_match.document.id);
-                    let merged_indices = bm25_indices.chain(fuzzy_indices);
+                    let merged_indices = exact_indices.chain(fuzzy_indices);
 
                     this.apply_match_indices(merged_indices);
                     cx.notify();
@@ -1997,8 +1986,19 @@ impl SettingsWindow {
     }
 
     fn build_search_index(&mut self) {
+        fn split_into_words(parts: &[&str]) -> Vec<String> {
+            parts
+                .iter()
+                .flat_map(|s| {
+                    s.split(|c: char| !c.is_alphanumeric())
+                        .filter(|w| !w.is_empty())
+                        .map(|w| w.to_lowercase())
+                })
+                .collect()
+        }
+
         let mut key_lut: Vec<SearchKeyLUTEntry> = vec![];
-        let mut documents = Vec::default();
+        let mut documents: Vec<SearchDocument> = Vec::default();
         let mut fuzzy_match_candidates = Vec::default();
 
         fn push_candidates(
@@ -2029,18 +2029,22 @@ impl SettingsWindow {
                             .field
                             .json_path()
                             .map(|path| path.trim_end_matches('$'));
-                        documents.push(bm25::Document {
+                        documents.push(SearchDocument {
                             id: key_index,
-                            contents: [page.title, header_str, item.title, item.description]
-                                .join("\n"),
+                            words: split_into_words(&[
+                                page.title,
+                                header_str,
+                                item.title,
+                                item.description,
+                            ]),
                         });
                         push_candidates(&mut fuzzy_match_candidates, key_index, item.title);
                         push_candidates(&mut fuzzy_match_candidates, key_index, item.description);
                     }
                     SettingsPageItem::SectionHeader(header) => {
-                        documents.push(bm25::Document {
+                        documents.push(SearchDocument {
                             id: key_index,
-                            contents: header.to_string(),
+                            words: split_into_words(&[header]),
                         });
                         push_candidates(&mut fuzzy_match_candidates, key_index, header);
                         header_index = item_index;
@@ -2048,10 +2052,13 @@ impl SettingsWindow {
                     }
                     SettingsPageItem::SubPageLink(sub_page_link) => {
                         json_path = sub_page_link.json_path;
-                        documents.push(bm25::Document {
+                        documents.push(SearchDocument {
                             id: key_index,
-                            contents: [page.title, header_str, sub_page_link.title.as_ref()]
-                                .join("\n"),
+                            words: split_into_words(&[
+                                page.title,
+                                header_str,
+                                sub_page_link.title.as_ref(),
+                            ]),
                         });
                         push_candidates(
                             &mut fuzzy_match_candidates,
@@ -2060,10 +2067,13 @@ impl SettingsWindow {
                         );
                     }
                     SettingsPageItem::ActionLink(action_link) => {
-                        documents.push(bm25::Document {
+                        documents.push(SearchDocument {
                             id: key_index,
-                            contents: [page.title, header_str, action_link.title.as_ref()]
-                                .join("\n"),
+                            words: split_into_words(&[
+                                page.title,
+                                header_str,
+                                action_link.title.as_ref(),
+                            ]),
                         });
                         push_candidates(
                             &mut fuzzy_match_candidates,
@@ -2083,10 +2093,8 @@ impl SettingsWindow {
                 });
             }
         }
-        let engine =
-            bm25::SearchEngineBuilder::with_documents(bm25::Language::English, documents).build();
         self.search_index = Some(Arc::new(SearchIndex {
-            bm25_engine: engine,
+            documents,
             key_lut,
             fuzzy_match_candidates,
         }));