@@ -2186,20 +2186,6 @@ dependencies = [
"piper",
]
-[[package]]
-name = "bm25"
-version = "2.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cbd8ffdfb7b4c2ff038726178a780a94f90525ed0ad264c0afaa75dd8c18a64"
-dependencies = [
- "cached",
- "deunicode",
- "fxhash",
- "rust-stemmers",
- "stop-words",
- "unicode-segmentation",
-]
-
[[package]]
name = "bon"
version = "3.8.2"
@@ -4831,12 +4817,6 @@ dependencies = [
"syn 2.0.106",
]
-[[package]]
-name = "deunicode"
-version = "1.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04"
-
[[package]]
name = "dev_container"
version = "0.1.0"
@@ -6730,15 +6710,6 @@ dependencies = [
"thread_local",
]
-[[package]]
-name = "fxhash"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
-dependencies = [
- "byteorder",
-]
-
[[package]]
name = "gemm"
version = "0.17.1"
@@ -14286,16 +14257,6 @@ dependencies = [
"walkdir",
]
-[[package]]
-name = "rust-stemmers"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
-dependencies = [
- "serde",
- "serde_derive",
-]
-
[[package]]
name = "rust_decimal"
version = "1.39.0"
@@ -15222,7 +15183,6 @@ dependencies = [
"anyhow",
"assets",
"audio",
- "bm25",
"client",
"codestral",
"component",
@@ -15986,15 +15946,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
-[[package]]
-name = "stop-words"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645a3d441ccf4bf47f2e4b7681461986681a6eeea9937d4c3bc9febd61d17c71"
-dependencies = [
- "serde_json",
-]
-
[[package]]
name = "story"
version = "0.1.0"
@@ -758,8 +758,13 @@ pub struct SettingsWindow {
pub(crate) regex_validation_error: Option<String>,
}
+struct SearchDocument {
+ id: usize,
+ words: Vec<String>,
+}
+
struct SearchIndex {
- bm25_engine: bm25::SearchEngine<usize>,
+ documents: Vec<SearchDocument>,
fuzzy_match_candidates: Vec<StringMatchCandidate>,
key_lut: Vec<SearchKeyLUTEntry>,
}
@@ -1919,11 +1924,25 @@ impl SettingsWindow {
let search_index = self.search_index.as_ref().unwrap().clone();
self.search_task = Some(cx.spawn(async move |this, cx| {
- let bm25_task = cx.background_spawn({
+ let exact_match_task = cx.background_spawn({
let search_index = search_index.clone();
- let max_results = search_index.key_lut.len();
let query = query.clone();
- async move { search_index.bm25_engine.search(&query, max_results) }
+ async move {
+ let query_lower = query.to_lowercase();
+ let query_words: Vec<&str> = query_lower.split_whitespace().collect();
+ search_index
+ .documents
+ .iter()
+ .filter(|doc| {
+ query_words.iter().any(|query_word| {
+ doc.words
+ .iter()
+ .any(|doc_word| doc_word.starts_with(query_word))
+ })
+ })
+ .map(|doc| doc.id)
+ .collect::<Vec<usize>>()
+ }
});
let cancel_flag = std::sync::atomic::AtomicBool::new(false);
let fuzzy_search_task = fuzzy::match_strings(
@@ -1937,46 +1956,16 @@ impl SettingsWindow {
);
let fuzzy_matches = fuzzy_search_task.await;
- // PERF:
- // If results are slow to appear, we should:
- // - return to the structure we had previously where we wait on fuzzy matches first (they resolve quickly) with a min match score of 0.3
- // - wait on bm25 and replace fuzzy matches with bm25 matches
- // - to deal with lack of fuzzyness with bm25 searches however, we should keep the fuzzy matches around, and merge fuzzy matches with high score (>0.75?) into bm25 results
- let bm25_matches = bm25_task.await;
+ let exact_matches = exact_match_task.await;
_ = this
.update(cx, |this, cx| {
- // For tuning the score threshold
- // for fuzzy_match in &fuzzy_matches {
- // let SearchItemKey {
- // page_index,
- // header_index,
- // item_index,
- // } = search_index.key_lut[fuzzy_match.candidate_id];
- // let SettingsPageItem::SectionHeader(header) =
- // this.pages[page_index].items[header_index]
- // else {
- // continue;
- // };
- // let SettingsPageItem::SettingItem(SettingItem {
- // title, description, ..
- // }) = this.pages[page_index].items[item_index]
- // else {
- // continue;
- // };
- // let score = fuzzy_match.score;
- // eprint!("# {header} :: QUERY = {query} :: SCORE = {score}\n{title}\n{description}\n\n");
- // }
+ let exact_indices = exact_matches.into_iter();
let fuzzy_indices = fuzzy_matches
.into_iter()
- // MAGIC NUMBER: Was found to have right balance between not too many weird matches, but also
- // flexible enough to catch misspellings and <4 letter queries
.take_while(|fuzzy_match| fuzzy_match.score >= 0.5)
.map(|fuzzy_match| fuzzy_match.candidate_id);
- let bm25_indices = bm25_matches
- .into_iter()
- .map(|bm25_match| bm25_match.document.id);
- let merged_indices = bm25_indices.chain(fuzzy_indices);
+ let merged_indices = exact_indices.chain(fuzzy_indices);
this.apply_match_indices(merged_indices);
cx.notify();
@@ -1997,8 +1986,19 @@ impl SettingsWindow {
}
fn build_search_index(&mut self) {
+ fn split_into_words(parts: &[&str]) -> Vec<String> {
+ parts
+ .iter()
+ .flat_map(|s| {
+ s.split(|c: char| !c.is_alphanumeric())
+ .filter(|w| !w.is_empty())
+ .map(|w| w.to_lowercase())
+ })
+ .collect()
+ }
+
let mut key_lut: Vec<SearchKeyLUTEntry> = vec![];
- let mut documents = Vec::default();
+ let mut documents: Vec<SearchDocument> = Vec::default();
let mut fuzzy_match_candidates = Vec::default();
fn push_candidates(
@@ -2029,18 +2029,22 @@ impl SettingsWindow {
.field
.json_path()
.map(|path| path.trim_end_matches('$'));
- documents.push(bm25::Document {
+ documents.push(SearchDocument {
id: key_index,
- contents: [page.title, header_str, item.title, item.description]
- .join("\n"),
+ words: split_into_words(&[
+ page.title,
+ header_str,
+ item.title,
+ item.description,
+ ]),
});
push_candidates(&mut fuzzy_match_candidates, key_index, item.title);
push_candidates(&mut fuzzy_match_candidates, key_index, item.description);
}
SettingsPageItem::SectionHeader(header) => {
- documents.push(bm25::Document {
+ documents.push(SearchDocument {
id: key_index,
- contents: header.to_string(),
+ words: split_into_words(&[header]),
});
push_candidates(&mut fuzzy_match_candidates, key_index, header);
header_index = item_index;
@@ -2048,10 +2052,13 @@ impl SettingsWindow {
}
SettingsPageItem::SubPageLink(sub_page_link) => {
json_path = sub_page_link.json_path;
- documents.push(bm25::Document {
+ documents.push(SearchDocument {
id: key_index,
- contents: [page.title, header_str, sub_page_link.title.as_ref()]
- .join("\n"),
+ words: split_into_words(&[
+ page.title,
+ header_str,
+ sub_page_link.title.as_ref(),
+ ]),
});
push_candidates(
&mut fuzzy_match_candidates,
@@ -2060,10 +2067,13 @@ impl SettingsWindow {
);
}
SettingsPageItem::ActionLink(action_link) => {
- documents.push(bm25::Document {
+ documents.push(SearchDocument {
id: key_index,
- contents: [page.title, header_str, action_link.title.as_ref()]
- .join("\n"),
+ words: split_into_words(&[
+ page.title,
+ header_str,
+ action_link.title.as_ref(),
+ ]),
});
push_candidates(
&mut fuzzy_match_candidates,
@@ -2083,10 +2093,8 @@ impl SettingsWindow {
});
}
}
- let engine =
- bm25::SearchEngineBuilder::with_documents(bm25::Language::English, documents).build();
self.search_index = Some(Arc::new(SearchIndex {
- bm25_engine: engine,
+ documents,
key_lut,
fuzzy_match_candidates,
}));