Case Insensitive Unicode Text Search: Fallback To Regex (#28752)

François Mockers created

Closes #9980

Release Notes:

- Fixed case-insensitive text search with Unicode characters

Change summary

crates/project/src/project_tests.rs | 81 +++++++++++++++++++++++++++++++
crates/project/src/search.rs        | 15 +++++
2 files changed, 96 insertions(+)

Detailed changes

crates/project/src/project_tests.rs 🔗

@@ -5425,6 +5425,87 @@ async fn test_search_in_gitignored_dirs(cx: &mut gpui::TestAppContext) {
     );
 }
 
+#[gpui::test]
+async fn test_search_with_unicode(cx: &mut gpui::TestAppContext) {
+    init_test(cx);
+
+    let fs = FakeFs::new(cx.executor());
+    fs.insert_tree(
+        path!("/dir"),
+        json!({
+            "one.rs": "// ПРИВЕТ? привет!",
+            "two.rs": "// ПРИВЕТ.",
+            "three.rs": "// привет",
+        }),
+    )
+    .await;
+    let project = Project::test(fs.clone(), [path!("/dir").as_ref()], cx).await;
+
+    let unicode_case_sensitive_query = SearchQuery::text(
+        "привет",
+        false,
+        true,
+        false,
+        Default::default(),
+        Default::default(),
+        None,
+    );
+    assert_matches!(unicode_case_sensitive_query, Ok(SearchQuery::Text { .. }));
+    assert_eq!(
+        search(&project, unicode_case_sensitive_query.unwrap(), cx)
+            .await
+            .unwrap(),
+        HashMap::from_iter([
+            (separator!("dir/one.rs").to_string(), vec![17..29]),
+            (separator!("dir/three.rs").to_string(), vec![3..15]),
+        ])
+    );
+
+    let unicode_case_insensitive_query = SearchQuery::text(
+        "привет",
+        false,
+        false,
+        false,
+        Default::default(),
+        Default::default(),
+        None,
+    );
+    assert_matches!(
+        unicode_case_insensitive_query,
+        Ok(SearchQuery::Regex { .. })
+    );
+    assert_eq!(
+        search(&project, unicode_case_insensitive_query.unwrap(), cx)
+            .await
+            .unwrap(),
+        HashMap::from_iter([
+            (separator!("dir/one.rs").to_string(), vec![3..15, 17..29]),
+            (separator!("dir/two.rs").to_string(), vec![3..15]),
+            (separator!("dir/three.rs").to_string(), vec![3..15]),
+        ])
+    );
+
+    assert_eq!(
+        search(
+            &project,
+            SearchQuery::text(
+                "привет.",
+                false,
+                false,
+                false,
+                Default::default(),
+                Default::default(),
+                None,
+            )
+            .unwrap(),
+            cx
+        )
+        .await
+        .unwrap(),
+        HashMap::from_iter([(separator!("dir/two.rs").to_string(), vec![3..16]),])
+    );
+}
+
 #[gpui::test]
 async fn test_create_entry(cx: &mut gpui::TestAppContext) {
     init_test(cx);

crates/project/src/search.rs 🔗

@@ -93,6 +93,21 @@ impl SearchQuery {
         buffers: Option<Vec<Entity<Buffer>>>,
     ) -> Result<Self> {
         let query = query.to_string();
+        if !case_sensitive && !query.is_ascii() {
+            // AhoCorasickBuilder doesn't support case-insensitive search with unicode characters
+            // Fallback to regex search as recommended by
+            // https://docs.rs/aho-corasick/1.1/aho_corasick/struct.AhoCorasickBuilder.html#method.ascii_case_insensitive
+            return Self::regex(
+                regex::escape(&query),
+                whole_word,
+                case_sensitive,
+                include_ignored,
+                false,
+                files_to_include,
+                files_to_exclude,
+                buffers,
+            );
+        }
         let search = AhoCorasickBuilder::new()
             .ascii_case_insensitive(!case_sensitive)
             .build([&query])?;