Semantic search v2 (#2789)

Kyle Caverly created

Move semantic search from the navigation modal to a project search option.
This PR is intended to be released in Preview only, and requires an
opt-in semantic_index setting to enable. Without this opt-in setting
enabled, the user should perceive no differences from the previous
project search.

Release Notes: (Preview-only)

- Added Semantic Search as an opt-in feature within Project Search
- Show indexing progress feedback within the project search view

Change summary

Cargo.lock                                           |  370 +--
Cargo.toml                                           |    4 
assets/settings/default.json                         |    4 
crates/language/src/language.rs                      |   30 
crates/search/Cargo.toml                             |    1 
crates/search/src/project_search.rs                  |  249 ++
crates/search/src/search.rs                          |    4 
crates/semantic_index/Cargo.toml                     |   19 
crates/semantic_index/README.md                      |    0 
crates/semantic_index/src/db.rs                      |  221 +
crates/semantic_index/src/embedding.rs               |   34 
crates/semantic_index/src/parsing.rs                 |  299 +++
crates/semantic_index/src/semantic_index.rs          |  777 +++++++++
crates/semantic_index/src/semantic_index_settings.rs |   10 
crates/semantic_index/src/semantic_index_tests.rs    | 1142 ++++++++++++++
crates/vector_store/src/modal.rs                     |  172 --
crates/vector_store/src/parsing.rs                   |  115 -
crates/vector_store/src/vector_store.rs              |  770 ---------
crates/vector_store/src/vector_store_tests.rs        |  161 -
crates/zed/Cargo.toml                                |    2 
crates/zed/src/languages/c/embedding.scm             |   43 
crates/zed/src/languages/cpp/embedding.scm           |   61 
crates/zed/src/languages/elixir/embedding.scm        |   27 
crates/zed/src/languages/go/embedding.scm            |   24 
crates/zed/src/languages/javascript/embedding.scm    |  117 
crates/zed/src/languages/json/embedding.scm          |   14 
crates/zed/src/languages/rust/config.toml            |    1 
crates/zed/src/languages/rust/embedding.scm          |   54 
crates/zed/src/languages/tsx/embedding.scm           |  110 
crates/zed/src/languages/typescript/embedding.scm    |  132 
crates/zed/src/main.rs                               |    2 
31 files changed, 3,276 insertions(+), 1,693 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -36,11 +36,11 @@ dependencies = [
 
 [[package]]
 name = "addr2line"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
+checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
 dependencies = [
- "gimli 0.27.2",
+ "gimli 0.27.3",
 ]
 
 [[package]]
@@ -61,7 +61,7 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
 dependencies = [
- "getrandom 0.2.9",
+ "getrandom 0.2.10",
  "once_cell",
  "version_check",
 ]
@@ -88,9 +88,9 @@ dependencies = [
 
 [[package]]
 name = "aho-corasick"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
+checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
 dependencies = [
  "memchr",
 ]
@@ -118,7 +118,7 @@ dependencies = [
  "settings",
  "smol",
  "theme",
- "tiktoken-rs 0.4.2",
+ "tiktoken-rs 0.4.5",
  "util",
  "workspace",
 ]
@@ -151,7 +151,7 @@ dependencies = [
  "alacritty_config",
  "alacritty_config_derive",
  "base64 0.13.1",
- "bitflags",
+ "bitflags 1.3.2",
  "dirs 4.0.0",
  "libc",
  "log",
@@ -177,6 +177,12 @@ version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd"
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "alsa"
 version = "0.7.0"
@@ -184,7 +190,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8512c9117059663fb5606788fbca3619e2a91dac0e3fe516242eab1fa6be5e44"
 dependencies = [
  "alsa-sys",
- "bitflags",
+ "bitflags 1.3.2",
  "libc",
  "nix",
 ]
@@ -205,6 +211,12 @@ version = "0.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec8ad6edb4840b78c5c3d88de606b22252d552b55f3a4699fbb10fc070ec3049"
 
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -225,7 +237,7 @@ dependencies = [
  "anstyle-query",
  "anstyle-wincon",
  "colorchoice",
- "is-terminal 0.4.7",
+ "is-terminal 0.4.9",
  "utf8parse",
 ]
 
@@ -250,7 +262,7 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -260,14 +272,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
 dependencies = [
  "anstyle",
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
 name = "anyhow"
-version = "1.0.71"
+version = "1.0.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
+checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854"
 
 [[package]]
 name = "arrayref"
@@ -283,9 +295,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
 
 [[package]]
 name = "arrayvec"
-version = "0.7.2"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
+checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 
 [[package]]
 name = "ascii"
@@ -306,9 +318,9 @@ dependencies = [
 
 [[package]]
 name = "async-channel"
-version = "1.8.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf46fee83e5ccffc220104713af3292ff9bc7c64c7de289f66dae8e38d826833"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
 dependencies = [
  "concurrent-queue",
  "event-listener",
@@ -324,7 +336,7 @@ dependencies = [
  "futures-core",
  "futures-io",
  "once_cell",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
  "tokio",
 ]
 
@@ -338,7 +350,7 @@ dependencies = [
  "futures-core",
  "futures-io",
  "memchr",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
 ]
 
 [[package]]
@@ -350,7 +362,7 @@ dependencies = [
  "async-lock",
  "async-task",
  "concurrent-queue",
- "fastrand",
+ "fastrand 1.9.0",
  "futures-lite",
  "slab",
 ]
@@ -362,7 +374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "279cf904654eeebfa37ac9bb1598880884924aab82e290aa65c9e77a0e142e06"
 dependencies = [
  "async-lock",
- "autocfg 1.1.0",
+ "autocfg",
  "blocking",
  "futures-lite",
 ]
@@ -389,14 +401,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af"
 dependencies = [
  "async-lock",
- "autocfg 1.1.0",
+ "autocfg",
  "cfg-if 1.0.0",
  "concurrent-queue",
  "futures-lite",
  "log",
  "parking",
  "polling",
- "rustix 0.37.19",
+ "rustix 0.37.23",
  "slab",
  "socket2",
  "waker-fn",
@@ -418,7 +430,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4051e67316bc7eff608fe723df5d32ed639946adcd69e07df41fd42a7b411f1f"
 dependencies = [
  "async-io",
- "autocfg 1.1.0",
+ "autocfg",
  "blocking",
  "futures-lite",
 ]
@@ -440,14 +452,14 @@ checksum = "7a9d28b1d97e08915212e2e45310d47854eafa69600756fc735fb788f75199c9"
 dependencies = [
  "async-io",
  "async-lock",
- "autocfg 1.1.0",
+ "autocfg",
  "blocking",
  "cfg-if 1.0.0",
  "event-listener",
  "futures-lite",
- "rustix 0.37.19",
+ "rustix 0.37.23",
  "signal-hook",
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -469,7 +481,7 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.18",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -482,7 +494,7 @@ dependencies = [
  "async-global-executor",
  "async-io",
  "async-lock",
- "crossbeam-utils 0.8.15",
+ "crossbeam-utils",
  "futures-channel",
  "futures-core",
  "futures-io",
@@ -492,7 +504,7 @@ dependencies = [
  "log",
  "memchr",
  "once_cell",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
  "pin-utils",
  "slab",
  "wasm-bindgen-futures",
@@ -506,7 +518,7 @@ checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51"
 dependencies = [
  "async-stream-impl",
  "futures-core",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
 ]
 
 [[package]]
@@ -517,7 +529,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.18",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -554,13 +566,13 @@ dependencies = [
 
 [[package]]
 name = "async-trait"
-version = "0.1.68"
+version = "0.1.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
+checksum = "cc6dde6e4ed435a4c1ee4e73592f5ba9da2151af10076cc04858746af9352d09"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.18",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -573,7 +585,7 @@ dependencies = [
  "futures-io",
  "futures-util",
  "log",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
  "tungstenite 0.16.0",
 ]
 
@@ -588,12 +600,9 @@ dependencies = [
 
 [[package]]
 name = "atomic"
-version = "0.5.1"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b88d82667eca772c4aa12f0f1348b3ae643424c8876448f3f7bd5787032e234c"
-dependencies = [
- "autocfg 1.1.0",
-]
+checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
 
 [[package]]
 name = "atomic-waker"
@@ -649,15 +658,6 @@ dependencies = [
  "workspace",
 ]
 
-[[package]]
-name = "autocfg"
-version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78"
-dependencies = [
- "autocfg 1.1.0",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -673,19 +673,19 @@ dependencies = [
  "async-trait",
  "axum-core",
  "base64 0.13.1",
- "bitflags",
+ "bitflags 1.3.2",
  "bytes 1.4.0",
  "futures-util",
  "headers",
  "http",
  "http-body",
  "hyper",
- "itoa 1.0.6",
+ "itoa 1.0.9",
  "matchit",
  "memchr",
  "mime",
  "percent-encoding",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
  "serde",
  "serde_json",
  "serde_urlencoded",
@@ -726,7 +726,7 @@ dependencies = [
  "futures-util",
  "http",
  "mime",
- "pin-project-lite 0.2.9",
+ "pin-project-lite 0.2.10",
  "serde",
  "serde_json",
  "tokio",
@@ -738,16 +738,16 @@ dependencies = [
 
 [[package]]
 name = "backtrace"
-version = "0.3.67"
+version = "0.3.68"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
+checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
 dependencies = [
- "addr2line 0.19.0",
+ "addr2line 0.20.0",
  "cc",
  "cfg-if 1.0.0",
  "libc",
- "miniz_oxide 0.6.2",
- "object 0.30.3",
+ "miniz_oxide 0.7.1",
+ "object 0.31.1",
  "rustc-demangle",
 ]
 
@@ -797,7 +797,7 @@ version = "0.64.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "cexpr",
  "clang-sys",
  "lazy_static",
@@ -817,7 +817,7 @@ version = "0.65.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "cexpr",
  "clang-sys",
  "lazy_static",
@@ -830,7 +830,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.18",
+ "syn 2.0.27",
  "which",
 ]
 
@@ -855,6 +855,24 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "bitflags"
+version = "2.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
+
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
 [[package]]
 name = "block"
 version = "0.1.6"
@@ -889,7 +907,7 @@ dependencies = [
  "async-lock",
  "async-task",
  "atomic-waker",
- "fastrand",
+ "fastrand 1.9.0",
  "futures-lite",
  "log",
 ]
@@ -980,15 +998,15 @@ dependencies = [
 
 [[package]]
 name = "bumpalo"
-version = "3.12.2"
+version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c6ed94e98ecff0c12dd1b04c15ec0d7d9458ca8fe806cea6f12954efe74c63b"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
 [[package]]
 name = "bytecheck"
-version = "0.6.10"
+version = "0.6.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13fe11640a23eb24562225322cd3e452b93a3d4091d62fab69c70542fcd17d1f"
+checksum = "8b6372023ac861f6e6dc89c8344a8f398fb42aaba2b5dbc649ca0c0e9dbcb627"
 dependencies = [
  "bytecheck_derive",
  "ptr_meta",
@@ -997,9 +1015,9 @@ dependencies = [
 
 [[package]]
 name = "bytecheck_derive"
-version = "0.6.10"
+version = "0.6.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e31225543cb46f81a7e224762764f4a6a0f097b1db0b175f69e8065efaa42de5"
+checksum = "a7ec4c6f261935ad534c0c22dbef2201b45918860eb1c574b972bd213a76af61"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1170,13 +1188,13 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "chrono"
-version = "0.4.24"
+version = "0.4.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
 dependencies = [
+ "android-tzdata",
  "iana-time-zone",
  "js-sys",
- "num-integer",
  "num-traits",
  "serde",
  "time 0.1.45",
@@ -1207,7 +1225,7 @@ checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
 dependencies = [
  "glob",
  "libc",
- "libloading",
+ "libloading 0.7.4",
 ]
 
 [[package]]
@@ -1217,7 +1235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
 dependencies = [
  "atty",
- "bitflags",
+ "bitflags 1.3.2",
  "clap_derive 3.2.25",
  "clap_lex 0.2.4",
  "indexmap 1.9.3",
@@ -1229,24 +1247,23 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.3.5"
+version = "4.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2686c4115cb0810d9a984776e197823d08ec94f176549a89a9efded477c456dc"
+checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d"
 dependencies = [
  "clap_builder",
- "clap_derive 4.3.2",
+ "clap_derive 4.3.12",
  "once_cell",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.3.5"
+version = "4.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae"
+checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1"
 dependencies = [
  "anstream",
  "anstyle",
- "bitflags",
  "clap_lex 0.5.0",
  "strsim",
 ]
@@ -1266,14 +1283,14 @@ dependencies = [
 
 [[package]]
 name = "clap_derive"
-version = "4.3.2"
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f"
+checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050"
 dependencies = [
  "heck 0.4.1",
  "proc-macro2",
  "quote",
- "syn 2.0.18",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -1340,11 +1357,11 @@ dependencies = [
  "sum_tree",
  "tempfile",
  "thiserror",
- "time 0.3.21",
+ "time 0.3.23",
  "tiny_http",
  "url",
  "util",
- "uuid 1.3.2",
+ "uuid 1.4.1",
 ]
 
 [[package]]
@@ -1368,7 +1385,7 @@ name = "cocoa"
 version = "0.24.0"
 source = "git+https://github.com/servo/core-foundation-rs?rev=079665882507dd5e2ff77db3de5070c1f6c0fb85#079665882507dd5e2ff77db3de5070c1f6c0fb85"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "block",
  "cocoa-foundation",
  "core-foundation",
@@ -1383,7 +1400,7 @@ name = "cocoa-foundation"
 version = "0.1.1"
 source = "git+https://github.com/servo/core-foundation-rs?rev=079665882507dd5e2ff77db3de5070c1f6c0fb85#079665882507dd5e2ff77db3de5070c1f6c0fb85"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "block",
  "core-foundation",
  "core-graphics-types",
@@ -1392,16 +1409,6 @@ dependencies = [
  "objc",
 ]
 
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
-]
-
 [[package]]
 name = "collab"
 version = "0.16.0"
@@ -1452,7 +1459,7 @@ dependencies = [
  "sha-1 0.9.8",
  "sqlx",
  "theme",
- "time 0.3.21",
+ "time 0.3.23",
  "tokio",
  "tokio-tungstenite",
  "toml",
@@ -1554,7 +1561,7 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c"
 dependencies = [
- "crossbeam-utils 0.8.15",
+ "crossbeam-utils",
 ]
 
 [[package]]
@@ -1645,7 +1652,7 @@ name = "core-graphics"
 version = "0.22.3"
 source = "git+https://github.com/servo/core-foundation-rs?rev=079665882507dd5e2ff77db3de5070c1f6c0fb85#079665882507dd5e2ff77db3de5070c1f6c0fb85"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "core-foundation",
  "core-graphics-types",
  "foreign-types",
@@ -1657,7 +1664,7 @@ name = "core-graphics-types"
 version = "0.1.1"
 source = "git+https://github.com/servo/core-foundation-rs?rev=079665882507dd5e2ff77db3de5070c1f6c0fb85#079665882507dd5e2ff77db3de5070c1f6c0fb85"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "core-foundation",
  "foreign-types",
  "libc",
@@ -1690,7 +1697,7 @@ version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb17e2d1795b1996419648915df94bc7103c28f7b48062d7acf4652fc371b2ff"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "core-foundation-sys 0.6.2",
  "coreaudio-sys",
 ]
@@ -1740,9 +1747,9 @@ dependencies = [
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
  "libc",
 ]
@@ -1867,16 +1874,6 @@ dependencies = [
  "cfg-if 1.0.0",
 ]
 
-[[package]]
-name = "crossbeam-channel"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b153fe7cbef478c567df0f972e02e6d736db11affe43dfc9c56a9374d1adfb87"
-dependencies = [
- "crossbeam-utils 0.7.2",
- "maybe-uninit",
-]
-
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.8"
@@ -1884,7 +1881,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
 dependencies = [
  "cfg-if 1.0.0",
- "crossbeam-utils 0.8.15",
+ "crossbeam-utils",
 ]
 
 [[package]]
@@ -1895,19 +1892,19 @@ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
 dependencies = [
  "cfg-if 1.0.0",
  "crossbeam-epoch",
- "crossbeam-utils 0.8.15",
+ "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.14"
+version = "0.9.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
+checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
 dependencies = [
- "autocfg 1.1.0",
+ "autocfg",
  "cfg-if 1.0.0",
- "crossbeam-utils 0.8.15",
- "memoffset 0.8.0",
+ "crossbeam-utils",
+ "memoffset 0.9.0",
  "scopeguard",
 ]
 
@@ -1918,25 +1915,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
 dependencies = [
  "cfg-if 1.0.0",
- "crossbeam-utils 0.8.15",
+ "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-utils"
-version = "0.7.2"
+version = "0.8.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8"
-dependencies = [
- "autocfg 1.1.0",
- "cfg-if 0.1.10",
- "lazy_static",
-]
-
-[[package]]
-name = "crossbeam-utils"
-version = "0.8.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
+checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
 dependencies = [
  "cfg-if 1.0.0",
 ]
@@ -1988,9 +1974,9 @@ dependencies = [
 
 [[package]]
 name = "curl-sys"
-version = "0.4.61+curl-8.0.1"
+version = "0.4.64+curl-8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14d05c10f541ae6f3bc5b3d923c20001f47db7d5f0b2bc6ad16490133842db79"
+checksum = "f96069f0b1cb1241c838740659a771ef143363f52772a9ce1bd9c04c75eee0dc"
 dependencies = [
  "cc",
  "libc",
@@ -2001,61 +1987,17 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
-[[package]]
-name = "cxx"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93"
-dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
-]
-
-[[package]]
-name = "cxx-build"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn 2.0.18",
-]
-
-[[package]]
-name = "cxxbridge-flags"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb"
-
-[[package]]
-name = "cxxbridge-macro"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.18",
-]
-
 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
  "cfg-if 1.0.0",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
  "lock_api",
  "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]
 
 [[package]]
@@ -2160,9 +2102,9 @@ dependencies = [
 
 [[package]]
 name = "digest"
-version = "0.10.6"
+version = "0.10.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
  "block-buffer 0.10.4",
  "crypto-common",
@@ -2231,11 +2173,11 @@ dependencies = [
 
 [[package]]
 name = "dlib"
-version = "0.5.0"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac1b7517328c04c2aa68422fc60a41b92208182142ed04a25879c26c8f878794"
+checksum = "330c60081dcc4c72131f8eb70510f1ac07223e5d4163db481a04a0befcffa412"
 dependencies = [
- "libloading",
+ "libloading 0.8.0",
 ]
 
 [[package]]
@@ -2266,9 +2208,9 @@ dependencies = [
 
 [[package]]
 name = "dyn-clone"
-version = "1.0.11"
+version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30"
+checksum = "304e6508efa593091e97a9abbc10f90aa7ca635b6d2784feff3c89d41dd12272"
 
 [[package]]
 name = "editor"
@@ -2316,7 +2258,7 @@ dependencies = [
  "tree-sitter",
  "tree-sitter-html",
  "tree-sitter-rust",
- "tree-sitter-typescript",
+ "tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)",
  "unindent",
  "util",
  "workspace",
@@ -2324,9 +2266,9 @@ dependencies = [
 
 [[package]]
 name = "either"
-version = "1.8.1"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
 
 [[package]]
 name = "encoding_rs"
@@ -2357,7 +2299,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
 dependencies = [
  "humantime",
- "is-terminal 0.4.7",
+ "is-terminal 0.4.9",
  "log",
  "regex",
  "termcolor",
@@ -2374,15 +2316,15 @@ dependencies = [
 
 [[package]]
 name = "equivalent"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88bffebc5d80432c9b140ee17875ff173a8ab62faad5b257da912bd2f6c1c0a1"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "erased-serde"
-version = "0.3.25"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f2b0c2380453a92ea8b6c8e5f64ecaafccddde8ceab55ff7a8ac1029f894569"
+checksum = "da96524cc884f6558f1769b6c46686af2fe8e8b4cd253bd5a3cdba8181b8e070"
 dependencies = [
  "serde",
 ]
@@ -2406,7 +2348,7 @@ checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
 dependencies = [
  "errno-dragonfly",
  "libc",
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -2421,9 +2363,9 @@ dependencies = [
 
 [[package]]
 name = "etagere"
-version = "0.2.7"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6301151a318f367f392c31395beb1cfba5ccd9abc44d1db0db3a4b27b9601c89"
+checksum = "fcf22f748754352918e082e0039335ee92454a5d62bcaf69b5e8daf5907d9644"
 dependencies = [
  "euclid",
  "svg_fmt",
@@ -2475,6 +2417,12 @@ dependencies = [
  "instant",
 ]
 
+[[package]]
+name = "fastrand"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
+
 [[package]]
 name = "feedback"
 version = "0.1.0"
@@ -2546,7 +2494,7 @@ dependencies = [
  "cfg-if 1.0.0",
  "libc",
  "redox_syscall 0.2.16",
- "windows-sys 0.48.0",
+ "windows-sys",
 ]
 
 [[package]]
@@ -2600,7 +2548,7 @@ name = "font-kit"
 version = "0.11.0"
 source = "git+https://github.com/zed-industries/font-kit?rev=b2f77d56f450338aa4f7dd2f0197d8c9acb0cf18#b2f77d56f450338aa4f7dd2f0197d8c9acb0cf18"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "byteorder",
  "core-foundation",
  "core-graphics",
@@ -2647,9 +2595,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
 
 [[package]]
 name = "form_urlencoded"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8"
+checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652"
 dependencies = [
  "percent-encoding",
 ]
@@ -2700,7 +2648,7 @@ dependencies = [
  "smol",
  "sum_tree",
  "tempfile",
- "time 0.3.21",
+ "time 0.3.23",
  "util",
 ]
 
@@ -2719,7 +2667,7 @@ dependencies = [
 name = "fsevent"
 version = "2.0.2"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "fsevent-sys",
  "parking_lot 0.11.2",
  "tempdir",
@@ -2746,7 +2694,7 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "fuchsia-zircon-sys",
 ]
 

Cargo.toml 🔗

@@ -63,7 +63,7 @@ members = [
     "crates/theme",
     "crates/theme_selector",
     "crates/util",
-    "crates/vector_store",
+    "crates/semantic_index",
     "crates/vim",
     "crates/vcs_menu",
     "crates/workspace",
@@ -133,7 +133,7 @@ tree-sitter-yaml = { git = "https://github.com/zed-industries/tree-sitter-yaml",
 tree-sitter-lua = "0.0.14"
 
 [patch.crates-io]
-tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "49226023693107fba9a1191136a4f47f38cdca73" }
+tree-sitter = { git = "https://github.com/tree-sitter/tree-sitter", rev = "1c65ca24bc9a734ab70115188f465e12eecf224e" }
 async-task = { git = "https://github.com/zed-industries/async-task", rev = "341b57d6de98cdfd7b418567b8de2022ca993a6e" }
 
 # TODO - Remove when a version is released with this PR: https://github.com/servo/core-foundation-rs/pull/457

assets/settings/default.json 🔗

@@ -324,8 +324,8 @@
     // the terminal will default to matching the buffer's font family.
     // "font_family": "Zed Mono"
   },
-  // Difference settings for vector_store
-  "vector_store": {
+  // Difference settings for semantic_index
+  "semantic_index": {
     "enabled": false,
     "reindexing_delay_seconds": 600
   },

crates/language/src/language.rs 🔗

@@ -339,6 +339,8 @@ pub struct LanguageConfig {
     #[serde(default)]
     pub line_comment: Option<Arc<str>>,
     #[serde(default)]
+    pub collapsed_placeholder: String,
+    #[serde(default)]
     pub block_comment: Option<(Arc<str>, Arc<str>)>,
     #[serde(default)]
     pub overrides: HashMap<String, LanguageConfigOverride>,
@@ -408,6 +410,7 @@ impl Default for LanguageConfig {
             line_comment: Default::default(),
             block_comment: Default::default(),
             overrides: Default::default(),
+            collapsed_placeholder: Default::default(),
         }
     }
 }
@@ -523,9 +526,10 @@ pub struct OutlineConfig {
 pub struct EmbeddingConfig {
     pub query: Query,
     pub item_capture_ix: u32,
-    pub name_capture_ix: u32,
+    pub name_capture_ix: Option<u32>,
     pub context_capture_ix: Option<u32>,
-    pub extra_context_capture_ix: Option<u32>,
+    pub collapse_capture_ix: Option<u32>,
+    pub keep_capture_ix: Option<u32>,
 }
 
 struct InjectionConfig {
@@ -1247,23 +1251,26 @@ impl Language {
         let mut item_capture_ix = None;
         let mut name_capture_ix = None;
         let mut context_capture_ix = None;
-        let mut extra_context_capture_ix = None;
+        let mut collapse_capture_ix = None;
+        let mut keep_capture_ix = None;
         get_capture_indices(
             &query,
             &mut [
                 ("item", &mut item_capture_ix),
                 ("name", &mut name_capture_ix),
                 ("context", &mut context_capture_ix),
-                ("context.extra", &mut extra_context_capture_ix),
+                ("keep", &mut keep_capture_ix),
+                ("collapse", &mut collapse_capture_ix),
             ],
         );
-        if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) {
+        if let Some(item_capture_ix) = item_capture_ix {
             grammar.embedding_config = Some(EmbeddingConfig {
                 query,
                 item_capture_ix,
                 name_capture_ix,
                 context_capture_ix,
-                extra_context_capture_ix,
+                collapse_capture_ix,
+                keep_capture_ix,
             });
         }
         Ok(self)
@@ -1548,9 +1555,20 @@ impl Language {
     pub fn grammar(&self) -> Option<&Arc<Grammar>> {
         self.grammar.as_ref()
     }
+
+    pub fn default_scope(self: &Arc<Self>) -> LanguageScope {
+        LanguageScope {
+            language: self.clone(),
+            override_id: None,
+        }
+    }
 }
 
 impl LanguageScope {
+    pub fn collapsed_placeholder(&self) -> &str {
+        self.language.config.collapsed_placeholder.as_ref()
+    }
+
     pub fn line_comment_prefix(&self) -> Option<&Arc<str>> {
         Override::as_option(
             self.config_override().map(|o| &o.line_comment),

crates/search/Cargo.toml 🔗

@@ -20,6 +20,7 @@ settings = { path = "../settings" }
 theme = { path = "../theme" }
 util = { path = "../util" }
 workspace = { path = "../workspace" }
+semantic_index = { path = "../semantic_index" }
 anyhow.workspace = true
 futures.workspace = true
 log.workspace = true

crates/search/src/project_search.rs 🔗

@@ -18,7 +18,9 @@ use gpui::{
     Task, View, ViewContext, ViewHandle, WeakModelHandle, WeakViewHandle,
 };
 use menu::Confirm;
+use postage::stream::Stream;
 use project::{search::SearchQuery, Entry, Project};
+use semantic_index::SemanticIndex;
 use smallvec::SmallVec;
 use std::{
     any::{Any, TypeId},
@@ -36,7 +38,10 @@ use workspace::{
     ItemNavHistory, Pane, ToolbarItemLocation, ToolbarItemView, Workspace, WorkspaceId,
 };
 
-actions!(project_search, [SearchInNew, ToggleFocus, NextField]);
+actions!(
+    project_search,
+    [SearchInNew, ToggleFocus, NextField, ToggleSemanticSearch]
+);
 
 #[derive(Default)]
 struct ActiveSearches(HashMap<WeakModelHandle<Project>, WeakViewHandle<ProjectSearchView>>);
@@ -89,6 +94,7 @@ pub struct ProjectSearchView {
     model: ModelHandle<ProjectSearch>,
     query_editor: ViewHandle<Editor>,
     results_editor: ViewHandle<Editor>,
+    semantic: Option<SemanticSearchState>,
     search_options: SearchOptions,
     panels_with_errors: HashSet<InputPanel>,
     active_match_index: Option<usize>,
@@ -98,6 +104,12 @@ pub struct ProjectSearchView {
     excluded_files_editor: ViewHandle<Editor>,
 }
 
+struct SemanticSearchState {
+    file_count: usize,
+    outstanding_file_count: usize,
+    _progress_task: Task<()>,
+}
+
 pub struct ProjectSearchBar {
     active_project_search: Option<ViewHandle<ProjectSearchView>>,
     subscription: Option<Subscription>,
@@ -172,6 +184,63 @@ impl ProjectSearch {
         }));
         cx.notify();
     }
+
+    fn semantic_search(
+        &mut self,
+        query: String,
+        include_files: Vec<GlobMatcher>,
+        exclude_files: Vec<GlobMatcher>,
+        cx: &mut ModelContext<Self>,
+    ) {
+        let search = SemanticIndex::global(cx).map(|index| {
+            index.update(cx, |semantic_index, cx| {
+                semantic_index.search_project(
+                    self.project.clone(),
+                    query.clone(),
+                    10,
+                    include_files,
+                    exclude_files,
+                    cx,
+                )
+            })
+        });
+        self.search_id += 1;
+        self.match_ranges.clear();
+        self.pending_search = Some(cx.spawn(|this, mut cx| async move {
+            let results = search?.await.log_err()?;
+
+            let (_task, mut match_ranges) = this.update(&mut cx, |this, cx| {
+                this.excerpts.update(cx, |excerpts, cx| {
+                    excerpts.clear(cx);
+
+                    let matches = results
+                        .into_iter()
+                        .map(|result| (result.buffer, vec![result.range.start..result.range.start]))
+                        .collect();
+
+                    excerpts.stream_excerpts_with_context_lines(matches, 3, cx)
+                })
+            });
+
+            while let Some(match_range) = match_ranges.next().await {
+                this.update(&mut cx, |this, cx| {
+                    this.match_ranges.push(match_range);
+                    while let Ok(Some(match_range)) = match_ranges.try_next() {
+                        this.match_ranges.push(match_range);
+                    }
+                    cx.notify();
+                });
+            }
+
+            this.update(&mut cx, |this, cx| {
+                this.pending_search.take();
+                cx.notify();
+            });
+
+            None
+        }));
+        cx.notify();
+    }
 }
 
 pub enum ViewEvent {
@@ -195,13 +264,24 @@ impl View for ProjectSearchView {
             enum Status {}
 
             let theme = theme::current(cx).clone();
-            let text = if self.query_editor.read(cx).text(cx).is_empty() {
-                ""
-            } else if model.pending_search.is_some() {
-                "Searching..."
+            let text = if model.pending_search.is_some() {
+                Cow::Borrowed("Searching...")
+            } else if let Some(semantic) = &self.semantic {
+                if semantic.outstanding_file_count > 0 {
+                    Cow::Owned(format!(
+                        "Indexing. {} of {}...",
+                        semantic.file_count - semantic.outstanding_file_count,
+                        semantic.file_count
+                    ))
+                } else {
+                    Cow::Borrowed("Indexing complete")
+                }
+            } else if self.query_editor.read(cx).text(cx).is_empty() {
+                Cow::Borrowed("")
             } else {
-                "No results"
+                Cow::Borrowed("No results")
             };
+
             MouseEventHandler::<Status, _>::new(0, cx, |_, _| {
                 Label::new(text, theme.search.results_status.clone())
                     .aligned()
@@ -490,6 +570,7 @@ impl ProjectSearchView {
             model,
             query_editor,
             results_editor,
+            semantic: None,
             search_options: options,
             panels_with_errors: HashSet::new(),
             active_match_index: None,
@@ -577,11 +658,59 @@ impl ProjectSearchView {
     }
 
     fn search(&mut self, cx: &mut ViewContext<Self>) {
+        if let Some(semantic) = &mut self.semantic {
+            if semantic.outstanding_file_count > 0 {
+                return;
+            }
+
+            let query = self.query_editor.read(cx).text(cx);
+            if let Some((included_files, exclude_files)) =
+                self.get_included_and_excluded_globsets(cx)
+            {
+                self.model.update(cx, |model, cx| {
+                    model.semantic_search(query, included_files, exclude_files, cx)
+                });
+            }
+            return;
+        }
+
         if let Some(query) = self.build_search_query(cx) {
             self.model.update(cx, |model, cx| model.search(query, cx));
         }
     }
 
+    fn get_included_and_excluded_globsets(
+        &mut self,
+        cx: &mut ViewContext<Self>,
+    ) -> Option<(Vec<GlobMatcher>, Vec<GlobMatcher>)> {
+        let included_files =
+            match Self::load_glob_set(&self.included_files_editor.read(cx).text(cx)) {
+                Ok(included_files) => {
+                    self.panels_with_errors.remove(&InputPanel::Include);
+                    included_files
+                }
+                Err(_e) => {
+                    self.panels_with_errors.insert(InputPanel::Include);
+                    cx.notify();
+                    return None;
+                }
+            };
+        let excluded_files =
+            match Self::load_glob_set(&self.excluded_files_editor.read(cx).text(cx)) {
+                Ok(excluded_files) => {
+                    self.panels_with_errors.remove(&InputPanel::Exclude);
+                    excluded_files
+                }
+                Err(_e) => {
+                    self.panels_with_errors.insert(InputPanel::Exclude);
+                    cx.notify();
+                    return None;
+                }
+            };
+
+        Some((included_files, excluded_files))
+    }
+
     fn build_search_query(&mut self, cx: &mut ViewContext<Self>) -> Option<SearchQuery> {
         let text = self.query_editor.read(cx).text(cx);
         let included_files =
@@ -873,6 +1002,7 @@ impl ProjectSearchBar {
         if let Some(search_view) = self.active_project_search.as_ref() {
             search_view.update(cx, |search_view, cx| {
                 search_view.search_options.toggle(option);
+                search_view.semantic = None;
                 search_view.search(cx);
             });
             cx.notify();
@@ -882,6 +1012,61 @@ impl ProjectSearchBar {
         }
     }
 
+    fn toggle_semantic_search(&mut self, cx: &mut ViewContext<Self>) -> bool {
+        if let Some(search_view) = self.active_project_search.as_ref() {
+            search_view.update(cx, |search_view, cx| {
+                if search_view.semantic.is_some() {
+                    search_view.semantic = None;
+                } else if let Some(semantic_index) = SemanticIndex::global(cx) {
+                    // TODO: confirm that it's ok to send this project
+                    search_view.search_options = SearchOptions::none();
+
+                    let project = search_view.model.read(cx).project.clone();
+                    let index_task = semantic_index.update(cx, |semantic_index, cx| {
+                        semantic_index.index_project(project, cx)
+                    });
+
+                    cx.spawn(|search_view, mut cx| async move {
+                        let (files_to_index, mut files_remaining_rx) = index_task.await?;
+
+                        search_view.update(&mut cx, |search_view, cx| {
+                            cx.notify();
+                            search_view.semantic = Some(SemanticSearchState {
+                                file_count: files_to_index,
+                                outstanding_file_count: files_to_index,
+                                _progress_task: cx.spawn(|search_view, mut cx| async move {
+                                    while let Some(count) = files_remaining_rx.recv().await {
+                                        search_view
+                                            .update(&mut cx, |search_view, cx| {
+                                                if let Some(semantic_search_state) =
+                                                    &mut search_view.semantic
+                                                {
+                                                    semantic_search_state.outstanding_file_count =
+                                                        count;
+                                                    cx.notify();
+                                                    if count == 0 {
+                                                        return;
+                                                    }
+                                                }
+                                            })
+                                            .ok();
+                                    }
+                                }),
+                            });
+                        })?;
+                        anyhow::Ok(())
+                    })
+                    .detach_and_log_err(cx);
+                }
+                cx.notify();
+            });
+            cx.notify();
+            true
+        } else {
+            false
+        }
+    }
+
     fn render_nav_button(
         &self,
         icon: &'static str,
@@ -959,6 +1144,42 @@ impl ProjectSearchBar {
         .into_any()
     }
 
+    fn render_semantic_search_button(&self, cx: &mut ViewContext<Self>) -> AnyElement<Self> {
+        let tooltip_style = theme::current(cx).tooltip.clone();
+        let is_active = if let Some(search) = self.active_project_search.as_ref() {
+            let search = search.read(cx);
+            search.semantic.is_some()
+        } else {
+            false
+        };
+
+        let region_id = 3;
+
+        MouseEventHandler::<Self, _>::new(region_id, cx, |state, cx| {
+            let theme = theme::current(cx);
+            let style = theme
+                .search
+                .option_button
+                .in_state(is_active)
+                .style_for(state);
+            Label::new("Semantic", style.text.clone())
+                .contained()
+                .with_style(style.container)
+        })
+        .on_click(MouseButton::Left, move |_, this, cx| {
+            this.toggle_semantic_search(cx);
+        })
+        .with_cursor_style(CursorStyle::PointingHand)
+        .with_tooltip::<Self>(
+            region_id,
+            format!("Toggle Semantic Search"),
+            Some(Box::new(ToggleSemanticSearch)),
+            tooltip_style,
+            cx,
+        )
+        .into_any()
+    }
+
     fn is_option_enabled(&self, option: SearchOptions, cx: &AppContext) -> bool {
         if let Some(search) = self.active_project_search.as_ref() {
             search.read(cx).search_options.contains(option)
@@ -1048,8 +1269,14 @@ impl View for ProjectSearchBar {
                                 .with_child(self.render_nav_button(">", Direction::Next, cx))
                                 .aligned(),
                         )
-                        .with_child(
-                            Flex::row()
+                        .with_child({
+                            let row = if SemanticIndex::enabled(cx) {
+                                Flex::row().with_child(self.render_semantic_search_button(cx))
+                            } else {
+                                Flex::row()
+                            };
+
+                            let row = row
                                 .with_child(self.render_option_button(
                                     "Case",
                                     SearchOptions::CASE_SENSITIVE,
@@ -1067,8 +1294,10 @@ impl View for ProjectSearchBar {
                                 ))
                                 .contained()
                                 .with_style(theme.search.option_button_group)
-                                .aligned(),
-                        )
+                                .aligned();
+
+                            row
+                        })
                         .contained()
                         .with_margin_bottom(row_spacing),
                 )

crates/search/src/search.rs 🔗

@@ -53,6 +53,10 @@ impl SearchOptions {
         }
     }
 
+    pub fn none() -> SearchOptions {
+        SearchOptions::NONE
+    }
+
     pub fn from_query(query: &SearchQuery) -> SearchOptions {
         let mut options = SearchOptions::NONE;
         options.set(SearchOptions::WHOLE_WORD, query.whole_word());

crates/vector_store/Cargo.toml → crates/semantic_index/Cargo.toml 🔗

@@ -1,11 +1,11 @@
 [package]
-name = "vector_store"
+name = "semantic_index"
 version = "0.1.0"
 edition = "2021"
 publish = false
 
 [lib]
-path = "src/vector_store.rs"
+path = "src/semantic_index.rs"
 doctest = false
 
 [dependencies]
@@ -20,6 +20,7 @@ editor = { path = "../editor" }
 rpc = { path = "../rpc" }
 settings = { path = "../settings" }
 anyhow.workspace = true
+postage.workspace = true
 futures.workspace = true
 smol.workspace = true
 rusqlite = { version = "0.27.0", features = ["blob", "array", "modern_sqlite"] }
@@ -33,8 +34,10 @@ async-trait.workspace = true
 bincode = "1.3.3"
 matrixmultiply = "0.3.7"
 tiktoken-rs = "0.5.0"
+parking_lot.workspace = true
 rand.workspace = true
 schemars.workspace = true
+globset.workspace = true
 
 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
@@ -43,7 +46,17 @@ project = { path = "../project", features = ["test-support"] }
 rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
 settings = { path = "../settings", features = ["test-support"]}
-tree-sitter-rust = "*"
+
+pretty_assertions.workspace = true
 rand.workspace = true
 unindent.workspace = true
 tempdir.workspace = true
+ctor.workspace = true
+env_logger.workspace = true
+
+tree-sitter-typescript = "*"
+tree-sitter-json = "*"
+tree-sitter-rust = "*"
+tree-sitter-toml = "*"
+tree-sitter-cpp = "*"
+tree-sitter-elixir = "*"

crates/vector_store/src/db.rs → crates/semantic_index/src/db.rs 🔗

@@ -1,21 +1,22 @@
+use crate::{parsing::Document, SEMANTIC_INDEX_VERSION};
+use anyhow::{anyhow, Context, Result};
+use globset::GlobMatcher;
+use project::Fs;
+use rpc::proto::Timestamp;
+use rusqlite::{
+    params,
+    types::{FromSql, FromSqlResult, ValueRef},
+};
 use std::{
     cmp::Ordering,
     collections::HashMap,
+    ops::Range,
     path::{Path, PathBuf},
     rc::Rc,
+    sync::Arc,
     time::SystemTime,
 };
 
-use anyhow::{anyhow, Result};
-
-use crate::parsing::ParsedFile;
-use crate::VECTOR_STORE_VERSION;
-use rpc::proto::Timestamp;
-use rusqlite::{
-    params,
-    types::{FromSql, FromSqlResult, ValueRef},
-};
-
 #[derive(Debug)]
 pub struct FileRecord {
     pub id: usize,
@@ -42,48 +43,94 @@ pub struct VectorDatabase {
 }
 
 impl VectorDatabase {
-    pub fn new(path: String) -> Result<Self> {
+    pub async fn new(fs: Arc<dyn Fs>, path: Arc<PathBuf>) -> Result<Self> {
+        if let Some(db_directory) = path.parent() {
+            fs.create_dir(db_directory).await?;
+        }
+
         let this = Self {
-            db: rusqlite::Connection::open(path)?,
+            db: rusqlite::Connection::open(path.as_path())?,
         };
         this.initialize_database()?;
         Ok(this)
     }
 
+    fn get_existing_version(&self) -> Result<i64> {
+        let mut version_query = self
+            .db
+            .prepare("SELECT version from semantic_index_config")?;
+        version_query
+            .query_row([], |row| Ok(row.get::<_, i64>(0)?))
+            .map_err(|err| anyhow!("version query failed: {err}"))
+    }
+
     fn initialize_database(&self) -> Result<()> {
         rusqlite::vtab::array::load_module(&self.db)?;
 
-        // This will create the database if it doesnt exist
+        // Delete existing tables if SEMANTIC_INDEX_VERSION has been bumped
+        if self
+            .get_existing_version()
+            .map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64)
+        {
+            log::trace!("vector database schema up to date");
+            return Ok(());
+        }
+
+        log::trace!("vector database schema out of date. updating...");
+        self.db
+            .execute("DROP TABLE IF EXISTS documents", [])
+            .context("failed to drop 'documents' table")?;
+        self.db
+            .execute("DROP TABLE IF EXISTS files", [])
+            .context("failed to drop 'files' table")?;
+        self.db
+            .execute("DROP TABLE IF EXISTS worktrees", [])
+            .context("failed to drop 'worktrees' table")?;
+        self.db
+            .execute("DROP TABLE IF EXISTS semantic_index_config", [])
+            .context("failed to drop 'semantic_index_config' table")?;
 
         // Initialize Vector Databasing Tables
         self.db.execute(
-            "CREATE TABLE IF NOT EXISTS worktrees (
+            "CREATE TABLE semantic_index_config (
+                version INTEGER NOT NULL
+            )",
+            [],
+        )?;
+
+        self.db.execute(
+            "INSERT INTO semantic_index_config (version) VALUES (?1)",
+            params![SEMANTIC_INDEX_VERSION],
+        )?;
+
+        self.db.execute(
+            "CREATE TABLE worktrees (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 absolute_path VARCHAR NOT NULL
             );
-            CREATE UNIQUE INDEX IF NOT EXISTS worktrees_absolute_path ON worktrees (absolute_path);
+            CREATE UNIQUE INDEX worktrees_absolute_path ON worktrees (absolute_path);
             ",
             [],
         )?;
 
         self.db.execute(
-            "CREATE TABLE IF NOT EXISTS files (
+            "CREATE TABLE files (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 worktree_id INTEGER NOT NULL,
                 relative_path VARCHAR NOT NULL,
                 mtime_seconds INTEGER NOT NULL,
                 mtime_nanos INTEGER NOT NULL,
-                vector_store_version INTEGER NOT NULL,
                 FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
             )",
             [],
         )?;
 
         self.db.execute(
-            "CREATE TABLE IF NOT EXISTS documents (
+            "CREATE TABLE documents (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 file_id INTEGER NOT NULL,
-                offset INTEGER NOT NULL,
+                start_byte INTEGER NOT NULL,
+                end_byte INTEGER NOT NULL,
                 name VARCHAR NOT NULL,
                 embedding BLOB NOT NULL,
                 FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
@@ -91,6 +138,7 @@ impl VectorDatabase {
             [],
         )?;
 
+        log::trace!("vector database initialized with updated schema.");
         Ok(())
     }
 
@@ -102,43 +150,44 @@ impl VectorDatabase {
         Ok(())
     }
 
-    pub fn insert_file(&self, worktree_id: i64, indexed_file: ParsedFile) -> Result<()> {
+    pub fn insert_file(
+        &self,
+        worktree_id: i64,
+        path: PathBuf,
+        mtime: SystemTime,
+        documents: Vec<Document>,
+    ) -> Result<()> {
         // Write to files table, and return generated id.
         self.db.execute(
             "
             DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
             ",
-            params![worktree_id, indexed_file.path.to_str()],
+            params![worktree_id, path.to_str()],
         )?;
-        let mtime = Timestamp::from(indexed_file.mtime);
+        let mtime = Timestamp::from(mtime);
         self.db.execute(
             "
             INSERT INTO files
-            (worktree_id, relative_path, mtime_seconds, mtime_nanos, vector_store_version)
+            (worktree_id, relative_path, mtime_seconds, mtime_nanos)
             VALUES
-            (?1, ?2, $3, $4, $5);
+            (?1, ?2, $3, $4);
             ",
-            params![
-                worktree_id,
-                indexed_file.path.to_str(),
-                mtime.seconds,
-                mtime.nanos,
-                VECTOR_STORE_VERSION
-            ],
+            params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
         )?;
 
         let file_id = self.db.last_insert_rowid();
 
         // Currently inserting at approximately 3400 documents a second
         // I imagine we can speed this up with a bulk insert of some kind.
-        for document in indexed_file.documents {
+        for document in documents {
             let embedding_blob = bincode::serialize(&document.embedding)?;
 
             self.db.execute(
-                "INSERT INTO documents (file_id, offset, name, embedding) VALUES (?1, ?2, ?3, ?4)",
+                "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding) VALUES (?1, ?2, ?3, ?4, $5)",
                 params![
                     file_id,
-                    document.offset.to_string(),
+                    document.range.start.to_string(),
+                    document.range.end.to_string(),
                     document.name,
                     embedding_blob
                 ],
@@ -148,6 +197,23 @@ impl VectorDatabase {
         Ok(())
     }
 
+    pub fn worktree_previously_indexed(&self, worktree_root_path: &Path) -> Result<bool> {
+        let mut worktree_query = self
+            .db
+            .prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
+        let worktree_id = worktree_query
+            .query_row(params![worktree_root_path.to_string_lossy()], |row| {
+                Ok(row.get::<_, i64>(0)?)
+            })
+            .map_err(|err| anyhow!(err));
+
+        if worktree_id.is_ok() {
+            return Ok(true);
+        } else {
+            return Ok(false);
+        }
+    }
+
     pub fn find_or_create_worktree(&self, worktree_root_path: &Path) -> Result<i64> {
         // Check that the absolute path doesnt exist
         let mut worktree_query = self
@@ -204,19 +270,26 @@ impl VectorDatabase {
         worktree_ids: &[i64],
         query_embedding: &Vec<f32>,
         limit: usize,
-    ) -> Result<Vec<(i64, PathBuf, usize, String)>> {
+        include_globs: Vec<GlobMatcher>,
+        exclude_globs: Vec<GlobMatcher>,
+    ) -> Result<Vec<(i64, PathBuf, Range<usize>)>> {
         let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1);
-        self.for_each_document(&worktree_ids, |id, embedding| {
-            let similarity = dot(&embedding, &query_embedding);
-            let ix = match results
-                .binary_search_by(|(_, s)| similarity.partial_cmp(&s).unwrap_or(Ordering::Equal))
-            {
-                Ok(ix) => ix,
-                Err(ix) => ix,
-            };
-            results.insert(ix, (id, similarity));
-            results.truncate(limit);
-        })?;
+        self.for_each_document(
+            &worktree_ids,
+            include_globs,
+            exclude_globs,
+            |id, embedding| {
+                let similarity = dot(&embedding, &query_embedding);
+                let ix = match results.binary_search_by(|(_, s)| {
+                    similarity.partial_cmp(&s).unwrap_or(Ordering::Equal)
+                }) {
+                    Ok(ix) => ix,
+                    Err(ix) => ix,
+                };
+                results.insert(ix, (id, similarity));
+                results.truncate(limit);
+            },
+        )?;
 
         let ids = results.into_iter().map(|(id, _)| id).collect::<Vec<_>>();
         self.get_documents_by_ids(&ids)
@@ -225,22 +298,51 @@ impl VectorDatabase {
     fn for_each_document(
         &self,
         worktree_ids: &[i64],
+        include_globs: Vec<GlobMatcher>,
+        exclude_globs: Vec<GlobMatcher>,
         mut f: impl FnMut(i64, Vec<f32>),
     ) -> Result<()> {
+        let mut file_query = self.db.prepare(
+            "
+            SELECT
+                id, relative_path
+            FROM
+                files
+            WHERE
+                worktree_id IN rarray(?)
+            ",
+        )?;
+
+        let mut file_ids = Vec::<i64>::new();
+        let mut rows = file_query.query([ids_to_sql(worktree_ids)])?;
+        while let Some(row) = rows.next()? {
+            let file_id = row.get(0)?;
+            let relative_path = row.get_ref(1)?.as_str()?;
+            let included = include_globs.is_empty()
+                || include_globs
+                    .iter()
+                    .any(|glob| glob.is_match(relative_path));
+            let excluded = exclude_globs
+                .iter()
+                .any(|glob| glob.is_match(relative_path));
+            if included && !excluded {
+                file_ids.push(file_id);
+            }
+        }
+
         let mut query_statement = self.db.prepare(
             "
             SELECT
-                documents.id, documents.embedding
+                id, embedding
             FROM
-                documents, files
+                documents
             WHERE
-                documents.file_id = files.id AND
-                files.worktree_id IN rarray(?)
+                file_id IN rarray(?)
             ",
         )?;
 
         query_statement
-            .query_map(params![ids_to_sql(worktree_ids)], |row| {
+            .query_map(params![ids_to_sql(&file_ids)], |row| {
                 Ok((row.get(0)?, row.get::<_, Embedding>(1)?))
             })?
             .filter_map(|row| row.ok())
@@ -248,11 +350,15 @@ impl VectorDatabase {
         Ok(())
     }
 
-    fn get_documents_by_ids(&self, ids: &[i64]) -> Result<Vec<(i64, PathBuf, usize, String)>> {
+    fn get_documents_by_ids(&self, ids: &[i64]) -> Result<Vec<(i64, PathBuf, Range<usize>)>> {
         let mut statement = self.db.prepare(
             "
                 SELECT
-                    documents.id, files.worktree_id, files.relative_path, documents.offset, documents.name
+                    documents.id,
+                    files.worktree_id,
+                    files.relative_path,
+                    documents.start_byte,
+                    documents.end_byte
                 FROM
                     documents, files
                 WHERE
@@ -266,15 +372,14 @@ impl VectorDatabase {
                 row.get::<_, i64>(0)?,
                 row.get::<_, i64>(1)?,
                 row.get::<_, String>(2)?.into(),
-                row.get(3)?,
-                row.get(4)?,
+                row.get(3)?..row.get(4)?,
             ))
         })?;
 
-        let mut values_by_id = HashMap::<i64, (i64, PathBuf, usize, String)>::default();
+        let mut values_by_id = HashMap::<i64, (i64, PathBuf, Range<usize>)>::default();
         for row in result_iter {
-            let (id, worktree_id, path, offset, name) = row?;
-            values_by_id.insert(id, (worktree_id, path, offset, name));
+            let (id, worktree_id, path, range) = row?;
+            values_by_id.insert(id, (worktree_id, path, range));
         }
 
         let mut results = Vec::with_capacity(ids.len());

crates/vector_store/src/embedding.rs → crates/semantic_index/src/embedding.rs 🔗

@@ -67,17 +67,16 @@ impl EmbeddingProvider for DummyEmbeddings {
     }
 }
 
-const INPUT_LIMIT: usize = 8190;
+const OPENAI_INPUT_LIMIT: usize = 8190;
 
 impl OpenAIEmbeddings {
     fn truncate(span: String) -> String {
         let mut tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span.as_ref());
-        if tokens.len() > INPUT_LIMIT {
-            tokens.truncate(INPUT_LIMIT);
+        if tokens.len() > OPENAI_INPUT_LIMIT {
+            tokens.truncate(OPENAI_INPUT_LIMIT);
             let result = OPENAI_BPE_TOKENIZER.decode(tokens.clone());
             if result.is_ok() {
                 let transformed = result.unwrap();
-                // assert_ne!(transformed, span);
                 return transformed;
             }
         }
@@ -88,6 +87,7 @@ impl OpenAIEmbeddings {
     async fn send_request(&self, api_key: &str, spans: Vec<&str>) -> Result<Response<AsyncBody>> {
         let request = Request::post("https://api.openai.com/v1/embeddings")
             .redirect_policy(isahc::config::RedirectPolicy::Follow)
+            .timeout(Duration::from_secs(4))
             .header("Content-Type", "application/json")
             .header("Authorization", format!("Bearer {}", api_key))
             .body(
@@ -106,7 +106,7 @@ impl OpenAIEmbeddings {
 #[async_trait]
 impl EmbeddingProvider for OpenAIEmbeddings {
     async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {
-        const BACKOFF_SECONDS: [usize; 3] = [65, 180, 360];
+        const BACKOFF_SECONDS: [usize; 3] = [45, 75, 125];
         const MAX_RETRIES: usize = 3;
 
         let api_key = OPENAI_API_KEY
@@ -114,6 +114,7 @@ impl EmbeddingProvider for OpenAIEmbeddings {
             .ok_or_else(|| anyhow!("no api key"))?;
 
         let mut request_number = 0;
+        let mut truncated = false;
         let mut response: Response<AsyncBody>;
         let mut spans: Vec<String> = spans.iter().map(|x| x.to_string()).collect();
         while request_number < MAX_RETRIES {
@@ -132,14 +133,25 @@ impl EmbeddingProvider for OpenAIEmbeddings {
             match response.status() {
                 StatusCode::TOO_MANY_REQUESTS => {
                     let delay = Duration::from_secs(BACKOFF_SECONDS[request_number - 1] as u64);
+                    log::trace!(
+                        "open ai rate limiting, delaying request by {:?} seconds",
+                        delay.as_secs()
+                    );
                     self.executor.timer(delay).await;
                 }
                 StatusCode::BAD_REQUEST => {
-                    log::info!("BAD REQUEST: {:?}", &response.status());
-                    // Don't worry about delaying bad request, as we can assume
-                    // we haven't been rate limited yet.
-                    for span in spans.iter_mut() {
-                        *span = Self::truncate(span.to_string());
+                    // Only truncate if it hasn't been truncated before
+                    if !truncated {
+                        for span in spans.iter_mut() {
+                            *span = Self::truncate(span.clone());
+                        }
+                        truncated = true;
+                    } else {
+                        // If failing once already truncated, log the error and break the loop
+                        let mut body = String::new();
+                        response.body_mut().read_to_string(&mut body).await?;
+                        log::trace!("open ai bad request: {:?} {:?}", &response.status(), body);
+                        break;
                     }
                 }
                 StatusCode::OK => {
@@ -147,7 +159,7 @@ impl EmbeddingProvider for OpenAIEmbeddings {
                     response.body_mut().read_to_string(&mut body).await?;
                     let response: OpenAIEmbeddingResponse = serde_json::from_str(&body)?;
 
-                    log::info!(
+                    log::trace!(
                         "openai embedding completed. tokens: {:?}",
                         response.usage.total_tokens
                     );

crates/semantic_index/src/parsing.rs 🔗

@@ -0,0 +1,299 @@
+use anyhow::{anyhow, Ok, Result};
+use language::{Grammar, Language};
+use std::{
+    cmp::{self, Reverse},
+    collections::HashSet,
+    ops::Range,
+    path::Path,
+    sync::Arc,
+};
+use tree_sitter::{Parser, QueryCursor};
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct Document {
+    pub name: String,
+    pub range: Range<usize>,
+    pub content: String,
+    pub embedding: Vec<f32>,
+}
+
+const CODE_CONTEXT_TEMPLATE: &str =
+    "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+const ENTIRE_FILE_TEMPLATE: &str =
+    "The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &["TOML", "YAML", "CSS"];
+
+pub struct CodeContextRetriever {
+    pub parser: Parser,
+    pub cursor: QueryCursor,
+}
+
+// Every match has an item; this represents the fundamental tree-sitter symbol and anchors the search.
+// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
+// If there are preceding comments, we track them with a context capture.
+// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture.
+// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture.
+#[derive(Debug, Clone)]
+pub struct CodeContextMatch {
+    pub start_col: usize,
+    pub item_range: Option<Range<usize>>,
+    pub name_range: Option<Range<usize>>,
+    pub context_ranges: Vec<Range<usize>>,
+    pub collapse_ranges: Vec<Range<usize>>,
+}
+
+impl CodeContextRetriever {
+    pub fn new() -> Self {
+        Self {
+            parser: Parser::new(),
+            cursor: QueryCursor::new(),
+        }
+    }
+
+    fn parse_entire_file(
+        &self,
+        relative_path: &Path,
+        language_name: Arc<str>,
+        content: &str,
+    ) -> Result<Vec<Document>> {
+        let document_span = ENTIRE_FILE_TEMPLATE
+            .replace("<path>", relative_path.to_string_lossy().as_ref())
+            .replace("<language>", language_name.as_ref())
+            .replace("<item>", content);
+
+        Ok(vec![Document {
+            range: 0..content.len(),
+            content: document_span,
+            embedding: Vec::new(),
+            name: language_name.to_string(),
+        }])
+    }
+
+    fn get_matches_in_file(
+        &mut self,
+        content: &str,
+        grammar: &Arc<Grammar>,
+    ) -> Result<Vec<CodeContextMatch>> {
+        let embedding_config = grammar
+            .embedding_config
+            .as_ref()
+            .ok_or_else(|| anyhow!("no embedding queries"))?;
+        self.parser.set_language(grammar.ts_language).unwrap();
+
+        let tree = self
+            .parser
+            .parse(&content, None)
+            .ok_or_else(|| anyhow!("parsing failed"))?;
+
+        let mut captures: Vec<CodeContextMatch> = Vec::new();
+        let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
+        let mut keep_ranges: Vec<Range<usize>> = Vec::new();
+        for mat in self.cursor.matches(
+            &embedding_config.query,
+            tree.root_node(),
+            content.as_bytes(),
+        ) {
+            let mut start_col = 0;
+            let mut item_range: Option<Range<usize>> = None;
+            let mut name_range: Option<Range<usize>> = None;
+            let mut context_ranges: Vec<Range<usize>> = Vec::new();
+            collapse_ranges.clear();
+            keep_ranges.clear();
+            for capture in mat.captures {
+                if capture.index == embedding_config.item_capture_ix {
+                    item_range = Some(capture.node.byte_range());
+                    start_col = capture.node.start_position().column;
+                } else if Some(capture.index) == embedding_config.name_capture_ix {
+                    name_range = Some(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.context_capture_ix {
+                    context_ranges.push(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.collapse_capture_ix {
+                    collapse_ranges.push(capture.node.byte_range());
+                } else if Some(capture.index) == embedding_config.keep_capture_ix {
+                    keep_ranges.push(capture.node.byte_range());
+                }
+            }
+
+            captures.push(CodeContextMatch {
+                start_col,
+                item_range,
+                name_range,
+                context_ranges,
+                collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
+            });
+        }
+        Ok(captures)
+    }
+
+    pub fn parse_file_with_template(
+        &mut self,
+        relative_path: &Path,
+        content: &str,
+        language: Arc<Language>,
+    ) -> Result<Vec<Document>> {
+        let language_name = language.name();
+
+        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) {
+            return self.parse_entire_file(relative_path, language_name, &content);
+        }
+
+        let mut documents = self.parse_file(content, language)?;
+        for document in &mut documents {
+            document.content = CODE_CONTEXT_TEMPLATE
+                .replace("<path>", relative_path.to_string_lossy().as_ref())
+                .replace("<language>", language_name.as_ref())
+                .replace("<item>", &document.content);
+        }
+        Ok(documents)
+    }
+
+    pub fn parse_file(&mut self, content: &str, language: Arc<Language>) -> Result<Vec<Document>> {
+        let grammar = language
+            .grammar()
+            .ok_or_else(|| anyhow!("no grammar for language"))?;
+
+        // Iterate through query matches
+        let matches = self.get_matches_in_file(content, grammar)?;
+
+        let language_scope = language.default_scope();
+        let placeholder = language_scope.collapsed_placeholder();
+
+        let mut documents = Vec::new();
+        let mut collapsed_ranges_within = Vec::new();
+        let mut parsed_name_ranges = HashSet::new();
+        for (i, context_match) in matches.iter().enumerate() {
+            // Items which are collapsible but not embeddable have no item range
+            let item_range = if let Some(item_range) = context_match.item_range.clone() {
+                item_range
+            } else {
+                continue;
+            };
+
+            // Checks for deduplication
+            let name;
+            if let Some(name_range) = context_match.name_range.clone() {
+                name = content
+                    .get(name_range.clone())
+                    .map_or(String::new(), |s| s.to_string());
+                if parsed_name_ranges.contains(&name_range) {
+                    continue;
+                }
+                parsed_name_ranges.insert(name_range);
+            } else {
+                name = String::new();
+            }
+
+            collapsed_ranges_within.clear();
+            'outer: for remaining_match in &matches[(i + 1)..] {
+                for collapsed_range in &remaining_match.collapse_ranges {
+                    if item_range.start <= collapsed_range.start
+                        && item_range.end >= collapsed_range.end
+                    {
+                        collapsed_ranges_within.push(collapsed_range.clone());
+                    } else {
+                        break 'outer;
+                    }
+                }
+            }
+
+            collapsed_ranges_within.sort_by_key(|r| (r.start, Reverse(r.end)));
+
+            let mut document_content = String::new();
+            for context_range in &context_match.context_ranges {
+                document_content.push_str(&content[context_range.clone()]);
+                document_content.push_str("\n");
+            }
+
+            let mut offset = item_range.start;
+            for collapsed_range in &collapsed_ranges_within {
+                if collapsed_range.start > offset {
+                    add_content_from_range(
+                        &mut document_content,
+                        content,
+                        offset..collapsed_range.start,
+                        context_match.start_col,
+                    );
+                    offset = collapsed_range.start;
+                }
+
+                if collapsed_range.end > offset {
+                    document_content.push_str(placeholder);
+                    offset = collapsed_range.end;
+                }
+            }
+
+            if offset < item_range.end {
+                add_content_from_range(
+                    &mut document_content,
+                    content,
+                    offset..item_range.end,
+                    context_match.start_col,
+                );
+            }
+
+            documents.push(Document {
+                name,
+                content: document_content,
+                range: item_range.clone(),
+                embedding: vec![],
+            })
+        }
+
+        return Ok(documents);
+    }
+}
+
+pub(crate) fn subtract_ranges(
+    ranges: &[Range<usize>],
+    ranges_to_subtract: &[Range<usize>],
+) -> Vec<Range<usize>> {
+    let mut result = Vec::new();
+
+    let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();
+
+    for range in ranges {
+        let mut offset = range.start;
+
+        while offset < range.end {
+            if let Some(range_to_subtract) = ranges_to_subtract.peek() {
+                if offset < range_to_subtract.start {
+                    let next_offset = cmp::min(range_to_subtract.start, range.end);
+                    result.push(offset..next_offset);
+                    offset = next_offset;
+                } else {
+                    let next_offset = cmp::min(range_to_subtract.end, range.end);
+                    offset = next_offset;
+                }
+
+                if offset >= range_to_subtract.end {
+                    ranges_to_subtract.next();
+                }
+            } else {
+                result.push(offset..range.end);
+                offset = range.end;
+            }
+        }
+    }
+
+    result
+}
+
+fn add_content_from_range(
+    output: &mut String,
+    content: &str,
+    range: Range<usize>,
+    start_col: usize,
+) {
+    for mut line in content.get(range.clone()).unwrap_or("").lines() {
+        for _ in 0..start_col {
+            if line.starts_with(' ') {
+                line = &line[1..];
+            } else {
+                break;
+            }
+        }
+        output.push_str(line);
+        output.push('\n');
+    }
+    output.pop();
+}

crates/semantic_index/src/semantic_index.rs 🔗

@@ -0,0 +1,777 @@
+mod db;
+mod embedding;
+mod parsing;
+pub mod semantic_index_settings;
+
+#[cfg(test)]
+mod semantic_index_tests;
+
+use crate::semantic_index_settings::SemanticIndexSettings;
+use anyhow::{anyhow, Result};
+use db::VectorDatabase;
+use embedding::{EmbeddingProvider, OpenAIEmbeddings};
+use futures::{channel::oneshot, Future};
+use globset::GlobMatcher;
+use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle};
+use language::{Anchor, Buffer, Language, LanguageRegistry};
+use parking_lot::Mutex;
+use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
+use postage::watch;
+use project::{Fs, Project, WorktreeId};
+use smol::channel;
+use std::{
+    collections::HashMap,
+    mem,
+    ops::Range,
+    path::{Path, PathBuf},
+    sync::{Arc, Weak},
+    time::{Instant, SystemTime},
+};
+use util::{
+    channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME},
+    http::HttpClient,
+    paths::EMBEDDINGS_DIR,
+    ResultExt,
+};
+
+const SEMANTIC_INDEX_VERSION: usize = 6;
+const EMBEDDINGS_BATCH_SIZE: usize = 80;
+
+pub fn init(
+    fs: Arc<dyn Fs>,
+    http_client: Arc<dyn HttpClient>,
+    language_registry: Arc<LanguageRegistry>,
+    cx: &mut AppContext,
+) {
+    settings::register::<SemanticIndexSettings>(cx);
+
+    let db_file_path = EMBEDDINGS_DIR
+        .join(Path::new(RELEASE_CHANNEL_NAME.as_str()))
+        .join("embeddings_db");
+
+    if *RELEASE_CHANNEL == ReleaseChannel::Stable
+        || !settings::get::<SemanticIndexSettings>(cx).enabled
+    {
+        return;
+    }
+
+    cx.spawn(move |mut cx| async move {
+        let semantic_index = SemanticIndex::new(
+            fs,
+            db_file_path,
+            Arc::new(OpenAIEmbeddings {
+                client: http_client,
+                executor: cx.background(),
+            }),
+            language_registry,
+            cx.clone(),
+        )
+        .await?;
+
+        cx.update(|cx| {
+            cx.set_global(semantic_index.clone());
+        });
+
+        anyhow::Ok(())
+    })
+    .detach();
+}
+
+pub struct SemanticIndex {
+    fs: Arc<dyn Fs>,
+    database_url: Arc<PathBuf>,
+    embedding_provider: Arc<dyn EmbeddingProvider>,
+    language_registry: Arc<LanguageRegistry>,
+    db_update_tx: channel::Sender<DbOperation>,
+    parsing_files_tx: channel::Sender<PendingFile>,
+    _db_update_task: Task<()>,
+    _embed_batch_tasks: Vec<Task<()>>,
+    _batch_files_task: Task<()>,
+    _parsing_files_tasks: Vec<Task<()>>,
+    projects: HashMap<WeakModelHandle<Project>, ProjectState>,
+}
+
+struct ProjectState {
+    worktree_db_ids: Vec<(WorktreeId, i64)>,
+    outstanding_job_count_rx: watch::Receiver<usize>,
+    _outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
+}
+
+struct JobHandle {
+    tx: Weak<Mutex<watch::Sender<usize>>>,
+}
+
+impl ProjectState {
+    fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option<i64> {
+        self.worktree_db_ids
+            .iter()
+            .find_map(|(worktree_id, db_id)| {
+                if *worktree_id == id {
+                    Some(*db_id)
+                } else {
+                    None
+                }
+            })
+    }
+
+    fn worktree_id_for_db_id(&self, id: i64) -> Option<WorktreeId> {
+        self.worktree_db_ids
+            .iter()
+            .find_map(|(worktree_id, db_id)| {
+                if *db_id == id {
+                    Some(*worktree_id)
+                } else {
+                    None
+                }
+            })
+    }
+}
+
+pub struct PendingFile {
+    worktree_db_id: i64,
+    relative_path: PathBuf,
+    absolute_path: PathBuf,
+    language: Arc<Language>,
+    modified_time: SystemTime,
+    job_handle: JobHandle,
+}
+
+pub struct SearchResult {
+    pub buffer: ModelHandle<Buffer>,
+    pub range: Range<Anchor>,
+}
+
+enum DbOperation {
+    InsertFile {
+        worktree_id: i64,
+        documents: Vec<Document>,
+        path: PathBuf,
+        mtime: SystemTime,
+        job_handle: JobHandle,
+    },
+    Delete {
+        worktree_id: i64,
+        path: PathBuf,
+    },
+    FindOrCreateWorktree {
+        path: PathBuf,
+        sender: oneshot::Sender<Result<i64>>,
+    },
+    FileMTimes {
+        worktree_id: i64,
+        sender: oneshot::Sender<Result<HashMap<PathBuf, SystemTime>>>,
+    },
+    WorktreePreviouslyIndexed {
+        path: Arc<Path>,
+        sender: oneshot::Sender<Result<bool>>,
+    },
+}
+
+enum EmbeddingJob {
+    Enqueue {
+        worktree_id: i64,
+        path: PathBuf,
+        mtime: SystemTime,
+        documents: Vec<Document>,
+        job_handle: JobHandle,
+    },
+    Flush,
+}
+
+impl SemanticIndex {
+    pub fn global(cx: &AppContext) -> Option<ModelHandle<SemanticIndex>> {
+        if cx.has_global::<ModelHandle<Self>>() {
+            Some(cx.global::<ModelHandle<SemanticIndex>>().clone())
+        } else {
+            None
+        }
+    }
+
+    pub fn enabled(cx: &AppContext) -> bool {
+        settings::get::<SemanticIndexSettings>(cx).enabled
+            && *RELEASE_CHANNEL != ReleaseChannel::Stable
+    }
+
+    async fn new(
+        fs: Arc<dyn Fs>,
+        database_url: PathBuf,
+        embedding_provider: Arc<dyn EmbeddingProvider>,
+        language_registry: Arc<LanguageRegistry>,
+        mut cx: AsyncAppContext,
+    ) -> Result<ModelHandle<Self>> {
+        let t0 = Instant::now();
+        let database_url = Arc::new(database_url);
+
+        let db = cx
+            .background()
+            .spawn(VectorDatabase::new(fs.clone(), database_url.clone()))
+            .await?;
+
+        log::trace!(
+            "db initialization took {:?} milliseconds",
+            t0.elapsed().as_millis()
+        );
+
+        Ok(cx.add_model(|cx| {
+            let t0 = Instant::now();
+            // Perform database operations
+            let (db_update_tx, db_update_rx) = channel::unbounded();
+            let _db_update_task = cx.background().spawn({
+                async move {
+                    while let Ok(job) = db_update_rx.recv().await {
+                        Self::run_db_operation(&db, job)
+                    }
+                }
+            });
+
+            // Compute embeddings for each received batch and forward the results to the db task.
+            let (embed_batch_tx, embed_batch_rx) =
+                channel::unbounded::<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>();
+            let mut _embed_batch_tasks = Vec::new();
+            for _ in 0..cx.background().num_cpus() {
+                let embed_batch_rx = embed_batch_rx.clone();
+                _embed_batch_tasks.push(cx.background().spawn({
+                    let db_update_tx = db_update_tx.clone();
+                    let embedding_provider = embedding_provider.clone();
+                    async move {
+                        while let Ok(embeddings_queue) = embed_batch_rx.recv().await {
+                            Self::compute_embeddings_for_batch(
+                                embeddings_queue,
+                                &embedding_provider,
+                                &db_update_tx,
+                            )
+                            .await;
+                        }
+                    }
+                }));
+            }
+
+            // Group documents into batches and send them to the embedding provider.
+            let (batch_files_tx, batch_files_rx) = channel::unbounded::<EmbeddingJob>();
+            let _batch_files_task = cx.background().spawn(async move {
+                let mut queue_len = 0;
+                let mut embeddings_queue = vec![];
+                while let Ok(job) = batch_files_rx.recv().await {
+                    Self::enqueue_documents_to_embed(
+                        job,
+                        &mut queue_len,
+                        &mut embeddings_queue,
+                        &embed_batch_tx,
+                    );
+                }
+            });
+
+            // Parse files into embeddable documents.
+            let (parsing_files_tx, parsing_files_rx) = channel::unbounded::<PendingFile>();
+            let mut _parsing_files_tasks = Vec::new();
+            for _ in 0..cx.background().num_cpus() {
+                let fs = fs.clone();
+                let parsing_files_rx = parsing_files_rx.clone();
+                let batch_files_tx = batch_files_tx.clone();
+                let db_update_tx = db_update_tx.clone();
+                _parsing_files_tasks.push(cx.background().spawn(async move {
+                    let mut retriever = CodeContextRetriever::new();
+                    while let Ok(pending_file) = parsing_files_rx.recv().await {
+                        Self::parse_file(
+                            &fs,
+                            pending_file,
+                            &mut retriever,
+                            &batch_files_tx,
+                            &parsing_files_rx,
+                            &db_update_tx,
+                        )
+                        .await;
+                    }
+                }));
+            }
+
+            log::trace!(
+                "semantic index task initialization took {:?} milliseconds",
+                t0.elapsed().as_millis()
+            );
+            Self {
+                fs,
+                database_url,
+                embedding_provider,
+                language_registry,
+                db_update_tx,
+                parsing_files_tx,
+                _db_update_task,
+                _embed_batch_tasks,
+                _batch_files_task,
+                _parsing_files_tasks,
+                projects: HashMap::new(),
+            }
+        }))
+    }
+
+    fn run_db_operation(db: &VectorDatabase, job: DbOperation) {
+        match job {
+            DbOperation::InsertFile {
+                worktree_id,
+                documents,
+                path,
+                mtime,
+                job_handle,
+            } => {
+                db.insert_file(worktree_id, path, mtime, documents)
+                    .log_err();
+                drop(job_handle)
+            }
+            DbOperation::Delete { worktree_id, path } => {
+                db.delete_file(worktree_id, path).log_err();
+            }
+            DbOperation::FindOrCreateWorktree { path, sender } => {
+                let id = db.find_or_create_worktree(&path);
+                sender.send(id).ok();
+            }
+            DbOperation::FileMTimes {
+                worktree_id: worktree_db_id,
+                sender,
+            } => {
+                let file_mtimes = db.get_file_mtimes(worktree_db_id);
+                sender.send(file_mtimes).ok();
+            }
+            DbOperation::WorktreePreviouslyIndexed { path, sender } => {
+                let worktree_indexed = db.worktree_previously_indexed(path.as_ref());
+                sender.send(worktree_indexed).ok();
+            }
+        }
+    }
+
+    async fn compute_embeddings_for_batch(
+        mut embeddings_queue: Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
+        embedding_provider: &Arc<dyn EmbeddingProvider>,
+        db_update_tx: &channel::Sender<DbOperation>,
+    ) {
+        let mut batch_documents = vec![];
+        for (_, documents, _, _, _) in embeddings_queue.iter() {
+            batch_documents.extend(documents.iter().map(|document| document.content.as_str()));
+        }
+
+        if let Ok(embeddings) = embedding_provider.embed_batch(batch_documents).await {
+            log::trace!(
+                "created {} embeddings for {} files",
+                embeddings.len(),
+                embeddings_queue.len(),
+            );
+
+            let mut i = 0;
+            let mut j = 0;
+
+            for embedding in embeddings.iter() {
+                while embeddings_queue[i].1.len() == j {
+                    i += 1;
+                    j = 0;
+                }
+
+                embeddings_queue[i].1[j].embedding = embedding.to_owned();
+                j += 1;
+            }
+
+            for (worktree_id, documents, path, mtime, job_handle) in embeddings_queue.into_iter() {
+                db_update_tx
+                    .send(DbOperation::InsertFile {
+                        worktree_id,
+                        documents,
+                        path,
+                        mtime,
+                        job_handle,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+    }
+
+    fn enqueue_documents_to_embed(
+        job: EmbeddingJob,
+        queue_len: &mut usize,
+        embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
+        embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
+    ) {
+        let should_flush = match job {
+            EmbeddingJob::Enqueue {
+                documents,
+                worktree_id,
+                path,
+                mtime,
+                job_handle,
+            } => {
+                *queue_len += &documents.len();
+                embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
+                *queue_len >= EMBEDDINGS_BATCH_SIZE
+            }
+            EmbeddingJob::Flush => true,
+        };
+
+        if should_flush {
+            embed_batch_tx
+                .try_send(mem::take(embeddings_queue))
+                .unwrap();
+            *queue_len = 0;
+        }
+    }
+
+    async fn parse_file(
+        fs: &Arc<dyn Fs>,
+        pending_file: PendingFile,
+        retriever: &mut CodeContextRetriever,
+        batch_files_tx: &channel::Sender<EmbeddingJob>,
+        parsing_files_rx: &channel::Receiver<PendingFile>,
+        db_update_tx: &channel::Sender<DbOperation>,
+    ) {
+        if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() {
+            if let Some(documents) = retriever
+                .parse_file_with_template(
+                    &pending_file.relative_path,
+                    &content,
+                    pending_file.language,
+                )
+                .log_err()
+            {
+                log::trace!(
+                    "parsed path {:?}: {} documents",
+                    pending_file.relative_path,
+                    documents.len()
+                );
+
+                if documents.len() == 0 {
+                    db_update_tx
+                        .send(DbOperation::InsertFile {
+                            worktree_id: pending_file.worktree_db_id,
+                            documents,
+                            path: pending_file.relative_path,
+                            mtime: pending_file.modified_time,
+                            job_handle: pending_file.job_handle,
+                        })
+                        .await
+                        .unwrap();
+                } else {
+                    batch_files_tx
+                        .try_send(EmbeddingJob::Enqueue {
+                            worktree_id: pending_file.worktree_db_id,
+                            path: pending_file.relative_path,
+                            mtime: pending_file.modified_time,
+                            job_handle: pending_file.job_handle,
+                            documents,
+                        })
+                        .unwrap();
+                }
+            }
+        }
+
+        if parsing_files_rx.len() == 0 {
+            batch_files_tx.try_send(EmbeddingJob::Flush).unwrap();
+        }
+    }
+
+    fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
+        let (tx, rx) = oneshot::channel();
+        self.db_update_tx
+            .try_send(DbOperation::FindOrCreateWorktree { path, sender: tx })
+            .unwrap();
+        async move { rx.await? }
+    }
+
+    fn get_file_mtimes(
+        &self,
+        worktree_id: i64,
+    ) -> impl Future<Output = Result<HashMap<PathBuf, SystemTime>>> {
+        let (tx, rx) = oneshot::channel();
+        self.db_update_tx
+            .try_send(DbOperation::FileMTimes {
+                worktree_id,
+                sender: tx,
+            })
+            .unwrap();
+        async move { rx.await? }
+    }
+
+    fn worktree_previously_indexed(&self, path: Arc<Path>) -> impl Future<Output = Result<bool>> {
+        let (tx, rx) = oneshot::channel();
+        self.db_update_tx
+            .try_send(DbOperation::WorktreePreviouslyIndexed { path, sender: tx })
+            .unwrap();
+        async move { rx.await? }
+    }
+
+    pub fn project_previously_indexed(
+        &mut self,
+        project: ModelHandle<Project>,
+        cx: &mut ModelContext<Self>,
+    ) -> Task<Result<bool>> {
+        let worktree_scans_complete = project
+            .read(cx)
+            .worktrees(cx)
+            .map(|worktree| {
+                let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete();
+                async move {
+                    scan_complete.await;
+                }
+            })
+            .collect::<Vec<_>>();
+
+        let worktrees_indexed_previously = project
+            .read(cx)
+            .worktrees(cx)
+            .map(|worktree| self.worktree_previously_indexed(worktree.read(cx).abs_path()))
+            .collect::<Vec<_>>();
+
+        cx.spawn(|_, _cx| async move {
+            futures::future::join_all(worktree_scans_complete).await;
+
+            let worktree_indexed_previously =
+                futures::future::join_all(worktrees_indexed_previously).await;
+
+            Ok(worktree_indexed_previously
+                .iter()
+                .filter(|worktree| worktree.is_ok())
+                .all(|v| v.as_ref().log_err().is_some_and(|v| v.to_owned())))
+        })
+    }
+
+    pub fn index_project(
+        &mut self,
+        project: ModelHandle<Project>,
+        cx: &mut ModelContext<Self>,
+    ) -> Task<Result<(usize, watch::Receiver<usize>)>> {
+        let t0 = Instant::now();
+        let worktree_scans_complete = project
+            .read(cx)
+            .worktrees(cx)
+            .map(|worktree| {
+                let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete();
+                async move {
+                    scan_complete.await;
+                }
+            })
+            .collect::<Vec<_>>();
+        let worktree_db_ids = project
+            .read(cx)
+            .worktrees(cx)
+            .map(|worktree| {
+                self.find_or_create_worktree(worktree.read(cx).abs_path().to_path_buf())
+            })
+            .collect::<Vec<_>>();
+
+        let language_registry = self.language_registry.clone();
+        let db_update_tx = self.db_update_tx.clone();
+        let parsing_files_tx = self.parsing_files_tx.clone();
+
+        cx.spawn(|this, mut cx| async move {
+            futures::future::join_all(worktree_scans_complete).await;
+
+            let worktree_db_ids = futures::future::join_all(worktree_db_ids).await;
+
+            let worktrees = project.read_with(&cx, |project, cx| {
+                project
+                    .worktrees(cx)
+                    .map(|worktree| worktree.read(cx).snapshot())
+                    .collect::<Vec<_>>()
+            });
+
+            let mut worktree_file_mtimes = HashMap::new();
+            let mut db_ids_by_worktree_id = HashMap::new();
+            for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) {
+                let db_id = db_id?;
+                db_ids_by_worktree_id.insert(worktree.id(), db_id);
+                worktree_file_mtimes.insert(
+                    worktree.id(),
+                    this.read_with(&cx, |this, _| this.get_file_mtimes(db_id))
+                        .await?,
+                );
+            }
+
+            let (job_count_tx, job_count_rx) = watch::channel_with(0);
+            let job_count_tx = Arc::new(Mutex::new(job_count_tx));
+            this.update(&mut cx, |this, _| {
+                this.projects.insert(
+                    project.downgrade(),
+                    ProjectState {
+                        worktree_db_ids: db_ids_by_worktree_id
+                            .iter()
+                            .map(|(a, b)| (*a, *b))
+                            .collect(),
+                        outstanding_job_count_rx: job_count_rx.clone(),
+                        _outstanding_job_count_tx: job_count_tx.clone(),
+                    },
+                );
+            });
+
+            cx.background()
+                .spawn(async move {
+                    let mut count = 0;
+                    for worktree in worktrees.into_iter() {
+                        let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap();
+                        for file in worktree.files(false, 0) {
+                            let absolute_path = worktree.absolutize(&file.path);
+
+                            if let Ok(language) = language_registry
+                                .language_for_file(&absolute_path, None)
+                                .await
+                            {
+                                if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref())
+                                    && language
+                                        .grammar()
+                                        .and_then(|grammar| grammar.embedding_config.as_ref())
+                                        .is_none()
+                                {
+                                    continue;
+                                }
+
+                                let path_buf = file.path.to_path_buf();
+                                let stored_mtime = file_mtimes.remove(&file.path.to_path_buf());
+                                let already_stored = stored_mtime
+                                    .map_or(false, |existing_mtime| existing_mtime == file.mtime);
+
+                                if !already_stored {
+                                    count += 1;
+                                    *job_count_tx.lock().borrow_mut() += 1;
+                                    let job_handle = JobHandle {
+                                        tx: Arc::downgrade(&job_count_tx),
+                                    };
+                                    parsing_files_tx
+                                        .try_send(PendingFile {
+                                            worktree_db_id: db_ids_by_worktree_id[&worktree.id()],
+                                            relative_path: path_buf,
+                                            absolute_path,
+                                            language,
+                                            job_handle,
+                                            modified_time: file.mtime,
+                                        })
+                                        .unwrap();
+                                }
+                            }
+                        }
+                        for file in file_mtimes.keys() {
+                            db_update_tx
+                                .try_send(DbOperation::Delete {
+                                    worktree_id: db_ids_by_worktree_id[&worktree.id()],
+                                    path: file.to_owned(),
+                                })
+                                .unwrap();
+                        }
+                    }
+
+                    log::trace!(
+                        "walking worktree took {:?} milliseconds",
+                        t0.elapsed().as_millis()
+                    );
+                    anyhow::Ok((count, job_count_rx))
+                })
+                .await
+        })
+    }
+
+    pub fn outstanding_job_count_rx(
+        &self,
+        project: &ModelHandle<Project>,
+    ) -> Option<watch::Receiver<usize>> {
+        Some(
+            self.projects
+                .get(&project.downgrade())?
+                .outstanding_job_count_rx
+                .clone(),
+        )
+    }
+
+    pub fn search_project(
+        &mut self,
+        project: ModelHandle<Project>,
+        phrase: String,
+        limit: usize,
+        include_globs: Vec<GlobMatcher>,
+        exclude_globs: Vec<GlobMatcher>,
+        cx: &mut ModelContext<Self>,
+    ) -> Task<Result<Vec<SearchResult>>> {
+        let project_state = if let Some(state) = self.projects.get(&project.downgrade()) {
+            state
+        } else {
+            return Task::ready(Err(anyhow!("project not added")));
+        };
+
+        let worktree_db_ids = project
+            .read(cx)
+            .worktrees(cx)
+            .filter_map(|worktree| {
+                let worktree_id = worktree.read(cx).id();
+                project_state.db_id_for_worktree_id(worktree_id)
+            })
+            .collect::<Vec<_>>();
+
+        let embedding_provider = self.embedding_provider.clone();
+        let database_url = self.database_url.clone();
+        let fs = self.fs.clone();
+        cx.spawn(|this, mut cx| async move {
+            let documents = cx
+                .background()
+                .spawn(async move {
+                    let database = VectorDatabase::new(fs, database_url).await?;
+
+                    let phrase_embedding = embedding_provider
+                        .embed_batch(vec![&phrase])
+                        .await?
+                        .into_iter()
+                        .next()
+                        .unwrap();
+
+                    database.top_k_search(
+                        &worktree_db_ids,
+                        &phrase_embedding,
+                        limit,
+                        include_globs,
+                        exclude_globs,
+                    )
+                })
+                .await?;
+
+            let mut tasks = Vec::new();
+            let mut ranges = Vec::new();
+            let weak_project = project.downgrade();
+            project.update(&mut cx, |project, cx| {
+                for (worktree_db_id, file_path, byte_range) in documents {
+                    let project_state =
+                        if let Some(state) = this.read(cx).projects.get(&weak_project) {
+                            state
+                        } else {
+                            return Err(anyhow!("project not added"));
+                        };
+                    if let Some(worktree_id) = project_state.worktree_id_for_db_id(worktree_db_id) {
+                        tasks.push(project.open_buffer((worktree_id, file_path), cx));
+                        ranges.push(byte_range);
+                    }
+                }
+
+                Ok(())
+            })?;
+
+            let buffers = futures::future::join_all(tasks).await;
+
+            Ok(buffers
+                .into_iter()
+                .zip(ranges)
+                .filter_map(|(buffer, range)| {
+                    let buffer = buffer.log_err()?;
+                    let range = buffer.read_with(&cx, |buffer, _| {
+                        buffer.anchor_before(range.start)..buffer.anchor_after(range.end)
+                    });
+                    Some(SearchResult { buffer, range })
+                })
+                .collect::<Vec<_>>())
+        })
+    }
+}
+
+impl Entity for SemanticIndex {
+    type Event = ();
+}
+
+impl Drop for JobHandle {
+    fn drop(&mut self) {
+        if let Some(tx) = self.tx.upgrade() {
+            let mut tx = tx.lock();
+            *tx.borrow_mut() -= 1;
+        }
+    }
+}

crates/vector_store/src/vector_store_settings.rs → crates/semantic_index/src/semantic_index_settings.rs 🔗

@@ -4,21 +4,21 @@ use serde::{Deserialize, Serialize};
 use settings::Setting;
 
 #[derive(Deserialize, Debug)]
-pub struct VectorStoreSettings {
+pub struct SemanticIndexSettings {
     pub enabled: bool,
     pub reindexing_delay_seconds: usize,
 }
 
 #[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
-pub struct VectorStoreSettingsContent {
+pub struct SemanticIndexSettingsContent {
     pub enabled: Option<bool>,
     pub reindexing_delay_seconds: Option<usize>,
 }
 
-impl Setting for VectorStoreSettings {
-    const KEY: Option<&'static str> = Some("vector_store");
+impl Setting for SemanticIndexSettings {
+    const KEY: Option<&'static str> = Some("semantic_index");
 
-    type FileContent = VectorStoreSettingsContent;
+    type FileContent = SemanticIndexSettingsContent;
 
     fn load(
         default_value: &Self::FileContent,

crates/semantic_index/src/semantic_index_tests.rs 🔗

@@ -0,0 +1,1142 @@
+use crate::{
+    db::dot,
+    embedding::EmbeddingProvider,
+    parsing::{subtract_ranges, CodeContextRetriever, Document},
+    semantic_index_settings::SemanticIndexSettings,
+    SearchResult, SemanticIndex,
+};
+use anyhow::Result;
+use async_trait::async_trait;
+use globset::Glob;
+use gpui::{Task, TestAppContext};
+use language::{Language, LanguageConfig, LanguageRegistry, ToOffset};
+use pretty_assertions::assert_eq;
+use project::{project_settings::ProjectSettings, FakeFs, Fs, Project};
+use rand::{rngs::StdRng, Rng};
+use serde_json::json;
+use settings::SettingsStore;
+use std::{
+    path::Path,
+    sync::{
+        atomic::{self, AtomicUsize},
+        Arc,
+    },
+};
+use unindent::Unindent;
+
+#[ctor::ctor]
+fn init_logger() {
+    if std::env::var("RUST_LOG").is_ok() {
+        env_logger::init();
+    }
+}
+
+#[gpui::test]
+async fn test_semantic_index(cx: &mut TestAppContext) {
+    cx.update(|cx| {
+        cx.set_global(SettingsStore::test(cx));
+        settings::register::<SemanticIndexSettings>(cx);
+        settings::register::<ProjectSettings>(cx);
+    });
+
+    let fs = FakeFs::new(cx.background());
+    fs.insert_tree(
+        "/the-root",
+        json!({
+            "src": {
+                "file1.rs": "
+                    fn aaa() {
+                        println!(\"aaaaaaaaaaaa!\");
+                    }
+
+                    fn zzzzz() {
+                        println!(\"SLEEPING\");
+                    }
+                ".unindent(),
+                "file2.rs": "
+                    fn bbb() {
+                        println!(\"bbbbbbbbbbbbb!\");
+                    }
+                ".unindent(),
+                "file3.toml": "
+                    ZZZZZZZZZZZZZZZZZZ = 5
+                ".unindent(),
+            }
+        }),
+    )
+    .await;
+
+    let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
+    let rust_language = rust_lang();
+    let toml_language = toml_lang();
+    languages.add(rust_language);
+    languages.add(toml_language);
+
+    let db_dir = tempdir::TempDir::new("vector-store").unwrap();
+    let db_path = db_dir.path().join("db.sqlite");
+
+    let embedding_provider = Arc::new(FakeEmbeddingProvider::default());
+    let store = SemanticIndex::new(
+        fs.clone(),
+        db_path,
+        embedding_provider.clone(),
+        languages,
+        cx.to_async(),
+    )
+    .await
+    .unwrap();
+
+    let project = Project::test(fs.clone(), ["/the-root".as_ref()], cx).await;
+    let (file_count, outstanding_file_count) = store
+        .update(cx, |store, cx| store.index_project(project.clone(), cx))
+        .await
+        .unwrap();
+    assert_eq!(file_count, 3);
+    cx.foreground().run_until_parked();
+    assert_eq!(*outstanding_file_count.borrow(), 0);
+
+    let search_results = store
+        .update(cx, |store, cx| {
+            store.search_project(
+                project.clone(),
+                "aaaaaabbbbzz".to_string(),
+                5,
+                vec![],
+                vec![],
+                cx,
+            )
+        })
+        .await
+        .unwrap();
+
+    assert_search_results(
+        &search_results,
+        &[
+            (Path::new("src/file1.rs").into(), 0),
+            (Path::new("src/file2.rs").into(), 0),
+            (Path::new("src/file3.toml").into(), 0),
+            (Path::new("src/file1.rs").into(), 45),
+        ],
+        cx,
+    );
+
+    // Test Include Files Functionality
+    let include_files = vec![Glob::new("*.rs").unwrap().compile_matcher()];
+    let exclude_files = vec![Glob::new("*.rs").unwrap().compile_matcher()];
+    let rust_only_search_results = store
+        .update(cx, |store, cx| {
+            store.search_project(
+                project.clone(),
+                "aaaaaabbbbzz".to_string(),
+                5,
+                include_files,
+                vec![],
+                cx,
+            )
+        })
+        .await
+        .unwrap();
+
+    assert_search_results(
+        &rust_only_search_results,
+        &[
+            (Path::new("src/file1.rs").into(), 0),
+            (Path::new("src/file2.rs").into(), 0),
+            (Path::new("src/file1.rs").into(), 45),
+        ],
+        cx,
+    );
+
+    let no_rust_search_results = store
+        .update(cx, |store, cx| {
+            store.search_project(
+                project.clone(),
+                "aaaaaabbbbzz".to_string(),
+                5,
+                vec![],
+                exclude_files,
+                cx,
+            )
+        })
+        .await
+        .unwrap();
+
+    assert_search_results(
+        &no_rust_search_results,
+        &[(Path::new("src/file3.toml").into(), 0)],
+        cx,
+    );
+
+    fs.save(
+        "/the-root/src/file2.rs".as_ref(),
+        &"
+            fn dddd() { println!(\"ddddd!\"); }
+            struct pqpqpqp {}
+        "
+        .unindent()
+        .into(),
+        Default::default(),
+    )
+    .await
+    .unwrap();
+
+    cx.foreground().run_until_parked();
+
+    let prev_embedding_count = embedding_provider.embedding_count();
+    let (file_count, outstanding_file_count) = store
+        .update(cx, |store, cx| store.index_project(project.clone(), cx))
+        .await
+        .unwrap();
+    assert_eq!(file_count, 1);
+
+    cx.foreground().run_until_parked();
+    assert_eq!(*outstanding_file_count.borrow(), 0);
+
+    assert_eq!(
+        embedding_provider.embedding_count() - prev_embedding_count,
+        2
+    );
+}
+
+#[track_caller]
+fn assert_search_results(
+    actual: &[SearchResult],
+    expected: &[(Arc<Path>, usize)],
+    cx: &TestAppContext,
+) {
+    let actual = actual
+        .iter()
+        .map(|search_result| {
+            search_result.buffer.read_with(cx, |buffer, _cx| {
+                (
+                    buffer.file().unwrap().path().clone(),
+                    search_result.range.start.to_offset(buffer),
+                )
+            })
+        })
+        .collect::<Vec<_>>();
+    assert_eq!(actual, expected);
+}
+
+#[gpui::test]
+async fn test_code_context_retrieval_rust() {
+    let language = rust_lang();
+    let mut retriever = CodeContextRetriever::new();
+
+    let text = "
+        /// A doc comment
+        /// that spans multiple lines
+        #[gpui::test]
+        fn a() {
+            b
+        }
+
+        impl C for D {
+        }
+
+        impl E {
+            // This is also a preceding comment
+            pub fn function_1() -> Option<()> {
+                todo!();
+            }
+
+            // This is a preceding comment
+            fn function_2() -> Result<()> {
+                todo!();
+            }
+        }
+    "
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[
+            (
+                "
+                /// A doc comment
+                /// that spans multiple lines
+                #[gpui::test]
+                fn a() {
+                    b
+                }"
+                .unindent(),
+                text.find("fn a").unwrap(),
+            ),
+            (
+                "
+                impl C for D {
+                }"
+                .unindent(),
+                text.find("impl C").unwrap(),
+            ),
+            (
+                "
+                impl E {
+                    // This is also a preceding comment
+                    pub fn function_1() -> Option<()> { /* ... */ }
+
+                    // This is a preceding comment
+                    fn function_2() -> Result<()> { /* ... */ }
+                }"
+                .unindent(),
+                text.find("impl E").unwrap(),
+            ),
+            (
+                "
+                // This is also a preceding comment
+                pub fn function_1() -> Option<()> {
+                    todo!();
+                }"
+                .unindent(),
+                text.find("pub fn function_1").unwrap(),
+            ),
+            (
+                "
+                // This is a preceding comment
+                fn function_2() -> Result<()> {
+                    todo!();
+                }"
+                .unindent(),
+                text.find("fn function_2").unwrap(),
+            ),
+        ],
+    );
+}
+
+#[gpui::test]
+async fn test_code_context_retrieval_json() {
+    let language = json_lang();
+    let mut retriever = CodeContextRetriever::new();
+
+    let text = r#"
+        {
+            "array": [1, 2, 3, 4],
+            "string": "abcdefg",
+            "nested_object": {
+                "array_2": [5, 6, 7, 8],
+                "string_2": "hijklmnop",
+                "boolean": true,
+                "none": null
+            }
+        }
+    "#
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language.clone()).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[(
+            r#"
+                {
+                    "array": [],
+                    "string": "",
+                    "nested_object": {
+                        "array_2": [],
+                        "string_2": "",
+                        "boolean": true,
+                        "none": null
+                    }
+                }"#
+            .unindent(),
+            text.find("{").unwrap(),
+        )],
+    );
+
+    let text = r#"
+        [
+            {
+                "name": "somebody",
+                "age": 42
+            },
+            {
+                "name": "somebody else",
+                "age": 43
+            }
+        ]
+    "#
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language.clone()).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[(
+            r#"
+            [{
+                    "name": "",
+                    "age": 42
+                }]"#
+            .unindent(),
+            text.find("[").unwrap(),
+        )],
+    );
+}
+
+fn assert_documents_eq(
+    documents: &[Document],
+    expected_contents_and_start_offsets: &[(String, usize)],
+) {
+    assert_eq!(
+        documents
+            .iter()
+            .map(|document| (document.content.clone(), document.range.start))
+            .collect::<Vec<_>>(),
+        expected_contents_and_start_offsets
+    );
+}
+
+#[gpui::test]
+async fn test_code_context_retrieval_javascript() {
+    let language = js_lang();
+    let mut retriever = CodeContextRetriever::new();
+
+    let text = "
+        /* globals importScripts, backend */
+        function _authorize() {}
+
+        /**
+         * Sometimes the frontend build is way faster than backend.
+         */
+        export async function authorizeBank() {
+            _authorize(pushModal, upgradingAccountId, {});
+        }
+
+        export class SettingsPage {
+            /* This is a test setting */
+            constructor(page) {
+                this.page = page;
+            }
+        }
+
+        /* This is a test comment */
+        class TestClass {}
+
+        /* Schema for editor_events in Clickhouse. */
+        export interface ClickhouseEditorEvent {
+            installation_id: string
+            operation: string
+        }
+        "
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language.clone()).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[
+            (
+                "
+            /* globals importScripts, backend */
+            function _authorize() {}"
+                    .unindent(),
+                37,
+            ),
+            (
+                "
+            /**
+             * Sometimes the frontend build is way faster than backend.
+             */
+            export async function authorizeBank() {
+                _authorize(pushModal, upgradingAccountId, {});
+            }"
+                .unindent(),
+                131,
+            ),
+            (
+                "
+                export class SettingsPage {
+                    /* This is a test setting */
+                    constructor(page) {
+                        this.page = page;
+                    }
+                }"
+                .unindent(),
+                225,
+            ),
+            (
+                "
+                /* This is a test setting */
+                constructor(page) {
+                    this.page = page;
+                }"
+                .unindent(),
+                290,
+            ),
+            (
+                "
+                /* This is a test comment */
+                class TestClass {}"
+                    .unindent(),
+                374,
+            ),
+            (
+                "
+                /* Schema for editor_events in Clickhouse. */
+                export interface ClickhouseEditorEvent {
+                    installation_id: string
+                    operation: string
+                }"
+                .unindent(),
+                440,
+            ),
+        ],
+    )
+}
+
+#[gpui::test]
+async fn test_code_context_retrieval_elixir() {
+    let language = elixir_lang();
+    let mut retriever = CodeContextRetriever::new();
+
+    let text = r#"
+        defmodule File.Stream do
+            @moduledoc """
+            Defines a `File.Stream` struct returned by `File.stream!/3`.
+
+            The following fields are public:
+
+            * `path`          - the file path
+            * `modes`         - the file modes
+            * `raw`           - a boolean indicating if bin functions should be used
+            * `line_or_bytes` - if reading should read lines or a given number of bytes
+            * `node`          - the node the file belongs to
+
+            """
+
+            defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
+
+            @type t :: %__MODULE__{}
+
+            @doc false
+            def __build__(path, modes, line_or_bytes) do
+            raw = :lists.keyfind(:encoding, 1, modes) == false
+
+            modes =
+                case raw do
+                true ->
+                    case :lists.keyfind(:read_ahead, 1, modes) do
+                    {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+                    {:read_ahead, _} -> [:raw | modes]
+                    false -> [:raw, :read_ahead | modes]
+                    end
+
+                false ->
+                    modes
+                end
+
+            %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+            end"#
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language.clone()).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[(
+            r#"
+        defmodule File.Stream do
+            @moduledoc """
+            Defines a `File.Stream` struct returned by `File.stream!/3`.
+
+            The following fields are public:
+
+            * `path`          - the file path
+            * `modes`         - the file modes
+            * `raw`           - a boolean indicating if bin functions should be used
+            * `line_or_bytes` - if reading should read lines or a given number of bytes
+            * `node`          - the node the file belongs to
+
+            """
+
+            defstruct path: nil, modes: [], line_or_bytes: :line, raw: true, node: nil
+
+            @type t :: %__MODULE__{}
+
+            @doc false
+            def __build__(path, modes, line_or_bytes) do
+            raw = :lists.keyfind(:encoding, 1, modes) == false
+
+            modes =
+                case raw do
+                true ->
+                    case :lists.keyfind(:read_ahead, 1, modes) do
+                    {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+                    {:read_ahead, _} -> [:raw | modes]
+                    false -> [:raw, :read_ahead | modes]
+                    end
+
+                false ->
+                    modes
+                end
+
+            %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+            end"#
+                .unindent(),
+            0,
+        ),(r#"
+            @doc false
+            def __build__(path, modes, line_or_bytes) do
+            raw = :lists.keyfind(:encoding, 1, modes) == false
+
+            modes =
+                case raw do
+                true ->
+                    case :lists.keyfind(:read_ahead, 1, modes) do
+                    {:read_ahead, false} -> [:raw | :lists.keydelete(:read_ahead, 1, modes)]
+                    {:read_ahead, _} -> [:raw | modes]
+                    false -> [:raw, :read_ahead | modes]
+                    end
+
+                false ->
+                    modes
+                end
+
+            %File.Stream{path: path, modes: modes, raw: raw, line_or_bytes: line_or_bytes, node: node()}
+
+            end"#.unindent(), 574)],
+    );
+}
+
+#[gpui::test]
+async fn test_code_context_retrieval_cpp() {
+    let language = cpp_lang();
+    let mut retriever = CodeContextRetriever::new();
+
+    let text = "
+    /**
+     * @brief Main function
+     * @returns 0 on exit
+     */
+    int main() { return 0; }
+
+    /**
+    * This is a test comment
+    */
+    class MyClass {       // The class
+        public:           // Access specifier
+        int myNum;        // Attribute (int variable)
+        string myString;  // Attribute (string variable)
+    };
+
+    // This is a test comment
+    enum Color { red, green, blue };
+
+    /** This is a preceding block comment
+     * This is the second line
+     */
+    struct {           // Structure declaration
+        int myNum;       // Member (int variable)
+        string myString; // Member (string variable)
+    } myStructure;
+
+    /**
+     * @brief Matrix class.
+     */
+    template <typename T,
+              typename = typename std::enable_if<
+                std::is_integral<T>::value || std::is_floating_point<T>::value,
+                bool>::type>
+    class Matrix2 {
+        std::vector<std::vector<T>> _mat;
+
+        public:
+            /**
+            * @brief Constructor
+            * @tparam Integer ensuring integers are being evaluated and not other
+            * data types.
+            * @param size denoting the size of Matrix as size x size
+            */
+            template <typename Integer,
+                    typename = typename std::enable_if<std::is_integral<Integer>::value,
+                    Integer>::type>
+            explicit Matrix(const Integer size) {
+                for (size_t i = 0; i < size; ++i) {
+                    _mat.emplace_back(std::vector<T>(size, 0));
+                }
+            }
+    }"
+    .unindent();
+
+    let documents = retriever.parse_file(&text, language.clone()).unwrap();
+
+    assert_documents_eq(
+        &documents,
+        &[
+            (
+                "
+        /**
+         * @brief Main function
+         * @returns 0 on exit
+         */
+        int main() { return 0; }"
+                    .unindent(),
+                54,
+            ),
+            (
+                "
+                /**
+                * This is a test comment
+                */
+                class MyClass {       // The class
+                    public:           // Access specifier
+                    int myNum;        // Attribute (int variable)
+                    string myString;  // Attribute (string variable)
+                }"
+                .unindent(),
+                112,
+            ),
+            (
+                "
+                // This is a test comment
+                enum Color { red, green, blue }"
+                    .unindent(),
+                322,
+            ),
+            (
+                "
+                /** This is a preceding block comment
+                 * This is the second line
+                 */
+                struct {           // Structure declaration
+                    int myNum;       // Member (int variable)
+                    string myString; // Member (string variable)
+                } myStructure;"
+                    .unindent(),
+                425,
+            ),
+            (
+                "
+                /**
+                 * @brief Matrix class.
+                 */
+                template <typename T,
+                          typename = typename std::enable_if<
+                            std::is_integral<T>::value || std::is_floating_point<T>::value,
+                            bool>::type>
+                class Matrix2 {
+                    std::vector<std::vector<T>> _mat;
+
+                    public:
+                        /**
+                        * @brief Constructor
+                        * @tparam Integer ensuring integers are being evaluated and not other
+                        * data types.
+                        * @param size denoting the size of Matrix as size x size
+                        */
+                        template <typename Integer,
+                                typename = typename std::enable_if<std::is_integral<Integer>::value,
+                                Integer>::type>
+                        explicit Matrix(const Integer size) {
+                            for (size_t i = 0; i < size; ++i) {
+                                _mat.emplace_back(std::vector<T>(size, 0));
+                            }
+                        }
+                }"
+                .unindent(),
+                612,
+            ),
+            (
+                "
+                explicit Matrix(const Integer size) {
+                    for (size_t i = 0; i < size; ++i) {
+                        _mat.emplace_back(std::vector<T>(size, 0));
+                    }
+                }"
+                .unindent(),
+                1226,
+            ),
+        ],
+    );
+}
+
+#[gpui::test]
+fn test_dot_product(mut rng: StdRng) {
+    assert_eq!(dot(&[1., 0., 0., 0., 0.], &[0., 1., 0., 0., 0.]), 0.);
+    assert_eq!(dot(&[2., 0., 0., 0., 0.], &[3., 1., 0., 0., 0.]), 6.);
+
+    for _ in 0..100 {
+        let size = 1536;
+        let mut a = vec![0.; size];
+        let mut b = vec![0.; size];
+        for (a, b) in a.iter_mut().zip(b.iter_mut()) {
+            *a = rng.gen();
+            *b = rng.gen();
+        }
+
+        assert_eq!(
+            round_to_decimals(dot(&a, &b), 1),
+            round_to_decimals(reference_dot(&a, &b), 1)
+        );
+    }
+
+    fn round_to_decimals(n: f32, decimal_places: i32) -> f32 {
+        let factor = (10.0 as f32).powi(decimal_places);
+        (n * factor).round() / factor
+    }
+
+    fn reference_dot(a: &[f32], b: &[f32]) -> f32 {
+        a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
+    }
+}
+
/// Test double for `EmbeddingProvider` that produces deterministic,
/// content-derived embeddings without making any network calls.
#[derive(Default)]
struct FakeEmbeddingProvider {
    // Number of spans embedded so far; lets tests assert how much
    // (re-)indexing work actually happened.
    embedding_count: AtomicUsize,
}
+
impl FakeEmbeddingProvider {
    /// Total number of spans this provider has been asked to embed.
    fn embedding_count(&self) -> usize {
        self.embedding_count.load(atomic::Ordering::SeqCst)
    }
}
+
+#[async_trait]
+impl EmbeddingProvider for FakeEmbeddingProvider {
+    async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {
+        self.embedding_count
+            .fetch_add(spans.len(), atomic::Ordering::SeqCst);
+        Ok(spans
+            .iter()
+            .map(|span| {
+                let mut result = vec![1.0; 26];
+                for letter in span.chars() {
+                    let letter = letter.to_ascii_lowercase();
+                    if letter as u32 >= 'a' as u32 {
+                        let ix = (letter as u32) - ('a' as u32);
+                        if ix < 26 {
+                            result[ix as usize] += 1.0;
+                        }
+                    }
+                }
+
+                let norm = result.iter().map(|x| x * x).sum::<f32>().sqrt();
+                for x in &mut result {
+                    *x /= norm;
+                }
+
+                result
+            })
+            .collect())
+    }
+}
+
/// Javascript language fixture whose embedding query captures functions,
/// classes, interfaces, enums, and methods (optionally wrapped in export
/// statements) as `@item`s, with preceding comments attached as `@context`.
fn js_lang() -> Arc<Language> {
    Arc::new(
        Language::new(
            LanguageConfig {
                name: "Javascript".into(),
                path_suffixes: vec!["js".into()],
                ..Default::default()
            },
            // NOTE(review): the TSX grammar is used even though only the "js"
            // suffix is registered — presumably so the fixture also parses
            // TypeScript/JSX constructs; confirm this is intentional.
            Some(tree_sitter_typescript::language_tsx()),
        )
        .with_embedding_query(
            &r#"

            (
                (comment)* @context
                .
                [
                (export_statement
                    (function_declaration
                        "async"? @name
                        "function" @name
                        name: (_) @name))
                (function_declaration
                    "async"? @name
                    "function" @name
                    name: (_) @name)
                ] @item
            )

            (
                (comment)* @context
                .
                [
                (export_statement
                    (class_declaration
                        "class" @name
                        name: (_) @name))
                (class_declaration
                    "class" @name
                    name: (_) @name)
                ] @item
            )

            (
                (comment)* @context
                .
                [
                (export_statement
                    (interface_declaration
                        "interface" @name
                        name: (_) @name))
                (interface_declaration
                    "interface" @name
                    name: (_) @name)
                ] @item
            )

            (
                (comment)* @context
                .
                [
                (export_statement
                    (enum_declaration
                        "enum" @name
                        name: (_) @name))
                (enum_declaration
                    "enum" @name
                    name: (_) @name)
                ] @item
            )

            (
                (comment)* @context
                .
                (method_definition
                    [
                        "get"
                        "set"
                        "async"
                        "*"
                        "static"
                    ]* @name
                    name: (_) @name) @item
            )

                    "#
            .unindent(),
        )
        .unwrap(),
    )
}
+
/// Rust language fixture. The embedding query captures structs, enums, impls,
/// traits, functions, and macro definitions as `@item`s, carries preceding
/// line comments and attributes as `@context`, and collapses function bodies
/// (keeping only the braces) so the configured `collapsed_placeholder`
/// (" /* ... */ ") stands in for elided code.
fn rust_lang() -> Arc<Language> {
    Arc::new(
        Language::new(
            LanguageConfig {
                name: "Rust".into(),
                path_suffixes: vec!["rs".into()],
                collapsed_placeholder: " /* ... */ ".to_string(),
                ..Default::default()
            },
            Some(tree_sitter_rust::language()),
        )
        .with_embedding_query(
            r#"
            (
                [(line_comment) (attribute_item)]* @context
                .
                [
                    (struct_item
                        name: (_) @name)

                    (enum_item
                        name: (_) @name)

                    (impl_item
                        trait: (_)? @name
                        "for"? @name
                        type: (_) @name)

                    (trait_item
                        name: (_) @name)

                    (function_item
                        name: (_) @name
                        body: (block
                            "{" @keep
                            "}" @keep) @collapse)

                    (macro_definition
                        name: (_) @name)
                ] @item
            )
            "#,
        )
        .unwrap(),
    )
}
+
+fn json_lang() -> Arc<Language> {
+    Arc::new(
+        Language::new(
+            LanguageConfig {
+                name: "JSON".into(),
+                path_suffixes: vec!["json".into()],
+                ..Default::default()
+            },
+            Some(tree_sitter_json::language()),
+        )
+        .with_embedding_query(
+            r#"
+            (document) @item
+
+            (array
+                "[" @keep
+                .
+                (object)? @keep
+                "]" @keep) @collapse
+
+            (pair value: (string
+                "\"" @keep
+                "\"" @keep) @collapse)
+            "#,
+        )
+        .unwrap(),
+    )
+}
+
+fn toml_lang() -> Arc<Language> {
+    Arc::new(Language::new(
+        LanguageConfig {
+            name: "TOML".into(),
+            path_suffixes: vec!["toml".into()],
+            ..Default::default()
+        },
+        Some(tree_sitter_toml::language()),
+    ))
+}
+
/// C++ language fixture. The embedding query captures function definitions
/// (including pointer/double-pointer/reference declarators), template class
/// declarations, plain classes, enums, and struct declarations as `@item`s,
/// each with preceding comments attached as `@context`.
fn cpp_lang() -> Arc<Language> {
    Arc::new(
        Language::new(
            LanguageConfig {
                name: "CPP".into(),
                path_suffixes: vec!["cpp".into()],
                ..Default::default()
            },
            Some(tree_sitter_cpp::language()),
        )
        .with_embedding_query(
            r#"
            (
                (comment)* @context
                .
                (function_definition
                    (type_qualifier)? @name
                    type: (_)? @name
                    declarator: [
                        (function_declarator
                            declarator: (_) @name)
                        (pointer_declarator
                            "*" @name
                            declarator: (function_declarator
                            declarator: (_) @name))
                        (pointer_declarator
                            "*" @name
                            declarator: (pointer_declarator
                                "*" @name
                            declarator: (function_declarator
                                declarator: (_) @name)))
                        (reference_declarator
                            ["&" "&&"] @name
                            (function_declarator
                            declarator: (_) @name))
                    ]
                    (type_qualifier)? @name) @item
                )

            (
                (comment)* @context
                .
                (template_declaration
                    (class_specifier
                        "class" @name
                        name: (_) @name)
                        ) @item
            )

            (
                (comment)* @context
                .
                (class_specifier
                    "class" @name
                    name: (_) @name) @item
                )

            (
                (comment)* @context
                .
                (enum_specifier
                    "enum" @name
                    name: (_) @name) @item
                )

            (
                (comment)* @context
                .
                (declaration
                    type: (struct_specifier
                    "struct" @name)
                    declarator: (_) @name) @item
            )

            "#,
        )
        .unwrap(),
    )
}
+
+fn elixir_lang() -> Arc<Language> {
+    Arc::new(
+        Language::new(
+            LanguageConfig {
+                name: "Elixir".into(),
+                path_suffixes: vec!["rs".into()],
+                ..Default::default()
+            },
+            Some(tree_sitter_elixir::language()),
+        )
+        .with_embedding_query(
+            r#"
+            (
+                (unary_operator
+                    operator: "@"
+                    operand: (call
+                        target: (identifier) @unary
+                        (#match? @unary "^(doc)$"))
+                    ) @context
+                .
+                (call
+                target: (identifier) @name
+                (arguments
+                [
+                (identifier) @name
+                (call
+                target: (identifier) @name)
+                (binary_operator
+                left: (call
+                target: (identifier) @name)
+                operator: "when")
+                ])
+                (#match? @name "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp)$")) @item
+                )
+
+            (call
+                target: (identifier) @name
+                (arguments (alias) @name)
+                (#match? @name "^(defmodule|defprotocol)$")) @item
+            "#,
+        )
+        .unwrap(),
+    )
+}
+
+#[gpui::test]
+fn test_subtract_ranges() {
+    // collapsed_ranges: Vec<Range<usize>>, keep_ranges: Vec<Range<usize>>
+
+    assert_eq!(
+        subtract_ranges(&[0..5, 10..21], &[0..1, 4..5]),
+        vec![1..4, 10..21]
+    );
+
+    assert_eq!(subtract_ranges(&[0..5], &[1..2]), &[0..1, 2..5]);
+}

crates/vector_store/src/modal.rs 🔗

@@ -1,172 +0,0 @@
-use crate::{SearchResult, VectorStore};
-use editor::{scroll::autoscroll::Autoscroll, Editor};
-use gpui::{
-    actions, elements::*, AnyElement, AppContext, ModelHandle, MouseState, Task, ViewContext,
-    WeakViewHandle,
-};
-use picker::{Picker, PickerDelegate, PickerEvent};
-use project::{Project, ProjectPath};
-use std::{collections::HashMap, sync::Arc, time::Duration};
-use util::ResultExt;
-use workspace::Workspace;
-
-const MIN_QUERY_LEN: usize = 5;
-const EMBEDDING_DEBOUNCE_INTERVAL: Duration = Duration::from_millis(500);
-
-actions!(semantic_search, [Toggle]);
-
-pub type SemanticSearch = Picker<SemanticSearchDelegate>;
-
-pub struct SemanticSearchDelegate {
-    workspace: WeakViewHandle<Workspace>,
-    project: ModelHandle<Project>,
-    vector_store: ModelHandle<VectorStore>,
-    selected_match_index: usize,
-    matches: Vec<SearchResult>,
-    history: HashMap<String, Vec<SearchResult>>,
-}
-
-impl SemanticSearchDelegate {
-    // This is currently searching on every keystroke,
-    // This is wildly overkill, and has the potential to get expensive
-    // We will need to update this to throttle searching
-    pub fn new(
-        workspace: WeakViewHandle<Workspace>,
-        project: ModelHandle<Project>,
-        vector_store: ModelHandle<VectorStore>,
-    ) -> Self {
-        Self {
-            workspace,
-            project,
-            vector_store,
-            selected_match_index: 0,
-            matches: vec![],
-            history: HashMap::new(),
-        }
-    }
-}
-
-impl PickerDelegate for SemanticSearchDelegate {
-    fn placeholder_text(&self) -> Arc<str> {
-        "Search repository in natural language...".into()
-    }
-
-    fn confirm(&mut self, _: bool, cx: &mut ViewContext<SemanticSearch>) {
-        if let Some(search_result) = self.matches.get(self.selected_match_index) {
-            // Open Buffer
-            let search_result = search_result.clone();
-            let buffer = self.project.update(cx, |project, cx| {
-                project.open_buffer(
-                    ProjectPath {
-                        worktree_id: search_result.worktree_id,
-                        path: search_result.file_path.clone().into(),
-                    },
-                    cx,
-                )
-            });
-
-            let workspace = self.workspace.clone();
-            let position = search_result.clone().offset;
-            cx.spawn(|_, mut cx| async move {
-                let buffer = buffer.await?;
-                workspace.update(&mut cx, |workspace, cx| {
-                    let editor = workspace.open_project_item::<Editor>(buffer, cx);
-                    editor.update(cx, |editor, cx| {
-                        editor.change_selections(Some(Autoscroll::center()), cx, |s| {
-                            s.select_ranges([position..position])
-                        });
-                    });
-                })?;
-                Ok::<_, anyhow::Error>(())
-            })
-            .detach_and_log_err(cx);
-            cx.emit(PickerEvent::Dismiss);
-        }
-    }
-
-    fn dismissed(&mut self, _cx: &mut ViewContext<SemanticSearch>) {}
-
-    fn match_count(&self) -> usize {
-        self.matches.len()
-    }
-
-    fn selected_index(&self) -> usize {
-        self.selected_match_index
-    }
-
-    fn set_selected_index(&mut self, ix: usize, _cx: &mut ViewContext<SemanticSearch>) {
-        self.selected_match_index = ix;
-    }
-
-    fn update_matches(&mut self, query: String, cx: &mut ViewContext<SemanticSearch>) -> Task<()> {
-        log::info!("Searching for {:?}...", query);
-        if query.len() < MIN_QUERY_LEN {
-            log::info!("Query below minimum length");
-            return Task::ready(());
-        }
-
-        let vector_store = self.vector_store.clone();
-        let project = self.project.clone();
-        cx.spawn(|this, mut cx| async move {
-            cx.background().timer(EMBEDDING_DEBOUNCE_INTERVAL).await;
-
-            let retrieved_cached = this.update(&mut cx, |this, _| {
-                let delegate = this.delegate_mut();
-                if delegate.history.contains_key(&query) {
-                    let historic_results = delegate.history.get(&query).unwrap().to_owned();
-                    delegate.matches = historic_results.clone();
-                    true
-                } else {
-                    false
-                }
-            });
-
-            if let Some(retrieved) = retrieved_cached.log_err() {
-                if !retrieved {
-                    let task = vector_store.update(&mut cx, |store, cx| {
-                        store.search(project.clone(), query.to_string(), 10, cx)
-                    });
-
-                    if let Some(results) = task.await.log_err() {
-                        log::info!("Not queried previously, searching...");
-                        this.update(&mut cx, |this, _| {
-                            let delegate = this.delegate_mut();
-                            delegate.matches = results.clone();
-                            delegate.history.insert(query, results);
-                        })
-                        .ok();
-                    }
-                } else {
-                    log::info!("Already queried, retrieved directly from cached history");
-                }
-            }
-        })
-    }
-
-    fn render_match(
-        &self,
-        ix: usize,
-        mouse_state: &mut MouseState,
-        selected: bool,
-        cx: &AppContext,
-    ) -> AnyElement<Picker<Self>> {
-        let theme = theme::current(cx);
-        let style = &theme.picker.item;
-        let current_style = style.in_state(selected).style_for(mouse_state);
-
-        let search_result = &self.matches[ix];
-
-        let path = search_result.file_path.to_string_lossy();
-        let name = search_result.name.clone();
-
-        Flex::column()
-            .with_child(Text::new(name, current_style.label.text.clone()).with_soft_wrap(false))
-            .with_child(Label::new(
-                path.to_string(),
-                style.inactive_state().default.label.clone(),
-            ))
-            .contained()
-            .with_style(current_style.container)
-            .into_any()
-    }
-}

crates/vector_store/src/parsing.rs 🔗

@@ -1,115 +0,0 @@
-use std::{path::PathBuf, sync::Arc, time::SystemTime};
-
-use anyhow::{anyhow, Ok, Result};
-use project::Fs;
-use tree_sitter::{Parser, QueryCursor};
-
-use crate::PendingFile;
-
-#[derive(Debug, PartialEq, Clone)]
-pub struct Document {
-    pub offset: usize,
-    pub name: String,
-    pub embedding: Vec<f32>,
-}
-
-#[derive(Debug, PartialEq, Clone)]
-pub struct ParsedFile {
-    pub path: PathBuf,
-    pub mtime: SystemTime,
-    pub documents: Vec<Document>,
-}
-
-const CODE_CONTEXT_TEMPLATE: &str =
-    "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
-
-pub struct CodeContextRetriever {
-    pub parser: Parser,
-    pub cursor: QueryCursor,
-    pub fs: Arc<dyn Fs>,
-}
-
-impl CodeContextRetriever {
-    pub async fn parse_file(
-        &mut self,
-        pending_file: PendingFile,
-    ) -> Result<(ParsedFile, Vec<String>)> {
-        let grammar = pending_file
-            .language
-            .grammar()
-            .ok_or_else(|| anyhow!("no grammar for language"))?;
-        let embedding_config = grammar
-            .embedding_config
-            .as_ref()
-            .ok_or_else(|| anyhow!("no embedding queries"))?;
-
-        let content = self.fs.load(&pending_file.absolute_path).await?;
-
-        self.parser.set_language(grammar.ts_language).unwrap();
-
-        let tree = self
-            .parser
-            .parse(&content, None)
-            .ok_or_else(|| anyhow!("parsing failed"))?;
-
-        let mut documents = Vec::new();
-        let mut context_spans = Vec::new();
-
-        // Iterate through query matches
-        for mat in self.cursor.matches(
-            &embedding_config.query,
-            tree.root_node(),
-            content.as_bytes(),
-        ) {
-            // log::info!("-----MATCH-----");
-
-            let mut name = Vec::new();
-            let mut item: Option<&str> = None;
-            let mut offset: Option<usize> = None;
-            for capture in mat.captures {
-                if capture.index == embedding_config.item_capture_ix {
-                    offset = Some(capture.node.byte_range().start);
-                    item = content.get(capture.node.byte_range());
-                } else if capture.index == embedding_config.name_capture_ix {
-                    if let Some(name_content) = content.get(capture.node.byte_range()) {
-                        name.push(name_content);
-                    }
-                }
-
-                if let Some(context_capture_ix) = embedding_config.context_capture_ix {
-                    if capture.index == context_capture_ix {
-                        if let Some(context) = content.get(capture.node.byte_range()) {
-                            name.push(context);
-                        }
-                    }
-                }
-            }
-
-            if item.is_some() && offset.is_some() && name.len() > 0 {
-                let context_span = CODE_CONTEXT_TEMPLATE
-                    .replace("<path>", pending_file.relative_path.to_str().unwrap())
-                    .replace("<language>", &pending_file.language.name().to_lowercase())
-                    .replace("<item>", item.unwrap());
-
-                // log::info!("Name:       {:?}", name);
-                // log::info!("Span:       {:?}", util::truncate(&context_span, 100));
-
-                context_spans.push(context_span);
-                documents.push(Document {
-                    name: name.join(" "),
-                    offset: offset.unwrap(),
-                    embedding: Vec::new(),
-                })
-            }
-        }
-
-        return Ok((
-            ParsedFile {
-                path: pending_file.relative_path,
-                mtime: pending_file.modified_time,
-                documents,
-            },
-            context_spans,
-        ));
-    }
-}

crates/vector_store/src/vector_store.rs 🔗

@@ -1,770 +0,0 @@
-mod db;
-mod embedding;
-mod modal;
-mod parsing;
-mod vector_store_settings;
-
-#[cfg(test)]
-mod vector_store_tests;
-
-use crate::vector_store_settings::VectorStoreSettings;
-use anyhow::{anyhow, Result};
-use db::VectorDatabase;
-use embedding::{EmbeddingProvider, OpenAIEmbeddings};
-use futures::{channel::oneshot, Future};
-use gpui::{
-    AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext,
-    WeakModelHandle,
-};
-use language::{Language, LanguageRegistry};
-use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
-use parsing::{CodeContextRetriever, ParsedFile};
-use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId};
-use smol::channel;
-use std::{
-    collections::HashMap,
-    path::{Path, PathBuf},
-    sync::Arc,
-    time::{Duration, Instant, SystemTime},
-};
-use tree_sitter::{Parser, QueryCursor};
-use util::{
-    channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME},
-    http::HttpClient,
-    paths::EMBEDDINGS_DIR,
-    ResultExt,
-};
-use workspace::{Workspace, WorkspaceCreated};
-
-const VECTOR_STORE_VERSION: usize = 0;
-const EMBEDDINGS_BATCH_SIZE: usize = 150;
-
-pub fn init(
-    fs: Arc<dyn Fs>,
-    http_client: Arc<dyn HttpClient>,
-    language_registry: Arc<LanguageRegistry>,
-    cx: &mut AppContext,
-) {
-    settings::register::<VectorStoreSettings>(cx);
-
-    let db_file_path = EMBEDDINGS_DIR
-        .join(Path::new(RELEASE_CHANNEL_NAME.as_str()))
-        .join("embeddings_db");
-
-    SemanticSearch::init(cx);
-    cx.add_action(
-        |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext<Workspace>| {
-            if cx.has_global::<ModelHandle<VectorStore>>() {
-                let vector_store = cx.global::<ModelHandle<VectorStore>>().clone();
-                workspace.toggle_modal(cx, |workspace, cx| {
-                    let project = workspace.project().clone();
-                    let workspace = cx.weak_handle();
-                    cx.add_view(|cx| {
-                        SemanticSearch::new(
-                            SemanticSearchDelegate::new(workspace, project, vector_store),
-                            cx,
-                        )
-                    })
-                });
-            }
-        },
-    );
-
-    if *RELEASE_CHANNEL == ReleaseChannel::Stable
-        || !settings::get::<VectorStoreSettings>(cx).enabled
-    {
-        return;
-    }
-
-    cx.spawn(move |mut cx| async move {
-        let vector_store = VectorStore::new(
-            fs,
-            db_file_path,
-            // Arc::new(embedding::DummyEmbeddings {}),
-            Arc::new(OpenAIEmbeddings {
-                client: http_client,
-                executor: cx.background(),
-            }),
-            language_registry,
-            cx.clone(),
-        )
-        .await?;
-
-        cx.update(|cx| {
-            cx.set_global(vector_store.clone());
-            cx.subscribe_global::<WorkspaceCreated, _>({
-                let vector_store = vector_store.clone();
-                move |event, cx| {
-                    let workspace = &event.0;
-                    if let Some(workspace) = workspace.upgrade(cx) {
-                        let project = workspace.read(cx).project().clone();
-                        if project.read(cx).is_local() {
-                            vector_store.update(cx, |store, cx| {
-                                store.add_project(project, cx).detach();
-                            });
-                        }
-                    }
-                }
-            })
-            .detach();
-        });
-
-        anyhow::Ok(())
-    })
-    .detach();
-}
-
-pub struct VectorStore {
-    fs: Arc<dyn Fs>,
-    database_url: Arc<PathBuf>,
-    embedding_provider: Arc<dyn EmbeddingProvider>,
-    language_registry: Arc<LanguageRegistry>,
-    db_update_tx: channel::Sender<DbOperation>,
-    parsing_files_tx: channel::Sender<PendingFile>,
-    _db_update_task: Task<()>,
-    _embed_batch_task: Task<()>,
-    _batch_files_task: Task<()>,
-    _parsing_files_tasks: Vec<Task<()>>,
-    projects: HashMap<WeakModelHandle<Project>, ProjectState>,
-}
-
-struct ProjectState {
-    worktree_db_ids: Vec<(WorktreeId, i64)>,
-    pending_files: HashMap<PathBuf, (PendingFile, SystemTime)>,
-    _subscription: gpui::Subscription,
-}
-
-impl ProjectState {
-    fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option<i64> {
-        self.worktree_db_ids
-            .iter()
-            .find_map(|(worktree_id, db_id)| {
-                if *worktree_id == id {
-                    Some(*db_id)
-                } else {
-                    None
-                }
-            })
-    }
-
-    fn worktree_id_for_db_id(&self, id: i64) -> Option<WorktreeId> {
-        self.worktree_db_ids
-            .iter()
-            .find_map(|(worktree_id, db_id)| {
-                if *db_id == id {
-                    Some(*worktree_id)
-                } else {
-                    None
-                }
-            })
-    }
-
-    fn update_pending_files(&mut self, pending_file: PendingFile, indexing_time: SystemTime) {
-        // If Pending File Already Exists, Replace it with the new one
-        // but keep the old indexing time
-        if let Some(old_file) = self
-            .pending_files
-            .remove(&pending_file.relative_path.clone())
-        {
-            self.pending_files.insert(
-                pending_file.relative_path.clone(),
-                (pending_file, old_file.1),
-            );
-        } else {
-            self.pending_files.insert(
-                pending_file.relative_path.clone(),
-                (pending_file, indexing_time),
-            );
-        };
-    }
-
-    fn get_outstanding_files(&mut self) -> Vec<PendingFile> {
-        let mut outstanding_files = vec![];
-        let mut remove_keys = vec![];
-        for key in self.pending_files.keys().into_iter() {
-            if let Some(pending_details) = self.pending_files.get(key) {
-                let (pending_file, index_time) = pending_details;
-                if index_time <= &SystemTime::now() {
-                    outstanding_files.push(pending_file.clone());
-                    remove_keys.push(key.clone());
-                }
-            }
-        }
-
-        for key in remove_keys.iter() {
-            self.pending_files.remove(key);
-        }
-
-        return outstanding_files;
-    }
-}
-
-#[derive(Clone, Debug)]
-pub struct PendingFile {
-    worktree_db_id: i64,
-    relative_path: PathBuf,
-    absolute_path: PathBuf,
-    language: Arc<Language>,
-    modified_time: SystemTime,
-}
-
-#[derive(Debug, Clone)]
-pub struct SearchResult {
-    pub worktree_id: WorktreeId,
-    pub name: String,
-    pub offset: usize,
-    pub file_path: PathBuf,
-}
-
-enum DbOperation {
-    InsertFile {
-        worktree_id: i64,
-        indexed_file: ParsedFile,
-    },
-    Delete {
-        worktree_id: i64,
-        path: PathBuf,
-    },
-    FindOrCreateWorktree {
-        path: PathBuf,
-        sender: oneshot::Sender<Result<i64>>,
-    },
-    FileMTimes {
-        worktree_id: i64,
-        sender: oneshot::Sender<Result<HashMap<PathBuf, SystemTime>>>,
-    },
-}
-
-enum EmbeddingJob {
-    Enqueue {
-        worktree_id: i64,
-        parsed_file: ParsedFile,
-        document_spans: Vec<String>,
-    },
-    Flush,
-}
-
-impl VectorStore {
-    async fn new(
-        fs: Arc<dyn Fs>,
-        database_url: PathBuf,
-        embedding_provider: Arc<dyn EmbeddingProvider>,
-        language_registry: Arc<LanguageRegistry>,
-        mut cx: AsyncAppContext,
-    ) -> Result<ModelHandle<Self>> {
-        let database_url = Arc::new(database_url);
-
-        let db = cx
-            .background()
-            .spawn({
-                let fs = fs.clone();
-                let database_url = database_url.clone();
-                async move {
-                    if let Some(db_directory) = database_url.parent() {
-                        fs.create_dir(db_directory).await.log_err();
-                    }
-
-                    let db = VectorDatabase::new(database_url.to_string_lossy().to_string())?;
-                    anyhow::Ok(db)
-                }
-            })
-            .await?;
-
-        Ok(cx.add_model(|cx| {
-            // paths_tx -> embeddings_tx -> db_update_tx
-
-            //db_update_tx/rx: Updating Database
-            let (db_update_tx, db_update_rx) = channel::unbounded();
-            let _db_update_task = cx.background().spawn(async move {
-                while let Ok(job) = db_update_rx.recv().await {
-                    match job {
-                        DbOperation::InsertFile {
-                            worktree_id,
-                            indexed_file,
-                        } => {
-                            db.insert_file(worktree_id, indexed_file).log_err();
-                        }
-                        DbOperation::Delete { worktree_id, path } => {
-                            db.delete_file(worktree_id, path).log_err();
-                        }
-                        DbOperation::FindOrCreateWorktree { path, sender } => {
-                            let id = db.find_or_create_worktree(&path);
-                            sender.send(id).ok();
-                        }
-                        DbOperation::FileMTimes {
-                            worktree_id: worktree_db_id,
-                            sender,
-                        } => {
-                            let file_mtimes = db.get_file_mtimes(worktree_db_id);
-                            sender.send(file_mtimes).ok();
-                        }
-                    }
-                }
-            });
-
-            // embed_tx/rx: Embed Batch and Send to Database
-            let (embed_batch_tx, embed_batch_rx) =
-                channel::unbounded::<Vec<(i64, ParsedFile, Vec<String>)>>();
-            let _embed_batch_task = cx.background().spawn({
-                let db_update_tx = db_update_tx.clone();
-                let embedding_provider = embedding_provider.clone();
-                async move {
-                    while let Ok(mut embeddings_queue) = embed_batch_rx.recv().await {
-                        // Construct Batch
-                        let mut document_spans = vec![];
-                        for (_, _, document_span) in embeddings_queue.iter() {
-                            document_spans.extend(document_span.iter().map(|s| s.as_str()));
-                        }
-
-                        if let Ok(embeddings) = embedding_provider.embed_batch(document_spans).await
-                        {
-                            let mut i = 0;
-                            let mut j = 0;
-
-                            for embedding in embeddings.iter() {
-                                while embeddings_queue[i].1.documents.len() == j {
-                                    i += 1;
-                                    j = 0;
-                                }
-
-                                embeddings_queue[i].1.documents[j].embedding = embedding.to_owned();
-                                j += 1;
-                            }
-
-                            for (worktree_id, indexed_file, _) in embeddings_queue.into_iter() {
-                                for document in indexed_file.documents.iter() {
-                                    // TODO: Update this so it doesn't panic
-                                    assert!(
-                                        document.embedding.len() > 0,
-                                        "Document Embedding Not Complete"
-                                    );
-                                }
-
-                                db_update_tx
-                                    .send(DbOperation::InsertFile {
-                                        worktree_id,
-                                        indexed_file,
-                                    })
-                                    .await
-                                    .unwrap();
-                            }
-                        }
-                    }
-                }
-            });
-
-            // batch_tx/rx: Batch Files to Send for Embeddings
-            let (batch_files_tx, batch_files_rx) = channel::unbounded::<EmbeddingJob>();
-            let _batch_files_task = cx.background().spawn(async move {
-                let mut queue_len = 0;
-                let mut embeddings_queue = vec![];
-
-                while let Ok(job) = batch_files_rx.recv().await {
-                    let should_flush = match job {
-                        EmbeddingJob::Enqueue {
-                            document_spans,
-                            worktree_id,
-                            parsed_file,
-                        } => {
-                            queue_len += &document_spans.len();
-                            embeddings_queue.push((worktree_id, parsed_file, document_spans));
-                            queue_len >= EMBEDDINGS_BATCH_SIZE
-                        }
-                        EmbeddingJob::Flush => true,
-                    };
-
-                    if should_flush {
-                        embed_batch_tx.try_send(embeddings_queue).unwrap();
-                        embeddings_queue = vec![];
-                        queue_len = 0;
-                    }
-                }
-            });
-
-            // parsing_files_tx/rx: Parsing Files to Embeddable Documents
-            let (parsing_files_tx, parsing_files_rx) = channel::unbounded::<PendingFile>();
-
-            let mut _parsing_files_tasks = Vec::new();
-            // for _ in 0..cx.background().num_cpus() {
-            for _ in 0..1 {
-                let fs = fs.clone();
-                let parsing_files_rx = parsing_files_rx.clone();
-                let batch_files_tx = batch_files_tx.clone();
-                _parsing_files_tasks.push(cx.background().spawn(async move {
-                    let parser = Parser::new();
-                    let cursor = QueryCursor::new();
-                    let mut retriever = CodeContextRetriever { parser, cursor, fs };
-                    while let Ok(pending_file) = parsing_files_rx.recv().await {
-                        if let Some((indexed_file, document_spans)) =
-                            retriever.parse_file(pending_file.clone()).await.log_err()
-                        {
-                            batch_files_tx
-                                .try_send(EmbeddingJob::Enqueue {
-                                    worktree_id: pending_file.worktree_db_id,
-                                    parsed_file: indexed_file,
-                                    document_spans,
-                                })
-                                .unwrap();
-                        }
-
-                        if parsing_files_rx.len() == 0 {
-                            batch_files_tx.try_send(EmbeddingJob::Flush).unwrap();
-                        }
-                    }
-                }));
-            }
-
-            Self {
-                fs,
-                database_url,
-                embedding_provider,
-                language_registry,
-                db_update_tx,
-                parsing_files_tx,
-                _db_update_task,
-                _embed_batch_task,
-                _batch_files_task,
-                _parsing_files_tasks,
-                projects: HashMap::new(),
-            }
-        }))
-    }
-
-    fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
-        let (tx, rx) = oneshot::channel();
-        self.db_update_tx
-            .try_send(DbOperation::FindOrCreateWorktree { path, sender: tx })
-            .unwrap();
-        async move { rx.await? }
-    }
-
-    fn get_file_mtimes(
-        &self,
-        worktree_id: i64,
-    ) -> impl Future<Output = Result<HashMap<PathBuf, SystemTime>>> {
-        let (tx, rx) = oneshot::channel();
-        self.db_update_tx
-            .try_send(DbOperation::FileMTimes {
-                worktree_id,
-                sender: tx,
-            })
-            .unwrap();
-        async move { rx.await? }
-    }
-
-    fn add_project(
-        &mut self,
-        project: ModelHandle<Project>,
-        cx: &mut ModelContext<Self>,
-    ) -> Task<Result<()>> {
-        let worktree_scans_complete = project
-            .read(cx)
-            .worktrees(cx)
-            .map(|worktree| {
-                let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete();
-                async move {
-                    scan_complete.await;
-                }
-            })
-            .collect::<Vec<_>>();
-        let worktree_db_ids = project
-            .read(cx)
-            .worktrees(cx)
-            .map(|worktree| {
-                self.find_or_create_worktree(worktree.read(cx).abs_path().to_path_buf())
-            })
-            .collect::<Vec<_>>();
-
-        let fs = self.fs.clone();
-        let language_registry = self.language_registry.clone();
-        let database_url = self.database_url.clone();
-        let db_update_tx = self.db_update_tx.clone();
-        let parsing_files_tx = self.parsing_files_tx.clone();
-
-        cx.spawn(|this, mut cx| async move {
-            futures::future::join_all(worktree_scans_complete).await;
-
-            let worktree_db_ids = futures::future::join_all(worktree_db_ids).await;
-
-            if let Some(db_directory) = database_url.parent() {
-                fs.create_dir(db_directory).await.log_err();
-            }
-
-            let worktrees = project.read_with(&cx, |project, cx| {
-                project
-                    .worktrees(cx)
-                    .map(|worktree| worktree.read(cx).snapshot())
-                    .collect::<Vec<_>>()
-            });
-
-            let mut worktree_file_times = HashMap::new();
-            let mut db_ids_by_worktree_id = HashMap::new();
-            for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) {
-                let db_id = db_id?;
-                db_ids_by_worktree_id.insert(worktree.id(), db_id);
-                worktree_file_times.insert(
-                    worktree.id(),
-                    this.read_with(&cx, |this, _| this.get_file_mtimes(db_id))
-                        .await?,
-                );
-            }
-
-            cx.background()
-                .spawn({
-                    let db_ids_by_worktree_id = db_ids_by_worktree_id.clone();
-                    let db_update_tx = db_update_tx.clone();
-                    let language_registry = language_registry.clone();
-                    let parsing_files_tx = parsing_files_tx.clone();
-                    async move {
-                        let t0 = Instant::now();
-                        for worktree in worktrees.into_iter() {
-                            let mut file_mtimes =
-                                worktree_file_times.remove(&worktree.id()).unwrap();
-                            for file in worktree.files(false, 0) {
-                                let absolute_path = worktree.absolutize(&file.path);
-
-                                if let Ok(language) = language_registry
-                                    .language_for_file(&absolute_path, None)
-                                    .await
-                                {
-                                    if language
-                                        .grammar()
-                                        .and_then(|grammar| grammar.embedding_config.as_ref())
-                                        .is_none()
-                                    {
-                                        continue;
-                                    }
-
-                                    let path_buf = file.path.to_path_buf();
-                                    let stored_mtime = file_mtimes.remove(&file.path.to_path_buf());
-                                    let already_stored = stored_mtime
-                                        .map_or(false, |existing_mtime| {
-                                            existing_mtime == file.mtime
-                                        });
-
-                                    if !already_stored {
-                                        parsing_files_tx
-                                            .try_send(PendingFile {
-                                                worktree_db_id: db_ids_by_worktree_id
-                                                    [&worktree.id()],
-                                                relative_path: path_buf,
-                                                absolute_path,
-                                                language,
-                                                modified_time: file.mtime,
-                                            })
-                                            .unwrap();
-                                    }
-                                }
-                            }
-                            for file in file_mtimes.keys() {
-                                db_update_tx
-                                    .try_send(DbOperation::Delete {
-                                        worktree_id: db_ids_by_worktree_id[&worktree.id()],
-                                        path: file.to_owned(),
-                                    })
-                                    .unwrap();
-                            }
-                        }
-                        log::info!(
-                            "Parsing Worktree Completed in {:?}",
-                            t0.elapsed().as_millis()
-                        );
-                    }
-                })
-                .detach();
-
-            // let mut pending_files: Vec<(PathBuf, ((i64, PathBuf, Arc<Language>, SystemTime), SystemTime))> = vec![];
-            this.update(&mut cx, |this, cx| {
-                // The below is managing for updated on save
-                // Currently each time a file is saved, this code is run, and for all the files that were changed, if the current time is
-                // greater than the previous embedded time by the REINDEXING_DELAY variable, we will send the file off to be indexed.
-                let _subscription = cx.subscribe(&project, |this, project, event, cx| {
-                    if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event {
-                        this.project_entries_changed(project, changes.clone(), cx, worktree_id);
-                    }
-                });
-
-                this.projects.insert(
-                    project.downgrade(),
-                    ProjectState {
-                        pending_files: HashMap::new(),
-                        worktree_db_ids: db_ids_by_worktree_id.into_iter().collect(),
-                        _subscription,
-                    },
-                );
-            });
-
-            anyhow::Ok(())
-        })
-    }
-
-    pub fn search(
-        &mut self,
-        project: ModelHandle<Project>,
-        phrase: String,
-        limit: usize,
-        cx: &mut ModelContext<Self>,
-    ) -> Task<Result<Vec<SearchResult>>> {
-        let project_state = if let Some(state) = self.projects.get(&project.downgrade()) {
-            state
-        } else {
-            return Task::ready(Err(anyhow!("project not added")));
-        };
-
-        let worktree_db_ids = project
-            .read(cx)
-            .worktrees(cx)
-            .filter_map(|worktree| {
-                let worktree_id = worktree.read(cx).id();
-                project_state.db_id_for_worktree_id(worktree_id)
-            })
-            .collect::<Vec<_>>();
-
-        let embedding_provider = self.embedding_provider.clone();
-        let database_url = self.database_url.clone();
-        cx.spawn(|this, cx| async move {
-            let documents = cx
-                .background()
-                .spawn(async move {
-                    let database = VectorDatabase::new(database_url.to_string_lossy().into())?;
-
-                    let phrase_embedding = embedding_provider
-                        .embed_batch(vec![&phrase])
-                        .await?
-                        .into_iter()
-                        .next()
-                        .unwrap();
-
-                    database.top_k_search(&worktree_db_ids, &phrase_embedding, limit)
-                })
-                .await?;
-
-            this.read_with(&cx, |this, _| {
-                let project_state = if let Some(state) = this.projects.get(&project.downgrade()) {
-                    state
-                } else {
-                    return Err(anyhow!("project not added"));
-                };
-
-                Ok(documents
-                    .into_iter()
-                    .filter_map(|(worktree_db_id, file_path, offset, name)| {
-                        let worktree_id = project_state.worktree_id_for_db_id(worktree_db_id)?;
-                        Some(SearchResult {
-                            worktree_id,
-                            name,
-                            offset,
-                            file_path,
-                        })
-                    })
-                    .collect())
-            })
-        })
-    }
-
-    fn project_entries_changed(
-        &mut self,
-        project: ModelHandle<Project>,
-        changes: Arc<[(Arc<Path>, ProjectEntryId, PathChange)]>,
-        cx: &mut ModelContext<'_, VectorStore>,
-        worktree_id: &WorktreeId,
-    ) -> Option<()> {
-        let reindexing_delay = settings::get::<VectorStoreSettings>(cx).reindexing_delay_seconds;
-
-        let worktree = project
-            .read(cx)
-            .worktree_for_id(worktree_id.clone(), cx)?
-            .read(cx)
-            .snapshot();
-
-        let worktree_db_id = self
-            .projects
-            .get(&project.downgrade())?
-            .db_id_for_worktree_id(worktree.id())?;
-        let file_mtimes = self.get_file_mtimes(worktree_db_id);
-
-        let language_registry = self.language_registry.clone();
-
-        cx.spawn(|this, mut cx| async move {
-            let file_mtimes = file_mtimes.await.log_err()?;
-
-            for change in changes.into_iter() {
-                let change_path = change.0.clone();
-                let absolute_path = worktree.absolutize(&change_path);
-
-                // Skip if git ignored or symlink
-                if let Some(entry) = worktree.entry_for_id(change.1) {
-                    if entry.is_ignored || entry.is_symlink || entry.is_external {
-                        continue;
-                    }
-                }
-
-                match change.2 {
-                    PathChange::Removed => this.update(&mut cx, |this, _| {
-                        this.db_update_tx
-                            .try_send(DbOperation::Delete {
-                                worktree_id: worktree_db_id,
-                                path: absolute_path,
-                            })
-                            .unwrap();
-                    }),
-                    _ => {
-                        if let Ok(language) = language_registry
-                            .language_for_file(&change_path.to_path_buf(), None)
-                            .await
-                        {
-                            if language
-                                .grammar()
-                                .and_then(|grammar| grammar.embedding_config.as_ref())
-                                .is_none()
-                            {
-                                continue;
-                            }
-
-                            let modified_time =
-                                change_path.metadata().log_err()?.modified().log_err()?;
-
-                            let existing_time = file_mtimes.get(&change_path.to_path_buf());
-                            let already_stored = existing_time
-                                .map_or(false, |existing_time| &modified_time != existing_time);
-
-                            if !already_stored {
-                                this.update(&mut cx, |this, _| {
-                                    let reindex_time = modified_time
-                                        + Duration::from_secs(reindexing_delay as u64);
-
-                                    let project_state =
-                                        this.projects.get_mut(&project.downgrade())?;
-                                    project_state.update_pending_files(
-                                        PendingFile {
-                                            relative_path: change_path.to_path_buf(),
-                                            absolute_path,
-                                            modified_time,
-                                            worktree_db_id,
-                                            language: language.clone(),
-                                        },
-                                        reindex_time,
-                                    );
-
-                                    for file in project_state.get_outstanding_files() {
-                                        this.parsing_files_tx.try_send(file).unwrap();
-                                    }
-                                    Some(())
-                                });
-                            }
-                        }
-                    }
-                }
-            }
-
-            Some(())
-        })
-        .detach();
-
-        Some(())
-    }
-}
-
-impl Entity for VectorStore {
-    type Event = ();
-}

crates/vector_store/src/vector_store_tests.rs 🔗

@@ -1,161 +0,0 @@
-use crate::{
-    db::dot, embedding::EmbeddingProvider, vector_store_settings::VectorStoreSettings, VectorStore,
-};
-use anyhow::Result;
-use async_trait::async_trait;
-use gpui::{Task, TestAppContext};
-use language::{Language, LanguageConfig, LanguageRegistry};
-use project::{project_settings::ProjectSettings, FakeFs, Project};
-use rand::{rngs::StdRng, Rng};
-use serde_json::json;
-use settings::SettingsStore;
-use std::sync::Arc;
-use unindent::Unindent;
-
-#[gpui::test]
-async fn test_vector_store(cx: &mut TestAppContext) {
-    cx.update(|cx| {
-        cx.set_global(SettingsStore::test(cx));
-        settings::register::<VectorStoreSettings>(cx);
-        settings::register::<ProjectSettings>(cx);
-    });
-
-    let fs = FakeFs::new(cx.background());
-    fs.insert_tree(
-        "/the-root",
-        json!({
-            "src": {
-                "file1.rs": "
-                    fn aaa() {
-                        println!(\"aaaa!\");
-                    }
-
-                    fn zzzzzzzzz() {
-                        println!(\"SLEEPING\");
-                    }
-                ".unindent(),
-                "file2.rs": "
-                    fn bbb() {
-                        println!(\"bbbb!\");
-                    }
-                ".unindent(),
-            }
-        }),
-    )
-    .await;
-
-    let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
-    let rust_language = Arc::new(
-        Language::new(
-            LanguageConfig {
-                name: "Rust".into(),
-                path_suffixes: vec!["rs".into()],
-                ..Default::default()
-            },
-            Some(tree_sitter_rust::language()),
-        )
-        .with_embedding_query(
-            r#"
-            (function_item
-                name: (identifier) @name
-                body: (block)) @item
-            "#,
-        )
-        .unwrap(),
-    );
-    languages.add(rust_language);
-
-    let db_dir = tempdir::TempDir::new("vector-store").unwrap();
-    let db_path = db_dir.path().join("db.sqlite");
-
-    let store = VectorStore::new(
-        fs.clone(),
-        db_path,
-        Arc::new(FakeEmbeddingProvider),
-        languages,
-        cx.to_async(),
-    )
-    .await
-    .unwrap();
-
-    let project = Project::test(fs, ["/the-root".as_ref()], cx).await;
-    let worktree_id = project.read_with(cx, |project, cx| {
-        project.worktrees(cx).next().unwrap().read(cx).id()
-    });
-    store
-        .update(cx, |store, cx| store.add_project(project.clone(), cx))
-        .await
-        .unwrap();
-    cx.foreground().run_until_parked();
-
-    let search_results = store
-        .update(cx, |store, cx| {
-            store.search(project.clone(), "aaaa".to_string(), 5, cx)
-        })
-        .await
-        .unwrap();
-
-    assert_eq!(search_results[0].offset, 0);
-    assert_eq!(search_results[0].name, "aaa");
-    assert_eq!(search_results[0].worktree_id, worktree_id);
-}
-
-#[gpui::test]
-fn test_dot_product(mut rng: StdRng) {
-    assert_eq!(dot(&[1., 0., 0., 0., 0.], &[0., 1., 0., 0., 0.]), 0.);
-    assert_eq!(dot(&[2., 0., 0., 0., 0.], &[3., 1., 0., 0., 0.]), 6.);
-
-    for _ in 0..100 {
-        let size = 1536;
-        let mut a = vec![0.; size];
-        let mut b = vec![0.; size];
-        for (a, b) in a.iter_mut().zip(b.iter_mut()) {
-            *a = rng.gen();
-            *b = rng.gen();
-        }
-
-        assert_eq!(
-            round_to_decimals(dot(&a, &b), 1),
-            round_to_decimals(reference_dot(&a, &b), 1)
-        );
-    }
-
-    fn round_to_decimals(n: f32, decimal_places: i32) -> f32 {
-        let factor = (10.0 as f32).powi(decimal_places);
-        (n * factor).round() / factor
-    }
-
-    fn reference_dot(a: &[f32], b: &[f32]) -> f32 {
-        a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
-    }
-}
-
-struct FakeEmbeddingProvider;
-
-#[async_trait]
-impl EmbeddingProvider for FakeEmbeddingProvider {
-    async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {
-        Ok(spans
-            .iter()
-            .map(|span| {
-                let mut result = vec![1.0; 26];
-                for letter in span.chars() {
-                    let letter = letter.to_ascii_lowercase();
-                    if letter as u32 >= 'a' as u32 {
-                        let ix = (letter as u32) - ('a' as u32);
-                        if ix < 26 {
-                            result[ix as usize] += 1.0;
-                        }
-                    }
-                }
-
-                let norm = result.iter().map(|x| x * x).sum::<f32>().sqrt();
-                for x in &mut result {
-                    *x /= norm;
-                }
-
-                result
-            })
-            .collect())
-    }
-}

crates/zed/Cargo.toml 🔗

@@ -64,7 +64,7 @@ terminal_view = { path = "../terminal_view" }
 theme = { path = "../theme" }
 theme_selector = { path = "../theme_selector" }
 util = { path = "../util" }
-vector_store = { path = "../vector_store" }
+semantic_index = { path = "../semantic_index" }
 vim = { path = "../vim" }
 workspace = { path = "../workspace" }
 welcome = { path = "../welcome" }

crates/zed/src/languages/c/embedding.scm 🔗

@@ -0,0 +1,43 @@
+(
+    (comment)* @context
+    .
+    (declaration
+        declarator: [
+            (function_declarator
+                declarator: (_) @name)
+            (pointer_declarator
+                "*" @name
+                declarator: (function_declarator
+                    declarator: (_) @name))
+            (pointer_declarator
+                "*" @name
+                declarator: (pointer_declarator
+                    "*" @name
+                    declarator: (function_declarator
+                        declarator: (_) @name)))
+            ]
+        ) @item
+    )
+
+(
+    (comment)* @context
+    .
+    (function_definition
+        declarator: [
+            (function_declarator
+                declarator: (_) @name
+                )
+            (pointer_declarator
+                "*" @name
+                declarator: (function_declarator
+                    declarator: (_) @name
+                    ))
+            (pointer_declarator
+                "*" @name
+                declarator: (pointer_declarator
+                    "*" @name
+                    declarator: (function_declarator
+                        declarator: (_) @name)))
+            ]
+        ) @item
+    )

crates/zed/src/languages/cpp/embedding.scm 🔗

@@ -0,0 +1,61 @@
+(
+    (comment)* @context
+    .
+    (function_definition
+        (type_qualifier)? @name
+        type: (_)? @name
+        declarator: [
+            (function_declarator
+                declarator: (_) @name)
+            (pointer_declarator
+                "*" @name
+                declarator: (function_declarator
+                declarator: (_) @name))
+            (pointer_declarator
+                "*" @name
+                declarator: (pointer_declarator
+                    "*" @name
+                declarator: (function_declarator
+                    declarator: (_) @name)))
+            (reference_declarator
+                ["&" "&&"] @name
+                (function_declarator
+                declarator: (_) @name))
+        ]
+        (type_qualifier)? @name) @item
+    )
+
+(
+    (comment)* @context
+    .
+    (template_declaration
+        (class_specifier
+            "class" @name
+            name: (_) @name)
+            ) @item
+)
+
+(
+    (comment)* @context
+    .
+    (class_specifier
+        "class" @name
+        name: (_) @name) @item
+    )
+
+(
+    (comment)* @context
+    .
+    (enum_specifier
+        "enum" @name
+        name: (_) @name) @item
+    )
+
+(
+    (comment)* @context
+    .
+    (declaration
+        type: (struct_specifier
+        "struct" @name)
+        declarator: (_) @name) @item
+)

crates/zed/src/languages/elixir/embedding.scm 🔗

@@ -0,0 +1,27 @@
+(
+    (unary_operator
+        operator: "@"
+        operand: (call
+            target: (identifier) @unary
+            (#match? @unary "^(doc)$"))
+        ) @context
+    .
+    (call
+        target: (identifier) @name
+        (arguments
+            [
+            (identifier) @name
+            (call
+                target: (identifier) @name)
+                (binary_operator
+                    left: (call
+                    target: (identifier) @name)
+                    operator: "when")
+            ])
+        (#match? @name "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp)$")) @item
+        )
+
+    (call
+        target: (identifier) @name
+        (arguments (alias) @name)
+        (#match? @name "^(defmodule|defprotocol)$")) @item

crates/zed/src/languages/go/embedding.scm 🔗

@@ -0,0 +1,24 @@
+(
+    (comment)* @context
+    .
+    (type_declaration
+        (type_spec
+            name: (_) @name)
+    ) @item
+)
+
+(
+    (comment)* @context
+    .
+    (function_declaration
+        name: (_) @name
+    ) @item
+)
+
+(
+    (comment)* @context
+    .
+    (method_declaration
+        name: (_) @name
+    ) @item
+)

crates/zed/src/languages/javascript/embedding.scm 🔗

@@ -1,56 +1,71 @@
-; (internal_module
-;     "namespace" @context
-;     name: (_) @name) @item
-
-(enum_declaration
-    "enum" @context
-    name: (_) @name) @item
-
-(function_declaration
-    "async"? @context
-    "function" @context
-    name: (_) @name) @item
-
-(interface_declaration
-    "interface" @context
-    name: (_) @name) @item
-
-; (program
-;     (export_statement
-;         (lexical_declaration
-;             ["let" "const"] @context
-;             (variable_declarator
-;                 name: (_) @name) @item)))
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (function_declaration
+                "async"? @name
+                "function" @name
+                name: (_) @name))
+        (function_declaration
+            "async"? @name
+            "function" @name
+            name: (_) @name)
+    ] @item
+)
 
-(program
-    (lexical_declaration
-        ["let" "const"] @context
-        (variable_declarator
-            name: (_) @name) @item))
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (class_declaration
+                "class" @name
+                name: (_) @name))
+        (class_declaration
+            "class" @name
+            name: (_) @name)
+    ] @item
+)
 
-(class_declaration
-    "class" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (interface_declaration
+                "interface" @name
+                name: (_) @name))
+        (interface_declaration
+            "interface" @name
+            name: (_) @name)
+    ] @item
+)
 
-(method_definition
+(
+    (comment)* @context
+    .
     [
-        "get"
-        "set"
-        "async"
-        "*"
-        "readonly"
-        "static"
-        (override_modifier)
-        (accessibility_modifier)
-        ]* @context
-    name: (_) @name) @item
+        (export_statement
+            (enum_declaration
+                "enum" @name
+                name: (_) @name))
+        (enum_declaration
+            "enum" @name
+            name: (_) @name)
+    ] @item
+)
 
-; (public_field_definition
-;     [
-;         "declare"
-;         "readonly"
-;         "abstract"
-;         "static"
-;         (accessibility_modifier)
-;         ]* @context
-;     name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    (method_definition
+        [
+            "get"
+            "set"
+            "async"
+            "*"
+            "static"
+            ]* @name
+        name: (_) @name) @item
+)

crates/zed/src/languages/json/embedding.scm 🔗

@@ -0,0 +1,14 @@
+; Only produce one embedding for the entire file.
+(document) @item
+
+; Collapse arrays, except for the first object.
+(array
+  "[" @keep
+  .
+  (object)? @keep
+  "]" @keep) @collapse
+
+; Collapse string values (but not keys).
+(pair value: (string
+  "\"" @keep
+  "\"" @keep) @collapse)

crates/zed/src/languages/rust/config.toml 🔗

@@ -10,3 +10,4 @@ brackets = [
     { start = "\"", end = "\"", close = true, newline = false, not_in = ["string"] },
     { start = "/*", end = " */", close = true, newline = false, not_in = ["string", "comment"] },
 ]
+collapsed_placeholder = " /* ... */ "

crates/zed/src/languages/rust/embedding.scm 🔗

@@ -1,36 +1,28 @@
-(struct_item
-    (visibility_modifier)? @context
-    "struct" @context
-    name: (_) @name) @item
+(
+    [(line_comment) (attribute_item)]* @context
+    .
+    [
+        (struct_item
+            name: (_) @name)
 
-(enum_item
-    (visibility_modifier)? @context
-    "enum" @context
-    name: (_) @name) @item
+        (enum_item
+            name: (_) @name)
 
-(impl_item
-    "impl" @context
-    trait: (_)? @name
-    "for"? @context
-    type: (_) @name) @item
+        (impl_item
+            trait: (_)? @name
+            "for"? @name
+            type: (_) @name)
 
-(trait_item
-    (visibility_modifier)? @context
-    "trait" @context
-    name: (_) @name) @item
+        (trait_item
+            name: (_) @name)
 
-(function_item
-    (visibility_modifier)? @context
-    (function_modifiers)? @context
-    "fn" @context
-    name: (_) @name) @item
+        (function_item
+            name: (_) @name
+            body: (block
+                "{" @keep
+                "}" @keep) @collapse)
 
-(function_signature_item
-    (visibility_modifier)? @context
-    (function_modifiers)? @context
-    "fn" @context
-    name: (_) @name) @item
-
-(macro_definition
-    . "macro_rules!" @context
-    name: (_) @name) @item
+        (macro_definition
+            name: (_) @name)
+        ] @item
+    )

crates/zed/src/languages/tsx/embedding.scm 🔗

@@ -1,35 +1,85 @@
-(enum_declaration
-    "enum" @context
-    name: (_) @name) @item
-
-(function_declaration
-    "async"? @context
-    "function" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (function_declaration
+                "async"? @name
+                "function" @name
+                name: (_) @name))
+        (function_declaration
+            "async"? @name
+            "function" @name
+            name: (_) @name)
+        ] @item
+    )
 
-(interface_declaration
-    "interface" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (class_declaration
+                "class" @name
+                name: (_) @name))
+        (class_declaration
+            "class" @name
+            name: (_) @name)
+        ] @item
+    )
 
-(program
-    (lexical_declaration
-        ["let" "const"] @context
-        (variable_declarator
-            name: (_) @name) @item))
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (interface_declaration
+                "interface" @name
+                name: (_) @name))
+        (interface_declaration
+            "interface" @name
+            name: (_) @name)
+        ] @item
+    )
 
-(class_declaration
-    "class" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (enum_declaration
+                "enum" @name
+                name: (_) @name))
+        (enum_declaration
+            "enum" @name
+            name: (_) @name)
+        ] @item
+    )
 
-(method_definition
+(
+    (comment)* @context
+    .
     [
-        "get"
-        "set"
-        "async"
-        "*"
-        "readonly"
-        "static"
-        (override_modifier)
-        (accessibility_modifier)
-        ]* @context
-    name: (_) @name) @item
+        (export_statement
+            (type_alias_declaration
+                "type" @name
+                name: (_) @name))
+        (type_alias_declaration
+            "type" @name
+            name: (_) @name)
+        ] @item
+    )
+
+(
+    (comment)* @context
+    .
+    (method_definition
+        [
+            "get"
+            "set"
+            "async"
+            "*"
+            "static"
+            ]* @name
+        name: (_) @name) @item
+    )

crates/zed/src/languages/typescript/embedding.scm 🔗

@@ -1,59 +1,85 @@
-; (internal_module
-;     "namespace" @context
-;     name: (_) @name) @item
-
-(enum_declaration
-    "enum" @context
-    name: (_) @name) @item
-
-; (type_alias_declaration
-;     "type" @context
-;     name: (_) @name) @item
-
-(function_declaration
-    "async"? @context
-    "function" @context
-    name: (_) @name) @item
-
-(interface_declaration
-    "interface" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (function_declaration
+                "async"? @name
+                "function" @name
+                name: (_) @name))
+        (function_declaration
+            "async"? @name
+            "function" @name
+            name: (_) @name)
+    ] @item
+)
 
-; (export_statement
-;     (lexical_declaration
-;         ["let" "const"] @context
-;         (variable_declarator
-;             name: (_) @name) @item))
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (class_declaration
+                "class" @name
+                name: (_) @name))
+        (class_declaration
+            "class" @name
+            name: (_) @name)
+    ] @item
+)
 
-(program
-    (lexical_declaration
-        ["let" "const"] @context
-        (variable_declarator
-            name: (_) @name) @item))
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (interface_declaration
+                "interface" @name
+                name: (_) @name))
+        (interface_declaration
+            "interface" @name
+            name: (_) @name)
+    ] @item
+)
 
-(class_declaration
-    "class" @context
-    name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    [
+        (export_statement
+            (enum_declaration
+                "enum" @name
+                name: (_) @name))
+        (enum_declaration
+            "enum" @name
+            name: (_) @name)
+    ] @item
+)
 
-(method_definition
+(
+    (comment)* @context
+    .
     [
-        "get"
-        "set"
-        "async"
-        "*"
-        "readonly"
-        "static"
-        (override_modifier)
-        (accessibility_modifier)
-        ]* @context
-    name: (_) @name) @item
+        (export_statement
+            (type_alias_declaration
+                "type" @name
+                name: (_) @name))
+        (type_alias_declaration
+            "type" @name
+            name: (_) @name)
+    ] @item
+)
 
-; (public_field_definition
-;     [
-;         "declare"
-;         "readonly"
-;         "abstract"
-;         "static"
-;         (accessibility_modifier)
-;         ]* @context
-;     name: (_) @name) @item
+(
+    (comment)* @context
+    .
+    (method_definition
+        [
+            "get"
+            "set"
+            "async"
+            "*"
+            "static"
+            ]* @name
+        name: (_) @name) @item
+)

crates/zed/src/main.rs 🔗

@@ -157,7 +157,7 @@ fn main() {
         project_panel::init(Assets, cx);
         diagnostics::init(cx);
         search::init(cx);
-        vector_store::init(fs.clone(), http.clone(), languages.clone(), cx);
+        semantic_index::init(fs.clone(), http.clone(), languages.clone(), cx);
         vim::init(cx);
         terminal_view::init(cx);
         copilot::init(http.clone(), node_runtime, cx);