From 3d89cd10a4c544bb4973861023f1da53954ecbf0 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 21 Aug 2023 16:35:57 +0200 Subject: [PATCH 01/15] added sha1 encoding for each document --- Cargo.lock | 411 ++++++++++---------- crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/db.rs | 21 +- crates/semantic_index/src/embedding.rs | 2 +- crates/semantic_index/src/parsing.rs | 14 + crates/semantic_index/src/semantic_index.rs | 3 +- 6 files changed, 245 insertions(+), 207 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3edf9acab3911af43eb3e84055cb79892e26a27c..8048398ef82aba67c7a37b57d783d2d02d036dd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,9 +88,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +checksum = "6748e8def348ed4d14996fa801f4122cd763fff530258cdc03f64b25f89d3a5a" dependencies = [ "memchr", ] @@ -140,7 +140,7 @@ source = "git+https://github.com/zed-industries/alacritty?rev=33306142195b354ef3 dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -151,7 +151,7 @@ dependencies = [ "alacritty_config", "alacritty_config_derive", "base64 0.13.1", - "bitflags 2.3.3", + "bitflags 2.4.0", "home", "libc", "log", @@ -268,9 +268,9 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c" dependencies = [ "anstyle", "windows-sys", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.72" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "arrayref" @@ -337,7 +337,7 @@ dependencies = [ "futures-core", "futures-io", "once_cell", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", ] @@ -351,7 +351,7 @@ dependencies = [ "futures-core", "futures-io", "memchr", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", ] [[package]] @@ -411,15 +411,15 @@ dependencies = [ "polling", "rustix 0.37.23", "slab", - "socket2", + "socket2 0.4.9", "waker-fn", ] [[package]] name = "async-lock" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" dependencies = [ "event-listener", ] @@ -482,7 +482,7 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -505,7 +505,7 @@ dependencies = [ "log", "memchr", "once_cell", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "pin-utils", "slab", "wasm-bindgen-futures", @@ -519,7 +519,7 @@ checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" dependencies = [ "async-stream-impl", "futures-core", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", ] [[package]] @@ -530,7 +530,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -567,13 +567,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.72" +version = "0.1.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc6dde6e4ed435a4c1ee4e73592f5ba9da2151af10076cc04858746af9352d09" +checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -586,7 +586,7 @@ dependencies = [ "futures-io", "futures-util", "log", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tungstenite 0.16.0", ] @@ -681,12 +681,12 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.9", + "itoa", "matchit", "memchr", "mime", "percent-encoding", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "serde", "serde_json", "serde_urlencoded", @@ -727,7 +727,7 @@ dependencies = [ "futures-util", "http", "mime", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "serde", "serde_json", "tokio", @@ -831,7 +831,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.28", + "syn 2.0.29", "which", ] @@ -858,9 +858,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.3.3" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" dependencies = [ "serde", ] @@ -996,7 +996,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05" dependencies = [ "memchr", - "regex-automata 0.3.4", + "regex-automata 0.3.6", "serde", ] @@ -1156,11 +1156,12 @@ checksum = "a2698f953def977c68f935bb0dfa959375ad4638570e969e2f1e9f433cbf1af6" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", + "libc", ] [[package]] @@ -1251,9 +1252,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.19" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d" +checksum = "03aef18ddf7d879c15ce20f04826ef8418101c7e528014c3eeea13321047dca3" dependencies = [ "clap_builder", "clap_derive 4.3.12", @@ -1262,9 +1263,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.19" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1" +checksum = "f8ce6fffb678c9b80a70b6b6de0aad31df727623a70fd9a842c30cd573e2fa98" dependencies = [ "anstream", "anstyle", @@ -1294,7 +1295,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -1355,7 +1356,7 @@ dependencies = [ "sum_tree", "tempfile", "thiserror", - "time 0.3.24", + "time 0.3.25", "tiny_http", "url", "util", @@ -1457,7 +1458,7 @@ dependencies = [ "sha-1 0.9.8", "sqlx", "theme", - "time 0.3.24", + "time 0.3.25", "tokio", "tokio-tungstenite", "toml 0.5.11", @@ -1984,7 +1985,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "schannel", - "socket2", + "socket2 0.4.9", "winapi 0.3.9", ] @@ -2065,9 +2066,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.3.6" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8810e7e2cf385b1e9b50d68264908ec367ba642c96d02edfe61c39e88e2a3c01" +checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" dependencies = [ "serde", ] @@ -2246,9 +2247,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "304e6508efa593091e97a9abbc10f90aa7ca635b6d2784feff3c89d41dd12272" +checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" [[package]] name = "editor" @@ -2361,9 +2362,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "erased-serde" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da96524cc884f6558f1769b6c46686af2fe8e8b4cd253bd5a3cdba8181b8e070" +checksum = "fc978899517288e3ebbd1a3bfc1d9537dbb87eeab149e53ea490e63bcdff561a" dependencies = [ "serde", ] @@ -2526,13 +2527,13 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" +checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall 0.2.16", + "redox_syscall 0.3.5", "windows-sys", ] @@ -2544,9 +2545,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" dependencies = [ "crc32fast", "miniz_oxide 0.7.1", @@ -2687,7 +2688,7 @@ dependencies = [ "smol", "sum_tree", "tempfile", - "time 0.3.24", + "time 0.3.25", "util", ] @@ -2825,7 +2826,7 @@ dependencies = [ "futures-io", "memchr", "parking", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "waker-fn", ] @@ -2837,7 +2838,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -2866,7 +2867,7 @@ dependencies = [ "futures-sink", "futures-task", "memchr", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "pin-utils", "slab", "tokio-io", @@ -2989,11 +2990,11 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "globset" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aca8bbd8e0707c1887a8bbb7e6b40e228f251ff5d62c8220a4a7a53c73aff006" +checksum = "759c97c1e17c55525b57192c06a267cda0ac5210b222d6b82189a2338fa1c13d" dependencies = [ - "aho-corasick 1.0.2", + "aho-corasick 1.0.4", "bstr", "fnv", "log", @@ -3078,7 +3079,7 @@ dependencies = [ "smol", "sqlez", "sum_tree", - "time 0.3.24", + "time 0.3.25", "tiny-skia", "usvg", "util", @@ -3293,7 +3294,7 @@ checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes 1.4.0", "fnv", - "itoa 1.0.9", + "itoa", ] [[package]] @@ -3304,7 +3305,7 @@ checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes 1.4.0", "http", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", ] [[package]] @@ -3321,9 +3322,9 @@ checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "human_bytes" @@ -3352,9 +3353,9 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.9", - "pin-project-lite 0.2.10", - "socket2", + "itoa", + "pin-project-lite 0.2.12", + "socket2 0.4.9", "tokio", "tower-service", "tracing", @@ -3368,7 +3369,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ "hyper", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", "tokio-io-timeout", ] @@ -3586,7 +3587,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.4", + "rustix 0.38.8", "windows-sys", ] @@ -3626,12 +3627,6 @@ dependencies = [ "either", ] -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - [[package]] name = "itoa" version = "1.0.9" @@ -4058,9 +4053,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" dependencies = [ "serde", "value-bag", @@ -4091,9 +4086,9 @@ dependencies = [ [[package]] name = "lsp-types" -version = "0.94.0" +version = "0.94.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b63735a13a1f9cd4f4835223d828ed9c2e35c8c5e61837774399f558b6a1237" +checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1" dependencies = [ "bitflags 1.3.2", "serde", @@ -4751,9 +4746,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.55" +version = "0.10.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" +checksum = "729b745ad4a5575dd06a3e1af1414bd330ee561c01b3899eb584baeaa8def17e" dependencies = [ "bitflags 1.3.2", "cfg-if 1.0.0", @@ -4772,7 +4767,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -4783,9 +4778,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.90" +version = "0.9.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" +checksum = "866b5f16f90776b9bb8dc1e1802ac6f0513de3a7a7465867bfbc563dc737faac" dependencies = [ "cc", "libc", @@ -4920,7 +4915,7 @@ dependencies = [ "libc", "redox_syscall 0.3.5", "smallvec", - "windows-targets 0.48.1", + "windows-targets 0.48.5", ] [[package]] @@ -5012,12 +5007,12 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 1.9.3", + "indexmap 2.0.0", ] [[package]] @@ -5045,22 +5040,22 @@ checksum = "db8bcd96cb740d03149cbad5518db9fd87126a10ab519c011893b1754134c468" [[package]] name = "pin-project" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "030ad2bc4db10a8944cb0d837f158bdfec4d4a4873ab701a95046770d11f8842" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -5071,9 +5066,9 @@ checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" [[package]] name = "pin-project-lite" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c40d25201921e5ff0c862a505c6557ea88568a4e3ace775ab55e93f2f4f9d57" +checksum = "12cc1b0bf1727a77a54b6654e7b5f1af8604923edc8b81885f8ec92f9e3f0a05" [[package]] name = "pin-utils" @@ -5098,7 +5093,7 @@ dependencies = [ "line-wrap", "quick-xml", "serde", - "time 0.3.24", + "time 0.3.25", ] [[package]] @@ -5163,7 +5158,7 @@ dependencies = [ "concurrent-queue", "libc", "log", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "windows-sys", ] @@ -5213,7 +5208,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" dependencies = [ "proc-macro2", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -5553,9 +5548,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.32" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] @@ -5778,13 +5773,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" +checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" dependencies = [ - "aho-corasick 1.0.2", + "aho-corasick 1.0.4", "memchr", - "regex-automata 0.3.4", + "regex-automata 0.3.6", "regex-syntax 0.7.4", ] @@ -5799,11 +5794,11 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.4" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7b6d6190b7594385f61bd3911cd1be99dfddcfc365a4160cc2ab5bff4aed294" +checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ - "aho-corasick 1.0.2", + "aho-corasick 1.0.4", "memchr", "regex-syntax 0.7.4", ] @@ -5873,7 +5868,7 @@ dependencies = [ "native-tls", "once_cell", "percent-encoding", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "serde", "serde_json", "serde_urlencoded", @@ -6093,7 +6088,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.28", + "syn 2.0.29", "walkdir", ] @@ -6164,7 +6159,7 @@ dependencies = [ "bitflags 1.3.2", "errno 0.2.8", "io-lifetimes 0.5.3", - "itoa 1.0.9", + "itoa", "libc", "linux-raw-sys 0.0.42", "once_cell", @@ -6187,11 +6182,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.4" +version = "0.38.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.0", "errno 0.3.2", "libc", "linux-raw-sys 0.4.5", @@ -6393,7 +6388,7 @@ dependencies = [ "serde_json", "sqlx", "thiserror", - "time 0.3.24", + "time 0.3.25", "tracing", "url", "uuid 1.4.1", @@ -6421,7 +6416,7 @@ dependencies = [ "rust_decimal", "sea-query-derive", "serde_json", - "time 0.3.24", + "time 0.3.25", "uuid 1.4.1", ] @@ -6436,7 +6431,7 @@ dependencies = [ "sea-query", "serde_json", "sqlx", - "time 0.3.24", + "time 0.3.25", "uuid 1.4.1", ] @@ -6564,10 +6559,11 @@ dependencies = [ "serde", "serde_json", "settings", + "sha1", "smol", "tempdir", "theme", - "tiktoken-rs 0.5.0", + "tiktoken-rs 0.5.1", "tree-sitter", "tree-sitter-cpp", "tree-sitter-elixir", @@ -6615,22 +6611,22 @@ checksum = "5a9f47faea3cad316faa914d013d24f471cd90bfca1a0c70f05a3f42c6441e99" [[package]] name = "serde" -version = "1.0.180" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea67f183f058fe88a4e3ec6e2788e003840893b91bac4559cabedd00863b3ed" +checksum = "be9b6f69f1dfd54c3b568ffa45c310d6973a5e5148fd40cf515acaf38cf5bc31" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.180" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e744d7782b686ab3b73267ef05697159cc0e5abbed3f47f9933165e5219036" +checksum = "dc59dfdcbad1437773485e0367fea4b090a2e0a16d9ffc46af47764536a298ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -6655,24 +6651,24 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.104" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" +checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" dependencies = [ "indexmap 2.0.0", - "itoa 1.0.9", + "itoa", "ryu", "serde", ] [[package]] name = "serde_json_lenient" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d7b9ce5b0a63c6269b9623ed828b39259545a6ec0d8a35d6135ad6af6232add" +checksum = "29591aaa3a13f5ad0f2dd1a8a21bcddab11eaae7c3522b20ade2e85e9df52206" dependencies = [ - "indexmap 1.9.3", - "itoa 0.4.8", + "indexmap 2.0.0", + "itoa", "ryu", "serde", ] @@ -6685,7 +6681,7 @@ checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -6704,7 +6700,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.9", + "itoa", "ryu", "serde", ] @@ -6991,6 +6987,16 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "socket2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "spin" version = "0.5.2" @@ -7090,7 +7096,7 @@ dependencies = [ "hkdf", "hmac 0.12.1", "indexmap 1.9.3", - "itoa 1.0.9", + "itoa", "libc", "libsqlite3-sys", "log", @@ -7113,7 +7119,7 @@ dependencies = [ "sqlx-rt", "stringprep", "thiserror", - "time 0.3.24", + "time 0.3.25", "tokio-stream", "url", "uuid 1.4.1", @@ -7236,7 +7242,7 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dc09e9364c2045ab5fa38f7b04d077b3359d30c4c2b3ec4bae67a358bd64326" dependencies = [ - "itoa 1.0.9", + "itoa", "ryu", "sval", ] @@ -7247,7 +7253,7 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ada6f627e38cbb8860283649509d87bc4a5771141daa41c78fd31f2b9485888d" dependencies = [ - "itoa 1.0.9", + "itoa", "ryu", "sval", ] @@ -7312,9 +7318,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.28" +version = "2.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +checksum = "c324c494eba9d92503e6f1ef2e6df781e78f6a7705a0202d9801b198807d518a" dependencies = [ "proc-macro2", "quote", @@ -7398,14 +7404,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.7.0" +version = "3.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5486094ee78b2e5038a6382ed7645bc084dc2ec433426ca4c3cb61e2007b8998" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" dependencies = [ "cfg-if 1.0.0", "fastrand 2.0.0", "redox_syscall 0.3.5", - "rustix 0.38.4", + "rustix 0.38.8", "windows-sys", ] @@ -7552,22 +7558,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.44" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90" +checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.44" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96" +checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -7614,9 +7620,9 @@ dependencies = [ [[package]] name = "tiktoken-rs" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a99d843674a3468b4a9200a565bbe909a0152f95e82a52feae71e6bf2d4b49d" +checksum = "2bf14cb08d8fda6e484c75ec2bfb6bcef48347d47abcd011fa9d56ee995a3da0" dependencies = [ "anyhow", "base64 0.21.2", @@ -7640,12 +7646,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b79eabcd964882a646b3584543ccabeae7869e9ac32a46f6f22b7a5bd405308b" +checksum = "b0fdd63d58b18d663fbdf70e049f00a22c8e42be082203be7f26589213cd75ea" dependencies = [ "deranged", - "itoa 1.0.9", + "itoa", "serde", "time-core", "time-macros", @@ -7710,20 +7716,19 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.29.1" +version = "1.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" dependencies = [ - "autocfg", "backtrace", "bytes 1.4.0", "libc", "mio 0.8.8", "num_cpus", "parking_lot 0.12.1", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "signal-hook-registry", - "socket2", + "socket2 0.5.3", "tokio-macros", "windows-sys", ] @@ -7745,7 +7750,7 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" dependencies = [ - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", ] @@ -7757,7 +7762,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -7788,7 +7793,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" dependencies = [ "futures-core", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", ] @@ -7814,7 +7819,7 @@ dependencies = [ "futures-core", "futures-sink", "log", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", ] @@ -7828,7 +7833,7 @@ dependencies = [ "futures-core", "futures-io", "futures-sink", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tokio", "tracing", ] @@ -7917,7 +7922,7 @@ dependencies = [ "futures-util", "indexmap 1.9.3", "pin-project", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "rand 0.8.5", "slab", "tokio", @@ -7940,7 +7945,7 @@ dependencies = [ "http", "http-body", "http-range-header", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tower", "tower-layer", "tower-service", @@ -7966,7 +7971,7 @@ checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if 1.0.0", "log", - "pin-project-lite 0.2.10", + "pin-project-lite 0.2.12", "tracing-attributes", "tracing-core", ] @@ -7979,7 +7984,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] @@ -8064,9 +8069,9 @@ dependencies = [ [[package]] name = "tree-sitter-c" -version = "0.20.4" +version = "0.20.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1bb73a4101c88775e4fefcd0543ee25e192034484a5bd45cb99eefb997dca9" +checksum = "30b03bdf218020057abee831581a74bff8c298323d6c6cd1a70556430ded9f4b" dependencies = [ "cc", "tree-sitter", @@ -8213,9 +8218,9 @@ dependencies = [ [[package]] name = "tree-sitter-python" -version = "0.20.3" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47ebd9cac632764b2f4389b08517bf2ef895431dd163eb562e3d2062cc23a14" +checksum = "e6c93b1b1fbd0d399db3445f51fd3058e43d0b4dcff62ddbdb46e66550978aa5" dependencies = [ "cc", "tree-sitter", @@ -8242,9 +8247,9 @@ dependencies = [ [[package]] name = "tree-sitter-rust" -version = "0.20.3" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797842733e252dc11ae5d403a18060bf337b822fc2ae5ddfaa6ff4d9cc20bda6" +checksum = "b0832309b0b2b6d33760ce5c0e818cb47e1d72b468516bfe4134408926fa7594" dependencies = [ "cc", "tree-sitter", @@ -8773,7 +8778,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", "wasm-bindgen-shared", ] @@ -8807,7 +8812,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -9044,9 +9049,9 @@ dependencies = [ [[package]] name = "wast" -version = "62.0.1" +version = "63.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ae06f09dbe377b889fbd620ff8fa21e1d49d1d9d364983c0cdbf9870cb9f1f" +checksum = "2560471f60a48b77fccefaf40796fda61c97ce1e790b59dfcec9dc3995c9f63a" dependencies = [ "leb128", "memchr", @@ -9056,11 +9061,11 @@ dependencies = [ [[package]] name = "wat" -version = "1.0.69" +version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "842e15861d203fb4a96d314b0751cdeaf0f6f8b35e8d81d2953af2af5e44e637" +checksum = "3bdc306c2c4c2f2bf2ba69e083731d0d2a77437fc6a350a19db139636e7e416c" dependencies = [ - "wast 62.0.1", + "wast 63.0.0", ] [[package]] @@ -9262,7 +9267,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.48.1", + "windows-targets 0.48.5", ] [[package]] @@ -9271,7 +9276,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.1", + "windows-targets 0.48.5", ] [[package]] @@ -9291,17 +9296,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -9312,9 +9317,9 @@ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_msvc" @@ -9324,9 +9329,9 @@ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_i686_gnu" @@ -9336,9 +9341,9 @@ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_msvc" @@ -9348,9 +9353,9 @@ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_x86_64_gnu" @@ -9360,9 +9365,9 @@ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnullvm" @@ -9372,9 +9377,9 @@ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_msvc" @@ -9384,15 +9389,15 @@ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "winnow" -version = "0.5.2" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bd122eb777186e60c3fdf765a58ac76e41c582f1f535fbf3314434c6b58f3f7" +checksum = "d09770118a7eb1ccaf4a594a221334119a44a814fcb0d31c5b85e83e97227a97" dependencies = [ "memchr", ] @@ -9522,7 +9527,7 @@ name = "xtask" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.19", + "clap 4.3.23", "schemars", "serde_json", "theme", @@ -9703,7 +9708,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.29", ] [[package]] diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 3c7a6ff5df6df4b0ae7de9e0e7754a0e0d5850cc..4e817fcbe2c6dc8a6edac00fc51dd9e5be437b8f 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -38,6 +38,7 @@ parking_lot.workspace = true rand.workspace = true schemars.workspace = true globset.workspace = true +sha1 = "0.10.5" [dev-dependencies] gpui = { path = "../gpui", features = ["test-support"] } diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs index e57a5d733fb2a0b2c68dc6e874b8ac050de5e78b..60ecf3b45fef383e73172c11b0b5ee3d7d48d93d 100644 --- a/crates/semantic_index/src/db.rs +++ b/crates/semantic_index/src/db.rs @@ -26,6 +26,9 @@ pub struct FileRecord { #[derive(Debug)] struct Embedding(pub Vec); +#[derive(Debug)] +struct Sha1(pub Vec); + impl FromSql for Embedding { fn column_result(value: ValueRef) -> FromSqlResult { let bytes = value.as_blob()?; @@ -37,6 +40,17 @@ impl FromSql for Embedding { } } +impl FromSql for Sha1 { + fn column_result(value: ValueRef) -> FromSqlResult { + let bytes = value.as_blob()?; + let sha1: Result, Box> = bincode::deserialize(bytes); + if sha1.is_err() { + return Err(rusqlite::types::FromSqlError::Other(sha1.unwrap_err())); + } + return Ok(Sha1(sha1.unwrap())); + } +} + pub struct VectorDatabase { db: rusqlite::Connection, } @@ -132,6 +146,7 @@ impl VectorDatabase { end_byte INTEGER NOT NULL, name VARCHAR NOT NULL, embedding BLOB NOT NULL, + sha1 BLOB NOT NULL, FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE )", [], @@ -182,15 +197,17 @@ impl VectorDatabase { // I imagine we can speed this up with a bulk insert of some kind. for document in documents { let embedding_blob = bincode::serialize(&document.embedding)?; + let sha_blob = bincode::serialize(&document.sha1)?; self.db.execute( - "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding) VALUES (?1, ?2, ?3, ?4, $5)", + "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)", params![ file_id, document.range.start.to_string(), document.range.end.to_string(), document.name, - embedding_blob + embedding_blob, + sha_blob ], )?; } diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 77457ec7f6e34961ab2a784ef6f0d8068c4c1dbb..4fc247bfcc3233cc55e146fa356a64bcb837a09e 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -39,7 +39,7 @@ struct OpenAIEmbeddingResponse { #[derive(Debug, Deserialize)] struct OpenAIEmbedding { - embedding: Vec, + embedding: Vec, index: usize, object: String, } diff --git a/crates/semantic_index/src/parsing.rs b/crates/semantic_index/src/parsing.rs index cef23862c563f470000306fde5ac32f95a50a458..4aefb0b00d86df9ce6b0a927647687c20e050c83 100644 --- a/crates/semantic_index/src/parsing.rs +++ b/crates/semantic_index/src/parsing.rs @@ -1,5 +1,6 @@ use anyhow::{anyhow, Ok, Result}; use language::{Grammar, Language}; +use sha1::{Digest, Sha1}; use std::{ cmp::{self, Reverse}, collections::HashSet, @@ -15,6 +16,7 @@ pub struct Document { pub range: Range, pub content: String, pub embedding: Vec, + pub sha1: [u8; 20], } const CODE_CONTEXT_TEMPLATE: &str = @@ -63,11 +65,15 @@ impl CodeContextRetriever { .replace("", language_name.as_ref()) .replace("", &content); + let mut sha1 = Sha1::new(); + sha1.update(&document_span); + Ok(vec![Document { range: 0..content.len(), content: document_span, embedding: Vec::new(), name: language_name.to_string(), + sha1: sha1.finalize().into(), }]) } @@ -76,11 +82,15 @@ impl CodeContextRetriever { .replace("", relative_path.to_string_lossy().as_ref()) .replace("", &content); + let mut sha1 = Sha1::new(); + sha1.update(&document_span); + Ok(vec![Document { range: 0..content.len(), content: document_span, embedding: Vec::new(), name: "Markdown".to_string(), + sha1: sha1.finalize().into(), }]) } @@ -253,11 +263,15 @@ impl CodeContextRetriever { ); } + let mut sha1 = Sha1::new(); + sha1.update(&document_content); + documents.push(Document { name, content: document_content, range: item_range.clone(), embedding: vec![], + sha1: sha1.finalize().into(), }) } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 5aaecac733a3d171d36704bda7f0051b6f4db79b..f567ca8770a7e54b695ba0370831c709b0501a4b 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -34,7 +34,7 @@ use util::{ ResultExt, }; -const SEMANTIC_INDEX_VERSION: usize = 6; +const SEMANTIC_INDEX_VERSION: usize = 7; const EMBEDDINGS_BATCH_SIZE: usize = 80; pub fn init( @@ -92,6 +92,7 @@ pub struct SemanticIndex { struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, + file_mtimes: HashMap, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, } From ced2b2aec3e9fee2e598e0a0125f843e05c4e906 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 22 Aug 2023 11:58:48 +0200 Subject: [PATCH 02/15] reworked ProjectState to include additional context --- crates/search/src/project_search.rs | 12 +- crates/semantic_index/src/embedding.rs | 2 +- crates/semantic_index/src/semantic_index.rs | 138 ++++++++++++++++-- .../src/semantic_index_tests.rs | 7 + 4 files changed, 146 insertions(+), 13 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 196d5589f4df881ddd60293d757d9640734f46a2..7e3585656a8925374365cebeb2265df0714f839c 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -640,6 +640,7 @@ impl ProjectSearchView { self.search_options = SearchOptions::none(); let project = self.model.read(cx).project.clone(); + let index_task = semantic_index.update(cx, |semantic_index, cx| { semantic_index.index_project(project, cx) }); @@ -759,7 +760,7 @@ impl ProjectSearchView { } fn new(model: ModelHandle, cx: &mut ViewContext) -> Self { - let project; + let mut project; let excerpts; let mut query_text = String::new(); let mut options = SearchOptions::NONE; @@ -843,6 +844,15 @@ impl ProjectSearchView { .detach(); let filters_enabled = false; + // Initialize Semantic Index if Needed + if SemanticIndex::enabled(cx) { + let model = model.read(cx); + project = model.project.clone(); + SemanticIndex::global(cx).map(|semantic| { + semantic.update(cx, |this, cx| this.initialize_project(project, cx)) + }); + } + // Check if Worktrees have all been previously indexed let mut this = ProjectSearchView { search_id: model.read(cx).search_id, diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 4fc247bfcc3233cc55e146fa356a64bcb837a09e..77457ec7f6e34961ab2a784ef6f0d8068c4c1dbb 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -39,7 +39,7 @@ struct OpenAIEmbeddingResponse { #[derive(Debug, Deserialize)] struct OpenAIEmbedding { - embedding: Vec, + embedding: Vec, index: usize, object: String, } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index f567ca8770a7e54b695ba0370831c709b0501a4b..2b803e36acb766d9b82900b3227d496fd810f08d 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -92,7 +92,8 @@ pub struct SemanticIndex { struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, - file_mtimes: HashMap, + worktree_file_mtimes: HashMap>, + subscription: gpui::Subscription, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, } @@ -113,6 +114,25 @@ impl JobHandle { } } impl ProjectState { + fn new( + subscription: gpui::Subscription, + worktree_db_ids: Vec<(WorktreeId, i64)>, + worktree_file_mtimes: HashMap>, + outstanding_job_count_rx: watch::Receiver, + _outstanding_job_count_tx: Arc>>, + ) -> Self { + let (job_count_tx, job_count_rx) = watch::channel_with(0); + let job_count_tx = Arc::new(Mutex::new(job_count_tx)); + + Self { + worktree_db_ids, + worktree_file_mtimes, + outstanding_job_count_rx, + _outstanding_job_count_tx, + subscription, + } + } + fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option { self.worktree_db_ids .iter() @@ -577,6 +597,84 @@ impl SemanticIndex { }) } + pub fn initialize_project( + &mut self, + project: ModelHandle, + cx: &mut ModelContext, + ) { + let worktree_scans_complete = project + .read(cx) + .worktrees(cx) + .map(|worktree| { + let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete(); + async move { + scan_complete.await; + } + }) + .collect::>(); + + let worktree_db_ids = project + .read(cx) + .worktrees(cx) + .map(|worktree| { + self.find_or_create_worktree(worktree.read(cx).abs_path().to_path_buf()) + }) + .collect::>(); + + let _subscription = cx.subscribe(&project, |this, project, event, cx| { + if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { + todo!(); + // this.project_entries_changed(project, changes, cx, worktree_id); + } + }); + + cx.spawn(|this, mut cx| async move { + futures::future::join_all(worktree_scans_complete).await; + + let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; + let worktrees = project.read_with(&cx, |project, cx| { + project + .worktrees(cx) + .map(|worktree| worktree.read(cx).snapshot()) + .collect::>() + }); + + let mut worktree_file_mtimes = HashMap::new(); + let mut db_ids_by_worktree_id = HashMap::new(); + + for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { + let db_id = db_id?; + db_ids_by_worktree_id.insert(worktree.id(), db_id); + worktree_file_mtimes.insert( + worktree.id(), + this.read_with(&cx, |this, _| this.get_file_mtimes(db_id)) + .await?, + ); + } + + let worktree_db_ids = db_ids_by_worktree_id + .iter() + .map(|(a, b)| (*a, *b)) + .collect(); + + let (job_count_tx, job_count_rx) = watch::channel_with(0); + let job_count_tx = Arc::new(Mutex::new(job_count_tx)); + this.update(&mut cx, |this, _| { + let project_state = ProjectState::new( + _subscription, + worktree_db_ids, + worktree_file_mtimes.clone(), + job_count_rx, + job_count_tx, + ); + this.projects.insert(project.downgrade(), project_state); + }); + + anyhow::Ok(()) + }) + .detach_and_log_err(cx) + } + pub fn index_project( &mut self, project: ModelHandle, @@ -605,6 +703,22 @@ impl SemanticIndex { let db_update_tx = self.db_update_tx.clone(); let parsing_files_tx = self.parsing_files_tx.clone(); + let state = self.projects.get(&project.downgrade()); + let state = if state.is_none() { + return Task::Ready(Some(Err(anyhow!("Project not yet initialized")))); + } else { + state.unwrap() + }; + + let state = state.clone().to_owned(); + + let _subscription = cx.subscribe(&project, |this, project, event, _cx| { + if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { + todo!(); + // this.project_entries_changed(project, changes, cx, worktree_id); + } + }); + cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; @@ -629,20 +743,22 @@ impl SemanticIndex { ); } + let worktree_db_ids = db_ids_by_worktree_id + .iter() + .map(|(a, b)| (*a, *b)) + .collect(); + let (job_count_tx, job_count_rx) = watch::channel_with(0); let job_count_tx = Arc::new(Mutex::new(job_count_tx)); this.update(&mut cx, |this, _| { - this.projects.insert( - project.downgrade(), - ProjectState { - worktree_db_ids: db_ids_by_worktree_id - .iter() - .map(|(a, b)| (*a, *b)) - .collect(), - outstanding_job_count_rx: job_count_rx.clone(), - _outstanding_job_count_tx: job_count_tx.clone(), - }, + let project_state = ProjectState::new( + _subscription, + worktree_db_ids, + worktree_file_mtimes.clone(), + job_count_rx.clone(), + job_count_tx.clone(), ); + this.projects.insert(project.downgrade(), project_state); }); cx.background() diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 07ddce4d37c641e45b46599db73c6686e6421949..0ac5953f0bb2a5771cc9c962ee095208b7895c10 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -86,6 +86,13 @@ async fn test_semantic_index(cx: &mut TestAppContext) { .unwrap(); let project = Project::test(fs.clone(), ["/the-root".as_ref()], cx).await; + + store + .update(cx, |store, cx| { + store.initialize_project(project.clone(), cx) + }) + .await; + let (file_count, outstanding_file_count) = store .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await From aabdfa210f3afbaea6929f2fae319a93b7ab8c67 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 22 Aug 2023 14:45:27 +0200 Subject: [PATCH 03/15] working on initialization + index breakup --- crates/search/src/project_search.rs | 2 +- crates/semantic_index/src/semantic_index.rs | 308 +++++++++++++------- 2 files changed, 197 insertions(+), 113 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 7e3585656a8925374365cebeb2265df0714f839c..ca317c0ded71ab50ce46a1ec5f1af4a04a60991f 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -849,7 +849,7 @@ impl ProjectSearchView { let model = model.read(cx); project = model.project.clone(); SemanticIndex::global(cx).map(|semantic| { - semantic.update(cx, |this, cx| this.initialize_project(project, cx)) + semantic.update(cx, |this, cx| this.initialize_project(project.clone(), cx)); }); } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 2b803e36acb766d9b82900b3227d496fd810f08d..8849b643c5ef00a71c22ebb9d2da5b2658d8057e 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -96,6 +96,7 @@ struct ProjectState { subscription: gpui::Subscription, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, + queue: HashMap>, } #[derive(Clone)] @@ -130,9 +131,25 @@ impl ProjectState { outstanding_job_count_rx, _outstanding_job_count_tx, subscription, + queue: HashMap::new(), } } + fn add_to_queue(&mut self, worktree_id: WorktreeId, operation: IndexOperation) { + if let Some(worktree_queue) = self.queue.get_mut(&worktree_id) { + worktree_queue.push(operation); + } else { + self.queue.insert(worktree_id, vec![operation]); + } + } + + fn pop(&mut self) -> Option { + self.queue + .iter_mut() + .next() + .and_then(|(_, mut entry)| entry.pop()) + } + fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option { self.worktree_db_ids .iter() @@ -158,6 +175,7 @@ impl ProjectState { } } +#[derive(Clone)] pub struct PendingFile { worktree_db_id: i64, relative_path: PathBuf, @@ -167,6 +185,12 @@ pub struct PendingFile { job_handle: JobHandle, } +#[derive(Clone)] +enum IndexOperation { + IndexFile { file: PendingFile }, + DeleteFile { file: PendingFile }, +} + pub struct SearchResult { pub buffer: ModelHandle, pub range: Range, @@ -628,102 +652,12 @@ impl SemanticIndex { } }); - cx.spawn(|this, mut cx| async move { - futures::future::join_all(worktree_scans_complete).await; - - let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; - let worktrees = project.read_with(&cx, |project, cx| { - project - .worktrees(cx) - .map(|worktree| worktree.read(cx).snapshot()) - .collect::>() - }); - - let mut worktree_file_mtimes = HashMap::new(); - let mut db_ids_by_worktree_id = HashMap::new(); - - for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { - let db_id = db_id?; - db_ids_by_worktree_id.insert(worktree.id(), db_id); - worktree_file_mtimes.insert( - worktree.id(), - this.read_with(&cx, |this, _| this.get_file_mtimes(db_id)) - .await?, - ); - } - - let worktree_db_ids = db_ids_by_worktree_id - .iter() - .map(|(a, b)| (*a, *b)) - .collect(); - - let (job_count_tx, job_count_rx) = watch::channel_with(0); - let job_count_tx = Arc::new(Mutex::new(job_count_tx)); - this.update(&mut cx, |this, _| { - let project_state = ProjectState::new( - _subscription, - worktree_db_ids, - worktree_file_mtimes.clone(), - job_count_rx, - job_count_tx, - ); - this.projects.insert(project.downgrade(), project_state); - }); - - anyhow::Ok(()) - }) - .detach_and_log_err(cx) - } - - pub fn index_project( - &mut self, - project: ModelHandle, - cx: &mut ModelContext, - ) -> Task)>> { - let t0 = Instant::now(); - let worktree_scans_complete = project - .read(cx) - .worktrees(cx) - .map(|worktree| { - let scan_complete = worktree.read(cx).as_local().unwrap().scan_complete(); - async move { - scan_complete.await; - } - }) - .collect::>(); - let worktree_db_ids = project - .read(cx) - .worktrees(cx) - .map(|worktree| { - self.find_or_create_worktree(worktree.read(cx).abs_path().to_path_buf()) - }) - .collect::>(); - let language_registry = self.language_registry.clone(); - let db_update_tx = self.db_update_tx.clone(); - let parsing_files_tx = self.parsing_files_tx.clone(); - - let state = self.projects.get(&project.downgrade()); - let state = if state.is_none() { - return Task::Ready(Some(Err(anyhow!("Project not yet initialized")))); - } else { - state.unwrap() - }; - - let state = state.clone().to_owned(); - - let _subscription = cx.subscribe(&project, |this, project, event, _cx| { - if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { - todo!(); - // this.project_entries_changed(project, changes, cx, worktree_id); - } - }); cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; - let worktrees = project.read_with(&cx, |project, cx| { project .worktrees(cx) @@ -733,6 +667,7 @@ impl SemanticIndex { let mut worktree_file_mtimes = HashMap::new(); let mut db_ids_by_worktree_id = HashMap::new(); + for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { let db_id = db_id?; db_ids_by_worktree_id.insert(worktree.id(), db_id); @@ -761,10 +696,12 @@ impl SemanticIndex { this.projects.insert(project.downgrade(), project_state); }); - cx.background() + let worktree_files = cx + .background() .spawn(async move { - let mut count = 0; + let mut worktree_files = HashMap::new(); for worktree in worktrees.into_iter() { + let mut candidate_files = Vec::new(); let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); for file in worktree.files(false, 0) { let absolute_path = worktree.absolutize(&file.path); @@ -773,6 +710,7 @@ impl SemanticIndex { .language_for_file(&absolute_path, None) .await { + // Test if file is valid parseable file if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) && &language.name().as_ref() != &"Markdown" && language @@ -789,40 +727,186 @@ impl SemanticIndex { .map_or(false, |existing_mtime| existing_mtime == file.mtime); if !already_stored { - count += 1; - let job_handle = JobHandle::new(&job_count_tx); - parsing_files_tx - .try_send(PendingFile { + candidate_files.push(IndexOperation::IndexFile { + file: PendingFile { worktree_db_id: db_ids_by_worktree_id[&worktree.id()], relative_path: path_buf, absolute_path, language, job_handle, modified_time: file.mtime, - }) - .unwrap(); + }, + }); } } } - for file in file_mtimes.keys() { - db_update_tx - .try_send(DbOperation::Delete { - worktree_id: db_ids_by_worktree_id[&worktree.id()], - path: file.to_owned(), - }) - .unwrap(); - } + + worktree_files.insert(worktree.id(), candidate_files); } - log::trace!( - "walking worktree took {:?} milliseconds", - t0.elapsed().as_millis() - ); - anyhow::Ok((count, job_count_rx)) + anyhow::Ok(worktree_files) }) - .await + .await?; + + this.update(&mut cx, |this, cx| { + if let Some(project_state) = this.projects.get_mut(&project.downgrade()) { + for (worktree_id, index_operations) in &worktree_files { + for op in index_operations { + project_state.add_to_queue(*worktree_id, op.clone()); + } + } + } + }); + + cx.background().spawn(async move { anyhow::Ok(()) }).await + }) + .detach_and_log_err(cx) + } + + pub fn index_project( + &mut self, + project: ModelHandle, + cx: &mut ModelContext, + ) -> Task)>> { + let state = self.projects.get_mut(&project.downgrade()); + let state = if state.is_none() { + return Task::Ready(Some(Err(anyhow!("Project not yet initialized")))); + } else { + state.unwrap() + }; + + let parsing_files_tx = self.parsing_files_tx.clone(); + let db_update_tx = self.db_update_tx.clone(); + let job_count_rx = state.outstanding_job_count_rx.clone(); + let count = state.queue.values().map(Vec::len).sum(); + cx.spawn(|this, mut cx| async move { + this.update(&mut cx, |this, cx| { + let Some(mut state) = this.projects.get_mut(&project.downgrade()) else { + return; + }; + let Some(mut index_operation) = state.pop() else { return;}; + let _ = match index_operation { + IndexOperation::IndexFile { file } => { + parsing_files_tx.try_send(file); + } + IndexOperation::DeleteFile { file } => { + db_update_tx.try_send(DbOperation::Delete { + worktree_id: file.worktree_db_id, + path: file.relative_path, + }); + } + }; + }); }) + .detach(); + + Task::Ready(Some(Ok((count, job_count_rx)))) + + // cx.spawn(|this, mut cx| async move { + // futures::future::join_all(worktree_scans_complete).await; + + // let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; + + // let worktrees = project.read_with(&cx, |project, cx| { + // project + // .worktrees(cx) + // .map(|worktree| worktree.read(cx).snapshot()) + // .collect::>() + // }); + + // let mut worktree_file_mtimes = HashMap::new(); + // let mut db_ids_by_worktree_id = HashMap::new(); + // for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { + // let db_id = db_id?; + // db_ids_by_worktree_id.insert(worktree.id(), db_id); + // worktree_file_mtimes.insert( + // worktree.id(), + // this.read_with(&cx, |this, _| this.get_file_mtimes(db_id)) + // .await?, + // ); + // } + + // let worktree_db_ids = db_ids_by_worktree_id + // .iter() + // .map(|(a, b)| (*a, *b)) + // .collect(); + + // let (job_count_tx, job_count_rx) = watch::channel_with(0); + // let job_count_tx = Arc::new(Mutex::new(job_count_tx)); + // this.update(&mut cx, |this, _| { + // let project_state = ProjectState::new( + // _subscription, + // worktree_db_ids, + // worktree_file_mtimes.clone(), + // job_count_rx.clone(), + // job_count_tx.clone(), + // ); + // this.projects.insert(project.downgrade(), project_state); + // }); + + // cx.background() + // .spawn(async move { + // let mut count = 0; + // for worktree in worktrees.into_iter() { + // let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); + // for file in worktree.files(false, 0) { + // let absolute_path = worktree.absolutize(&file.path); + + // if let Ok(language) = language_registry + // .language_for_file(&absolute_path, None) + // .await + // { + // if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) + // && &language.name().as_ref() != &"Markdown" + // && language + // .grammar() + // .and_then(|grammar| grammar.embedding_config.as_ref()) + // .is_none() + // { + // continue; + // } + + // let path_buf = file.path.to_path_buf(); + // let stored_mtime = file_mtimes.remove(&file.path.to_path_buf()); + // let already_stored = stored_mtime + // .map_or(false, |existing_mtime| existing_mtime == file.mtime); + + // if !already_stored { + // count += 1; + + // let job_handle = JobHandle::new(&job_count_tx); + // parsing_files_tx + // .try_send(PendingFile { + // worktree_db_id: db_ids_by_worktree_id[&worktree.id()], + // relative_path: path_buf, + // absolute_path, + // language, + // job_handle, + // modified_time: file.mtime, + // }) + // .unwrap(); + // } + // } + // } + // for file in file_mtimes.keys() { + // db_update_tx + // .try_send(DbOperation::Delete { + // worktree_id: db_ids_by_worktree_id[&worktree.id()], + // path: file.to_owned(), + // }) + // .unwrap(); + // } + // } + + // log::trace!( + // "walking worktree took {:?} milliseconds", + // t0.elapsed().as_millis() + // ); + // anyhow::Ok((count, job_count_rx)) + // }) + // .await + // }) } pub fn outstanding_job_count_rx( From 328b7e523c4d7897380af9f3ec17c8f2e904356d Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 22 Aug 2023 15:01:21 +0200 Subject: [PATCH 04/15] reorganized to stop the race --- crates/semantic_index/src/semantic_index.rs | 34 ++++++++++++++------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 8849b643c5ef00a71c22ebb9d2da5b2658d8057e..79e649838a6a7418da0e9c92346e71529a003620 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -685,17 +685,19 @@ impl SemanticIndex { let (job_count_tx, job_count_rx) = watch::channel_with(0); let job_count_tx = Arc::new(Mutex::new(job_count_tx)); - this.update(&mut cx, |this, _| { - let project_state = ProjectState::new( - _subscription, - worktree_db_ids, - worktree_file_mtimes.clone(), - job_count_rx.clone(), - job_count_tx.clone(), - ); - this.projects.insert(project.downgrade(), project_state); - }); - + let job_count_tx_longlived = job_count_tx.clone(); + // this.update(&mut cx, |this, _| { + // let project_state = ProjectState::new( + // _subscription, + // worktree_db_ids, + // worktree_file_mtimes.clone(), + // job_count_rx.clone(), + // job_count_tx.clone(), + // ); + // this.projects.insert(project.downgrade(), project_state); + // }); + + let worktree_file_mtimes_all = worktree_file_mtimes.clone(); let worktree_files = cx .background() .spawn(async move { @@ -750,6 +752,14 @@ impl SemanticIndex { .await?; this.update(&mut cx, |this, cx| { + let project_state = ProjectState::new( + _subscription, + worktree_db_ids, + worktree_file_mtimes_all, + job_count_rx, + job_count_tx_longlived, + ); + if let Some(project_state) = this.projects.get_mut(&project.downgrade()) { for (worktree_id, index_operations) in &worktree_files { for op in index_operations { @@ -757,6 +767,8 @@ impl SemanticIndex { } } } + + this.projects.insert(project.downgrade(), project_state); }); cx.background().spawn(async move { anyhow::Ok(()) }).await From 09fd99b1e3380f239c4971ea625a526e87b1d338 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 23 Aug 2023 15:09:15 +0200 Subject: [PATCH 05/15] moved semantic_index project intialization to queue and channel method --- crates/semantic_index/src/semantic_index.rs | 267 ++++++++------------ 1 file changed, 108 insertions(+), 159 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 79e649838a6a7418da0e9c92346e71529a003620..ffe6e74a6df05f10da53cbdf9ded64867823f941 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -15,8 +15,9 @@ use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, use language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; +use postage::stream::Stream; use postage::watch; -use project::{search::PathMatcher, Fs, Project, WorktreeId}; +use project::{search::PathMatcher, Fs, PathChange, Project, ProjectEntryId, WorktreeId}; use smol::channel; use std::{ cmp::Ordering, @@ -96,7 +97,8 @@ struct ProjectState { subscription: gpui::Subscription, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, - queue: HashMap>, + job_queue_tx: channel::Sender, + _queue_update_task: Task<()>, } #[derive(Clone)] @@ -116,6 +118,7 @@ impl JobHandle { } impl ProjectState { fn new( + cx: &mut AppContext, subscription: gpui::Subscription, worktree_db_ids: Vec<(WorktreeId, i64)>, worktree_file_mtimes: HashMap>, @@ -125,29 +128,51 @@ impl ProjectState { let (job_count_tx, job_count_rx) = watch::channel_with(0); let job_count_tx = Arc::new(Mutex::new(job_count_tx)); + let (job_queue_tx, job_queue_rx) = channel::unbounded(); + let _queue_update_task = cx.background().spawn({ + let mut worktree_queue = Vec::new(); + async move { + while let Ok(operation) = job_queue_rx.recv().await { + Self::update_queue(&mut worktree_queue, operation); + } + } + }); + Self { worktree_db_ids, worktree_file_mtimes, outstanding_job_count_rx, _outstanding_job_count_tx, subscription, - queue: HashMap::new(), + _queue_update_task, + job_queue_tx, } } - fn add_to_queue(&mut self, worktree_id: WorktreeId, operation: IndexOperation) { - if let Some(worktree_queue) = self.queue.get_mut(&worktree_id) { - worktree_queue.push(operation); - } else { - self.queue.insert(worktree_id, vec![operation]); - } + pub fn get_outstanding_count(&self) -> usize { + self.outstanding_job_count_rx.borrow().clone() } - fn pop(&mut self) -> Option { - self.queue - .iter_mut() - .next() - .and_then(|(_, mut entry)| entry.pop()) + fn update_queue(queue: &mut Vec, operation: IndexOperation) { + match operation { + IndexOperation::FlushQueue => { + for op in queue.pop() { + match op { + IndexOperation::IndexFile { payload, tx } => { + tx.try_send(payload); + } + IndexOperation::DeleteFile { payload, tx } => { + tx.try_send(payload); + } + _ => {} + } + } + } + _ => { + // TODO: This has to accomodate for duplicate files to index. + queue.push(operation); + } + } } fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option { @@ -185,10 +210,16 @@ pub struct PendingFile { job_handle: JobHandle, } -#[derive(Clone)] enum IndexOperation { - IndexFile { file: PendingFile }, - DeleteFile { file: PendingFile }, + IndexFile { + payload: PendingFile, + tx: channel::Sender, + }, + DeleteFile { + payload: DbOperation, + tx: channel::Sender, + }, + FlushQueue, } pub struct SearchResult { @@ -621,6 +652,52 @@ impl SemanticIndex { }) } + // pub fn project_entries_changed( + // &self, + // project: ModelHandle, + // changes: &Arc<[(Arc, ProjectEntryId, PathChange)]>, + // cx: &ModelContext, + // worktree_id: &WorktreeId, + // ) -> Result<()> { + // let parsing_files_tx = self.parsing_files_tx.clone(); + // let db_update_tx = self.db_update_tx.clone(); + // let (job_queue_tx, outstanding_job_tx, worktree_db_id) = { + // let state = self.projects.get(&project.downgrade()); + // if state.is_none() { + // return anyhow::Error(anyhow!("Project not yet initialized")); + // } + // let state = state.unwrap(); + // ( + // state.job_queue_tx.clone(), + // state._outstanding_job_count_tx, + // state.db_id_for_worktree_id(worktree_id), + // ) + // }; + + // for (path, entry_id, path_change) in changes.iter() { + // match path_change { + // PathChange::AddedOrUpdated => { + // let job_handle = JobHandle::new(&outstanding_job_tx); + // job_queue_tx.try_send(IndexOperation::IndexFile { + // payload: PendingFile { + // worktree_db_id, + // relative_path: path, + // absolute_path, + // language, + // modified_time, + // job_handle, + // }, + // tx: parsing_files_tx, + // }) + // } + // PathChange::Removed => {} + // _ => {} + // } + // } + + // Ok(()) + // } + pub fn initialize_project( &mut self, project: ModelHandle, @@ -653,6 +730,7 @@ impl SemanticIndex { }); let language_registry = self.language_registry.clone(); + let parsing_files_tx = self.parsing_files_tx.clone(); cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; @@ -686,24 +764,13 @@ impl SemanticIndex { let (job_count_tx, job_count_rx) = watch::channel_with(0); let job_count_tx = Arc::new(Mutex::new(job_count_tx)); let job_count_tx_longlived = job_count_tx.clone(); - // this.update(&mut cx, |this, _| { - // let project_state = ProjectState::new( - // _subscription, - // worktree_db_ids, - // worktree_file_mtimes.clone(), - // job_count_rx.clone(), - // job_count_tx.clone(), - // ); - // this.projects.insert(project.downgrade(), project_state); - // }); let worktree_file_mtimes_all = worktree_file_mtimes.clone(); let worktree_files = cx .background() .spawn(async move { - let mut worktree_files = HashMap::new(); + let mut worktree_files = Vec::new(); for worktree in worktrees.into_iter() { - let mut candidate_files = Vec::new(); let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); for file in worktree.files(false, 0) { let absolute_path = worktree.absolutize(&file.path); @@ -730,8 +797,8 @@ impl SemanticIndex { if !already_stored { let job_handle = JobHandle::new(&job_count_tx); - candidate_files.push(IndexOperation::IndexFile { - file: PendingFile { + worktree_files.push(IndexOperation::IndexFile { + payload: PendingFile { worktree_db_id: db_ids_by_worktree_id[&worktree.id()], relative_path: path_buf, absolute_path, @@ -739,12 +806,11 @@ impl SemanticIndex { job_handle, modified_time: file.mtime, }, + tx: parsing_files_tx.clone(), }); } } } - - worktree_files.insert(worktree.id(), candidate_files); } anyhow::Ok(worktree_files) @@ -753,6 +819,7 @@ impl SemanticIndex { this.update(&mut cx, |this, cx| { let project_state = ProjectState::new( + cx, _subscription, worktree_db_ids, worktree_file_mtimes_all, @@ -761,10 +828,8 @@ impl SemanticIndex { ); if let Some(project_state) = this.projects.get_mut(&project.downgrade()) { - for (worktree_id, index_operations) in &worktree_files { - for op in index_operations { - project_state.add_to_queue(*worktree_id, op.clone()); - } + for op in worktree_files { + project_state.job_queue_tx.try_send(op); } } @@ -791,134 +856,18 @@ impl SemanticIndex { let parsing_files_tx = self.parsing_files_tx.clone(); let db_update_tx = self.db_update_tx.clone(); let job_count_rx = state.outstanding_job_count_rx.clone(); - let count = state.queue.values().map(Vec::len).sum(); + let count = state.get_outstanding_count(); + cx.spawn(|this, mut cx| async move { this.update(&mut cx, |this, cx| { - let Some(mut state) = this.projects.get_mut(&project.downgrade()) else { + let Some(state) = this.projects.get_mut(&project.downgrade()) else { return; }; - let Some(mut index_operation) = state.pop() else { return;}; - let _ = match index_operation { - IndexOperation::IndexFile { file } => { - parsing_files_tx.try_send(file); - } - IndexOperation::DeleteFile { file } => { - db_update_tx.try_send(DbOperation::Delete { - worktree_id: file.worktree_db_id, - path: file.relative_path, - }); - } - }; - }); - }) - .detach(); + state.job_queue_tx.try_send(IndexOperation::FlushQueue); + }) + }); Task::Ready(Some(Ok((count, job_count_rx)))) - - // cx.spawn(|this, mut cx| async move { - // futures::future::join_all(worktree_scans_complete).await; - - // let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; - - // let worktrees = project.read_with(&cx, |project, cx| { - // project - // .worktrees(cx) - // .map(|worktree| worktree.read(cx).snapshot()) - // .collect::>() - // }); - - // let mut worktree_file_mtimes = HashMap::new(); - // let mut db_ids_by_worktree_id = HashMap::new(); - // for (worktree, db_id) in worktrees.iter().zip(worktree_db_ids) { - // let db_id = db_id?; - // db_ids_by_worktree_id.insert(worktree.id(), db_id); - // worktree_file_mtimes.insert( - // worktree.id(), - // this.read_with(&cx, |this, _| this.get_file_mtimes(db_id)) - // .await?, - // ); - // } - - // let worktree_db_ids = db_ids_by_worktree_id - // .iter() - // .map(|(a, b)| (*a, *b)) - // .collect(); - - // let (job_count_tx, job_count_rx) = watch::channel_with(0); - // let job_count_tx = Arc::new(Mutex::new(job_count_tx)); - // this.update(&mut cx, |this, _| { - // let project_state = ProjectState::new( - // _subscription, - // worktree_db_ids, - // worktree_file_mtimes.clone(), - // job_count_rx.clone(), - // job_count_tx.clone(), - // ); - // this.projects.insert(project.downgrade(), project_state); - // }); - - // cx.background() - // .spawn(async move { - // let mut count = 0; - // for worktree in worktrees.into_iter() { - // let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); - // for file in worktree.files(false, 0) { - // let absolute_path = worktree.absolutize(&file.path); - - // if let Ok(language) = language_registry - // .language_for_file(&absolute_path, None) - // .await - // { - // if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) - // && &language.name().as_ref() != &"Markdown" - // && language - // .grammar() - // .and_then(|grammar| grammar.embedding_config.as_ref()) - // .is_none() - // { - // continue; - // } - - // let path_buf = file.path.to_path_buf(); - // let stored_mtime = file_mtimes.remove(&file.path.to_path_buf()); - // let already_stored = stored_mtime - // .map_or(false, |existing_mtime| existing_mtime == file.mtime); - - // if !already_stored { - // count += 1; - - // let job_handle = JobHandle::new(&job_count_tx); - // parsing_files_tx - // .try_send(PendingFile { - // worktree_db_id: db_ids_by_worktree_id[&worktree.id()], - // relative_path: path_buf, - // absolute_path, - // language, - // job_handle, - // modified_time: file.mtime, - // }) - // .unwrap(); - // } - // } - // } - // for file in file_mtimes.keys() { - // db_update_tx - // .try_send(DbOperation::Delete { - // worktree_id: db_ids_by_worktree_id[&worktree.id()], - // path: file.to_owned(), - // }) - // .unwrap(); - // } - // } - - // log::trace!( - // "walking worktree took {:?} milliseconds", - // t0.elapsed().as_millis() - // ); - // anyhow::Ok((count, job_count_rx)) - // }) - // .await - // }) } pub fn outstanding_job_count_rx( From e42b9e910ede97b35414ca9987d5d39b340776d1 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 23 Aug 2023 22:28:30 +0200 Subject: [PATCH 06/15] fix async calls on project updated entries to ensure that all files are updating appropriately --- crates/search/src/project_search.rs | 1 + crates/semantic_index/src/semantic_index.rs | 165 +++++++++++++------- 2 files changed, 110 insertions(+), 56 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index ca317c0ded71ab50ce46a1ec5f1af4a04a60991f..448735fe3c4856168713fcb75bea8ef60fd112e8 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -849,6 +849,7 @@ impl ProjectSearchView { let model = model.read(cx); project = model.project.clone(); SemanticIndex::global(cx).map(|semantic| { + dbg!("Initializing project"); semantic.update(cx, |this, cx| this.initialize_project(project.clone(), cx)); }); } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index ffe6e74a6df05f10da53cbdf9ded64867823f941..0df3a9cc8428e37825c4e982525c3934e8efa8b3 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -156,7 +156,7 @@ impl ProjectState { fn update_queue(queue: &mut Vec, operation: IndexOperation) { match operation { IndexOperation::FlushQueue => { - for op in queue.pop() { + while let Some(op) = queue.pop() { match op { IndexOperation::IndexFile { payload, tx } => { tx.try_send(payload); @@ -652,51 +652,103 @@ impl SemanticIndex { }) } - // pub fn project_entries_changed( - // &self, - // project: ModelHandle, - // changes: &Arc<[(Arc, ProjectEntryId, PathChange)]>, - // cx: &ModelContext, - // worktree_id: &WorktreeId, - // ) -> Result<()> { - // let parsing_files_tx = self.parsing_files_tx.clone(); - // let db_update_tx = self.db_update_tx.clone(); - // let (job_queue_tx, outstanding_job_tx, worktree_db_id) = { - // let state = self.projects.get(&project.downgrade()); - // if state.is_none() { - // return anyhow::Error(anyhow!("Project not yet initialized")); - // } - // let state = state.unwrap(); - // ( - // state.job_queue_tx.clone(), - // state._outstanding_job_count_tx, - // state.db_id_for_worktree_id(worktree_id), - // ) - // }; - - // for (path, entry_id, path_change) in changes.iter() { - // match path_change { - // PathChange::AddedOrUpdated => { - // let job_handle = JobHandle::new(&outstanding_job_tx); - // job_queue_tx.try_send(IndexOperation::IndexFile { - // payload: PendingFile { - // worktree_db_id, - // relative_path: path, - // absolute_path, - // language, - // modified_time, - // job_handle, - // }, - // tx: parsing_files_tx, - // }) - // } - // PathChange::Removed => {} - // _ => {} - // } - // } - - // Ok(()) - // } + fn project_entries_changed( + &self, + project: ModelHandle, + changes: Arc<[(Arc, ProjectEntryId, PathChange)]>, + cx: &mut ModelContext<'_, SemanticIndex>, + worktree_id: &WorktreeId, + ) -> Result<()> { + let parsing_files_tx = self.parsing_files_tx.clone(); + let db_update_tx = self.db_update_tx.clone(); + let (job_queue_tx, outstanding_job_tx, worktree_db_id) = { + let state = self + .projects + .get(&project.downgrade()) + .ok_or(anyhow!("Project not yet initialized"))?; + let worktree_db_id = state + .db_id_for_worktree_id(*worktree_id) + .ok_or(anyhow!("Worktree ID in Database Not Available"))?; + ( + state.job_queue_tx.clone(), + state._outstanding_job_count_tx.clone(), + worktree_db_id, + ) + }; + + let language_registry = self.language_registry.clone(); + let parsing_files_tx = parsing_files_tx.clone(); + let db_update_tx = db_update_tx.clone(); + + let worktree = project + .read(cx) + .worktree_for_id(worktree_id.clone(), cx) + .ok_or(anyhow!("Worktree not available"))? + .read(cx) + .snapshot(); + cx.spawn(|this, mut cx| async move { + let worktree = worktree.clone(); + for (path, entry_id, path_change) in changes.iter() { + let relative_path = path.to_path_buf(); + let absolute_path = worktree.absolutize(path); + + let Some(entry) = worktree.entry_for_id(*entry_id) else { + continue; + }; + if entry.is_ignored || entry.is_symlink || entry.is_external { + continue; + } + + match path_change { + PathChange::AddedOrUpdated | PathChange::Updated => { + log::trace!("File Updated: {:?}", path); + if let Ok(language) = language_registry + .language_for_file(&relative_path, None) + .await + { + if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) + && &language.name().as_ref() != &"Markdown" + && language + .grammar() + .and_then(|grammar| grammar.embedding_config.as_ref()) + .is_none() + { + continue; + } + + let job_handle = JobHandle::new(&outstanding_job_tx); + let new_operation = IndexOperation::IndexFile { + payload: PendingFile { + worktree_db_id, + relative_path, + absolute_path, + language, + modified_time: entry.mtime, + job_handle, + }, + tx: parsing_files_tx.clone(), + }; + job_queue_tx.try_send(new_operation); + } + } + PathChange::Removed => { + let new_operation = IndexOperation::DeleteFile { + payload: DbOperation::Delete { + worktree_id: worktree_db_id, + path: relative_path, + }, + tx: db_update_tx.clone(), + }; + job_queue_tx.try_send(new_operation); + } + _ => {} + } + } + }) + .detach(); + + Ok(()) + } pub fn initialize_project( &mut self, @@ -724,9 +776,8 @@ impl SemanticIndex { let _subscription = cx.subscribe(&project, |this, project, event, cx| { if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { - todo!(); - // this.project_entries_changed(project, changes, cx, worktree_id); - } + this.project_entries_changed(project, changes.clone(), cx, worktree_id); + }; }); let language_registry = self.language_registry.clone(); @@ -775,6 +826,10 @@ impl SemanticIndex { for file in worktree.files(false, 0) { let absolute_path = worktree.absolutize(&file.path); + if file.is_external || file.is_ignored || file.is_symlink { + continue; + } + if let Ok(language) = language_registry .language_for_file(&absolute_path, None) .await @@ -827,10 +882,8 @@ impl SemanticIndex { job_count_tx_longlived, ); - if let Some(project_state) = this.projects.get_mut(&project.downgrade()) { - for op in worktree_files { - project_state.job_queue_tx.try_send(op); - } + for op in worktree_files { + project_state.job_queue_tx.try_send(op); } this.projects.insert(project.downgrade(), project_state); @@ -864,10 +917,10 @@ impl SemanticIndex { return; }; state.job_queue_tx.try_send(IndexOperation::FlushQueue); - }) - }); + }); - Task::Ready(Some(Ok((count, job_count_rx)))) + Ok((count, job_count_rx)) + }) } pub fn outstanding_job_count_rx( From 3f9f742530cdb3f5684a22f4934cf3a949e74a58 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 11:45:52 +0200 Subject: [PATCH 07/15] update rate limiting embeddings strategy to delay less --- crates/semantic_index/src/embedding.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 77457ec7f6e34961ab2a784ef6f0d8068c4c1dbb..f2269a786a66af3896e0410c5558f3f18d618bd0 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -106,8 +106,8 @@ impl OpenAIEmbeddings { #[async_trait] impl EmbeddingProvider for OpenAIEmbeddings { async fn embed_batch(&self, spans: Vec<&str>) -> Result>> { - const BACKOFF_SECONDS: [usize; 3] = [45, 75, 125]; - const MAX_RETRIES: usize = 3; + const BACKOFF_SECONDS: [usize; 4] = [3, 5, 15, 45]; + const MAX_RETRIES: usize = 4; let api_key = OPENAI_API_KEY .as_ref() From aa07872a24bc25dfe97d4b460b3b7d919e4e9ae9 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 12:36:33 +0200 Subject: [PATCH 08/15] accomodate for duplicate entries in indexing queue Co-authored-by: Piotr --- crates/semantic_index/src/semantic_index.rs | 37 ++++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 0df3a9cc8428e37825c4e982525c3934e8efa8b3..65b7e405256c1697bd862665ec39d7dc5537e31c 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -12,6 +12,7 @@ use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; +use isahc::http::header::OccupiedEntry; use language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; @@ -130,10 +131,11 @@ impl ProjectState { let (job_queue_tx, job_queue_rx) = channel::unbounded(); let _queue_update_task = cx.background().spawn({ - let mut worktree_queue = Vec::new(); + let mut worktree_queue = HashMap::new(); async move { while let Ok(operation) = job_queue_rx.recv().await { Self::update_queue(&mut worktree_queue, operation); + dbg!(worktree_queue.len()); } } }); @@ -153,24 +155,37 @@ impl ProjectState { self.outstanding_job_count_rx.borrow().clone() } - fn update_queue(queue: &mut Vec, operation: IndexOperation) { + fn update_queue(queue: &mut HashMap, operation: IndexOperation) { match operation { IndexOperation::FlushQueue => { - while let Some(op) = queue.pop() { + let queue = std::mem::take(queue); + for (_, op) in queue { match op { - IndexOperation::IndexFile { payload, tx } => { + IndexOperation::IndexFile { + absolute_path, + payload, + tx, + } => { tx.try_send(payload); } - IndexOperation::DeleteFile { payload, tx } => { + IndexOperation::DeleteFile { + absolute_path, + payload, + tx, + } => { tx.try_send(payload); } _ => {} } } } - _ => { - // TODO: This has to accomodate for duplicate files to index. - queue.push(operation); + IndexOperation::IndexFile { + ref absolute_path, .. + } + | IndexOperation::DeleteFile { + ref absolute_path, .. + } => { + queue.insert(absolute_path.clone(), operation); } } } @@ -209,13 +224,14 @@ pub struct PendingFile { modified_time: SystemTime, job_handle: JobHandle, } - enum IndexOperation { IndexFile { + absolute_path: PathBuf, payload: PendingFile, tx: channel::Sender, }, DeleteFile { + absolute_path: PathBuf, payload: DbOperation, tx: channel::Sender, }, @@ -718,6 +734,7 @@ impl SemanticIndex { let job_handle = JobHandle::new(&outstanding_job_tx); let new_operation = IndexOperation::IndexFile { + absolute_path: absolute_path.clone(), payload: PendingFile { worktree_db_id, relative_path, @@ -733,6 +750,7 @@ impl SemanticIndex { } PathChange::Removed => { let new_operation = IndexOperation::DeleteFile { + absolute_path, payload: DbOperation::Delete { worktree_id: worktree_db_id, path: relative_path, @@ -853,6 +871,7 @@ impl SemanticIndex { if !already_stored { let job_handle = JobHandle::new(&job_count_tx); worktree_files.push(IndexOperation::IndexFile { + absolute_path: absolute_path.clone(), payload: PendingFile { worktree_db_id: db_ids_by_worktree_id[&worktree.id()], relative_path: path_buf, From afe0e74868f65ba16a80a35089f252af20b1bfc8 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 12:42:41 +0200 Subject: [PATCH 09/15] remove worktree_file_mtimes in state as it is no longer used Co-authored-by: Piotr --- crates/semantic_index/src/semantic_index.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 65b7e405256c1697bd862665ec39d7dc5537e31c..9d789d0eacad5eb2c8e5e51cfd7312b6f43d0b0c 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -94,7 +94,6 @@ pub struct SemanticIndex { struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, - worktree_file_mtimes: HashMap>, subscription: gpui::Subscription, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, @@ -122,7 +121,6 @@ impl ProjectState { cx: &mut AppContext, subscription: gpui::Subscription, worktree_db_ids: Vec<(WorktreeId, i64)>, - worktree_file_mtimes: HashMap>, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, ) -> Self { @@ -142,7 +140,6 @@ impl ProjectState { Self { worktree_db_ids, - worktree_file_mtimes, outstanding_job_count_rx, _outstanding_job_count_tx, subscription, @@ -834,7 +831,6 @@ impl SemanticIndex { let job_count_tx = Arc::new(Mutex::new(job_count_tx)); let job_count_tx_longlived = job_count_tx.clone(); - let worktree_file_mtimes_all = worktree_file_mtimes.clone(); let worktree_files = cx .background() .spawn(async move { @@ -896,7 +892,6 @@ impl SemanticIndex { cx, _subscription, worktree_db_ids, - worktree_file_mtimes_all, job_count_rx, job_count_tx_longlived, ); From e8e7b294d84b677256f1a10645ab65f7ecae135b Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 12:49:20 +0200 Subject: [PATCH 10/15] add delete files operation for remaining files in database not included in current worktree Co-authored-by: Piotr --- crates/semantic_index/src/semantic_index.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 9d789d0eacad5eb2c8e5e51cfd7312b6f43d0b0c..cca63c00aaf89aacaa4683c15739f90ebd65bba5 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -797,6 +797,7 @@ impl SemanticIndex { let language_registry = self.language_registry.clone(); let parsing_files_tx = self.parsing_files_tx.clone(); + let db_update_tx = self.db_update_tx.clone(); cx.spawn(|this, mut cx| async move { futures::future::join_all(worktree_scans_complete).await; @@ -837,6 +838,7 @@ impl SemanticIndex { let mut worktree_files = Vec::new(); for worktree in worktrees.into_iter() { let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap(); + let worktree_db_id = db_ids_by_worktree_id[&worktree.id()]; for file in worktree.files(false, 0) { let absolute_path = worktree.absolutize(&file.path); @@ -869,7 +871,7 @@ impl SemanticIndex { worktree_files.push(IndexOperation::IndexFile { absolute_path: absolute_path.clone(), payload: PendingFile { - worktree_db_id: db_ids_by_worktree_id[&worktree.id()], + worktree_db_id, relative_path: path_buf, absolute_path, language, @@ -881,6 +883,17 @@ impl SemanticIndex { } } } + // Clean up entries from database that are no longer in the worktree. + for (path, mtime) in file_mtimes { + worktree_files.push(IndexOperation::DeleteFile { + absolute_path: worktree.absolutize(path.as_path()), + payload: DbOperation::Delete { + worktree_id: worktree_db_id, + path, + }, + tx: db_update_tx.clone(), + }); + } } anyhow::Ok(worktree_files) From a1519e4c38bff6879616f261fb95ff0fec7262b9 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 13:14:19 +0200 Subject: [PATCH 11/15] move semantic search project intialization to a subscribe event for workspace created Co-authored-by: Piotr --- crates/search/src/project_search.rs | 10 ---------- crates/semantic_index/src/semantic_index.rs | 18 +++++++++++++++++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index 357aca2ed86690257bee89aa28d075ae79b62a5b..b0ec7219d138c03da94fcceaeb7f2ca682084e1e 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -844,16 +844,6 @@ impl ProjectSearchView { .detach(); let filters_enabled = false; - // Initialize Semantic Index if Needed - if SemanticIndex::enabled(cx) { - let model = model.read(cx); - project = model.project.clone(); - SemanticIndex::global(cx).map(|semantic| { - dbg!("Initializing project"); - semantic.update(cx, |this, cx| this.initialize_project(project.clone(), cx)); - }); - } - // Check if Worktrees have all been previously indexed let mut this = ProjectSearchView { search_id: model.read(cx).search_id, diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index cca63c00aaf89aacaa4683c15739f90ebd65bba5..fed320becdc3ef3bc0e7c5ce84632e0536283070 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -35,6 +35,7 @@ use util::{ paths::EMBEDDINGS_DIR, ResultExt, }; +use workspace::WorkspaceCreated; const SEMANTIC_INDEX_VERSION: usize = 7; const EMBEDDINGS_BATCH_SIZE: usize = 80; @@ -56,6 +57,22 @@ pub fn init( return; } + cx.subscribe_global::({ + move |event, mut cx| { + let Some(semantic_index) = SemanticIndex::global(cx) else { return; }; + let workspace = &event.0; + if let Some(workspace) = workspace.upgrade(cx) { + let project = workspace.read(cx).project().clone(); + if project.read(cx).is_local() { + semantic_index.update(cx, |index, cx| { + index.initialize_project(project, cx); + }); + } + } + } + }) + .detach(); + cx.spawn(move |mut cx| async move { let semantic_index = SemanticIndex::new( fs, @@ -133,7 +150,6 @@ impl ProjectState { async move { while let Ok(operation) = job_queue_rx.recv().await { Self::update_queue(&mut worktree_queue, operation); - dbg!(worktree_queue.len()); } } }); From 0b204bfdc8d083c05e6b0637b4f3dbdf0eff9cde Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 13:40:04 +0200 Subject: [PATCH 12/15] reindex semantic index when search project pane is reactivated in semantic mode Co-authored-by: Piotr --- crates/search/src/project_search.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index b0ec7219d138c03da94fcceaeb7f2ca682084e1e..f665c4ddcdf585c4373406bbef12a01d88a2ceca 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -1635,6 +1635,12 @@ impl ToolbarItemView for ProjectSearchBar { self.subscription = None; self.active_project_search = None; if let Some(search) = active_pane_item.and_then(|i| i.downcast::()) { + search.update(cx, |search, cx| { + if search.current_mode == SearchMode::Semantic { + search.index_project(cx); + } + }); + self.subscription = Some(cx.observe(&search, |_, _, cx| cx.notify())); self.active_project_search = Some(search); ToolbarItemLocation::PrimaryLeft { From a892a51ec30bf9b6dec54399615e835528dfdac8 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 13:46:43 +0200 Subject: [PATCH 13/15] update initialize project call to accomodate for test scenarios Co-authored-by: Piotr --- crates/semantic_index/src/semantic_index.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index fed320becdc3ef3bc0e7c5ce84632e0536283070..4f932b0622c7da8d4102f636f6bd833f08dff2a1 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -65,7 +65,7 @@ pub fn init( let project = workspace.read(cx).project().clone(); if project.read(cx).is_local() { semantic_index.update(cx, |index, cx| { - index.initialize_project(project, cx); + index.initialize_project(project, cx).detach_and_log_err(cx) }); } } @@ -785,7 +785,7 @@ impl SemanticIndex { &mut self, project: ModelHandle, cx: &mut ModelContext, - ) { + ) -> Task> { let worktree_scans_complete = project .read(cx) .worktrees(cx) @@ -931,10 +931,8 @@ impl SemanticIndex { this.projects.insert(project.downgrade(), project_state); }); - - cx.background().spawn(async move { anyhow::Ok(()) }).await + Result::<(), _>::Ok(()) }) - .detach_and_log_err(cx) } pub fn index_project( From 131950f6702c2af6549438673e5d79a00cface1d Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 24 Aug 2023 18:40:08 +0200 Subject: [PATCH 14/15] add handling for Added file events to semantic index --- crates/semantic_index/src/semantic_index.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 4f932b0622c7da8d4102f636f6bd833f08dff2a1..474025c25e3d435d36518c6125396e78ac6aba55 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -728,9 +728,9 @@ impl SemanticIndex { continue; } + log::trace!("File Event: {:?}, Path: {:?}", &path_change, &path); match path_change { - PathChange::AddedOrUpdated | PathChange::Updated => { - log::trace!("File Updated: {:?}", path); + PathChange::AddedOrUpdated | PathChange::Updated | PathChange::Added => { if let Ok(language) = language_registry .language_for_file(&relative_path, None) .await @@ -786,6 +786,7 @@ impl SemanticIndex { project: ModelHandle, cx: &mut ModelContext, ) -> Task> { + log::trace!("Initializing Project for Semantic Index"); let worktree_scans_complete = project .read(cx) .worktrees(cx) @@ -807,7 +808,7 @@ impl SemanticIndex { let _subscription = cx.subscribe(&project, |this, project, event, cx| { if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { - this.project_entries_changed(project, changes.clone(), cx, worktree_id); + this.project_entries_changed(project.clone(), changes.clone(), cx, worktree_id); }; }); From ee97bc54cfc184d90dfbaff2bac585b8e60df202 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 25 Aug 2023 10:38:01 +0200 Subject: [PATCH 15/15] cleaned up warnings --- crates/search/src/project_search.rs | 2 +- crates/semantic_index/src/semantic_index.rs | 40 +++++++++---------- .../src/semantic_index_tests.rs | 2 +- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/crates/search/src/project_search.rs b/crates/search/src/project_search.rs index f665c4ddcdf585c4373406bbef12a01d88a2ceca..a29fad0f548fe0f7fdc0e8526b9aa5615873ea57 100644 --- a/crates/search/src/project_search.rs +++ b/crates/search/src/project_search.rs @@ -760,7 +760,7 @@ impl ProjectSearchView { } fn new(model: ModelHandle, cx: &mut ViewContext) -> Self { - let mut project; + let project; let excerpts; let mut query_text = String::new(); let mut options = SearchOptions::NONE; diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 474025c25e3d435d36518c6125396e78ac6aba55..70495b59d30cc9bb47eb9aa66e6cf22e73d720da 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -12,11 +12,9 @@ use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle}; -use isahc::http::header::OccupiedEntry; use language::{Anchor, Buffer, Language, LanguageRegistry}; use parking_lot::Mutex; use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; -use postage::stream::Stream; use postage::watch; use project::{search::PathMatcher, Fs, PathChange, Project, ProjectEntryId, WorktreeId}; use smol::channel; @@ -58,7 +56,7 @@ pub fn init( } cx.subscribe_global::({ - move |event, mut cx| { + move |event, cx| { let Some(semantic_index) = SemanticIndex::global(cx) else { return; }; let workspace = &event.0; if let Some(workspace) = workspace.upgrade(cx) { @@ -111,7 +109,7 @@ pub struct SemanticIndex { struct ProjectState { worktree_db_ids: Vec<(WorktreeId, i64)>, - subscription: gpui::Subscription, + _subscription: gpui::Subscription, outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, job_queue_tx: channel::Sender, @@ -141,9 +139,6 @@ impl ProjectState { outstanding_job_count_rx: watch::Receiver, _outstanding_job_count_tx: Arc>>, ) -> Self { - let (job_count_tx, job_count_rx) = watch::channel_with(0); - let job_count_tx = Arc::new(Mutex::new(job_count_tx)); - let (job_queue_tx, job_queue_rx) = channel::unbounded(); let _queue_update_task = cx.background().spawn({ let mut worktree_queue = HashMap::new(); @@ -158,7 +153,7 @@ impl ProjectState { worktree_db_ids, outstanding_job_count_rx, _outstanding_job_count_tx, - subscription, + _subscription: subscription, _queue_update_task, job_queue_tx, } @@ -175,18 +170,18 @@ impl ProjectState { for (_, op) in queue { match op { IndexOperation::IndexFile { - absolute_path, + absolute_path: _, payload, tx, } => { - tx.try_send(payload); + let _ = tx.try_send(payload); } IndexOperation::DeleteFile { - absolute_path, + absolute_path: _, payload, tx, } => { - tx.try_send(payload); + let _ = tx.try_send(payload); } _ => {} } @@ -715,7 +710,7 @@ impl SemanticIndex { .ok_or(anyhow!("Worktree not available"))? .read(cx) .snapshot(); - cx.spawn(|this, mut cx| async move { + cx.spawn(|_, _| async move { let worktree = worktree.clone(); for (path, entry_id, path_change) in changes.iter() { let relative_path = path.to_path_buf(); @@ -758,7 +753,7 @@ impl SemanticIndex { }, tx: parsing_files_tx.clone(), }; - job_queue_tx.try_send(new_operation); + let _ = job_queue_tx.try_send(new_operation); } } PathChange::Removed => { @@ -770,7 +765,7 @@ impl SemanticIndex { }, tx: db_update_tx.clone(), }; - job_queue_tx.try_send(new_operation); + let _ = job_queue_tx.try_send(new_operation); } _ => {} } @@ -808,7 +803,8 @@ impl SemanticIndex { let _subscription = cx.subscribe(&project, |this, project, event, cx| { if let project::Event::WorktreeUpdatedEntries(worktree_id, changes) = event { - this.project_entries_changed(project.clone(), changes.clone(), cx, worktree_id); + let _ = + this.project_entries_changed(project.clone(), changes.clone(), cx, worktree_id); }; }); @@ -901,7 +897,7 @@ impl SemanticIndex { } } // Clean up entries from database that are no longer in the worktree. - for (path, mtime) in file_mtimes { + for (path, _) in file_mtimes { worktree_files.push(IndexOperation::DeleteFile { absolute_path: worktree.absolutize(path.as_path()), payload: DbOperation::Delete { @@ -927,7 +923,7 @@ impl SemanticIndex { ); for op in worktree_files { - project_state.job_queue_tx.try_send(op); + let _ = project_state.job_queue_tx.try_send(op); } this.projects.insert(project.downgrade(), project_state); @@ -948,17 +944,17 @@ impl SemanticIndex { state.unwrap() }; - let parsing_files_tx = self.parsing_files_tx.clone(); - let db_update_tx = self.db_update_tx.clone(); + // let parsing_files_tx = self.parsing_files_tx.clone(); + // let db_update_tx = self.db_update_tx.clone(); let job_count_rx = state.outstanding_job_count_rx.clone(); let count = state.get_outstanding_count(); cx.spawn(|this, mut cx| async move { - this.update(&mut cx, |this, cx| { + this.update(&mut cx, |this, _| { let Some(state) = this.projects.get_mut(&project.downgrade()) else { return; }; - state.job_queue_tx.try_send(IndexOperation::FlushQueue); + let _ = state.job_queue_tx.try_send(IndexOperation::FlushQueue); }); Ok((count, job_count_rx)) diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs index 0ac5953f0bb2a5771cc9c962ee095208b7895c10..32d8bb0fb879fe9e1dcf69713d73dbcdc722ffcb 100644 --- a/crates/semantic_index/src/semantic_index_tests.rs +++ b/crates/semantic_index/src/semantic_index_tests.rs @@ -87,7 +87,7 @@ async fn test_semantic_index(cx: &mut TestAppContext) { let project = Project::test(fs.clone(), ["/the-root".as_ref()], cx).await; - store + let _ = store .update(cx, |store, cx| { store.initialize_project(project.clone(), cx) })