Detailed changes
@@ -1389,6 +1389,15 @@ dependencies = [
"theme",
]
+[[package]]
+name = "conv"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
+dependencies = [
+ "custom_derive",
+]
+
[[package]]
name = "copilot"
version = "0.1.0"
@@ -1766,6 +1775,12 @@ dependencies = [
"winapi 0.3.9",
]
+[[package]]
+name = "custom_derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
+
[[package]]
name = "cxx"
version = "1.0.94"
@@ -7882,11 +7897,15 @@ name = "vector_store"
version = "0.1.0"
dependencies = [
"anyhow",
+ "async-compat",
+ "conv",
"futures 0.3.28",
"gpui",
"language",
"project",
+ "rand 0.8.5",
"smol",
+ "sqlx",
"util",
"workspace",
]
@@ -17,6 +17,10 @@ util = { path = "../util" }
anyhow.workspace = true
futures.workspace = true
smol.workspace = true
+sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
+async-compat = "0.2.1"
+conv = "0.3.3"
+rand.workspace = true
[dev-dependencies]
gpui = { path = "../gpui", features = ["test-support"] }
@@ -0,0 +1,107 @@
+use anyhow::Result;
+use async_compat::{Compat, CompatExt};
+use conv::ValueFrom;
+use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool};
+use std::time::{Duration, Instant};
+
+use crate::IndexedFile;
+
+// This is saving to a local database store within the users dev zed path
+// Where do we want this to sit?
+// Assuming near where the workspace DB sits.
+const VECTOR_DB_URL: &str = "embeddings_db";
+
+pub struct VectorDatabase {}
+
+impl VectorDatabase {
+ pub async fn initialize_database() -> Result<()> {
+ // If database doesnt exist create database
+ if !Sqlite::database_exists(VECTOR_DB_URL)
+ .compat()
+ .await
+ .unwrap_or(false)
+ {
+ Sqlite::create_database(VECTOR_DB_URL).compat().await?;
+ }
+
+ let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+ // Initialize Vector Databasing Tables
+ // We may be able to skip this assuming the database is never created
+ // without creating the tables at the same time.
+ sqlx::query(
+ "CREATE TABLE IF NOT EXISTS files (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ path NVARCHAR(100) NOT NULL,
+ sha1 NVARCHAR(40) NOT NULL
+ )",
+ )
+ .execute(&db)
+ .compat()
+ .await?;
+
+ sqlx::query(
+ "CREATE TABLE IF NOT EXISTS documents (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ file_id INTEGER NOT NULL,
+ offset INTEGER NOT NULL,
+ name NVARCHAR(100) NOT NULL,
+ embedding BLOB NOT NULL,
+ FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
+ )",
+ )
+ .execute(&db)
+ .compat()
+ .await?;
+
+ Ok(())
+ }
+
+ pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
+ // Write to files table, and return generated id.
+ let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+ let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)")
+ .bind(indexed_file.path.to_str())
+ .bind(indexed_file.sha1)
+ .execute(&db)
+ .compat()
+ .await?;
+
+ let inserted_id = files_insert.last_insert_rowid();
+
+ // I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again
+ // I imagine there is a better way to serialize to/from blob
+ fn get_binary_from_values(values: Vec<f32>) -> String {
+ let bits: Vec<_> = values.iter().map(|v| v.to_bits().to_string()).collect();
+ bits.join(";")
+ }
+
+ fn get_values_from_binary(bin: &str) -> Vec<f32> {
+ (0..bin.len() / 32)
+ .map(|i| {
+ let start = i * 32;
+ let end = start + 32;
+ f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap())
+ })
+ .collect()
+ }
+
+ // Currently inserting at approximately 3400 documents a second
+ // I imagine we can speed this up with a bulk insert of some kind.
+ for document in indexed_file.documents {
+ sqlx::query(
+ "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
+ )
+ .bind(inserted_id)
+ .bind(document.offset.to_string())
+ .bind(document.name)
+ .bind(get_binary_from_values(document.embedding))
+ .execute(&db)
+ .compat()
+ .await?;
+ }
+
+ Ok(())
+ }
+}
@@ -1,9 +1,12 @@
-use anyhow::{anyhow, Result};
+mod db;
+use anyhow::Result;
+use db::VectorDatabase;
use gpui::{AppContext, Entity, ModelContext, ModelHandle};
use language::LanguageRegistry;
use project::{Fs, Project};
+use rand::Rng;
use smol::channel;
-use std::{path::PathBuf, sync::Arc};
+use std::{path::PathBuf, sync::Arc, time::Instant};
use util::ResultExt;
use workspace::WorkspaceCreated;
@@ -27,13 +30,15 @@ pub fn init(fs: Arc<dyn Fs>, language_registry: Arc<LanguageRegistry>, cx: &mut
.detach();
}
+#[derive(Debug, sqlx::FromRow)]
struct Document {
offset: usize,
name: String,
embedding: Vec<f32>,
}
-struct IndexedFile {
+#[derive(Debug, sqlx::FromRow)]
+pub struct IndexedFile {
path: PathBuf,
sha1: String,
documents: Vec<Document>,
@@ -64,9 +69,24 @@ impl VectorStore {
language_registry: &Arc<LanguageRegistry>,
file_path: PathBuf,
) -> Result<IndexedFile> {
- eprintln!("indexing file {file_path:?}");
- Err(anyhow!("not implemented"))
- // todo!();
+ // This is creating dummy documents to test the database writes.
+ let mut documents = vec![];
+ let mut rng = rand::thread_rng();
+ let rand_num_of_documents: u8 = rng.gen_range(0..200);
+ for _ in 0..rand_num_of_documents {
+ let doc = Document {
+ offset: 0,
+ name: "test symbol".to_string(),
+ embedding: vec![0.32 as f32; 768],
+ };
+ documents.push(doc);
+ }
+
+ return Ok(IndexedFile {
+ path: file_path,
+ sha1: "asdfasdfasdf".to_string(),
+ documents,
+ });
}
fn add_project(&mut self, project: ModelHandle<Project>, cx: &mut ModelContext<Self>) {
@@ -100,13 +120,17 @@ impl VectorStore {
}
})
.detach();
+
cx.background()
.spawn(async move {
+ // Initialize Database, creates database and tables if not exists
+ VectorDatabase::initialize_database().await.log_err();
while let Ok(indexed_file) = indexed_files_rx.recv().await {
- // write document to database
+ VectorDatabase::insert_file(indexed_file).await.log_err();
}
})
.detach();
+
cx.background()
.scoped(|scope| {
for _ in 0..cx.background().num_cpus() {