WIP: started DB creation and naive inserts

KCaverly created

Change summary

Cargo.lock                              |  19 ++++
crates/vector_store/Cargo.toml          |   4 +
crates/vector_store/src/db.rs           | 107 +++++++++++++++++++++++++++
crates/vector_store/src/vector_store.rs |  38 +++++++-
4 files changed, 161 insertions(+), 7 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -1389,6 +1389,15 @@ dependencies = [
  "theme",
 ]
 
+[[package]]
+name = "conv"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
+dependencies = [
+ "custom_derive",
+]
+
 [[package]]
 name = "copilot"
 version = "0.1.0"
@@ -1766,6 +1775,12 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "custom_derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
+
 [[package]]
 name = "cxx"
 version = "1.0.94"
@@ -7882,11 +7897,15 @@ name = "vector_store"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-compat",
+ "conv",
  "futures 0.3.28",
  "gpui",
  "language",
  "project",
+ "rand 0.8.5",
  "smol",
+ "sqlx",
  "util",
  "workspace",
 ]

crates/vector_store/Cargo.toml 🔗

@@ -17,6 +17,10 @@ util = { path = "../util" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
+sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
+async-compat = "0.2.1"
+conv = "0.3.3"
+rand.workspace = true
 
 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }

crates/vector_store/src/db.rs 🔗

@@ -0,0 +1,107 @@
+use anyhow::Result;
+use async_compat::{Compat, CompatExt};
+use conv::ValueFrom;
+use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool};
+use std::time::{Duration, Instant};
+
+use crate::IndexedFile;
+
+// This is saving to a local database store within the user's dev zed path
+// Where do we want this to sit?
+// Assuming near where the workspace DB sits.
+const VECTOR_DB_URL: &str = "embeddings_db";
+
+pub struct VectorDatabase {}
+
+impl VectorDatabase {
+    pub async fn initialize_database() -> Result<()> {
+        // If the database doesn't exist, create it
+        if !Sqlite::database_exists(VECTOR_DB_URL)
+            .compat()
+            .await
+            .unwrap_or(false)
+        {
+            Sqlite::create_database(VECTOR_DB_URL).compat().await?;
+        }
+
+        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+        // Initialize Vector Database tables
+        // We may be able to skip this assuming the database is never created
+        // without creating the tables at the same time.
+        sqlx::query(
+            "CREATE TABLE IF NOT EXISTS files (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            path NVARCHAR(100) NOT NULL,
+            sha1 NVARCHAR(40) NOT NULL
+            )",
+        )
+        .execute(&db)
+        .compat()
+        .await?;
+
+        sqlx::query(
+            "CREATE TABLE IF NOT EXISTS documents (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            file_id INTEGER NOT NULL,
+            offset INTEGER NOT NULL,
+            name NVARCHAR(100) NOT NULL,
+            embedding BLOB NOT NULL,
+            FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
+            )",
+        )
+        .execute(&db)
+        .compat()
+        .await?;
+
+        Ok(())
+    }
+
+    pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
+        // Write to files table, and return generated id.
+        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+        let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)")
+            .bind(indexed_file.path.to_str())
+            .bind(indexed_file.sha1)
+            .execute(&db)
+            .compat()
+            .await?;
+
+        let inserted_id = files_insert.last_insert_rowid();
+
+        // I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again
+        // NOTE(review): get_binary_from_values joins decimal bit-values with ';', but get_values_from_binary parses fixed 32-char base-2 slices — the round-trip is broken; consider f32::to_le_bytes into a real BLOB
+        fn get_binary_from_values(values: Vec<f32>) -> String {
+            let bits: Vec<_> = values.iter().map(|v| v.to_bits().to_string()).collect();
+            bits.join(";")
+        }
+
+        fn get_values_from_binary(bin: &str) -> Vec<f32> {
+            (0..bin.len() / 32)
+                .map(|i| {
+                    let start = i * 32;
+                    let end = start + 32;
+                    f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap())
+                })
+                .collect()
+        }
+
+        // Currently inserting at approximately 3400 documents a second
+        // I imagine we can speed this up with a bulk insert of some kind.
+        for document in indexed_file.documents {
+            sqlx::query(
+                "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
+            )
+            .bind(inserted_id)
+            .bind(document.offset.to_string())
+            .bind(document.name)
+            .bind(get_binary_from_values(document.embedding))
+            .execute(&db)
+            .compat()
+            .await?;
+        }
+
+        Ok(())
+    }
+}

crates/vector_store/src/vector_store.rs 🔗

@@ -1,9 +1,12 @@
-use anyhow::{anyhow, Result};
+mod db;
+use anyhow::Result;
+use db::VectorDatabase;
 use gpui::{AppContext, Entity, ModelContext, ModelHandle};
 use language::LanguageRegistry;
 use project::{Fs, Project};
+use rand::Rng;
 use smol::channel;
-use std::{path::PathBuf, sync::Arc};
+use std::{path::PathBuf, sync::Arc, time::Instant};
 use util::ResultExt;
 use workspace::WorkspaceCreated;
 
@@ -27,13 +30,15 @@ pub fn init(fs: Arc<dyn Fs>, language_registry: Arc<LanguageRegistry>, cx: &mut
     .detach();
 }
 
+#[derive(Debug, sqlx::FromRow)]
 struct Document {
     offset: usize,
     name: String,
     embedding: Vec<f32>,
 }
 
-struct IndexedFile {
+#[derive(Debug, sqlx::FromRow)]
+pub struct IndexedFile {
     path: PathBuf,
     sha1: String,
     documents: Vec<Document>,
@@ -64,9 +69,24 @@ impl VectorStore {
         language_registry: &Arc<LanguageRegistry>,
         file_path: PathBuf,
     ) -> Result<IndexedFile> {
-        eprintln!("indexing file {file_path:?}");
-        Err(anyhow!("not implemented"))
-        // todo!();
+        // This is creating dummy documents to test the database writes.
+        let mut documents = vec![];
+        let mut rng = rand::thread_rng();
+        let rand_num_of_documents: u8 = rng.gen_range(0..200);
+        for _ in 0..rand_num_of_documents {
+            let doc = Document {
+                offset: 0,
+                name: "test symbol".to_string(),
+                embedding: vec![0.32 as f32; 768],
+            };
+            documents.push(doc);
+        }
+
+        return Ok(IndexedFile {
+            path: file_path,
+            sha1: "asdfasdfasdf".to_string(),
+            documents,
+        });
     }
 
     fn add_project(&mut self, project: ModelHandle<Project>, cx: &mut ModelContext<Self>) {
@@ -100,13 +120,17 @@ impl VectorStore {
                     }
                 })
                 .detach();
+
             cx.background()
                 .spawn(async move {
+                    // Initialize database; creates the database and tables if they do not exist
+                    VectorDatabase::initialize_database().await.log_err();
                     while let Ok(indexed_file) = indexed_files_rx.recv().await {
-                        // write document to database
+                        VectorDatabase::insert_file(indexed_file).await.log_err();
                     }
                 })
                 .detach();
+
             cx.background()
                 .scoped(|scope| {
                     for _ in 0..cx.background().num_cpus() {