db.rs

  1use std::{
  2    collections::HashMap,
  3    path::{Path, PathBuf},
  4    rc::Rc,
  5    time::SystemTime,
  6};
  7
  8use anyhow::{anyhow, Result};
  9
 10use crate::IndexedFile;
 11use rpc::proto::Timestamp;
 12use rusqlite::{
 13    params,
 14    types::{FromSql, FromSqlResult, ValueRef},
 15};
 16
 17// Note this is not an appropriate document
 18#[derive(Debug)]
 19pub struct DocumentRecord {
 20    pub id: usize,
 21    pub file_id: usize,
 22    pub offset: usize,
 23    pub name: String,
 24    pub embedding: Embedding,
 25}
 26
 27#[derive(Debug)]
 28pub struct FileRecord {
 29    pub id: usize,
 30    pub relative_path: String,
 31    pub mtime: Timestamp,
 32}
 33
 34#[derive(Debug)]
 35pub struct Embedding(pub Vec<f32>);
 36
 37impl FromSql for Embedding {
 38    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
 39        let bytes = value.as_blob()?;
 40        let embedding: Result<Vec<f32>, Box<bincode::ErrorKind>> = bincode::deserialize(bytes);
 41        if embedding.is_err() {
 42            return Err(rusqlite::types::FromSqlError::Other(embedding.unwrap_err()));
 43        }
 44        return Ok(Embedding(embedding.unwrap()));
 45    }
 46}
 47
 48pub struct VectorDatabase {
 49    db: rusqlite::Connection,
 50}
 51
 52impl VectorDatabase {
 53    pub fn new(path: String) -> Result<Self> {
 54        let this = Self {
 55            db: rusqlite::Connection::open(path)?,
 56        };
 57        this.initialize_database()?;
 58        Ok(this)
 59    }
 60
 61    fn initialize_database(&self) -> Result<()> {
 62        rusqlite::vtab::array::load_module(&self.db)?;
 63
 64        // This will create the database if it doesnt exist
 65
 66        // Initialize Vector Databasing Tables
 67        self.db.execute(
 68            "CREATE TABLE IF NOT EXISTS worktrees (
 69                id INTEGER PRIMARY KEY AUTOINCREMENT,
 70                absolute_path VARCHAR NOT NULL
 71            );
 72            CREATE UNIQUE INDEX IF NOT EXISTS worktrees_absolute_path ON worktrees (absolute_path);
 73            ",
 74            [],
 75        )?;
 76
 77        self.db.execute(
 78            "CREATE TABLE IF NOT EXISTS files (
 79                id INTEGER PRIMARY KEY AUTOINCREMENT,
 80                worktree_id INTEGER NOT NULL,
 81                relative_path VARCHAR NOT NULL,
 82                mtime_seconds INTEGER NOT NULL,
 83                mtime_nanos INTEGER NOT NULL,
 84                FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
 85            )",
 86            [],
 87        )?;
 88
 89        self.db.execute(
 90            "CREATE TABLE IF NOT EXISTS documents (
 91                id INTEGER PRIMARY KEY AUTOINCREMENT,
 92                file_id INTEGER NOT NULL,
 93                offset INTEGER NOT NULL,
 94                name VARCHAR NOT NULL,
 95                embedding BLOB NOT NULL,
 96                FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
 97            )",
 98            [],
 99        )?;
100
101        Ok(())
102    }
103
104    pub fn delete_file(&self, worktree_id: i64, delete_path: PathBuf) -> Result<()> {
105        self.db.execute(
106            "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2",
107            params![worktree_id, delete_path.to_str()],
108        )?;
109        Ok(())
110    }
111
112    pub fn insert_file(&self, worktree_id: i64, indexed_file: IndexedFile) -> Result<()> {
113        // Write to files table, and return generated id.
114        self.db.execute(
115            "
116            DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
117            ",
118            params![worktree_id, indexed_file.path.to_str()],
119        )?;
120        let mtime = Timestamp::from(indexed_file.mtime);
121        self.db.execute(
122            "
123            INSERT INTO files
124            (worktree_id, relative_path, mtime_seconds, mtime_nanos)
125            VALUES
126            (?1, ?2, $3, $4);
127            ",
128            params![
129                worktree_id,
130                indexed_file.path.to_str(),
131                mtime.seconds,
132                mtime.nanos
133            ],
134        )?;
135
136        let file_id = self.db.last_insert_rowid();
137
138        // Currently inserting at approximately 3400 documents a second
139        // I imagine we can speed this up with a bulk insert of some kind.
140        for document in indexed_file.documents {
141            let embedding_blob = bincode::serialize(&document.embedding)?;
142
143            self.db.execute(
144                "INSERT INTO documents (file_id, offset, name, embedding) VALUES (?1, ?2, ?3, ?4)",
145                params![
146                    file_id,
147                    document.offset.to_string(),
148                    document.name,
149                    embedding_blob
150                ],
151            )?;
152        }
153
154        Ok(())
155    }
156
157    pub fn find_or_create_worktree(&self, worktree_root_path: &Path) -> Result<i64> {
158        // Check that the absolute path doesnt exist
159        let mut worktree_query = self
160            .db
161            .prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
162
163        let worktree_id = worktree_query
164            .query_row(params![worktree_root_path.to_string_lossy()], |row| {
165                Ok(row.get::<_, i64>(0)?)
166            })
167            .map_err(|err| anyhow!(err));
168
169        if worktree_id.is_ok() {
170            return worktree_id;
171        }
172
173        // If worktree_id is Err, insert new worktree
174        self.db.execute(
175            "
176            INSERT into worktrees (absolute_path) VALUES (?1)
177            ",
178            params![worktree_root_path.to_string_lossy()],
179        )?;
180        Ok(self.db.last_insert_rowid())
181    }
182
183    pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
184        let mut statement = self.db.prepare(
185            "
186            SELECT relative_path, mtime_seconds, mtime_nanos
187            FROM files
188            WHERE worktree_id = ?1
189            ORDER BY relative_path",
190        )?;
191        let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
192        for row in statement.query_map(params![worktree_id], |row| {
193            Ok((
194                row.get::<_, String>(0)?.into(),
195                Timestamp {
196                    seconds: row.get(1)?,
197                    nanos: row.get(2)?,
198                }
199                .into(),
200            ))
201        })? {
202            let row = row?;
203            result.insert(row.0, row.1);
204        }
205        Ok(result)
206    }
207
208    pub fn for_each_document(
209        &self,
210        worktree_ids: &[i64],
211        mut f: impl FnMut(i64, Embedding),
212    ) -> Result<()> {
213        let mut query_statement = self.db.prepare(
214            "
215            SELECT
216                documents.id, documents.embedding
217            FROM
218                documents, files
219            WHERE
220                documents.file_id = files.id AND
221                files.worktree_id IN rarray(?)
222            ",
223        )?;
224        query_statement
225            .query_map(params![ids_to_sql(worktree_ids)], |row| {
226                Ok((row.get(0)?, row.get(1)?))
227            })?
228            .filter_map(|row| row.ok())
229            .for_each(|row| f(row.0, row.1));
230        Ok(())
231    }
232
233    pub fn get_documents_by_ids(&self, ids: &[i64]) -> Result<Vec<(i64, PathBuf, usize, String)>> {
234        let mut statement = self.db.prepare(
235            "
236                SELECT
237                    documents.id, files.worktree_id, files.relative_path, documents.offset, documents.name
238                FROM
239                    documents, files
240                WHERE
241                    documents.file_id = files.id AND
242                    documents.id in rarray(?)
243            ",
244        )?;
245
246        let result_iter = statement.query_map(params![ids_to_sql(ids)], |row| {
247            Ok((
248                row.get::<_, i64>(0)?,
249                row.get::<_, i64>(1)?,
250                row.get::<_, String>(2)?.into(),
251                row.get(3)?,
252                row.get(4)?,
253            ))
254        })?;
255
256        let mut values_by_id = HashMap::<i64, (i64, PathBuf, usize, String)>::default();
257        for row in result_iter {
258            let (id, worktree_id, path, offset, name) = row?;
259            values_by_id.insert(id, (worktree_id, path, offset, name));
260        }
261
262        let mut results = Vec::with_capacity(ids.len());
263        for id in ids {
264            let value = values_by_id
265                .remove(id)
266                .ok_or(anyhow!("missing document id {}", id))?;
267            results.push(value);
268        }
269
270        Ok(results)
271    }
272}
273
274fn ids_to_sql(ids: &[i64]) -> Rc<Vec<rusqlite::types::Value>> {
275    Rc::new(
276        ids.iter()
277            .copied()
278            .map(|v| rusqlite::types::Value::from(v))
279            .collect::<Vec<_>>(),
280    )
281}