1use std::{
2 collections::HashMap,
3 path::{Path, PathBuf},
4 rc::Rc,
5 time::SystemTime,
6};
7
8use anyhow::{anyhow, Result};
9
10use crate::IndexedFile;
11use rpc::proto::Timestamp;
12use rusqlite::{
13 params,
14 types::{FromSql, FromSqlResult, ValueRef},
15};
16
// NOTE(review): the original comment said "this is not an appropriate document" —
// presumably meaning this record is a flat row view, not a full document
// representation. Confirm intent with the author.
/// A row from the `documents` table: one named, embedded chunk of a file.
#[derive(Debug)]
pub struct DocumentRecord {
    // Primary key of the `documents` row.
    pub id: usize,
    // Foreign key into the `files` table.
    pub file_id: usize,
    // Byte/character offset of this document within its file — unit not
    // established by this file; confirm against the indexer that writes it.
    pub offset: usize,
    // Display name stored in the `name` column.
    pub name: String,
    // Embedding vector, deserialized from the BLOB column via `FromSql`.
    pub embedding: Embedding,
}
26
/// A row from the `files` table (excluding `worktree_id`).
#[derive(Debug)]
pub struct FileRecord {
    // Primary key of the `files` row.
    pub id: usize,
    // Path relative to the worktree root, as stored in `relative_path`.
    pub relative_path: String,
    // File modification time, reassembled from the `mtime_seconds` /
    // `mtime_nanos` columns.
    pub mtime: Timestamp,
}
33
/// A dense embedding vector. Persisted in SQLite as a bincode-serialized
/// `Vec<f32>` BLOB (written by `insert_file`, read back via `FromSql`).
#[derive(Debug)]
pub struct Embedding(pub Vec<f32>);
36
37impl FromSql for Embedding {
38 fn column_result(value: ValueRef) -> FromSqlResult<Self> {
39 let bytes = value.as_blob()?;
40 let embedding: Result<Vec<f32>, Box<bincode::ErrorKind>> = bincode::deserialize(bytes);
41 if embedding.is_err() {
42 return Err(rusqlite::types::FromSqlError::Other(embedding.unwrap_err()));
43 }
44 return Ok(Embedding(embedding.unwrap()));
45 }
46}
47
/// Owns the SQLite connection that stores worktrees, files, and their
/// per-document embeddings. All queries in this file go through this handle.
pub struct VectorDatabase {
    db: rusqlite::Connection,
}
51
52impl VectorDatabase {
53 pub fn new(path: String) -> Result<Self> {
54 let this = Self {
55 db: rusqlite::Connection::open(path)?,
56 };
57 this.initialize_database()?;
58 Ok(this)
59 }
60
61 fn initialize_database(&self) -> Result<()> {
62 rusqlite::vtab::array::load_module(&self.db)?;
63
64 // This will create the database if it doesnt exist
65
66 // Initialize Vector Databasing Tables
67 self.db.execute(
68 "CREATE TABLE IF NOT EXISTS worktrees (
69 id INTEGER PRIMARY KEY AUTOINCREMENT,
70 absolute_path VARCHAR NOT NULL
71 );
72 CREATE UNIQUE INDEX IF NOT EXISTS worktrees_absolute_path ON worktrees (absolute_path);
73 ",
74 [],
75 )?;
76
77 self.db.execute(
78 "CREATE TABLE IF NOT EXISTS files (
79 id INTEGER PRIMARY KEY AUTOINCREMENT,
80 worktree_id INTEGER NOT NULL,
81 relative_path VARCHAR NOT NULL,
82 mtime_seconds INTEGER NOT NULL,
83 mtime_nanos INTEGER NOT NULL,
84 FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
85 )",
86 [],
87 )?;
88
89 self.db.execute(
90 "CREATE TABLE IF NOT EXISTS documents (
91 id INTEGER PRIMARY KEY AUTOINCREMENT,
92 file_id INTEGER NOT NULL,
93 offset INTEGER NOT NULL,
94 name VARCHAR NOT NULL,
95 embedding BLOB NOT NULL,
96 FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
97 )",
98 [],
99 )?;
100
101 Ok(())
102 }
103
104 pub fn delete_file(&self, worktree_id: i64, delete_path: PathBuf) -> Result<()> {
105 self.db.execute(
106 "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2",
107 params![worktree_id, delete_path.to_str()],
108 )?;
109 Ok(())
110 }
111
112 pub fn insert_file(&self, worktree_id: i64, indexed_file: IndexedFile) -> Result<()> {
113 // Write to files table, and return generated id.
114 self.db.execute(
115 "
116 DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
117 ",
118 params![worktree_id, indexed_file.path.to_str()],
119 )?;
120 let mtime = Timestamp::from(indexed_file.mtime);
121 self.db.execute(
122 "
123 INSERT INTO files
124 (worktree_id, relative_path, mtime_seconds, mtime_nanos)
125 VALUES
126 (?1, ?2, $3, $4);
127 ",
128 params![
129 worktree_id,
130 indexed_file.path.to_str(),
131 mtime.seconds,
132 mtime.nanos
133 ],
134 )?;
135
136 let file_id = self.db.last_insert_rowid();
137
138 // Currently inserting at approximately 3400 documents a second
139 // I imagine we can speed this up with a bulk insert of some kind.
140 for document in indexed_file.documents {
141 let embedding_blob = bincode::serialize(&document.embedding)?;
142
143 self.db.execute(
144 "INSERT INTO documents (file_id, offset, name, embedding) VALUES (?1, ?2, ?3, ?4)",
145 params![
146 file_id,
147 document.offset.to_string(),
148 document.name,
149 embedding_blob
150 ],
151 )?;
152 }
153
154 Ok(())
155 }
156
157 pub fn find_or_create_worktree(&self, worktree_root_path: &Path) -> Result<i64> {
158 // Check that the absolute path doesnt exist
159 let mut worktree_query = self
160 .db
161 .prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
162
163 let worktree_id = worktree_query
164 .query_row(params![worktree_root_path.to_string_lossy()], |row| {
165 Ok(row.get::<_, i64>(0)?)
166 })
167 .map_err(|err| anyhow!(err));
168
169 if worktree_id.is_ok() {
170 return worktree_id;
171 }
172
173 // If worktree_id is Err, insert new worktree
174 self.db.execute(
175 "
176 INSERT into worktrees (absolute_path) VALUES (?1)
177 ",
178 params![worktree_root_path.to_string_lossy()],
179 )?;
180 Ok(self.db.last_insert_rowid())
181 }
182
183 pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
184 let mut statement = self.db.prepare(
185 "
186 SELECT relative_path, mtime_seconds, mtime_nanos
187 FROM files
188 WHERE worktree_id = ?1
189 ORDER BY relative_path",
190 )?;
191 let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
192 for row in statement.query_map(params![worktree_id], |row| {
193 Ok((
194 row.get::<_, String>(0)?.into(),
195 Timestamp {
196 seconds: row.get(1)?,
197 nanos: row.get(2)?,
198 }
199 .into(),
200 ))
201 })? {
202 let row = row?;
203 result.insert(row.0, row.1);
204 }
205 Ok(result)
206 }
207
208 pub fn for_each_document(
209 &self,
210 worktree_ids: &[i64],
211 mut f: impl FnMut(i64, Embedding),
212 ) -> Result<()> {
213 let mut query_statement = self.db.prepare(
214 "
215 SELECT
216 documents.id, documents.embedding
217 FROM
218 documents, files
219 WHERE
220 documents.file_id = files.id AND
221 files.worktree_id IN rarray(?)
222 ",
223 )?;
224 query_statement
225 .query_map(params![ids_to_sql(worktree_ids)], |row| {
226 Ok((row.get(0)?, row.get(1)?))
227 })?
228 .filter_map(|row| row.ok())
229 .for_each(|row| f(row.0, row.1));
230 Ok(())
231 }
232
233 pub fn get_documents_by_ids(&self, ids: &[i64]) -> Result<Vec<(i64, PathBuf, usize, String)>> {
234 let mut statement = self.db.prepare(
235 "
236 SELECT
237 documents.id, files.worktree_id, files.relative_path, documents.offset, documents.name
238 FROM
239 documents, files
240 WHERE
241 documents.file_id = files.id AND
242 documents.id in rarray(?)
243 ",
244 )?;
245
246 let result_iter = statement.query_map(params![ids_to_sql(ids)], |row| {
247 Ok((
248 row.get::<_, i64>(0)?,
249 row.get::<_, i64>(1)?,
250 row.get::<_, String>(2)?.into(),
251 row.get(3)?,
252 row.get(4)?,
253 ))
254 })?;
255
256 let mut values_by_id = HashMap::<i64, (i64, PathBuf, usize, String)>::default();
257 for row in result_iter {
258 let (id, worktree_id, path, offset, name) = row?;
259 values_by_id.insert(id, (worktree_id, path, offset, name));
260 }
261
262 let mut results = Vec::with_capacity(ids.len());
263 for id in ids {
264 let value = values_by_id
265 .remove(id)
266 .ok_or(anyhow!("missing document id {}", id))?;
267 results.push(value);
268 }
269
270 Ok(results)
271 }
272}
273
274fn ids_to_sql(ids: &[i64]) -> Rc<Vec<rusqlite::types::Value>> {
275 Rc::new(
276 ids.iter()
277 .copied()
278 .map(|v| rusqlite::types::Value::from(v))
279 .collect::<Vec<_>>(),
280 )
281}