From 17237f748ce984a6285fe91ace515ba5830e4916 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 6 Sep 2023 15:09:15 -0400 Subject: [PATCH] update token_count for OpenAIEmbeddings to accommodate for truncation --- crates/semantic_index/src/embedding.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 97c25ca170f0c6164485532291347e12e6dc0864..8140e244bda131f59bad5d1e67d594765e3588e6 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -181,18 +181,17 @@ impl EmbeddingProvider for OpenAIEmbeddings { fn truncate(&self, span: &str) -> (String, usize) { let mut tokens = OPENAI_BPE_TOKENIZER.encode_with_special_tokens(span); - let token_count = tokens.len(); - let output = if token_count > OPENAI_INPUT_LIMIT { + let output = if tokens.len() > OPENAI_INPUT_LIMIT { tokens.truncate(OPENAI_INPUT_LIMIT); OPENAI_BPE_TOKENIZER - .decode(tokens) + .decode(tokens.clone()) .ok() .unwrap_or_else(|| span.to_string()) } else { span.to_string() }; - (output, token_count) + (output, tokens.len()) } async fn embed_batch(&self, spans: Vec) -> Result> {