ollama: Support model context_size (num_ctx) >2048 (#16877)

assets/settings/default.json 🔗

@@ -895,7 +895,8 @@
       "api_url": "https://generativelanguage.googleapis.com"
     },
     "ollama": {
-      "api_url": "http://localhost:11434"
+      "api_url": "http://localhost:11434",
+      "low_speed_timeout_in_seconds": 60
     },
     "openai": {
       "version": "1",

crates/ollama/src/ollama.rs 🔗

@@ -70,11 +70,36 @@ pub struct Model {
     pub keep_alive: Option<KeepAlive>,
 }
 
+// This could be dynamically retrieved via the API (1 call per model)
+// curl -s http://localhost:11434/api/show -d '{"model": "llama3.1:latest"}' | jq '.model_info."llama.context_length"'
+fn get_max_tokens(name: &str) -> usize {
+    match name {
+        "dolphin-llama3:8b-256k" => 262144, // 256K
+        _ => match name.split(':').next().unwrap() {
+            "mistral-nemo" => 1024000,                                      // 1M
+            "deepseek-coder-v2" => 163840,                                  // 160K
+            "llama3.1" | "phi3" | "command-r" | "command-r-plus" => 131072, // 128K
+            "codeqwen" => 65536,                                            // 64K
+            "mistral" | "mistral-large" | "dolphin-mistral" | "codestral"   // 32K
+            | "mistral-openorca" | "dolphin-mixtral" | "mixstral" | "llava"
+            | "qwen" | "qwen2" | "wizardlm2" | "wizard-math" => 32768,
+            "codellama" | "stable-code" | "deepseek-coder" | "starcoder2"   // 16K
+            | "wizardcoder" => 16384,
+            "llama3" | "gemma2" | "gemma" | "codegemma" | "dolphin-llama3"  // 8K
+            | "llava-llama3" | "starcoder" | "openchat" | "aya" => 8192,
+            "llama2" | "yi" | "llama2-chinese" | "vicuna" | "nous-hermes2"  // 4K
+            | "stablelm2" => 4096,
+            "phi" | "orca-mini" | "tinyllama" | "granite-code" => 2048,     // 2K
+            _ => 2048,                                                      // 2K (default)
+        },
+    }
+}
+
 impl Model {
     pub fn new(name: &str) -> Self {
         Self {
             name: name.to_owned(),
-            max_tokens: 2048,
+            max_tokens: get_max_tokens(name),
             keep_alive: Some(KeepAlive::indefinite()),
         }
     }

ollama: Support model context_size (num_ctx) >2048 (#16877)

Change summary

Detailed changes

assets/settings/default.json 🔗

crates/ollama/src/ollama.rs 🔗