From 59379c624dcd94e823f7fabf5e00bf4bfcf83cb7 Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 26 Mar 2026 02:29:11 -0500
Subject: [PATCH] Fix Ollama timeout: set num_ctx dynamically, truncate
 oversized prompts

Root cause: query_ollama() sent no num_ctx option, so Ollama defaulted to
2048 tokens. Research mode with 15 questions builds prompts that exceed
model context windows, causing Ollama to hang until the 300s timeout.

Fix:
- Calculate num_ctx from prompt size + 1024 token response buffer
- Cap at model's actual context limit
- Truncate prompts that exceed context window minus 512 response tokens
- Uses smart_truncate() to preserve start + end of prompt
- Updated MODEL_CONTEXT map with accurate limits for all local models

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 llm_team_ui.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llm_team_ui.py b/llm_team_ui.py
index 8c98c9d..38b25bb 100644
--- a/llm_team_ui.py
+++ b/llm_team_ui.py
@@ -2751,8 +2751,16 @@ def _get_timeout(model_id):
 def query_ollama(model, prompt, timeout):
     cfg = load_config()
     base = cfg["providers"]["ollama"].get("base_url", "http://localhost:11434")
+    # Set num_ctx based on prompt size — Ollama defaults to 2048 which is too small
+    prompt_tokens = estimate_tokens(prompt)
+    ctx_limit = get_context_limit(model)
+    num_ctx = min(max(prompt_tokens + 1024, 2048), ctx_limit)
+    # Truncate prompt if it exceeds the model's context window
+    if prompt_tokens > ctx_limit - 512:
+        prompt = smart_truncate(prompt, ctx_limit - 512)
     resp = requests.post(f"{base}/api/generate", json={
         "model": model, "prompt": prompt, "stream": False,
+        "options": {"num_ctx": num_ctx}
     }, timeout=timeout)
     resp.raise_for_status()
     return resp.json()["response"]
@@ -2803,7 +2811,9 @@ def query_model(model_id, prompt):
 
 # Context window sizes (tokens) — conservative estimates for safe prompting
 MODEL_CONTEXT = {
-    "llama3.2": 4096, "mistral": 8192, "gemma2": 8192, "qwen2.5": 8192,
+    "llama3.2": 4096, "llama3.1": 8192, "llama3": 8192,
+    "mistral": 8192, "gemma2": 8192, "gemma3": 32768,
+    "qwen2.5": 8192, "qwen3": 32768, "gpt-oss": 4096,
     "gpt-4o": 128000, "gpt-4o-mini": 128000,
     "claude-3": 200000, "claude-sonnet": 200000, "claude-haiku": 200000,
 }