diff --git a/llm_team_ui.py b/llm_team_ui.py
index 8c98c9d..38b25bb 100644
--- a/llm_team_ui.py
+++ b/llm_team_ui.py
@@ -2751,8 +2751,16 @@ def _get_timeout(model_id):
 def query_ollama(model, prompt, timeout):
     cfg = load_config()
     base = cfg["providers"]["ollama"].get("base_url", "http://localhost:11434")
+    # Set num_ctx based on prompt size — Ollama defaults to 2048 which is too small
+    prompt_tokens = estimate_tokens(prompt)
+    ctx_limit = get_context_limit(model)
+    num_ctx = min(max(prompt_tokens + 1024, 2048), ctx_limit)
+    # Truncate prompt if it exceeds the model's context window
+    if prompt_tokens > ctx_limit - 512:
+        prompt = smart_truncate(prompt, ctx_limit - 512)
     resp = requests.post(f"{base}/api/generate", json={
         "model": model, "prompt": prompt, "stream": False,
+        "options": {"num_ctx": num_ctx}
     }, timeout=timeout)
     resp.raise_for_status()
     return resp.json()["response"]
@@ -2803,7 +2811,9 @@ def query_model(model_id, prompt):
 
 # Context window sizes (tokens) — conservative estimates for safe prompting
 MODEL_CONTEXT = {
-    "llama3.2": 4096, "mistral": 8192, "gemma2": 8192, "qwen2.5": 8192,
+    "llama3.2": 4096, "llama3.1": 8192, "llama3": 8192,
+    "mistral": 8192, "gemma2": 8192, "gemma3": 32768,
+    "qwen2.5": 8192, "qwen3": 32768, "gpt-oss": 4096,
     "gpt-4o": 128000, "gpt-4o-mini": 128000,
     "claude-3": 200000, "claude-sonnet": 200000, "claude-haiku": 200000,
 }
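
The patch calls three helpers that are not shown in the hunks: estimate_tokens, get_context_limit, and smart_truncate. A minimal sketch of how they might work is below, assuming a rough 4-characters-per-token estimate, longest-prefix lookup into MODEL_CONTEXT, and head/tail truncation; the names come from the patch, but these implementations are assumptions for illustration, not the repository's actual code.

# Sketch only: the helpers are referenced but not defined in the diff above.
# The heuristics (chars-per-token ratio, head/tail split) are assumptions.

MODEL_CONTEXT = {
    "llama3.2": 4096, "llama3.1": 8192, "llama3": 8192,
    "mistral": 8192, "gemma2": 8192, "gemma3": 32768,
    "qwen2.5": 8192, "qwen3": 32768, "gpt-oss": 4096,
    "gpt-4o": 128000, "gpt-4o-mini": 128000,
    "claude-3": 200000, "claude-sonnet": 200000, "claude-haiku": 200000,
}

DEFAULT_CONTEXT = 4096  # hypothetical fallback for models not in the table


def estimate_tokens(text: str) -> int:
    """Rough token count: about 4 characters per token for English-like text."""
    return max(1, len(text) // 4)


def get_context_limit(model: str) -> int:
    """Look up the context window by the longest matching key prefix,
    so "llama3.2:3b" resolves to "llama3.2" rather than "llama3"."""
    best = ""
    for key in MODEL_CONTEXT:
        if model.startswith(key) and len(key) > len(best):
            best = key
    return MODEL_CONTEXT[best] if best else DEFAULT_CONTEXT


def smart_truncate(text: str, max_tokens: int) -> str:
    """Keep the head and tail of the prompt and drop the middle, so the
    opening instructions and the most recent content both survive."""
    max_chars = max_tokens * 4  # invert the 4-chars-per-token estimate
    if len(text) <= max_chars:
        return text
    head = text[: max_chars // 2]
    tail = text[-(max_chars // 2):]
    return head + "\n...[truncated]...\n" + tail

With helpers along these lines, a prompt of ~3000 estimated tokens sent to "llama3.2" would request num_ctx = min(3000 + 1024, 4096) = 4096 and be truncated to fit under ctx_limit - 512, while a short prompt keeps the 2048 floor.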