import os

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from .ollama import client

router = APIRouter()

# Default model when the request does not name one; overridable via env.
GEN_MODEL = os.environ.get("GEN_MODEL", "qwen2.5")


class GenerateRequest(BaseModel):
    """Body for the non-streaming text-generation endpoint."""

    prompt: str
    model: str | None = None  # falls back to GEN_MODEL when omitted
    system: str | None = None  # optional system prompt, forwarded verbatim
    temperature: float = 0.7
    max_tokens: int = 2048  # mapped to Ollama's num_predict option
    # think=false disables hidden reasoning blocks on thinking models
    # (qwen3, qwen3.5, gpt-oss). Required for hot-path JSON emitters
    # that need the whole token budget for the visible response.
    think: bool | None = None


class GenerateResponse(BaseModel):
    """Result of a generation call, with optional token accounting."""

    text: str
    model: str
    tokens_evaluated: int | None = None  # Ollama prompt_eval_count
    tokens_generated: int | None = None  # Ollama eval_count


@router.post("", response_model=GenerateResponse)
async def generate(req: GenerateRequest) -> GenerateResponse:
    """Proxy a single non-streaming generation request to Ollama.

    Builds an /api/generate payload from *req*, forwarding ``system`` and
    ``think`` only when supplied, and maps Ollama's response fields onto
    :class:`GenerateResponse`.

    Raises:
        HTTPException: 502 when Ollama returns a non-200 status.
    """
    model = req.model or GEN_MODEL
    payload = {
        "model": model,
        "prompt": req.prompt,
        "stream": False,
        "options": {
            "temperature": req.temperature,
            "num_predict": req.max_tokens,
        },
    }
    # Only include optional fields when the caller set them, so Ollama's
    # own defaults apply otherwise. NOTE(review): truthiness check means an
    # empty-string system prompt is also dropped — presumably intentional.
    if req.system:
        payload["system"] = req.system
    if req.think is not None:
        payload["think"] = req.think

    async with client() as c:
        resp = await c.post("/api/generate", json=payload)
        if resp.status_code != 200:
            # Surface upstream failures as a 502 with Ollama's error text.
            raise HTTPException(502, f"Ollama error: {resp.text}")
        data = resp.json()

    return GenerateResponse(
        text=data.get("response", ""),
        model=model,
        tokens_evaluated=data.get("prompt_eval_count"),
        tokens_generated=data.get("eval_count"),
    )