- sidecar: FastAPI app with /embed, /generate, /rerank hitting Ollama
- sidecar: Dockerfile, env var config (EMBED_MODEL, GEN_MODEL, RERANK_MODEL)
- aibridge: reqwest HTTP client with typed request/response structs
- aibridge: Axum proxy endpoints (POST /ai/embed, /ai/generate, /ai/rerank)
- gateway: wires AiClient with SIDECAR_URL env var
- e2e verified: nomic-embed-text returns 768d vectors, qwen2.5 generates text (smoke-test sketch below)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
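A minimal smoke test matching the e2e bullet above. This is a hedged sketch: the sidecar port, the /embed request/response field names, and the flat route prefixes are assumptions not shown in this commit; only the /generate schema is confirmed by the router file below.

# Hedged smoke test. Assumptions: sidecar port, /embed field names,
# and route prefixes; only /generate's schema is confirmed below.
import os

import httpx

SIDECAR_URL = os.environ.get("SIDECAR_URL", "http://localhost:8000")


def smoke_test() -> None:
    with httpx.Client(base_url=SIDECAR_URL, timeout=120) as c:
        # Assumed /embed contract: {"texts": [...]} -> {"embeddings": [[...], ...]}.
        emb = c.post("/embed", json={"texts": ["hello world"]}).json()
        assert len(emb["embeddings"][0]) == 768  # nomic-embed-text emits 768d vectors

        # /generate contract matches GenerateRequest/GenerateResponse in the router.
        gen = c.post("/generate", json={"prompt": "Say hi.", "max_tokens": 32}).json()
        assert gen["text"]


if __name__ == "__main__":
    smoke_test()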
56 lines · 1.3 KiB · Python
import os

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from .ollama import client

router = APIRouter()

# Default generation model; override per request or via the GEN_MODEL env var.
GEN_MODEL = os.environ.get("GEN_MODEL", "qwen2.5")


class GenerateRequest(BaseModel):
    prompt: str
    model: str | None = None
    system: str | None = None
    temperature: float = 0.7
    max_tokens: int = 2048


class GenerateResponse(BaseModel):
    text: str
    model: str
    tokens_evaluated: int | None = None
    tokens_generated: int | None = None


@router.post("", response_model=GenerateResponse)
async def generate(req: GenerateRequest):
    model = req.model or GEN_MODEL

    # Non-streaming Ollama /api/generate call; num_predict caps output tokens.
    payload = {
        "model": model,
        "prompt": req.prompt,
        "stream": False,
        "options": {
            "temperature": req.temperature,
            "num_predict": req.max_tokens,
        },
    }
    if req.system:
        payload["system"] = req.system

    async with client() as c:
        resp = await c.post("/api/generate", json=payload)
        if resp.status_code != 200:
            # Surface upstream failures as 502 Bad Gateway.
            raise HTTPException(502, f"Ollama error: {resp.text}")
        data = resp.json()

    return GenerateResponse(
        text=data.get("response", ""),
        model=model,
        tokens_evaluated=data.get("prompt_eval_count"),
        tokens_generated=data.get("eval_count"),
    )
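Example call against this router, assuming it is mounted under a /generate prefix (the route path is "", so the mount prefix supplies the full path) and that the sidecar listens locally; the JSON fields mirror GenerateRequest above.

# Hedged usage sketch: the base URL and /generate mount prefix are assumptions;
# request/response fields come from GenerateRequest/GenerateResponse above.
import httpx

resp = httpx.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "Explain retrieval-augmented generation in one sentence.",
        "system": "Be concise.",
        "temperature": 0.2,
        "max_tokens": 128,
    },
    timeout=120,
)
resp.raise_for_status()
body = resp.json()
print(body["model"], body["tokens_generated"])
print(body["text"])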