lakehouse/sidecar/sidecar/generate.py
root 239e471223 Phase 3: AI integration with Ollama via Python sidecar
- sidecar: FastAPI app with /embed, /generate, /rerank hitting Ollama
- sidecar: Dockerfile, env var config (EMBED_MODEL, GEN_MODEL, RERANK_MODEL)
- aibridge: reqwest HTTP client with typed request/response structs
- aibridge: Axum proxy endpoints (POST /ai/embed, /ai/generate, /ai/rerank)
- gateway: wires AiClient with SIDECAR_URL env var
- e2e verified: nomic-embed-text returns 768d vectors, qwen2.5 generates text

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 05:53:56 -05:00

56 lines
1.3 KiB
Python

# Text-generation route: proxies non-streaming generation requests to a
# local Ollama server via the shared HTTP client in .ollama.
import os
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from .ollama import client

router = APIRouter()
# Default model for generation; override with the GEN_MODEL env var.
GEN_MODEL = os.environ.get("GEN_MODEL", "qwen2.5")
class GenerateRequest(BaseModel):
    """Request body for the generation endpoint."""

    # Prompt text forwarded to Ollama's "prompt" field.
    prompt: str
    # Optional model override; falls back to GEN_MODEL when omitted.
    model: str | None = None
    # Optional system prompt, forwarded as Ollama's "system" field when set.
    system: str | None = None
    # Sampling temperature, passed through as the "temperature" option.
    temperature: float = 0.7
    # Generation length cap, mapped to Ollama's "num_predict" option.
    max_tokens: int = 2048
class GenerateResponse(BaseModel):
    """Response body for the generation endpoint."""

    # Generated completion text (Ollama's "response" field; "" if absent).
    text: str
    # Name of the model that handled the request.
    model: str
    # Prompt token count from Ollama's "prompt_eval_count", if reported.
    tokens_evaluated: int | None = None
    # Completion token count from Ollama's "eval_count", if reported.
    tokens_generated: int | None = None
@router.post("", response_model=GenerateResponse)
async def generate(req: GenerateRequest):
model = req.model or GEN_MODEL
payload = {
"model": model,
"prompt": req.prompt,
"stream": False,
"options": {
"temperature": req.temperature,
"num_predict": req.max_tokens,
},
}
if req.system:
payload["system"] = req.system
async with client() as c:
resp = await c.post("/api/generate", json=payload)
if resp.status_code != 200:
raise HTTPException(502, f"Ollama error: {resp.text}")
data = resp.json()
return GenerateResponse(
text=data.get("response", ""),
model=model,
tokens_evaluated=data.get("prompt_eval_count"),
tokens_generated=data.get("eval_count"),
)