diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts
index 6ee7449..487f1d4 100644
--- a/tests/multi-agent/agent.ts
+++ b/tests/multi-agent/agent.ts
@@ -394,11 +394,15 @@ export async function generate(model: string, prompt: string, opts: {
   return text;
 }
 
-// Cloud generate — hits Ollama Cloud directly with the bearer key. Same
-// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb)
-// is discarded, only `response` is returned. Caller should budget
-// num_predict ≥ 400 so thinking-model reasoning has room before the
-// visible response starts.
+// Cloud generate — routes through the lakehouse gateway's /v1/chat
+// with provider="ollama_cloud". Phase 44 migration (2026-04-24): was
+// hitting OLLAMA_CLOUD_URL/api/generate directly with a bearer key,
+// bypassing the gateway's usage tracking + audit path. Now every call
+// flows through /v1/chat so /v1/usage accounts for it. Gateway holds
+// the OLLAMA_CLOUD_KEY; caller no longer needs it in env.
+//
+// Thinking-model budget note: num_predict ≥ 400 still matters, just
+// expressed via max_tokens on the /v1/chat request.
 export async function generateCloud(model: string, prompt: string, opts: {
   max_tokens?: number;
   temperature?: number;
@@ -406,41 +410,35 @@ export async function generateCloud(model: string, prompt: string, opts: {
   bypass_budget?: boolean;
   think?: boolean;
 } = {}): Promise<string> {
-  if (!OLLAMA_CLOUD_KEY) {
-    throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
-  }
   assertContextBudget(model, prompt, {
     system: opts.system,
     max_tokens: opts.max_tokens,
     bypass: opts.bypass_budget,
   });
+  const messages: Array<{ role: string; content: string }> = [];
+  if (opts.system) messages.push({ role: "system", content: opts.system });
+  messages.push({ role: "user", content: prompt });
   const body: Record<string, unknown> = {
     model,
-    prompt,
-    stream: false,
-    options: {
-      temperature: opts.temperature ?? 0.3,
-      num_predict: Math.max(opts.max_tokens ?? 800, 400),
-    },
+    messages,
+    provider: "ollama_cloud",
+    temperature: opts.temperature ?? 0.3,
+    max_tokens: Math.max(opts.max_tokens ?? 800, 400),
   };
-  if (opts.system) body.system = opts.system;
   if (opts.think !== undefined) body.think = opts.think;
-  const resp = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, {
+  const resp = await fetch(`${GATEWAY}/v1/chat`, {
     method: "POST",
-    headers: {
-      "Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`,
-      "Content-Type": "application/json",
-    },
+    headers: { "Content-Type": "application/json" },
     body: JSON.stringify(body),
   });
   if (!resp.ok) {
-    throw new Error(`Ollama Cloud ${resp.status}: ${await resp.text().catch(() => "?")}`);
+    throw new Error(`gateway /v1/chat ${resp.status}: ${await resp.text().catch(() => "?")}`);
   }
   const data: any = await resp.json();
-  const text = typeof data.response === "string" ? data.response : "";
+  const text = data?.choices?.[0]?.message?.content ?? "";
   // Same non-throw policy as local generate() — empty text is a valid
   // signal that thinking ate the budget. Let generateContinuable retry.
-  return text;
+  return typeof text === "string" ? text : "";
 }
 
 // --- Prompt construction ---