From 021c1b557f602fc9c5bec78798667220b7a50bcf Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 24 Apr 2026 13:27:54 -0500
Subject: [PATCH] agent.ts: route generateCloud through /v1/chat (Phase 44 migration)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 44 PRD (docs/CONTROL_PLANE_PRD.md:204) explicitly lists
`tests/multi-agent/agent.ts::generate()` as a migration target: every
internal LLM caller must flow through /v1/chat so usage accounting +
audit trail see all traffic.

generateCloud() was bypassing the gateway entirely — direct POST to
OLLAMA_CLOUD_URL/api/generate with the bearer key. This meant:

- /v1/usage missed every agent.ts cloud call
- No gateway-side caching, rate-limiting, or cost gating
- Callers needed OLLAMA_CLOUD_KEY in env (leak risk; gateway already
  owns the key)

Migration:

- Endpoint: OLLAMA_CLOUD_URL/api/generate → GATEWAY/v1/chat
- Body shape: {prompt,options.num_predict,options.temperature} →
  OpenAI-compatible {messages[],temperature,max_tokens}
- provider: "ollama_cloud" explicit in the request
- Response extraction: data.response → data.choices[0].message.content
- OLLAMA_CLOUD_KEY no longer required in agent.ts env

Phase 44 gate verified: `grep localhost:3200/generate|/api/generate`
now only hits (a) the ollama_cloud.rs adapter itself (legit — it's the
gateway-side direct caller) and (b) this comment explaining the
migration history. Zero non-adapter code paths to /api/generate.

generate() (local Ollama) still goes direct to :3200 — that's the
t1_hot path. Phase 44 PRD focuses on cloud callers; hot-path local
generation deliberately stays direct for latency.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/multi-agent/agent.ts | 44 ++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts
index 6ee7449..487f1d4 100644
--- a/tests/multi-agent/agent.ts
+++ b/tests/multi-agent/agent.ts
@@ -394,11 +394,15 @@ export async function generate(model: string, prompt: string, opts: {
   return text;
 }
 
-// Cloud generate — hits Ollama Cloud directly with the bearer key. Same
-// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb)
-// is discarded, only `response` is returned. Caller should budget
-// num_predict ≥ 400 so thinking-model reasoning has room before the
-// visible response starts.
+// Cloud generate — routes through the lakehouse gateway's /v1/chat
+// with provider="ollama_cloud". Phase 44 migration (2026-04-24): was
+// hitting OLLAMA_CLOUD_URL/api/generate directly with a bearer key,
+// bypassing the gateway's usage tracking + audit path. Now every call
+// flows through /v1/chat so /v1/usage accounts for it. Gateway holds
+// the OLLAMA_CLOUD_KEY; caller no longer needs it in env.
+//
+// Thinking-model budget note: num_predict ≥ 400 still matters, just
+// expressed via max_tokens on the /v1/chat request.
 export async function generateCloud(model: string, prompt: string, opts: {
   max_tokens?: number;
   temperature?: number;
@@ -406,41 +410,35 @@ export async function generateCloud(model: string, prompt: string, opts: {
   bypass_budget?: boolean;
   think?: boolean;
 } = {}): Promise<string> {
-  if (!OLLAMA_CLOUD_KEY) {
-    throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
-  }
   assertContextBudget(model, prompt, {
     system: opts.system,
     max_tokens: opts.max_tokens,
     bypass: opts.bypass_budget,
   });
+  const messages: Array<{ role: string; content: string }> = [];
+  if (opts.system) messages.push({ role: "system", content: opts.system });
+  messages.push({ role: "user", content: prompt });
   const body: Record<string, unknown> = {
     model,
-    prompt,
-    stream: false,
-    options: {
-      temperature: opts.temperature ?? 0.3,
-      num_predict: Math.max(opts.max_tokens ?? 800, 400),
-    },
+    messages,
+    provider: "ollama_cloud",
+    temperature: opts.temperature ?? 0.3,
+    max_tokens: Math.max(opts.max_tokens ?? 800, 400),
   };
-  if (opts.system) body.system = opts.system;
   if (opts.think !== undefined) body.think = opts.think;
-  const resp = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, {
+  const resp = await fetch(`${GATEWAY}/v1/chat`, {
     method: "POST",
-    headers: {
-      "Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`,
-      "Content-Type": "application/json",
-    },
+    headers: { "Content-Type": "application/json" },
     body: JSON.stringify(body),
   });
   if (!resp.ok) {
-    throw new Error(`Ollama Cloud ${resp.status}: ${await resp.text().catch(() => "?")}`);
+    throw new Error(`gateway /v1/chat ${resp.status}: ${await resp.text().catch(() => "?")}`);
   }
   const data: any = await resp.json();
-  const text = typeof data.response === "string" ? data.response : "";
+  const text = data?.choices?.[0]?.message?.content ?? "";
   // Same non-throw policy as local generate() — empty text is a valid
   // signal that thinking ate the budget. Let generateContinuable retry.
-  return text;
+  return typeof text === "string" ? text : "";
 }
 
 // --- Prompt construction ---
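
For reference, the migrated call shape in isolation: a minimal, standalone
TypeScript sketch of the new /v1/chat request body and response extraction
described above. The gateway URL default and the helper name
generateCloudChatSketch are illustrative assumptions, not taken from the repo;
the real agent.ts keeps its own GATEWAY constant, assertContextBudget() check,
and `think` passthrough.

// Standalone sketch (Node 18+ / Deno / Bun global fetch). Shapes mirror the
// patch; the gateway URL below is an assumed placeholder, adjust as needed.
const GATEWAY = "http://localhost:8080";

async function generateCloudChatSketch(
  model: string,
  prompt: string,
  opts: { max_tokens?: number; temperature?: number; system?: string } = {},
): Promise<string> {
  // Old /api/generate body ({prompt, options.num_predict, options.temperature})
  // becomes an OpenAI-compatible body: messages[], temperature, max_tokens.
  const messages: Array<{ role: string; content: string }> = [];
  if (opts.system) messages.push({ role: "system", content: opts.system });
  messages.push({ role: "user", content: prompt });

  const resp = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model,
      messages,
      provider: "ollama_cloud", // route explicitly to the cloud adapter
      temperature: opts.temperature ?? 0.3,
      // Keep the thinking-model floor: never request fewer than 400 tokens.
      max_tokens: Math.max(opts.max_tokens ?? 800, 400),
    }),
  });
  if (!resp.ok) {
    throw new Error(`gateway /v1/chat ${resp.status}: ${await resp.text().catch(() => "?")}`);
  }

  // OpenAI-style extraction; an empty string is a valid "thinking ate the budget" result.
  const data: any = await resp.json();
  const text = data?.choices?.[0]?.message?.content;
  return typeof text === "string" ? text : "";
}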