agent.ts: route generateCloud through /v1/chat (Phase 44 migration)
Phase 44 PRD (docs/CONTROL_PLANE_PRD.md:204) explicitly lists
`tests/multi-agent/agent.ts::generate()` as a migration target:
every internal LLM caller must flow through /v1/chat so usage
accounting + audit trail see all traffic.
generateCloud() was bypassing the gateway entirely — direct POST to
OLLAMA_CLOUD_URL/api/generate with the bearer key. This meant:
- /v1/usage missed every agent.ts cloud call
- No gateway-side caching, rate-limiting, or cost gating
- Callers needed OLLAMA_CLOUD_KEY in env (leak risk; gateway
already owns the key)
Migration:
- Endpoint: OLLAMA_CLOUD_URL/api/generate → GATEWAY/v1/chat
- Body shape: {prompt,options.num_predict,options.temperature} →
OpenAI-compatible {messages[],temperature,max_tokens}
- provider: "ollama_cloud" explicit in the request
- Response extraction: data.response → data.choices[0].message.content
- OLLAMA_CLOUD_KEY no longer required in agent.ts env
Phase 44 gate verified: `grep -E 'localhost:3200/generate|/api/generate'`
now only hits (a) the ollama_cloud.rs adapter itself (legit — it's
the gateway-side direct caller) and (b) the comment above generateCloud()
explaining the migration history. Zero non-adapter code paths to /api/generate.
generate() (local Ollama) still goes direct to :3200 — that's the
t1_hot path. Phase 44 PRD focuses on cloud callers; hot-path local
generation deliberately stays direct for latency.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
049a4b69fb
commit
021c1b557f
@ -394,11 +394,15 @@ export async function generate(model: string, prompt: string, opts: {
|
|||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cloud generate — hits Ollama Cloud directly with the bearer key. Same
|
// Cloud generate — routes through the lakehouse gateway's /v1/chat
|
||||||
// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb)
|
// with provider="ollama_cloud". Phase 44 migration (2026-04-24): was
|
||||||
// is discarded, only `response` is returned. Caller should budget
|
// hitting OLLAMA_CLOUD_URL/api/generate directly with a bearer key,
|
||||||
// num_predict ≥ 400 so thinking-model reasoning has room before the
|
// bypassing the gateway's usage tracking + audit path. Now every call
|
||||||
// visible response starts.
|
// flows through /v1/chat so /v1/usage accounts for it. Gateway holds
|
||||||
|
// the OLLAMA_CLOUD_KEY; caller no longer needs it in env.
|
||||||
|
//
|
||||||
|
// Thinking-model budget note: the ≥ 400-token floor (formerly num_predict)
|
||||||
|
// still applies; it is now sent as max_tokens on the /v1/chat request.
|
||||||
export async function generateCloud(model: string, prompt: string, opts: {
|
export async function generateCloud(model: string, prompt: string, opts: {
|
||||||
max_tokens?: number;
|
max_tokens?: number;
|
||||||
temperature?: number;
|
temperature?: number;
|
||||||
@ -406,41 +410,35 @@ export async function generateCloud(model: string, prompt: string, opts: {
|
|||||||
bypass_budget?: boolean;
|
bypass_budget?: boolean;
|
||||||
think?: boolean;
|
think?: boolean;
|
||||||
} = {}): Promise<string> {
|
} = {}): Promise<string> {
|
||||||
if (!OLLAMA_CLOUD_KEY) {
|
|
||||||
throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
|
|
||||||
}
|
|
||||||
assertContextBudget(model, prompt, {
|
assertContextBudget(model, prompt, {
|
||||||
system: opts.system,
|
system: opts.system,
|
||||||
max_tokens: opts.max_tokens,
|
max_tokens: opts.max_tokens,
|
||||||
bypass: opts.bypass_budget,
|
bypass: opts.bypass_budget,
|
||||||
});
|
});
|
||||||
|
const messages: Array<{ role: string; content: string }> = [];
|
||||||
|
if (opts.system) messages.push({ role: "system", content: opts.system });
|
||||||
|
messages.push({ role: "user", content: prompt });
|
||||||
const body: Record<string, any> = {
|
const body: Record<string, any> = {
|
||||||
model,
|
model,
|
||||||
prompt,
|
messages,
|
||||||
stream: false,
|
provider: "ollama_cloud",
|
||||||
options: {
|
|
||||||
temperature: opts.temperature ?? 0.3,
|
temperature: opts.temperature ?? 0.3,
|
||||||
num_predict: Math.max(opts.max_tokens ?? 800, 400),
|
max_tokens: Math.max(opts.max_tokens ?? 800, 400),
|
||||||
},
|
|
||||||
};
|
};
|
||||||
if (opts.system) body.system = opts.system;
|
|
||||||
if (opts.think !== undefined) body.think = opts.think;
|
if (opts.think !== undefined) body.think = opts.think;
|
||||||
const resp = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, {
|
const resp = await fetch(`${GATEWAY}/v1/chat`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: { "Content-Type": "application/json" },
|
||||||
"Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`,
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
|
||||||
body: JSON.stringify(body),
|
body: JSON.stringify(body),
|
||||||
});
|
});
|
||||||
if (!resp.ok) {
|
if (!resp.ok) {
|
||||||
throw new Error(`Ollama Cloud ${resp.status}: ${await resp.text().catch(() => "?")}`);
|
throw new Error(`gateway /v1/chat ${resp.status}: ${await resp.text().catch(() => "?")}`);
|
||||||
}
|
}
|
||||||
const data: any = await resp.json();
|
const data: any = await resp.json();
|
||||||
const text = typeof data.response === "string" ? data.response : "";
|
const text = data?.choices?.[0]?.message?.content ?? "";
|
||||||
// Same non-throw policy as local generate() — empty text is a valid
|
// Same non-throw policy as local generate() — empty text is a valid
|
||||||
// signal that thinking ate the budget. Let generateContinuable retry.
|
// signal that thinking ate the budget. Let generateContinuable retry.
|
||||||
return text;
|
return typeof text === "string" ? text : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Prompt construction ---
|
// --- Prompt construction ---
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user