Phase 44 (part 1): migrate TS callers to /v1/chat + add regression guard
Some checks failed
lakehouse/auditor 16 blocking issues: cloud: claim not backed — "Verified end-to-end:"
Some checks failed
lakehouse/auditor 16 blocking issues: cloud: claim not backed — "Verified end-to-end:"
Migrates the four TypeScript /generate callers to the gateway's
/v1/chat surface so every LLM call lands on /v1/usage and Langfuse:
tests/multi-agent/agent.ts::generate() provider="ollama"
tests/agent_test/agent_harness.ts::callAgent provider="ollama"
bot/propose.ts::generateProposal provider="ollama_cloud"
mcp-server/observer.ts (error analysis) provider="ollama"
Each migration follows the same pattern as the prior generateCloud()
migration (already on /v1/chat from 2026-04-24): replace
`fetch(SIDECAR/generate)` with `fetch(GATEWAY/v1/chat)`, swap the
prompt-style body for OpenAI-compat messages array, extract
content from `choices[0].message.content` instead of `text`.
Same upstream models in every case — gateway is the new home for
the call, transport otherwise unchanged.
Adds scripts/check_phase44_callers.sh — fail-loud regression guard
that exits non-zero if any non-adapter file fetches /generate or
api/generate. Adapter files (crates/gateway, crates/aibridge,
sidecar/) are exempt. Pre-tightening regex flagged prose mentions
in comments; the shipped regex requires `fetch(...)` or
`client.post(...)` shape so comments don't trip it.
Verification:
bun build mcp-server/observer.ts compiles
bun build tests/multi-agent/agent.ts compiles
bun build tests/agent_test/agent_harness.ts compiles
bun build bot/propose.ts compiles
./scripts/check_phase44_callers.sh ✅ clean
systemctl restart lakehouse-observer — unit restarted and reports active
Phase 44 part 2 (deferred):
- crates/aibridge/src/client.rs:118 still posts to sidecar /generate
directly. AiClient is the foundational Rust LLM caller used by
8+ vectord modules; migrating it is a workspace-wide refactor
that needs its own commit. Plan: keep AiClient as the
local-transport layer for the gateway's `provider=ollama` arm, but
introduce a thin `/v1/chat` wrapper for external callers (vectord
autotune, agent, rag, refresh, supervisor, playbook_memory).
- tests/real-world/hard_task_escalation.ts: comment mentions
/api/generate but doesn't actually call it. Comment is left
intentionally as historical context; regex no longer flags it.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bfe1ea9d1c
commit
f6af0fd409
@ -13,7 +13,12 @@ import { readFile } from "node:fs/promises";
|
|||||||
import { createHash } from "node:crypto";
|
import { createHash } from "node:crypto";
|
||||||
import type { Gap, Proposal } from "./types.ts";
|
import type { Gap, Proposal } from "./types.ts";
|
||||||
|
|
||||||
const SIDECAR_URL = process.env.LH_SIDECAR_URL ?? "http://localhost:3200";
|
// Phase 44 migration (2026-04-27): bot/propose.ts now flows through
|
||||||
|
// the gateway's /v1/chat instead of hitting the sidecar's /generate
|
||||||
|
// directly. /v1/usage tracks the call, Langfuse traces it, observer
|
||||||
|
// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on
|
||||||
|
// Ollama Cloud) — gateway just owns the routing.
|
||||||
|
const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||||
const REPO_ROOT = "/home/profit/lakehouse";
|
const REPO_ROOT = "/home/profit/lakehouse";
|
||||||
const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
|
const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
|
||||||
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
|
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
|
||||||
@ -72,13 +77,16 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
|
|||||||
sections.push("Propose a small change that addresses this gap. Respond with the JSON object only.");
|
sections.push("Propose a small change that addresses this gap. Respond with the JSON object only.");
|
||||||
const userPrompt = sections.join("\n");
|
const userPrompt = sections.join("\n");
|
||||||
|
|
||||||
const r = await fetch(`${SIDECAR_URL}/generate`, {
|
const r = await fetch(`${GATEWAY_URL}/v1/chat`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "content-type": "application/json" },
|
headers: { "content-type": "application/json" },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: CLOUD_MODEL,
|
model: CLOUD_MODEL,
|
||||||
system: SYSTEM_PROMPT,
|
provider: "ollama_cloud",
|
||||||
prompt: userPrompt,
|
messages: [
|
||||||
|
{ role: "system", content: SYSTEM_PROMPT },
|
||||||
|
{ role: "user", content: userPrompt },
|
||||||
|
],
|
||||||
temperature: 0.2,
|
temperature: 0.2,
|
||||||
max_tokens: MAX_TOKENS,
|
max_tokens: MAX_TOKENS,
|
||||||
think: false,
|
think: false,
|
||||||
@ -86,10 +94,10 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
|
|||||||
signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min
|
signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min
|
||||||
});
|
});
|
||||||
if (!r.ok) {
|
if (!r.ok) {
|
||||||
throw new Error(`sidecar ${r.status}: ${await r.text()}`);
|
throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`);
|
||||||
}
|
}
|
||||||
const j = await r.json() as any;
|
const j = await r.json() as any;
|
||||||
const raw: string = j.text ?? j.response ?? "";
|
const raw: string = j?.choices?.[0]?.message?.content ?? "";
|
||||||
const usage = j.usage ?? {};
|
const usage = j.usage ?? {};
|
||||||
const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0);
|
const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0);
|
||||||
|
|
||||||
|
|||||||
@ -550,13 +550,19 @@ async function analyzeErrors() {
|
|||||||
`[${f.endpoint}] ${f.input_summary}: ${f.error}`
|
`[${f.endpoint}] ${f.input_summary}: ${f.error}`
|
||||||
).join("\n");
|
).join("\n");
|
||||||
|
|
||||||
// Ask local model to diagnose
|
// Ask local model to diagnose. Phase 44 migration (2026-04-27):
|
||||||
|
// /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
|
||||||
|
// call + Langfuse traces it. Same upstream model (qwen2.5 local).
|
||||||
try {
|
try {
|
||||||
const resp = await fetch(`${LAKEHOUSE}/ai/generate`, {
|
const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
prompt: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
|
model: "qwen2.5",
|
||||||
|
provider: "ollama",
|
||||||
|
messages: [{
|
||||||
|
role: "user",
|
||||||
|
content: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
|
||||||
|
|
||||||
${errorSummary}
|
${errorSummary}
|
||||||
|
|
||||||
@ -566,14 +572,15 @@ For each error:
|
|||||||
3. Should this be added to the playbook as a "don't do this"?
|
3. Should this be added to the playbook as a "don't do this"?
|
||||||
|
|
||||||
Be specific and actionable. Under 200 words.`,
|
Be specific and actionable. Under 200 words.`,
|
||||||
model: "qwen2.5",
|
}],
|
||||||
max_tokens: 400,
|
max_tokens: 400,
|
||||||
temperature: 0.2,
|
temperature: 0.2,
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
const analysis = await resp.json();
|
const analysis = await resp.json() as any;
|
||||||
if (analysis.text) {
|
const analysisText = analysis?.choices?.[0]?.message?.content ?? "";
|
||||||
console.error(`[observer] Error analysis:\n${analysis.text}`);
|
if (analysisText) {
|
||||||
|
console.error(`[observer] Error analysis:\n${analysisText}`);
|
||||||
// Log the analysis as a playbook entry
|
// Log the analysis as a playbook entry
|
||||||
await fetch(`${GATEWAY}/log`, {
|
await fetch(`${GATEWAY}/log`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
@ -581,7 +588,7 @@ Be specific and actionable. Under 200 words.`,
|
|||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
operation: `error_analysis: ${failures.length} failures`,
|
operation: `error_analysis: ${failures.length} failures`,
|
||||||
approach: "LLM-analyzed error patterns",
|
approach: "LLM-analyzed error patterns",
|
||||||
result: analysis.text.slice(0, 500),
|
result: analysisText.slice(0, 500),
|
||||||
context: errorSummary.slice(0, 500),
|
context: errorSummary.slice(0, 500),
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|||||||
58
scripts/check_phase44_callers.sh
Executable file
58
scripts/check_phase44_callers.sh
Executable file
@ -0,0 +1,58 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Phase 44 caller-migration guard. Fails if any non-adapter file
|
||||||
|
# fetches the sidecar's /generate endpoint or hits Ollama Cloud's
|
||||||
|
# /api/generate directly. Adapter files (gateway provider crate +
|
||||||
|
# the sidecar's own Python implementation) are exempt.
|
||||||
|
#
|
||||||
|
# Run: ./scripts/check_phase44_callers.sh
|
||||||
|
# CI: fail-loud (exits non-zero on regression)
|
||||||
|
# Watch: pre-commit hook — invoke from .git/hooks/pre-commit
|
||||||
|
|
||||||
|
set -e
|
||||||
|
cd "$(dirname "$0")/.."
|
||||||
|
|
||||||
|
FORBIDDEN_TS=$(grep -rEln "fetch\([^)]*[/\$]generate" \
|
||||||
|
--include="*.ts" \
|
||||||
|
--exclude-dir=node_modules \
|
||||||
|
--exclude-dir=target \
|
||||||
|
--exclude-dir=.git \
|
||||||
|
. 2>/dev/null | grep -v "^\./sidecar/" || true)
|
||||||
|
|
||||||
|
FORBIDDEN_RS=$(grep -rEln "post\([^)]*\"\.?/generate\"" \
|
||||||
|
--include="*.rs" \
|
||||||
|
--exclude-dir=target \
|
||||||
|
. 2>/dev/null | \
|
||||||
|
grep -vE "^\./crates/(gateway|aibridge)/" || true)
|
||||||
|
|
||||||
|
# Ollama Cloud /api/generate outside the gateway adapter. Match only
|
||||||
|
# when the URL appears in an actual fetch/post call (not in a comment).
|
||||||
|
# Tightened 2026-04-27 — pre-tightening regex flagged prose mentions.
|
||||||
|
FORBIDDEN_CLOUD=$(grep -rEln "(fetch|client\.post)\([^)]*api/generate" \
|
||||||
|
--include="*.ts" --include="*.rs" \
|
||||||
|
--exclude-dir=node_modules \
|
||||||
|
--exclude-dir=target \
|
||||||
|
--exclude-dir=.git \
|
||||||
|
. 2>/dev/null | \
|
||||||
|
grep -vE "^\./(crates/gateway|sidecar)" || true)
|
||||||
|
|
||||||
|
ANY_FAIL=0
|
||||||
|
if [ -n "$FORBIDDEN_TS" ]; then
|
||||||
|
echo "❌ Direct sidecar /generate calls (migrate to /v1/chat):"
|
||||||
|
echo "$FORBIDDEN_TS" | sed 's/^/ /'
|
||||||
|
ANY_FAIL=1
|
||||||
|
fi
|
||||||
|
if [ -n "$FORBIDDEN_RS" ]; then
|
||||||
|
echo "❌ Direct Rust /generate post() calls (migrate via gateway adapter):"
|
||||||
|
echo "$FORBIDDEN_RS" | sed 's/^/ /'
|
||||||
|
ANY_FAIL=1
|
||||||
|
fi
|
||||||
|
if [ -n "$FORBIDDEN_CLOUD" ]; then
|
||||||
|
echo "❌ Direct Ollama Cloud /api/generate (migrate to gateway provider=ollama_cloud):"
|
||||||
|
echo "$FORBIDDEN_CLOUD" | sed 's/^/ /'
|
||||||
|
ANY_FAIL=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $ANY_FAIL -eq 0 ]; then
|
||||||
|
echo "✅ Phase 44 caller-migration: clean (no direct /generate outside adapters)"
|
||||||
|
fi
|
||||||
|
exit $ANY_FAIL
|
||||||
@ -161,16 +161,17 @@ const TOOL_SCHEMA = `Available tools (call by emitting JSON like: {"tool": "name
|
|||||||
// ─── AGENT LOOP ───
|
// ─── AGENT LOOP ───
|
||||||
|
|
||||||
async function callAgent(messages: Array<{role: string; content: string}>): Promise<string> {
|
async function callAgent(messages: Array<{role: string; content: string}>): Promise<string> {
|
||||||
// think:false disables hidden reasoning so all generated tokens go to
|
// Phase 44 migration (2026-04-27): /v1/chat instead of direct sidecar
|
||||||
// visible response. qwen3.5:latest defaults to thinking and silently
|
// /generate so /v1/usage tracks the call, Langfuse traces it.
|
||||||
// burns the token budget otherwise.
|
// think:false still disables hidden reasoning so generated tokens
|
||||||
const r = await fetch(`${SIDECAR}/generate`, {
|
// go to visible response — qwen3.5:latest defaults to thinking.
|
||||||
|
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "content-type": "application/json" },
|
headers: { "content-type": "application/json" },
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: AGENT_MODEL,
|
model: AGENT_MODEL,
|
||||||
prompt: messages.map(m => `${m.role.toUpperCase()}:\n${m.content}`).join("\n\n") + "\n\nASSISTANT:\n",
|
provider: "ollama",
|
||||||
stream: false,
|
messages,
|
||||||
max_tokens: 1500,
|
max_tokens: 1500,
|
||||||
think: false,
|
think: false,
|
||||||
}),
|
}),
|
||||||
@ -178,7 +179,7 @@ async function callAgent(messages: Array<{role: string; content: string}>): Prom
|
|||||||
});
|
});
|
||||||
if (!r.ok) throw new Error(`agent ${r.status}: ${(await r.text()).slice(0, 200)}`);
|
if (!r.ok) throw new Error(`agent ${r.status}: ${(await r.text()).slice(0, 200)}`);
|
||||||
const j: any = await r.json();
|
const j: any = await r.json();
|
||||||
return String(j.text ?? j.response ?? "").trim();
|
return String(j?.choices?.[0]?.message?.content ?? "").trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractToolCall(response: string): { tool: string; args: any } | null {
|
function extractToolCall(response: string): { tool: string; args: any } | null {
|
||||||
|
|||||||
@ -372,26 +372,34 @@ export async function generate(model: string, prompt: string, opts: {
|
|||||||
bypass_budget?: boolean;
|
bypass_budget?: boolean;
|
||||||
think?: boolean;
|
think?: boolean;
|
||||||
} = {}): Promise<string> {
|
} = {}): Promise<string> {
|
||||||
|
// Phase 44 migration (2026-04-27): was hitting `${SIDECAR}/generate`
|
||||||
|
// directly, bypassing the gateway's /v1/usage accounting + Langfuse
|
||||||
|
// tracing. Now flows through /v1/chat with provider="ollama" so
|
||||||
|
// every local call is observable + auditable. Sidecar transport is
|
||||||
|
// unchanged — gateway just owns the call.
|
||||||
assertContextBudget(model, prompt, {
|
assertContextBudget(model, prompt, {
|
||||||
system: opts.system,
|
system: opts.system,
|
||||||
max_tokens: opts.max_tokens,
|
max_tokens: opts.max_tokens,
|
||||||
bypass: opts.bypass_budget,
|
bypass: opts.bypass_budget,
|
||||||
});
|
});
|
||||||
|
const messages: Array<{ role: string; content: string }> = [];
|
||||||
|
if (opts.system) messages.push({ role: "system", content: opts.system });
|
||||||
|
messages.push({ role: "user", content: prompt });
|
||||||
const body: Record<string, any> = {
|
const body: Record<string, any> = {
|
||||||
model,
|
model,
|
||||||
prompt,
|
messages,
|
||||||
|
provider: "ollama",
|
||||||
temperature: opts.temperature ?? 0.3,
|
temperature: opts.temperature ?? 0.3,
|
||||||
max_tokens: opts.max_tokens ?? 800,
|
max_tokens: opts.max_tokens ?? 800,
|
||||||
};
|
};
|
||||||
if (opts.system) body.system = opts.system;
|
|
||||||
if (opts.think !== undefined) body.think = opts.think;
|
if (opts.think !== undefined) body.think = opts.think;
|
||||||
const r = await http<any>("POST", `${SIDECAR}/generate`, body);
|
const r = await http<any>("POST", `${GATEWAY}/v1/chat`, body);
|
||||||
const text = typeof r.text === "string" ? r.text : "";
|
const text = r?.choices?.[0]?.message?.content ?? "";
|
||||||
// Do NOT throw on empty. Thinking models (gpt-oss, qwen3.5) burn the
|
// Do NOT throw on empty. Thinking models (gpt-oss, qwen3.5) burn the
|
||||||
// max_tokens budget on hidden reasoning and emit "" when budget was
|
// max_tokens budget on hidden reasoning and emit "" when budget was
|
||||||
// too tight. generateContinuable detects empty + continues with more
|
// too tight. generateContinuable detects empty + continues with more
|
||||||
// budget. Callers that expected non-empty can check themselves.
|
// budget. Callers that expected non-empty can check themselves.
|
||||||
return text;
|
return typeof text === "string" ? text : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cloud generate — routes through the lakehouse gateway's /v1/chat
|
// Cloud generate — routes through the lakehouse gateway's /v1/chat
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user