From f6af0fd40997c9e587f60fa9f2a6801aa089cb6a Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Mon, 27 Apr 2026 07:33:06 -0500
Subject: [PATCH] phase 44 (part 1): migrate TS callers to /v1/chat + add
 regression guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Migrates the four TypeScript /generate callers to the gateway's
/v1/chat surface so every LLM call lands on /v1/usage and Langfuse:

  tests/multi-agent/agent.ts::generate()      provider="ollama"
  tests/agent_test/agent_harness.ts::callAgent provider="ollama"
  bot/propose.ts::generateProposal             provider="ollama_cloud"
  mcp-server/observer.ts (error analysis)      provider="ollama"

Each migration follows the same pattern as the prior generateCloud()
migration (already on /v1/chat since 2026-04-24): replace
`fetch(SIDECAR/generate)` with `fetch(GATEWAY/v1/chat)`, swap the
prompt-style body for an OpenAI-compatible messages array, and extract
the content from `choices[0].message.content` instead of `text`.

Same upstream models in every case — gateway is the new home for
the call, transport otherwise unchanged.

Adds scripts/check_phase44_callers.sh — a fail-loud regression guard
that exits non-zero if any non-adapter file fetches /generate or
api/generate. Adapter files (crates/gateway, crates/aibridge,
sidecar/) are exempt. The pre-tightening regex flagged prose mentions
in comments; the shipped regex requires a `fetch(...)` or
`client.post(...)` call shape so comments don't trip it. Known gap:
calls made through a wrapper, such as the `http<any>("POST", ...)`
helper that tests/multi-agent/agent.ts used, do not match that shape
and are not detected.

Verification:
  bun build mcp-server/observer.ts                       compiles
  bun build tests/multi-agent/agent.ts                   compiles
  bun build tests/agent_test/agent_harness.ts            compiles
  bun build bot/propose.ts                               compiles
  ./scripts/check_phase44_callers.sh                     ✅ clean
  systemctl restart lakehouse-observer                   active

Phase 44 part 2 (deferred):
  - crates/aibridge/src/client.rs:118 still posts to sidecar /generate
    directly. AiClient is the foundational Rust LLM caller used by
    8+ vectord modules; migrating it is a workspace-wide refactor
    that needs its own commit. Plan: keep AiClient as the local-
    transport layer for the gateway's `provider=ollama` arm, but
    introduce a thin `/v1/chat` wrapper for external callers (vectord
    autotune, agent, rag, refresh, supervisor, playbook_memory).
  - tests/real-world/hard_task_escalation.ts: comment mentions
    /api/generate but doesn't actually call it. Comment is left
    intentionally as historical context; regex no longer flags it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 bot/propose.ts                    | 20 +++++++----
 mcp-server/observer.ts            | 23 +++++++-----
 scripts/check_phase44_callers.sh  | 58 +++++++++++++++++++++++++++++++
 tests/agent_test/agent_harness.ts | 15 ++++----
 tests/multi-agent/agent.ts        | 18 +++++++---
 5 files changed, 108 insertions(+), 26 deletions(-)
 create mode 100755 scripts/check_phase44_callers.sh

diff --git a/bot/propose.ts b/bot/propose.ts
index a66dfe3..ab7b6ca 100644
--- a/bot/propose.ts
+++ b/bot/propose.ts
@@ -13,7 +13,12 @@ import { readFile } from "node:fs/promises";
 import { createHash } from "node:crypto";
 import type { Gap, Proposal } from "./types.ts";
 
-const SIDECAR_URL = process.env.LH_SIDECAR_URL ?? "http://localhost:3200";
+// Phase 44 migration (2026-04-27): bot/propose.ts now flows through
+// the gateway's /v1/chat instead of hitting the sidecar's /generate
+// directly. /v1/usage tracks the call, Langfuse traces it, observer
+// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on
+// Ollama Cloud) — gateway just owns the routing.
+const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const REPO_ROOT = "/home/profit/lakehouse";
 const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
 const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
@@ -72,13 +77,16 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
   sections.push("Propose a small change that addresses this gap. Respond with the JSON object only.");
   const userPrompt = sections.join("\n");
 
-  const r = await fetch(`${SIDECAR_URL}/generate`, {
+  const r = await fetch(`${GATEWAY_URL}/v1/chat`, {
     method: "POST",
     headers: { "content-type": "application/json" },
     body: JSON.stringify({
       model: CLOUD_MODEL,
-      system: SYSTEM_PROMPT,
-      prompt: userPrompt,
+      provider: "ollama_cloud",
+      messages: [
+        { role: "system", content: SYSTEM_PROMPT },
+        { role: "user", content: userPrompt },
+      ],
       temperature: 0.2,
       max_tokens: MAX_TOKENS,
       think: false,
@@ -86,10 +94,10 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
     signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min
   });
   if (!r.ok) {
-    throw new Error(`sidecar ${r.status}: ${await r.text()}`);
+    throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`);
   }
   const j = await r.json() as any;
-  const raw: string = j.text ?? j.response ?? "";
+  const raw: string = j?.choices?.[0]?.message?.content ?? "";
   const usage = j.usage ?? {};
   const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0);
 
diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts
index e92b244..edb6e45 100644
--- a/mcp-server/observer.ts
+++ b/mcp-server/observer.ts
@@ -550,13 +550,19 @@ async function analyzeErrors() {
     `[${f.endpoint}] ${f.input_summary}: ${f.error}`
   ).join("\n");
 
-  // Ask local model to diagnose
+  // Ask local model to diagnose. Phase 44 migration (2026-04-27):
+  // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
+  // call + Langfuse traces it. Same upstream model (qwen2.5 local).
   try {
-    const resp = await fetch(`${LAKEHOUSE}/ai/generate`, {
+    const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
       method: "POST",
       headers: { "Content-Type": "application/json" },
       body: JSON.stringify({
-        prompt: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
+        model: "qwen2.5",
+        provider: "ollama",
+        messages: [{
+          role: "user",
+          content: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
 
 ${errorSummary}
 
@@ -566,14 +572,15 @@ For each error:
 3. Should this be added to the playbook as a "don't do this"?
 
 Be specific and actionable. Under 200 words.`,
-        model: "qwen2.5",
+        }],
         max_tokens: 400,
         temperature: 0.2,
       }),
     });
-    const analysis = await resp.json();
-    if (analysis.text) {
-      console.error(`[observer] Error analysis:\n${analysis.text}`);
+    const analysis = await resp.json() as any;
+    const analysisText = analysis?.choices?.[0]?.message?.content ?? "";
+    if (analysisText) {
+      console.error(`[observer] Error analysis:\n${analysisText}`);
       // Log the analysis as a playbook entry
       await fetch(`${GATEWAY}/log`, {
         method: "POST",
@@ -581,7 +588,7 @@ Be specific and actionable. Under 200 words.`,
         body: JSON.stringify({
           operation: `error_analysis: ${failures.length} failures`,
           approach: "LLM-analyzed error patterns",
-          result: analysis.text.slice(0, 500),
+          result: analysisText.slice(0, 500),
           context: errorSummary.slice(0, 500),
         }),
       });
diff --git a/scripts/check_phase44_callers.sh b/scripts/check_phase44_callers.sh
new file mode 100755
index 0000000..8a42f69
--- /dev/null
+++ b/scripts/check_phase44_callers.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Phase 44 caller-migration guard. Fails if any non-adapter file
+# fetches the sidecar's /generate endpoint or hits Ollama Cloud's
+# /api/generate directly. Adapter paths (crates/gateway, crates/aibridge,
+# sidecar/) are exempt; each check below applies its own path filter.
+#
+# Run:    ./scripts/check_phase44_callers.sh
+# CI:     fail-loud (exits non-zero on regression)
+# Watch:  pre-commit hook — invoke from .git/hooks/pre-commit
+
+set -e
+cd "$(dirname "$0")/.."
+
+FORBIDDEN_TS=$(grep -rEln "fetch\([^)]*[/\$]generate" \
+    --include="*.ts" \
+    --exclude-dir=node_modules \
+    --exclude-dir=target \
+    --exclude-dir=.git \
+    . 2>/dev/null | grep -v "^\./sidecar/" || true)
+
+FORBIDDEN_RS=$(grep -rEln "post\([^)]*\"\.?/generate\"" \
+    --include="*.rs" \
+    --exclude-dir=target \
+    . 2>/dev/null | \
+    grep -vE "^\./crates/(gateway|aibridge)/" || true)
+
+# Ollama Cloud /api/generate outside the gateway adapter. Match only
+# when the URL appears inside a fetch(...)/client.post(...) call shape,
+# so prose mentions in comments don't trip it (tightened 2026-04-27).
+FORBIDDEN_CLOUD=$(grep -rEln "(fetch|client\.post)\([^)]*api/generate" \
+    --include="*.ts" --include="*.rs" \
+    --exclude-dir=node_modules \
+    --exclude-dir=target \
+    --exclude-dir=.git \
+    . 2>/dev/null | \
+    grep -vE "^\./(crates/gateway|sidecar)" || true)
+
+ANY_FAIL=0
+if [ -n "$FORBIDDEN_TS" ]; then
+    echo "❌ Direct sidecar /generate calls (migrate to /v1/chat):"
+    echo "$FORBIDDEN_TS" | sed 's/^/   /'
+    ANY_FAIL=1
+fi
+if [ -n "$FORBIDDEN_RS" ]; then
+    echo "❌ Direct Rust /generate post() calls (migrate via gateway adapter):"
+    echo "$FORBIDDEN_RS" | sed 's/^/   /'
+    ANY_FAIL=1
+fi
+if [ -n "$FORBIDDEN_CLOUD" ]; then
+    echo "❌ Direct Ollama Cloud /api/generate (migrate to gateway provider=ollama_cloud):"
+    echo "$FORBIDDEN_CLOUD" | sed 's/^/   /'
+    ANY_FAIL=1
+fi
+
+if [ $ANY_FAIL -eq 0 ]; then
+    echo "✅ Phase 44 caller-migration: clean (no direct /generate outside adapters)"
+fi
+exit $ANY_FAIL
diff --git a/tests/agent_test/agent_harness.ts b/tests/agent_test/agent_harness.ts
index d835759..9630532 100644
--- a/tests/agent_test/agent_harness.ts
+++ b/tests/agent_test/agent_harness.ts
@@ -161,16 +161,17 @@ const TOOL_SCHEMA = `Available tools (call by emitting JSON like: {"tool": "name
 // ─── AGENT LOOP ───
 
 async function callAgent(messages: Array<{role: string; content: string}>): Promise<string> {
-  // think:false disables hidden reasoning so all generated tokens go to
-  // visible response. qwen3.5:latest defaults to thinking and silently
-  // burns the token budget otherwise.
-  const r = await fetch(`${SIDECAR}/generate`, {
+  // Phase 44 migration (2026-04-27): /v1/chat instead of direct sidecar
+  // /generate so /v1/usage tracks the call, Langfuse traces it.
+  // think:false still disables hidden reasoning so generated tokens
+  // go to visible response — qwen3.5:latest defaults to thinking.
+  const r = await fetch(`${GATEWAY}/v1/chat`, {
     method: "POST",
     headers: { "content-type": "application/json" },
     body: JSON.stringify({
       model: AGENT_MODEL,
-      prompt: messages.map(m => `${m.role.toUpperCase()}:\n${m.content}`).join("\n\n") + "\n\nASSISTANT:\n",
-      stream: false,
+      provider: "ollama",
+      messages,
       max_tokens: 1500,
       think: false,
     }),
@@ -178,7 +179,7 @@ async function callAgent(messages: Array<{role: string; content: string}>): Prom
   });
   if (!r.ok) throw new Error(`agent ${r.status}: ${(await r.text()).slice(0, 200)}`);
   const j: any = await r.json();
-  return String(j.text ?? j.response ?? "").trim();
+  return String(j?.choices?.[0]?.message?.content ?? "").trim();
 }
 
 function extractToolCall(response: string): { tool: string; args: any } | null {
diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts
index 487f1d4..dc26524 100644
--- a/tests/multi-agent/agent.ts
+++ b/tests/multi-agent/agent.ts
@@ -372,26 +372,34 @@ export async function generate(model: string, prompt: string, opts: {
   bypass_budget?: boolean;
   think?: boolean;
 } = {}): Promise<string> {
+  // Phase 44 migration (2026-04-27): was hitting `${SIDECAR}/generate`
+  // directly, bypassing the gateway's /v1/usage accounting + Langfuse
+  // tracing. Now flows through /v1/chat with provider="ollama" so
+  // every local call is observable + auditable. Sidecar transport is
+  // unchanged — gateway just owns the call.
   assertContextBudget(model, prompt, {
     system: opts.system,
     max_tokens: opts.max_tokens,
     bypass: opts.bypass_budget,
   });
+  const messages: Array<{ role: string; content: string }> = [];
+  if (opts.system) messages.push({ role: "system", content: opts.system });
+  messages.push({ role: "user", content: prompt });
   const body: Record<string, any> = {
     model,
-    prompt,
+    messages,
+    provider: "ollama",
     temperature: opts.temperature ?? 0.3,
     max_tokens: opts.max_tokens ?? 800,
   };
-  if (opts.system) body.system = opts.system;
   if (opts.think !== undefined) body.think = opts.think;
-  const r = await http<any>("POST", `${SIDECAR}/generate`, body);
-  const text = typeof r.text === "string" ? r.text : "";
+  const r = await http<any>("POST", `${GATEWAY}/v1/chat`, body);
+  const text = r?.choices?.[0]?.message?.content ?? "";
   // Do NOT throw on empty. Thinking models (gpt-oss, qwen3.5) burn the
   // max_tokens budget on hidden reasoning and emit "" when budget was
   // too tight. generateContinuable detects empty + continues with more
   // budget. Callers that expected non-empty can check themselves.
-  return text;
+  return typeof text === "string" ? text : "";
 }
 
 // Cloud generate — routes through the lakehouse gateway's /v1/chat