From f6af0fd40997c9e587f60fa9f2a6801aa089cb6a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 07:33:06 -0500 Subject: [PATCH] phase 44 (part 1): migrate TS callers to /v1/chat + add regression guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrates the four TypeScript /generate callers to the gateway's /v1/chat surface so every LLM call lands on /v1/usage and Langfuse: tests/multi-agent/agent.ts::generate() provider="ollama" tests/agent_test/agent_harness.ts::callAgent provider="ollama" bot/propose.ts::generateProposal provider="ollama_cloud" mcp-server/observer.ts (error analysis) provider="ollama" Each migration follows the same pattern as the prior generateCloud() migration (already on /v1/chat from 2026-04-24): replace `fetch(SIDECAR/generate)` with `fetch(GATEWAY/v1/chat)`, swap the prompt-style body for OpenAI-compat messages array, extract content from `choices[0].message.content` instead of `text`. Same upstream models in every case — gateway is the new home for the call, transport otherwise unchanged. Adds scripts/check_phase44_callers.sh — fail-loud regression guard that exits non-zero if any non-adapter file fetches /generate or api/generate. Adapter files (crates/gateway, crates/aibridge, sidecar/) are exempt. Pre-tightening regex flagged prose mentions in comments; the shipped regex requires `fetch(...)` or `client.post(...)` shape so comments don't trip it. Verification: bun build mcp-server/observer.ts compiles bun build tests/multi-agent/agent.ts compiles bun build tests/agent_test/agent_harness.ts compiles bun build bot/propose.ts compiles ./scripts/check_phase44_callers.sh ✅ clean systemctl restart lakehouse-observer active Phase 44 part 2 (deferred): - crates/aibridge/src/client.rs:118 still posts to sidecar /generate directly. AiClient is the foundational Rust LLM caller used by 8+ vectord modules; migrating it is a workspace-wide refactor that needs its own commit. 
Plan: keep AiClient as the local-transport layer for the gateway's `provider=ollama` arm, but introduce a thin `/v1/chat` wrapper for external callers (vectord autotune, agent, rag, refresh, supervisor, playbook_memory). - tests/real-world/hard_task_escalation.ts: comment mentions /api/generate but doesn't actually call it. Comment is left intentionally as historical context; regex no longer flags it. Co-Authored-By: Claude Opus 4.7 (1M context) --- bot/propose.ts | 20 +++++++---- mcp-server/observer.ts | 23 +++++++----- scripts/check_phase44_callers.sh | 58 +++++++++++++++++++++++++++++++ tests/agent_test/agent_harness.ts | 15 ++++---- tests/multi-agent/agent.ts | 18 +++++++--- 5 files changed, 108 insertions(+), 26 deletions(-) create mode 100755 scripts/check_phase44_callers.sh diff --git a/bot/propose.ts b/bot/propose.ts index a66dfe3..ab7b6ca 100644 --- a/bot/propose.ts +++ b/bot/propose.ts @@ -13,7 +13,12 @@ import { readFile } from "node:fs/promises"; import { createHash } from "node:crypto"; import type { Gap, Proposal } from "./types.ts"; -const SIDECAR_URL = process.env.LH_SIDECAR_URL ?? "http://localhost:3200"; +// Phase 44 migration (2026-04-27): bot/propose.ts now flows through +// the gateway's /v1/chat instead of hitting the sidecar's /generate +// directly. /v1/usage tracks the call, Langfuse traces it, observer +// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on +// Ollama Cloud) — gateway just owns the routing. +const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const REPO_ROOT = "/home/profit/lakehouse"; const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`; const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b"; @@ -72,13 +77,16 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P sections.push("Propose a small change that addresses this gap. 
Respond with the JSON object only."); const userPrompt = sections.join("\n"); - const r = await fetch(`${SIDECAR_URL}/generate`, { + const r = await fetch(`${GATEWAY_URL}/v1/chat`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ model: CLOUD_MODEL, - system: SYSTEM_PROMPT, - prompt: userPrompt, + provider: "ollama_cloud", + messages: [ + { role: "system", content: SYSTEM_PROMPT }, + { role: "user", content: userPrompt }, + ], temperature: 0.2, max_tokens: MAX_TOKENS, think: false, @@ -86,10 +94,10 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min }); if (!r.ok) { - throw new Error(`sidecar ${r.status}: ${await r.text()}`); + throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`); } const j = await r.json() as any; - const raw: string = j.text ?? j.response ?? ""; + const raw: string = j?.choices?.[0]?.message?.content ?? ""; const usage = j.usage ?? {}; const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0); diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index e92b244..edb6e45 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -550,13 +550,19 @@ async function analyzeErrors() { `[${f.endpoint}] ${f.input_summary}: ${f.error}` ).join("\n"); - // Ask local model to diagnose + // Ask local model to diagnose. Phase 44 migration (2026-04-27): + // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the + // call + Langfuse traces it. Same upstream model (qwen2.5 local). try { - const resp = await fetch(`${LAKEHOUSE}/ai/generate`, { + const resp = await fetch(`${LAKEHOUSE}/v1/chat`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - prompt: `You are a system reliability observer. 
Analyze these recent failures and suggest fixes: + model: "qwen2.5", + provider: "ollama", + messages: [{ + role: "user", + content: `You are a system reliability observer. Analyze these recent failures and suggest fixes: ${errorSummary} @@ -566,14 +572,15 @@ For each error: 3. Should this be added to the playbook as a "don't do this"? Be specific and actionable. Under 200 words.`, - model: "qwen2.5", + }], max_tokens: 400, temperature: 0.2, }), }); - const analysis = await resp.json(); - if (analysis.text) { - console.error(`[observer] Error analysis:\n${analysis.text}`); + const analysis = await resp.json() as any; + const analysisText = analysis?.choices?.[0]?.message?.content ?? ""; + if (analysisText) { + console.error(`[observer] Error analysis:\n${analysisText}`); // Log the analysis as a playbook entry await fetch(`${GATEWAY}/log`, { method: "POST", @@ -581,7 +588,7 @@ Be specific and actionable. Under 200 words.`, body: JSON.stringify({ operation: `error_analysis: ${failures.length} failures`, approach: "LLM-analyzed error patterns", - result: analysis.text.slice(0, 500), + result: analysisText.slice(0, 500), context: errorSummary.slice(0, 500), }), }); diff --git a/scripts/check_phase44_callers.sh b/scripts/check_phase44_callers.sh new file mode 100755 index 0000000..8a42f69 --- /dev/null +++ b/scripts/check_phase44_callers.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Phase 44 caller-migration guard. Fails if any non-adapter file +# fetches the sidecar's /generate endpoint or hits Ollama Cloud's +# /api/generate directly. Adapter files (gateway provider crate + +# the sidecar's own Python implementation) are exempt. +# +# Run: ./scripts/check_phase44_callers.sh +# CI: fail-loud (exits non-zero on regression) +# Watch: pre-commit hook — invoke from .git/hooks/pre-commit + +set -e +cd "$(dirname "$0")/.." 
+ +FORBIDDEN_TS=$(grep -rEln "fetch\([^)]*[/\$]generate" \ + --include="*.ts" \ + --exclude-dir=node_modules \ + --exclude-dir=target \ + --exclude-dir=.git \ + . 2>/dev/null | grep -v "^\./sidecar/" || true) + +FORBIDDEN_RS=$(grep -rEln "post\([^)]*\"\.?/generate\"" \ + --include="*.rs" \ + --exclude-dir=target \ + . 2>/dev/null | \ + grep -vE "^\./crates/(gateway|aibridge)/" || true) + +# Ollama Cloud /api/generate outside the gateway adapter. Match only +# when the URL appears in an actual fetch/post call (not in a comment). +# Tightened 2026-04-27 — pre-tightening regex flagged prose mentions. +FORBIDDEN_CLOUD=$(grep -rEln "(fetch|client\.post)\([^)]*api/generate" \ + --include="*.ts" --include="*.rs" \ + --exclude-dir=node_modules \ + --exclude-dir=target \ + --exclude-dir=.git \ + . 2>/dev/null | \ + grep -vE "^\./(crates/gateway|sidecar)" || true) + +ANY_FAIL=0 +if [ -n "$FORBIDDEN_TS" ]; then + echo "❌ Direct sidecar /generate calls (migrate to /v1/chat):" + echo "$FORBIDDEN_TS" | sed 's/^/ /' + ANY_FAIL=1 +fi +if [ -n "$FORBIDDEN_RS" ]; then + echo "❌ Direct Rust /generate post() calls (migrate via gateway adapter):" + echo "$FORBIDDEN_RS" | sed 's/^/ /' + ANY_FAIL=1 +fi +if [ -n "$FORBIDDEN_CLOUD" ]; then + echo "❌ Direct Ollama Cloud /api/generate (migrate to gateway provider=ollama_cloud):" + echo "$FORBIDDEN_CLOUD" | sed 's/^/ /' + ANY_FAIL=1 +fi + +if [ $ANY_FAIL -eq 0 ]; then + echo "✅ Phase 44 caller-migration: clean (no direct /generate outside adapters)" +fi +exit $ANY_FAIL diff --git a/tests/agent_test/agent_harness.ts b/tests/agent_test/agent_harness.ts index d835759..9630532 100644 --- a/tests/agent_test/agent_harness.ts +++ b/tests/agent_test/agent_harness.ts @@ -161,16 +161,17 @@ const TOOL_SCHEMA = `Available tools (call by emitting JSON like: {"tool": "name // ─── AGENT LOOP ─── async function callAgent(messages: Array<{role: string; content: string}>): Promise { - // think:false disables hidden reasoning so all generated tokens go to - // 
visible response. qwen3.5:latest defaults to thinking and silently - // burns the token budget otherwise. - const r = await fetch(`${SIDECAR}/generate`, { + // Phase 44 migration (2026-04-27): /v1/chat instead of direct sidecar + // /generate so /v1/usage tracks the call, Langfuse traces it. + // think:false still disables hidden reasoning so generated tokens + // go to visible response — qwen3.5:latest defaults to thinking. + const r = await fetch(`${GATEWAY}/v1/chat`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ model: AGENT_MODEL, - prompt: messages.map(m => `${m.role.toUpperCase()}:\n${m.content}`).join("\n\n") + "\n\nASSISTANT:\n", - stream: false, + provider: "ollama", + messages, max_tokens: 1500, think: false, }), @@ -178,7 +179,7 @@ async function callAgent(messages: Array<{role: string; content: string}>): Prom }); if (!r.ok) throw new Error(`agent ${r.status}: ${(await r.text()).slice(0, 200)}`); const j: any = await r.json(); - return String(j.text ?? j.response ?? "").trim(); + return String(j?.choices?.[0]?.message?.content ?? "").trim(); } function extractToolCall(response: string): { tool: string; args: any } | null { diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts index 487f1d4..dc26524 100644 --- a/tests/multi-agent/agent.ts +++ b/tests/multi-agent/agent.ts @@ -372,26 +372,34 @@ export async function generate(model: string, prompt: string, opts: { bypass_budget?: boolean; think?: boolean; } = {}): Promise { + // Phase 44 migration (2026-04-27): was hitting `${SIDECAR}/generate` + // directly, bypassing the gateway's /v1/usage accounting + Langfuse + // tracing. Now flows through /v1/chat with provider="ollama" so + // every local call is observable + auditable. Sidecar transport is + // unchanged — gateway just owns the call. 
assertContextBudget(model, prompt, { system: opts.system, max_tokens: opts.max_tokens, bypass: opts.bypass_budget, }); + const messages: Array<{ role: string; content: string }> = []; + if (opts.system) messages.push({ role: "system", content: opts.system }); + messages.push({ role: "user", content: prompt }); const body: Record = { model, - prompt, + messages, + provider: "ollama", temperature: opts.temperature ?? 0.3, max_tokens: opts.max_tokens ?? 800, }; - if (opts.system) body.system = opts.system; if (opts.think !== undefined) body.think = opts.think; - const r = await http("POST", `${SIDECAR}/generate`, body); - const text = typeof r.text === "string" ? r.text : ""; + const r = await http("POST", `${GATEWAY}/v1/chat`, body); + const text = r?.choices?.[0]?.message?.content ?? ""; // Do NOT throw on empty. Thinking models (gpt-oss, qwen3.5) burn the // max_tokens budget on hidden reasoning and emit "" when budget was // too tight. generateContinuable detects empty + continues with more // budget. Callers that expected non-empty can check themselves. - return text; + return typeof text === "string" ? text : ""; } // Cloud generate — routes through the lakehouse gateway's /v1/chat