phase 44 (part 1): migrate TS callers to /v1/chat + add regression guard

Migrates the four TypeScript /generate callers to the gateway's
/v1/chat surface so every LLM call lands on /v1/usage and Langfuse:

  tests/multi-agent/agent.ts::generate()        provider="ollama"
  tests/agent_test/agent_harness.ts::callAgent  provider="ollama"
  bot/propose.ts::generateProposal              provider="ollama_cloud"
  mcp-server/observer.ts (error analysis)       provider="ollama"

Each migration follows the same pattern as the prior generateCloud()
migration (already on /v1/chat since 2026-04-24): replace
`fetch(SIDECAR/generate)` with `fetch(GATEWAY/v1/chat)`, swap the
prompt-style body for an OpenAI-compatible `messages` array, and
extract content from `choices[0].message.content` instead of `text`.
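
In one place, the before/after shape (a hedged sketch, not code from
this commit; GATEWAY, chatOnce, and the literal max_tokens are
illustrative, and the body fields mirror the diffs below):

  const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";

  async function chatOnce(model: string, system: string, prompt: string): Promise<string> {
    // Before: fetch(`${SIDECAR}/generate`) with { model, system, prompt },
    // reading `text` off the response.
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        model,
        provider: "ollama", // "ollama_cloud" for the bot/propose.ts caller
        messages: [
          { role: "system", content: system },
          { role: "user", content: prompt },
        ],
        max_tokens: 800,
      }),
    });
    if (!r.ok) throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`);
    const j = (await r.json()) as any;
    // OpenAI-compat response: content lives at choices[0].message.content.
    return j?.choices?.[0]?.message?.content ?? "";
  }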

Same upstream models in every case — gateway is the new home for
the call, transport otherwise unchanged.

Adds scripts/check_phase44_callers.sh, a fail-loud regression guard
that exits non-zero if any non-adapter file fetches /generate or
api/generate. Adapter files (crates/gateway, crates/aibridge,
sidecar/) are exempt. An earlier draft of the regex flagged prose
mentions in comments; the shipped regex requires a `fetch(...)` or
`client.post(...)` call shape, so comments don't trip it.
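
What the tightened pattern does and does not catch (illustrative
only: the guard itself runs grep -E; this just exercises the same
ERE from TypeScript, and the sample lines plus the OLLAMA name are
made up):

  const cloudPattern = /(fetch|client\.post)\([^)]*api\/generate/;

  // Flagged: the URL sits inside an actual call shape.
  cloudPattern.test('await fetch(`${OLLAMA}/api/generate`, opts)');  // true
  cloudPattern.test('client.post(format!("{base}/api/generate"))');  // true

  // Not flagged: prose mention in a comment, no call shape around it.
  cloudPattern.test('// used to hit /api/generate before phase 44'); // false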

Verification:
  bun build mcp-server/observer.ts                       compiles
  bun build tests/multi-agent/agent.ts                   compiles
  bun build tests/agent_test/agent_harness.ts            compiles
  bun build bot/propose.ts                               compiles
  ./scripts/check_phase44_callers.sh                      clean
  systemctl restart lakehouse-observer                   active

Phase 44 part 2 (deferred):
  - crates/aibridge/src/client.rs:118 still posts to the sidecar's
    /generate directly. AiClient is the foundational Rust LLM caller
    used by 8+ vectord modules; migrating it is a workspace-wide
    refactor that needs its own commit. Plan: keep AiClient as the
    local-transport layer for the gateway's `provider=ollama` arm, but
    introduce a thin `/v1/chat` wrapper for external callers (vectord
    autotune, agent, rag, refresh, supervisor, playbook_memory).
  - tests/real-world/hard_task_escalation.ts: a comment mentions
    /api/generate but the file never calls it. The comment is left
    intentionally as historical context; the tightened regex no
    longer flags it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

bot/propose.ts

@@ -13,7 +13,12 @@ import { readFile } from "node:fs/promises";
 import { createHash } from "node:crypto";
 import type { Gap, Proposal } from "./types.ts";
 const SIDECAR_URL = process.env.LH_SIDECAR_URL ?? "http://localhost:3200";
+// Phase 44 migration (2026-04-27): bot/propose.ts now flows through
+// the gateway's /v1/chat instead of hitting the sidecar's /generate
+// directly. /v1/usage tracks the call, Langfuse traces it, observer
+// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on
+// Ollama Cloud) — gateway just owns the routing.
 const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const REPO_ROOT = "/home/profit/lakehouse";
 const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
 const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
@@ -72,13 +77,16 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
   sections.push("Propose a small change that addresses this gap. Respond with the JSON object only.");
   const userPrompt = sections.join("\n");
-  const r = await fetch(`${SIDECAR_URL}/generate`, {
+  const r = await fetch(`${GATEWAY_URL}/v1/chat`, {
     method: "POST",
     headers: { "content-type": "application/json" },
     body: JSON.stringify({
       model: CLOUD_MODEL,
-      system: SYSTEM_PROMPT,
-      prompt: userPrompt,
+      provider: "ollama_cloud",
+      messages: [
+        { role: "system", content: SYSTEM_PROMPT },
+        { role: "user", content: userPrompt },
+      ],
       temperature: 0.2,
       max_tokens: MAX_TOKENS,
       think: false,
@@ -86,10 +94,10 @@ export async function generateProposal(gap: Gap, historySummary: string = ""): P
     signal: AbortSignal.timeout(180000), // cloud T3 can be slow — 3 min
   });
   if (!r.ok) {
-    throw new Error(`sidecar ${r.status}: ${await r.text()}`);
+    throw new Error(`gateway /v1/chat ${r.status}: ${await r.text()}`);
   }
   const j = await r.json() as any;
-  const raw: string = j.text ?? j.response ?? "";
+  const raw: string = j?.choices?.[0]?.message?.content ?? "";
   const usage = j.usage ?? {};
   const tokens = (usage.prompt_tokens ?? 0) + (usage.completion_tokens ?? 0);

mcp-server/observer.ts

@@ -550,13 +550,19 @@ async function analyzeErrors() {
     `[${f.endpoint}] ${f.input_summary}: ${f.error}`
   ).join("\n");
-  // Ask local model to diagnose
+  // Ask local model to diagnose. Phase 44 migration (2026-04-27):
+  // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
+  // call + Langfuse traces it. Same upstream model (qwen2.5 local).
   try {
-    const resp = await fetch(`${LAKEHOUSE}/ai/generate`, {
+    const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
       method: "POST",
       headers: { "Content-Type": "application/json" },
       body: JSON.stringify({
-        prompt: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
+        model: "qwen2.5",
+        provider: "ollama",
+        messages: [{
+          role: "user",
+          content: `You are a system reliability observer. Analyze these recent failures and suggest fixes:
 ${errorSummary}
@@ -566,14 +572,15 @@ For each error:
 3. Should this be added to the playbook as a "don't do this"?
 Be specific and actionable. Under 200 words.`,
-        model: "qwen2.5",
+        }],
         max_tokens: 400,
         temperature: 0.2,
       }),
     });
-    const analysis = await resp.json();
-    if (analysis.text) {
-      console.error(`[observer] Error analysis:\n${analysis.text}`);
+    const analysis = await resp.json() as any;
+    const analysisText = analysis?.choices?.[0]?.message?.content ?? "";
+    if (analysisText) {
+      console.error(`[observer] Error analysis:\n${analysisText}`);
       // Log the analysis as a playbook entry
       await fetch(`${GATEWAY}/log`, {
         method: "POST",
@@ -581,7 +588,7 @@ Be specific and actionable. Under 200 words.`,
         body: JSON.stringify({
           operation: `error_analysis: ${failures.length} failures`,
           approach: "LLM-analyzed error patterns",
-          result: analysis.text.slice(0, 500),
+          result: analysisText.slice(0, 500),
           context: errorSummary.slice(0, 500),
         }),
       });

scripts/check_phase44_callers.sh

@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Phase 44 caller-migration guard. Fails if any non-adapter file
+# fetches the sidecar's /generate endpoint or hits Ollama Cloud's
+# /api/generate directly. Adapter files (gateway provider crate +
+# the sidecar's own Python implementation) are exempt.
+#
+# Run: ./scripts/check_phase44_callers.sh
+# CI: fail-loud (exits non-zero on regression)
+# Watch: pre-commit hook — invoke from .git/hooks/pre-commit
+
+set -e
+cd "$(dirname "$0")/.."
+
+FORBIDDEN_TS=$(grep -rEln "fetch\([^)]*[/\$]generate" \
+  --include="*.ts" \
+  --exclude-dir=node_modules \
+  --exclude-dir=target \
+  --exclude-dir=.git \
+  . 2>/dev/null | grep -v "^\./sidecar/" || true)
+
+FORBIDDEN_RS=$(grep -rEln "post\([^)]*\"\.?/generate\"" \
+  --include="*.rs" \
+  --exclude-dir=target \
+  . 2>/dev/null | \
+  grep -vE "^\./crates/(gateway|aibridge)/" || true)
+
+# Ollama Cloud /api/generate outside the gateway adapter. Match only
+# when the URL appears in an actual fetch/post call (not in a comment).
+# Tightened 2026-04-27 — pre-tightening regex flagged prose mentions.
+FORBIDDEN_CLOUD=$(grep -rEln "(fetch|client\.post)\([^)]*api/generate" \
+  --include="*.ts" --include="*.rs" \
+  --exclude-dir=node_modules \
+  --exclude-dir=target \
+  --exclude-dir=.git \
+  . 2>/dev/null | \
+  grep -vE "^\./(crates/gateway|sidecar)" || true)
+
+ANY_FAIL=0
+if [ -n "$FORBIDDEN_TS" ]; then
+  echo "❌ Direct sidecar /generate calls (migrate to /v1/chat):"
+  echo "$FORBIDDEN_TS" | sed 's/^/ /'
+  ANY_FAIL=1
+fi
+if [ -n "$FORBIDDEN_RS" ]; then
+  echo "❌ Direct Rust /generate post() calls (migrate via gateway adapter):"
+  echo "$FORBIDDEN_RS" | sed 's/^/ /'
+  ANY_FAIL=1
+fi
+if [ -n "$FORBIDDEN_CLOUD" ]; then
+  echo "❌ Direct Ollama Cloud /api/generate (migrate to gateway provider=ollama_cloud):"
+  echo "$FORBIDDEN_CLOUD" | sed 's/^/ /'
+  ANY_FAIL=1
+fi
+
+if [ $ANY_FAIL -eq 0 ]; then
+  echo "✅ Phase 44 caller-migration: clean (no direct /generate outside adapters)"
+fi
+exit $ANY_FAIL

tests/agent_test/agent_harness.ts

@@ -161,16 +161,17 @@ const TOOL_SCHEMA = `Available tools (call by emitting JSON like: {"tool": "name
 // ─── AGENT LOOP ───
 async function callAgent(messages: Array<{role: string; content: string}>): Promise<string> {
-  // think:false disables hidden reasoning so all generated tokens go to
-  // visible response. qwen3.5:latest defaults to thinking and silently
-  // burns the token budget otherwise.
-  const r = await fetch(`${SIDECAR}/generate`, {
+  // Phase 44 migration (2026-04-27): /v1/chat instead of direct sidecar
+  // /generate so /v1/usage tracks the call, Langfuse traces it.
+  // think:false still disables hidden reasoning so generated tokens
+  // go to visible response — qwen3.5:latest defaults to thinking.
+  const r = await fetch(`${GATEWAY}/v1/chat`, {
     method: "POST",
     headers: { "content-type": "application/json" },
     body: JSON.stringify({
       model: AGENT_MODEL,
-      prompt: messages.map(m => `${m.role.toUpperCase()}:\n${m.content}`).join("\n\n") + "\n\nASSISTANT:\n",
-      stream: false,
+      provider: "ollama",
+      messages,
       max_tokens: 1500,
       think: false,
     }),
@@ -178,7 +179,7 @@ async function callAgent(messages: Array<{role: string; content: string}>): Prom
   });
   if (!r.ok) throw new Error(`agent ${r.status}: ${(await r.text()).slice(0, 200)}`);
   const j: any = await r.json();
-  return String(j.text ?? j.response ?? "").trim();
+  return String(j?.choices?.[0]?.message?.content ?? "").trim();
 }
 function extractToolCall(response: string): { tool: string; args: any } | null {

tests/multi-agent/agent.ts

@@ -372,26 +372,34 @@ export async function generate(model: string, prompt: string, opts: {
   bypass_budget?: boolean;
   think?: boolean;
 } = {}): Promise<string> {
+  // Phase 44 migration (2026-04-27): was hitting `${SIDECAR}/generate`
+  // directly, bypassing the gateway's /v1/usage accounting + Langfuse
+  // tracing. Now flows through /v1/chat with provider="ollama" so
+  // every local call is observable + auditable. Sidecar transport is
+  // unchanged — gateway just owns the call.
   assertContextBudget(model, prompt, {
     system: opts.system,
     max_tokens: opts.max_tokens,
     bypass: opts.bypass_budget,
   });
+  const messages: Array<{ role: string; content: string }> = [];
+  if (opts.system) messages.push({ role: "system", content: opts.system });
+  messages.push({ role: "user", content: prompt });
   const body: Record<string, any> = {
     model,
-    prompt,
+    messages,
+    provider: "ollama",
     temperature: opts.temperature ?? 0.3,
     max_tokens: opts.max_tokens ?? 800,
   };
-  if (opts.system) body.system = opts.system;
   if (opts.think !== undefined) body.think = opts.think;
-  const r = await http<any>("POST", `${SIDECAR}/generate`, body);
-  const text = typeof r.text === "string" ? r.text : "";
+  const r = await http<any>("POST", `${GATEWAY}/v1/chat`, body);
+  const text = r?.choices?.[0]?.message?.content ?? "";
   // Do NOT throw on empty. Thinking models (gpt-oss, qwen3.5) burn the
   // max_tokens budget on hidden reasoning and emit "" when budget was
   // too tight. generateContinuable detects empty + continues with more
   // budget. Callers that expected non-empty can check themselves.
-  return text;
+  return typeof text === "string" ? text : "";
 }
 // Cloud generate — routes through the lakehouse gateway's /v1/chat