Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11
@ -224,6 +224,165 @@ async function escalateFailureClusterToLLMTeam(sigHash: string, cluster: Observe
|
|||||||
// persists across cycles.
|
// persists across cycles.
|
||||||
const escalatedSigHashes = new Set<string>();
|
const escalatedSigHashes = new Set<string>();
|
||||||
|
|
||||||
|
// ─── Hand-review for scrum/agent candidate responses (2026-04-25) ───
|
||||||
|
//
|
||||||
|
// Observer is OUTSIDE the scrum loop's epistemic scope, so its verdict
|
||||||
|
// can be treated as truth about whether a candidate review is grounded.
|
||||||
|
// Two-tier evaluator:
|
||||||
|
// 1. Try cloud LLM (qwen3-coder:480b) — semantic judgment with
|
||||||
|
// response + source excerpt + grounding stats as context.
|
||||||
|
// 2. On cloud failure (throttle/timeout) → deterministic heuristic
|
||||||
|
// over grounding_pct + total_quotes. Marked source: "heuristic"
|
||||||
|
// so consumers can tell which rung produced the verdict.
|
||||||
|
// Every verdict is persisted to data/_kb/observer_reviews.jsonl.
|
||||||
|
|
||||||
|
const OBSERVER_REVIEWS = "/home/profit/lakehouse/data/_kb/observer_reviews.jsonl";
|
||||||
|
|
||||||
|
interface HandReviewInput {
|
||||||
|
file_path: string;
|
||||||
|
model: string;
|
||||||
|
response: string;
|
||||||
|
source_content: string;
|
||||||
|
grounding_stats: { total: number; grounded: number; groundedPct: number | null };
|
||||||
|
attempt: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface HandReviewVerdict {
|
||||||
|
verdict: "accept" | "reject" | "cycle";
|
||||||
|
confidence: number;
|
||||||
|
notes: string;
|
||||||
|
source: "cloud" | "heuristic";
|
||||||
|
}
|
||||||
|
|
||||||
|
async function handReview(input: HandReviewInput): Promise<HandReviewVerdict> {
|
||||||
|
const t0 = Date.now();
|
||||||
|
let verdict: HandReviewVerdict;
|
||||||
|
|
||||||
|
try {
|
||||||
|
verdict = await cloudHandReview(input);
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[observer/review] cloud failed (${(e as Error).message}); using heuristic`);
|
||||||
|
verdict = heuristicHandReview(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Persist regardless of source so we can later compare cloud vs
|
||||||
|
// heuristic verdicts on the same input and tune the heuristic.
|
||||||
|
const row = {
|
||||||
|
ts: new Date().toISOString(),
|
||||||
|
file_path: input.file_path,
|
||||||
|
model: input.model,
|
||||||
|
attempt: input.attempt,
|
||||||
|
response_chars: input.response.length,
|
||||||
|
grounding_stats: input.grounding_stats,
|
||||||
|
verdict: verdict.verdict,
|
||||||
|
confidence: verdict.confidence,
|
||||||
|
notes: verdict.notes,
|
||||||
|
source: verdict.source,
|
||||||
|
duration_ms: Date.now() - t0,
|
||||||
|
};
|
||||||
|
try {
|
||||||
|
const { appendFile } = await import("node:fs/promises");
|
||||||
|
await appendFile(OBSERVER_REVIEWS, JSON.stringify(row) + "\n");
|
||||||
|
} catch { /* best-effort persistence */ }
|
||||||
|
|
||||||
|
return verdict;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function cloudHandReview(input: HandReviewInput): Promise<HandReviewVerdict> {
|
||||||
|
const grounded = input.grounding_stats.grounded;
|
||||||
|
const total = input.grounding_stats.total;
|
||||||
|
const pct = input.grounding_stats.groundedPct;
|
||||||
|
// Truncate to keep the prompt under typical context windows.
|
||||||
|
// 2000 + 4000 = ~6000 chars ≈ 1500 tokens, plus response context.
|
||||||
|
const responseExcerpt = input.response.slice(0, 2000);
|
||||||
|
const sourceExcerpt = input.source_content.slice(0, 4000);
|
||||||
|
|
||||||
|
const prompt = `You are a code-review quality observer. Decide whether the following automated review is grounded in the actual source — not invented, not hallucinated.
|
||||||
|
|
||||||
|
FILE: ${input.file_path}
|
||||||
|
MODEL: ${input.model}
|
||||||
|
ATTEMPT: ${input.attempt}
|
||||||
|
ANCHOR GROUNDING: ${grounded}/${total} backtick-quoted snippets matched the source verbatim${pct !== null ? ` (${pct}%)` : ""}
|
||||||
|
|
||||||
|
REVIEW (first 2000 chars):
|
||||||
|
\`\`\`
|
||||||
|
${responseExcerpt}
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
SOURCE EXCERPT (first 4000 chars):
|
||||||
|
\`\`\`
|
||||||
|
${sourceExcerpt}
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
Respond ONLY with a JSON object:
|
||||||
|
{
|
||||||
|
"verdict": "accept" | "reject" | "cycle",
|
||||||
|
"confidence": 0-100,
|
||||||
|
"notes": "<1-2 sentences on what makes this grounded or hallucinated>"
|
||||||
|
}
|
||||||
|
|
||||||
|
- accept: review references real symbols/lines in source; findings could be acted on.
|
||||||
|
- reject: review invents APIs, fabricates calls, contradicts source. Do NOT record.
|
||||||
|
- cycle: review is mediocre — partially grounded but wrong shape, try a stronger model.`;
|
||||||
|
|
||||||
|
const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({
|
||||||
|
provider: "ollama_cloud",
|
||||||
|
model: "qwen3-coder:480b",
|
||||||
|
messages: [{ role: "user", content: prompt }],
|
||||||
|
max_tokens: 300,
|
||||||
|
temperature: 0.0,
|
||||||
|
}),
|
||||||
|
signal: AbortSignal.timeout(45000),
|
||||||
|
});
|
||||||
|
if (!resp.ok) {
|
||||||
|
throw new Error(`/v1/chat ${resp.status}: ${(await resp.text()).slice(0, 200)}`);
|
||||||
|
}
|
||||||
|
const j: any = await resp.json();
|
||||||
|
const content = (j?.choices?.[0]?.message?.content ?? "").trim();
|
||||||
|
// Pull JSON object from the response — model may wrap it in prose.
|
||||||
|
const m = content.match(/\{[\s\S]*\}/);
|
||||||
|
if (!m) throw new Error(`no JSON object in response: ${content.slice(0, 100)}`);
|
||||||
|
const parsed = JSON.parse(m[0]);
|
||||||
|
const v = String(parsed.verdict ?? "accept").toLowerCase();
|
||||||
|
return {
|
||||||
|
verdict: (v === "reject" || v === "cycle") ? v as "reject" | "cycle" : "accept",
|
||||||
|
confidence: Number(parsed.confidence ?? 50),
|
||||||
|
notes: String(parsed.notes ?? "").slice(0, 500),
|
||||||
|
source: "cloud",
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function heuristicHandReview(input: HandReviewInput): HandReviewVerdict {
|
||||||
|
// Deterministic fallback when cloud is throttled. Conservative:
|
||||||
|
// only flip to reject when the evidence is overwhelming, otherwise
|
||||||
|
// accept (fall-open principle — observer is policy, not blocker).
|
||||||
|
const total = input.grounding_stats.total;
|
||||||
|
const pct = input.grounding_stats.groundedPct;
|
||||||
|
const respLen = input.response.length;
|
||||||
|
|
||||||
|
// Too short to be a real review
|
||||||
|
if (respLen < 1500) {
|
||||||
|
return { verdict: "reject", confidence: 80, notes: `response too short (${respLen} chars)`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
// Below 5 quotes — not enough signal to judge grounding; accept
|
||||||
|
if (total < 5 || pct === null) {
|
||||||
|
return { verdict: "accept", confidence: 50, notes: `insufficient quote signal (${total} quotes); accepting`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
// Very heavy hallucination
|
||||||
|
if (pct < 20) {
|
||||||
|
return { verdict: "reject", confidence: 85, notes: `low grounding (${pct}% of ${total} quotes)`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
// Mediocre — cycle to a stronger model
|
||||||
|
if (pct < 50) {
|
||||||
|
return { verdict: "cycle", confidence: 65, notes: `mediocre grounding (${pct}% of ${total} quotes); try stronger`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
// Good enough
|
||||||
|
return { verdict: "accept", confidence: 75, notes: `grounding ${pct}% of ${total} quotes`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
|
||||||
async function maybeEscalate(failures: ObservedOp[]) {
|
async function maybeEscalate(failures: ObservedOp[]) {
|
||||||
// Group failures by sig_hash
|
// Group failures by sig_hash
|
||||||
const bySig = new Map<string, ObservedOp[]>();
|
const bySig = new Map<string, ObservedOp[]>();
|
||||||
@ -362,6 +521,28 @@ function startHttpListener() {
|
|||||||
.map(o => ({ ts: o.timestamp, ok: o.success, staffer: o.staffer_id, kind: o.event_kind, role: o.role })),
|
.map(o => ({ ts: o.timestamp, ok: o.success, staffer: o.staffer_id, kind: o.event_kind, role: o.role })),
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
// ─── Hand-review endpoint (2026-04-25) ───
|
||||||
|
// scrum/agent posts a candidate response + source content + grounding
|
||||||
|
// stats. Observer evaluates via cloud LLM (qwen3-coder:480b) with
|
||||||
|
// semantic context and returns {verdict, confidence, notes}. On
|
||||||
|
// cloud throttle, falls back to a deterministic heuristic over the
|
||||||
|
// grounding stats so the loop keeps moving with honest signal.
|
||||||
|
//
|
||||||
|
// This is the policy layer scrum was missing — pre-2026-04-25 the
|
||||||
|
// scrum_master applied a hardcoded grounding-rate threshold inline,
|
||||||
|
// which baked judgment into the wrong layer. Now scrum reports data
|
||||||
|
// (response + source + stats) and observer decides accept/reject/cycle.
|
||||||
|
if (req.method === "POST" && url.pathname === "/review") {
|
||||||
|
return req.json().then((body: any) => handReview(body))
|
||||||
|
.then((verdict) => new Response(JSON.stringify(verdict), {
|
||||||
|
headers: { "content-type": "application/json" },
|
||||||
|
}))
|
||||||
|
.catch((e: Error) =>
|
||||||
|
new Response(JSON.stringify({ verdict: "accept", notes: `observer error: ${e.message}`, source: "heuristic" }), {
|
||||||
|
status: 200, // fall-open shape — scrum keeps moving on observer failure
|
||||||
|
headers: { "content-type": "application/json" },
|
||||||
|
}));
|
||||||
|
}
|
||||||
if (req.method === "POST" && url.pathname === "/event") {
|
if (req.method === "POST" && url.pathname === "/event") {
|
||||||
return req.json().then((body: any) => {
|
return req.json().then((body: any) => {
|
||||||
const op: ObservedOp = {
|
const op: ObservedOp = {
|
||||||
|
|||||||
@ -301,6 +301,52 @@ async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Observer hand-review — the policy layer that decides whether a
|
||||||
|
// candidate response is grounded enough to accept. Lives in mcp-server's
|
||||||
|
// observer (port 3800) so it sits OUTSIDE the scrum loop's epistemic
|
||||||
|
// scope. Synchronous so scrum can act on the verdict immediately.
|
||||||
|
//
|
||||||
|
// Returns {verdict, confidence, notes}. verdict ∈ {accept, reject, cycle}.
|
||||||
|
// On unreachable observer (network, timeout, parse failure), falls open
|
||||||
|
// to {verdict: "accept"} — the observer is the policy layer, not a hard
|
||||||
|
// dependency. Pipeline keeps moving when the observer is down.
|
||||||
|
const OBSERVER_URL = process.env.LH_OBSERVER_URL ?? "http://localhost:3800";
|
||||||
|
|
||||||
|
interface ObserverVerdict {
|
||||||
|
verdict: "accept" | "reject" | "cycle";
|
||||||
|
confidence?: number;
|
||||||
|
notes?: string;
|
||||||
|
source?: "cloud" | "heuristic";
|
||||||
|
}
|
||||||
|
|
||||||
|
async function observerHandReview(input: {
|
||||||
|
file_path: string;
|
||||||
|
model: string;
|
||||||
|
response: string;
|
||||||
|
source_content: string;
|
||||||
|
grounding_stats: { total: number; grounded: number; groundedPct: number | null };
|
||||||
|
attempt: number;
|
||||||
|
}): Promise<ObserverVerdict> {
|
||||||
|
try {
|
||||||
|
const r = await fetch(`${OBSERVER_URL}/review`, {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "content-type": "application/json" },
|
||||||
|
body: JSON.stringify(input),
|
||||||
|
signal: AbortSignal.timeout(60000),
|
||||||
|
});
|
||||||
|
if (!r.ok) {
|
||||||
|
// Observer down or rejected the request — fall open to accept so
|
||||||
|
// the loop keeps moving. Log so we notice degradation.
|
||||||
|
console.error(`[scrum] observer review unreachable (${r.status}), falling open to accept`);
|
||||||
|
return { verdict: "accept", notes: `observer ${r.status}`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
return (await r.json()) as ObserverVerdict;
|
||||||
|
} catch (e: any) {
|
||||||
|
console.error(`[scrum] observer review failed (${e.message}), falling open to accept`);
|
||||||
|
return { verdict: "accept", notes: `observer error: ${e.message}`, source: "heuristic" };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
|
// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
|
||||||
// fingerprints for this narrow fingerprint (same key as hot-swap —
|
// fingerprints for this narrow fingerprint (same key as hot-swap —
|
||||||
// task_class + file_prefix + signal_class) so the reviewer prompt
|
// task_class + file_prefix + signal_class) so the reviewer prompt
|
||||||
@ -1057,22 +1103,41 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
// Compute grounding stats as DATA — feed to observer for hand-review.
|
||||||
|
// We no longer gate locally on a hardcoded threshold; that judgment
|
||||||
|
// belongs to the observer (which has Langfuse traces + can call cloud
|
||||||
|
// models for semantic review). Local stats are still informational
|
||||||
|
// and get appended as a footer for humans.
|
||||||
|
const groundingStats = verifyAnchorGrounding(r.content, content);
|
||||||
|
|
||||||
|
// Observer hand-review — synchronous call to mcp-server :3800. Observer
|
||||||
|
// returns {verdict: accept|reject|cycle, confidence, notes}. If the
|
||||||
|
// observer is unreachable or errors, fall through to acceptance (the
|
||||||
|
// observer is the policy layer; pipeline keeps moving when it's down).
|
||||||
|
const obsVerdict = await observerHandReview({
|
||||||
|
file_path: rel,
|
||||||
|
model: `${rung.provider}/${rung.model}`,
|
||||||
|
response: r.content,
|
||||||
|
source_content: content,
|
||||||
|
grounding_stats: groundingStats,
|
||||||
|
attempt: n,
|
||||||
|
});
|
||||||
|
if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
|
||||||
|
const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
|
||||||
|
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
|
||||||
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
|
||||||
|
log(` ✗ ${reason} — cycling ladder`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
||||||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
|
||||||
accepted = r.content;
|
accepted = r.content;
|
||||||
acceptedModel = `${rung.provider}/${rung.model}`;
|
acceptedModel = `${rung.provider}/${rung.model}`;
|
||||||
acceptedOn = n;
|
acceptedOn = n;
|
||||||
// Post-acceptance: when tree-split fired, run the anchor-grounding
|
log(` ⚓ anchor grounding: ${groundingStats.grounded}/${groundingStats.total} quotes matched source` +
|
||||||
// verifier and append a footer with the grounding rate. The footer
|
(groundingStats.groundedPct !== null ? ` (${groundingStats.groundedPct}%)` : "") +
|
||||||
// surfaces ungrounded quotes so humans can spot hallucinated
|
` · observer ${obsVerdict.verdict}` + (obsVerdict.confidence ? ` (conf=${obsVerdict.confidence})` : ""));
|
||||||
// findings at a glance — prevents the 0/10 confabulation pattern
|
accepted = appendGroundingFooter(accepted, groundingStats);
|
||||||
// observed on llm_team_ui.py 2026-04-24.
|
|
||||||
if (treeSplitFired) {
|
|
||||||
const stats = verifyAnchorGrounding(accepted, content);
|
|
||||||
log(` ⚓ anchor grounding: ${stats.grounded}/${stats.total} quotes matched source` +
|
|
||||||
(stats.groundedPct !== null ? ` (${stats.groundedPct}%)` : ""));
|
|
||||||
accepted = appendGroundingFooter(accepted, stats);
|
|
||||||
}
|
|
||||||
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
|
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user