Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11

Merged
profit merged 118 commits from scrum/auto-apply-19814 into main 2026-04-27 15:55:24 +00:00
2 changed files with 257 additions and 11 deletions
Showing only changes of commit 3f166a5558 - Show all commits

View File

@ -224,6 +224,165 @@ async function escalateFailureClusterToLLMTeam(sigHash: string, cluster: Observe
// persists across cycles. // persists across cycles.
const escalatedSigHashes = new Set<string>(); const escalatedSigHashes = new Set<string>();
// ─── Hand-review for scrum/agent candidate responses (2026-04-25) ───
//
// Observer is OUTSIDE the scrum loop's epistemic scope, so its verdict
// can be treated as truth about whether a candidate review is grounded.
// Two-tier evaluator:
// 1. Try cloud LLM (qwen3-coder:480b) — semantic judgment with
// response + source excerpt + grounding stats as context.
// 2. On cloud failure (throttle/timeout) → deterministic heuristic
// over grounding_pct + total_quotes. Marked source: "heuristic"
// so consumers can tell which rung produced the verdict.
// Every verdict is persisted to data/_kb/observer_reviews.jsonl.
const OBSERVER_REVIEWS = "/home/profit/lakehouse/data/_kb/observer_reviews.jsonl";
interface HandReviewInput {
file_path: string;
model: string;
response: string;
source_content: string;
grounding_stats: { total: number; grounded: number; groundedPct: number | null };
attempt: number;
}
interface HandReviewVerdict {
verdict: "accept" | "reject" | "cycle";
confidence: number;
notes: string;
source: "cloud" | "heuristic";
}
async function handReview(input: HandReviewInput): Promise<HandReviewVerdict> {
const t0 = Date.now();
let verdict: HandReviewVerdict;
try {
verdict = await cloudHandReview(input);
} catch (e) {
console.error(`[observer/review] cloud failed (${(e as Error).message}); using heuristic`);
verdict = heuristicHandReview(input);
}
// Persist regardless of source so we can later compare cloud vs
// heuristic verdicts on the same input and tune the heuristic.
const row = {
ts: new Date().toISOString(),
file_path: input.file_path,
model: input.model,
attempt: input.attempt,
response_chars: input.response.length,
grounding_stats: input.grounding_stats,
verdict: verdict.verdict,
confidence: verdict.confidence,
notes: verdict.notes,
source: verdict.source,
duration_ms: Date.now() - t0,
};
try {
const { appendFile } = await import("node:fs/promises");
await appendFile(OBSERVER_REVIEWS, JSON.stringify(row) + "\n");
} catch { /* best-effort persistence */ }
return verdict;
}
async function cloudHandReview(input: HandReviewInput): Promise<HandReviewVerdict> {
const grounded = input.grounding_stats.grounded;
const total = input.grounding_stats.total;
const pct = input.grounding_stats.groundedPct;
// Truncate to keep the prompt under typical context windows.
// 2000 + 4000 = ~6000 chars ≈ 1500 tokens, plus response context.
const responseExcerpt = input.response.slice(0, 2000);
const sourceExcerpt = input.source_content.slice(0, 4000);
const prompt = `You are a code-review quality observer. Decide whether the following automated review is grounded in the actual source — not invented, not hallucinated.
FILE: ${input.file_path}
MODEL: ${input.model}
ATTEMPT: ${input.attempt}
ANCHOR GROUNDING: ${grounded}/${total} backtick-quoted snippets matched the source verbatim${pct !== null ? ` (${pct}%)` : ""}
REVIEW (first 2000 chars):
\`\`\`
${responseExcerpt}
\`\`\`
SOURCE EXCERPT (first 4000 chars):
\`\`\`
${sourceExcerpt}
\`\`\`
Respond ONLY with a JSON object:
{
"verdict": "accept" | "reject" | "cycle",
"confidence": 0-100,
"notes": "<1-2 sentences on what makes this grounded or hallucinated>"
}
- accept: review references real symbols/lines in source; findings could be acted on.
- reject: review invents APIs, fabricates calls, contradicts source. Do NOT record.
- cycle: review is mediocre partially grounded but wrong shape, try a stronger model.`;
const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
provider: "ollama_cloud",
model: "qwen3-coder:480b",
messages: [{ role: "user", content: prompt }],
max_tokens: 300,
temperature: 0.0,
}),
signal: AbortSignal.timeout(45000),
});
if (!resp.ok) {
throw new Error(`/v1/chat ${resp.status}: ${(await resp.text()).slice(0, 200)}`);
}
const j: any = await resp.json();
const content = (j?.choices?.[0]?.message?.content ?? "").trim();
// Pull JSON object from the response — model may wrap it in prose.
const m = content.match(/\{[\s\S]*\}/);
if (!m) throw new Error(`no JSON object in response: ${content.slice(0, 100)}`);
const parsed = JSON.parse(m[0]);
const v = String(parsed.verdict ?? "accept").toLowerCase();
return {
verdict: (v === "reject" || v === "cycle") ? v as "reject" | "cycle" : "accept",
confidence: Number(parsed.confidence ?? 50),
notes: String(parsed.notes ?? "").slice(0, 500),
source: "cloud",
};
}
function heuristicHandReview(input: HandReviewInput): HandReviewVerdict {
// Deterministic fallback when cloud is throttled. Conservative:
// only flip to reject when the evidence is overwhelming, otherwise
// accept (fall-open principle — observer is policy, not blocker).
const total = input.grounding_stats.total;
const pct = input.grounding_stats.groundedPct;
const respLen = input.response.length;
// Too short to be a real review
if (respLen < 1500) {
return { verdict: "reject", confidence: 80, notes: `response too short (${respLen} chars)`, source: "heuristic" };
}
// Below 5 quotes — not enough signal to judge grounding; accept
if (total < 5 || pct === null) {
return { verdict: "accept", confidence: 50, notes: `insufficient quote signal (${total} quotes); accepting`, source: "heuristic" };
}
// Very heavy hallucination
if (pct < 20) {
return { verdict: "reject", confidence: 85, notes: `low grounding (${pct}% of ${total} quotes)`, source: "heuristic" };
}
// Mediocre — cycle to a stronger model
if (pct < 50) {
return { verdict: "cycle", confidence: 65, notes: `mediocre grounding (${pct}% of ${total} quotes); try stronger`, source: "heuristic" };
}
// Good enough
return { verdict: "accept", confidence: 75, notes: `grounding ${pct}% of ${total} quotes`, source: "heuristic" };
}
async function maybeEscalate(failures: ObservedOp[]) { async function maybeEscalate(failures: ObservedOp[]) {
// Group failures by sig_hash // Group failures by sig_hash
const bySig = new Map<string, ObservedOp[]>(); const bySig = new Map<string, ObservedOp[]>();
@ -362,6 +521,28 @@ function startHttpListener() {
.map(o => ({ ts: o.timestamp, ok: o.success, staffer: o.staffer_id, kind: o.event_kind, role: o.role })), .map(o => ({ ts: o.timestamp, ok: o.success, staffer: o.staffer_id, kind: o.event_kind, role: o.role })),
})); }));
} }
// ─── Hand-review endpoint (2026-04-25) ───
// scrum/agent posts a candidate response + source content + grounding
// stats. Observer evaluates via cloud LLM (qwen3-coder:480b) with
// semantic context and returns {verdict, confidence, notes}. On
// cloud throttle, falls back to a deterministic heuristic over the
// grounding stats so the loop keeps moving with honest signal.
//
// This is the policy layer scrum was missing — pre-2026-04-25 the
// scrum_master applied a hardcoded grounding-rate threshold inline,
// which baked judgment into the wrong layer. Now scrum reports data
// (response + source + stats) and observer decides accept/reject/cycle.
if (req.method === "POST" && url.pathname === "/review") {
return req.json().then((body: any) => handReview(body))
.then((verdict) => new Response(JSON.stringify(verdict), {
headers: { "content-type": "application/json" },
}))
.catch((e: Error) =>
new Response(JSON.stringify({ verdict: "accept", notes: `observer error: ${e.message}`, source: "heuristic" }), {
status: 200, // fall-open shape — scrum keeps moving on observer failure
headers: { "content-type": "application/json" },
}));
}
if (req.method === "POST" && url.pathname === "/event") { if (req.method === "POST" && url.pathname === "/event") {
return req.json().then((body: any) => { return req.json().then((body: any) => {
const op: ObservedOp = { const op: ObservedOp = {

View File

@ -301,6 +301,52 @@ async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promi
} }
} }
// Observer hand-review — the policy layer that decides whether a
// candidate response is grounded enough to accept. Lives in mcp-server's
// observer (port 3800) so it sits OUTSIDE the scrum loop's epistemic
// scope. Synchronous so scrum can act on the verdict immediately.
//
// Returns {verdict, confidence, notes}. verdict ∈ {accept, reject, cycle}.
// On unreachable observer (network, timeout, parse failure), falls open
// to {verdict: "accept"} — the observer is the policy layer, not a hard
// dependency. Pipeline keeps moving when the observer is down.
const OBSERVER_URL = process.env.LH_OBSERVER_URL ?? "http://localhost:3800";
interface ObserverVerdict {
verdict: "accept" | "reject" | "cycle";
confidence?: number;
notes?: string;
source?: "cloud" | "heuristic";
}
async function observerHandReview(input: {
file_path: string;
model: string;
response: string;
source_content: string;
grounding_stats: { total: number; grounded: number; groundedPct: number | null };
attempt: number;
}): Promise<ObserverVerdict> {
try {
const r = await fetch(`${OBSERVER_URL}/review`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify(input),
signal: AbortSignal.timeout(60000),
});
if (!r.ok) {
// Observer down or rejected the request — fall open to accept so
// the loop keeps moving. Log so we notice degradation.
console.error(`[scrum] observer review unreachable (${r.status}), falling open to accept`);
return { verdict: "accept", notes: `observer ${r.status}`, source: "heuristic" };
}
return (await r.json()) as ObserverVerdict;
} catch (e: any) {
console.error(`[scrum] observer review failed (${e.message}), falling open to accept`);
return { verdict: "accept", notes: `observer error: ${e.message}`, source: "heuristic" };
}
}
// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug // ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
// fingerprints for this narrow fingerprint (same key as hot-swap — // fingerprints for this narrow fingerprint (same key as hot-swap —
// task_class + file_prefix + signal_class) so the reviewer prompt // task_class + file_prefix + signal_class) so the reviewer prompt
@ -1057,22 +1103,41 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
log(` ✗ thin/unstructured (${r.content.length} chars)`); log(` ✗ thin/unstructured (${r.content.length} chars)`);
continue; continue;
} }
// Compute grounding stats as DATA — feed to observer for hand-review.
// We no longer gate locally on a hardcoded threshold; that judgment
// belongs to the observer (which has Langfuse traces + can call cloud
// models for semantic review). Local stats are still informational
// and get appended as a footer for humans.
const groundingStats = verifyAnchorGrounding(r.content, content);
// Observer hand-review — synchronous call to mcp-server :3800. Observer
// returns {verdict: accept|reject|cycle, confidence, notes}. If the
// observer is unreachable or errors, fall through to acceptance (the
// observer is the policy layer; pipeline keeps moving when it's down).
const obsVerdict = await observerHandReview({
file_path: rel,
model: `${rung.provider}/${rung.model}`,
response: r.content,
source_content: content,
grounding_stats: groundingStats,
attempt: n,
});
if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
log(`${reason} — cycling ladder`);
continue;
}
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length }); history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
accepted = r.content; accepted = r.content;
acceptedModel = `${rung.provider}/${rung.model}`; acceptedModel = `${rung.provider}/${rung.model}`;
acceptedOn = n; acceptedOn = n;
// Post-acceptance: when tree-split fired, run the anchor-grounding log(` ⚓ anchor grounding: ${groundingStats.grounded}/${groundingStats.total} quotes matched source` +
// verifier and append a footer with the grounding rate. The footer (groundingStats.groundedPct !== null ? ` (${groundingStats.groundedPct}%)` : "") +
// surfaces ungrounded quotes so humans can spot hallucinated ` · observer ${obsVerdict.verdict}` + (obsVerdict.confidence ? ` (conf=${obsVerdict.confidence})` : ""));
// findings at a glance — prevents the 0/10 confabulation pattern accepted = appendGroundingFooter(accepted, groundingStats);
// observed on llm_team_ui.py 2026-04-24.
if (treeSplitFired) {
const stats = verifyAnchorGrounding(accepted, content);
log(` ⚓ anchor grounding: ${stats.grounded}/${stats.total} quotes matched source` +
(stats.groundedPct !== null ? ` (${stats.groundedPct}%)` : ""));
accepted = appendGroundingFooter(accepted, stats);
}
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`); log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
break; break;
} }