lakehouse/tests/real-world/scrum_applier.ts
root 25ea3de836
Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "journal event verified live (total_events_created 0→1 after probe)."
observer: fix LLM Team escalation — route to /v1/chat qwen3-coder:480b instead of dead mode
Discovery 2026-04-24: /api/run?mode=code_review returns "Unknown mode"
(error response from llm_team_ui.py). The 2026-04-24 observer escalation
wiring pointed at a dead endpoint and was failing silently. My earlier
claim of "9 registered LLM Team modes" came from GET probes that all
returned 405 — I interpreted that as "POST-only endpoints exist" when
it just means "GET is not allowed for anything, and on POST only `extract`
is registered."

Rewire: observer's escalateFailureClusterToLLMTeam now hits
  POST /v1/chat { provider: "ollama_cloud", model: "qwen3-coder:480b", ... }
which is the same coding-specialist rung 2 of the scrum ladder that
reliably produces substantive reviews. Probe shows 1240 chars of
substantive analysis in ~8.7s.

Also tightens scrum_applier:
  * MODEL default: kimi-k2:1t → qwen3-coder:480b (coding specialist)
  * Size gate: 20 lines → 6 lines (surgical patches only)
  * Max patches per file: 3 → 2
  * Prompt: explicit forbidden-actions list (no struct renames, no
    function-signature changes, no new modules) and mechanical-only
    whitelist

These changes produced the first auto-applied commit (96b46cd), which
landed a 2-line import addition that passed cargo check. Zero-to-one.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 04:14:33 -05:00

360 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// scrum_applier.ts — the auto-apply pipeline.
//
// Turns the scrum master's signal into real commits. Reads
// data/_kb/scrum_reviews.jsonl, filters to rows where the scrum's
// own confidence is high enough to trust auto-apply (gradient_tier
// auto OR confidence_avg ≥ 90), asks a patch-emitting model to
// produce concrete old_string/new_string pairs, applies them via
// text replacement, runs `cargo check` after each, commits on green
// and reverts on red.
//
// Runs on its own branch (never on main). Every action is recorded
// in data/_kb/auto_apply.jsonl so the auditor and future iterations
// can see what landed and what reverted.
//
// Usage:
// bun run tests/real-world/scrum_applier.ts # dry-run, print only
// LH_APPLIER_COMMIT=1 bun run tests/real-world/scrum_applier.ts # actually apply
//
// Env:
// LH_APPLIER_BRANCH — branch name (default: "scrum/auto-apply-${Date.now()}")
// LH_APPLIER_MIN_CONF — minimum confidence_avg, default 90
// LH_APPLIER_MAX_FILES — cap on files per run (default 5, keeps diffs reviewable)
// LH_APPLIER_COMMIT — "1" to actually commit; otherwise dry-run
// LH_APPLIER_MODEL — patch-emitting model (default: kimi-k2:1t)
import { readFile, writeFile, appendFile } from "node:fs/promises";
import { existsSync } from "node:fs";
import { spawn } from "node:child_process";
const REPO = "/home/profit/lakehouse";
const GATEWAY = "http://localhost:3100";
const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
const AUDIT_LOG = `${REPO}/data/_kb/auto_apply.jsonl`;
const MIN_CONF = Number(process.env.LH_APPLIER_MIN_CONF ?? 90);
const MAX_FILES = Number(process.env.LH_APPLIER_MAX_FILES ?? 5);
const COMMIT = process.env.LH_APPLIER_COMMIT === "1";
// Default patch-emitter model — qwen3-coder:480b is the coding specialist
// in the scrum ladder (rung 2). Swapped in from kimi-k2:1t after 2026-04-24
// data showed kimi-k2:1t produces architectural patches that cascade across
// the file (rename a field → 20 broken call sites). qwen3-coder is tuned
// for targeted code changes and tends to stay within the mechanical-patch
// constraint the prompt asks for. LLM Team's /api/run?mode=patch would be
// the ideal choice but that mode isn't registered in llm_team_ui.py yet.
const MODEL = process.env.LH_APPLIER_MODEL ?? "qwen3-coder:480b";
const BRANCH = process.env.LH_APPLIER_BRANCH ?? `scrum/auto-apply-${Date.now().toString(36)}`;
// Deny-list — anything whose path starts with one of these is skipped
// regardless of how confident the scrum is. Config / systemd / docs /
// auditor itself are off limits for auto-apply; they need a human.
const DENY_PREFIXES = [
"config/",
"ops/",
"auditor/",
"docs/",
"data/",
"/etc/",
"mcp-server/",
"ui/",
"sidecar/",
"scripts/",
];
function log(msg: string) { console.log(`[applier] ${msg}`); }
async function sh(cmd: string[], cwd = REPO): Promise<{ stdout: string; stderr: string; code: number }> {
return new Promise((resolve) => {
const p = spawn(cmd[0], cmd.slice(1), { cwd, stdio: ["ignore", "pipe", "pipe"] });
let out = ""; let err = "";
p.stdout.on("data", (d) => { out += d.toString(); });
p.stderr.on("data", (d) => { err += d.toString(); });
p.on("close", (code) => resolve({ stdout: out, stderr: err, code: code ?? 1 }));
});
}
async function auditLog(row: Record<string, any>) {
const line = JSON.stringify({ ...row, ts: new Date().toISOString() }) + "\n";
await appendFile(AUDIT_LOG, line);
}
async function chat(opts: {
provider: "ollama_cloud" | "openrouter" | "ollama";
model: string;
prompt: string;
max_tokens?: number;
}): Promise<{ content: string; error?: string }> {
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
provider: opts.provider,
model: opts.model,
messages: [{ role: "user", content: opts.prompt }],
max_tokens: opts.max_tokens ?? 1500,
temperature: 0.1,
}),
signal: AbortSignal.timeout(180000),
});
if (!r.ok) return { content: "", error: `${r.status}: ${(await r.text()).slice(0, 300)}` };
const j: any = await r.json();
return { content: j.choices?.[0]?.message?.content ?? "" };
} catch (e) {
return { content: "", error: String(e) };
}
}
interface ScrumReview {
file: string;
reviewed_at: string;
accepted_model: string;
suggestions_preview: string;
confidences_per_finding?: number[];
confidence_avg?: number | null;
confidence_min?: number | null;
gradient_tier?: string;
gradient_tier_avg?: string;
verdict?: string;
critical_failures_count?: number;
schema_version?: number;
}
async function loadLatestReviews(): Promise<Map<string, ScrumReview>> {
// Map of file → latest review for that file. Ordered by reviewed_at.
if (!existsSync(SCRUM_REVIEWS)) return new Map();
const text = await readFile(SCRUM_REVIEWS, "utf8");
const rows: ScrumReview[] = text.split("\n").filter(Boolean).map(l => {
try { return JSON.parse(l); } catch { return null; }
}).filter((r): r is ScrumReview => r !== null);
// Keep the LATEST review per file.
const latest = new Map<string, ScrumReview>();
for (const r of rows) {
if (!r.file) continue;
const prev = latest.get(r.file);
if (!prev || (r.reviewed_at > prev.reviewed_at)) latest.set(r.file, r);
}
return latest;
}
function passesConfidenceGate(r: ScrumReview): boolean {
const avg = r.confidence_avg ?? 0;
const min = r.confidence_min ?? 0;
// Must be auto or dry_run tier AND confidence_min ≥ MIN_CONF.
// min is the conservative tier-lower-bound (one weak finding drags
// the whole file to "simulation" or "block" tier).
if (r.gradient_tier === "block" || r.gradient_tier === "simulation") return false;
return avg >= MIN_CONF && min >= 70;
}
function passesDenyList(file: string): boolean {
return !DENY_PREFIXES.some((p) => file.startsWith(p) || file === p.replace(/\/$/, ""));
}
interface Patch {
file: string;
old_string: string;
new_string: string;
rationale: string;
confidence: number;
}
async function requestPatches(file: string, source: string, review: string): Promise<Patch[]> {
const prompt = `You previously produced this review of ${file}:
─── REVIEW ───
${review}
─── END REVIEW ───
The review is high-confidence and the file is eligible for auto-apply. Produce CONCRETE PATCHES as JSON so they can be applied via string replacement.
HARD CONSTRAINTS (violations → patch rejected):
1. Output ONE JSON object with a "patches" array. NO prose, no markdown fences.
2. Each patch is {"old_string": "...", "new_string": "...", "rationale": "short", "confidence": 0-100}.
3. "old_string" MUST appear EXACTLY ONCE in the file (verbatim, including whitespace + trailing newlines).
4. Max diff size: 6 lines changed per patch (NOT 20). If the change needs >6 lines, emit {"patches": []} — too risky for auto-apply.
5. Mechanical-only (the list — nothing else):
(a) remove a #[allow(dead_code)] marker on a function now wired elsewhere
(b) add a missing 'use' import statement
(c) add a single field to a struct (no renames)
(d) flip a single boolean/match-arm that doesn't cascade
(e) add a fire-and-forget log or tracing call
6. FORBIDDEN (automatic reject):
• struct field RENAMES (break all call sites)
• function signature changes
• new modules, new traits, new dependencies
• any change that requires editing another file to keep it compiling
7. Each "new_string" MUST compile in isolation with the same surrounding code.
8. Max 2 patches per file — quality over quantity.
9. If you cannot produce at least one high-confidence mechanical patch under these constraints, output {"patches": []}. Don't guess.
─── SOURCE (${source.length} bytes) ───
${source.slice(0, 14000)}
─── END SOURCE ───
Emit ONLY the JSON object.`;
const r = await chat({ provider: "ollama_cloud", model: MODEL, prompt, max_tokens: 2500 });
if (r.error || !r.content) return [];
// Strip markdown fences if model wrapped the JSON.
let raw = r.content.trim();
const fenceStart = raw.match(/^```(?:json)?\s*/);
if (fenceStart) raw = raw.slice(fenceStart[0].length);
if (raw.endsWith("```")) raw = raw.slice(0, -3).trim();
// Find first { and last } to extract JSON block if there's prose.
const first = raw.indexOf("{");
const last = raw.lastIndexOf("}");
if (first >= 0 && last > first) raw = raw.slice(first, last + 1);
try {
const obj = JSON.parse(raw);
const patches: Patch[] = (obj.patches ?? []).filter((p: any) =>
typeof p?.old_string === "string" &&
typeof p?.new_string === "string" &&
p.old_string !== p.new_string &&
p.old_string.length > 0 &&
typeof p?.confidence === "number"
).map((p: any) => ({
file,
old_string: p.old_string,
new_string: p.new_string,
rationale: String(p.rationale ?? ""),
confidence: p.confidence,
}));
return patches;
} catch (e) {
log(` ${file}: patch JSON parse failed — ${String(e).slice(0, 100)}`);
return [];
}
}
async function applyPatches(file: string, patches: Patch[]): Promise<{ applied: number; rejected: Array<{patch: Patch; reason: string}> }> {
const full = `${REPO}/${file}`;
let source = await readFile(full, "utf8");
const rejected: Array<{patch: Patch; reason: string}> = [];
let applied = 0;
for (const p of patches) {
// Confidence gate at the individual-patch level.
if (p.confidence < MIN_CONF) { rejected.push({patch: p, reason: `confidence ${p.confidence} < ${MIN_CONF}`}); continue; }
// Uniqueness gate.
const occurrences = source.split(p.old_string).length - 1;
if (occurrences === 0) { rejected.push({patch: p, reason: "old_string not found"}); continue; }
if (occurrences > 1) { rejected.push({patch: p, reason: `old_string appears ${occurrences}× (not unique)`}); continue; }
// Size gate — no patch touches > 6 lines (diff discipline; matches prompt).
// Raised from 20 to 6 after 2026-04-24 data showed most 10-20 line patches
// cascaded and broke the build. Mechanical changes genuinely fit in 6 lines.
const oldLines = p.old_string.split("\n").length;
const newLines = p.new_string.split("\n").length;
if (Math.max(oldLines, newLines) > 6) { rejected.push({patch: p, reason: `patch too large (${Math.max(oldLines,newLines)} lines, max 6)`}); continue; }
source = source.replace(p.old_string, p.new_string);
applied++;
}
if (applied > 0) await writeFile(full, source);
return { applied, rejected };
}
async function cargoCheck(): Promise<boolean> {
const r = await sh(["cargo", "check", "--workspace"]);
return r.code === 0;
}
async function gitCommit(file: string, patches: Patch[]): Promise<boolean> {
if (!COMMIT) { log(` (dry-run) would commit ${file}`); return true; }
const addR = await sh(["git", "add", file]);
if (addR.code !== 0) { log(` git add failed: ${addR.stderr.slice(0, 200)}`); return false; }
const msg = `auto-apply: ${patches.length} high-confidence fix${patches.length === 1 ? "" : "es"} in ${file}\n\n${patches.map(p => `- ${p.rationale} (conf ${p.confidence}%)`).join("\n")}\n\n🤖 scrum_applier.ts`;
const commitR = await sh(["git", "commit", "-m", msg]);
if (commitR.code !== 0) { log(` git commit failed: ${commitR.stderr.slice(0, 200)}`); return false; }
log(` ✓ committed ${file}`);
return true;
}
async function revertFile(file: string): Promise<void> {
await sh(["git", "checkout", "--", file]);
}
async function main() {
log(`starting · min_conf=${MIN_CONF} max_files=${MAX_FILES} model=${MODEL} commit=${COMMIT}`);
if (COMMIT) {
const headR = await sh(["git", "rev-parse", "--abbrev-ref", "HEAD"]);
const currentBranch = headR.stdout.trim();
if (currentBranch === "main") {
log(`refusing to run on main — create a branch first or set LH_APPLIER_BRANCH`);
const coR = await sh(["git", "checkout", "-b", BRANCH]);
if (coR.code !== 0) { log(`could not create branch ${BRANCH}: ${coR.stderr.slice(0, 200)}`); process.exit(1); }
log(`working branch: ${BRANCH}`);
} else {
log(`working branch: ${currentBranch}`);
}
}
const reviews = await loadLatestReviews();
log(`loaded ${reviews.size} latest reviews`);
const eligible = [...reviews.values()].filter(r =>
passesConfidenceGate(r) && passesDenyList(r.file)
).sort((a, b) => (b.confidence_avg ?? 0) - (a.confidence_avg ?? 0));
log(`${eligible.length} pass confidence gate + deny-list`);
log(`taking top ${Math.min(MAX_FILES, eligible.length)} by confidence`);
let committedFiles = 0;
let revertedFiles = 0;
for (const r of eligible.slice(0, MAX_FILES)) {
log(`${r.file} (conf_avg=${r.confidence_avg} tier=${r.gradient_tier})`);
const full = `${REPO}/${r.file}`;
if (!existsSync(full)) { log(` skip — file not found on disk`); continue; }
const source = await readFile(full, "utf8");
const patches = await requestPatches(r.file, source, r.suggestions_preview ?? "");
if (patches.length === 0) {
log(` no patches produced`);
await auditLog({ action: "no_patches", file: r.file, reviewer_model: r.accepted_model });
continue;
}
log(` ${patches.length} candidate patches`);
const { applied, rejected } = await applyPatches(r.file, patches);
log(` applied ${applied}, rejected ${rejected.length}`);
for (const rj of rejected) log(`${rj.reason}`);
if (applied === 0) {
await auditLog({ action: "all_rejected", file: r.file, rejected: rejected.map(x => x.reason) });
continue;
}
log(` running cargo check...`);
const green = await cargoCheck();
if (!green) {
log(` ✗ build red — reverting ${r.file}`);
await revertFile(r.file);
revertedFiles++;
await auditLog({ action: "build_red_reverted", file: r.file, patches_applied: applied });
continue;
}
log(` ✓ build green`);
const ok = await gitCommit(r.file, patches.slice(0, applied));
if (ok) {
committedFiles++;
await auditLog({
action: COMMIT ? "committed" : "dry_run_committed",
file: r.file,
patches_applied: applied,
patches_rejected: rejected.length,
confidence_avg: r.confidence_avg,
gradient_tier: r.gradient_tier,
reviewer_model: r.accepted_model,
});
}
}
log(`DONE · committed=${committedFiles} reverted=${revertedFiles}`);
}
await main();