// scrum_applier.ts — the auto-apply pipeline.
//
// Turns the scrum master's signal into real commits: reads
// data/_kb/scrum_reviews.jsonl, keeps only rows the scrum itself trusts
// for auto-apply (gradient_tier auto OR confidence_avg ≥ 90), asks a
// patch-emitting model for concrete old_string/new_string pairs, applies
// them via text replacement, runs `cargo check` after each file, commits
// on green and reverts on red.
//
// Never runs on main — work happens on its own branch, and every action
// (commit, revert, rejection) is appended to data/_kb/auto_apply.jsonl
// so the auditor and future iterations can see what landed and what
// reverted.
//
// Usage:
//   bun run tests/real-world/scrum_applier.ts                      # dry-run, print only
//   LH_APPLIER_COMMIT=1 bun run tests/real-world/scrum_applier.ts  # actually apply
//
// Env:
//   LH_APPLIER_BRANCH    — branch name (default: "scrum/auto-apply-<ts36>")
//   LH_APPLIER_MIN_CONF  — minimum confidence_avg, default 90
//   LH_APPLIER_MAX_FILES — cap on files per run (default 5, keeps diffs reviewable)
//   LH_APPLIER_COMMIT    — "1" to actually commit; otherwise dry-run
//   LH_APPLIER_MODEL     — patch-emitting model (default: x-ai/grok-4.1-fast)
//   LH_APPLIER_PROVIDER  — gateway provider for that model (default: openrouter)
//   LH_APPLIER_FILES     — comma-separated repo-relative paths; when set,
//                          ONLY reviews of those exact files are eligible.
//                          Used by the autonomous loop so the applier
//                          patches what scrum just reviewed this iteration
//                          instead of pulling the highest-confidence file
//                          from global review history.

import { readFile, writeFile, appendFile } from "node:fs/promises";
import { existsSync } from "node:fs";
import { spawn } from "node:child_process";

const REPO = "/home/profit/lakehouse";
const GATEWAY = "http://localhost:3100";
const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
const AUDIT_LOG = `${REPO}/data/_kb/auto_apply.jsonl`;

const MIN_CONF = Number(process.env.LH_APPLIER_MIN_CONF ?? 90);
const MAX_FILES = Number(process.env.LH_APPLIER_MAX_FILES ?? 5);
const COMMIT = process.env.LH_APPLIER_COMMIT === "1";

// Optional exact-file allow-list (see LH_APPLIER_FILES above). Empty
// array means "no constraint".
const TARGET_FILES = (process.env.LH_APPLIER_FILES ?? "")
  .split(",")
  .map((s) => s.trim())
  .filter(Boolean);

// Patch-emitter model history:
//  • kimi-k2:1t (original) — 2026-04-24 data showed it produces
//    architectural patches that cascade across the file (rename a
//    field → 20 broken call sites).
//  • qwen3-coder:480b (rung-2 coding specialist) — stayed within the
//    mechanical-patch constraint, but sat at 429 throttle on
//    ollama_cloud and never produced patches.
//  • x-ai/grok-4.1-fast via OpenRouter (2026-04-25, current default) —
//    $0.20/$0.50 per M, 2M ctx, proven to emit precise structured
//    patches in observer hand-review tests.
// LLM Team's /api/run?mode=patch would be the ideal choice but that
// mode isn't registered in llm_team_ui.py yet. Override with
// LH_APPLIER_MODEL + LH_APPLIER_PROVIDER.
const MODEL = process.env.LH_APPLIER_MODEL ?? "x-ai/grok-4.1-fast";
const PROVIDER = (process.env.LH_APPLIER_PROVIDER ?? "openrouter") as
  "ollama_cloud" | "openrouter" | "ollama";
const BRANCH =
  process.env.LH_APPLIER_BRANCH ?? `scrum/auto-apply-${Date.now().toString(36)}`;
const DENY_PREFIXES = [ "config/", "ops/", "auditor/", "docs/", "data/", "/etc/", "mcp-server/", "ui/", "sidecar/", "scripts/", ]; function log(msg: string) { console.log(`[applier] ${msg}`); } async function sh(cmd: string[], cwd = REPO): Promise<{ stdout: string; stderr: string; code: number }> { return new Promise((resolve) => { const p = spawn(cmd[0], cmd.slice(1), { cwd, stdio: ["ignore", "pipe", "pipe"] }); let out = ""; let err = ""; p.stdout.on("data", (d) => { out += d.toString(); }); p.stderr.on("data", (d) => { err += d.toString(); }); p.on("close", (code) => resolve({ stdout: out, stderr: err, code: code ?? 1 })); }); } async function auditLog(row: Record) { const line = JSON.stringify({ ...row, ts: new Date().toISOString() }) + "\n"; await appendFile(AUDIT_LOG, line); } async function chat(opts: { provider: "ollama_cloud" | "openrouter" | "ollama"; model: string; prompt: string; max_tokens?: number; }): Promise<{ content: string; error?: string }> { try { const r = await fetch(`${GATEWAY}/v1/chat`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ provider: opts.provider, model: opts.model, messages: [{ role: "user", content: opts.prompt }], max_tokens: opts.max_tokens ?? 1500, temperature: 0.1, }), signal: AbortSignal.timeout(180000), }); if (!r.ok) return { content: "", error: `${r.status}: ${(await r.text()).slice(0, 300)}` }; const j: any = await r.json(); return { content: j.choices?.[0]?.message?.content ?? "" }; } catch (e) { return { content: "", error: String(e) }; } } interface ScrumReview { file: string; reviewed_at: string; accepted_model: string; suggestions_preview: string; confidences_per_finding?: number[]; confidence_avg?: number | null; confidence_min?: number | null; gradient_tier?: string; gradient_tier_avg?: string; verdict?: string; critical_failures_count?: number; schema_version?: number; } async function loadLatestReviews(): Promise> { // Map of file → latest review for that file. 
Ordered by reviewed_at. if (!existsSync(SCRUM_REVIEWS)) return new Map(); const text = await readFile(SCRUM_REVIEWS, "utf8"); const rows: ScrumReview[] = text.split("\n").filter(Boolean).map(l => { try { return JSON.parse(l); } catch { return null; } }).filter((r): r is ScrumReview => r !== null); // Keep the LATEST review per file. const latest = new Map(); for (const r of rows) { if (!r.file) continue; const prev = latest.get(r.file); if (!prev || (r.reviewed_at > prev.reviewed_at)) latest.set(r.file, r); } return latest; } function passesConfidenceGate(r: ScrumReview): boolean { const avg = r.confidence_avg ?? 0; const min = r.confidence_min ?? 0; // Must be auto or dry_run tier AND confidence_min ≥ MIN_CONF. // min is the conservative tier-lower-bound (one weak finding drags // the whole file to "simulation" or "block" tier). if (r.gradient_tier === "block" || r.gradient_tier === "simulation") return false; return avg >= MIN_CONF && min >= 70; } function passesDenyList(file: string): boolean { return !DENY_PREFIXES.some((p) => file.startsWith(p) || file === p.replace(/\/$/, "")); } interface Patch { file: string; old_string: string; new_string: string; rationale: string; confidence: number; } async function requestPatches(file: string, source: string, review: string): Promise { const prompt = `You previously produced this review of ${file}: ─── REVIEW ─── ${review} ─── END REVIEW ─── The review is high-confidence and the file is eligible for auto-apply. Produce CONCRETE PATCHES as JSON so they can be applied via string replacement. HARD CONSTRAINTS (violations → patch rejected): 1. Output ONE JSON object with a "patches" array. NO prose, no markdown fences. 2. Each patch is {"old_string": "...", "new_string": "...", "rationale": "short", "confidence": 0-100}. 3. "old_string" MUST appear EXACTLY ONCE in the file (verbatim, including whitespace + trailing newlines). 4. Max diff size: 6 lines changed per patch (NOT 20). 
If the change needs >6 lines, emit {"patches": []} — too risky for auto-apply. 5. Mechanical-only (the list — nothing else): (a) remove a #[allow(dead_code)] marker on a function now wired elsewhere (b) add a missing 'use' import statement (c) add a single field to a struct (no renames) (d) flip a single boolean/match-arm that doesn't cascade (e) add a fire-and-forget log or tracing call 6. FORBIDDEN (automatic reject): • struct field RENAMES (break all call sites) • function signature changes • new modules, new traits, new dependencies • any change that requires editing another file to keep it compiling • adding a 'use' import whose symbol is NOT referenced in the same patch or already referenced in the surrounding file (no "import and hope") • rationale that describes functionality not present in the diff (if rationale says "destructive SQL filter", the diff must contain SQL/filter tokens) 7. Each "new_string" MUST compile in isolation with the same surrounding code. 8. Max 2 patches per file — quality over quantity. 9. The "rationale" field MUST use vocabulary that appears in the new_string. Generic rationales like "improve code" or "add missing feature" are rejected. 10. If you cannot produce at least one high-confidence mechanical patch under these constraints, output {"patches": []}. Don't guess. ─── SOURCE (${source.length} bytes) ─── ${source.slice(0, 14000)} ─── END SOURCE ─── Emit ONLY the JSON object.`; const r = await chat({ provider: PROVIDER, model: MODEL, prompt, max_tokens: 2500 }); if (r.error || !r.content) return []; // Strip markdown fences if model wrapped the JSON. let raw = r.content.trim(); const fenceStart = raw.match(/^```(?:json)?\s*/); if (fenceStart) raw = raw.slice(fenceStart[0].length); if (raw.endsWith("```")) raw = raw.slice(0, -3).trim(); // Find first { and last } to extract JSON block if there's prose. 
const first = raw.indexOf("{"); const last = raw.lastIndexOf("}"); if (first >= 0 && last > first) raw = raw.slice(first, last + 1); try { const obj = JSON.parse(raw); const patches: Patch[] = (obj.patches ?? []).filter((p: any) => typeof p?.old_string === "string" && typeof p?.new_string === "string" && p.old_string !== p.new_string && p.old_string.length > 0 && typeof p?.confidence === "number" ).map((p: any) => ({ file, old_string: p.old_string, new_string: p.new_string, rationale: String(p.rationale ?? ""), confidence: p.confidence, })); return patches; } catch (e) { log(` ${file}: patch JSON parse failed — ${String(e).slice(0, 100)}`); return []; } } async function applyPatches(file: string, patches: Patch[]): Promise<{ applied: number; rejected: Array<{patch: Patch; reason: string}> }> { const full = `${REPO}/${file}`; let source = await readFile(full, "utf8"); const rejected: Array<{patch: Patch; reason: string}> = []; let applied = 0; for (const p of patches) { // Confidence gate at the individual-patch level. if (p.confidence < MIN_CONF) { rejected.push({patch: p, reason: `confidence ${p.confidence} < ${MIN_CONF}`}); continue; } // Uniqueness gate. const occurrences = source.split(p.old_string).length - 1; if (occurrences === 0) { rejected.push({patch: p, reason: "old_string not found"}); continue; } if (occurrences > 1) { rejected.push({patch: p, reason: `old_string appears ${occurrences}× (not unique)`}); continue; } // Size gate — no patch touches > 6 lines (diff discipline; matches prompt). // Raised from 20 to 6 after 2026-04-24 data showed most 10-20 line patches // cascaded and broke the build. Mechanical changes genuinely fit in 6 lines. 
const oldLines = p.old_string.split("\n").length; const newLines = p.new_string.split("\n").length; if (Math.max(oldLines, newLines) > 6) { rejected.push({patch: p, reason: `patch too large (${Math.max(oldLines,newLines)} lines, max 6)`}); continue; } source = source.replace(p.old_string, p.new_string); applied++; } if (applied > 0) await writeFile(full, source); return { applied, rejected }; } async function cargoCheck(): Promise<{ green: boolean; warnings: number }> { const r = await sh(["cargo", "check", "--workspace"]); // Count warnings: cargo emits "warning:" lines on stderr (and sometimes stdout). // We count unique "warning: " opening lines, not continuation context. const combined = r.stdout + r.stderr; const warnings = (combined.match(/^warning: /gm) || []).length; return { green: r.code === 0, warnings }; } async function gitCommit(file: string, patches: Patch[]): Promise { if (!COMMIT) { log(` (dry-run) would commit ${file}`); return true; } const addR = await sh(["git", "add", file]); if (addR.code !== 0) { log(` git add failed: ${addR.stderr.slice(0, 200)}`); return false; } const msg = `auto-apply: ${patches.length} high-confidence fix${patches.length === 1 ? "" : "es"} in ${file}\n\n${patches.map(p => `- ${p.rationale} (conf ${p.confidence}%)`).join("\n")}\n\n🤖 scrum_applier.ts`; const commitR = await sh(["git", "commit", "-m", msg]); if (commitR.code !== 0) { log(` git commit failed: ${commitR.stderr.slice(0, 200)}`); return false; } log(` ✓ committed ${file}`); return true; } async function revertFile(file: string): Promise { await sh(["git", "checkout", "--", file]); } // Cheap, conservative check that the rationale actually describes the // diff. Tokenizes the new_string (what's actually being added), drops // Rust keywords + common filler, and requires the rationale to share // at least one meaningful token. Catches rationale-diff divergence like // "Add destructive SQL filter" for a diff that only adds `use tracing;`. 
const STOPWORDS = new Set([ "use", "let", "pub", "mut", "fn", "self", "the", "a", "and", "or", "in", "to", "for", "of", "on", "at", "add", "added", "adds", "new", "is", "as", "if", "else", "match", "return", "crate", "mod", "struct", "trait", "impl", "this", "that", "with", "from", "into", "by", "be", "it", "its", "not", "one", "two", "line", "lines", ]); function tokenize(s: string): Set { const out = new Set(); for (const raw of s.toLowerCase().split(/[^a-z0-9_]+/)) { if (raw.length >= 4 && !STOPWORDS.has(raw)) out.add(raw); } return out; } function rationaleMatchesDiff(p: Patch): boolean { // Only check tokens introduced by the patch (what's in new but not old). const newToks = tokenize(p.new_string); const oldToks = tokenize(p.old_string); const added = new Set([...newToks].filter((t) => !oldToks.has(t))); if (added.size === 0) return true; // pure deletion — rationale unconstrained const rationaleToks = tokenize(p.rationale); for (const t of added) if (rationaleToks.has(t)) return true; return false; } async function main() { log(`starting · min_conf=${MIN_CONF} max_files=${MAX_FILES} model=${MODEL} commit=${COMMIT}`); if (COMMIT) { const headR = await sh(["git", "rev-parse", "--abbrev-ref", "HEAD"]); const currentBranch = headR.stdout.trim(); if (currentBranch === "main") { log(`refusing to run on main — create a branch first or set LH_APPLIER_BRANCH`); const coR = await sh(["git", "checkout", "-b", BRANCH]); if (coR.code !== 0) { log(`could not create branch ${BRANCH}: ${coR.stderr.slice(0, 200)}`); process.exit(1); } log(`working branch: ${BRANCH}`); } else { log(`working branch: ${currentBranch}`); } } const reviews = await loadLatestReviews(); log(`loaded ${reviews.size} latest reviews`); const eligible = [...reviews.values()].filter(r => passesConfidenceGate(r) && passesDenyList(r.file) && (TARGET_FILES.length === 0 || TARGET_FILES.includes(r.file)) ).sort((a, b) => (b.confidence_avg ?? 0) - (a.confidence_avg ?? 
0)); if (TARGET_FILES.length > 0) { log(`LH_APPLIER_FILES set — constrained to ${TARGET_FILES.length} target file(s): ${TARGET_FILES.join(", ")}`); } log(`${eligible.length} pass confidence gate + deny-list${TARGET_FILES.length > 0 ? " + target filter" : ""}`); log(`taking top ${Math.min(MAX_FILES, eligible.length)} by confidence`); // Establish pre-run warning baseline so post-patch cargo check can // reject patches that keep the build green but add new warnings. // This gate exists because 96b46cd landed unused imports through the // green-only gate — compiled fine, left a permanent dead-code warning. log(`baseline cargo check (warning count)...`); let baselineWarnings = (await cargoCheck()).warnings; log(`baseline warnings = ${baselineWarnings}`); let committedFiles = 0; let revertedFiles = 0; let warningReverts = 0; let rationaleReverts = 0; for (const r of eligible.slice(0, MAX_FILES)) { log(`${r.file} (conf_avg=${r.confidence_avg} tier=${r.gradient_tier})`); const full = `${REPO}/${r.file}`; if (!existsSync(full)) { log(` skip — file not found on disk`); continue; } const source = await readFile(full, "utf8"); const patches = await requestPatches(r.file, source, r.suggestions_preview ?? 
""); if (patches.length === 0) { log(` no patches produced`); await auditLog({ action: "no_patches", file: r.file, reviewer_model: r.accepted_model }); continue; } log(` ${patches.length} candidate patches`); const { applied, rejected } = await applyPatches(r.file, patches); log(` applied ${applied}, rejected ${rejected.length}`); for (const rj of rejected) log(` ✗ ${rj.reason}`); if (applied === 0) { await auditLog({ action: "all_rejected", file: r.file, rejected: rejected.map(x => x.reason) }); continue; } log(` running cargo check...`); const { green, warnings } = await cargoCheck(); if (!green) { log(` ✗ build red — reverting ${r.file}`); await revertFile(r.file); revertedFiles++; await auditLog({ action: "build_red_reverted", file: r.file, patches_applied: applied }); continue; } // New-warning gate. If a patch keeps the build green but adds a // warning (unused import, dead_code after removed allow, etc), treat // it the same as build red: revert. The cargo-green gate alone let // 96b46cd land unused imports; this closes that door. if (warnings > baselineWarnings) { log(` ✗ warnings ${baselineWarnings} → ${warnings} (+${warnings - baselineWarnings}) — reverting ${r.file}`); await revertFile(r.file); warningReverts++; await auditLog({ action: "warnings_increased_reverted", file: r.file, patches_applied: applied, warnings_before: baselineWarnings, warnings_after: warnings, }); continue; } // Rationale-diff alignment heuristic. At least one non-stopword // token from the applied patches' combined new_string must appear // in the rationale, OR the rationale must name the file's module. // This catches cases like "Add destructive SQL filter" rationale // with a diff that only adds `use tracing`. 
const applied_patches = patches.slice(0, applied); const rationale_ok = applied_patches.some((p) => rationaleMatchesDiff(p)); if (!rationale_ok) { log(` ✗ rationale doesn't name anything in diff — reverting ${r.file}`); await revertFile(r.file); rationaleReverts++; await auditLog({ action: "rationale_mismatch_reverted", file: r.file, patches_applied: applied, rationales: applied_patches.map((p) => p.rationale), }); continue; } log(` ✓ build green (warnings ${warnings}/${baselineWarnings})`); // Dry-run MUST revert the file after a successful check so the // workspace doesn't accumulate unpushed changes across dry-runs. // Previously this left modified files, silently polluting state. if (!COMMIT) { log(` (dry-run) would commit ${r.file} — reverting workspace change`); await revertFile(r.file); await auditLog({ action: "dry_run_would_commit", file: r.file, patches_applied: applied, patches_rejected: rejected.length, confidence_avg: r.confidence_avg, gradient_tier: r.gradient_tier, reviewer_model: r.accepted_model, }); continue; } const ok = await gitCommit(r.file, applied_patches); if (ok) { committedFiles++; // Advance the warning baseline so a later patch on a different // file is compared against the new (committed) state, not the // pre-run state — otherwise every subsequent patch trips the gate. baselineWarnings = warnings; await auditLog({ action: "committed", file: r.file, patches_applied: applied, patches_rejected: rejected.length, confidence_avg: r.confidence_avg, gradient_tier: r.gradient_tier, reviewer_model: r.accepted_model, warnings_after: warnings, }); } } log(`DONE · committed=${committedFiles} build_red=${revertedFiles} warning_revert=${warningReverts} rationale_revert=${rationaleReverts}`); } await main();