profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across multi-corpora matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00

488 lines
21 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// scrum_applier.ts — the auto-apply pipeline.
//
// Turns the scrum master's signal into real commits. Reads
// data/_kb/scrum_reviews.jsonl, filters to rows where the scrum's
// own confidence is high enough to trust auto-apply (gradient_tier
// not block/simulation AND confidence_avg ≥ 90 AND confidence_min ≥ 70
// — see passesConfidenceGate), asks a patch-emitting model to
// produce concrete old_string/new_string pairs, applies them via
// text replacement, runs `cargo check` after each, commits on green
// and reverts on red.
//
// Runs on its own branch (never on main). Every action is recorded
// in data/_kb/auto_apply.jsonl so the auditor and future iterations
// can see what landed and what reverted.
//
// Usage:
// bun run tests/real-world/scrum_applier.ts # dry-run, print only
// LH_APPLIER_COMMIT=1 bun run tests/real-world/scrum_applier.ts # actually apply
//
// Env:
// LH_APPLIER_BRANCH — branch name (default: "scrum/auto-apply-${Date.now().toString(36)}")
// LH_APPLIER_MIN_CONF — minimum confidence_avg, default 90
// LH_APPLIER_MAX_FILES — cap on files per run (default 5, keeps diffs reviewable)
// LH_APPLIER_COMMIT — "1" to actually commit; otherwise dry-run
// LH_APPLIER_MODEL — patch-emitting model (default: x-ai/grok-4.1-fast via openrouter; see MODEL/PROVIDER constants)
import { readFile, writeFile, appendFile } from "node:fs/promises";
import { existsSync } from "node:fs";
import { spawn } from "node:child_process";
// ── Runtime configuration (env-overridable) ─────────────────────────
// Absolute path of the lakehouse checkout this applier operates on.
const REPO = "/home/profit/lakehouse";
// Local LLM gateway proxying /v1/chat to the configured provider.
const GATEWAY = "http://localhost:3100";
// Input: scrum review rows (JSONL, one review per line).
const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
// Output: append-only audit trail of every applier action.
const AUDIT_LOG = `${REPO}/data/_kb/auto_apply.jsonl`;
// Gate thresholds / caps — see header comment for semantics.
const MIN_CONF = Number(process.env.LH_APPLIER_MIN_CONF ?? 90);
const MAX_FILES = Number(process.env.LH_APPLIER_MAX_FILES ?? 5);
// "1" → actually branch + commit; anything else → dry-run.
const COMMIT = process.env.LH_APPLIER_COMMIT === "1";
// LH_APPLIER_FILES — comma-separated repo-relative paths. When set,
// constrains eligible reviews to ONLY those files. Used by the autonomous
// loop so the applier patches what scrum just reviewed in this iter,
// instead of pulling the highest-confidence file from global review history.
const TARGET_FILES = (process.env.LH_APPLIER_FILES ?? "")
  .split(",").map(s => s.trim()).filter(Boolean);
// Default patch-emitter model — qwen3-coder:480b is the coding specialist
// in the scrum ladder (rung 2). Swapped in from kimi-k2:1t after 2026-04-24
// data showed kimi-k2:1t produces architectural patches that cascade across
// the file (rename a field → 20 broken call sites). qwen3-coder is tuned
// for targeted code changes and tends to stay within the mechanical-patch
// constraint the prompt asks for. LLM Team's /api/run?mode=patch would be
// the ideal choice but that mode isn't registered in llm_team_ui.py yet.
// Default patch emitter swapped to OpenRouter Grok 4.1 fast (2026-04-25)
// after observing the prior default (ollama_cloud::qwen3-coder:480b) sit
// at 429 throttle and never produce patches. Grok 4.1 fast: $0.20/$0.50
// per M, 2M ctx, proven to emit precise structured patches in observer
// hand-review tests. Override with LH_APPLIER_MODEL + LH_APPLIER_PROVIDER.
const MODEL = process.env.LH_APPLIER_MODEL ?? "x-ai/grok-4.1-fast";
const PROVIDER = (process.env.LH_APPLIER_PROVIDER ?? "openrouter") as "ollama_cloud" | "openrouter" | "ollama";
// base36 timestamp keeps auto-generated branch names short and unique.
const BRANCH = process.env.LH_APPLIER_BRANCH ?? `scrum/auto-apply-${Date.now().toString(36)}`;
// Deny-list — anything whose path starts with one of these is skipped
// regardless of how confident the scrum is. Config / systemd / docs /
// auditor itself are off limits for auto-apply; they need a human.
const DENY_PREFIXES = [
  "config/",
  "ops/",
  "auditor/",
  "docs/",
  "data/",
  "/etc/",
  "mcp-server/",
  "ui/",
  "sidecar/",
  "scripts/",
];
function log(msg: string) { console.log(`[applier] ${msg}`); }
// Run a command, capture stdout/stderr, and resolve (never reject) with
// the exit code. A null exit code (killed by signal) is reported as 1.
async function sh(cmd: string[], cwd = REPO): Promise<{ stdout: string; stderr: string; code: number }> {
  const [bin, ...args] = cmd;
  return new Promise((resolve) => {
    let outBuf = "";
    let errBuf = "";
    const child = spawn(bin, args, { cwd, stdio: ["ignore", "pipe", "pipe"] });
    child.stdout.on("data", (chunk) => { outBuf += chunk.toString(); });
    child.stderr.on("data", (chunk) => { errBuf += chunk.toString(); });
    child.on("close", (exitCode) => resolve({ stdout: outBuf, stderr: errBuf, code: exitCode ?? 1 }));
  });
}
// Append one JSONL row to the audit trail, stamped with the current time.
async function auditLog(row: Record<string, any>) {
  const stamped = { ...row, ts: new Date().toISOString() };
  await appendFile(AUDIT_LOG, JSON.stringify(stamped) + "\n");
}
// One-shot chat completion via the local gateway. Never throws: network
// failures, timeouts (180 s), and non-2xx responses all come back as
// { content: "", error }. Temperature is pinned low for determinism.
async function chat(opts: {
  provider: "ollama_cloud" | "openrouter" | "ollama";
  model: string;
  prompt: string;
  max_tokens?: number;
}): Promise<{ content: string; error?: string }> {
  try {
    const res = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: opts.max_tokens ?? 1500,
        temperature: 0.1,
      }),
      signal: AbortSignal.timeout(180000),
    });
    if (!res.ok) {
      const snippet = (await res.text()).slice(0, 300);
      return { content: "", error: `${res.status}: ${snippet}` };
    }
    const payload: any = await res.json();
    return { content: payload.choices?.[0]?.message?.content ?? "" };
  } catch (err) {
    return { content: "", error: String(err) };
  }
}
// One row of data/_kb/scrum_reviews.jsonl — the scrum master's verdict on
// a single file. Optional fields reflect schema drift across iterations;
// older rows may omit them, so all gates treat them as nullable.
interface ScrumReview {
  file: string;                        // repo-relative path that was reviewed
  reviewed_at: string;                 // ISO timestamp; used to pick the latest row per file
  accepted_model: string;              // model whose review was accepted
  suggestions_preview: string;         // review text fed back to the patch emitter
  confidences_per_finding?: number[];  // raw per-finding confidence scores
  confidence_avg?: number | null;      // mean confidence across findings
  confidence_min?: number | null;      // weakest finding — conservative gate input
  gradient_tier?: string;              // tier label; "block" and "simulation" are hard-rejected here
  gradient_tier_avg?: string;          // presumably the tier implied by the average — confirm upstream
  verdict?: string;
  critical_failures_count?: number;
  schema_version?: number;             // bumped when the review row format changes
}
// Read the review JSONL and return file → most recent review (latest
// reviewed_at wins). Missing file → empty map; malformed lines and rows
// without a file field are skipped silently.
async function loadLatestReviews(): Promise<Map<string, ScrumReview>> {
  if (!existsSync(SCRUM_REVIEWS)) return new Map();
  const text = await readFile(SCRUM_REVIEWS, "utf8");
  const latest = new Map<string, ScrumReview>();
  for (const line of text.split("\n")) {
    if (!line) continue;
    let row: ScrumReview | null = null;
    try { row = JSON.parse(line); } catch { /* skip malformed line */ }
    if (!row || !row.file) continue;
    const prev = latest.get(row.file);
    if (!prev || row.reviewed_at > prev.reviewed_at) latest.set(row.file, row);
  }
  return latest;
}
// Auto-apply eligibility: the tier must not be hard-blocked, the average
// confidence must clear MIN_CONF, and the weakest finding must clear 70.
// (min is the conservative tier-lower-bound — one weak finding drags the
// whole file down. Note the floor is a fixed 70, not MIN_CONF.)
function passesConfidenceGate(r: ScrumReview): boolean {
  if (r.gradient_tier === "block" || r.gradient_tier === "simulation") return false;
  const avg = r.confidence_avg ?? 0;
  const min = r.confidence_min ?? 0;
  return avg >= MIN_CONF && min >= 70;
}
// True when the path is NOT under any deny-listed prefix (the bare
// prefix itself, without the trailing slash, is also denied).
function passesDenyList(file: string): boolean {
  for (const prefix of DENY_PREFIXES) {
    if (file.startsWith(prefix) || file === prefix.replace(/\/$/, "")) return false;
  }
  return true;
}
// A single string-replacement patch emitted by the model for one file.
interface Patch {
  file: string;        // repo-relative path the patch targets
  old_string: string;  // verbatim text to replace — must occur exactly once
  new_string: string;  // replacement text
  rationale: string;   // model's short justification; checked against the diff
  confidence: number;  // 0-100 self-reported; gated against MIN_CONF per patch
}
// Ask the patch-emitter model to turn an accepted high-confidence review
// into concrete old_string/new_string patches. Returns [] on any failure:
// chat error, empty response, unparseable JSON, or no rows surviving the
// shape checks. The prompt text is a runtime contract with the model —
// the applyPatches gates (uniqueness, 6-line cap) mirror its constraints.
async function requestPatches(file: string, source: string, review: string): Promise<Patch[]> {
  const prompt = `You previously produced this review of ${file}:
─── REVIEW ───
${review}
─── END REVIEW ───
The review is high-confidence and the file is eligible for auto-apply. Produce CONCRETE PATCHES as JSON so they can be applied via string replacement.
HARD CONSTRAINTS (violations → patch rejected):
1. Output ONE JSON object with a "patches" array. NO prose, no markdown fences.
2. Each patch is {"old_string": "...", "new_string": "...", "rationale": "short", "confidence": 0-100}.
3. "old_string" MUST appear EXACTLY ONCE in the file (verbatim, including whitespace + trailing newlines).
4. Max diff size: 6 lines changed per patch (NOT 20). If the change needs >6 lines, emit {"patches": []} — too risky for auto-apply.
5. Mechanical-only (the list — nothing else):
(a) remove a #[allow(dead_code)] marker on a function now wired elsewhere
(b) add a missing 'use' import statement
(c) add a single field to a struct (no renames)
(d) flip a single boolean/match-arm that doesn't cascade
(e) add a fire-and-forget log or tracing call
6. FORBIDDEN (automatic reject):
• struct field RENAMES (break all call sites)
• function signature changes
• new modules, new traits, new dependencies
• any change that requires editing another file to keep it compiling
• adding a 'use' import whose symbol is NOT referenced in the same patch
or already referenced in the surrounding file (no "import and hope")
• rationale that describes functionality not present in the diff
(if rationale says "destructive SQL filter", the diff must contain SQL/filter tokens)
7. Each "new_string" MUST compile in isolation with the same surrounding code.
8. Max 2 patches per file — quality over quantity.
9. The "rationale" field MUST use vocabulary that appears in the new_string.
Generic rationales like "improve code" or "add missing feature" are rejected.
10. If you cannot produce at least one high-confidence mechanical patch under these constraints, output {"patches": []}. Don't guess.
─── SOURCE (${source.length} bytes) ───
${source.slice(0, 14000)}
─── END SOURCE ───
Emit ONLY the JSON object.`;
  const r = await chat({ provider: PROVIDER, model: MODEL, prompt, max_tokens: 2500 });
  if (r.error || !r.content) return [];
  // Strip markdown fences if model wrapped the JSON.
  let raw = r.content.trim();
  const fenceStart = raw.match(/^```(?:json)?\s*/);
  if (fenceStart) raw = raw.slice(fenceStart[0].length);
  if (raw.endsWith("```")) raw = raw.slice(0, -3).trim();
  // Find first { and last } to extract JSON block if there's prose.
  const first = raw.indexOf("{");
  const last = raw.lastIndexOf("}");
  if (first >= 0 && last > first) raw = raw.slice(first, last + 1);
  try {
    const obj = JSON.parse(raw);
    // Keep only rows with string old/new, a real change, non-empty
    // old_string, and a numeric confidence; drop everything else.
    const patches: Patch[] = (obj.patches ?? []).filter((p: any) =>
      typeof p?.old_string === "string" &&
      typeof p?.new_string === "string" &&
      p.old_string !== p.new_string &&
      p.old_string.length > 0 &&
      typeof p?.confidence === "number"
    ).map((p: any) => ({
      file,
      old_string: p.old_string,
      new_string: p.new_string,
      rationale: String(p.rationale ?? ""),
      confidence: p.confidence,
    }));
    return patches;
  } catch (e) {
    log(` ${file}: patch JSON parse failed — ${String(e).slice(0, 100)}`);
    return [];
  }
}
// Apply candidate patches to the on-disk file via string replacement.
// Per-patch gates: confidence ≥ MIN_CONF, old_string occurs exactly once,
// and neither side exceeds 6 lines (the 20→6 tightening came from
// 2026-04-24 data: most 10-20 line patches cascaded and broke the build).
// Patches apply sequentially, so a later old_string is matched against the
// already-patched text. The file is only rewritten if something applied.
//
// Returns `appliedPatches` — the exact subset that landed — in addition to
// the legacy `applied` count, so callers don't have to reconstruct which
// patches survived when rejections interleave with applications.
async function applyPatches(file: string, patches: Patch[]): Promise<{ applied: number; rejected: Array<{patch: Patch; reason: string}>; appliedPatches: Patch[] }> {
  const full = `${REPO}/${file}`;
  let source = await readFile(full, "utf8");
  const rejected: Array<{patch: Patch; reason: string}> = [];
  const appliedPatches: Patch[] = [];
  for (const p of patches) {
    // Confidence gate at the individual-patch level.
    if (p.confidence < MIN_CONF) { rejected.push({patch: p, reason: `confidence ${p.confidence} < ${MIN_CONF}`}); continue; }
    // Uniqueness gate — replacement on a non-unique anchor is ambiguous.
    const occurrences = source.split(p.old_string).length - 1;
    if (occurrences === 0) { rejected.push({patch: p, reason: "old_string not found"}); continue; }
    if (occurrences > 1) { rejected.push({patch: p, reason: `old_string appears ${occurrences}× (not unique)`}); continue; }
    // Size gate — no patch touches > 6 lines (diff discipline; matches prompt).
    const oldLines = p.old_string.split("\n").length;
    const newLines = p.new_string.split("\n").length;
    if (Math.max(oldLines, newLines) > 6) { rejected.push({patch: p, reason: `patch too large (${Math.max(oldLines,newLines)} lines, max 6)`}); continue; }
    source = source.replace(p.old_string, p.new_string);
    appliedPatches.push(p);
  }
  if (appliedPatches.length > 0) await writeFile(full, source);
  return { applied: appliedPatches.length, rejected, appliedPatches };
}
// Run `cargo check --workspace` and report build status plus the number
// of distinct "warning:" opening lines (continuation context excluded).
// Cargo emits warnings on stderr and sometimes stdout, so both are scanned.
async function cargoCheck(): Promise<{ green: boolean; warnings: number }> {
  const result = await sh(["cargo", "check", "--workspace"]);
  const output = `${result.stdout}${result.stderr}`;
  const warnings = (output.match(/^warning: /gm) ?? []).length;
  return { green: result.code === 0, warnings };
}
async function gitCommit(file: string, patches: Patch[]): Promise<boolean> {
if (!COMMIT) { log(` (dry-run) would commit ${file}`); return true; }
const addR = await sh(["git", "add", file]);
if (addR.code !== 0) { log(` git add failed: ${addR.stderr.slice(0, 200)}`); return false; }
const msg = `auto-apply: ${patches.length} high-confidence fix${patches.length === 1 ? "" : "es"} in ${file}\n\n${patches.map(p => `- ${p.rationale} (conf ${p.confidence}%)`).join("\n")}\n\n🤖 scrum_applier.ts`;
const commitR = await sh(["git", "commit", "-m", msg]);
if (commitR.code !== 0) { log(` git commit failed: ${commitR.stderr.slice(0, 200)}`); return false; }
log(` ✓ committed ${file}`);
return true;
}
// Discard working-tree changes to a single path (restore from index/HEAD).
async function revertFile(file: string): Promise<void> {
  await sh(["git", "checkout", "--", file]);
}
// Cheap, conservative check that the rationale actually describes the
// diff. Tokenizes the new_string (what's actually being added), drops
// Rust keywords + common filler, and requires the rationale to share
// at least one meaningful token. Catches rationale-diff divergence like
// "Add destructive SQL filter" for a diff that only adds `use tracing;`.
// Tokens here are compared lowercase; tokenize() also drops anything
// shorter than 4 characters, so short filler never reaches this set.
const STOPWORDS = new Set([
  "use", "let", "pub", "mut", "fn", "self", "the", "a", "and", "or", "in",
  "to", "for", "of", "on", "at", "add", "added", "adds", "new", "is",
  "as", "if", "else", "match", "return", "crate", "mod", "struct",
  "trait", "impl", "this", "that", "with", "from", "into", "by", "be",
  "it", "its", "not", "one", "two", "line", "lines",
]);
// Lowercase, split on non-identifier characters, and keep only tokens
// that are ≥ 4 chars and not stopwords. Returned as a Set for O(1) lookup.
function tokenize(s: string): Set<string> {
  const pieces = s.toLowerCase().split(/[^a-z0-9_]+/);
  const meaningful = pieces.filter((t) => t.length >= 4 && !STOPWORDS.has(t));
  return new Set(meaningful);
}
// True when the rationale shares at least one meaningful token with what
// the patch actually INTRODUCES (tokens in new_string but not old_string).
// A pure deletion introduces nothing, so its rationale is unconstrained.
function rationaleMatchesDiff(p: Patch): boolean {
  const before = tokenize(p.old_string);
  const introduced = [...tokenize(p.new_string)].filter((t) => !before.has(t));
  if (introduced.length === 0) return true; // pure deletion — rationale unconstrained
  const rationaleToks = tokenize(p.rationale);
  return introduced.some((t) => rationaleToks.has(t));
}
// Orchestrates one applier run: load latest reviews, gate them, request
// patches per eligible file, apply, then verify with three gates
// (cargo green, no new warnings vs baseline, rationale-diff alignment).
// Any gate failure reverts the file; dry-run also reverts after success
// so the workspace never accumulates unpushed changes.
async function main() {
  log(`starting · min_conf=${MIN_CONF} max_files=${MAX_FILES} model=${MODEL} commit=${COMMIT}`);
  if (COMMIT) {
    const headR = await sh(["git", "rev-parse", "--abbrev-ref", "HEAD"]);
    const currentBranch = headR.stdout.trim();
    if (currentBranch === "main") {
      // Never auto-apply directly on main — hop onto a working branch.
      // (Message fixed: the old text claimed "refusing to run" but the
      // code has always created the branch and continued.)
      log(`on main — creating working branch ${BRANCH} (set LH_APPLIER_BRANCH to override)`);
      const coR = await sh(["git", "checkout", "-b", BRANCH]);
      if (coR.code !== 0) { log(`could not create branch ${BRANCH}: ${coR.stderr.slice(0, 200)}`); process.exit(1); }
      log(`working branch: ${BRANCH}`);
    } else {
      log(`working branch: ${currentBranch}`);
    }
  }
  const reviews = await loadLatestReviews();
  log(`loaded ${reviews.size} latest reviews`);
  // Eligible = confidence gate + deny-list + (optional) target-file filter,
  // sorted most-confident first.
  const eligible = [...reviews.values()].filter(r =>
    passesConfidenceGate(r) && passesDenyList(r.file) &&
    (TARGET_FILES.length === 0 || TARGET_FILES.includes(r.file))
  ).sort((a, b) => (b.confidence_avg ?? 0) - (a.confidence_avg ?? 0));
  if (TARGET_FILES.length > 0) {
    log(`LH_APPLIER_FILES set — constrained to ${TARGET_FILES.length} target file(s): ${TARGET_FILES.join(", ")}`);
  }
  log(`${eligible.length} pass confidence gate + deny-list${TARGET_FILES.length > 0 ? " + target filter" : ""}`);
  log(`taking top ${Math.min(MAX_FILES, eligible.length)} by confidence`);
  // Establish pre-run warning baseline so post-patch cargo check can
  // reject patches that keep the build green but add new warnings.
  // This gate exists because 96b46cd landed unused imports through the
  // green-only gate — compiled fine, left a permanent dead-code warning.
  log(`baseline cargo check (warning count)...`);
  let baselineWarnings = (await cargoCheck()).warnings;
  log(`baseline warnings = ${baselineWarnings}`);
  let committedFiles = 0;
  let revertedFiles = 0;
  let warningReverts = 0;
  let rationaleReverts = 0;
  for (const r of eligible.slice(0, MAX_FILES)) {
    log(`${r.file} (conf_avg=${r.confidence_avg} tier=${r.gradient_tier})`);
    const full = `${REPO}/${r.file}`;
    if (!existsSync(full)) { log(` skip — file not found on disk`); continue; }
    const source = await readFile(full, "utf8");
    const patches = await requestPatches(r.file, source, r.suggestions_preview ?? "");
    if (patches.length === 0) {
      log(` no patches produced`);
      await auditLog({ action: "no_patches", file: r.file, reviewer_model: r.accepted_model });
      continue;
    }
    log(` ${patches.length} candidate patches`);
    const { applied, rejected } = await applyPatches(r.file, patches);
    log(` applied ${applied}, rejected ${rejected.length}`);
    for (const rj of rejected) log(`${rj.reason}`);
    if (applied === 0) {
      await auditLog({ action: "all_rejected", file: r.file, rejected: rejected.map(x => x.reason) });
      continue;
    }
    // BUGFIX: the patches that actually landed are NOT necessarily the
    // first `applied` entries of `patches` — rejections interleave with
    // applications (patch 0 rejected, patch 1 applied ⇒ applied === 1,
    // but the old `patches.slice(0, applied)` returned the REJECTED
    // patch). That fed wrong rationales into the alignment gate and the
    // commit message. Reconstruct the true applied set by excluding the
    // rejected ones instead.
    const rejectedSet = new Set(rejected.map((x) => x.patch));
    const applied_patches = patches.filter((p) => !rejectedSet.has(p));
    log(` running cargo check...`);
    const { green, warnings } = await cargoCheck();
    if (!green) {
      log(` ✗ build red — reverting ${r.file}`);
      await revertFile(r.file);
      revertedFiles++;
      await auditLog({ action: "build_red_reverted", file: r.file, patches_applied: applied });
      continue;
    }
    // New-warning gate. If a patch keeps the build green but adds a
    // warning (unused import, dead_code after removed allow, etc), treat
    // it the same as build red: revert. The cargo-green gate alone let
    // 96b46cd land unused imports; this closes that door.
    if (warnings > baselineWarnings) {
      log(` ✗ warnings ${baselineWarnings}→${warnings} (+${warnings - baselineWarnings}) — reverting ${r.file}`);
      await revertFile(r.file);
      warningReverts++;
      await auditLog({
        action: "warnings_increased_reverted",
        file: r.file,
        patches_applied: applied,
        warnings_before: baselineWarnings,
        warnings_after: warnings,
      });
      continue;
    }
    // Rationale-diff alignment heuristic. At least one non-stopword
    // token from an applied patch's new_string must appear in its
    // rationale. This catches cases like "Add destructive SQL filter"
    // rationale with a diff that only adds `use tracing`.
    const rationale_ok = applied_patches.some((p) => rationaleMatchesDiff(p));
    if (!rationale_ok) {
      log(` ✗ rationale doesn't name anything in diff — reverting ${r.file}`);
      await revertFile(r.file);
      rationaleReverts++;
      await auditLog({
        action: "rationale_mismatch_reverted",
        file: r.file,
        patches_applied: applied,
        rationales: applied_patches.map((p) => p.rationale),
      });
      continue;
    }
    log(` ✓ build green (warnings ${warnings}/${baselineWarnings})`);
    // Dry-run MUST revert the file after a successful check so the
    // workspace doesn't accumulate unpushed changes across dry-runs.
    // Previously this left modified files, silently polluting state.
    if (!COMMIT) {
      log(` (dry-run) would commit ${r.file} — reverting workspace change`);
      await revertFile(r.file);
      await auditLog({
        action: "dry_run_would_commit",
        file: r.file,
        patches_applied: applied,
        patches_rejected: rejected.length,
        confidence_avg: r.confidence_avg,
        gradient_tier: r.gradient_tier,
        reviewer_model: r.accepted_model,
      });
      continue;
    }
    const ok = await gitCommit(r.file, applied_patches);
    if (ok) {
      committedFiles++;
      // Advance the warning baseline so a later patch on a different
      // file is compared against the new (committed) state, not the
      // pre-run state — otherwise every subsequent patch trips the gate.
      baselineWarnings = warnings;
      await auditLog({
        action: "committed",
        file: r.file,
        patches_applied: applied,
        patches_rejected: rejected.length,
        confidence_avg: r.confidence_avg,
        gradient_tier: r.gradient_tier,
        reviewer_model: r.accepted_model,
        warnings_after: warnings,
      });
    }
  }
  log(`DONE · committed=${committedFiles} build_red=${revertedFiles} warning_revert=${warningReverts} rationale_revert=${rationaleReverts}`);
}
await main();