lakehouse/tests/real-world/scrum_master_pipeline.ts
root ed85620558 scrum: filter table-header words from bug_fingerprint extraction
Iter 11 surfaced "DeadCode:Flag" in the matrix — a noisy pattern_key
where "Flag" is the table column HEADER kimi produces for structured
review output, not an actual Rust identifier.

Kimi's standard format on recent iters:
  | # | Change                    | Flag       | Confidence |
  | 1 | Wire AgentIdentity into.. | Boundary.. | 92%        |

The extractor's KEYWORDS set already filtered Rust grammar words
(self, mut, async, etc) and the FLAG_VARIANTS themselves. Adding
markdown-layout words (Flag, Change, Confidence, PRD, Plan) closes
the last common noise class.

One-line addition — empirically validated against the iter 11
vectord trace that produced DeadCode:Flag. Future iters won't
reproduce that specific noise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 13:22:50 -05:00

// Scrum-master orchestrator — pulls git repo source + PRD + a change
// proposal, chunks everything, hands each code piece to the proven
// escalation ladder (small-local → big-local → cloud → specialist →
// biggest) with learning context between attempts. Collects per-file
// suggestions in a coherent handoff report.
//
// What it composes (everything below is already shipped + proven):
// - Chunker + embeddings (sidecar /embed, nomic-embed-text)
// - In-memory cosine retrieval (top-K PRD + plan chunks per file)
// - Escalation ladder (6 tiers, cycling on empty/error/thin-answer)
// - Per-attempt learning-context injection (prior failures → prompt)
// - Tree-split fallback when combined context exceeds budget
// - JSONL output per file + summary
//
// Deliberate scope limit: TARGET_FILES is 3 files by default. The
// pipeline works at larger N, but at ~90s/file × 3 files = 4-5 min,
// 15 files = 22 min. Bump via env LH_SCRUM_FILES="path1,path2,...".
//
// Run: bun run tests/real-world/scrum_master_pipeline.ts
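// Example override run (hypothetical values; adjust paths to your checkout):
//   LH_SCRUM_FILES="/home/profit/lakehouse/crates/vectord/src/doc_drift.rs" \
//   LH_SCRUM_TREE_SPLIT_THRESHOLD=12000 \
//   bun run tests/real-world/scrum_master_pipeline.ts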
import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CHUNK_SIZE = 800;
const CHUNK_OVERLAP = 120;
const TOP_K_CONTEXT = 5;
const MAX_ATTEMPTS = 9;
// Files larger than this get tree-split instead of truncated. Fixes the
// 6KB false-positive class (model claiming a field is "missing" when
// it exists past the context cutoff).
// Env-configurable so the pipeline can adapt to different repos:
// a 13K-line Python file like /root/llm-team-ui/llm_team_ui.py needs
// larger shards to avoid producing 200+ cloud calls per review.
// Defaults stay at 6000 / 3500 — tuned for Rust source files in
// crates/<crate>/src/*.rs.
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
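// Worked example at defaults: a 21,000-char file splits into
// ceil(21000 / 3500) = 6 map shards plus 1 reduce pass, i.e. 7 cloud
// calls in treeSplitFile below.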
// Appended jsonl so auditor's kb_query can surface scrum findings for
// files touched by a PR under review. Part of cohesion plan Phase C.
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
|| "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
const PRD_PATH = process.env.LH_SCRUM_PRD
|| "/home/profit/lakehouse/docs/PRD.md";
// Using CONTROL_PLANE_PRD as the "suggested changes" doc since it
// describes the Phase 38-44 target architecture and is on main.
// Override via LH_SCRUM_PROPOSAL env to point at a fix-wave doc
// generated from a phase-sweep audit, so the scrum pulls direction
// from concrete findings instead of the high-level PRD alone.
const PROPOSAL_PATH = process.env.LH_SCRUM_PROPOSAL
|| "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md";
// Iter 2+ — when LH_SCRUM_FORENSIC is set to a file path, prepend its
// contents as an adversarial auditor preamble to every per-file prompt.
// This flips the review tone from "suggest improvements" to "prove it
// works or mark FAIL." Added 2026-04-23 for iter 2 of the 6x loop.
// Empty string = no preamble (iter-1 behavior).
const FORENSIC_PREAMBLE = process.env.LH_SCRUM_FORENSIC
? (() => {
try {
return require("node:fs").readFileSync(process.env.LH_SCRUM_FORENSIC!, "utf8");
} catch (e) {
console.error(`[scrum] warning: could not read LH_SCRUM_FORENSIC=${process.env.LH_SCRUM_FORENSIC}: ${e}`);
return "";
}
})()
: "";
// Scoped target: 3 representative source files by default.
// The scrum-master walks these in order and produces one suggestion
// set per file. Override via env for a wider sweep.
const DEFAULT_TARGETS = [
"/home/profit/lakehouse/crates/vectord/src/playbook_memory.rs",
"/home/profit/lakehouse/crates/vectord/src/doc_drift.rs",
"/home/profit/lakehouse/auditor/audit.ts",
];
const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim())
: DEFAULT_TARGETS;
// Cloud-first ladder, STRONGEST-MODEL-FIRST (iter 3+, 2026-04-24).
// J's direction: "switch to the strongest cloud model" for iter 3 —
// the forensic prompt is demanding enough that even 120B gets rejected
// as thin. Ranked by parameter count / reasoning strength, then
// adjusted after the 2026-04-24 probe (kimi-k2.6 dropped, qwen3-coder
// promoted; see the "Dropped from the ladder" notes below):
// 1. kimi-k2:1t — 1T params, Moonshot flagship (biggest)
// 2. qwen3-coder:480b — 480B, coding specialist
// 3. deepseek-v3.1:671b — 671B, strong reasoning + coding
// 4. mistral-large-3:675b — 675B, deep analysis
// 5. gpt-oss:120b — 120B (iter 1's primary; still strong fallback)
// 6. qwen3.5:397b — 397B dense (iter 2's rescue model)
// Local fallbacks kept for cloud-down scenarios.
// Hot-path pipelines (scenario.ts / execution_loop) stay local per
// Phase 20 t1_hot — this scrum is not hot path.
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
{ provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" },
{ provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" },
{ provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" },
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B — deep analysis (0.9s probe)" },
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B — reliable workhorse (iter1 baseline)" },
{ provider: "ollama_cloud", model: "qwen3.5:397b", note: "cloud 397B dense — deep final thinker (J 2026-04-24)" },
// Free-tier rescue — different provider backbone, different quota.
// Added 2026-04-24 after iter 5 hit repeated Ollama Cloud 502s on
// kimi-k2:1t. These have lower parameter counts than the Ollama
// Cloud rungs but high availability: if upstream is down, we still
// land a review instead of giving up.
{ provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — substantive rescue, 2.8s probe" },
{ provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" },
{ provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — best local model per J (2026-04-24), last-resort if all cloud down" },
// Dropped from the ladder after 2026-04-24 probe:
// - kimi-k2.6 — not available on current tier (empty response)
// - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
// - minimax-m2.7 — 400 thinking tokens, 0 content output
// - openrouter qwen3-coder:free / llama-3.3 / hermes-3 — provider errors
// - openrouter minimax-m2.5:free — 45s timeout
];
type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number };
interface FileReview {
file: string;
file_bytes: number;
tree_split_fired: boolean;
shards_summarized: number;
top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
attempts_made: number;
attempts_history: Array<{ n: number; model: string; status: "accepted" | "thin" | "error"; chars: number; error?: string }>;
accepted_on: number | null;
escalated_to_model: string;
suggestions: string;
duration_ms: number;
}
function log(msg: string) { console.log(`[scrum] ${msg}`); }
function cosine(a: number[], b: number[]): number {
let dot = 0, na = 0, nb = 0;
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
}
function chunkText(text: string): Array<{ text: string; offset: number }> {
const out: Array<{ text: string; offset: number }> = [];
for (let i = 0; i < text.length; ) {
const end = Math.min(i + CHUNK_SIZE, text.length);
const slice = text.slice(i, end).trim();
if (slice.length > 60) out.push({ text: slice, offset: i });
if (end >= text.length) break;
i = end - CHUNK_OVERLAP;
}
return out;
}
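// Stride is CHUNK_SIZE - CHUNK_OVERLAP = 680 chars, so a 2000-char doc
// yields chunks at offsets 0, 680, 1360 (the tail chunk is shorter and
// still kept if it trims to > 60 chars).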
async function embedBatch(texts: string[]): Promise<number[][]> {
const r = await fetch(`${SIDECAR}/embed`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({ texts }),
signal: AbortSignal.timeout(120000),
});
if (!r.ok) throw new Error(`embed ${r.status}`);
return (await r.json() as any).embeddings;
}
// ─── Pathway memory (2026-04-24 consensus design) ───────────────────
//
// Mirrors vectord/src/pathway_memory.rs. The bucket-hash vector MUST
// byte-match the Rust implementation so traces written from TypeScript
// are searchable against the same embedding space. Verified by running
// both implementations on the same input tokens and asserting matching
// bucket indices.
function filePrefix(path: string): string {
return path.split("/").slice(0, 2).join("/");
}
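// e.g. filePrefix("crates/vectord/src/doc_drift.rs") === "crates/vectord",
// so all files in one crate share a pathway fingerprint prefix.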
function computePathwayId(taskClass: string, filePath: string, signalClass: string | null): string {
const h = createHash("sha256");
h.update(taskClass);
h.update("|");
h.update(filePrefix(filePath));
h.update("|");
h.update(signalClass ?? "");
return h.digest("hex");
}
// 32-bucket L2-normalized token hash. Same algorithm as Rust.
function buildPathwayVec(tokens: string[]): number[] {
const buckets = new Array(32).fill(0);
for (const t of tokens) {
const d = createHash("sha256").update(t, "utf8").digest();
const b1 = d[0] % 32;
const b2 = d[8] % 32;
buckets[b1] += 1;
buckets[b2] += 1;
}
let norm = 0;
for (const v of buckets) norm += v * v;
norm = Math.sqrt(norm);
if (norm > 0) for (let i = 0; i < buckets.length; i++) buckets[i] /= norm;
return buckets;
}
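// Minimal invariant sketch for buildPathwayVec (NOT the TS<->Rust parity
// harness, which compares against vectord's output; this only asserts
// what the function guarantees locally: determinism and unit L2 norm
// for non-empty input).
function assertPathwayVecInvariants(tokens: string[]): void {
  const a = buildPathwayVec(tokens);
  const b = buildPathwayVec(tokens);
  if (a.some((v, i) => v !== b[i])) throw new Error("pathway vec not deterministic");
  const norm = Math.sqrt(a.reduce((s, v) => s + v * v, 0));
  if (tokens.length > 0 && Math.abs(norm - 1) > 1e-9) throw new Error(`pathway vec norm ${norm} != 1`);
}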
// Build the minimal query vector for a pre-ladder hot-swap check. We
// don't yet know the ladder attempts or KB chunks — the query vec is
// computed from what we CAN know up front: task/file/signal. This is
// a weaker embedding than the one computed at trace-insert time, but
// similarity still discriminates between task/file/signal combinations.
function buildQueryVec(taskClass: string, filePath: string, signalClass: string | null): number[] {
const tokens = [taskClass, filePath];
if (signalClass) tokens.push(`signal:${signalClass}`);
return buildPathwayVec(tokens);
}
interface HotSwapCandidate {
pathway_id: string;
similarity: number;
replay_count: number;
success_rate: number;
recommended_rung: number;
recommended_model: string;
}
async function queryHotSwap(taskClass: string, filePath: string, signalClass: string | null): Promise<HotSwapCandidate | null> {
try {
const query_vec = buildQueryVec(taskClass, filePath, signalClass);
const r = await fetch(`${GATEWAY}/vectors/pathway/query`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, query_vec }),
signal: AbortSignal.timeout(5000),
});
if (!r.ok) return null;
const j = await r.json() as { candidate: HotSwapCandidate | null };
return j.candidate ?? null;
} catch {
// Pathway service unavailable → run full ladder. Hot-swap is
// always an optimization, never a correctness requirement.
return null;
}
}
interface LadderAttemptRec {
rung: number;
model: string;
latency_ms: number;
accepted: boolean;
reject_reason: string | null;
}
interface PathwayTracePayload {
pathway_id: string;
task_class: string;
file_path: string;
signal_class: string | null;
created_at: string;
ladder_attempts: LadderAttemptRec[];
kb_chunks: { source_doc: string; chunk_id: string; cosine_score: number; rank: number }[];
observer_signals: { class: string; priors: string[]; prior_iter_outcomes: string[] }[];
bridge_hits: { library: string; version: string }[];
sub_pipeline_calls: { pipeline: string; result_summary: string }[];
audit_consensus: { pass: boolean; models: string[]; disagreements: number } | null;
reducer_summary: string;
final_verdict: string;
pathway_vec: number[];
// ADR-021 semantic-correctness layer. `kind` field matches the Rust
// serde(tag = "kind") wire format — TS and Rust interop directly.
semantic_flags: { kind: string }[];
type_hints_used: { source: string; symbol: string; type_repr: string }[];
bug_fingerprints: { flag: { kind: string }; pattern_key: string; example: string; occurrences: number }[];
replay_count: number;
replays_succeeded: number;
retired: boolean;
}
async function writePathwayTrace(trace: PathwayTracePayload): Promise<void> {
try {
await fetch(`${GATEWAY}/vectors/pathway/insert`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify(trace),
signal: AbortSignal.timeout(10000),
});
} catch {
// Fire-and-forget: scrum runs shouldn't fail if pathway insert fails.
}
}
async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promise<void> {
try {
await fetch(`${GATEWAY}/vectors/pathway/record_replay`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ pathway_id: pathwayId, succeeded }),
signal: AbortSignal.timeout(5000),
});
} catch {
// Fire-and-forget. Not critical.
}
}
// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
// fingerprints for this narrow fingerprint (same key as hot-swap —
// task_class + file_prefix + signal_class) so the reviewer prompt
// can explicitly warn "this file area has had these bug patterns
// before." Empty on fresh install; grows as the matrix index learns.
interface BugFingerprintRow {
flag: { kind: string };
pattern_key: string;
example: string;
occurrences: number;
}
async function fetchBugFingerprints(taskClass: string, filePath: string, signalClass: string | null, limit: number): Promise<BugFingerprintRow[]> {
try {
const r = await fetch(`${GATEWAY}/vectors/pathway/bug_fingerprints`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, limit }),
signal: AbortSignal.timeout(5000),
});
if (!r.ok) return [];
const j = await r.json() as { fingerprints: BugFingerprintRow[] };
return j.fingerprints ?? [];
} catch {
return [];
}
}
// Deterministic signal_class lookup from scrum_reviews.jsonl history.
// First-time files get `null`. Files seen before get the signal class
// the observer assigned on their most-recent review (if any). Keeps the
// pathway fingerprint stable across iterations for LOOPING files.
async function lookupSignalClass(filePath: string): Promise<string | null> {
try {
const raw = await readFile(SCRUM_REVIEWS_JSONL, "utf8").catch(() => "");
if (!raw) return null;
const lines = raw.trim().split("\n").reverse();
for (const line of lines) {
try {
const r = JSON.parse(line);
if (r.file === filePath && r.signal_class) return r.signal_class;
} catch {}
}
return null;
} catch { return null; }
}
async function chat(opts: {
provider: "ollama" | "ollama_cloud" | "openrouter",
model: string,
prompt: string,
max_tokens?: number,
}): Promise<{ content: string; error?: string; prompt_tokens: number; completion_tokens: number }> {
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
provider: opts.provider,
model: opts.model,
messages: [{ role: "user", content: opts.prompt }],
max_tokens: opts.max_tokens ?? 1500,
temperature: 0.2,
think: false,
}),
signal: AbortSignal.timeout(180000),
});
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 200)}`, prompt_tokens: 0, completion_tokens: 0 };
const j: any = await r.json();
return {
content: j.choices?.[0]?.message?.content ?? "",
prompt_tokens: j.usage?.prompt_tokens ?? 0,
completion_tokens: j.usage?.completion_tokens ?? 0,
};
} catch (e) {
return { content: "", error: (e as Error).message, prompt_tokens: 0, completion_tokens: 0 };
}
}
// Accept a file-review answer if it's substantive + structured.
// We're not validating Rust here — we're validating that the model
// produced a coherent suggestion set.
function isAcceptable(answer: string): boolean {
if (answer.length < 200) return false; // too thin
// Must at least try a structured form — numbered list, bullets,
// or sections. Models that just hand-wave fail.
const hasStructure = /^\s*[-*]\s/m.test(answer)
|| /^\s*\d+\.\s/m.test(answer)
|| /^\s*#/m.test(answer);
return hasStructure;
}
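// e.g. a 150-char "LGTM, looks fine" fails the length gate; a 500-char
// reply containing "1. ..." / "- ..." / "# ..." lines passes both gates.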
function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
return pool
.map(c => ({ c, score: cosine(query_emb, c.embedding) }))
.sort((a, b) => b.score - a.score)
.slice(0, k)
.map(x => ({ ...x.c, _score: x.score } as any));
}
// Tree-split a large file: shard it, summarize each shard into a
// running scratchpad, THEN run a reduce step that collapses the
// scratchpad into one file-level synthesis with shard boundaries
// stripped. Returns the synthesis (not the raw scratchpad) so the
// final reviewer never sees "--- shard N ---" markers and can't
// leak them into its review title.
//
// Phase 21 design (aibridge/src/tree_split.rs) with the map → reduce
// shape. Earlier version concatenated per-shard digests directly into
// the reviewer prompt, which led to kimi-k2:1t writing review titles
// like "Forensic Audit Report file.rs (shard 3)" because the shard
// markers bled through. Fix 2026-04-24 adds the reduce step.
async function treeSplitFile(
filePath: string,
content: string,
): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
const shards: Array<{ from: number; to: number; text: string }> = [];
for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
const end = Math.min(i + FILE_SHARD_SIZE, content.length);
shards.push({ from: i, to: end, text: content.slice(i, end) });
}
// MAP — each shard produces a digest that feeds the next shard's
// context. Internal markers are kept to help the reducer align
// overlapping observations across shards; they're stripped before
// the reviewer sees anything.
let workingScratchpad = "";
let cloud_calls = 0;
log(` tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
for (const [si, shard] of shards.entries()) {
const prompt = `You are writing a SECTION of a full-file summary. File: ${filePath}. This is one piece (bytes ${shard.from}..${shard.to}) of a larger source file you are NOT seeing in its entirety right now.
─────── source ───────
${shard.text}
─────── end source ───────
Prior-piece notes so far (if empty, this is the first piece):
${workingScratchpad || "(empty)"}
Extract facts about the code in this piece that will help review the FULL file later: function + struct names with brief purpose, struct fields + types, invariants, TODOs, error-handling style, obvious gaps. Under 150 words. Flat facts only, no headings, no phrases like "this shard" or "in my section".`;
const r = await chat({
provider: "ollama_cloud",
model: "gpt-oss:120b",
prompt,
max_tokens: 400,
});
cloud_calls += 1;
if (r.content) {
// Keep internal alignment markers (§N§) for the reducer; they match
// the strip regex below and are removed before the reviewer sees them.
workingScratchpad += `\n§${si + 1}§\n${r.content.trim()}`;
}
}
// REDUCE — collapse the per-shard digests into one coherent
// file-level summary. The reducer sees all digests at once and
// produces a single narrative the reviewer can treat as "the file".
// Shard markers are NOT in the output. This is what fixes the
// shard-leakage bug that affected both the scrum and the auditor.
const reducePrompt = `You are producing a SINGLE coherent summary of a Rust/TypeScript source file from a set of prior-piece notes. The notes were taken while walking the file in order but should be merged into one description of the whole file.
FILE: ${filePath} (${content.length} bytes, ${shards.length} pieces)
PRIOR-PIECE NOTES (markers §N§ delimit pieces but are artifacts — do not mention them):
${workingScratchpad}
Produce ONE coherent file-level summary:
1. One-sentence purpose of the file.
2. Key public types / functions / constants (names + one-line purpose each).
3. Known gaps, TODOs, or error-handling inconsistencies the notes surfaced.
4. Obvious invariants the file relies on.
Do NOT say "piece 1" or "shard N" or "section" — present the summary as if you read the whole file at once. Under 600 words.`;
const reduced = await chat({
provider: "ollama_cloud",
model: "gpt-oss:120b",
prompt: reducePrompt,
max_tokens: 900,
});
cloud_calls += 1;
const synthesis = reduced.content?.trim() ?? "";
// Safety: if the reducer returned thin output, fall back to the
// raw scratchpad stripped of markers — better than nothing.
const final = synthesis.length > 200
? synthesis
: workingScratchpad.replace(/§\d+§\n/g, "").trim();
return { scratchpad: final, shards: shards.length, cloud_calls };
}
async function reviewFile(
filePath: string,
prd_chunks: Chunk[],
proposal_chunks: Chunk[],
): Promise<FileReview> {
const t0 = Date.now();
log(`file: ${filePath}`);
const content = await readFile(filePath, "utf8");
const rel = filePath.replace("/home/profit/lakehouse/", "");
// Build a query embedding from the first ~800 chars of the file
// (good enough for topical retrieval).
const seed = content.slice(0, 800);
const [seedEmb] = await embedBatch([seed]);
const topPrd = retrieveTopK(seedEmb, prd_chunks, TOP_K_CONTEXT);
const topPlan = retrieveTopK(seedEmb, proposal_chunks, TOP_K_CONTEXT);
log(` retrieved ${topPrd.length} PRD chunks + ${topPlan.length} proposal chunks`);
const contextBlock = [
"═══ RELEVANT PRD EXCERPTS ═══",
...topPrd.map(c => `[PRD @${c.offset}]\n${c.text.slice(0, 600)}`),
"",
"═══ RELEVANT CHANGE PROPOSAL EXCERPTS ═══",
...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
].join("\n\n");
// Files bigger than FILE_TREE_SPLIT_THRESHOLD get tree-split.
// Summarize each shard to a scratchpad, then review against the
// scratchpad instead of the truncated first chunk. Prevents the
// false-positive pattern where the model claims a field is
// "missing" because it's past the context cutoff.
let sourceForPrompt: string;
let treeSplitFired = false;
let shardsSummarized = 0;
let extraCloudCalls = 0;
if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
treeSplitFired = true;
const ts = await treeSplitFile(rel, content);
shardsSummarized = ts.shards;
extraCloudCalls = ts.cloud_calls;
sourceForPrompt = `[FULL-FILE SCRATCHPAD — distilled from ${ts.shards} shards via tree-split]\n${ts.scratchpad}`;
} else {
sourceForPrompt = content;
}
// Prompt — when tree-split fired, include an explicit instruction
// not to claim a field/function is "missing" because the scratchpad
// is a distillation not the full file. Attacks the rubric-tuning
// concern J called out.
const truncationWarning = treeSplitFired
? `\nIMPORTANT: the "source" below is a multi-shard distillation (tree-split across ${shardsSummarized} shards), NOT the full raw file. DO NOT claim any field, function, or feature is "missing" based on its absence from this distillation — the distillation may have elided it. Only call out gaps that appear DIRECTLY contradicted by the PRD excerpts.\n`
: "";
const forensicPrefix = FORENSIC_PREAMBLE
? `${FORENSIC_PREAMBLE}\n\n═══ FILE UNDER AUDIT ═══\n\n`
: "";
const baseTask = `${forensicPrefix}You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.
FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
${truncationWarning}
─────── source ───────
${sourceForPrompt}
─────── end source ───────
${contextBlock}
Produce a structured review with:
1. Alignment score (1-10) between this file and the PRD intent
2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change
3. Any gap where this file's behavior contradicts the PRD or the proposal
${FORENSIC_PREAMBLE ? "4. Apply the forensic audit passes from the preamble: pseudocode detection, PRD contract status, normalization/validation pipeline, failure→repair loop, execution memory, relevance orchestration, execution safety, testing evidence. Issue a verdict pass|needs_patch|fail." : ""}
**Per-finding confidence (required on every suggestion):**
Attach a self-assessed **Confidence: NN%** to every suggested change AND every gap you list. The percentage is your belief that the suggestion is correct, will compile, and lands the PRD intent. Calibration guide:
- 90-100%: pattern seen repeatedly in shipped code; change is mechanical; low risk of regressions
- 70-89%: confident in direction, some room for interpretation on API shape or naming
- 50-69%: plausible fix but may not match existing conventions or may cascade to other files
- <50%: genuinely uncertain — include regardless so downstream knows to investigate before applying
Format each finding as: \`**1.** <change>. **Confidence: NN%.**\` (in tables, add a final "Confidence" column.) Low confidence is valuable signal — do not round up.
**Per-finding semantic-flag tag (ADR-021, required on every finding):**
Also attach a \`**Flag: <CATEGORY>**\` on each finding so the pathway-memory matrix index can cluster bug classes over time. Pick the ONE tag that best fits; if none fits, use \`None\`. Allowed categories:
- \`UnitMismatch\` — operation combines values with different units (e.g. row_count - file_count, bytes - rows)
- \`TypeConfusion\` — same type, wrong role (e.g. treating a PK as a row index)
- \`NullableConfusion\` — unwrap-without-check or nullable-treated-as-non-null
- \`OffByOne\` — loop / range / slice boundary mistake
- \`StaleReference\` — calls a deprecated / removed / moved symbol
- \`PseudoImpl\` — stub / todo!() / function named for work it doesn't do
- \`DeadCode\` — unreachable or uncalled code
- \`WarningNoise\` — compiles green but would add a cargo warning
- \`BoundaryViolation\` — crosses a crate/layer boundary it shouldn't
- \`None\` — improvement or nicety that doesn't fit a bug category
In tables, add a "Flag" column. Examples:
\`**1.** Rewrite base_rows calc. **Confidence: 90%.** **Flag: UnitMismatch.**\`
\`**2.** Extract retry loop. **Confidence: 75%.** **Flag: None.**\`
Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`;
const history: FileReview["attempts_history"] = [];
let accepted: string | null = null;
let acceptedModel = "";
let acceptedOn = 0;
// Pathway hot-swap pre-check. If a proven pathway exists for this
// (task, file_prefix, signal) combo with ≥3 replays at ≥80% success,
// skip the ladder and try its winning rung first. On success we
// record a positive replay; on failure we fall through to the full
// ladder and record a negative replay. Fire-and-forget — pathway
// service unavailable → null candidate → business as usual.
const signalClass = await lookupSignalClass(rel);
const taskClass = "scrum_review";
const hotSwap = await queryHotSwap(taskClass, rel, signalClass);
// ADR-021 Phase C: pre-review enrichment. Pull aggregated bug
// fingerprints the matrix index has learned for this narrow
// fingerprint and prepend to the reviewer prompt as historical
// context. This is the compounding mechanism — iter-N reviewer
// sees what iter-(N-1) and earlier found, so the grammar of bugs
// accumulates instead of being re-discovered each iteration.
const pastFingerprints = await fetchBugFingerprints(taskClass, rel, signalClass, 5);
let pathwayPreamble = "";
if (pastFingerprints.length > 0) {
pathwayPreamble = "═══ PATHWAY MEMORY — BUGS PREVIOUSLY FOUND ON THIS FILE AREA (ADR-021) ═══\n" +
"The matrix index has flagged these patterns on the same task_class + file_prefix + signal_class before. Check this file for recurrences of the same shape:\n\n" +
pastFingerprints.map((fp, i) =>
`${i + 1}. [${fp.flag.kind}] pattern=\`${fp.pattern_key}\` occurrences=${fp.occurrences}\n example: ${fp.example.slice(0, 160)}`
).join("\n") +
"\n═══\n\n";
log(` 📚 pathway memory: ${pastFingerprints.length} historical bug pattern(s) prepended to prompt`);
}
let hotSwapOrderedIndices: number[] | null = null;
if (hotSwap) {
// Reorder the ladder to try the recommended model first. Rung
// indices are preserved in the output so the trace still reflects
// the true ladder position the model sits at.
const recommendedIdx = LADDER.findIndex(r => r.model === hotSwap.recommended_model);
if (recommendedIdx >= 0) {
log(` 🔥 hot-swap candidate: ${hotSwap.recommended_model} (rung ${hotSwap.recommended_rung}, sim=${hotSwap.similarity.toFixed(3)}, success_rate=${hotSwap.success_rate.toFixed(2)}, ${hotSwap.replay_count} replays)`);
hotSwapOrderedIndices = [recommendedIdx, ...LADDER.map((_, i) => i).filter(i => i !== recommendedIdx)];
}
}
const ladderOrder = hotSwapOrderedIndices ?? LADDER.map((_, i) => i);
// Collect attempts for the pathway trace sidecar.
const pathwayAttempts: LadderAttemptRec[] = [];
for (let step = 0; step < MAX_ATTEMPTS; step++) {
const i = ladderOrder[step];
const n = step + 1;
const rung = LADDER[i];
const learning = history.length > 0
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status}: ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
: "";
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`);
const attemptStarted = Date.now();
const r = await chat({
provider: rung.provider,
model: rung.model,
prompt: pathwayPreamble + baseTask + learning,
max_tokens: 1500,
});
const attemptMs = Date.now() - attemptStarted;
if (r.error) {
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
log(` ✗ error: ${r.error.slice(0, 80)}`);
continue;
}
if (!isAcceptable(r.content)) {
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
log(` ✗ thin/unstructured (${r.content.length} chars)`);
continue;
}
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
accepted = r.content;
acceptedModel = `${rung.provider}/${rung.model}`;
acceptedOn = n;
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
break;
}
// Hot-swap bookkeeping: if we tried the recommended model first,
// report whether it worked so the pathway's success_rate updates.
if (hotSwap) {
const replaySucceeded = acceptedModel.endsWith(`/${hotSwap.recommended_model}`);
log(` pathway replay ${replaySucceeded ? "✓" : "✗"} (${hotSwap.pathway_id.slice(0, 12)}…)`);
// Fire and forget: don't await; recordPathwayReplay swallows failures.
recordPathwayReplay(hotSwap.pathway_id, replaySucceeded);
}
const review: FileReview = {
file: rel,
file_bytes: content.length,
tree_split_fired: treeSplitFired,
shards_summarized: shardsSummarized,
top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
attempts_made: history.length,
attempts_history: history,
accepted_on: acceptedOn || null,
escalated_to_model: acceptedModel,
suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
duration_ms: Date.now() - t0,
};
// Append to the shared scrum-reviews jsonl so the auditor's
// kb_query check can surface relevant reviews for files in a
// PR diff. Cohesion plan Phase C wire.
if (accepted) {
const { appendFile, mkdir } = await import("node:fs/promises");
const { dirname } = await import("node:path");
await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });
// Extract per-finding confidences from the accepted markdown.
// Patterns tried: "Confidence: NN%", "Confidence**: NN%",
// and table-cell "| 92% |". Cap at 40 matches to bound row size.
// Added 2026-04-23 (iter 2 direction from J: "make scrum output
// include self-assessed confidence per finding").
const confidences: number[] = [];
// Markdown format: "Confidence: 92%" / "Confidence**: 92%" / "| 92% |"
const patMarkdown = /(?:Confidence[*:\s]*\s*|\|\s*)(\d{1,3})\s*%/gi;
// JSON format (forensic strict output): "confidence": 92
const patJson = /"confidence"\s*:\s*(\d{1,3})(?!\d)/gi;
for (const pat of [patMarkdown, patJson]) {
const matches = accepted.matchAll(pat);
for (const hit of matches) {
if (confidences.length >= 40) break;
const pct = parseInt(hit[1], 10);
if (pct >= 0 && pct <= 100) confidences.push(pct);
}
}
const conf_avg = confidences.length
? Math.round(confidences.reduce((a, b) => a + b, 0) / confidences.length)
: null;
const conf_min = confidences.length ? Math.min(...confidences) : null;
// ADR-021 Phase B: extract per-finding semantic flags. Reviewer is
// prompted to tag each finding with one of 9 categories plus None.
// Patterns: "**Flag: UnitMismatch**", "Flag: OffByOne", table cell
// with the flag word, or bare-word match. Deduplicated per-trace
// so repeats in one review count once.
const FLAG_VARIANTS = [
"UnitMismatch", "TypeConfusion", "NullableConfusion", "OffByOne",
"StaleReference", "PseudoImpl", "DeadCode", "WarningNoise", "BoundaryViolation",
];
const flagMatches = new Set<string>();
// Prefer matches anchored to the "Flag:" keyword; fall back to
// bare-word matches so older reviewers that mention a category
// without the "Flag:" prefix still contribute signal.
const patFlagLabeled = /(?:Flag[*:\s]*\s*)([A-Z][A-Za-z]+)/g;
for (const m of accepted.matchAll(patFlagLabeled)) {
if (FLAG_VARIANTS.includes(m[1])) flagMatches.add(m[1]);
}
// Second pass — bare-word matches for each variant, but ONLY if
// the labeled pass produced nothing. This avoids flagging every
// file that happens to mention "DeadCode" in a code sample.
if (flagMatches.size === 0) {
for (const v of FLAG_VARIANTS) {
const re = new RegExp(`\\b${v}\\b`);
if (re.test(accepted)) flagMatches.add(v);
}
}
const semantic_flags_arr = [...flagMatches].map(k => ({ kind: k }));
// ADR-021 Phase D: bug_fingerprint extraction.
//
// Walk per-finding rows (either table format with columns
// `Change | Flag | Confidence` OR bullet-list with inline
// `**Flag: X.**` tag) and pair each flag with the surrounding
// finding text. Then derive a stable pattern_key from code
// identifiers the finding cites in backticks, so future reviews
// of similar bugs cluster under the same key.
//
// v1 is heuristic (regex + identifier extraction + canonical
// sort). It's intentionally NOT a semantic extractor — just a
// deterministic "take the top code-shaped tokens and hash them
// with the flag." Stability comes from sorting tokens alphabetically
// before hashing so "row_count + QueryResponse" and "QueryResponse
// + row_count" produce the same key.
const bug_fingerprints_arr: Array<{
flag: { kind: string };
pattern_key: string;
example: string;
occurrences: number;
}> = [];
{
// Split into candidate finding blocks. Both formats are row-
// oriented, so a line split is a reasonable starting point.
// Findings tend to be one-line table rows OR multi-line bullets
// starting with **N.** — we handle both by looking at any line
// that mentions a Flag variant and treating it as a finding.
const lines = accepted.split(/\r?\n/);
const seenKeys = new Set<string>();
for (const line of lines) {
// Find the flag variant on this line (if any).
let variantOnLine: string | null = null;
for (const v of FLAG_VARIANTS) {
const re = new RegExp(`\\b${v}\\b`);
if (re.test(line)) { variantOnLine = v; break; }
}
if (!variantOnLine) continue;
// Extract identifier-shaped tokens from backticks. We try two
// levels: (a) whole-backtick match if it's a clean identifier
// or path, (b) for complex content like function signatures
// (`Foo::bar(&self) -> u64`) pull out the longest identifier
// substrings so we still capture the callable.
const codeTokens: string[] = [];
const idRe = /[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*/g;
for (const m of line.matchAll(/`([^`]+)`/g)) {
const raw = m[1].trim();
// Whole-backtick identifier or dotted path? (`row_count`,
// `AccessControl::can_access`, `foo.bar`).
if (/^[A-Za-z_][A-Za-z0-9_:]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?$/.test(raw)) {
if (raw.length >= 3) codeTokens.push(raw);
continue;
}
// Fallback: scan for identifier substrings, take the longest
// meaningful ones (usually the function or type name comes
// first in a signature like `Foo::bar(&self)`).
const ids = [...raw.matchAll(idRe)]
.map(x => x[0])
.filter(id => id.length >= 3);
// Prefer ::-qualified paths first (they're more specific),
// then the top-2 longest; keeps the key stable under
// signature variation.
const ranked = ids
.map(id => ({ id, score: (id.includes("::") ? 1000 : 0) + id.length }))
.sort((a, b) => b.score - a.score)
.slice(0, 2)
.map(x => x.id);
codeTokens.push(...ranked);
}
// Remove the flag variant name itself if it got captured (kimi
// and other reviewers often wrap the flag column in backticks).
// Also drop Rust + common keywords that slip through the
// identifier regex — "self", "mut", "async", "await", "pub"
// aren't bug-shape signal, they're grammar.
const FLAG_SET = new Set(FLAG_VARIANTS);
const KEYWORDS = new Set([
"self", "Self", "mut", "async", "await", "pub", "fn", "let",
"const", "static", "impl", "trait", "struct", "enum", "use",
"mod", "crate", "super", "match", "return", "Some", "None",
"Ok", "Err", "true", "false",
// Markdown table column headers kimi outputs for structured
// reviews — "Flag" / "Change" / "Confidence" are layout words,
// not identifiers. Seen as noise in iter 11 vectord extraction
// ("DeadCode:Flag" pattern_key).
"Flag", "Change", "Confidence", "PRD", "Plan",
]);
const filtered = codeTokens.filter(t => !FLAG_SET.has(t) && !KEYWORDS.has(t));
if (filtered.length === 0) continue;
// Canonicalize: dedupe, sort alphabetically, take top 3.
// Alphabetical sort gives stability across "A then B" / "B then A"
// variants. Top 3 keeps the key short while retaining enough
// signal for different bugs to separate.
const uniqTokens = [...new Set(filtered)].sort().slice(0, 3);
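// e.g. a UnitMismatch finding citing `row_count` and `file_count`
// canonicalizes to pattern_key "UnitMismatch:file_count-row_count"
// regardless of citation order.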
const pattern_key = `${variantOnLine}:${uniqTokens.join("-")}`;
if (seenKeys.has(pattern_key)) continue;
seenKeys.add(pattern_key);
// Example: the finding line, trimmed + truncated. Preserves
// just enough context that the pre-review preamble in the
// next iter can quote it back to the reviewer meaningfully.
const example = line.replace(/\s+/g, " ").trim().slice(0, 200);
bug_fingerprints_arr.push({
flag: { kind: variantOnLine },
pattern_key,
example,
occurrences: 1,
});
}
}
// Score extraction — regex accepts decimals ("Score: 4.5/10") and
// surrounding punctuation ("4/10 — mid"). iter 3 had 4 unparseable
// scores because the prior regex /(\d)\s*\/\s*10/ missed decimals.
// \d{1,2} lets a perfect "10/10" capture 10 instead of the stray 0.
const scoreMatch = accepted.match(/(?:score[\s*:]*)?(\d{1,2}(?:\.\d)?)\s*\/\s*10\b/i);
const alignment_score = scoreMatch ? parseFloat(scoreMatch[1]) : null;
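// e.g. "Alignment score: 4.5/10" -> 4.5, "9/10" -> 9, "10/10" -> 10.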
// Forensic JSON extraction — iter 3 showed 20/21 files came back
// as JSON (verdict + critical_failures[] + verified_components[] + ...)
// rather than markdown tables. Previously we only stored suggestions_preview
// (truncated to 2KB); now we also capture the structured counters so
// consumers can filter by verdict, sort by critical_failures_count, etc.
let verdict: string | null = null;
let critical_failures_count = 0;
let pseudocode_flags_count = 0;
let prd_mismatches_count = 0;
let missing_components_count = 0;
let verified_components_count = 0;
let risk_points_count = 0;
const isJsonShape = accepted.includes('"verdict"');
if (isJsonShape) {
const vm = accepted.match(/"verdict"\s*:\s*"([a-z_]+)"/i);
verdict = vm ? vm[1] : null;
// Count object entries per array by counting occurrences of
// either a unique-per-entry field name or {...} bracket pairs
// inside the array span. A straight "count opening braces inside
// the array range" is simplest and robust to field order.
const countArrayEntries = (arrayName: string): number => {
const re = new RegExp(`"${arrayName}"\\s*:\\s*\\[([\\s\\S]*?)\\]`, "i");
const m = accepted.match(re);
if (!m || !m[1].trim()) return 0;
// Count opening braces of direct-child objects.
let depth = 0, entries = 0;
for (const ch of m[1]) {
if (ch === '{') { if (depth === 0) entries++; depth++; }
else if (ch === '}') depth--;
}
return entries;
};
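// e.g. '"critical_failures": [{...}, {"nested": {...}}]' counts 2:
// only braces opened at depth 0 inside the array span are entries.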
critical_failures_count = countArrayEntries("critical_failures");
pseudocode_flags_count = countArrayEntries("pseudocode_flags");
prd_mismatches_count = countArrayEntries("prd_mismatches");
missing_components_count = countArrayEntries("missing_components");
verified_components_count = countArrayEntries("verified_components");
risk_points_count = countArrayEntries("risk_points");
}
// Permission Gradient (Layer #6 from SYSTEM_EVOLUTION_LAYERS.md).
// Classify the overall finding set by confidence_avg:
// ≥90 auto-apply-safe, ≥70 dry-run + diff, ≥50 simulation only,
// <50 block (human review). Use conf_min as the tier-lower-bound
// so one shaky finding drags the whole row down to the safer tier.
const tierFor = (c: number | null): string => {
if (c === null) return "unknown";
if (c >= 90) return "auto";
if (c >= 70) return "dry_run";
if (c >= 50) return "simulation";
return "block";
};
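// e.g. findings with confidences [95, 65]: conf_avg = 80 -> "dry_run",
// conf_min = 65 -> "simulation"; the row lands in the safer tier.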
const gradient_tier = tierFor(conf_min); // conservative: weakest finding decides
const gradient_tier_avg = tierFor(conf_avg);
const row = {
file: rel,
reviewed_at: new Date().toISOString(),
accepted_model: acceptedModel,
accepted_on_attempt: acceptedOn,
attempts_made: history.length,
tree_split_fired: treeSplitFired,
suggestions_preview: accepted.slice(0, 2000),
// Iter-3+ confidence fields.
confidences_per_finding: confidences,
confidence_avg: conf_avg,
confidence_min: conf_min,
findings_count: confidences.length,
// Layer #6 Permission Gradient — downstream consumers decide
// apply-semantics based on these fields instead of blindly trusting
// every suggestion.
gradient_tier,
gradient_tier_avg,
// Score (decimal-aware) and forensic JSON structured counters.
// iter 4+ fields (schema_version 4).
alignment_score,
output_format: isJsonShape ? "forensic_json" : "markdown",
verdict,
critical_failures_count,
pseudocode_flags_count,
prd_mismatches_count,
missing_components_count,
verified_components_count,
risk_points_count,
schema_version: 4,
scrum_master_reviewed: true,
};
// Layer #2 Model Trust Profiling — append one row per file-accept
// so over time we can compute per-(model, task_type) success/thin
// rates and trust scores. task_type here is inferred from the file
// path — good enough for initial stats, refine later.
try {
const taskType = rel.includes("/truth/") ? "truth"
: rel.includes("/gateway/") ? "gateway"
: rel.includes("/vectord") ? "vector"
: rel.includes("/ingestd") ? "ingest"
: rel.includes("/queryd") ? "query"
: rel.includes("/storaged") || rel.includes("/catalogd") || rel.includes("/journald") ? "storage"
: rel.includes("/aibridge") ? "aibridge"
: "other";
const trustRow = {
run_id: OUT_DIR.split("/").pop(),
reviewed_at: row.reviewed_at,
file: rel,
task_type: taskType,
accepted_model: acceptedModel,
accepted_on_attempt: acceptedOn,
attempts_made: history.length,
thin_rejections: history.filter(h => h.status === "thin").length,
errors: history.filter(h => h.status === "error").length,
confidence_avg: conf_avg,
tree_split_fired: treeSplitFired,
};
await appendFile("/home/profit/lakehouse/data/_kb/model_trust.jsonl",
JSON.stringify(trustRow) + "\n");
} catch (e) {
console.error(`[scrum] model_trust append failed: ${(e as Error).message}`);
}
try {
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
} catch (e) {
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
}
// Pathway trace sidecar (consensus-designed 2026-04-24). Captures
// FULL context (ladder attempts, KB chunks, observer signal, verdict)
// for similarity-based hot-swap in future iterations. First-review
// pathways start in probation (replay_count=0); they become
// hot-swappable only after ≥3 replays at ≥80% success.
try {
const pathwayTrace: PathwayTracePayload = {
pathway_id: computePathwayId(taskClass, rel, signalClass),
task_class: taskClass,
file_path: rel,
signal_class: signalClass,
created_at: row.reviewed_at,
ladder_attempts: pathwayAttempts,
kb_chunks: [
...topPrd.map((c, idx) => ({
source_doc: "PRD.md", chunk_id: `prd@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx,
})),
...topPlan.map((c, idx) => ({
source_doc: "cohesion_plan", chunk_id: `plan@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx,
})),
],
observer_signals: signalClass ? [{ class: signalClass, priors: [], prior_iter_outcomes: [] }] : [],
bridge_hits: [], // context7 not wired into scrum yet; empty for v1
sub_pipeline_calls: [], // LLM Team extract happens after this row; out of scope for v1
audit_consensus: null, // set by auditor's later N=3 pass, via /pathway/insert update
reducer_summary: accepted.slice(0, 4000),
final_verdict: verdict ?? "accepted",
// Vec built from the full attempts/chunks — richer than the
// query-time vector. The similarity gate will still discriminate
// between pathways with the same fingerprint but different
// ladder/KB profiles.
// Include semantic flag tokens in the embedding so traces with
// different bug histories cluster separately — matches Rust's
// build_pathway_vec exactly (flag:<Variant> token shape).
pathway_vec: buildPathwayVec([
taskClass,
rel,
...(signalClass ? [`signal:${signalClass}`] : []),
...pathwayAttempts.flatMap(a => [`rung:${a.rung}`, `model:${a.model}`, `accepted:${a.accepted}`]),
...topPrd.map(() => "kb:PRD.md"),
...topPlan.map(() => "kb:cohesion_plan"),
...semantic_flags_arr.map(f => `flag:${f.kind}`),
]),
semantic_flags: semantic_flags_arr,
type_hints_used: [], // Phase E — pre-review enrichment from catalogd/arrow/truth
bug_fingerprints: bug_fingerprints_arr, // ADR-021 Phase D
replay_count: 0,
replays_succeeded: 0,
retired: false,
};
writePathwayTrace(pathwayTrace); // fire-and-forget
} catch (e) {
console.error(`[scrum] pathway trace failed: ${(e as Error).message}`);
}
// Close the scrum → observer loop (fix 2026-04-24). Architecture
// audit surfaced: observer ring had 2000 ops, 1999 from Langfuse,
// zero from scrum. Observer's analyzeErrors + PLAYBOOK_BUILDER loops
// were blind to the very pipeline most likely to teach them. One
// fire-and-forget POST wires them in. Observer tolerates unreachable
// backends; no scrum run fails if observer is down.
//
// Schema matches observer's ObservedOp shape (source, staffer_id,
// sig_hash, event_kind, success, ...). file + accepted_model +
// confidence_avg + gradient_tier give downstream analyzers enough
// signal to correlate reviews with later regressions.
try {
const sigHash = createHash("sha256")
.update(`${rel}|${OUT_DIR.split("/").pop()}`)
.digest("hex")
.slice(0, 16);
fetch("http://localhost:3800/event", {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
source: "scrum",
staffer_id: "scrum_master",
sig_hash: sigHash,
event_kind: "file_review",
success: true,
run_id: OUT_DIR.split("/").pop(),
file: rel,
accepted_model: acceptedModel,
accepted_on_attempt: acceptedOn,
attempts_made: history.length,
thin_rejections: history.filter(h => h.status === "thin").length,
confidence_avg: conf_avg,
confidence_min: conf_min,
findings_count: confidences.length,
gradient_tier,
tree_split_fired: treeSplitFired,
// iter4+ forensic-JSON fields so observer's analyzer can
// route by verdict / sort by critical_failures_count
alignment_score,
verdict,
output_format: isJsonShape ? "forensic_json" : "markdown",
critical_failures_count,
verified_components_count,
missing_components_count,
// Pathway fields: emitted on every review so the observer
// can build a full picture of hot-swap performance over time.
// `pathway_hot_swap_hit` flags whether the first-tried rung
// this review was a pathway recommendation vs the default
// ladder top. `rungs_saved` quantifies the compute we avoided
// when a hot-swap landed — this is the value metric the VCP
// UI surfaces ("avg_rungs_saved_per_commit").
pathway_hot_swap_hit: hotSwap !== null,
pathway_id: hotSwap?.pathway_id ?? null,
pathway_similarity: hotSwap?.similarity ?? null,
pathway_success_rate: hotSwap?.success_rate ?? null,
rungs_saved: hotSwap && acceptedModel.endsWith(`/${hotSwap.recommended_model}`)
? Math.max(0, hotSwap.recommended_rung - 1)
: 0,
ts: row.reviewed_at,
}),
signal: AbortSignal.timeout(3000),
}).catch(() => {
// observer down — not a scrum-run failure, just lose the signal.
});
} catch (e) {
// Synchronous construction error — ignore.
}
// Route the accepted review through llm_team's fact extractor so
// its entities + relationships land in audit_facts.jsonl alongside
// inference-side extractions. Same index, two sources. Tagged
// source:"scrum_review" + scrum_master_reviewed:true so downstream
// queries can filter by provenance. Reviews shorter than 120 chars
// are skipped as no-signal one-liners ("LGTM"); this is belt-and-
// braces, since isAcceptable already enforces >= 200 chars upstream.
if (accepted.length >= 120 && process.env.LH_SCRUM_SKIP_EXTRACT !== "1") {
try {
const { extractFacts } = await import("../../auditor/fact_extractor.ts");
const ex = await extractFacts(accepted);
// Persist when extraction succeeded outright, or when it errored
// but still returned partial entities/facts worth keeping.
if (!ex.error || ex.entities.length + ex.facts.length > 0) {
const factRow = {
pr_number: 0, // scrum runs outside a PR scope
file: rel,
head_sha: "", // no SHA scope; scope is the file+timestamp
extracted_at: ex.extracted_at,
extractor: ex.extractor_model,
verifier: ex.verifier_model,
llm_team_run_id: ex.llm_team_run_id ?? null,
facts: ex.facts,
entities: ex.entities,
relationships: ex.relationships,
verification_preview: ex.verification.slice(0, 400),
schema_version: 2,
source: "scrum_review",
scrum_master_reviewed: true,
};
const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(factRow) + "\n");
}
} catch (e) {
console.error(`[scrum] fact extraction failed for ${rel}: ${(e as Error).message}`);
}
}
}
return review;
}
async function loadAndChunk(path: string, origin_tag: string): Promise<Chunk[]> {
const text = await readFile(path, "utf8");
const raw = chunkText(text);
const embs = await embedBatch(raw.map(r => r.text));
return raw.map((r, i) => ({
id: createHash("sha256").update(r.text).digest("hex").slice(0, 10),
text: r.text,
embedding: embs[i],
origin: origin_tag,
offset: r.offset,
}));
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`output: ${OUT_DIR}`);
log(`targets: ${TARGET_FILES.length} files`);
log("loading + embedding PRD...");
const prd_chunks = await loadAndChunk(PRD_PATH, "PRD");
log(` PRD: ${prd_chunks.length} chunks`);
log("loading + embedding cohesion plan...");
const plan_chunks = await loadAndChunk(PROPOSAL_PATH, "COHESION_PLAN");
log(` plan: ${plan_chunks.length} chunks`);
log("");
log("─── scrum master: walking target files ───");
const reviews: FileReview[] = [];
for (const f of TARGET_FILES) {
const review = await reviewFile(f, prd_chunks, plan_chunks);
reviews.push(review);
await writeFile(
`${OUT_DIR}/review_${review.file.replace(/\//g, "_")}.json`,
JSON.stringify(review, null, 2),
);
log(`${review.file}: ${review.accepted_on ? `accepted on ${review.accepted_on} by ${review.escalated_to_model}` : "UNRESOLVED"} (${review.duration_ms}ms)`);
}
// Consolidated scrum-master report
const report_md: string[] = [];
report_md.push(`# Scrum-master review\n`);
report_md.push(`Generated: ${new Date().toISOString()}`);
report_md.push(`Files reviewed: ${reviews.length}`);
report_md.push(`Total duration: ${(reviews.reduce((s, r) => s + r.duration_ms, 0) / 1000).toFixed(1)}s\n`);
for (const r of reviews) {
report_md.push(`\n## ${r.file}`);
report_md.push(`- **Accepted on attempt:** ${r.accepted_on ?? `NOT resolved after ${MAX_ATTEMPTS} attempts`}`);
report_md.push(`- **Escalated to:** \`${r.escalated_to_model || "—"}\``);
report_md.push(`- **Total attempts:** ${r.attempts_made}`);
if (r.attempts_history.length > 1) {
report_md.push(`- **Attempt history:**`);
for (const h of r.attempts_history) {
report_md.push(` - ${h.n}: \`${h.model}\` ${h.status}${h.error ? ` (${h.error.slice(0, 100)})` : ""}`);
}
}
report_md.push(`\n### Suggestions\n\n${r.suggestions}\n`);
}
await writeFile(`${OUT_DIR}/scrum_report.md`, report_md.join("\n"));
const summary = {
ran_at: new Date().toISOString(),
target_count: TARGET_FILES.length,
resolved: reviews.filter(r => r.accepted_on !== null).length,
total_attempts: reviews.reduce((s, r) => s + r.attempts_made, 0),
total_duration_ms: reviews.reduce((s, r) => s + r.duration_ms, 0),
per_file: reviews.map(r => ({ file: r.file, accepted_on: r.accepted_on, model: r.escalated_to_model, attempts: r.attempts_made, ms: r.duration_ms })),
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log("═══ SCRUM REPORT ═══");
log(` files: ${summary.target_count}, resolved: ${summary.resolved}, total attempts: ${summary.total_attempts}`);
log(` total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log("");
for (const p of summary.per_file) {
const mark = p.accepted_on ? "✓" : "✗";
log(` ${mark} ${p.file.padEnd(60)} attempt ${p.accepted_on ?? "—"}/${p.attempts} ${p.model} ${p.ms}ms`);
}
log("");
log(`report: ${OUT_DIR}/scrum_report.md`);
process.exit(summary.resolved === summary.target_count ? 0 : 1);
}
main().catch(e => { console.error("[scrum] fatal:", e); process.exit(2); });