// Scrum-master orchestrator — pulls git repo source + PRD + a change
// proposal, chunks everything, hands each code piece to the proven
// escalation ladder (small-local → big-local → cloud → specialist →
// biggest) with learning context between attempts. Collects per-file
// suggestions in a coherent handoff report.
//
// What it composes (everything below is already shipped + proven):
// - Chunker + embeddings (sidecar /embed, nomic-embed-text)
// - In-memory cosine retrieval (top-K PRD + plan chunks per file)
// - Escalation ladder (6 tiers, cycling on empty/error/thin-answer)
// - Per-attempt learning-context injection (prior failures → prompt)
// - Tree-split fallback when combined context exceeds budget
// - JSONL output per file + summary
//
// Deliberate scope limit: TARGET_FILES is 3 files by default. The
// pipeline works at larger N, but at ~90s/file × 3 files = 4-5 min,
// 15 files = 22 min. Bump via env LH_SCRUM_FILES="path1,path2,...".
//
// Run: bun run tests/real-world/scrum_master_pipeline.ts

import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";

const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CHUNK_SIZE = 800;
const CHUNK_OVERLAP = 120;
const TOP_K_CONTEXT = 5;
const MAX_ATTEMPTS = 9;

// Files larger than this get tree-split instead of truncated. Fixes the
// 6KB false-positive class (model claiming a field is "missing" when
// it exists past the context cutoff).
// Env-configurable so the pipeline can adapt to different repos:
// a 13K-line Python file like /root/llm-team-ui/llm_team_ui.py needs
// larger shards to avoid producing 200+ cloud calls per review.
// Defaults stay at 6000 / 3500 — tuned for Rust source files in
// crates/*/src/*.rs.
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);

// Same-model retry budget after observer rejection. After this many
// quality rejects on the current model, advance to the next provider-
// error fallback. Counts ONLY observer/quality rejects, not provider
// errors (which advance immediately).
const MAX_QUALITY_RETRIES = Number(process.env.LH_SCRUM_MAX_QUALITY_RETRIES ?? 2);

// Appended jsonl so the auditor's kb_query can surface scrum findings for
// files touched by a PR under review. Part of cohesion plan Phase C.
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT ||
  "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";

const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
const PRD_PATH = process.env.LH_SCRUM_PRD || "/home/profit/lakehouse/docs/PRD.md";

// Using CONTROL_PLANE_PRD as the "suggested changes" doc since it
// describes the Phase 38-44 target architecture and is on main.
// Override via LH_SCRUM_PROPOSAL env to point at a fix-wave doc
// generated from a phase-sweep audit, so the scrum pulls direction
// from concrete findings instead of the high-level PRD alone.
const PROPOSAL_PATH = process.env.LH_SCRUM_PROPOSAL ||
  "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md";

// Iter 2+ — when LH_SCRUM_FORENSIC is set to a file path, prepend its
// contents as an adversarial auditor preamble to every per-file prompt.
// This flips the review tone from "suggest improvements" to "prove it
// works or mark FAIL." Added 2026-04-23 for iter 2 of the 6x loop.
// Empty string = no preamble (iter-1 behavior).
const FORENSIC_PREAMBLE = process.env.LH_SCRUM_FORENSIC
  ? (() => {
      try {
        return require("node:fs").readFileSync(process.env.LH_SCRUM_FORENSIC!, "utf8");
      } catch (e) {
        console.error(`[scrum] warning: could not read LH_SCRUM_FORENSIC=${process.env.LH_SCRUM_FORENSIC}: ${e}`);
        return "";
      }
    })()
  : "";

// Scoped target: 3 representative source files by default.
// The scrum-master walks these in order and produces one suggestion
// set per file. Override via env for a wider sweep.
const DEFAULT_TARGETS = [
  "/home/profit/lakehouse/crates/vectord/src/playbook_memory.rs",
  "/home/profit/lakehouse/crates/vectord/src/doc_drift.rs",
  "/home/profit/lakehouse/auditor/audit.ts",
];
const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
  ? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim())
  : DEFAULT_TARGETS;
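// Usage sketch (illustrative; the file list and forensic doc path here are
// hypothetical, not checked in):
//   bun run tests/real-world/scrum_master_pipeline.ts
//   LH_SCRUM_FILES="/home/profit/lakehouse/crates/queryd/src/lib.rs,/home/profit/lakehouse/auditor/audit.ts" \
//   LH_SCRUM_FORENSIC=/home/profit/lakehouse/docs/SCRUM_FORENSIC_PROMPT.md \
//     bun run tests/real-world/scrum_master_pipeline.ts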
// Cloud-first ladder, STRONGEST-MODEL-FIRST (iter 3+, 2026-04-24).
// J's direction: "switch to the strongest cloud model" for iter 3 —
// the forensic prompt is demanding enough that even 120B gets rejected
// as thin. Rank by parameter count / reasoning strength:
// 1. kimi-k2:1t — 1T params, Moonshot flagship (biggest)
// 2. kimi-k2.6 — Moonshot next-gen, pro tier
// 3. deepseek-v3.1:671b — 671B, strong reasoning + coding
// 4. mistral-large-3:675b — 675B, deep analysis
// 5. qwen3.5:397b — 397B (iter 2's rescue model)
// 6. gpt-oss:120b — 120B (iter 1's primary; still strong fallback)
// Local fallbacks kept for cloud-down scenarios.
// Hot-path pipelines (scenario.ts / execution_loop) stay local per
// Phase 20 t1_hot — this scrum is not hot path.
//
// 2026-04-25 J architectural correction: stop cascading models on
// every failure. ONE model handles the work, with same-model retries
// using enriched context. Cycle to a different model ONLY on PROVIDER
// errors (network/auth/5xx) — not on quality issues. Quality issues
// signal that the context needs more enrichment, not a different model.
//
// Tree-split (treeSplitFile) is the ONE legitimate model-switch trigger
// for context-overflow, and even that just re-runs the same model
// against smaller chunks.
//
// This ladder is now a SAFETY chain for provider failures, not the
// strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were
// removed — they're available as routable tools later (mode router)
// but not as automatic fallbacks.
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
  { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" },
  { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" },
  { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" },
  // Dropped from the ladder after 2026-04-24 probe:
  // - kimi-k2.6 — not available on current tier (empty response)
  // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
  // - minimax-m2.7 — 400 thinking tokens, 0 content output
  // - openrouter qwen3-coder:free / llama-3.3 / hermes-3 — provider errors
  // - openrouter minimax-m2.5:free — 45s timeout
];

type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number };

interface FileReview {
  file: string;
  file_bytes: number;
  tree_split_fired: boolean;
  shards_summarized: number;
  top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
  top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
  attempts_made: number;
  attempts_history: Array<{ n: number; model: string; status: "accepted" | "thin" | "error"; chars: number; error?: string }>;
  accepted_on: number | null;
  escalated_to_model: string;
  suggestions: string;
  duration_ms: number;
}

function log(msg: string) { console.log(`[scrum] ${msg}`); }

function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
}

function chunkText(text: string): Array<{ text: string; offset: number }> {
  const out: Array<{ text: string; offset: number }> = [];
  for (let i = 0; i < text.length; ) {
    const end = Math.min(i + CHUNK_SIZE, text.length);
    const slice = text.slice(i, end).trim();
    if (slice.length > 60) out.push({ text: slice, offset: i });
    if (end >= text.length) break;
    i = end - CHUNK_OVERLAP;
  }
  return out;
}

async function embedBatch(texts: string[]): Promise<number[][]> {
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000),
  });
  if (!r.ok) throw new Error(`embed ${r.status}`);
  return (await r.json() as any).embeddings;
}

// ─── Pathway memory (2026-04-24 consensus design) ───────────────────
//
// Mirrors vectord/src/pathway_memory.rs. The bucket-hash vector MUST
// byte-match the Rust implementation so traces written from TypeScript
// are searchable against the same embedding space. Verified by running
// both implementations on the same input tokens and asserting matching
// bucket indices.

function filePrefix(path: string): string {
  return path.split("/").slice(0, 2).join("/");
}

function computePathwayId(taskClass: string, filePath: string, signalClass: string | null): string {
  const h = createHash("sha256");
  h.update(taskClass);
  h.update("|");
  h.update(filePrefix(filePath));
  h.update("|");
  h.update(signalClass ?? "");
  return h.digest("hex");
}
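// Worked example (values derivable from the two helpers above, not
// executed here): only the first two path segments reach the hash, so
// every file in the same crate shares one pathway id.
//   filePrefix("crates/vectord/src/doc_drift.rs") === "crates/vectord"
//   computePathwayId("scrum_review", "crates/vectord/src/doc_drift.rs", null)
//     === hex(sha256("scrum_review|crates/vectord|"))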
// 32-bucket L2-normalized token hash. Same algorithm as Rust.
function buildPathwayVec(tokens: string[]): number[] {
  const buckets = new Array(32).fill(0);
  for (const t of tokens) {
    const d = createHash("sha256").update(t, "utf8").digest();
    const b1 = d[0] % 32;
    const b2 = d[8] % 32;
    buckets[b1] += 1;
    buckets[b2] += 1;
  }
  let norm = 0;
  for (const v of buckets) norm += v * v;
  norm = Math.sqrt(norm);
  if (norm > 0) for (let i = 0; i < buckets.length; i++) buckets[i] /= norm;
  return buckets;
}

// Build the minimal query vector for a pre-ladder hot-swap check. We
// don't yet know the ladder attempts or KB chunks — the query vec is
// computed from what we CAN know up front: task/file/signal. This is
// a weaker embedding than the one computed at trace-insert time, but
// similarity still discriminates between task/file/signal combinations.
function buildQueryVec(taskClass: string, filePath: string, signalClass: string | null): number[] {
  const tokens = [taskClass, filePath];
  if (signalClass) tokens.push(`signal:${signalClass}`);
  return buildPathwayVec(tokens);
}

interface HotSwapCandidate {
  pathway_id: string;
  similarity: number;
  replay_count: number;
  success_rate: number;
  recommended_rung: number;
  recommended_model: string;
}

async function queryHotSwap(taskClass: string, filePath: string, signalClass: string | null): Promise<HotSwapCandidate | null> {
  try {
    const query_vec = buildQueryVec(taskClass, filePath, signalClass);
    const r = await fetch(`${GATEWAY}/vectors/pathway/query`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, query_vec }),
      signal: AbortSignal.timeout(5000),
    });
    if (!r.ok) return null;
    const j = await r.json() as { candidate: HotSwapCandidate | null };
    return j.candidate ?? null;
  } catch {
    // Pathway service unavailable → run full ladder. Hot-swap is
    // always an optimization, never a correctness requirement.
    return null;
  }
}

interface LadderAttemptRec {
  rung: number;
  model: string;
  latency_ms: number;
  accepted: boolean;
  reject_reason: string | null;
}

interface PathwayTracePayload {
  pathway_id: string;
  task_class: string;
  file_path: string;
  signal_class: string | null;
  created_at: string;
  ladder_attempts: LadderAttemptRec[];
  kb_chunks: { source_doc: string; chunk_id: string; cosine_score: number; rank: number }[];
  observer_signals: { class: string; priors: string[]; prior_iter_outcomes: string[] }[];
  bridge_hits: { library: string; version: string }[];
  sub_pipeline_calls: { pipeline: string; result_summary: string }[];
  audit_consensus: { pass: boolean; models: string[]; disagreements: number } | null;
  reducer_summary: string;
  final_verdict: string;
  pathway_vec: number[];
  // ADR-021 semantic-correctness layer. `kind` field matches the Rust
  // serde(tag = "kind") wire format — TS and Rust interop directly.
  semantic_flags: { kind: string }[];
  type_hints_used: { source: string; symbol: string; type_repr: string }[];
  bug_fingerprints: { flag: { kind: string }; pattern_key: string; example: string; occurrences: number }[];
  replay_count: number;
  replays_succeeded: number;
  retired: boolean;
}

async function writePathwayTrace(trace: PathwayTracePayload): Promise<void> {
  try {
    await fetch(`${GATEWAY}/vectors/pathway/insert`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(trace),
      signal: AbortSignal.timeout(10000),
    });
  } catch {
    // Fire-and-forget: scrum runs shouldn't fail if pathway insert fails.
  }
}
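// Illustrative trace write (hypothetical, only a few fields shown; the
// real payload carries every PathwayTracePayload field): the object is
// POSTed as-is to /vectors/pathway/insert, e.g.
//   { pathway_id: computePathwayId("scrum_review", rel, null),
//     task_class: "scrum_review", final_verdict: "accepted",
//     pathway_vec: buildPathwayVec(["scrum_review", rel]), ... }
// Failures are swallowed by design; a scrum run never blocks on it.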
// Per-model rate limiter. Persists timestamps to a JSONL file so
// caps survive process restarts (the autonomous loop spawns a new
// scrum_master subprocess per iteration; without persistence each
// iter would reset to 0). File is append-only; pruning happens at
// read time to keep writes O(1).
//
// Config: model → { perHour }. Add an entry here to cap a model.
// J directive 2026-04-25: Kimi K2.6 capped at 25/hour because the
// $4.66/M output cost would compound fast otherwise.
const MODEL_RATE_LIMITS: Record<string, { perHour: number }> = {
  "moonshotai/kimi-k2.6": { perHour: 25 },
};
// One JSON object per line, e.g. {"model":"moonshotai/kimi-k2.6","ts":1745577600000}.
const RATE_LIMIT_LOG = "/home/profit/lakehouse/data/_kb/rate_limit_calls.jsonl";

async function readRateLimitTimestamps(model: string, windowMs: number): Promise<number[]> {
  const f = Bun.file(RATE_LIMIT_LOG);
  if (!(await f.exists())) return [];
  const text = await f.text();
  const cutoff = Date.now() - windowMs;
  const ts: number[] = [];
  for (const line of text.split("\n")) {
    if (!line.trim()) continue;
    try {
      const r = JSON.parse(line);
      if (r.model === model && typeof r.ts === "number" && r.ts >= cutoff) {
        ts.push(r.ts);
      }
    } catch { /* skip malformed */ }
  }
  return ts;
}

async function checkRateLimit(model: string, perHour: number): Promise<boolean> {
  const ts = await readRateLimitTimestamps(model, 60 * 60 * 1000);
  return ts.length < perHour;
}

async function recordRateLimitCall(model: string): Promise<void> {
  const { appendFile } = await import("node:fs/promises");
  await appendFile(RATE_LIMIT_LOG, JSON.stringify({ model, ts: Date.now() }) + "\n");
}

async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promise<void> {
  try {
    await fetch(`${GATEWAY}/vectors/pathway/record_replay`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ pathway_id: pathwayId, succeeded }),
      signal: AbortSignal.timeout(5000),
    });
  } catch {
    // Fire-and-forget. Not critical.
  }
}

// Observer hand-review — the policy layer that decides whether a
// candidate response is grounded enough to accept. Lives in mcp-server's
// observer (port 3800) so it sits OUTSIDE the scrum loop's epistemic
// scope. Synchronous so scrum can act on the verdict immediately.
//
// Returns {verdict, confidence, notes}. verdict ∈ {accept, reject, cycle}.
// On unreachable observer (network, timeout, parse failure), falls open
// to {verdict: "accept"} — the observer is the policy layer, not a hard
// dependency. Pipeline keeps moving when the observer is down.
const OBSERVER_URL = process.env.LH_OBSERVER_URL ?? "http://localhost:3800";
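// Illustrative exchange (hypothetical values): POST ${OBSERVER_URL}/review
// with {file_path, model, response, source_content, grounding_stats, attempt}
//   → {"verdict":"reject","confidence":0.62,"notes":"3/9 quotes ungrounded"}
// Any transport failure degrades to {"verdict":"accept","source":"heuristic"},
// so a down observer never stalls the loop.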
// Matrix retrieval — pulls proven-success pathways for this task class
// + file area and prepends them as a "📖 PROVEN APPROACHES" preamble.
// First time the matrix index is actually USED to route work (vs being
// written to). LH_SCRUM_MATRIX_RETRIEVE=0 disables for A/B comparison.
//
// The proof J asked for: does loading prior successes change scrum
// behavior? Run with retrieval ON and OFF on the same target — if
// reviews differ measurably, the matrix has signal. If identical,
// the writes are noise and need a different design.
const MATRIX_RETRIEVE = process.env.LH_SCRUM_MATRIX_RETRIEVE !== "0";
const PATHWAY_STATE_PATH = "/home/profit/lakehouse/data/_pathway_memory/state.json";

interface ProvenApproach {
  pathway_id: string;
  file_path: string;
  accepted_model: string;
  accepted_attempt: number;
  kb_sources: string[];      // top-3 KB chunk source_doc names
  summary_excerpt: string;   // first 400 chars of reducer_summary
  created_at: string;
}

async function fetchProvenApproaches(
  taskClass: string,
  filePath: string,
  signalClass: string | null,
  limit = 3,
): Promise<ProvenApproach[]> {
  if (!MATRIX_RETRIEVE) return [];
  try {
    const f = Bun.file(PATHWAY_STATE_PATH);
    if (!(await f.exists())) return [];
    const state = JSON.parse(await f.text());
    const pathways = state.pathways ?? {};
    // Match by narrow fingerprint: same task_class + same file prefix
    // (first 2 path segments, e.g. "crates/queryd"). signal_class match
    // is preferred but not required — broader matches still inform.
    const filePrefix = filePath.split("/").slice(0, 2).join("/");
    const matched: any[] = [];
    for (const traces of Object.values(pathways) as any[][]) {
      for (const t of traces) {
        if (t.task_class !== taskClass) continue;
        if (!t.file_path?.startsWith(filePrefix)) continue;
        if (t.final_verdict !== "accepted") continue;
        if (t.retired) continue;
        matched.push(t);
      }
    }
    // Most recent first
    matched.sort((a, b) => (b.created_at ?? "").localeCompare(a.created_at ?? ""));
    return matched.slice(0, limit).map(t => {
      const acceptedAttempt = (t.ladder_attempts ?? []).find((a: any) => a.accepted) ?? { model: "unknown", rung: 0 };
      const sources = (t.kb_chunks ?? [])
        .slice(0, 3)
        .map((c: any) => c.source_doc ?? "?");
      return {
        pathway_id: t.pathway_id,
        file_path: t.file_path,
        accepted_model: acceptedAttempt.model,
        accepted_attempt: acceptedAttempt.rung,
        kb_sources: [...new Set(sources)],
        summary_excerpt: (t.reducer_summary ?? "").slice(0, 400),
        created_at: t.created_at ?? "",
      };
    });
  } catch (e: any) {
    console.error(`[scrum] matrix retrieval failed: ${e.message}`);
    return [];
  }
}

// Unified matrix retriever — pulls from ALL relevant corpora at once.
// Per J 2026-04-25: the matrix is the vector indexing layer for the whole KB,
// not just pathway memory. Returns combined top-N ranked across corpora.
//
// Smoke-test goal: prove the matrix surfaces relevant context from MULTIPLE
// indexed sources (distilled facts/procedures/config-hints + team runs +
// playbook memory + pathway successes), not just one slice.
//
// Per-corpus configuration. Add an entry to query a new corpus. Limited
// to indexes that actually contain code-review-relevant context — staffing
// data (workers_500k_*, resumes_*) is excluded by design.
const MATRIX_CORPORA_FOR_TASK: Record<string, string[]> = {
  // Code review task — distilled facts/procedures/hints from prior reviews
  // plus team-run history. Staffing data deliberately excluded.
  scrum_review: [
    "distilled_factual_v20260423095819",
    "distilled_procedural_v20260423102847",
    "distilled_config_hint_v20260423102847",
    "kb_team_runs_v1",
  ],
  // Chicago contract / permit analysis — pulls actual permits + contractor
  // entities + SEC tickers + LLM team historical reasoning + lake-house
  // distilled procedures (which encode prior task patterns).
  contract_analysis: [
    "chicago_permits_v1",
    "entity_brief_v1",
    "sec_tickers_v1",
    "llm_team_runs_v1",
    "llm_team_response_cache_v1",
    "distilled_procedural_v20260423102847",
  ],
  // Staffing inference — workers data + entity briefs + Chicago permit
  // demand signal + LLM team patterns. workers_500k_v8 is the most
  // recent dense version.
  staffing_inference: [
    "workers_500k_v8",
    "entity_brief_v1",
    "chicago_permits_v1",
    "llm_team_runs_v1",
    "distilled_procedural_v20260423102847",
  ],
};

interface MatrixHit {
  source_corpus: string;
  score: number;
  doc_id: string;
  text: string;
}

interface MatrixContext {
  hits: MatrixHit[];
  by_corpus: Record<string, number>;
  errors: Record<string, string>;
  latency_ms: number;
}

async function fetchMatrixContext(
  query: string,
  taskClass: string,
  filePath: string,
  topPerCorpus = 3,
  topOverall = 8,
): Promise<MatrixContext> {
  const t0 = Date.now();
  const corpora = MATRIX_CORPORA_FOR_TASK[taskClass] ?? [];
  const allHits: MatrixHit[] = [];
  const errors: Record<string, string> = {};
  const byCorpus: Record<string, number> = {};

  // Query persistent vector indexes in parallel
  await Promise.all(corpora.map(async (idx) => {
    try {
      const r = await fetch(`${GATEWAY}/vectors/search`, {
        method: "POST",
        headers: { "content-type": "application/json" },
        body: JSON.stringify({ index_name: idx, query, top_k: topPerCorpus }),
        signal: AbortSignal.timeout(15000),
      });
      if (!r.ok) { errors[idx] = `HTTP ${r.status}`; return; }
      const data: any = await r.json();
      const results = data.results ?? [];
      byCorpus[idx] = results.length;
      for (const h of results) {
        allHits.push({
          source_corpus: idx,
          score: Number(h.score ?? 0),
          doc_id: String(h.doc_id ?? "?"),
          text: String(h.chunk_text ?? h.text ?? "").slice(0, 400),
        });
      }
    } catch (e: any) {
      errors[idx] = e.message;
    }
  }));

  // Pull pathway successes via the existing helper, mapped into MatrixHit shape
  try {
    const proven = await fetchProvenApproaches(taskClass, filePath, null, topPerCorpus);
    byCorpus.pathway_memory = proven.length;
    for (const p of proven) {
      allHits.push({
        source_corpus: "pathway_memory",
        score: 0.6, // neutral — pathway has no cosine; used as soft mid-rank
        doc_id: p.pathway_id.slice(0, 12),
        text: `[${p.accepted_model} accepted on attempt ${p.accepted_attempt} · sources=${p.kb_sources.join(",")}]\n${p.summary_excerpt.replace(/\s+/g, " ")}`.slice(0, 400),
      });
    }
  } catch (e: any) {
    errors.pathway_memory = e.message;
  }

  // Sort all hits by score desc, take top N
  allHits.sort((a, b) => b.score - a.score);
  const topHits = allHits.slice(0, topOverall);
  return {
    hits: topHits,
    by_corpus: byCorpus,
    errors,
    latency_ms: Date.now() - t0,
  };
}
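// Illustrative merge (hypothetical scores): corpus hits carry their own
// cosine scores; pathway successes enter at the fixed 0.6 mid-rank, e.g.
//   [{ source_corpus: "kb_team_runs_v1", score: 0.83 },
//    { source_corpus: "pathway_memory",  score: 0.60 },
//    { source_corpus: "distilled_factual_v20260423095819", score: 0.41 }]
// then everything is sorted descending and cut to topOverall (default 8).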
function buildMatrixPreamble(ctx: MatrixContext): string {
  if (ctx.hits.length === 0) return "";
  const lines = [
    `═══ 📖 MATRIX-INDEXED CONTEXT (${ctx.hits.length} hits across ${Object.keys(ctx.by_corpus).length} corpora) ═══`,
    "Relevant chunks pulled from the knowledge base. Use as REFERENCE — not findings to copy. Cite specific chunks if they shape your review.",
    "",
  ];
  for (let i = 0; i < ctx.hits.length; i++) {
    const h = ctx.hits[i];
    lines.push(`[${i + 1}] ${h.source_corpus} (score=${h.score.toFixed(2)}, doc=${h.doc_id}): ${h.text.replace(/\s+/g, " ").trim()}`);
  }
  lines.push("═══");
  lines.push("");
  return lines.join("\n");
}

function buildProvenApproachesPreamble(approaches: ProvenApproach[]): string {
  if (approaches.length === 0) return "";
  const lines = [
    "═══ 📖 PROVEN APPROACHES — PRIOR ACCEPTED REVIEWS ON THIS FILE AREA ═══",
    "These are reviews that previously passed observer hand-review on the same task class + file prefix.",
    "Use them as REFERENCE PATTERNS for what a strong review looks like — not as findings to copy.",
    "",
  ];
  for (let i = 0; i < approaches.length; i++) {
    const a = approaches[i];
    lines.push(`Approach ${i + 1} · file=${a.file_path} · model=${a.accepted_model} · sources=[${a.kb_sources.join(", ")}]`);
    lines.push(`  excerpt: ${a.summary_excerpt.replace(/\s+/g, " ").trim()}`);
    lines.push("");
  }
  lines.push("═══");
  lines.push("");
  return lines.join("\n");
}

interface ObserverVerdict {
  verdict: "accept" | "reject" | "cycle";
  confidence?: number;
  notes?: string;
  source?: "cloud" | "heuristic";
}

async function observerHandReview(input: {
  file_path: string;
  model: string;
  response: string;
  source_content: string;
  grounding_stats: { total: number; grounded: number; groundedPct: number | null };
  attempt: number;
}): Promise<ObserverVerdict> {
  try {
    const r = await fetch(`${OBSERVER_URL}/review`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(input),
      signal: AbortSignal.timeout(60000),
    });
    if (!r.ok) {
      // Observer down or rejected the request — fall open to accept so
      // the loop keeps moving. Log so we notice degradation.
      console.error(`[scrum] observer review unreachable (${r.status}), falling open to accept`);
      return { verdict: "accept", notes: `observer ${r.status}`, source: "heuristic" };
    }
    return (await r.json()) as ObserverVerdict;
  } catch (e: any) {
    console.error(`[scrum] observer review failed (${e.message}), falling open to accept`);
    return { verdict: "accept", notes: `observer error: ${e.message}`, source: "heuristic" };
  }
}

// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
// fingerprints for this narrow fingerprint (same key as hot-swap —
// task_class + file_prefix + signal_class) so the reviewer prompt
// can explicitly warn "this file area has had these bug patterns
// before." Empty on fresh install; grows as the matrix index learns.
interface BugFingerprintRow {
  flag: { kind: string };
  pattern_key: string;
  example: string;
  occurrences: number;
}

async function fetchBugFingerprints(taskClass: string, filePath: string, signalClass: string | null, limit: number): Promise<BugFingerprintRow[]> {
  try {
    const r = await fetch(`${GATEWAY}/vectors/pathway/bug_fingerprints`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, limit }),
      signal: AbortSignal.timeout(5000),
    });
    if (!r.ok) return [];
    const j = await r.json() as { fingerprints: BugFingerprintRow[] };
    return j.fingerprints ?? [];
  } catch {
    return [];
  }
}
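// Illustrative fingerprint row (hypothetical values):
//   { flag: { kind: "UnitMismatch" }, pattern_key: "rows_minus_files",
//     example: "base_rows = row_count - file_count", occurrences: 3 }
// reviewFile() prepends rows like this as a "check for recurrences"
// preamble so bug classes compound across iterations.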
// Deterministic signal_class lookup from scrum_reviews.jsonl history.
// First-time files get `null`. Files seen before get the signal class
// the observer assigned on their most-recent review (if any). Keeps the
// pathway fingerprint stable across iterations for LOOPING files.
async function lookupSignalClass(filePath: string): Promise<string | null> {
  try {
    const { readFile } = await import("node:fs/promises");
    const raw = await readFile(SCRUM_REVIEWS_JSONL, "utf8").catch(() => "");
    if (!raw) return null;
    const lines = raw.trim().split("\n").reverse();
    for (const line of lines) {
      try {
        const r = JSON.parse(line);
        if (r.file === filePath && r.signal_class) return r.signal_class;
      } catch {}
    }
    return null;
  } catch {
    return null;
  }
}

async function chat(opts: {
  provider: "ollama" | "ollama_cloud" | "openrouter",
  model: string,
  prompt: string,
  max_tokens?: number,
}): Promise<{ content: string; error?: string; prompt_tokens: number; completion_tokens: number }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: opts.max_tokens ?? 1500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(180000),
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 200)}`, prompt_tokens: 0, completion_tokens: 0 };
    const j: any = await r.json();
    return {
      content: j.choices?.[0]?.message?.content ?? "",
      prompt_tokens: j.usage?.prompt_tokens ?? 0,
      completion_tokens: j.usage?.completion_tokens ?? 0,
    };
  } catch (e) {
    return { content: "", error: (e as Error).message, prompt_tokens: 0, completion_tokens: 0 };
  }
}

// Accept a file-review answer if it's substantive + structured.
// We're not validating Rust here — we're validating that the model
// produced a coherent suggestion set.
//
// BLIND-RESPONSE GUARD (added after iter 4 regression on llm-team-ui):
// Some models pretend the source code wasn't supplied even when it was —
// they produce structurally-valid JSON with one critical_failure of the
// form "No source code visible; cannot verify..." Those should be
// rejected so the ladder cycles to the next rung. We check for a small
// set of telltale phrases inside critical_failures descriptions.
function isBlindResponse(answer: string): boolean {
  // Cheap substring match on the descriptions area of the JSON
  const blindPhrases = [
    /no source code (visible|provided|supplied)/i,
    /cannot (view|see|verify|access) (the )?source/i,
    /no code (was )?(visible|provided|supplied|attached)/i,
    /unable to (view|access|read) (the )?(source|file|code)/i,
    /source (code )?was not (provided|supplied|attached|included)/i,
  ];
  return blindPhrases.some((re) => re.test(answer));
}

// Anchor-grounding verifier — runs after a review is accepted (only
// when tree-split fired, since small files don't need it). Extracts
// every backtick-quoted code snippet from the review and checks
// whether it appears in the original source content. Returns the
// stats + a footer that gets appended to the review so humans can
// audit grounding rate at a glance.
//
// Why: 2026-04-24 verification of llm_team_ui.py (13K lines, 61 shards)
// showed 0/10 findings real, 6/10 hallucinated. Model invented
// `render_template_string(f"{user}")`, `logger.exception(e)`,
// SHA-256 password hashing — none of which existed in the actual
// source. The reviewer wrote what *fit* the PRD's worry-list rather
// than what the code actually does. This verifier catches that.
function verifyAnchorGrounding(answer: string, sourceContent: string) {
  // Pull both inline `quoted` and triple-fenced ```quoted``` snippets.
  // Skip very short ones (≤ 3 chars — they false-match too easily on
  // common tokens like `a` or `if`).
  const inline = [...answer.matchAll(/`([^`\n]{4,})`/g)].map((m) => m[1]);
  const fenced = [...answer.matchAll(/```(?:[a-z]+\n)?([\s\S]+?)```/g)]
    .map((m) => m[1].trim())
    .flatMap((b) => b.split("\n"))
    .map((l) => l.trim())
    .filter((l) => l.length >= 6);
  const allQuotes = [...new Set([...inline, ...fenced])];
  const grounded: string[] = [];
  const ungrounded: string[] = [];
  const sourceLower = sourceContent.toLowerCase();
  // The model often emits the review wrapped in a JSON envelope, so
  // backtick-quoted snippets have their internal `"` escaped as `\"`,
  // `\n` as `\\n`, etc. Try the unescaped variant too; if that's in the
  // source, consider it grounded. Also normalize curly quotes to ASCII
  // since some models smart-quote string literals.
  const unescapeJsonish = (s: string) => s
    .replace(/\\"/g, '"')
    .replace(/\\'/g, "'")
    .replace(/\\n/g, "\n")
    .replace(/\\t/g, "\t")
    .replace(/\\\\/g, "\\")
    .replace(/[“”]/g, '"')
    .replace(/[‘’]/g, "'");
  for (const q of allQuotes) {
    // Strip leading offset markers like "@123456" the anchors carry
    const cleaned = q.replace(/^@\d+\s*/, "").trim();
    if (cleaned.length < 4) continue;
    const candidates = [cleaned, unescapeJsonish(cleaned)];
    const hit = candidates.some((c) => sourceLower.includes(c.toLowerCase()));
    if (hit) grounded.push(cleaned);
    else ungrounded.push(cleaned);
  }
  const total = grounded.length + ungrounded.length;
  const groundedPct = total > 0 ? Math.round((grounded.length / total) * 100) : null;
  return { total, grounded: grounded.length, ungrounded, groundedPct };
}

function appendGroundingFooter(
  answer: string,
  stats: ReturnType<typeof verifyAnchorGrounding>,
): string {
  const lines = [
    "",
    "─── ANCHOR GROUNDING (post-process verifier) ───",
    `Backtick-quoted snippets: ${stats.total}`,
    `Grounded in source (literal substring match): ${stats.grounded}` +
      (stats.groundedPct !== null ? ` (${stats.groundedPct}%)` : ""),
    `Ungrounded (likely hallucinated, treat findings using these as low-confidence):`,
  ];
  if (stats.ungrounded.length === 0) {
    lines.push("  (none — every quoted snippet matches the source verbatim)");
  } else {
    for (const u of stats.ungrounded.slice(0, 12)) {
      lines.push(`  · \`${u.slice(0, 80)}\``);
    }
    if (stats.ungrounded.length > 12) {
      lines.push(`  · ... and ${stats.ungrounded.length - 12} more`);
    }
  }
  lines.push("─────────────────────────────────────────────────");
  return answer + "\n" + lines.join("\n");
}
function isAcceptable(answer: string): boolean {
  if (answer.length < 200) return false;       // too thin
  if (isBlindResponse(answer)) return false;   // hallucinated "no source"
  // Two accepted shapes:
  // (a) Markdown — bullets, numbered list, or headers. Original shape.
  // (b) Forensic JSON — `{"verdict":"..."}` with at least one of the
  //     finding arrays populated. SCRUM_FORENSIC_PROMPT.md requires
  //     this shape; the previous version rejected it because the first
  //     character is `{`, not `-`/`#`/`1.`. Iter-2 observation in
  //     SCRUM_LOOP_NOTES flagged this as `[FORENSIC vs thin-detector
  //     mismatch]` — this is the fix.
  const hasMarkdownStructure =
    /^\s*[-*]\s/m.test(answer) ||
    /^\s*\d+\.\s/m.test(answer) ||
    /^\s*#/m.test(answer);
  if (hasMarkdownStructure) return true;
  // Accept JSON verdict shape even without surrounding markdown.
  // Check for a `"verdict"` key and at least one populated finding
  // array — empty objects still fail.
  if (/"verdict"\s*:\s*"(pass|fail|needs_patch)"/i.test(answer)) {
    const hasFindings = /"(critical_failures|pseudocode_flags|prd_mismatches|broken_pipelines|missing_components|risk_points|verified_components|required_next_actions)"\s*:\s*\[\s*\{/.test(answer);
    if (hasFindings) return true;
  }
  return false;
}

function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
  return pool
    .map(c => ({ c, score: cosine(query_emb, c.embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, k)
    .map(x => ({ ...x.c, _score: x.score } as any));
}
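// Illustrative (hypothetical pool): retrieveTopK(emb, pool, 3) scores every
// chunk by cosine against `emb`, sorts descending, and returns the top 3
// with the score smuggled along as `_score`, which reviewFile() reads back
// when recording top_prd_chunks / top_proposal_chunks.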
// File substrate — replaces the original tree-split summarize/reduce
// architecture. The original was lossy: the model paraphrased shards into
// prose, paraphrase-of-paraphrase fed the reviewer, and the reviewer
// hallucinated against the PRD worry-list (verified 2026-04-24: 0/10 real
// findings on llm_team_ui.py, 13K lines).
//
// The substrate approach (J's redesign, 2026-04-24):
//
// 1. ANCHORS zone — deterministic regex extraction of literally-
//    suspicious lines (route defs, auth calls, SQL, secrets, exception
//    handlers, env access). No LLM, no paraphrasing. Reviewer can
//    quote any anchor verbatim.
//
// 2. NEIGHBORS zone — the file is chunked line-aware and embedded
//    via the sidecar's nomic-embed-text. For every relevant PRD
//    chunk, we hybrid-retrieve the top-K matching FILE chunks.
//    Reviewer sees actual code regions semantically close to each
//    PRD worry-area, not summaries.
//
// 3. RANGE-LOOKUP zone — full-file kept in memory; reviewer can
//    ask for byte-exact ranges. (Currently surfaced via the verifier
//    check — every backtick-quoted snippet must literal-match the
//    source. Future: tool-call interface for in-prompt range fetch.)
//
// All three zones feed the reviewer with grounded code, not paraphrased
// distillation.
interface AnchorLine {
  byte_offset: number;
  line_no: number;
  text: string;
  kind: string;
}

interface FileChunk {
  byte_offset: number;
  line_from: number;
  line_to: number;
  text: string;
  embedding: number[];
}

interface FileSubstrate {
  anchors: AnchorLine[];
  chunks: FileChunk[];
  queryFile: (emb: number[], k: number) => FileChunk[];
}

// Deterministic regex extraction of reviewer-relevant lines. Each
// pattern targets a class of risk surface common to web services:
// auth, SQL, secrets, templating, HTTP routing, exception flow.
const ANCHOR_PATTERNS: Array<{ kind: string; re: RegExp }> = [
  { kind: "route", re: /^@\w+\.route\s*\(/m },
  { kind: "func_def", re: /^\s*(async\s+)?def\s+\w+\s*\(/m },
  { kind: "class_def", re: /^class\s+\w+/m },
  { kind: "import", re: /^(from\s+\S+\s+import|import\s+\S+)/m },
  { kind: "auth_decorator", re: /@(login_required|admin_required|api_key_required|require_\w+)/ },
  { kind: "sql_exec", re: /\.\s*execute\s*\(/ },
  { kind: "f_string_sql", re: /f["'][^"']*\b(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP)\b/i },
  { kind: "secret", re: /(secret|api_key|token|password|FLASK_SECRET|DB_URL)/i },
  { kind: "template", re: /render_template(_string)?\s*\(/ },
  { kind: "exception", re: /\bexcept\s+\w+/ },
  { kind: "env_access", re: /os\.(environ|getenv)\b/ },
  { kind: "rate_limit", re: /(rate_limit|limiter|RateLimit)/ },
  { kind: "subprocess", re: /\b(subprocess|os\.system|exec\s*\(|eval\s*\()/ },
  { kind: "todo", re: /\b(TODO|FIXME|XXX|HACK)\b/ },
];

function extractAnchors(content: string): AnchorLine[] {
  const lines = content.split("\n");
  const anchors: AnchorLine[] = [];
  let byteCursor = 0;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineByte = byteCursor;
    byteCursor += line.length + 1; // +1 for the \n
    const trimmed = line.trim();
    if (trimmed.length === 0 || trimmed.length > 240) continue;
    for (const p of ANCHOR_PATTERNS) {
      if (p.re.test(line)) {
        anchors.push({
          byte_offset: lineByte,
          line_no: i + 1,
          text: line.length > 200 ? line.slice(0, 200) + "…" : line,
          kind: p.kind,
        });
        break; // first matching kind wins; one line, one anchor entry
      }
    }
  }
  return anchors;
}

// Line-aware chunker. Targets ~800-char chunks but won't split a line.
// Each chunk records the line range so the reviewer can cite "lines N-M".
function chunkFileLineAware(content: string, target = 800): Array<{
  byte_offset: number;
  line_from: number;
  line_to: number;
  text: string;
}> {
  const lines = content.split("\n");
  const chunks: Array<{ byte_offset: number; line_from: number; line_to: number; text: string }> = [];
  let buf: string[] = [];
  let bufBytes = 0;
  let chunkStartByte = 0;
  let chunkStartLine = 1;
  let byteCursor = 0;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineLen = line.length + 1;
    if (bufBytes + lineLen > target && buf.length > 0) {
      chunks.push({
        byte_offset: chunkStartByte,
        line_from: chunkStartLine,
        line_to: i,
        text: buf.join("\n"),
      });
      buf = [];
      bufBytes = 0;
      chunkStartByte = byteCursor;
      chunkStartLine = i + 1;
    }
    buf.push(line);
    bufBytes += lineLen;
    byteCursor += lineLen;
  }
  if (buf.length > 0) {
    chunks.push({
      byte_offset: chunkStartByte,
      line_from: chunkStartLine,
      line_to: lines.length,
      text: buf.join("\n"),
    });
  }
  return chunks;
}
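// Worked example (hypothetical file): with 50-char lines (51 bytes counting
// the newline) and the 800-char target, 15 lines fit per chunk
// (15 × 51 = 765; a 16th would overflow), so the output looks like
//   { byte_offset: 0,   line_from: 1,  line_to: 15, text: "…" }
//   { byte_offset: 765, line_from: 16, line_to: 30, text: "…" }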
async function buildFileSubstrate(filePath: string, content: string): Promise<FileSubstrate> {
  const anchors = extractAnchors(content);
  const rawChunks = chunkFileLineAware(content, 800);
  log(` substrate: ${anchors.length} anchors · ${rawChunks.length} chunks (line-aware, 800-char target)`);
  // Embed chunks in batches of 64 (nomic-embed-text handles this well).
  const embeddings: number[][] = [];
  const BATCH = 64;
  for (let i = 0; i < rawChunks.length; i += BATCH) {
    const batch = rawChunks.slice(i, i + BATCH).map((c) => c.text);
    const embs = await embedBatch(batch);
    embeddings.push(...embs);
  }
  const chunks: FileChunk[] = rawChunks.map((c, i) => ({ ...c, embedding: embeddings[i] }));
  return {
    anchors,
    chunks,
    queryFile: (emb: number[], k: number) =>
      chunks
        .map((c) => ({ c, score: cosine(emb, c.embedding) }))
        .sort((a, b) => b.score - a.score)
        .slice(0, k)
        .map((x) => x.c),
  };
}

// Tree-split a large file: shard it, summarize each shard into a
// running scratchpad, THEN run a reduce step that collapses the
// scratchpad into one file-level synthesis with shard boundaries
// stripped. Returns the synthesis (not the raw scratchpad) so the
// final reviewer never sees "--- shard N ---" markers and can't
// leak them into its review title.
//
// Phase 21 design (aibridge/src/tree_split.rs) with the map → reduce
// shape. An earlier version concatenated per-shard digests directly into
// the reviewer prompt, which led to kimi-k2:1t writing review titles
// like "Forensic Audit Report – file.rs (shard 3)" because the shard
// markers bled through. Fix 2026-04-24 adds the reduce step.
//
// DEPRECATED 2026-04-24: superseded by buildFileSubstrate() above.
// Kept temporarily as a fallback if substrate ingestion fails.
async function treeSplitFile(
  filePath: string,
  content: string,
): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
  const shards: Array<{ from: number; to: number; text: string }> = [];
  for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
    const end = Math.min(i + FILE_SHARD_SIZE, content.length);
    shards.push({ from: i, to: end, text: content.slice(i, end) });
  }

  // MAP — each shard digests independently. Previously the prompt
  // carried the accumulating scratchpad of all prior shard outputs,
  // which made MAP cost O(n²) in shard count AND forced late shards
  // to fight for context-window space against the prior notes (on a
  // 209-shard file, the prior-notes block alone hit ~40K tokens). The
  // cost/budget fix: each shard sees only its own text. The reducer
  // integrates the cross-shard view, not MAP.
  //
  // Instruction also changed to require SPECIFIC line/byte markers
  // and identifiers — the previous "flat facts" framing produced generic
  // prose summaries where "line 9959: model_sets default contains
  // mistral:latest" collapsed to "the file routes to local models".
  // Scrum iter 11 observation: fine-grained fixes vanished from the
  // reviewer's view because specific-line detail didn't survive MAP.
  let workingScratchpad = "";
  let cloud_calls = 0;
  log(` tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
  for (const [si, shard] of shards.entries()) {
    const prompt = `You are writing a SECTION of a full-file summary. File: ${filePath}.
This is one piece (bytes ${shard.from}..${shard.to}) of a larger source file.

─────── source ───────
${shard.text}
─────── end source ───────

Output two parts in order:

PART A — Flat-bullet digest (≤200 words):
- Every function, struct, class, or public type by name with one-line purpose.
- Every hardcoded default, literal, or model name a caller might override.
- Every TODO, FIXME, placeholder, or stub return.
- Every exception handler and what it swallows vs re-raises.
Do NOT say "this section" or "this shard".
PART B — VERBATIM ANCHORS (REQUIRED — 5 to 10 lines copied character-perfect from the source above):
Format each as a code-fenced block with the byte offset within the shard:
\`\`\`
@${shard.from}+OFFSET
EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE
\`\`\`
Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`),
function signatures, security-sensitive calls (auth/SQL/exec/template/secrets),
hardcoded credentials/defaults, exception handlers, sensitive imports.
The reviewer will REFUSE to act on any claim not backed by a verbatim anchor —
so anchors are how you prove findings are real.`;
    const r = await chat({ provider: "ollama_cloud", model: "gpt-oss:120b", prompt, max_tokens: 900 });
    cloud_calls += 1;
    if (r.content) {
      // Keep internal alignment markers with byte offsets so the
      // reducer can correlate findings back to file regions.
      // Scratchpad shape: "\n§bytes 0..3500§\n<PART A digest + PART B anchors>".
      workingScratchpad += `\n§bytes ${shard.from}..${shard.to}§\n${r.content.trim()}`;
    }
  }

  // REDUCE — collapse the per-shard digests into one coherent
  // file-level summary. The reducer sees all digests at once and
  // produces a single narrative the reviewer can treat as "the file".
  // Shard markers are NOT in the output. This is what fixes the
  // shard-leakage bug that affected both the scrum and the auditor.
  //
  // REDUCE is the one place where the cross-shard view comes together.
  // Previous max_tokens=900 asked for a 40K-token → 900-token compression,
  // which destroyed specific line references. Raised to 2400, and the
  // prompt now explicitly requires preserving byte-offset markers and
  // concrete literals (hardcoded model names, line snippets, TODOs)
  // so fine-grained findings actually survive to the reviewer.
  //
  // Fix for shard-leakage: the reducer output is the SINGLE source
  // the reviewer sees as "the file" — per prior iter 3 observation
  // ("tree_split_fired:true is supposed to mean reducer-merged summary").
  const reducePrompt = `You are producing a SINGLE coherent file-level summary of a source file from byte-addressed piece notes.
Each piece note has TWO parts: a prose digest (PART A) and VERBATIM ANCHORS (PART B — code-fenced blocks with @offset markers and literal source lines).

FILE: ${filePath} (${content.length} bytes, ${shards.length} pieces)

PIECE NOTES:
${workingScratchpad}

Produce ONE coherent output with TWO sections:

═══ NARRATIVE ═══
- One-sentence purpose of the file.
- All public types / functions / constants with byte-offset markers like §bytes 24500..28000§.
- Every hardcoded default, model name, or literal a caller might override — keep the EXACT string.
- Every TODO / FIXME / stub return / placeholder.
- Every exception handler and what it does with the error.
- Obvious invariants.
Under 1200 words. Do NOT mention "piece N" or "section".

═══ VERBATIM ANCHORS ═══
COPY EVERY anchor block from the piece notes IN ORDER, character-perfect.
DO NOT paraphrase. DO NOT shorten. DO NOT skip any.
The reviewer will use these to ground findings — if you elide one, real risks become invisible.
Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between.
The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`;
  const reduced = await chat({ provider: "ollama_cloud", model: "gpt-oss:120b", prompt: reducePrompt, max_tokens: 2400 });
  cloud_calls += 1;
  const synthesis = reduced.content?.trim() ?? "";
  // Safety: if the reducer returned thin output, fall back to the
  // raw scratchpad — with byte markers preserved, since the reviewer
  // benefits from offsets regardless of whether they're inside the
  // reducer's narrative or the raw per-piece bullets.
  const final = synthesis.length > 200 ? synthesis : workingScratchpad.trim();
  return { scratchpad: final, shards: shards.length, cloud_calls };
}

async function reviewFile(
  filePath: string,
  prd_chunks: Chunk[],
  proposal_chunks: Chunk[],
): Promise<FileReview> {
  const t0 = Date.now();
  log(`file: ${filePath}`);
  const content = await readFile(filePath, "utf8");
  const rel = filePath.replace("/home/profit/lakehouse/", "");

  // Build a query embedding from the first ~800 chars of the file
  // (good enough for topical retrieval).
  const seed = content.slice(0, 800);
  const [seedEmb] = await embedBatch([seed]);
  const topPrd = retrieveTopK(seedEmb, prd_chunks, TOP_K_CONTEXT);
  const topPlan = retrieveTopK(seedEmb, proposal_chunks, TOP_K_CONTEXT);
  log(` retrieved ${topPrd.length} PRD chunks + ${topPlan.length} proposal chunks`);

  const contextBlock = [
    "═══ RELEVANT PRD EXCERPTS ═══",
    ...topPrd.map(c => `[PRD @${c.offset}]\n${c.text.slice(0, 600)}`),
    "",
    "═══ RELEVANT CHANGE PROPOSAL EXCERPTS ═══",
    ...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
  ].join("\n\n");

  // Files bigger than FILE_TREE_SPLIT_THRESHOLD trigger the substrate
  // path: deterministic anchor extraction + per-file vector index.
  // Reviewer sees three zones:
  //   ANCHORS — verbatim suspicious lines (regex-extracted, never paraphrased)
  //   NEIGHBORS — top-K file chunks retrieved per PRD chunk via cosine
  //   PRD/PLAN — already retrieved, kept as-is
  // No LLM-paraphrased prose is shown. Reviewer is required to quote
  // anchors or chunks verbatim; the verifier drops findings whose backtick-
  // quoted snippets don't appear in the original source.
  let sourceForPrompt: string;
  let treeSplitFired = false;
  let shardsSummarized = 0;
  let extraCloudCalls = 0;
  let substrateAnchorBlock = "";
  let substrateRetrievedBlock = "";
  if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
    treeSplitFired = true;
    const sub = await buildFileSubstrate(rel, content);
    shardsSummarized = sub.chunks.length;
    // ANCHORS zone — pick representative anchors per kind, cap to
    // MAX_ANCHORS (14 kinds × PER_KIND 4 can yield up to 56 candidates,
    // cut to 40) to keep the block readable.
    const byKind = new Map<string, AnchorLine[]>();
    for (const a of sub.anchors) {
      const arr = byKind.get(a.kind) || [];
      arr.push(a);
      byKind.set(a.kind, arr);
    }
    const balanced: AnchorLine[] = [];
    const PER_KIND = 4;
    const MAX_ANCHORS = 40;
    for (const [, arr] of byKind) balanced.push(...arr.slice(0, PER_KIND));
    balanced.sort((a, b) => a.byte_offset - b.byte_offset);
    const trimmedAnchors = balanced.slice(0, MAX_ANCHORS);
    substrateAnchorBlock = trimmedAnchors
      .map((a) => `[L${a.line_no} @byte ${a.byte_offset} kind=${a.kind}]\n${a.text}`)
      .join("\n\n");
    log(` substrate anchors selected: ${trimmedAnchors.length}/${sub.anchors.length}`);
    // NEIGHBORS zone — for each top PRD chunk, pull the top-2 file
    // chunks that semantically match it. Surfaces the actual code
    // regions the PRD's worry-areas point at. Dedup by byte_offset.
    const seen = new Set<number>();
    const neighbors: FileChunk[] = [];
    for (const prdChunk of topPrd) {
      const top = sub.queryFile(prdChunk.embedding, 2);
      for (const fc of top) {
        if (seen.has(fc.byte_offset)) continue;
        seen.add(fc.byte_offset);
        neighbors.push(fc);
      }
    }
    substrateRetrievedBlock = neighbors
      .slice(0, 8)
      .map((c) => `[lines ${c.line_from}-${c.line_to} @byte ${c.byte_offset}]\n${c.text}`)
      .join("\n\n──\n\n");
    log(` substrate neighbors retrieved: ${neighbors.length} (showing top 8)`);
    sourceForPrompt =
      `═══ ANCHORS (verbatim source lines extracted by regex — quotable) ═══\n${substrateAnchorBlock}\n\n` +
      `═══ NEIGHBORS (file chunks retrieved by similarity to PRD worry-areas — quotable) ═══\n${substrateRetrievedBlock}`;
  } else {
    sourceForPrompt = content;
  }

  // Prompt — when tree-split fired, include an explicit instruction
  // not to claim a field/function is "missing", because the scratchpad
  // is a distillation, not the full file. Plus a hard quote-or-die
  // requirement: every finding MUST quote a literal string from the
  // VERBATIM ANCHORS section. Without this, big-file reviews
  // hallucinate against the PRD worry-list (verified 2026-04-24:
  // 0/10 real findings on 13K-line llm_team_ui.py). The
  // post-acceptance verifier (verifyAnchorGrounding) drops findings
  // whose backtick-quoted strings don't appear in the original
  // source — last-line defense against confabulation.
  const truncationWarning = treeSplitFired
    ? `\nIMPORTANT: this is a LARGE file (${content.length} bytes / ${shardsSummarized} chunks). You are NOT seeing the full raw source. You are seeing TWO grounded zones:

ANCHORS — regex-extracted verbatim source lines (route defs, auth calls, SQL, secrets, exception handlers, etc.) with line numbers and byte offsets. Every line is character-perfect.

NEIGHBORS — file chunks retrieved by cosine-similarity to each relevant PRD excerpt. Every chunk is character-perfect source code.

QUOTE-OR-DIE RULE — NON-NEGOTIABLE:
EVERY finding you list MUST include a backtick-quoted snippet of literal source text drawn from the ANCHORS or NEIGHBORS zones. If you cannot quote literal source for a claim, DO NOT make the claim. Generic "this file lacks X" is NOT acceptable when X isn't visibly absent from the anchors/neighbors you can see — instead, if you suspect X is absent, write "could not verify presence of X in retrieved zones" with low confidence rather than asserting it as a critical failure.

The pipeline runs a post-process verifier that drops findings whose quoted code doesn't appear in the original source byte-for-byte. Make every claim grounded.\n`
    : "";
  const forensicPrefix = FORENSIC_PREAMBLE ? `${FORENSIC_PREAMBLE}\n\n═══ FILE UNDER AUDIT ═══\n\n` : "";
  const baseTask = `${forensicPrefix}You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.

FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
${truncationWarning}
─────── source ───────
${sourceForPrompt}
─────── end source ───────

${contextBlock}

Produce a structured review with:
1. Alignment score (1-10) between this file and the PRD intent
2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change
3. Any gap where this file's behavior contradicts the PRD or the proposal
${FORENSIC_PREAMBLE ? "4. Apply the forensic audit passes from the preamble: pseudocode detection, PRD contract status, normalization/validation pipeline, failure→repair loop, execution memory, relevance orchestration, execution safety, testing evidence. Issue a verdict pass|needs_patch|fail." : ""}

**Per-finding confidence (required on every suggestion):**
Attach a self-assessed **Confidence: NN%** to every suggested change AND every gap you list. The percentage is your belief that the suggestion is correct, will compile, and lands the PRD intent. Calibration guide:
- 90-100%: pattern seen repeatedly in shipped code; change is mechanical; low risk of regressions
- 70-89%: confident in direction, some room for interpretation on API shape or naming
- 50-69%: plausible fix but may not match existing conventions or may cascade to other files
- <50%: genuinely uncertain — include regardless so downstream knows to investigate before applying

Format each finding as: \`**1.** <finding>. **Confidence: NN%.**\` (in tables, add a final "Confidence" column.) Low confidence is valuable signal — do not round up.

**Per-finding semantic-flag tag (ADR-021, required on every finding):**
Also attach a \`**Flag: <category>**\` on each finding so the pathway-memory matrix index can cluster bug classes over time. Pick the ONE tag that best fits; if none fits, use \`None\`. Allowed categories:
- \`UnitMismatch\` — operation combines values with different units (e.g. row_count - file_count, bytes - rows)
- \`TypeConfusion\` — same type, wrong role (e.g. treating a PK as a row index)
- \`NullableConfusion\` — unwrap-without-check or nullable-treated-as-non-null
- \`OffByOne\` — loop / range / slice boundary mistake
- \`StaleReference\` — calls a deprecated / removed / moved symbol
- \`PseudoImpl\` — stub / todo!() / function named for work it doesn't do
- \`DeadCode\` — unreachable or uncalled code
- \`WarningNoise\` — compiles green but would add a cargo warning
- \`BoundaryViolation\` — crosses a crate/layer boundary it shouldn't
- \`None\` — improvement or nicety that doesn't fit a bug category
In tables, add a "Flag" column. Examples:
\`**1.** Rewrite base_rows calc. **Confidence: 90%.** **Flag: UnitMismatch.**\`
\`**2.** Extract retry loop. **Confidence: 75%.** **Flag: None.**\`

Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`;

  const history: FileReview["attempts_history"] = [];
  let accepted: string | null = null;
  let acceptedModel = "";
  let acceptedOn = 0;

  // Pathway hot-swap pre-check. If a proven pathway exists for this
  // (task, file_prefix, signal) combo with ≥3 replays at ≥80% success,
  // skip the ladder and try its winning rung first. On success we
  // record a positive replay; on failure we fall through to the full
  // ladder and record a negative replay. Fire-and-forget — pathway
  // service unavailable → null candidate → business as usual.
  const signalClass = await lookupSignalClass(rel);
  const taskClass = "scrum_review";
  const hotSwap = await queryHotSwap(taskClass, rel, signalClass);
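  // Illustrative candidate (hypothetical values): a pathway with enough
  // replay history, e.g.
  //   { recommended_model: "x-ai/grok-4.1-fast", recommended_rung: 1,
  //     similarity: 0.91, replay_count: 5, success_rate: 0.80 }
  // moves that model to the front of ladderOrder below; the remaining
  // rungs keep their relative order.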
  // ADR-021 Phase C: pre-review enrichment. Pull aggregated bug
  // fingerprints the matrix index has learned for this narrow
  // fingerprint and prepend to the reviewer prompt as historical
  // context. This is the compounding mechanism — the iter-N reviewer
  // sees what iter-(N-1) and earlier found, so the grammar of bugs
  // accumulates instead of being re-discovered each iteration.
  const pastFingerprints = await fetchBugFingerprints(taskClass, rel, signalClass, 5);
  let pathwayPreamble = "";
  if (pastFingerprints.length > 0) {
    pathwayPreamble =
      "═══ PATHWAY MEMORY — BUGS PREVIOUSLY FOUND ON THIS FILE AREA (ADR-021) ═══\n" +
      "The matrix index has flagged these patterns on the same task_class + file_prefix + signal_class before. Check this file for recurrences of the same shape:\n\n" +
      pastFingerprints.map((fp, i) =>
        `${i + 1}. [${fp.flag.kind}] pattern=\`${fp.pattern_key}\` occurrences=${fp.occurrences}\n   example: ${fp.example.slice(0, 160)}`
      ).join("\n") +
      "\n═══\n\n";
    log(` 📚 pathway memory: ${pastFingerprints.length} historical bug pattern(s) prepended to prompt`);
  }

  // Unified matrix-indexed retrieval — pulls from ALL relevant KB
  // corpora (distilled facts/procedures/config-hints + team runs +
  // pathway successes). LH_SCRUM_MATRIX_RETRIEVE=0 disables for A/B.
  let provenApproachesPreamble = "";
  if (MATRIX_RETRIEVE) {
    // Query text combines task framing + file path + first chunk of
    // source so retrieval anchors against both the metadata and the
    // actual code being reviewed.
    const matrixQuery = `${taskClass} ${rel} ${content.slice(0, 500)}`;
    const matrixCtx = await fetchMatrixContext(matrixQuery, taskClass, rel, 3, 8);
    provenApproachesPreamble = buildMatrixPreamble(matrixCtx);
    const corporaSummary = Object.entries(matrixCtx.by_corpus)
      .map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");
    const errSummary = Object.keys(matrixCtx.errors).length > 0
      ? ` errors=[${Object.entries(matrixCtx.errors).map(([k, v]) => `${k}:${v}`).join(", ")}]`
      : "";
    log(` 📖 matrix: ${matrixCtx.hits.length} hits in ${matrixCtx.latency_ms}ms · ${corporaSummary}${errSummary}`);
  } else {
    log(` 📖 matrix retrieval: DISABLED (LH_SCRUM_MATRIX_RETRIEVE=0)`);
  }

  let hotSwapOrderedIndices: number[] | null = null;
  if (hotSwap) {
    // Reorder the ladder to try the recommended model first. Rung
    // indices are preserved in the output so the trace still reflects
    // the true ladder position the model sits at.
    const recommendedIdx = LADDER.findIndex(r => r.model === hotSwap.recommended_model);
    if (recommendedIdx >= 0) {
      log(` 🔥 hot-swap candidate: ${hotSwap.recommended_model} (rung ${hotSwap.recommended_rung}, sim=${hotSwap.similarity.toFixed(3)}, success_rate=${hotSwap.success_rate.toFixed(2)}, ${hotSwap.replay_count} replays)`);
      hotSwapOrderedIndices = [recommendedIdx, ...LADDER.map((_, i) => i).filter(i => i !== recommendedIdx)];
    }
  }
  const ladderOrder = hotSwapOrderedIndices ?? LADDER.map((_, i) => i);

  // Collect attempts for the pathway trace sidecar.
  const pathwayAttempts: LadderAttemptRec[] = [];

  // Single-model strategy with same-model retry. modelIdx advances
  // only on PROVIDER errors. Quality rejects from the observer keep the
  // same model and retry with enriched context (history feeds back
  // into the `learning` preamble so the model sees what was wrong).
  // After MAX_QUALITY_RETRIES on the current model, advance to the
  // next fallback model in the safety chain.
  let modelIdx = 0;
  let qualityRetriesOnCurrentModel = 0;
  for (let step = 0; step < MAX_ATTEMPTS; step++) {
    if (modelIdx >= ladderOrder.length) {
      log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
      break;
    }
    const i = ladderOrder[modelIdx];
    const n = step + 1;
    const rung = LADDER[i];
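    // Illustrative walk (hypothetical verdicts, MAX_QUALITY_RETRIES = 2):
    //   attempt 1  grok     observer reject → retry same model w/ learning
    //   attempt 2  grok     observer reject → retry same model w/ learning
    //   attempt 3  grok     observer reject → retries exhausted, modelIdx++
    //   attempt 4  deepseek ...
    // A provider error at any attempt advances modelIdx immediately.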
const limit = MODEL_RATE_LIMITS[rung.model]; if (limit && !(await checkRateLimit(rung.model, limit.perHour))) { log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` }); modelIdx++; qualityRetriesOnCurrentModel = 0; continue; } const learning = history.length > 0 ? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══` : ""; const retryTag = qualityRetriesOnCurrentModel > 0 ? ` [retry ${qualityRetriesOnCurrentModel + 1}/${MAX_QUALITY_RETRIES + 1} same model + enrichment]` : ""; log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}${retryTag}`); const attemptStarted = Date.now(); if (limit) await recordRateLimitCall(rung.model); const r = await chat({ provider: rung.provider, model: rung.model, prompt: provenApproachesPreamble + pathwayPreamble + baseTask + learning, max_tokens: 1500, }); const attemptMs = Date.now() - attemptStarted; if (r.error) { // PROVIDER error (network, auth, 5xx) → cycle to next fallback // model. Reset retry counter for the new model. history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` }); log(` ✗ provider error: ${r.error.slice(0, 80)} — advancing to next fallback model`); modelIdx++; qualityRetriesOnCurrentModel = 0; continue; } if (!isAcceptable(r.content)) { // Thin/unstructured response = quality issue. Retry SAME model // with the failure logged to learning so it sees what to fix. history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` }); qualityRetriesOnCurrentModel++; if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) { log(` ✗ thin (${r.content.length} chars) — quality retries exhausted on ${rung.model}, advancing fallback`); modelIdx++; qualityRetriesOnCurrentModel = 0; } else { log(` ✗ thin (${r.content.length} chars) — retrying same model with enrichment hint`); } continue; } // Compute grounding stats as DATA — feed to observer for hand-review. // We no longer gate locally on a hardcoded threshold; that judgment // belongs to the observer (which has Langfuse traces + can call cloud // models for semantic review). Local stats are still informational // and get appended as a footer for humans. const groundingStats = verifyAnchorGrounding(r.content, content); // Observer hand-review — synchronous call to mcp-server :3800. Observer // returns {verdict: accept|reject|cycle, confidence, notes}. If the // observer is unreachable or errors, fall through to acceptance (the // observer is the policy layer; pipeline keeps moving when it's down). 
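    // Illustrative alias for the verdict shape read below: the field
    // names come straight from the usages (verdict / confidence / notes);
    // observerHandReview's real return type lives with its definition
    // elsewhere in this file.
    type ObserverVerdictSketch = {
      verdict: "accept" | "reject" | "cycle";
      confidence?: number; // observer's self-reported certainty
      notes?: string;      // free-text rejection rationale, fed back into `learning`
    };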
    const obsVerdict = await observerHandReview({
      file_path: rel,
      model: `${rung.provider}/${rung.model}`,
      response: r.content,
      source_content: content,
      grounding_stats: groundingStats,
      attempt: n,
    });
    if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
      // Observer rejected on quality grounds → retry SAME model with
      // the rejection notes feeding into `learning`. This is the
      // architectural correction (J 2026-04-25): quality issues mean
      // the context needs more enrichment, not a different model.
      const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
      history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
      qualityRetriesOnCurrentModel++;
      if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
        log(` ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);
        modelIdx++;
        qualityRetriesOnCurrentModel = 0;
      } else {
        log(` ✗ ${reason} — retrying same model with enrichment hint`);
      }
      continue;
    }
    history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
    pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
    accepted = r.content;
    acceptedModel = `${rung.provider}/${rung.model}`;
    acceptedOn = n;
    log(` ⚓ anchor grounding: ${groundingStats.grounded}/${groundingStats.total} quotes matched source` +
      (groundingStats.groundedPct !== null ? ` (${groundingStats.groundedPct}%)` : "") +
      ` · observer ${obsVerdict.verdict}` +
      (obsVerdict.confidence ? ` (conf=${obsVerdict.confidence})` : ""));
    accepted = appendGroundingFooter(accepted, groundingStats);
    log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
    break;
  }

  // Hot-swap bookkeeping: if we tried the recommended model first,
  // report whether it worked so the pathway's success_rate updates.
  if (hotSwap) {
    const replaySucceeded = acceptedModel.endsWith(`/${hotSwap.recommended_model}`);
    log(` pathway replay ${replaySucceeded ? "✓" : "✗"} (${hotSwap.pathway_id.slice(0, 12)}…)`);
    // Fire and forget — don't await; observer can handle it.
    recordPathwayReplay(hotSwap.pathway_id, replaySucceeded);
  }

  const review: FileReview = {
    file: rel,
    file_bytes: content.length,
    tree_split_fired: treeSplitFired,
    shards_summarized: shardsSummarized,
    top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
    top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
    attempts_made: history.length,
    attempts_history: history,
    accepted_on: acceptedOn || null,
    escalated_to_model: acceptedModel,
    suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
    duration_ms: Date.now() - t0,
  };

  // Append to the shared scrum-reviews jsonl so the auditor's
  // kb_query check can surface relevant reviews for files in a
  // PR diff. Cohesion plan Phase C wire.
  if (accepted) {
    const { appendFile, mkdir } = await import("node:fs/promises");
    const { dirname } = await import("node:path");
    await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });

    // Extract per-finding confidences from the accepted markdown.
    // Patterns tried: "Confidence: NN%", "Confidence**: NN%",
    // and table-cell "| 92% |". Cap at 40 matches (shared across
    // both patterns) to bound row size.
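    // Worked example (hypothetical finding lines → extracted values):
    //   "**1.** Fix base_rows calc. **Confidence: 90%.**"  → 90  (markdown pattern)
    //   "| Extract retry loop | None | 85% |"              → 85  (table-cell branch)
    //   '"confidence": 75'                                 → 75  (forensic-JSON pattern)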
    // Added 2026-04-23 (iter 2 direction from J: "make scrum output
    // include self-assessed confidence per finding").
    const confidences: number[] = [];
    // Markdown format: "Confidence: 92%" / "Confidence**: 92%" / "| 92% |"
    const patMarkdown = /(?:Confidence[*:\s]*\s*|\|\s*)(\d{1,3})\s*%/gi;
    // JSON format (forensic strict output): "confidence": 92
    const patJson = /"confidence"\s*:\s*(\d{1,3})(?!\d)/gi;
    for (const pat of [patMarkdown, patJson]) {
      const matches = accepted.matchAll(pat);
      for (const hit of matches) {
        if (confidences.length >= 40) break;
        const pct = parseInt(hit[1], 10);
        if (pct >= 0 && pct <= 100) confidences.push(pct);
      }
    }
    const conf_avg = confidences.length
      ? Math.round(confidences.reduce((a, b) => a + b, 0) / confidences.length)
      : null;
    const conf_min = confidences.length ? Math.min(...confidences) : null;

    // ADR-021 Phase B: extract per-finding semantic flags. Reviewer is
    // prompted to tag each finding with one of 9 categories plus None.
    // Patterns: "**Flag: UnitMismatch**", "Flag: OffByOne", table cell
    // with the flag word, or bare-word match. Deduplicated per-trace
    // so repeats in one review count once.
    const FLAG_VARIANTS = [
      "UnitMismatch", "TypeConfusion", "NullableConfusion",
      "OffByOne", "StaleReference", "PseudoImpl",
      "DeadCode", "WarningNoise", "BoundaryViolation",
    ];
    const flagMatches = new Set<string>();
    // Prefer matches anchored to the "Flag:" keyword; fall back to
    // bare-word matches so older reviewers that mention a category
    // without the "Flag:" prefix still contribute signal.
    const patFlagLabeled = /(?:Flag[*:\s]*\s*)([A-Z][A-Za-z]+)/g;
    for (const m of accepted.matchAll(patFlagLabeled)) {
      if (FLAG_VARIANTS.includes(m[1])) flagMatches.add(m[1]);
    }
    // Second pass — bare-word matches for each variant, but ONLY if
    // the labeled pass produced nothing. This avoids flagging every
    // file that happens to mention "DeadCode" in a code sample.
    if (flagMatches.size === 0) {
      for (const v of FLAG_VARIANTS) {
        const re = new RegExp(`\\b${v}\\b`);
        if (re.test(accepted)) flagMatches.add(v);
      }
    }
    const semantic_flags_arr = [...flagMatches].map(k => ({ kind: k }));

    // ADR-021 Phase D: bug_fingerprint extraction.
    //
    // Walk per-finding rows (either table format with columns
    // `Change | Flag | Confidence` OR bullet-list with inline
    // `**Flag: X.**` tag) and pair each flag with the surrounding
    // finding text. Then derive a stable pattern_key from code
    // identifiers the finding cites in backticks, so future reviews
    // of similar bugs cluster under the same key.
    //
    // v1 is heuristic (regex + identifier extraction + canonical
    // sort). It's intentionally NOT a semantic extractor — just a
    // deterministic "take the top code-shaped tokens and hash them
    // with the flag." Stability comes from sorting tokens alphabetically
    // before hashing so "row_count + QueryResponse" and "QueryResponse
    // + row_count" produce the same key.
    const bug_fingerprints_arr: Array<{
      flag: { kind: string };
      pattern_key: string;
      example: string;
      occurrences: number;
    }> = [];
    {
      // Split into candidate finding blocks. Both formats are row-
      // oriented, so a line split is a reasonable starting point.
      // Findings tend to be one-line table rows OR multi-line bullets
      // starting with **N.** — we handle both by looking at any line
      // that mentions a Flag variant and treating it as a finding.
      const lines = accepted.split(/\r?\n/);
      const seenKeys = new Set<string>();
      for (const line of lines) {
        // Find the flag variant on this line (if any).
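        // (Worked example, hypothetical: a finding line like
        //    "**3.** `QueryResponse.row_count` minus `file_count`. **Flag: UnitMismatch.**"
        //  yields variant "UnitMismatch", tokens ["QueryResponse.row_count",
        //  "file_count"], and, after the dedupe/sort/top-3 below,
        //  pattern_key "UnitMismatch:QueryResponse.row_count-file_count".)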
let variantOnLine: string | null = null; for (const v of FLAG_VARIANTS) { const re = new RegExp(`\\b${v}\\b`); if (re.test(line)) { variantOnLine = v; break; } } if (!variantOnLine) continue; // Extract identifier-shaped tokens from backticks. We try two // levels: (a) whole-backtick match if it's a clean identifier // or path, (b) for complex content like function signatures // (`Foo::bar(&self) -> u64`) pull out the longest identifier // substrings so we still capture the callable. const codeTokens: string[] = []; const idRe = /[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*/g; for (const m of line.matchAll(/`([^`]+)`/g)) { const raw = m[1].trim(); // Whole-backtick identifier or dotted path? (`row_count`, // `AccessControl::can_access`, `foo.bar`). if (/^[A-Za-z_][A-Za-z0-9_:]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?$/.test(raw)) { if (raw.length >= 3) codeTokens.push(raw); continue; } // Fallback: scan for identifier substrings, take the longest // meaningful ones (usually the function or type name comes // first in a signature like `Foo::bar(&self)`). const ids = [...raw.matchAll(idRe)] .map(x => x[0]) .filter(id => id.length >= 3); // Prefer ::-qualified paths first (they're more specific), // then the top-2 longest; keeps the key stable under // signature variation. const ranked = ids .map(id => ({ id, score: (id.includes("::") ? 1000 : 0) + id.length })) .sort((a, b) => b.score - a.score) .slice(0, 2) .map(x => x.id); codeTokens.push(...ranked); } // Remove the flag variant name itself if it got captured (kimi // and other reviewers often wrap the flag column in backticks). // Also drop Rust + common keywords that slip through the // identifier regex — "self", "mut", "async", "await", "pub" // aren't bug-shape signal, they're grammar. const FLAG_SET = new Set(FLAG_VARIANTS); const KEYWORDS = new Set([ "self", "Self", "mut", "async", "await", "pub", "fn", "let", "const", "static", "impl", "trait", "struct", "enum", "use", "mod", "crate", "super", "match", "return", "Some", "None", "Ok", "Err", "true", "false", // Markdown table column headers kimi outputs for structured // reviews — "Flag" / "Change" / "Confidence" are layout words, // not identifiers. Seen as noise in iter 11 vectord extraction // ("DeadCode:Flag" pattern_key). "Flag", "Change", "Confidence", "PRD", "Plan", ]); const filtered = codeTokens.filter(t => !FLAG_SET.has(t) && !KEYWORDS.has(t)); if (filtered.length === 0) continue; // Canonicalize: dedupe, sort alphabetically, take top 3. // Alphabetical sort gives stability across "A then B" / "B then A" // variants. Top 3 keeps the key short while retaining enough // signal for different bugs to separate. const uniqTokens = [...new Set(filtered)].sort().slice(0, 3); const pattern_key = `${variantOnLine}:${uniqTokens.join("-")}`; if (seenKeys.has(pattern_key)) continue; seenKeys.add(pattern_key); // Example: the finding line, trimmed + truncated. Preserves // just enough context that the pre-review preamble in the // next iter can quote it back to the reviewer meaningfully. const example = line.replace(/\s+/g, " ").trim().slice(0, 200); bug_fingerprints_arr.push({ flag: { kind: variantOnLine }, pattern_key, example, occurrences: 1, }); } } // Score extraction — regex accepts decimals ("Score: 4.5/10") and // surrounding punctuation ("4/10 — mid"). iter 3 had 4 unparseable // scores because the prior regex /(\d)\s*\/\s*10/ missed decimals. const scoreMatch = accepted.match(/(?:score[\s*:]*)?(\d(?:\.\d)?)\s*\/\s*10\b/i); const alignment_score = scoreMatch ? 
parseFloat(scoreMatch[1]) : null; // Forensic JSON extraction — iter 3 showed 20/21 files came back // as JSON (verdict + critical_failures[] + verified_components[] + ...) // rather than markdown tables. Previously we only stored suggestions_preview // (truncated to 2KB); now we also capture the structured counters so // consumers can filter by verdict, sort by critical_failures_count, etc. let verdict: string | null = null; let critical_failures_count = 0; let pseudocode_flags_count = 0; let prd_mismatches_count = 0; let missing_components_count = 0; let verified_components_count = 0; let risk_points_count = 0; const isJsonShape = accepted.includes('"verdict"'); if (isJsonShape) { const vm = accepted.match(/"verdict"\s*:\s*"([a-z_]+)"/i); verdict = vm ? vm[1] : null; // Count object entries per array by counting occurrences of // either a unique-per-entry field name or {...} bracket pairs // inside the array span. A straight "count opening braces inside // the array range" is simplest and robust to field order. const countArrayEntries = (arrayName: string): number => { const re = new RegExp(`"${arrayName}"\\s*:\\s*\\[([\\s\\S]*?)\\]`, "i"); const m = accepted.match(re); if (!m || !m[1].trim()) return 0; // Count opening braces of direct-child objects. let depth = 0, entries = 0; for (const ch of m[1]) { if (ch === '{') { if (depth === 0) entries++; depth++; } else if (ch === '}') depth--; } return entries; }; critical_failures_count = countArrayEntries("critical_failures"); pseudocode_flags_count = countArrayEntries("pseudocode_flags"); prd_mismatches_count = countArrayEntries("prd_mismatches"); missing_components_count = countArrayEntries("missing_components"); verified_components_count = countArrayEntries("verified_components"); risk_points_count = countArrayEntries("risk_points"); } // Permission Gradient (Layer #6 from SYSTEM_EVOLUTION_LAYERS.md). // Classify the overall finding set by confidence_avg: // ≥90 auto-apply-safe, ≥70 dry-run + diff, ≥50 simulation only, // <50 block (human review). Use conf_min as the tier-lower-bound // so one shaky finding drags the whole row down to the safer tier. const tierFor = (c: number | null): string => { if (c === null) return "unknown"; if (c >= 90) return "auto"; if (c >= 70) return "dry_run"; if (c >= 50) return "simulation"; return "block"; }; const gradient_tier = tierFor(conf_min); // conservative: weakest finding decides const gradient_tier_avg = tierFor(conf_avg); const row = { file: rel, reviewed_at: new Date().toISOString(), accepted_model: acceptedModel, accepted_on_attempt: acceptedOn, attempts_made: history.length, tree_split_fired: treeSplitFired, suggestions_preview: accepted.slice(0, 2000), // Iter-3+ confidence fields. confidences_per_finding: confidences, confidence_avg: conf_avg, confidence_min: conf_min, findings_count: confidences.length, // Layer #6 Permission Gradient — downstream consumers decide // apply-semantics based on these fields instead of blindly trusting // every suggestion. gradient_tier, gradient_tier_avg, // Score (decimal-aware) and forensic JSON structured counters. // iter 4+ fields (schema_version 4). alignment_score, output_format: isJsonShape ? 
"forensic_json" : "markdown", verdict, critical_failures_count, pseudocode_flags_count, prd_mismatches_count, missing_components_count, verified_components_count, risk_points_count, schema_version: 4, scrum_master_reviewed: true, }; // Layer #2 Model Trust Profiling — append one row per file-accept // so over time we can compute per-(model, task_type) success/thin // rates and trust scores. task_type here is inferred from the file // path — good enough for initial stats, refine later. try { const taskType = rel.includes("/truth/") ? "truth" : rel.includes("/gateway/") ? "gateway" : rel.includes("/vectord") ? "vector" : rel.includes("/ingestd") ? "ingest" : rel.includes("/queryd") ? "query" : rel.includes("/storaged") || rel.includes("/catalogd") || rel.includes("/journald") ? "storage" : rel.includes("/aibridge") ? "aibridge" : "other"; const trustRow = { run_id: OUT_DIR.split("/").pop(), reviewed_at: row.reviewed_at, file: rel, task_type: taskType, accepted_model: acceptedModel, accepted_on_attempt: acceptedOn, attempts_made: history.length, thin_rejections: history.filter(h => h.status === "thin").length, errors: history.filter(h => h.status === "error").length, confidence_avg: conf_avg, tree_split_fired: treeSplitFired, }; await appendFile("/home/profit/lakehouse/data/_kb/model_trust.jsonl", JSON.stringify(trustRow) + "\n"); } catch (e) { console.error(`[scrum] model_trust append failed: ${(e as Error).message}`); } try { await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n"); } catch (e) { console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`); } // Pathway trace sidecar (consensus-designed 2026-04-24). Captures // FULL context (ladder attempts, KB chunks, observer signal, verdict) // for similarity-based hot-swap in future iterations. First-review // pathways start in probation (replay_count=0); they become // hot-swappable only after ≥3 replays at ≥80% success. try { const pathwayTrace: PathwayTracePayload = { pathway_id: computePathwayId(taskClass, rel, signalClass), task_class: taskClass, file_path: rel, signal_class: signalClass, created_at: row.reviewed_at, ladder_attempts: pathwayAttempts, kb_chunks: [ ...topPrd.map((c, idx) => ({ source_doc: "PRD.md", chunk_id: `prd@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx, })), ...topPlan.map((c, idx) => ({ source_doc: "cohesion_plan", chunk_id: `plan@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx, })), ], observer_signals: signalClass ? [{ class: signalClass, priors: [], prior_iter_outcomes: [] }] : [], bridge_hits: [], // context7 not wired into scrum yet; empty for v1 sub_pipeline_calls: [], // LLM Team extract happens after this row; out of scope for v1 audit_consensus: null, // set by auditor's later N=3 pass, via /pathway/insert update reducer_summary: accepted.slice(0, 4000), final_verdict: verdict ?? "accepted", // Vec built from the full attempts/chunks — richer than the // query-time vector. The similarity gate will still discriminate // between pathways with the same fingerprint but different // ladder/KB profiles. // Include semantic flag tokens in the embedding so traces with // different bug histories cluster separately — matches Rust's // build_pathway_vec exactly (flag: token shape). pathway_vec: buildPathwayVec([ taskClass, rel, ...(signalClass ? 
[`signal:${signalClass}`] : []), ...pathwayAttempts.flatMap(a => [`rung:${a.rung}`, `model:${a.model}`, `accepted:${a.accepted}`]), ...topPrd.map(c => `kb:PRD.md`), ...topPlan.map(c => `kb:cohesion_plan`), ...semantic_flags_arr.map(f => `flag:${f.kind}`), ]), semantic_flags: semantic_flags_arr, type_hints_used: [], // Phase E — pre-review enrichment from catalogd/arrow/truth bug_fingerprints: bug_fingerprints_arr, // ADR-021 Phase D replay_count: 0, replays_succeeded: 0, retired: false, }; writePathwayTrace(pathwayTrace); // fire-and-forget } catch (e) { console.error(`[scrum] pathway trace failed: ${(e as Error).message}`); } // Close the scrum → observer loop (fix 2026-04-24). Architecture // audit surfaced: observer ring had 2000 ops, 1999 from Langfuse, // zero from scrum. Observer's analyzeErrors + PLAYBOOK_BUILDER loops // were blind to the very pipeline most likely to teach them. One // fire-and-forget POST wires them in. Observer tolerates unreachable // backends; no scrum run fails if observer is down. // // Schema matches observer's ObservedOp shape (source, staffer_id, // sig_hash, event_kind, success, ...). file + accepted_model + // confidence_avg + gradient_tier give downstream analyzers enough // signal to correlate reviews with later regressions. try { const sigHash = createHash("sha256") .update(`${rel}|${OUT_DIR.split("/").pop()}`) .digest("hex") .slice(0, 16); fetch("http://localhost:3800/event", { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ source: "scrum", staffer_id: "scrum_master", sig_hash: sigHash, event_kind: "file_review", success: true, run_id: OUT_DIR.split("/").pop(), file: rel, accepted_model: acceptedModel, accepted_on_attempt: acceptedOn, attempts_made: history.length, thin_rejections: history.filter(h => h.status === "thin").length, confidence_avg: conf_avg, confidence_min: conf_min, findings_count: confidences.length, gradient_tier, tree_split_fired: treeSplitFired, // iter4+ forensic-JSON fields so observer's analyzer can // route by verdict / sort by critical_failures_count alignment_score, verdict, output_format: isJsonShape ? "forensic_json" : "markdown", critical_failures_count, verified_components_count, missing_components_count, // Pathway fields: emitted on every review so the observer // can build a full picture of hot-swap performance over time. // `pathway_hot_swap_hit` flags whether the first-tried rung // this review was a pathway recommendation vs the default // ladder top. `rungs_saved` quantifies the compute we avoided // when a hot-swap landed — this is the value metric the VCP // UI surfaces ("avg_rungs_saved_per_commit"). pathway_hot_swap_hit: hotSwap !== null, pathway_id: hotSwap?.pathway_id ?? null, pathway_similarity: hotSwap?.similarity ?? null, pathway_success_rate: hotSwap?.success_rate ?? null, rungs_saved: hotSwap && acceptedModel.endsWith(`/${hotSwap.recommended_model}`) ? Math.max(0, hotSwap.recommended_rung - 1) : 0, ts: row.reviewed_at, }), signal: AbortSignal.timeout(3000), }).catch(() => { // observer down — not a scrum-run failure, just lose the signal. }); } catch (e) { // Synchronous construction error — ignore. } // Route the accepted review through llm_team's fact extractor so // its entities + relationships land in audit_facts.jsonl alongside // inference-side extractions. Same index, two sources. Tagged // source:"scrum_review" + scrum_master_reviewed:true so downstream // queries can filter by provenance. 
Reviews shorter than 120
  // chars are skipped — they're usually one-liners ("LGTM") with
  // no extractable knowledge.
  if (accepted.length >= 120 && process.env.LH_SCRUM_SKIP_EXTRACT !== "1") {
    try {
      const { extractFacts } = await import("../../auditor/fact_extractor.ts");
      const ex = await extractFacts(accepted);
      if (!ex.error || ex.entities.length + ex.facts.length > 0) {
        const factRow = {
          pr_number: 0, // scrum runs outside a PR scope
          file: rel,
          head_sha: "", // no SHA scope; scope is the file+timestamp
          extracted_at: ex.extracted_at,
          extractor: ex.extractor_model,
          verifier: ex.verifier_model,
          llm_team_run_id: ex.llm_team_run_id ?? null,
          facts: ex.facts,
          entities: ex.entities,
          relationships: ex.relationships,
          verification_preview: ex.verification.slice(0, 400),
          schema_version: 2,
          source: "scrum_review",
          scrum_master_reviewed: true,
        };
        const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
        await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(factRow) + "\n");
      }
    } catch (e) {
      console.error(`[scrum] fact extraction failed for ${rel}: ${(e as Error).message}`);
    }
  }
  }

  return review;
}

async function loadAndChunk(path: string, origin_tag: string): Promise<Array<{
  id: string; text: string; embedding: number[]; origin: string; offset: number;
}>> {
  const text = await readFile(path, "utf8");
  const raw = chunkText(text);
  const embs = await embedBatch(raw.map(r => r.text));
  return raw.map((r, i) => ({
    id: createHash("sha256").update(r.text).digest("hex").slice(0, 10),
    text: r.text,
    embedding: embs[i],
    origin: origin_tag,
    offset: r.offset,
  }));
}

async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`targets: ${TARGET_FILES.length} files`);

  log("loading + embedding PRD...");
  const prd_chunks = await loadAndChunk(PRD_PATH, "PRD");
  log(` PRD: ${prd_chunks.length} chunks`);
  log("loading + embedding cohesion plan...");
  const plan_chunks = await loadAndChunk(PROPOSAL_PATH, "COHESION_PLAN");
  log(` plan: ${plan_chunks.length} chunks`);
  log("");
  log("─── scrum master: walking target files ───");

  const reviews: FileReview[] = [];
  for (const f of TARGET_FILES) {
    const review = await reviewFile(f, prd_chunks, plan_chunks);
    reviews.push(review);
    await writeFile(
      `${OUT_DIR}/review_${review.file.replace(/\//g, "_")}.json`,
      JSON.stringify(review, null, 2),
    );
    log(` → ${review.file}: ${review.accepted_on ? `accepted on ${review.accepted_on} by ${review.escalated_to_model}` : "UNRESOLVED"} (${review.duration_ms}ms)`);
  }

  // Consolidated scrum-master report
  const report_md: string[] = [];
  report_md.push(`# Scrum-master review\n`);
  report_md.push(`Generated: ${new Date().toISOString()}`);
  report_md.push(`Files reviewed: ${reviews.length}`);
  report_md.push(`Total duration: ${(reviews.reduce((s, r) => s + r.duration_ms, 0) / 1000).toFixed(1)}s\n`);
  for (const r of reviews) {
    report_md.push(`\n## ${r.file}`);
    report_md.push(`- **Accepted on attempt:** ${r.accepted_on ?? "NOT resolved (escalation ladder exhausted)"}`);
    report_md.push(`- **Escalated to:** \`${r.escalated_to_model || "—"}\``);
    report_md.push(`- **Total attempts:** ${r.attempts_made}`);
    if (r.attempts_history.length > 1) {
      report_md.push(`- **Attempt history:**`);
      for (const h of r.attempts_history) {
        report_md.push(` - ${h.n}: \`${h.model}\` → ${h.status}${h.error ? ` (${h.error.slice(0, 100)})` : ""}`);
      }
    }
    report_md.push(`\n### Suggestions\n\n${r.suggestions}\n`);
  }
  await writeFile(`${OUT_DIR}/scrum_report.md`, report_md.join("\n"));

  const summary = {
    ran_at: new Date().toISOString(),
    target_count: TARGET_FILES.length,
    resolved: reviews.filter(r => r.accepted_on !== null).length,
    total_attempts: reviews.reduce((s, r) => s + r.attempts_made, 0),
    total_duration_ms: reviews.reduce((s, r) => s + r.duration_ms, 0),
    per_file: reviews.map(r => ({
      file: r.file,
      accepted_on: r.accepted_on,
      model: r.escalated_to_model,
      attempts: r.attempts_made,
      ms: r.duration_ms,
    })),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log("═══ SCRUM REPORT ═══");
  log(` files: ${summary.target_count}, resolved: ${summary.resolved}, total attempts: ${summary.total_attempts}`);
  log(` total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log("");
  for (const p of summary.per_file) {
    const mark = p.accepted_on ? "✓" : "✗";
    log(` ${mark} ${p.file.padEnd(60)} attempt ${p.accepted_on ?? "—"}/${p.attempts} ${p.model} ${p.ms}ms`);
  }
  log("");
  log(`report: ${OUT_DIR}/scrum_report.md`);
  process.exit(summary.resolved === summary.target_count ? 0 : 1);
}

main().catch(e => {
  console.error("[scrum] fatal:", e);
  process.exit(2);
});
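// Example: eyeball a finished run's per-file outcomes straight from the
// summary this script writes (illustrative jq one-liner; the glob assumes
// the default OUT_DIR layout above):
//   jq -r '.per_file[] | "\(.file)\t\(.accepted_on)\t\(.model)\t\(.ms)ms"' \
//     /home/profit/lakehouse/tests/real-world/runs/scrum_*/summary.json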