Two signal-quality fixes for the scrum loop: 1. isBlindResponse() — detects models that emit structurally-valid review JSON containing "no source code visible / cannot verify" even when source WAS supplied. Rejects so the ladder cycles to the next rung instead of accepting the blind hallucination. 2. verifyAnchorGrounding() + appendGroundingFooter() — post-process verifier that extracts every backtick-quoted snippet from the review and checks it against the original source content. Appends a grounding footer reporting grounded vs ungrounded counts so humans can audit hallucination rate at a glance. Born from the iter where llm_team_ui.py review came back with 6/10 findings hallucinated (invented render_template_string calls, fabricated logger.exception sites, made-up SHA-256 hashing). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1656 lines
76 KiB
TypeScript
1656 lines
76 KiB
TypeScript
// Scrum-master orchestrator — pulls git repo source + PRD + a change
// proposal, chunks everything, hands each code piece to the proven
// escalation ladder (small-local → big-local → cloud → specialist →
// biggest) with learning context between attempts. Collects per-file
// suggestions in a coherent handoff report.
//
// What it composes (everything below is already shipped + proven):
// - Chunker + embeddings (sidecar /embed, nomic-embed-text)
// - In-memory cosine retrieval (top-K PRD + plan chunks per file)
// - Escalation ladder (6 tiers, cycling on empty/error/thin-answer)
// - Per-attempt learning-context injection (prior failures → prompt)
// - Tree-split fallback when combined context exceeds budget
// - JSONL output per file + summary
//
// Deliberate scope limit: TARGET_FILES is 3 files by default. The
// pipeline works at larger N, but at ~90s/file × 3 files = 4-5 min,
// 15 files = 22 min. Bump via env LH_SCRUM_FILES="path1,path2,...".
//
// Run: bun run tests/real-world/scrum_master_pipeline.ts
import { createHash } from "node:crypto";
import { readFileSync } from "node:fs";
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
||
|
||
// Service endpoints — the gateway fronts /v1/chat and /vectors/*; the
// sidecar serves /embed (nomic-embed-text).
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
// PRD/proposal chunking knobs used by chunkText (characters, not tokens).
const CHUNK_SIZE = 800;
const CHUNK_OVERLAP = 120;
// Top-K PRD + plan chunks retrieved per file.
const TOP_K_CONTEXT = 5;
// Presumably caps ladder attempts per file — confirm at the use site
// (not visible in this section).
const MAX_ATTEMPTS = 9;
// Files larger than this get tree-split instead of truncated. Fixes the
// 6KB false-positive class (model claiming a field is "missing" when
// it exists past the context cutoff).
// Env-configurable so the pipeline can adapt to different repos:
// a 13K-line Python file like /root/llm-team-ui/llm_team_ui.py needs
// larger shards to avoid producing 200+ cloud calls per review.
// Defaults stay at 6000 / 3500 — tuned for Rust source files in
// crates/<crate>/src/*.rs.
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
// Appended jsonl so auditor's kb_query can surface scrum findings for
// files touched by a PR under review. Part of cohesion plan Phase C.
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
  || "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
// Per-run output directory; base36 timestamp keeps run names short and
// effectively unique.
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
|
||
|
||
// Product-requirements doc used as review context.
const PRD_PATH = process.env.LH_SCRUM_PRD
  || "/home/profit/lakehouse/docs/PRD.md";
// Using CONTROL_PLANE_PRD as the "suggested changes" doc since it
// describes the Phase 38-44 target architecture and is on main.
// Override via LH_SCRUM_PROPOSAL env to point at a fix-wave doc
// generated from a phase-sweep audit, so the scrum pulls direction
// from concrete findings instead of the high-level PRD alone.
const PROPOSAL_PATH = process.env.LH_SCRUM_PROPOSAL
  || "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md";
|
||
|
||
// Iter 2+ — when LH_SCRUM_FORENSIC is set to a file path, prepend its
|
||
// contents as an adversarial auditor preamble to every per-file prompt.
|
||
// This flips the review tone from "suggest improvements" to "prove it
|
||
// works or mark FAIL." Added 2026-04-23 for iter 2 of the 6x loop.
|
||
// Empty string = no preamble (iter-1 behavior).
|
||
const FORENSIC_PREAMBLE = process.env.LH_SCRUM_FORENSIC
|
||
? (() => {
|
||
try {
|
||
return require("node:fs").readFileSync(process.env.LH_SCRUM_FORENSIC!, "utf8");
|
||
} catch (e) {
|
||
console.error(`[scrum] warning: could not read LH_SCRUM_FORENSIC=${process.env.LH_SCRUM_FORENSIC}: ${e}`);
|
||
return "";
|
||
}
|
||
})()
|
||
: "";
|
||
|
||
// Scoped target: 3 representative source files by default.
// The scrum-master walks these in order and produces one suggestion
// set per file. Override via env for a wider sweep.
const DEFAULT_TARGETS = [
  "/home/profit/lakehouse/crates/vectord/src/playbook_memory.rs",
  "/home/profit/lakehouse/crates/vectord/src/doc_drift.rs",
  "/home/profit/lakehouse/auditor/audit.ts",
];
// Comma-separated override, e.g. LH_SCRUM_FILES="a.rs,b.ts".
// Entries are trimmed but not existence-checked here.
const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
  ? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim())
  : DEFAULT_TARGETS;
|
||
|
||
// Cloud-first ladder, STRONGEST-MODEL-FIRST (iter 3+, 2026-04-24).
// J's direction: "switch to the strongest cloud model" for iter 3 —
// the forensic prompt is demanding enough that even 120B gets rejected
// as thin. Rank by parameter count / reasoning strength:
// 1. kimi-k2:1t — 1T params, Moonshot flagship (biggest)
// 2. kimi-k2.6 — Moonshot next-gen, pro tier
// 3. deepseek-v3.1:671b — 671B, strong reasoning + coding
// 4. mistral-large-3:675b — 675B, deep analysis
// 5. qwen3.5:397b — 397B (iter 2's rescue model)
// 6. gpt-oss:120b — 120B (iter 1's primary; still strong fallback)
// Local fallbacks kept for cloud-down scenarios.
// Hot-path pipelines (scenario.ts / execution_loop) stay local per
// Phase 20 t1_hot — this scrum is not hot path.
//
// NOTE(review): the ranked list above is the plan-of-record; the actual
// rung order below differs (qwen3-coder:480b sits at rung 2, qwen3.5:397b
// after gpt-oss:120b) — confirm which ordering is intended.
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
  { provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" },
  { provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" },
  { provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" },
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B — deep analysis (0.9s probe)" },
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B — reliable workhorse (iter1 baseline)" },
  { provider: "ollama_cloud", model: "qwen3.5:397b", note: "cloud 397B dense — deep final thinker (J 2026-04-24)" },
  // Free-tier rescue — different provider backbone, different quota.
  // Added 2026-04-24 after iter 5 hit repeated Ollama Cloud 502s on
  // kimi-k2:1t. These have lower parameter counts than the Ollama
  // Cloud rungs but high availability: if upstream is down, we still
  // land a review instead of giving up.
  { provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — substantive rescue, 2.8s probe" },
  { provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" },
  { provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — best local model per J (2026-04-24), last-resort if all cloud down" },
  // Dropped from the ladder after 2026-04-24 probe:
  // - kimi-k2.6 — not available on current tier (empty response)
  // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
  // - minimax-m2.7 — 400 thinking tokens, 0 content output
  // - openrouter qwen3-coder:free / llama-3.3 / hermes-3 — provider errors
  // - openrouter minimax-m2.5:free — 45s timeout
];
|
||
|
||
// One embedded text chunk. `origin` is presumably the source document
// identifier and `offset` the char offset within it (see chunkText /
// retrieveTopK) — confirm at the chunk-construction site.
type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number };
|
||
|
||
// One record per reviewed file — the audit trail of that file's trip
// through the escalation ladder, serialized to JSONL output.
interface FileReview {
  file: string;
  file_bytes: number;
  // True when the file exceeded FILE_TREE_SPLIT_THRESHOLD — presumably
  // set where tree-split/substrate handling happens; confirm at use site.
  tree_split_fired: boolean;
  shards_summarized: number;
  // Retrieval provenance: which PRD/proposal chunks fed the prompt.
  top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
  top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
  attempts_made: number;
  // Per-attempt trail: rung model, outcome class, answer size, error text.
  attempts_history: Array<{ n: number; model: string; status: "accepted" | "thin" | "error"; chars: number; error?: string }>;
  // Attempt number that produced the accepted answer; null presumably
  // means no rung was accepted — confirm at the write site.
  accepted_on: number | null;
  escalated_to_model: string;
  suggestions: string;
  duration_ms: number;
}
|
||
|
||
function log(msg: string) {
  // Uniform "[scrum]" prefix keeps this pipeline greppable in mixed output.
  console.log(`[scrum] ${msg}`);
}
|
||
|
||
function cosine(a: number[], b: number[]): number {
|
||
let dot = 0, na = 0, nb = 0;
|
||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||
return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
|
||
}
|
||
|
||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||
const out: Array<{ text: string; offset: number }> = [];
|
||
for (let i = 0; i < text.length; ) {
|
||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||
const slice = text.slice(i, end).trim();
|
||
if (slice.length > 60) out.push({ text: slice, offset: i });
|
||
if (end >= text.length) break;
|
||
i = end - CHUNK_OVERLAP;
|
||
}
|
||
return out;
|
||
}
|
||
|
||
async function embedBatch(texts: string[]): Promise<number[][]> {
|
||
const r = await fetch(`${SIDECAR}/embed`, {
|
||
method: "POST", headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ texts }),
|
||
signal: AbortSignal.timeout(120000),
|
||
});
|
||
if (!r.ok) throw new Error(`embed ${r.status}`);
|
||
return (await r.json() as any).embeddings;
|
||
}
|
||
|
||
// ─── Pathway memory (2026-04-24 consensus design) ───────────────────
|
||
//
|
||
// Mirrors vectord/src/pathway_memory.rs. The bucket-hash vector MUST
|
||
// byte-match the Rust implementation so traces written from TypeScript
|
||
// are searchable against the same embedding space. Verified by running
|
||
// both implementations on the same input tokens and asserting matching
|
||
// bucket indices.
|
||
|
||
function filePrefix(path: string): string {
|
||
return path.split("/").slice(0, 2).join("/");
|
||
}
|
||
|
||
function computePathwayId(taskClass: string, filePath: string, signalClass: string | null): string {
|
||
const h = createHash("sha256");
|
||
h.update(taskClass);
|
||
h.update("|");
|
||
h.update(filePrefix(filePath));
|
||
h.update("|");
|
||
h.update(signalClass ?? "");
|
||
return h.digest("hex");
|
||
}
|
||
|
||
// 32-bucket L2-normalized token hash. Same algorithm as Rust.
|
||
function buildPathwayVec(tokens: string[]): number[] {
|
||
const buckets = new Array(32).fill(0);
|
||
for (const t of tokens) {
|
||
const d = createHash("sha256").update(t, "utf8").digest();
|
||
const b1 = d[0] % 32;
|
||
const b2 = d[8] % 32;
|
||
buckets[b1] += 1;
|
||
buckets[b2] += 1;
|
||
}
|
||
let norm = 0;
|
||
for (const v of buckets) norm += v * v;
|
||
norm = Math.sqrt(norm);
|
||
if (norm > 0) for (let i = 0; i < buckets.length; i++) buckets[i] /= norm;
|
||
return buckets;
|
||
}
|
||
|
||
// Build the minimal query vector for a pre-ladder hot-swap check. We
|
||
// don't yet know the ladder attempts or KB chunks — the query vec is
|
||
// computed from what we CAN know up front: task/file/signal. This is
|
||
// a weaker embedding than the one computed at trace-insert time, but
|
||
// similarity still discriminates between task/file/signal combinations.
|
||
function buildQueryVec(taskClass: string, filePath: string, signalClass: string | null): number[] {
|
||
const tokens = [taskClass, filePath];
|
||
if (signalClass) tokens.push(`signal:${signalClass}`);
|
||
return buildPathwayVec(tokens);
|
||
}
|
||
|
||
// Candidate returned by /vectors/pathway/query: a previously-recorded
// pathway plus the rung/model it recommends. Field semantics come from
// the pathway service — similarity is presumably against the query vec,
// recommended_rung presumably an index into LADDER; confirm server-side.
interface HotSwapCandidate {
  pathway_id: string;
  similarity: number;
  replay_count: number;
  success_rate: number;
  recommended_rung: number;
  recommended_model: string;
}
|
||
|
||
async function queryHotSwap(taskClass: string, filePath: string, signalClass: string | null): Promise<HotSwapCandidate | null> {
|
||
try {
|
||
const query_vec = buildQueryVec(taskClass, filePath, signalClass);
|
||
const r = await fetch(`${GATEWAY}/vectors/pathway/query`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, query_vec }),
|
||
signal: AbortSignal.timeout(5000),
|
||
});
|
||
if (!r.ok) return null;
|
||
const j = await r.json() as { candidate: HotSwapCandidate | null };
|
||
return j.candidate ?? null;
|
||
} catch {
|
||
// Pathway service unavailable → run full ladder. Hot-swap is
|
||
// always an optimization, never a correctness requirement.
|
||
return null;
|
||
}
|
||
}
|
||
|
||
// One ladder attempt as recorded inside a pathway trace.
interface LadderAttemptRec {
  rung: number;     // position in the ladder for this attempt
  model: string;
  latency_ms: number;
  accepted: boolean;
  // Presumably null when the attempt was accepted — confirm at write site.
  reject_reason: string | null;
}
|
||
|
||
// Full trace of one file's trip through the pipeline, inserted via
// /vectors/pathway/insert. snake_case field names match the Rust serde
// wire format (see note on semantic_flags below).
interface PathwayTracePayload {
  pathway_id: string;     // see computePathwayId
  task_class: string;
  file_path: string;
  signal_class: string | null;
  created_at: string;
  ladder_attempts: LadderAttemptRec[];
  // KB retrieval provenance: which chunks fed the prompt, with scores.
  kb_chunks: { source_doc: string; chunk_id: string; cosine_score: number; rank: number }[];
  observer_signals: { class: string; priors: string[]; prior_iter_outcomes: string[] }[];
  bridge_hits: { library: string; version: string }[];
  sub_pipeline_calls: { pipeline: string; result_summary: string }[];
  audit_consensus: { pass: boolean; models: string[]; disagreements: number } | null;
  reducer_summary: string;
  final_verdict: string;
  pathway_vec: number[];  // see buildPathwayVec (32-bucket hash)
  // ADR-021 semantic-correctness layer. `kind` field matches the Rust
  // serde(tag = "kind") wire format — TS and Rust interop directly.
  semantic_flags: { kind: string }[];
  type_hints_used: { source: string; symbol: string; type_repr: string }[];
  bug_fingerprints: { flag: { kind: string }; pattern_key: string; example: string; occurrences: number }[];
  replay_count: number;
  replays_succeeded: number;
  retired: boolean;
}
|
||
|
||
async function writePathwayTrace(trace: PathwayTracePayload): Promise<void> {
|
||
try {
|
||
await fetch(`${GATEWAY}/vectors/pathway/insert`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify(trace),
|
||
signal: AbortSignal.timeout(10000),
|
||
});
|
||
} catch {
|
||
// Fire-and-forget: scrum runs shouldn't fail if pathway insert fails.
|
||
}
|
||
}
|
||
|
||
async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promise<void> {
|
||
try {
|
||
await fetch(`${GATEWAY}/vectors/pathway/record_replay`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ pathway_id: pathwayId, succeeded }),
|
||
signal: AbortSignal.timeout(5000),
|
||
});
|
||
} catch {
|
||
// Fire-and-forget. Not critical.
|
||
}
|
||
}
|
||
|
||
// ADR-021 Phase C: pre-review enrichment. Fetch aggregated bug
// fingerprints for this narrow fingerprint (same key as hot-swap —
// task_class + file_prefix + signal_class) so the reviewer prompt
// can explicitly warn "this file area has had these bug patterns
// before." Empty on fresh install; grows as the matrix index learns.
interface BugFingerprintRow {
  // serde(tag = "kind") wire shape, same as PathwayTracePayload entries.
  flag: { kind: string };
  pattern_key: string;
  example: string;
  occurrences: number;
}
|
||
async function fetchBugFingerprints(taskClass: string, filePath: string, signalClass: string | null, limit: number): Promise<BugFingerprintRow[]> {
|
||
try {
|
||
const r = await fetch(`${GATEWAY}/vectors/pathway/bug_fingerprints`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ task_class: taskClass, file_path: filePath, signal_class: signalClass, limit }),
|
||
signal: AbortSignal.timeout(5000),
|
||
});
|
||
if (!r.ok) return [];
|
||
const j = await r.json() as { fingerprints: BugFingerprintRow[] };
|
||
return j.fingerprints ?? [];
|
||
} catch {
|
||
return [];
|
||
}
|
||
}
|
||
|
||
// Deterministic signal_class lookup from scrum_reviews.jsonl history.
|
||
// First-time files get `null`. Files seen before get the signal class
|
||
// the observer assigned on their most-recent review (if any). Keeps the
|
||
// pathway fingerprint stable across iterations for LOOPING files.
|
||
async function lookupSignalClass(filePath: string): Promise<string | null> {
|
||
try {
|
||
const { readFile } = await import("node:fs/promises");
|
||
const raw = await readFile(SCRUM_REVIEWS_JSONL, "utf8").catch(() => "");
|
||
if (!raw) return null;
|
||
const lines = raw.trim().split("\n").reverse();
|
||
for (const line of lines) {
|
||
try {
|
||
const r = JSON.parse(line);
|
||
if (r.file === filePath && r.signal_class) return r.signal_class;
|
||
} catch {}
|
||
}
|
||
return null;
|
||
} catch { return null; }
|
||
}
|
||
|
||
async function chat(opts: {
|
||
provider: "ollama" | "ollama_cloud",
|
||
model: string,
|
||
prompt: string,
|
||
max_tokens?: number,
|
||
}): Promise<{ content: string; error?: string; prompt_tokens: number; completion_tokens: number }> {
|
||
try {
|
||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||
method: "POST", headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({
|
||
provider: opts.provider,
|
||
model: opts.model,
|
||
messages: [{ role: "user", content: opts.prompt }],
|
||
max_tokens: opts.max_tokens ?? 1500,
|
||
temperature: 0.2,
|
||
think: false,
|
||
}),
|
||
signal: AbortSignal.timeout(180000),
|
||
});
|
||
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 200)}`, prompt_tokens: 0, completion_tokens: 0 };
|
||
const j: any = await r.json();
|
||
return {
|
||
content: j.choices?.[0]?.message?.content ?? "",
|
||
prompt_tokens: j.usage?.prompt_tokens ?? 0,
|
||
completion_tokens: j.usage?.completion_tokens ?? 0,
|
||
};
|
||
} catch (e) {
|
||
return { content: "", error: (e as Error).message, prompt_tokens: 0, completion_tokens: 0 };
|
||
}
|
||
}
|
||
|
||
// Accept a file-review answer if it's substantive + structured.
|
||
// We're not validating Rust here — we're validating that the model
|
||
// produced a coherent suggestion set.
|
||
//
|
||
// BLIND-RESPONSE GUARD (added after iter 4 regression on llm-team-ui):
|
||
// Some models pretend the source code wasn't supplied even when it was —
|
||
// they produce structurally-valid JSON with one critical_failure of the
|
||
// form "No source code visible; cannot verify..." Those should be
|
||
// rejected so the ladder cycles to the next rung. We check for a small
|
||
// set of telltale phrases inside critical_failures descriptions.
|
||
function isBlindResponse(answer: string): boolean {
|
||
// Cheap substring match on the descriptions area of the JSON
|
||
const blindPhrases = [
|
||
/no source code (visible|provided|supplied)/i,
|
||
/cannot (view|see|verify|access) (the )?source/i,
|
||
/no code (was )?(visible|provided|supplied|attached)/i,
|
||
/unable to (view|access|read) (the )?(source|file|code)/i,
|
||
/source (code )?was not (provided|supplied|attached|included)/i,
|
||
];
|
||
return blindPhrases.some((re) => re.test(answer));
|
||
}
|
||
|
||
// Anchor-grounding verifier — runs after a review is accepted (only
|
||
// when tree-split fired, since small files don't need it). Extracts
|
||
// every backtick-quoted code snippet from the review and checks
|
||
// whether it appears in the original source content. Returns the
|
||
// stats + a footer that gets appended to the review so humans can
|
||
// audit grounding rate at a glance.
|
||
//
|
||
// Why: 2026-04-24 verification of llm_team_ui.py (13K lines, 61 shards)
|
||
// showed 0/10 findings real, 6/10 hallucinated. Model invented
|
||
// `render_template_string(f"<h1>{user}</h1>")`, `logger.exception(e)`,
|
||
// SHA-256 password hashing — none of which existed in the actual
|
||
// source. The reviewer wrote what *fit* the PRD's worry-list rather
|
||
// than what the code actually does. This verifier catches that.
|
||
function verifyAnchorGrounding(answer: string, sourceContent: string) {
|
||
// Pull both inline `quoted` and triple-fenced ```quoted``` snippets.
|
||
// Skip very short ones (≤ 3 chars — they false-match too easily on
|
||
// common tokens like \`a\` or \`if\`).
|
||
const inline = [...answer.matchAll(/`([^`\n]{4,})`/g)].map((m) => m[1]);
|
||
const fenced = [...answer.matchAll(/```(?:[a-z]+\n)?([\s\S]+?)```/g)]
|
||
.map((m) => m[1].trim())
|
||
.flatMap((b) => b.split("\n"))
|
||
.map((l) => l.trim())
|
||
.filter((l) => l.length >= 6);
|
||
const allQuotes = [...new Set([...inline, ...fenced])];
|
||
|
||
const grounded: string[] = [];
|
||
const ungrounded: string[] = [];
|
||
const sourceLower = sourceContent.toLowerCase();
|
||
// The model often emits the review wrapped in a JSON envelope, so
|
||
// backtick-quoted snippets have their internal `"` escaped as `\"`,
|
||
// `\n` as `\\n`, etc. Try unescaped variant first; if that's in the
|
||
// source consider it grounded. Also normalize curly quotes to ASCII
|
||
// since some models smart-quote string literals.
|
||
const unescapeJsonish = (s: string) =>
|
||
s
|
||
.replace(/\\"/g, '"')
|
||
.replace(/\\'/g, "'")
|
||
.replace(/\\n/g, "\n")
|
||
.replace(/\\t/g, "\t")
|
||
.replace(/\\\\/g, "\\")
|
||
.replace(/[“”]/g, '"')
|
||
.replace(/[‘’]/g, "'");
|
||
for (const q of allQuotes) {
|
||
// Strip leading offset markers like "@123456" the anchors carry
|
||
const cleaned = q.replace(/^@\d+\s*/, "").trim();
|
||
if (cleaned.length < 4) continue;
|
||
const candidates = [cleaned, unescapeJsonish(cleaned)];
|
||
const hit = candidates.some((c) => sourceLower.includes(c.toLowerCase()));
|
||
if (hit) grounded.push(cleaned);
|
||
else ungrounded.push(cleaned);
|
||
}
|
||
const total = grounded.length + ungrounded.length;
|
||
const groundedPct = total > 0 ? Math.round((grounded.length / total) * 100) : null;
|
||
|
||
return { total, grounded: grounded.length, ungrounded, groundedPct };
|
||
}
|
||
|
||
function appendGroundingFooter(
|
||
answer: string,
|
||
stats: ReturnType<typeof verifyAnchorGrounding>,
|
||
): string {
|
||
const lines = [
|
||
"",
|
||
"─── ANCHOR GROUNDING (post-process verifier) ───",
|
||
`Backtick-quoted snippets: ${stats.total}`,
|
||
`Grounded in source (literal substring match): ${stats.grounded}` +
|
||
(stats.groundedPct !== null ? ` (${stats.groundedPct}%)` : ""),
|
||
`Ungrounded (likely hallucinated, treat findings using these as low-confidence):`,
|
||
];
|
||
if (stats.ungrounded.length === 0) {
|
||
lines.push(" (none — every quoted snippet matches the source verbatim)");
|
||
} else {
|
||
for (const u of stats.ungrounded.slice(0, 12)) {
|
||
lines.push(` · \`${u.slice(0, 80)}\``);
|
||
}
|
||
if (stats.ungrounded.length > 12) {
|
||
lines.push(` · ... and ${stats.ungrounded.length - 12} more`);
|
||
}
|
||
}
|
||
lines.push("─────────────────────────────────────────────────");
|
||
return answer + "\n" + lines.join("\n");
|
||
}
|
||
|
||
function isAcceptable(answer: string): boolean {
|
||
if (answer.length < 200) return false; // too thin
|
||
if (isBlindResponse(answer)) return false; // hallucinated "no source"
|
||
// Two accepted shapes:
|
||
// (a) Markdown — bullets, numbered list, or headers. Original shape.
|
||
// (b) Forensic JSON — `{"verdict":"..."}` with at least one of the
|
||
// finding arrays populated. SCRUM_FORENSIC_PROMPT.md requires
|
||
// this shape; previous version rejected it because the first
|
||
// character is `{`, not `-`/`#`/`1.`. Iter-2 observation in
|
||
// SCRUM_LOOP_NOTES flagged this as `[FORENSIC vs thin-detector
|
||
// mismatch]` — this is the fix.
|
||
const hasMarkdownStructure = /^\s*[-*]\s/m.test(answer)
|
||
|| /^\s*\d+\.\s/m.test(answer)
|
||
|| /^\s*#/m.test(answer);
|
||
if (hasMarkdownStructure) return true;
|
||
// Accept JSON verdict shape even without surrounding markdown.
|
||
// Check for a `"verdict"` key and at least one populated finding
|
||
// array — empty objects still fail.
|
||
if (/"verdict"\s*:\s*"(pass|fail|needs_patch)"/i.test(answer)) {
|
||
const hasFindings = /"(critical_failures|pseudocode_flags|prd_mismatches|broken_pipelines|missing_components|risk_points|verified_components|required_next_actions)"\s*:\s*\[\s*\{/.test(answer);
|
||
if (hasFindings) return true;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
|
||
return pool
|
||
.map(c => ({ c, score: cosine(query_emb, c.embedding) }))
|
||
.sort((a, b) => b.score - a.score)
|
||
.slice(0, k)
|
||
.map(x => ({ ...x.c, _score: x.score } as any));
|
||
}
|
||
|
||
// File substrate — replaces the original tree-split summarize/reduce
// architecture. The original was lossy: model paraphrased shards into
// prose, paraphrase-of-paraphrase fed reviewer, reviewer hallucinated
// against PRD worry-list (verified 2026-04-24: 0/10 real findings on
// llm_team_ui.py 13K lines).
//
// The substrate approach (J's redesign, 2026-04-24):
//
// 1. ANCHORS zone — deterministic regex extraction of literally-
//    suspicious lines (route defs, auth calls, SQL, secrets, exception
//    handlers, env access). No LLM, no paraphrasing. Reviewer can
//    quote any anchor verbatim.
//
// 2. NEIGHBORS zone — the file is chunked line-aware and embedded
//    via the sidecar's nomic-embed-text. For every relevant PRD
//    chunk, we hybrid-retrieve the top-K matching FILE chunks.
//    Reviewer sees actual code regions semantically close to each
//    PRD worry-area, not summaries.
//
// 3. RANGE-LOOKUP zone — full-file kept in memory; reviewer can
//    ask for byte-exact ranges. (Currently surfaced via the verifier
//    check — every backtick-quoted snippet must literal-match the
//    source. Future: tool-call interface for in-prompt range fetch.)
//
// All three zones feed the reviewer with grounded code, not paraphrased
// distillation.
||
|
||
// One deterministically-extracted "suspicious" line (see ANCHOR_PATTERNS
// and extractAnchors).
interface AnchorLine {
  // Char offset of the line start assuming "\n" endings (UTF-16 units,
  // not true bytes for multi-byte content — see extractAnchors).
  byte_offset: number;
  line_no: number;  // 1-based
  text: string;     // the line, truncated to 200 chars + ellipsis
  kind: string;     // which ANCHOR_PATTERNS kind matched first
}
|
||
|
||
// A line-aware chunk of the reviewed file plus its sidecar embedding
// (see chunkFileLineAware / buildFileSubstrate).
interface FileChunk {
  byte_offset: number;  // char offset of the chunk start in the file
  line_from: number;    // 1-based, inclusive
  line_to: number;      // 1-based, inclusive
  text: string;
  embedding: number[];
}
|
||
|
||
// Zones 1+2 of the file substrate: deterministic anchors, embedded
// line-aware chunks, and a top-K cosine lookup over those chunks.
interface FileSubstrate {
  anchors: AnchorLine[];
  chunks: FileChunk[];
  // Top-k file chunks by cosine similarity to a query embedding.
  queryFile: (emb: number[], k: number) => FileChunk[];
}
|
||
|
||
// Deterministic regex extraction of reviewer-relevant lines. Each
// pattern targets a class of risk surface common to web services:
// auth, SQL, secrets, templating, HTTP routing, exception flow.
// Note the patterns are Python-flavored (def/class/from-import/except/
// render_template) — the anchor zone was built around reviewing Python
// services like llm_team_ui.py.
const ANCHOR_PATTERNS: Array<{ kind: string; re: RegExp }> = [
  { kind: "route", re: /^@\w+\.route\s*\(/m },
  { kind: "func_def", re: /^\s*(async\s+)?def\s+\w+\s*\(/m },
  { kind: "class_def", re: /^class\s+\w+/m },
  { kind: "import", re: /^(from\s+\S+\s+import|import\s+\S+)/m },
  { kind: "auth_decorator", re: /@(login_required|admin_required|api_key_required|require_\w+)/ },
  { kind: "sql_exec", re: /\.\s*execute\s*\(/ },
  { kind: "f_string_sql", re: /f["'][^"']*\b(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP)\b/i },
  { kind: "secret", re: /(secret|api_key|token|password|FLASK_SECRET|DB_URL)/i },
  { kind: "template", re: /render_template(_string)?\s*\(/ },
  { kind: "exception", re: /\bexcept\s+\w+/ },
  { kind: "env_access", re: /os\.(environ|getenv)\b/ },
  { kind: "rate_limit", re: /(rate_limit|limiter|RateLimit)/ },
  { kind: "subprocess", re: /\b(subprocess|os\.system|exec\s*\(|eval\s*\()/ },
  { kind: "todo", re: /\b(TODO|FIXME|XXX|HACK)\b/ },
];
|
||
|
||
function extractAnchors(content: string): AnchorLine[] {
|
||
const lines = content.split("\n");
|
||
const anchors: AnchorLine[] = [];
|
||
let byteCursor = 0;
|
||
for (let i = 0; i < lines.length; i++) {
|
||
const line = lines[i];
|
||
const lineByte = byteCursor;
|
||
byteCursor += line.length + 1; // +1 for the \n
|
||
const trimmed = line.trim();
|
||
if (trimmed.length === 0 || trimmed.length > 240) continue;
|
||
for (const p of ANCHOR_PATTERNS) {
|
||
if (p.re.test(line)) {
|
||
anchors.push({
|
||
byte_offset: lineByte,
|
||
line_no: i + 1,
|
||
text: line.length > 200 ? line.slice(0, 200) + "…" : line,
|
||
kind: p.kind,
|
||
});
|
||
break; // first matching kind wins; one line, one anchor entry
|
||
}
|
||
}
|
||
}
|
||
return anchors;
|
||
}
|
||
|
||
// Line-aware chunker. Targets ~800-char chunks but won't split a line.
|
||
// Each chunk records the line range so the reviewer can cite "lines N-M".
|
||
function chunkFileLineAware(content: string, target = 800): Array<{
|
||
byte_offset: number;
|
||
line_from: number;
|
||
line_to: number;
|
||
text: string;
|
||
}> {
|
||
const lines = content.split("\n");
|
||
const chunks: Array<{ byte_offset: number; line_from: number; line_to: number; text: string }> = [];
|
||
let buf: string[] = [];
|
||
let bufBytes = 0;
|
||
let chunkStartByte = 0;
|
||
let chunkStartLine = 1;
|
||
let byteCursor = 0;
|
||
for (let i = 0; i < lines.length; i++) {
|
||
const line = lines[i];
|
||
const lineLen = line.length + 1;
|
||
if (bufBytes + lineLen > target && buf.length > 0) {
|
||
chunks.push({
|
||
byte_offset: chunkStartByte,
|
||
line_from: chunkStartLine,
|
||
line_to: i,
|
||
text: buf.join("\n"),
|
||
});
|
||
buf = [];
|
||
bufBytes = 0;
|
||
chunkStartByte = byteCursor;
|
||
chunkStartLine = i + 1;
|
||
}
|
||
buf.push(line);
|
||
bufBytes += lineLen;
|
||
byteCursor += lineLen;
|
||
}
|
||
if (buf.length > 0) {
|
||
chunks.push({
|
||
byte_offset: chunkStartByte,
|
||
line_from: chunkStartLine,
|
||
line_to: lines.length,
|
||
text: buf.join("\n"),
|
||
});
|
||
}
|
||
return chunks;
|
||
}
|
||
|
||
async function buildFileSubstrate(filePath: string, content: string): Promise<FileSubstrate> {
|
||
const anchors = extractAnchors(content);
|
||
const rawChunks = chunkFileLineAware(content, 800);
|
||
log(` substrate: ${anchors.length} anchors · ${rawChunks.length} chunks (line-aware, 800-char target)`);
|
||
|
||
// Embed chunks in batches of 64 (nomic-embed-text handles this well).
|
||
const embeddings: number[][] = [];
|
||
const BATCH = 64;
|
||
for (let i = 0; i < rawChunks.length; i += BATCH) {
|
||
const batch = rawChunks.slice(i, i + BATCH).map((c) => c.text);
|
||
const embs = await embedBatch(batch);
|
||
embeddings.push(...embs);
|
||
}
|
||
const chunks: FileChunk[] = rawChunks.map((c, i) => ({ ...c, embedding: embeddings[i] }));
|
||
|
||
return {
|
||
anchors,
|
||
chunks,
|
||
queryFile: (emb: number[], k: number) =>
|
||
chunks
|
||
.map((c) => ({ c, score: cosine(emb, c.embedding) }))
|
||
.sort((a, b) => b.score - a.score)
|
||
.slice(0, k)
|
||
.map((x) => x.c),
|
||
};
|
||
}
|
||
|
||
// Tree-split a large file: shard it, summarize each shard into a
|
||
// running scratchpad, THEN run a reduce step that collapses the
|
||
// scratchpad into one file-level synthesis with shard boundaries
|
||
// stripped. Returns the synthesis (not the raw scratchpad) so the
|
||
// final reviewer never sees "--- shard N ---" markers and can't
|
||
// leak them into its review title.
|
||
//
|
||
// Phase 21 design (aibridge/src/tree_split.rs) with the map → reduce
|
||
// shape. Earlier version concatenated per-shard digests directly into
|
||
// the reviewer prompt, which led to kimi-k2:1t writing review titles
|
||
// like "Forensic Audit Report – file.rs (shard 3)" because the shard
|
||
// markers bled through. Fix 2026-04-24 adds the reduce step.
|
||
//
|
||
// DEPRECATED 2026-04-24: superseded by buildFileSubstrate() above.
|
||
// Kept temporarily as a fallback if substrate ingestion fails.
|
||
async function treeSplitFile(
|
||
filePath: string,
|
||
content: string,
|
||
): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
|
||
const shards: Array<{ from: number; to: number; text: string }> = [];
|
||
for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
|
||
const end = Math.min(i + FILE_SHARD_SIZE, content.length);
|
||
shards.push({ from: i, to: end, text: content.slice(i, end) });
|
||
}
|
||
|
||
// MAP — each shard digests independently. Previously the prompt
|
||
// carried the accumulating scratchpad of all prior shard outputs,
|
||
// which made MAP cost O(n²) in shard count AND forced late shards
|
||
// to fight for context-window space against the prior notes (on a
|
||
// 209-shard file, the prior-notes block alone hit ~40K tokens). The
|
||
// cost/budget fix: each shard sees only its own text. The reducer
|
||
// integrates the cross-shard view, not MAP.
|
||
//
|
||
// Instruction also changed to require SPECIFIC line/byte markers
|
||
// and identifiers — previous "flat facts" framing produced generic
|
||
// prose summaries where "line 9959: model_sets default contains
|
||
// mistral:latest" collapsed to "the file routes to local models".
|
||
// Scrum iter 11 observation: fine-grained fixes vanished from the
|
||
// reviewer's view because specific-line detail didn't survive MAP.
|
||
let workingScratchpad = "";
|
||
let cloud_calls = 0;
|
||
log(` tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
|
||
for (const [si, shard] of shards.entries()) {
|
||
const prompt = `You are writing a SECTION of a full-file summary. File: ${filePath}. This is one piece (bytes ${shard.from}..${shard.to}) of a larger source file.
|
||
|
||
─────── source ───────
|
||
${shard.text}
|
||
─────── end source ───────
|
||
|
||
Output two parts in order:
|
||
|
||
PART A — Flat-bullet digest (≤200 words):
|
||
- Every function, struct, class, or public type by name with one-line purpose.
|
||
- Every hardcoded default, literal, or model name a caller might override.
|
||
- Every TODO, FIXME, placeholder, or stub return.
|
||
- Every exception handler and what it swallows vs re-raises.
|
||
Do NOT say "this section" or "this shard".
|
||
|
||
PART B — VERBATIM ANCHORS (REQUIRED — 5 to 10 lines copied character-perfect from the source above):
|
||
Format each as a code-fenced block with the byte offset within the shard:
|
||
\`\`\`
|
||
@${shard.from}+OFFSET
|
||
EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE
|
||
\`\`\`
|
||
Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`;
|
||
const r = await chat({
|
||
provider: "ollama_cloud",
|
||
model: "gpt-oss:120b",
|
||
prompt,
|
||
max_tokens: 900,
|
||
});
|
||
cloud_calls += 1;
|
||
if (r.content) {
|
||
// Keep internal alignment markers with byte offsets so the
|
||
// reducer can correlate findings back to file regions.
|
||
workingScratchpad += `\n§bytes ${shard.from}..${shard.to}§\n${r.content.trim()}`;
|
||
}
|
||
}
|
||
|
||
// REDUCE — collapse the per-shard digests into one coherent
|
||
// file-level summary. The reducer sees all digests at once and
|
||
// produces a single narrative the reviewer can treat as "the file".
|
||
// Shard markers are NOT in the output. This is what fixes the
|
||
// shard-leakage bug that affected both the scrum and the auditor.
|
||
// REDUCE — the one place where the cross-shard view comes together.
|
||
// Previous max_tokens=900 asked for 40K tokens → 900 compression,
|
||
// which destroyed specific line references. Raised to 2400 and the
|
||
// prompt now explicitly requires preserving byte-offset markers and
|
||
// concrete literals (hardcoded model names, line snippets, TODOs)
|
||
// so fine-grained findings actually survive to the reviewer.
|
||
//
|
||
// Fix for shard-leakage: the reducer output is the SINGLE source
|
||
// the reviewer sees as "the file" — per prior iter 3 observation
|
||
// ("tree_split_fired:true is supposed to mean reducer-merged summary").
|
||
const reducePrompt = `You are producing a SINGLE coherent file-level summary of a source file from byte-addressed piece notes. Each piece note has TWO parts: a prose digest (PART A) and VERBATIM ANCHORS (PART B — code-fenced blocks with @offset markers and literal source lines).
|
||
|
||
FILE: ${filePath} (${content.length} bytes, ${shards.length} pieces)
|
||
|
||
PIECE NOTES:
|
||
${workingScratchpad}
|
||
|
||
Produce ONE coherent output with TWO sections:
|
||
|
||
═══ NARRATIVE ═══
|
||
- One-sentence purpose of the file.
|
||
- All public types / functions / constants with byte-offset markers like §bytes 24500..28000§.
|
||
- Every hardcoded default, model name, or literal a caller might override — keep the EXACT string.
|
||
- Every TODO / FIXME / stub return / placeholder.
|
||
- Every exception handler and what it does with the error.
|
||
- Obvious invariants.
|
||
Under 1200 words. Do NOT mention "piece N" or "section".
|
||
|
||
═══ VERBATIM ANCHORS ═══
|
||
COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT paraphrase. DO NOT shorten. DO NOT skip any. The reviewer will use these to ground findings — if you elide one, real risks become invisible.
|
||
|
||
Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between. The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`;
|
||
|
||
const reduced = await chat({
|
||
provider: "ollama_cloud",
|
||
model: "gpt-oss:120b",
|
||
prompt: reducePrompt,
|
||
max_tokens: 2400,
|
||
});
|
||
cloud_calls += 1;
|
||
const synthesis = reduced.content?.trim() ?? "";
|
||
|
||
// Safety: if the reducer returned thin output, fall back to the
|
||
// raw scratchpad — with byte markers preserved since the reviewer
|
||
// benefits from offsets regardless of whether they're inside the
|
||
// reducer's narrative or the raw per-piece bullets.
|
||
const final = synthesis.length > 200
|
||
? synthesis
|
||
: workingScratchpad.trim();
|
||
|
||
return { scratchpad: final, shards: shards.length, cloud_calls };
|
||
}
|
||
|
||
async function reviewFile(
|
||
filePath: string,
|
||
prd_chunks: Chunk[],
|
||
proposal_chunks: Chunk[],
|
||
): Promise<FileReview> {
|
||
const t0 = Date.now();
|
||
log(`file: ${filePath}`);
|
||
const content = await readFile(filePath, "utf8");
|
||
const rel = filePath.replace("/home/profit/lakehouse/", "");
|
||
|
||
// Build a query embedding from the first ~800 chars of the file
|
||
// (good enough for topical retrieval).
|
||
const seed = content.slice(0, 800);
|
||
const [seedEmb] = await embedBatch([seed]);
|
||
|
||
const topPrd = retrieveTopK(seedEmb, prd_chunks, TOP_K_CONTEXT);
|
||
const topPlan = retrieveTopK(seedEmb, proposal_chunks, TOP_K_CONTEXT);
|
||
log(` retrieved ${topPrd.length} PRD chunks + ${topPlan.length} proposal chunks`);
|
||
|
||
const contextBlock = [
|
||
"═══ RELEVANT PRD EXCERPTS ═══",
|
||
...topPrd.map(c => `[PRD @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||
"",
|
||
"═══ RELEVANT CHANGE PROPOSAL EXCERPTS ═══",
|
||
...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||
].join("\n\n");
|
||
|
||
// Files bigger than FILE_TREE_SPLIT_THRESHOLD trigger the substrate
|
||
// path: deterministic anchor extraction + per-file vector index.
|
||
// Reviewer sees three zones:
|
||
// ANCHORS — verbatim suspicious lines (regex-extracted, never paraphrased)
|
||
// NEIGHBORS — top-K file chunks retrieved per PRD chunk via cosine
|
||
// PRD/PLAN — already retrieved, kept as-is
|
||
// No LLM-paraphrased prose is shown. Reviewer is required to quote
|
||
// anchors or chunks verbatim; verifier drops findings whose backtick-
|
||
// quoted snippets don't appear in the original source.
|
||
let sourceForPrompt: string;
|
||
let treeSplitFired = false;
|
||
let shardsSummarized = 0;
|
||
let extraCloudCalls = 0;
|
||
let substrateAnchorBlock = "";
|
||
let substrateRetrievedBlock = "";
|
||
if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
|
||
treeSplitFired = true;
|
||
const sub = await buildFileSubstrate(rel, content);
|
||
shardsSummarized = sub.chunks.length;
|
||
// ANCHORS zone — pick representative anchors per kind, cap to ~30
|
||
// to keep the block readable.
|
||
const byKind = new Map<string, AnchorLine[]>();
|
||
for (const a of sub.anchors) {
|
||
const arr = byKind.get(a.kind) || [];
|
||
arr.push(a);
|
||
byKind.set(a.kind, arr);
|
||
}
|
||
const balanced: AnchorLine[] = [];
|
||
const PER_KIND = 4;
|
||
const MAX_ANCHORS = 40;
|
||
for (const [, arr] of byKind) balanced.push(...arr.slice(0, PER_KIND));
|
||
balanced.sort((a, b) => a.byte_offset - b.byte_offset);
|
||
const trimmedAnchors = balanced.slice(0, MAX_ANCHORS);
|
||
substrateAnchorBlock = trimmedAnchors
|
||
.map((a) => `[L${a.line_no} @byte ${a.byte_offset} kind=${a.kind}]\n${a.text}`)
|
||
.join("\n\n");
|
||
log(` substrate anchors selected: ${trimmedAnchors.length}/${sub.anchors.length}`);
|
||
|
||
// NEIGHBORS zone — for each top PRD chunk, pull the top-2 file
|
||
// chunks that semantically match it. Surfaces the actual code
|
||
// regions the PRD's worry-areas point at. Dedup by byte_offset.
|
||
const seen = new Set<number>();
|
||
const neighbors: FileChunk[] = [];
|
||
for (const prdChunk of topPrd) {
|
||
const top = sub.queryFile(prdChunk.embedding, 2);
|
||
for (const fc of top) {
|
||
if (seen.has(fc.byte_offset)) continue;
|
||
seen.add(fc.byte_offset);
|
||
neighbors.push(fc);
|
||
}
|
||
}
|
||
substrateRetrievedBlock = neighbors
|
||
.slice(0, 8)
|
||
.map((c) => `[lines ${c.line_from}-${c.line_to} @byte ${c.byte_offset}]\n${c.text}`)
|
||
.join("\n\n──\n\n");
|
||
log(` substrate neighbors retrieved: ${neighbors.length} (showing top 8)`);
|
||
|
||
sourceForPrompt =
|
||
`═══ ANCHORS (verbatim source lines extracted by regex — quotable) ═══\n${substrateAnchorBlock}\n\n` +
|
||
`═══ NEIGHBORS (file chunks retrieved by similarity to PRD worry-areas — quotable) ═══\n${substrateRetrievedBlock}`;
|
||
} else {
|
||
sourceForPrompt = content;
|
||
}
|
||
|
||
// Prompt — when tree-split fired, include an explicit instruction
|
||
// not to claim a field/function is "missing" because the scratchpad
|
||
// is a distillation not the full file. Plus a hard quote-or-die
|
||
// requirement: every finding MUST quote a literal string from the
|
||
// VERBATIM ANCHORS section. Without this, big-file reviews
|
||
// hallucinate against the PRD worry-list (verified 2026-04-24:
|
||
// 0/10 real findings on 13K-line llm_team_ui.py). The
|
||
// post-acceptance verifier (verifyAnchorGrounding) drops findings
|
||
// whose backtick-quoted strings don't appear in the original
|
||
// source — last-line defense against confabulation.
|
||
const truncationWarning = treeSplitFired
|
||
? `\nIMPORTANT: this is a LARGE file (${content.length} bytes / ${shardsSummarized} chunks). You are NOT seeing the full raw source. You are seeing TWO grounded zones:
|
||
|
||
ANCHORS — regex-extracted verbatim source lines (route defs, auth calls, SQL, secrets, exception handlers, etc.) with line numbers and byte offsets. Every line is character-perfect.
|
||
NEIGHBORS — file chunks retrieved by cosine-similarity to each relevant PRD excerpt. Every chunk is character-perfect source code.
|
||
|
||
QUOTE-OR-DIE RULE — NON-NEGOTIABLE:
|
||
EVERY finding you list MUST include a backtick-quoted snippet of literal source text drawn from the ANCHORS or NEIGHBORS zones. If you cannot quote literal source for a claim, DO NOT make the claim. Generic "this file lacks X" is NOT acceptable when X isn't visibly absent from the anchors/neighbors you can see — instead, if you suspect X is absent, write "could not verify presence of X in retrieved zones" with low confidence rather than asserting it as a critical failure.
|
||
|
||
The pipeline runs a post-process verifier that drops findings whose quoted code doesn't appear in the original source byte-for-byte. Make every claim grounded.\n`
|
||
: "";
|
||
|
||
const forensicPrefix = FORENSIC_PREAMBLE
|
||
? `${FORENSIC_PREAMBLE}\n\n═══ FILE UNDER AUDIT ═══\n\n`
|
||
: "";
|
||
|
||
const baseTask = `${forensicPrefix}You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.
|
||
|
||
FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
|
||
${truncationWarning}
|
||
─────── source ───────
|
||
${sourceForPrompt}
|
||
─────── end source ───────
|
||
|
||
${contextBlock}
|
||
|
||
Produce a structured review with:
|
||
1. Alignment score (1-10) between this file and the PRD intent
|
||
2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change
|
||
3. Any gap where this file's behavior contradicts the PRD or the proposal
|
||
${FORENSIC_PREAMBLE ? "4. Apply the forensic audit passes from the preamble: pseudocode detection, PRD contract status, normalization/validation pipeline, failure→repair loop, execution memory, relevance orchestration, execution safety, testing evidence. Issue a verdict pass|needs_patch|fail." : ""}
|
||
|
||
**Per-finding confidence (required on every suggestion):**
|
||
Attach a self-assessed **Confidence: NN%** to every suggested change AND every gap you list. The percentage is your belief that the suggestion is correct, will compile, and lands the PRD intent. Calibration guide:
|
||
- 90-100%: pattern seen repeatedly in shipped code; change is mechanical; low risk of regressions
|
||
- 70-89%: confident in direction, some room for interpretation on API shape or naming
|
||
- 50-69%: plausible fix but may not match existing conventions or may cascade to other files
|
||
- <50%: genuinely uncertain — include regardless so downstream knows to investigate before applying
|
||
Format each finding as: \`**1.** <change>. **Confidence: NN%.**\` (in tables, add a final "Confidence" column.) Low confidence is valuable signal — do not round up.
|
||
|
||
**Per-finding semantic-flag tag (ADR-021, required on every finding):**
|
||
Also attach a \`**Flag: <CATEGORY>**\` on each finding so the pathway-memory matrix index can cluster bug classes over time. Pick the ONE tag that best fits; if none fits, use \`None\`. Allowed categories:
|
||
- \`UnitMismatch\` — operation combines values with different units (e.g. row_count - file_count, bytes - rows)
|
||
- \`TypeConfusion\` — same type, wrong role (e.g. treating a PK as a row index)
|
||
- \`NullableConfusion\` — unwrap-without-check or nullable-treated-as-non-null
|
||
- \`OffByOne\` — loop / range / slice boundary mistake
|
||
- \`StaleReference\` — calls a deprecated / removed / moved symbol
|
||
- \`PseudoImpl\` — stub / todo!() / function named for work it doesn't do
|
||
- \`DeadCode\` — unreachable or uncalled code
|
||
- \`WarningNoise\` — compiles green but would add a cargo warning
|
||
- \`BoundaryViolation\` — crosses a crate/layer boundary it shouldn't
|
||
- \`None\` — improvement or nicety that doesn't fit a bug category
|
||
|
||
In tables, add a "Flag" column. Examples:
|
||
\`**1.** Rewrite base_rows calc. **Confidence: 90%.** **Flag: UnitMismatch.**\`
|
||
\`**2.** Extract retry loop. **Confidence: 75%.** **Flag: None.**\`
|
||
|
||
Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`;
|
||
|
||
const history: FileReview["attempts_history"] = [];
|
||
let accepted: string | null = null;
|
||
let acceptedModel = "";
|
||
let acceptedOn = 0;
|
||
|
||
// Pathway hot-swap pre-check. If a proven pathway exists for this
|
||
// (task, file_prefix, signal) combo with ≥3 replays at ≥80% success,
|
||
// skip the ladder and try its winning rung first. On success we
|
||
// record a positive replay; on failure we fall through to the full
|
||
// ladder and record a negative replay. Fire-and-forget — pathway
|
||
// service unavailable → null candidate → business as usual.
|
||
const signalClass = await lookupSignalClass(rel);
|
||
const taskClass = "scrum_review";
|
||
const hotSwap = await queryHotSwap(taskClass, rel, signalClass);
|
||
|
||
// ADR-021 Phase C: pre-review enrichment. Pull aggregated bug
|
||
// fingerprints the matrix index has learned for this narrow
|
||
// fingerprint and prepend to the reviewer prompt as historical
|
||
// context. This is the compounding mechanism — iter-N reviewer
|
||
// sees what iter-(N-1) and earlier found, so the grammar of bugs
|
||
// accumulates instead of being re-discovered each iteration.
|
||
const pastFingerprints = await fetchBugFingerprints(taskClass, rel, signalClass, 5);
|
||
let pathwayPreamble = "";
|
||
if (pastFingerprints.length > 0) {
|
||
pathwayPreamble = "═══ PATHWAY MEMORY — BUGS PREVIOUSLY FOUND ON THIS FILE AREA (ADR-021) ═══\n" +
|
||
"The matrix index has flagged these patterns on the same task_class + file_prefix + signal_class before. Check this file for recurrences of the same shape:\n\n" +
|
||
pastFingerprints.map((fp, i) =>
|
||
`${i + 1}. [${fp.flag.kind}] pattern=\`${fp.pattern_key}\` occurrences=${fp.occurrences}\n example: ${fp.example.slice(0, 160)}`
|
||
).join("\n") +
|
||
"\n═══\n\n";
|
||
log(` 📚 pathway memory: ${pastFingerprints.length} historical bug pattern(s) prepended to prompt`);
|
||
}
|
||
let hotSwapOrderedIndices: number[] | null = null;
|
||
if (hotSwap) {
|
||
// Reorder the ladder to try the recommended model first. Rung
|
||
// indices are preserved in the output so the trace still reflects
|
||
// the true ladder position the model sits at.
|
||
const recommendedIdx = LADDER.findIndex(r => r.model === hotSwap.recommended_model);
|
||
if (recommendedIdx >= 0) {
|
||
log(` 🔥 hot-swap candidate: ${hotSwap.recommended_model} (rung ${hotSwap.recommended_rung}, sim=${hotSwap.similarity.toFixed(3)}, success_rate=${hotSwap.success_rate.toFixed(2)}, ${hotSwap.replay_count} replays)`);
|
||
hotSwapOrderedIndices = [recommendedIdx, ...LADDER.map((_, i) => i).filter(i => i !== recommendedIdx)];
|
||
}
|
||
}
|
||
const ladderOrder = hotSwapOrderedIndices ?? LADDER.map((_, i) => i);
|
||
|
||
// Collect attempts for the pathway trace sidecar.
|
||
const pathwayAttempts: LadderAttemptRec[] = [];
|
||
|
||
for (let step = 0; step < MAX_ATTEMPTS; step++) {
|
||
const i = ladderOrder[step];
|
||
const n = step + 1;
|
||
const rung = LADDER[i];
|
||
const learning = history.length > 0
|
||
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
|
||
: "";
|
||
|
||
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`);
|
||
const attemptStarted = Date.now();
|
||
const r = await chat({
|
||
provider: rung.provider,
|
||
model: rung.model,
|
||
prompt: pathwayPreamble + baseTask + learning,
|
||
max_tokens: 1500,
|
||
});
|
||
const attemptMs = Date.now() - attemptStarted;
|
||
|
||
if (r.error) {
|
||
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
|
||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
|
||
log(` ✗ error: ${r.error.slice(0, 80)}`);
|
||
continue;
|
||
}
|
||
if (!isAcceptable(r.content)) {
|
||
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
|
||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
|
||
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
||
continue;
|
||
}
|
||
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: true, reject_reason: null });
|
||
accepted = r.content;
|
||
acceptedModel = `${rung.provider}/${rung.model}`;
|
||
acceptedOn = n;
|
||
// Post-acceptance: when tree-split fired, run the anchor-grounding
|
||
// verifier and append a footer with the grounding rate. The footer
|
||
// surfaces ungrounded quotes so humans can spot hallucinated
|
||
// findings at a glance — prevents the 0/10 confabulation pattern
|
||
// observed on llm_team_ui.py 2026-04-24.
|
||
if (treeSplitFired) {
|
||
const stats = verifyAnchorGrounding(accepted, content);
|
||
log(` ⚓ anchor grounding: ${stats.grounded}/${stats.total} quotes matched source` +
|
||
(stats.groundedPct !== null ? ` (${stats.groundedPct}%)` : ""));
|
||
accepted = appendGroundingFooter(accepted, stats);
|
||
}
|
||
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
|
||
break;
|
||
}
|
||
|
||
// Hot-swap bookkeeping: if we tried the recommended model first,
|
||
// report whether it worked so the pathway's success_rate updates.
|
||
if (hotSwap) {
|
||
const replaySucceeded = acceptedModel.endsWith(`/${hotSwap.recommended_model}`);
|
||
log(` pathway replay ${replaySucceeded ? "✓" : "✗"} (${hotSwap.pathway_id.slice(0, 12)}…)`);
|
||
// Fire and forget — don't await; observer can handle it.
|
||
recordPathwayReplay(hotSwap.pathway_id, replaySucceeded);
|
||
}
|
||
|
||
const review: FileReview = {
|
||
file: rel,
|
||
file_bytes: content.length,
|
||
tree_split_fired: treeSplitFired,
|
||
shards_summarized: shardsSummarized,
|
||
top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||
top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||
attempts_made: history.length,
|
||
attempts_history: history,
|
||
accepted_on: acceptedOn || null,
|
||
escalated_to_model: acceptedModel,
|
||
suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
|
||
duration_ms: Date.now() - t0,
|
||
};
|
||
|
||
// Append to the shared scrum-reviews jsonl so the auditor's
|
||
// kb_query check can surface relevant reviews for files in a
|
||
// PR diff. Cohesion plan Phase C wire.
|
||
if (accepted) {
|
||
const { appendFile, mkdir } = await import("node:fs/promises");
|
||
const { dirname } = await import("node:path");
|
||
await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });
|
||
|
||
// Extract per-finding confidences from the accepted markdown.
|
||
// Patterns tried: "Confidence: NN%", "Confidence**: NN%",
|
||
// and table-cell "| 92% |". Cap at 20 matches to bound row size.
|
||
// Added 2026-04-23 (iter 2 direction from J: "make scrum output
|
||
// include self-assessed confidence per finding").
|
||
const confidences: number[] = [];
|
||
// Markdown format: "Confidence: 92%" / "Confidence**: 92%" / "| 92% |"
|
||
const patMarkdown = /(?:Confidence[*:\s]*\s*|\|\s*)(\d{1,3})\s*%/gi;
|
||
// JSON format (forensic strict output): "confidence": 92
|
||
const patJson = /"confidence"\s*:\s*(\d{1,3})(?!\d)/gi;
|
||
for (const pat of [patMarkdown, patJson]) {
|
||
const matches = accepted.matchAll(pat);
|
||
for (const hit of matches) {
|
||
if (confidences.length >= 40) break;
|
||
const pct = parseInt(hit[1], 10);
|
||
if (pct >= 0 && pct <= 100) confidences.push(pct);
|
||
}
|
||
}
|
||
const conf_avg = confidences.length
|
||
? Math.round(confidences.reduce((a, b) => a + b, 0) / confidences.length)
|
||
: null;
|
||
const conf_min = confidences.length ? Math.min(...confidences) : null;
|
||
|
||
// ADR-021 Phase B: extract per-finding semantic flags. Reviewer is
|
||
// prompted to tag each finding with one of 9 categories plus None.
|
||
// Patterns: "**Flag: UnitMismatch**", "Flag: OffByOne", table cell
|
||
// with the flag word, or bare-word match. Deduplicated per-trace
|
||
// so repeats in one review count once.
|
||
const FLAG_VARIANTS = [
|
||
"UnitMismatch", "TypeConfusion", "NullableConfusion", "OffByOne",
|
||
"StaleReference", "PseudoImpl", "DeadCode", "WarningNoise", "BoundaryViolation",
|
||
];
|
||
const flagMatches = new Set<string>();
|
||
// Prefer matches anchored to the "Flag:" keyword; fall back to
|
||
// bare-word matches so older reviewers that mention a category
|
||
// without the "Flag:" prefix still contribute signal.
|
||
const patFlagLabeled = /(?:Flag[*:\s]*\s*)([A-Z][A-Za-z]+)/g;
|
||
for (const m of accepted.matchAll(patFlagLabeled)) {
|
||
if (FLAG_VARIANTS.includes(m[1])) flagMatches.add(m[1]);
|
||
}
|
||
// Second pass — bare-word matches for each variant, but ONLY if
|
||
// the labeled pass produced nothing. This avoids flagging every
|
||
// file that happens to mention "DeadCode" in a code sample.
|
||
if (flagMatches.size === 0) {
|
||
for (const v of FLAG_VARIANTS) {
|
||
const re = new RegExp(`\\b${v}\\b`);
|
||
if (re.test(accepted)) flagMatches.add(v);
|
||
}
|
||
}
|
||
const semantic_flags_arr = [...flagMatches].map(k => ({ kind: k }));
|
||
|
||
// ADR-021 Phase D: bug_fingerprint extraction.
|
||
//
|
||
// Walk per-finding rows (either table format with columns
|
||
// `Change | Flag | Confidence` OR bullet-list with inline
|
||
// `**Flag: X.**` tag) and pair each flag with the surrounding
|
||
// finding text. Then derive a stable pattern_key from code
|
||
// identifiers the finding cites in backticks, so future reviews
|
||
// of similar bugs cluster under the same key.
|
||
//
|
||
// v1 is heuristic (regex + identifier extraction + canonical
|
||
// sort). It's intentionally NOT a semantic extractor — just a
|
||
// deterministic "take the top code-shaped tokens and hash them
|
||
// with the flag." Stability comes from sorting tokens alphabetically
|
||
// before hashing so "row_count + QueryResponse" and "QueryResponse
|
||
// + row_count" produce the same key.
|
||
const bug_fingerprints_arr: Array<{
|
||
flag: { kind: string };
|
||
pattern_key: string;
|
||
example: string;
|
||
occurrences: number;
|
||
}> = [];
|
||
{
|
||
// Split into candidate finding blocks. Both formats are row-
|
||
// oriented, so a line split is a reasonable starting point.
|
||
// Findings tend to be one-line table rows OR multi-line bullets
|
||
// starting with **N.** — we handle both by looking at any line
|
||
// that mentions a Flag variant and treating it as a finding.
|
||
const lines = accepted.split(/\r?\n/);
|
||
const seenKeys = new Set<string>();
|
||
for (const line of lines) {
|
||
// Find the flag variant on this line (if any).
|
||
let variantOnLine: string | null = null;
|
||
for (const v of FLAG_VARIANTS) {
|
||
const re = new RegExp(`\\b${v}\\b`);
|
||
if (re.test(line)) { variantOnLine = v; break; }
|
||
}
|
||
if (!variantOnLine) continue;
|
||
|
||
// Extract identifier-shaped tokens from backticks. We try two
|
||
// levels: (a) whole-backtick match if it's a clean identifier
|
||
// or path, (b) for complex content like function signatures
|
||
// (`Foo::bar(&self) -> u64`) pull out the longest identifier
|
||
// substrings so we still capture the callable.
|
||
const codeTokens: string[] = [];
|
||
const idRe = /[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*/g;
|
||
for (const m of line.matchAll(/`([^`]+)`/g)) {
|
||
const raw = m[1].trim();
|
||
// Whole-backtick identifier or dotted path? (`row_count`,
|
||
// `AccessControl::can_access`, `foo.bar`).
|
||
if (/^[A-Za-z_][A-Za-z0-9_:]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?$/.test(raw)) {
|
||
if (raw.length >= 3) codeTokens.push(raw);
|
||
continue;
|
||
}
|
||
// Fallback: scan for identifier substrings, take the longest
|
||
// meaningful ones (usually the function or type name comes
|
||
// first in a signature like `Foo::bar(&self)`).
|
||
const ids = [...raw.matchAll(idRe)]
|
||
.map(x => x[0])
|
||
.filter(id => id.length >= 3);
|
||
// Prefer ::-qualified paths first (they're more specific),
|
||
// then the top-2 longest; keeps the key stable under
|
||
// signature variation.
|
||
const ranked = ids
|
||
.map(id => ({ id, score: (id.includes("::") ? 1000 : 0) + id.length }))
|
||
.sort((a, b) => b.score - a.score)
|
||
.slice(0, 2)
|
||
.map(x => x.id);
|
||
codeTokens.push(...ranked);
|
||
}
|
||
// Remove the flag variant name itself if it got captured (kimi
|
||
// and other reviewers often wrap the flag column in backticks).
|
||
// Also drop Rust + common keywords that slip through the
|
||
// identifier regex — "self", "mut", "async", "await", "pub"
|
||
// aren't bug-shape signal, they're grammar.
|
||
const FLAG_SET = new Set(FLAG_VARIANTS);
|
||
const KEYWORDS = new Set([
|
||
"self", "Self", "mut", "async", "await", "pub", "fn", "let",
|
||
"const", "static", "impl", "trait", "struct", "enum", "use",
|
||
"mod", "crate", "super", "match", "return", "Some", "None",
|
||
"Ok", "Err", "true", "false",
|
||
// Markdown table column headers kimi outputs for structured
|
||
// reviews — "Flag" / "Change" / "Confidence" are layout words,
|
||
// not identifiers. Seen as noise in iter 11 vectord extraction
|
||
// ("DeadCode:Flag" pattern_key).
|
||
"Flag", "Change", "Confidence", "PRD", "Plan",
|
||
]);
|
||
const filtered = codeTokens.filter(t => !FLAG_SET.has(t) && !KEYWORDS.has(t));
|
||
if (filtered.length === 0) continue;
|
||
|
||
// Canonicalize: dedupe, sort alphabetically, take top 3.
|
||
// Alphabetical sort gives stability across "A then B" / "B then A"
|
||
// variants. Top 3 keeps the key short while retaining enough
|
||
// signal for different bugs to separate.
|
||
const uniqTokens = [...new Set(filtered)].sort().slice(0, 3);
|
||
const pattern_key = `${variantOnLine}:${uniqTokens.join("-")}`;
|
||
if (seenKeys.has(pattern_key)) continue;
|
||
seenKeys.add(pattern_key);
|
||
|
||
// Example: the finding line, trimmed + truncated. Preserves
|
||
// just enough context that the pre-review preamble in the
|
||
// next iter can quote it back to the reviewer meaningfully.
|
||
const example = line.replace(/\s+/g, " ").trim().slice(0, 200);
|
||
|
||
bug_fingerprints_arr.push({
|
||
flag: { kind: variantOnLine },
|
||
pattern_key,
|
||
example,
|
||
occurrences: 1,
|
||
});
|
||
}
|
||
}
|
||
|
||
// Score extraction — regex accepts decimals ("Score: 4.5/10") and
|
||
// surrounding punctuation ("4/10 — mid"). iter 3 had 4 unparseable
|
||
// scores because the prior regex /(\d)\s*\/\s*10/ missed decimals.
|
||
const scoreMatch = accepted.match(/(?:score[\s*:]*)?(\d(?:\.\d)?)\s*\/\s*10\b/i);
|
||
const alignment_score = scoreMatch ? parseFloat(scoreMatch[1]) : null;
|
||
|
||
// Forensic JSON extraction — iter 3 showed 20/21 files came back
|
||
// as JSON (verdict + critical_failures[] + verified_components[] + ...)
|
||
// rather than markdown tables. Previously we only stored suggestions_preview
|
||
// (truncated to 2KB); now we also capture the structured counters so
|
||
// consumers can filter by verdict, sort by critical_failures_count, etc.
|
||
let verdict: string | null = null;
|
||
let critical_failures_count = 0;
|
||
let pseudocode_flags_count = 0;
|
||
let prd_mismatches_count = 0;
|
||
let missing_components_count = 0;
|
||
let verified_components_count = 0;
|
||
let risk_points_count = 0;
|
||
const isJsonShape = accepted.includes('"verdict"');
|
||
if (isJsonShape) {
|
||
const vm = accepted.match(/"verdict"\s*:\s*"([a-z_]+)"/i);
|
||
verdict = vm ? vm[1] : null;
|
||
// Count object entries per array by counting occurrences of
|
||
// either a unique-per-entry field name or {...} bracket pairs
|
||
// inside the array span. A straight "count opening braces inside
|
||
// the array range" is simplest and robust to field order.
|
||
const countArrayEntries = (arrayName: string): number => {
|
||
const re = new RegExp(`"${arrayName}"\\s*:\\s*\\[([\\s\\S]*?)\\]`, "i");
|
||
const m = accepted.match(re);
|
||
if (!m || !m[1].trim()) return 0;
|
||
// Count opening braces of direct-child objects.
|
||
let depth = 0, entries = 0;
|
||
for (const ch of m[1]) {
|
||
if (ch === '{') { if (depth === 0) entries++; depth++; }
|
||
else if (ch === '}') depth--;
|
||
}
|
||
return entries;
|
||
};
|
||
critical_failures_count = countArrayEntries("critical_failures");
|
||
pseudocode_flags_count = countArrayEntries("pseudocode_flags");
|
||
prd_mismatches_count = countArrayEntries("prd_mismatches");
|
||
missing_components_count = countArrayEntries("missing_components");
|
||
verified_components_count = countArrayEntries("verified_components");
|
||
risk_points_count = countArrayEntries("risk_points");
|
||
}
|
||
|
||
// Permission Gradient (Layer #6 from SYSTEM_EVOLUTION_LAYERS.md).
|
||
// Classify the overall finding set by confidence_avg:
|
||
// ≥90 auto-apply-safe, ≥70 dry-run + diff, ≥50 simulation only,
|
||
// <50 block (human review). Use conf_min as the tier-lower-bound
|
||
// so one shaky finding drags the whole row down to the safer tier.
|
||
const tierFor = (c: number | null): string => {
|
||
if (c === null) return "unknown";
|
||
if (c >= 90) return "auto";
|
||
if (c >= 70) return "dry_run";
|
||
if (c >= 50) return "simulation";
|
||
return "block";
|
||
};
|
||
const gradient_tier = tierFor(conf_min); // conservative: weakest finding decides
|
||
const gradient_tier_avg = tierFor(conf_avg);
|
||
|
||
const row = {
|
||
file: rel,
|
||
reviewed_at: new Date().toISOString(),
|
||
accepted_model: acceptedModel,
|
||
accepted_on_attempt: acceptedOn,
|
||
attempts_made: history.length,
|
||
tree_split_fired: treeSplitFired,
|
||
suggestions_preview: accepted.slice(0, 2000),
|
||
// Iter-3+ confidence fields.
|
||
confidences_per_finding: confidences,
|
||
confidence_avg: conf_avg,
|
||
confidence_min: conf_min,
|
||
findings_count: confidences.length,
|
||
// Layer #6 Permission Gradient — downstream consumers decide
|
||
// apply-semantics based on these fields instead of blindly trusting
|
||
// every suggestion.
|
||
gradient_tier,
|
||
gradient_tier_avg,
|
||
// Score (decimal-aware) and forensic JSON structured counters.
|
||
// iter 4+ fields (schema_version 4).
|
||
alignment_score,
|
||
output_format: isJsonShape ? "forensic_json" : "markdown",
|
||
verdict,
|
||
critical_failures_count,
|
||
pseudocode_flags_count,
|
||
prd_mismatches_count,
|
||
missing_components_count,
|
||
verified_components_count,
|
||
risk_points_count,
|
||
schema_version: 4,
|
||
scrum_master_reviewed: true,
|
||
};
|
||
|
||
// Layer #2 Model Trust Profiling — append one row per file-accept
|
||
// so over time we can compute per-(model, task_type) success/thin
|
||
// rates and trust scores. task_type here is inferred from the file
|
||
// path — good enough for initial stats, refine later.
|
||
try {
|
||
const taskType = rel.includes("/truth/") ? "truth"
|
||
: rel.includes("/gateway/") ? "gateway"
|
||
: rel.includes("/vectord") ? "vector"
|
||
: rel.includes("/ingestd") ? "ingest"
|
||
: rel.includes("/queryd") ? "query"
|
||
: rel.includes("/storaged") || rel.includes("/catalogd") || rel.includes("/journald") ? "storage"
|
||
: rel.includes("/aibridge") ? "aibridge"
|
||
: "other";
|
||
const trustRow = {
|
||
run_id: OUT_DIR.split("/").pop(),
|
||
reviewed_at: row.reviewed_at,
|
||
file: rel,
|
||
task_type: taskType,
|
||
accepted_model: acceptedModel,
|
||
accepted_on_attempt: acceptedOn,
|
||
attempts_made: history.length,
|
||
thin_rejections: history.filter(h => h.status === "thin").length,
|
||
errors: history.filter(h => h.status === "error").length,
|
||
confidence_avg: conf_avg,
|
||
tree_split_fired: treeSplitFired,
|
||
};
|
||
await appendFile("/home/profit/lakehouse/data/_kb/model_trust.jsonl",
|
||
JSON.stringify(trustRow) + "\n");
|
||
} catch (e) {
|
||
console.error(`[scrum] model_trust append failed: ${(e as Error).message}`);
|
||
}
|
||
try {
|
||
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
|
||
} catch (e) {
|
||
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
|
||
}
|
||
|
||
// Pathway trace sidecar (consensus-designed 2026-04-24). Captures
|
||
// FULL context (ladder attempts, KB chunks, observer signal, verdict)
|
||
// for similarity-based hot-swap in future iterations. First-review
|
||
// pathways start in probation (replay_count=0); they become
|
||
// hot-swappable only after ≥3 replays at ≥80% success.
|
||
try {
|
||
const pathwayTrace: PathwayTracePayload = {
|
||
pathway_id: computePathwayId(taskClass, rel, signalClass),
|
||
task_class: taskClass,
|
||
file_path: rel,
|
||
signal_class: signalClass,
|
||
created_at: row.reviewed_at,
|
||
ladder_attempts: pathwayAttempts,
|
||
kb_chunks: [
|
||
...topPrd.map((c, idx) => ({
|
||
source_doc: "PRD.md", chunk_id: `prd@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx,
|
||
})),
|
||
...topPlan.map((c, idx) => ({
|
||
source_doc: "cohesion_plan", chunk_id: `plan@${c.offset}`, cosine_score: (c as any)._score ?? 0, rank: idx,
|
||
})),
|
||
],
|
||
observer_signals: signalClass ? [{ class: signalClass, priors: [], prior_iter_outcomes: [] }] : [],
|
||
bridge_hits: [], // context7 not wired into scrum yet; empty for v1
|
||
sub_pipeline_calls: [], // LLM Team extract happens after this row; out of scope for v1
|
||
audit_consensus: null, // set by auditor's later N=3 pass, via /pathway/insert update
|
||
reducer_summary: accepted.slice(0, 4000),
|
||
final_verdict: verdict ?? "accepted",
|
||
// Vec built from the full attempts/chunks — richer than the
|
||
// query-time vector. The similarity gate will still discriminate
|
||
// between pathways with the same fingerprint but different
|
||
// ladder/KB profiles.
|
||
// Include semantic flag tokens in the embedding so traces with
|
||
// different bug histories cluster separately — matches Rust's
|
||
// build_pathway_vec exactly (flag:<Variant> token shape).
|
||
pathway_vec: buildPathwayVec([
|
||
taskClass,
|
||
rel,
|
||
...(signalClass ? [`signal:${signalClass}`] : []),
|
||
...pathwayAttempts.flatMap(a => [`rung:${a.rung}`, `model:${a.model}`, `accepted:${a.accepted}`]),
|
||
...topPrd.map(c => `kb:PRD.md`),
|
||
...topPlan.map(c => `kb:cohesion_plan`),
|
||
...semantic_flags_arr.map(f => `flag:${f.kind}`),
|
||
]),
|
||
semantic_flags: semantic_flags_arr,
|
||
type_hints_used: [], // Phase E — pre-review enrichment from catalogd/arrow/truth
|
||
bug_fingerprints: bug_fingerprints_arr, // ADR-021 Phase D
|
||
replay_count: 0,
|
||
replays_succeeded: 0,
|
||
retired: false,
|
||
};
|
||
writePathwayTrace(pathwayTrace); // fire-and-forget
|
||
} catch (e) {
|
||
console.error(`[scrum] pathway trace failed: ${(e as Error).message}`);
|
||
}
|
||
|
||
// Close the scrum → observer loop (fix 2026-04-24). Architecture
|
||
// audit surfaced: observer ring had 2000 ops, 1999 from Langfuse,
|
||
// zero from scrum. Observer's analyzeErrors + PLAYBOOK_BUILDER loops
|
||
// were blind to the very pipeline most likely to teach them. One
|
||
// fire-and-forget POST wires them in. Observer tolerates unreachable
|
||
// backends; no scrum run fails if observer is down.
|
||
//
|
||
// Schema matches observer's ObservedOp shape (source, staffer_id,
|
||
// sig_hash, event_kind, success, ...). file + accepted_model +
|
||
// confidence_avg + gradient_tier give downstream analyzers enough
|
||
// signal to correlate reviews with later regressions.
|
||
try {
|
||
const sigHash = createHash("sha256")
|
||
.update(`${rel}|${OUT_DIR.split("/").pop()}`)
|
||
.digest("hex")
|
||
.slice(0, 16);
|
||
fetch("http://localhost:3800/event", {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({
|
||
source: "scrum",
|
||
staffer_id: "scrum_master",
|
||
sig_hash: sigHash,
|
||
event_kind: "file_review",
|
||
success: true,
|
||
run_id: OUT_DIR.split("/").pop(),
|
||
file: rel,
|
||
accepted_model: acceptedModel,
|
||
accepted_on_attempt: acceptedOn,
|
||
attempts_made: history.length,
|
||
thin_rejections: history.filter(h => h.status === "thin").length,
|
||
confidence_avg: conf_avg,
|
||
confidence_min: conf_min,
|
||
findings_count: confidences.length,
|
||
gradient_tier,
|
||
tree_split_fired: treeSplitFired,
|
||
// iter4+ forensic-JSON fields so observer's analyzer can
|
||
// route by verdict / sort by critical_failures_count
|
||
alignment_score,
|
||
verdict,
|
||
output_format: isJsonShape ? "forensic_json" : "markdown",
|
||
critical_failures_count,
|
||
verified_components_count,
|
||
missing_components_count,
|
||
// Pathway fields: emitted on every review so the observer
|
||
// can build a full picture of hot-swap performance over time.
|
||
// `pathway_hot_swap_hit` flags whether the first-tried rung
|
||
// this review was a pathway recommendation vs the default
|
||
// ladder top. `rungs_saved` quantifies the compute we avoided
|
||
// when a hot-swap landed — this is the value metric the VCP
|
||
// UI surfaces ("avg_rungs_saved_per_commit").
|
||
pathway_hot_swap_hit: hotSwap !== null,
|
||
pathway_id: hotSwap?.pathway_id ?? null,
|
||
pathway_similarity: hotSwap?.similarity ?? null,
|
||
pathway_success_rate: hotSwap?.success_rate ?? null,
|
||
rungs_saved: hotSwap && acceptedModel.endsWith(`/${hotSwap.recommended_model}`)
|
||
? Math.max(0, hotSwap.recommended_rung - 1)
|
||
: 0,
|
||
ts: row.reviewed_at,
|
||
}),
|
||
signal: AbortSignal.timeout(3000),
|
||
}).catch(() => {
|
||
// observer down — not a scrum-run failure, just lose the signal.
|
||
});
|
||
} catch (e) {
|
||
// Synchronous construction error — ignore.
|
||
}
|
||
|
||
// Route the accepted review through llm_team's fact extractor so
|
||
// its entities + relationships land in audit_facts.jsonl alongside
|
||
// inference-side extractions. Same index, two sources. Tagged
|
||
// source:"scrum_review" + scrum_master_reviewed:true so downstream
|
||
// queries can filter by provenance. Reviews shorter than 120
|
||
// chars are skipped — they're usually one-liners ("LGTM") with
|
||
// no extractable knowledge.
|
||
if (accepted.length >= 120 && process.env.LH_SCRUM_SKIP_EXTRACT !== "1") {
|
||
try {
|
||
const { extractFacts } = await import("../../auditor/fact_extractor.ts");
|
||
const ex = await extractFacts(accepted);
|
||
if (!ex.error || ex.entities.length + ex.facts.length > 0) {
|
||
const factRow = {
|
||
pr_number: 0, // scrum runs outside a PR scope
|
||
file: rel,
|
||
head_sha: "", // no SHA scope; scope is the file+timestamp
|
||
extracted_at: ex.extracted_at,
|
||
extractor: ex.extractor_model,
|
||
verifier: ex.verifier_model,
|
||
llm_team_run_id: ex.llm_team_run_id ?? null,
|
||
facts: ex.facts,
|
||
entities: ex.entities,
|
||
relationships: ex.relationships,
|
||
verification_preview: ex.verification.slice(0, 400),
|
||
schema_version: 2,
|
||
source: "scrum_review",
|
||
scrum_master_reviewed: true,
|
||
};
|
||
const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
|
||
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(factRow) + "\n");
|
||
}
|
||
} catch (e) {
|
||
console.error(`[scrum] fact extraction failed for ${rel}: ${(e as Error).message}`);
|
||
}
|
||
}
|
||
}
|
||
|
||
return review;
|
||
}
|
||
|
||
async function loadAndChunk(path: string, origin_tag: string): Promise<Chunk[]> {
|
||
const text = await readFile(path, "utf8");
|
||
const raw = chunkText(text);
|
||
const embs = await embedBatch(raw.map(r => r.text));
|
||
return raw.map((r, i) => ({
|
||
id: createHash("sha256").update(r.text).digest("hex").slice(0, 10),
|
||
text: r.text,
|
||
embedding: embs[i],
|
||
origin: origin_tag,
|
||
offset: r.offset,
|
||
}));
|
||
}
|
||
|
||
async function main() {
|
||
await mkdir(OUT_DIR, { recursive: true });
|
||
log(`output: ${OUT_DIR}`);
|
||
log(`targets: ${TARGET_FILES.length} files`);
|
||
|
||
log("loading + embedding PRD...");
|
||
const prd_chunks = await loadAndChunk(PRD_PATH, "PRD");
|
||
log(` PRD: ${prd_chunks.length} chunks`);
|
||
|
||
log("loading + embedding cohesion plan...");
|
||
const plan_chunks = await loadAndChunk(PROPOSAL_PATH, "COHESION_PLAN");
|
||
log(` plan: ${plan_chunks.length} chunks`);
|
||
|
||
log("");
|
||
log("─── scrum master: walking target files ───");
|
||
|
||
const reviews: FileReview[] = [];
|
||
for (const f of TARGET_FILES) {
|
||
const review = await reviewFile(f, prd_chunks, plan_chunks);
|
||
reviews.push(review);
|
||
await writeFile(
|
||
`${OUT_DIR}/review_${review.file.replace(/\//g, "_")}.json`,
|
||
JSON.stringify(review, null, 2),
|
||
);
|
||
log(` → ${review.file}: ${review.accepted_on ? `accepted on ${review.accepted_on} by ${review.escalated_to_model}` : "UNRESOLVED"} (${review.duration_ms}ms)`);
|
||
}
|
||
|
||
// Consolidated scrum-master report
|
||
const report_md: string[] = [];
|
||
report_md.push(`# Scrum-master review\n`);
|
||
report_md.push(`Generated: ${new Date().toISOString()}`);
|
||
report_md.push(`Files reviewed: ${reviews.length}`);
|
||
report_md.push(`Total duration: ${(reviews.reduce((s, r) => s + r.duration_ms, 0) / 1000).toFixed(1)}s\n`);
|
||
for (const r of reviews) {
|
||
report_md.push(`\n## ${r.file}`);
|
||
report_md.push(`- **Accepted on attempt:** ${r.accepted_on ?? "NOT resolved after 6 attempts"}`);
|
||
report_md.push(`- **Escalated to:** \`${r.escalated_to_model || "—"}\``);
|
||
report_md.push(`- **Total attempts:** ${r.attempts_made}`);
|
||
if (r.attempts_history.length > 1) {
|
||
report_md.push(`- **Attempt history:**`);
|
||
for (const h of r.attempts_history) {
|
||
report_md.push(` - ${h.n}: \`${h.model}\` → ${h.status}${h.error ? ` (${h.error.slice(0, 100)})` : ""}`);
|
||
}
|
||
}
|
||
report_md.push(`\n### Suggestions\n\n${r.suggestions}\n`);
|
||
}
|
||
await writeFile(`${OUT_DIR}/scrum_report.md`, report_md.join("\n"));
|
||
|
||
const summary = {
|
||
ran_at: new Date().toISOString(),
|
||
target_count: TARGET_FILES.length,
|
||
resolved: reviews.filter(r => r.accepted_on !== null).length,
|
||
total_attempts: reviews.reduce((s, r) => s + r.attempts_made, 0),
|
||
total_duration_ms: reviews.reduce((s, r) => s + r.duration_ms, 0),
|
||
per_file: reviews.map(r => ({ file: r.file, accepted_on: r.accepted_on, model: r.escalated_to_model, attempts: r.attempts_made, ms: r.duration_ms })),
|
||
};
|
||
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
|
||
|
||
log("");
|
||
log("═══ SCRUM REPORT ═══");
|
||
log(` files: ${summary.target_count}, resolved: ${summary.resolved}, total attempts: ${summary.total_attempts}`);
|
||
log(` total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||
log("");
|
||
for (const p of summary.per_file) {
|
||
const mark = p.accepted_on ? "✓" : "✗";
|
||
log(` ${mark} ${p.file.padEnd(60)} attempt ${p.accepted_on ?? "—"}/${p.attempts} ${p.model} ${p.ms}ms`);
|
||
}
|
||
log("");
|
||
log(`report: ${OUT_DIR}/scrum_report.md`);
|
||
|
||
process.exit(summary.resolved === summary.target_count ? 0 : 1);
|
||
}
|
||
|
||
// Top-level runner: any unhandled failure is fatal and exits 2, so
// callers can distinguish a crash (2) from unresolved reviews (1).
main().catch(e => {
  console.error("[scrum] fatal:", e);
  process.exit(2);
});
|