// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
//   - signature_count grows sublinearly (same patterns recur, so
//     distinct-signature count stabilizes fast)
//   - verdict settles on a stable value after run 2-3 (first audit
//     establishes baseline, rest repeat)
//   - confidence stays LOW for all signatures (same PR repeatedly)
//   - NO new recurring findings fire because confidence < 0.3 on
//     same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
//   - signature_count grows linearly — each run produces new signatures
//   - verdict oscillates (block → approve → block ...)
//   - confidence inflates — kb_index rating escalates on repeated runs
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts

import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";

const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
const POLL_INTERVAL_MS = 5_000;

// Number of audit iterations; override with LH_AUDIT_RUNS (default 9).
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
// PR number to audit; override with LH_AUDIT_PR (default 8).
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
// Inference is skipped unless LH_AUDITOR_SKIP_INFERENCE is exactly "0".
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
// The lessons file is truncated before the test only when LH_RESET_KB is "1".
const RESET_KB = process.env.LH_RESET_KB === "1";

/** One entry of the "top 3 signatures by count" list in {@link AggState}. */
interface TopSignature {
  sig: string;
  count: number;
  conf: number;
  summary: string;
}

/** Snapshot of the aggregated audit_lessons state at one point in time. */
interface AggState {
  /** Number of distinct signatures. */
  sig_count: number;
  /** Highest count over ALL signatures (0 when there are none). */
  max_count: number;
  /** Highest confidence among signatures with count >= 2 (0 when there are none). */
  max_confidence: number;
  /** Highest count among signatures with count >= 2 (0 when there are none). */
  recurring_max_count: number;
  /** The three highest-count signatures, summaries cut to 80 characters. */
  top3: TopSignature[];
}

/** Flat per-run record written to the JSON report and printed in the final table. */
interface RunRecord {
  run: number;
  sha: string;
  verdict_overall: string;
  findings_total: number;
  findings_block: number;
  findings_warn: number;
  findings_info: number;
  audit_duration_ms: number;
  claims_total: number;
  claims_empirical: number;
  kb_sig_count_after: number;
  kb_max_count_after: number;
  kb_max_confidence_after: number;
  kb_recurring_max_count: number;
}

/**
 * Polls for the verdict file `<prNum>-<first 12 chars of sha>.json` inside
 * VERDICTS_DIR and returns its parsed JSON content.
 *
 * Any read or parse failure is treated as "not written yet" and retried
 * every POLL_INTERVAL_MS until the deadline passes.
 *
 * NOTE(review): nothing in this file calls this helper.
 *
 * @param prNum PR number used in the verdict file name.
 * @param sha Head commit SHA; only the first 12 characters are used.
 * @param deadlineMs Maximum time to keep polling, in milliseconds.
 * @returns The parsed verdict JSON (shape is not validated here).
 * @throws Error when no readable verdict file appears before the deadline.
 */
async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<unknown> {
  const short = sha.slice(0, 12);
  const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    try {
      const raw = await readFile(path, "utf8");
      return JSON.parse(raw);
    } catch {
      /* not yet */
    }
    await Bun.sleep(POLL_INTERVAL_MS);
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}

/** Largest value in `values`, or 0 for an empty list (no argument-list spread). */
function maxOrZero(values: readonly number[]): number {
  return values.length > 0 ? values.reduce((best, v) => Math.max(best, v)) : 0;
}

/**
 * Aggregates AUDIT_LESSONS by signature (scoped per PR number) and condenses
 * the result into an {@link AggState} snapshot.
 *
 * @returns Counts and confidences as described on {@link AggState}.
 */
async function captureAggState(): Promise<AggState> {
  const agg = await aggregate(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  // Array.from yields a fresh array, so the in-place sort does not touch `agg`.
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  const recurring = list.filter((r) => r.count >= 2);
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: maxOrZero(recurring.map((a) => a.confidence)),
    recurring_max_count: maxOrZero(recurring.map((a) => a.count)),
    top3: list.slice(0, 3).map((a) => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}

/** Prints the per-run summary table followed by a blank line. */
function printRunTable(records: readonly RunRecord[]): void {
  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }
  console.log("");
}

/**
 * Prints the compounding-property verdict: signature growth versus the
 * baseline, verdict stability across runs, and whether the final KB state
 * stays under the escalation thresholds.
 *
 * @throws Error when `records` is empty.
 */
function printCompoundingSummary(records: readonly RunRecord[], baseline: AggState): void {
  console.log("═══ COMPOUNDING PROPERTY ═══");
  const last = records[records.length - 1];
  if (last === undefined) {
    throw new Error("no audit runs were recorded");
  }
  const sigDelta = last.kb_sig_count_after - baseline.sig_count;
  const maxConf = last.kb_max_confidence_after;
  const recurringMax = last.kb_recurring_max_count;
  console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
  console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  const verdictSet = new Set(records.map((r) => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  // NOTE(review): the file header talks about confidence < 0.3, this check
  // uses 0.6 — confirm which threshold matches the kb_index rating policy.
  if (maxConf < 0.6 && recurringMax < 5) {
    console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
  }
}

/**
 * Runs the audit RUNS times against TARGET_PR, recording the verdict metrics
 * and the aggregated KB state after every run, then prints the summary and
 * writes a JSON report under tests/real-world/runs/.
 *
 * @throws Error when LH_AUDIT_RUNS or LH_AUDIT_PR is not a positive integer.
 */
async function main(): Promise<void> {
  // Validate before any side effect: with RESET_KB the lessons file is
  // truncated below, which must not happen for a run that cannot proceed.
  if (!Number.isInteger(RUNS) || RUNS < 1) {
    throw new Error(`LH_AUDIT_RUNS must be a positive integer, got '${process.env.LH_AUDIT_RUNS}'`);
  }
  if (!Number.isInteger(TARGET_PR) || TARGET_PR < 1) {
    throw new Error(`LH_AUDIT_PR must be a positive integer, got '${process.env.LH_AUDIT_PR}'`);
  }

  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
  console.log(`[nine] reset_kb: ${RESET_KB}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
  if (RESET_KB) {
    console.log("[nine] clearing audit_lessons.jsonl for clean test...");
    await writeFile(AUDIT_LESSONS, "");
  }
  console.log("");

  // NOTE(review): the snapshot is fetched once and reused for every run;
  // whether auditPr re-fetches the diff itself is not visible from this file.
  const pr = await getPrSnapshot(TARGET_PR);
  console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[nine] files in diff: ${pr.files.length}`);
  console.log("");

  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);
    const verdict = await auditPr(pr, {
      dry_run: true,
      skip_dynamic: true,
      skip_inference: SKIP_INFERENCE,
    });
    console.log(` sha ${verdict.head_sha.slice(0, 12)}`);

    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: verdict.head_sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
      kb_recurring_max_count: after.recurring_max_count,
    };
    records.push(rec);

    console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  printRunTable(records);
  printCompoundingSummary(records, baseline);

  // Base-36 timestamp keeps report file names short and unique per invocation.
  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(` report: ${jsonOut}`);
}

main().catch((e: unknown) => {
  console.error("[nine] fatal:", e);
  process.exit(1);
});