9-run empirical test showed 20 of 27 audit_lessons signatures were singletons (count=1) — the cloud producing slightly-different summary phrasings for the SAME underlying claim on each audit, each hashing to a fresh signature. That's the creep J flagged — not explosive, but steady ~2 new sigs per run, unbounded over hundreds of runs. Root cause: temperature=0.2 + think=true was letting variable prose leak into the classification output. Fix: temp=0 (greedy sample → identical input yields identical output on same model version), think=false (no reasoning trace variance), max_tokens 3000→1500 (tighter bound prevents tail wander). The compounding policy itself was validated by the 9 runs: - 7 recurring claims (the legitimate signals) all at conf 0.08-0.20 - ratingSeverity() correctly held them at info (below 0.3 threshold) - cross-PR signal test separately confirmed conf=1.00 → sev=block Also: LH_AUDIT_RUNS env so the test can validate with smaller N.
182 lines
8.1 KiB
TypeScript
182 lines
8.1 KiB
TypeScript
// Nine-consecutive audit runner — empirical test of the predictive-
|
|
// compounding property. Pushes 9 empty commits to the current branch,
|
|
// waits for each audit to complete on the new SHA, captures the
|
|
// verdict + audit_lessons state after each run, and reports whether
|
|
// the KB stabilizes or drifts.
|
|
//
|
|
// What we expect (favorable compounding):
|
|
// - signature_count grows sublinearly (same patterns recur, so
|
|
// distinct-signature count stabilizes fast)
|
|
// - verdict settles on a stable value after run 2-3 (first audit
|
|
// establishes baseline, rest repeat)
|
|
// - confidence stays LOW for all signatures (same PR repeatedly)
|
|
// - NO new recurring findings fire because confidence < 0.3 on
|
|
// same-PR noise (kb_index rating policy)
|
|
//
|
|
// What would indicate drift (the thing we want to prove DOESN'T happen):
|
|
// - signature_count grows linearly — each run produces new signatures
|
|
// - verdict oscillates (block → approve → block ...)
|
|
// - confidence inflates — kb_index rating escalates on repeated runs
|
|
//
|
|
// Run: bun run tests/real-world/nine_consecutive_audits.ts
|
|
|
|
import { readFile } from "node:fs/promises";
|
|
import { aggregate } from "../../auditor/kb_index.ts";
|
|
|
|
// Absolute path of the repo under test; every shell command runs from here.
const REPO = "/home/profit/lakehouse";
// Append-only knowledge-base file of audit lessons (one JSON record per line).
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
// Directory where the auditor writes per-SHA verdict JSON files.
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
// How often to re-check for a verdict file while waiting for an audit.
const POLL_INTERVAL_MS = 5_000;
// Give up waiting on a single audit after this many milliseconds.
const AUDIT_TIMEOUT_MS = 180_000;
// Number of consecutive audit runs; LH_AUDIT_RUNS lets the test use smaller N.
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
// PR number whose verdict files we poll; override with LH_AUDIT_PR.
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
|
|
|
|
async function sh(cmd: string): Promise<{ stdout: string; stderr: string; code: number }> {
|
|
const p = Bun.spawn(["bash", "-lc", cmd], { cwd: REPO, stdout: "pipe", stderr: "pipe" });
|
|
const [stdout, stderr] = await Promise.all([new Response(p.stdout).text(), new Response(p.stderr).text()]);
|
|
const code = await p.exited;
|
|
return { stdout, stderr, code };
|
|
}
|
|
|
|
async function getHeadSha(): Promise<string> {
|
|
const r = await sh("git rev-parse HEAD");
|
|
return r.stdout.trim();
|
|
}
|
|
|
|
async function pushEmptyCommit(n: number): Promise<string> {
|
|
const msg = `test: nine-consecutive audit run ${n}/${RUNS} (compounding probe)`;
|
|
await sh(`GIT_AUTHOR_NAME=profit GIT_AUTHOR_EMAIL=profit@lakehouse GIT_COMMITTER_NAME=profit GIT_COMMITTER_EMAIL=profit@lakehouse git commit --allow-empty -m "${msg}"`);
|
|
const sha = await getHeadSha();
|
|
const pushCmd = `PAT="dead60d1160a02f81d241197d5d18f4608794fb2"; git -c credential.helper='!f() { echo "username=profit"; echo "password='$PAT'"; }; f' push origin HEAD 2>&1`;
|
|
const pr = await sh(pushCmd);
|
|
if (pr.code !== 0) throw new Error(`push failed: ${pr.stderr || pr.stdout}`);
|
|
return sha;
|
|
}
|
|
|
|
async function waitForVerdict(sha: string, deadlineMs: number): Promise<any> {
|
|
const short = sha.slice(0, 12);
|
|
const path = `${VERDICTS_DIR}/${TARGET_PR}-${short}.json`;
|
|
const start = Date.now();
|
|
while (Date.now() - start < deadlineMs) {
|
|
try {
|
|
const raw = await readFile(path, "utf8");
|
|
return JSON.parse(raw);
|
|
} catch { /* not yet */ }
|
|
await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
|
|
}
|
|
throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
|
|
}
|
|
|
|
async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
|
|
const agg = await aggregate<any>(AUDIT_LESSONS, {
|
|
keyFn: (r) => r?.signature,
|
|
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
|
|
});
|
|
const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
|
|
return {
|
|
sig_count: list.length,
|
|
max_count: list[0]?.count ?? 0,
|
|
max_confidence: list.reduce((m, a) => Math.max(m, a.confidence), 0),
|
|
top3: list.slice(0, 3).map(a => ({
|
|
sig: a.signature,
|
|
count: a.count,
|
|
conf: a.confidence,
|
|
summary: a.representative_summary.slice(0, 80),
|
|
})),
|
|
};
|
|
}
|
|
|
|
// One row of the per-run report: the audit outcome for a single probe
// commit plus the KB aggregate state captured immediately after that
// audit completed.
interface RunRecord {
  run: number;                     // 1-based run index
  sha: string;                     // short (12-char) SHA of the audited commit
  verdict_overall: string;         // overall verdict from the verdict file
  findings_total: number;          // total findings reported by the audit
  findings_block: number;          // findings at 'block' severity
  findings_warn: number;           // findings at 'warn' severity
  findings_info: number;           // findings at 'info' severity
  audit_duration_ms: number;       // wall time reported in verdict metrics
  claims_total: number;            // total claims evaluated
  claims_empirical: number;        // claims backed empirically
  kb_sig_count_after: number;      // distinct signatures in KB after this run
  kb_max_count_after: number;      // highest per-signature count after this run
  kb_max_confidence_after: number; // highest per-signature confidence after this run
}
|
|
|
|
// Drive RUNS consecutive audits: push an empty commit, wait for its
// verdict, snapshot the KB, then report whether the compounding property
// held — stable verdict, sublinear signature growth, and same-PR
// confidence staying below the 0.3 escalation threshold.
async function main() {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
  console.log("");

  // KB state before any probe commits; deltas below are measured against this.
  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  // Runs are strictly sequential: each audit must finish (and write its
  // lessons) before the next probe commit is pushed.
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);
    const sha = await pushEmptyCommit(n);
    console.log(`  pushed ${sha.slice(0, 12)}`);
    const verdict = await waitForVerdict(sha, AUDIT_TIMEOUT_MS);
    // Snapshot AFTER the verdict lands so kb_* fields include this run's writes.
    const after = await captureAggState();
    // Number(... ?? 0) defends against older verdict files missing metrics.
    const rec: RunRecord = {
      run: n,
      sha: sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
    };
    records.push(rec);
    console.log(`  verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(`  kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(`  elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  // Fixed-width summary table, one row per run.
  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }

  console.log("");
  console.log("═══ COMPOUNDING PROPERTY ═══");
  // NOTE(review): assumes RUNS >= 1 — with LH_AUDIT_RUNS=0 `records` is
  // empty and these indexed reads would throw; confirm callers never do that.
  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
  const maxCount = records[records.length - 1].kb_max_count_after;
  const maxConf = records[records.length - 1].kb_max_confidence_after;
  console.log(`  signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(`  max count after run ${RUNS}: ${maxCount} (same-PR recurrences per signature)`);
  console.log(`  max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  // Verdict stability: every run should yield the same overall verdict.
  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(`  verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(`  verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  // Confidence policy: same-PR noise must stay under the 0.3 escalation bar.
  if (maxConf < 0.3) {
    console.log(`  confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(`  ⚠ confidence escalated above 0.3 on same-PR noise — kb_index policy needs tightening`);
  }

  // Persist the full report; base36 timestamp keeps filenames short and unique.
  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(`  report: ${jsonOut}`);
}
|
|
|
|
main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });
|