matrix-agent-validated/tests/real-world/nine_consecutive_audits.ts
profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across multi-corpora matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00

186 lines
8.2 KiB
TypeScript

// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
// - signature_count grows sublinearly (same patterns recur, so
// distinct-signature count stabilizes fast)
// - verdict settles on a stable value after run 2-3 (first audit
// establishes baseline, rest repeat)
// - confidence stays LOW for all signatures (same PR repeatedly)
// - NO new recurring findings fire because confidence < 0.3 on
// same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
// - signature_count grows linearly — each run produces new signatures
// - verdict oscillates (block → approve → block ...)
// - confidence inflates — kb_index rating escalates on repeated runs
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts
import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";
// Absolute path of the lakehouse working tree all other paths hang off.
const REPO = "/home/profit/lakehouse";
// JSONL knowledge base the audits append lessons to (and that we aggregate).
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
// Directory where the auditor drops per-(PR, sha) verdict JSON files.
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
// How often waitForVerdict re-checks for the verdict file to appear.
const POLL_INTERVAL_MS = 5_000;
// Number of consecutive audit runs; default 9 (hence the file name).
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
// PR number to audit repeatedly; default PR #8.
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
// Inference is SKIPPED by default; set LH_AUDITOR_SKIP_INFERENCE=0 to enable it.
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
// LH_RESET_KB=1 truncates audit_lessons.jsonl before the first run.
const RESET_KB = process.env.LH_RESET_KB === "1";
/**
 * Poll the verdicts directory until the verdict file for (prNum, sha)
 * appears, then parse and return it.
 *
 * Verdict files are named `<pr>-<sha12>.json`. The file is re-read every
 * POLL_INTERVAL_MS; a read/parse failure is treated as "not written yet".
 *
 * @param prNum      PR number the verdict belongs to.
 * @param sha        Full head commit SHA (only the first 12 chars are used).
 * @param deadlineMs Give up after this many milliseconds.
 * @throws Error if no verdict file shows up before the deadline.
 */
async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
  const verdictPath = join(VERDICTS_DIR, `${prNum}-${sha.slice(0, 12)}.json`);
  const deadline = Date.now() + deadlineMs;
  while (Date.now() < deadline) {
    try {
      const contents = await readFile(verdictPath, "utf8");
      return JSON.parse(contents);
    } catch {
      // Verdict not written yet (or mid-write) — keep polling.
    }
    await Bun.sleep(POLL_INTERVAL_MS);
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${verdictPath}`);
}
/**
 * Snapshot the aggregate state of the audit_lessons knowledge base.
 *
 * Aggregates lesson records by `signature` (scoped per PR), then reports:
 * - sig_count:            number of distinct signatures
 * - max_count:            highest per-signature occurrence count
 * - max_confidence:       highest confidence among RECURRING signatures
 *                         only (count >= 2) — one-offs are excluded by design
 * - recurring_max_count:  highest count among recurring signatures
 * - top3:                 the three most frequent signatures (truncated summary)
 *
 * Fix: the previous return-type annotation omitted `recurring_max_count`,
 * so the returned object literal failed excess-property checking and
 * main()'s read of `after.recurring_max_count` did not type-check.
 */
async function captureAggState(): Promise<{
  sig_count: number;
  max_count: number;
  max_confidence: number;
  recurring_max_count: number;
  top3: Array<{ sig: string; count: number; conf: number; summary: string }>;
}> {
  const agg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  // Highest-count signatures first.
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  // "Recurring" = the same signature observed at least twice.
  const recurring = list.filter(r => r.count >= 2);
  const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
  const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: recurringMaxConf,
    recurring_max_count: recurringMaxCount,
    top3: list.slice(0, 3).map(a => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}
// One row of the final report: verdict metrics for a single audit run plus
// the audit_lessons KB aggregate state captured immediately after that run.
interface RunRecord {
// 1-based index of the audit run (1..RUNS).
run: number;
// First 12 chars of the head commit SHA the verdict was produced for.
sha: string;
// Overall verdict string returned by auditPr (e.g. block/approve).
verdict_overall: string;
// Finding counts broken down by severity, from verdict.metrics.
findings_total: number;
findings_block: number;
findings_warn: number;
findings_info: number;
// Wall-clock duration of the audit itself, as reported by the auditor.
audit_duration_ms: number;
// Claim counts from verdict.metrics (empirical = backed by evidence).
claims_total: number;
claims_empirical: number;
// KB aggregate snapshot AFTER this run (see captureAggState).
kb_sig_count_after: number;
kb_max_count_after: number;
kb_max_confidence_after: number;
kb_recurring_max_count: number;
}
/**
 * Entry point: audit the same PR snapshot RUNS times in a row, capture
 * verdict metrics + audit_lessons KB aggregate state after each run, print
 * a per-run table, and evaluate the compounding property (signature growth
 * stabilizes, verdict stays stable, confidence stays low on same-PR noise).
 * Writes a JSON report under tests/real-world/runs/.
 *
 * Fixes vs. previous version:
 * - guards against an empty `records` array (LH_AUDIT_RUNS <= 0 previously
 *   crashed with a TypeError reading properties of undefined)
 * - run column padded to width 2 so rows align under the 3-char "run" header
 *   (padStart(1) was a no-op)
 */
async function main(): Promise<void> {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
  console.log(`[nine] reset_kb: ${RESET_KB}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
  if (RESET_KB) {
    console.log("[nine] clearing audit_lessons.jsonl for clean test...");
    await writeFile(AUDIT_LESSONS, "");
  }
  console.log("");

  // One snapshot, reused for every run: each audit sees the identical diff.
  const pr = await getPrSnapshot(TARGET_PR);
  console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[nine] files in diff: ${pr.files.length}`);
  console.log("");

  // KB state before any run, so signature growth can be measured as a delta.
  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);
    const verdict = await auditPr(pr, {
      dry_run: true, // do not post the verdict back to the PR
      skip_dynamic: true,
      skip_inference: SKIP_INFERENCE,
    });
    console.log(` sha ${verdict.head_sha.slice(0, 12)}`);
    // Snapshot the KB immediately after the run so each record reflects
    // exactly what this run added.
    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: verdict.head_sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
      kb_recurring_max_count: after.recurring_max_count,
    };
    records.push(rec);
    console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(2)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }
  console.log("");

  // Guard: RUNS comes from the environment; with zero runs there is nothing
  // to analyze (previously this crashed on records[records.length - 1]).
  const last = records[records.length - 1];
  if (last === undefined) {
    console.log(`[nine] no runs executed (RUNS=${RUNS}) — nothing to analyze`);
    return;
  }

  console.log("═══ COMPOUNDING PROPERTY ═══");
  const sigDelta = last.kb_sig_count_after - baseline.sig_count;
  const maxConf = last.kb_max_confidence_after;
  const recurringMax = last.kb_recurring_max_count;
  console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
  console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  // Verdict stability: a single distinct verdict across all runs is the
  // favorable outcome; more than one means the verdict oscillated.
  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")}`);
  }

  // Confidence policy: same-PR repetition must stay below the escalation
  // thresholds (conf < 0.6, recurring < 5) or the kb_index policy escalated.
  if (maxConf < 0.6 && recurringMax < 5) {
    console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
  }

  // Persist the full report; base-36 timestamp keeps filenames short + unique.
  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(` report: ${jsonOut}`);
}
// Top-level entry: any unhandled failure is fatal and exits non-zero.
main().catch((err: unknown) => {
  console.error("[nine] fatal:", err);
  process.exit(1);
});