auditor/kb_stats.ts — on-demand observability without Grafana

Reads every KB scratchpad file and prints a dashboard of audit health: verdict distribution, per-PR verdict instability rate, consensus discrepancy counters, KB size + distinct-signature growth, verifier verdict histogram, top recurring entities by cross-PR count. Also supports --json for feeding CI gates or later piping into a static dashboard page. --top N caps the entities section. Current state from running it: 30 audits across 8 PRs, 25% verdict instability rate (all pre-N=3-consensus), 0 discrepancies logged yet (audits before commit A didn't persist them), 84 audit_lessons rows with 28 distinct signatures, 4 audit_facts rows with 20 distinct entities. No cross-PR recurrences yet — but the machinery prints them as soon as audits on other PRs produce overlapping entities. This is the full observability surface for PR #9 — the Grafana alternative I proposed in the counter-plan. Zero infra, 280 LOC, zero maintenance. If someone later wants a real dashboard, `--json` output pipes directly into any visualization layer.
2026-04-22 23:41:50 -05:00 · 2026-04-22 23:41:50 -05:00 · a264bcf3fc
commit a264bcf3fc
parent 181c35b829
1 changed files with 269 additions and 0 deletions
--- a/auditor/kb_stats.ts
+++ b/auditor/kb_stats.ts
@ -0,0 +1,269 @@
+// kb_stats — on-demand dashboard numbers from the KB scratchpad
+// files. Reads data/_auditor/verdicts/*, data/_kb/audit_lessons.jsonl,
+// data/_kb/audit_facts.jsonl, data/_kb/audit_discrepancies.jsonl,
+// data/_kb/scrum_reviews.jsonl and prints:
+//
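+//   - verdict distribution and per-PR verdict instability (PRs whose
+//     audits produced 2+ / 3+ distinct verdicts across their SHAs)
+//   - consensus discrepancy counts and rate (N runs disagreed on a claim)
+//   - KB size: row counts, distinct finding signatures (via the kb_index
+//     aggregator), distinct entities
+//   - fact quality: verifier verdict histogram, facts dropped by the
+//     verifier, extraction success rate
+//   - audit_facts rows per source (presumably the scrum vs inference KB
+//     split)
+//   - top N recurring entities from audit_facts
+//
+// Not printed yet: confidence distribution and fact growth over time.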
+//
+// Run:  bun run auditor/kb_stats.ts
+//       bun run auditor/kb_stats.ts --top 15     # show top 15 entities
+//       bun run auditor/kb_stats.ts --json       # machine-readable
+//
+// This is the "dashboard" without running Grafana. If someone really
+// wants a dashboard, wire this output into a static HTML page + cron.
+
+import { readFile, readdir } from "node:fs/promises";
+import { join } from "node:path";
+import { aggregate } from "./kb_index.ts";
+
+// On-disk locations of the auditor state this script reads. Every path
+// hangs off REPO; the four JSONL scratchpads share one KB directory.
+const REPO = "/home/profit/lakehouse";
+const VERDICTS_DIR = REPO + "/data/_auditor/verdicts";
+const KB_DIR = REPO + "/data/_kb";
+const AUDIT_LESSONS = KB_DIR + "/audit_lessons.jsonl";
+const AUDIT_FACTS = KB_DIR + "/audit_facts.jsonl";
+const AUDIT_DISCREPANCIES = KB_DIR + "/audit_discrepancies.jsonl";
+const SCRUM_REVIEWS = KB_DIR + "/scrum_reviews.jsonl";
+
+// Command-line options understood by kb_stats.
+interface Args {
+  // Maximum number of rows in the "top recurring entities" section.
+  top: number;
+  // Emit the stats as JSON instead of the human-readable dashboard.
+  json: boolean;
+}
+
+// Interpret the value supplied to --top. Returns the value floored to an
+// integer, or undefined when it is missing, blank, not a finite number,
+// or negative — i.e. whenever it cannot serve as a list cap.
+function parseTopValue(raw: string | undefined): number | undefined {
+  if (raw === undefined || raw.trim() === "") return undefined;
+  const n = Number(raw);
+  return Number.isFinite(n) && n >= 0 ? Math.floor(n) : undefined;
+}
+
+// Parse CLI flags from a process.argv-style array. The first two slots
+// (runtime and script path) are skipped.
+//
+//   --top N | --top=N   cap for the entities section (default 10)
+//   --json              machine-readable output
+//
+// An unusable --top value (missing, non-numeric, negative) leaves the
+// default in place and is NOT consumed, so `--top --json` still turns
+// JSON output on instead of swallowing the flag. Unknown arguments are
+// ignored.
+function parseArgs(argv: string[]): Args {
+  const parsed: Args = { top: 10, json: false };
+  for (let i = 2; i < argv.length; i++) {
+    const arg = argv[i] ?? "";
+    if (arg === "--json") {
+      parsed.json = true;
+    } else if (arg === "--top") {
+      const n = parseTopValue(argv[i + 1]);
+      if (n !== undefined) {
+        parsed.top = n;
+        i++; // the value token has been consumed
+      }
+    } else if (arg.startsWith("--top=")) {
+      const n = parseTopValue(arg.slice("--top=".length));
+      if (n !== undefined) parsed.top = n;
+    }
+  }
+  return parsed;
+}
+
+// Best-effort JSONL reader: resolves to the parsed value of every
+// non-empty line of `path`. Lines that fail to parse, and lines whose
+// JSON value is literally null, are dropped. If the file cannot be read
+// at all the result is an empty array — this function never rejects.
+async function readJsonl<T = any>(path: string): Promise<T[]> {
+  let text: string;
+  try {
+    text = await readFile(path, "utf8");
+  } catch {
+    return [];
+  }
+
+  const rows: T[] = [];
+  for (const line of text.split("\n")) {
+    if (line.length === 0) continue;
+    let parsed: T;
+    try {
+      parsed = JSON.parse(line) as T;
+    } catch {
+      continue; // malformed line: skip it, keep the rest
+    }
+    if (parsed !== null) rows.push(parsed);
+  }
+  return rows;
+}
+
+// Load every verdict file in VERDICTS_DIR whose name has the form
+// `<pr>-<hex sha>.json`. PR number and SHA come from the file name; the
+// overall verdict and finding counters come from the JSON body (missing
+// counters default to 0). Files that cannot be read or parsed are
+// skipped, and a missing/unreadable directory yields [].
+//
+// The result is sorted by PR number, then SHA, so per-PR verdict
+// sequences and --json output are stable across runs instead of
+// following directory enumeration order. NOTE: that order is
+// deterministic but not chronological — the file names carry no
+// timestamp.
+async function loadVerdicts(): Promise<Array<{ pr: number; sha: string; overall: string; findings_total: number; findings_block: number; findings_warn: number }>> {
+  let files: string[];
+  try { files = await readdir(VERDICTS_DIR); } catch { return []; }
+
+  const namePattern = /^(\d+)-([0-9a-f]+)\.json$/;
+  const out: Awaited<ReturnType<typeof loadVerdicts>> = [];
+  // Files are read one at a time on purpose: the directory can grow
+  // without bound and a burst of concurrent opens could exhaust file
+  // descriptors, which the catch below would misreport as corruption.
+  for (const f of files) {
+    const m = namePattern.exec(f);
+    if (!m) continue;
+    const [, prText = "", sha = ""] = m;
+    try {
+      const v = JSON.parse(await readFile(join(VERDICTS_DIR, f), "utf8"));
+      out.push({
+        pr: Number(prText),
+        sha,
+        overall: String(v.overall),
+        findings_total: Number(v.metrics?.findings_total ?? 0),
+        findings_block: Number(v.metrics?.findings_block ?? 0),
+        findings_warn: Number(v.metrics?.findings_warn ?? 0),
+      });
+    } catch { /* skip unreadable or corrupt verdict files */ }
+  }
+  return out.sort((a, b) => a.pr - b.pr || (a.sha < b.sha ? -1 : a.sha > b.sha ? 1 : 0));
+}
+
+// Everything kb_stats reports, in the shape emitted by --json.
+interface Stats {
+  // Number of verdict files that were loaded.
+  audit_count: number;
+  // How many audits ended with each `overall` verdict string.
+  verdict_distribution: Record<string, number>;
+  // Same PR with multiple SHAs — if verdicts differ, that's drift across
+  // the PR's commit history. Not a flip-flop in the classical sense,
+  // but worth surfacing (e.g. "PR #8 was block block req req block").
+  // Entries appear in the order the verdicts were loaded.
+  per_pr_verdict_sequences: Record<number, string[]>;
+  // How many distinct verdicts did each PR produce? 1 = stable;
+  // 2+ = some flipping. pr_count is every PR with at least one audit;
+  // pr_with_multiple_verdicts / pr_with_3plus count the PRs whose audits
+  // produced ≥ 2 / ≥ 3 distinct verdicts.
+  verdict_instability: { pr_count: number; pr_with_multiple_verdicts: number; pr_with_3plus: number };
+  // discrepancy_count: rows in audit_discrepancies.jsonl.
+  // tiebreaker_used: rows whose `resolution` starts with "tiebreaker".
+  // unresolved: rows whose `resolution` is exactly "unresolved".
+  consensus: { discrepancy_count: number; tiebreaker_used: number; unresolved: number };
+  kb: {
+    // Row counts of the three JSONL scratchpads.
+    audit_lessons_rows: number;
+    audit_facts_rows: number;
+    scrum_reviews_rows: number;
+    // Size of the kb_index aggregate over audit_lessons, keyed by each
+    // row's `signature`.
+    distinct_finding_signatures: number;
+    // Distinct entity names in audit_facts, compared case-insensitively;
+    // names shorter than 3 characters are ignored.
+    distinct_entities_across_prs: number;
+    // Entities observed in at least 2 / at least 5 different PRs.
+    entities_in_2plus_prs: number;
+    entities_in_5plus_prs: number;
+  };
+  fact_quality: {
+    // Tally of every label found in the rows' `verifier_verdicts` arrays.
+    verifier_verdict_distribution: Record<string, number>;
+    // Sum of `facts_dropped_by_verifier` over all audit_facts rows.
+    facts_dropped_by_verifier_total: number;
+    // Fraction (0–1) of audit_facts rows with a non-empty `facts` or
+    // `entities` array; 0 when there are no rows.
+    extraction_success_rate: number;
+  };
+  // Entities ordered by distinct PR count, then observation count, both
+  // descending, capped by --top.
+  top_entities: Array<{ name: string; distinct_prs: number; count: number; types: string[] }>;
+  // audit_facts rows per `source` value ("unknown" when the field is absent).
+  kb_by_source: Record<string, number>;
+}
+
+// Gather every dashboard number from the verdict files and the KB
+// scratchpads and return them as one Stats object.
+//
+// args.top caps the top_entities list. It is clamped at zero so a
+// negative value yields an empty list rather than making slice() trim
+// entries from the end; NaN also yields an empty list.
+async function collect(args: Args): Promise<Stats> {
+  // These five reads are independent of each other, so run them
+  // concurrently.
+  const [verdicts, lessons, facts, disc, reviews] = await Promise.all([
+    loadVerdicts(),
+    readJsonl<any>(AUDIT_LESSONS),
+    readJsonl<any>(AUDIT_FACTS),
+    readJsonl<any>(AUDIT_DISCREPANCIES),
+    readJsonl<any>(SCRUM_REVIEWS),
+  ]);
+
+  // Verdict stability: group verdicts per PR and count how many PRs
+  // produced 2+ / 3+ distinct verdicts.
+  const byPr: Record<number, string[]> = {};
+  const verdictDist: Record<string, number> = {};
+  for (const v of verdicts) {
+    (byPr[v.pr] ??= []).push(v.overall);
+    verdictDist[v.overall] = (verdictDist[v.overall] ?? 0) + 1;
+  }
+  let multi = 0, tri = 0;
+  for (const seq of Object.values(byPr)) {
+    const distinct = new Set(seq).size;
+    if (distinct >= 2) multi++;
+    if (distinct >= 3) tri++;
+  }
+
+  // Consensus drift
+  const consensus = {
+    discrepancy_count: disc.length,
+    tiebreaker_used: disc.filter(d => String(d.resolution).startsWith("tiebreaker")).length,
+    unresolved: disc.filter(d => d.resolution === "unresolved").length,
+  };
+
+  // Lesson signatures, scoped per PR when the row names one.
+  const lessonAgg = await aggregate<any>(AUDIT_LESSONS, {
+    keyFn: r => r?.signature,
+    scopeFn: r => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
+  });
+
+  // Entity aggregation across audit_facts rows. Entities are keyed by
+  // lower-cased name; the first spelling seen is kept for display.
+  interface EntAgg { distinct_prs: Set<number>; count: number; types: Set<string>; name: string }
+  const entAgg = new Map<string, EntAgg>();
+  const sourceCount: Record<string, number> = {};
+  // Pre-seed the four expected labels so they are always present in the
+  // output, even with a count of zero.
+  const totalVerdictDist: Record<string, number> = { CORRECT: 0, INCORRECT: 0, UNVERIFIABLE: 0, UNCHECKED: 0 };
+  let factsDroppedTotal = 0;
+  let extractionsWithFacts = 0;
+
+  for (const row of facts) {
+    const src = String(row.source ?? "unknown");
+    sourceCount[src] = (sourceCount[src] ?? 0) + 1;
+    const pr = Number(row.pr_number);
+    if (Array.isArray(row.verifier_verdicts)) {
+      for (const v of row.verifier_verdicts) {
+        totalVerdictDist[v] = (totalVerdictDist[v] ?? 0) + 1;
+      }
+    }
+    factsDroppedTotal += Number(row.facts_dropped_by_verifier ?? 0);
+    const entities: any[] = Array.isArray(row.entities) ? row.entities : [];
+    if ((Array.isArray(row.facts) && row.facts.length > 0) || entities.length > 0) {
+      extractionsWithFacts++;
+    }
+    for (const e of entities) {
+      const name = String(e?.name ?? "").trim();
+      if (name.length < 3) continue; // too short to be a meaningful entity name
+      const key = name.toLowerCase();
+      const agg = entAgg.get(key) ?? { distinct_prs: new Set<number>(), count: 0, types: new Set<string>(), name };
+      agg.count++;
+      // Rows without a usable PR number still count as observations but
+      // do not contribute to the cross-PR tally.
+      if (Number.isFinite(pr) && pr > 0) agg.distinct_prs.add(pr);
+      if (e?.type) agg.types.add(String(e.type));
+      entAgg.set(key, agg);
+    }
+  }
+
+  const allEntities = Array.from(entAgg.values());
+  const seenInAtLeast = (n: number): number =>
+    allEntities.filter(a => a.distinct_prs.size >= n).length;
+  const topEntities = [...allEntities]
+    .sort((a, b) => b.distinct_prs.size - a.distinct_prs.size || b.count - a.count)
+    .slice(0, Math.max(0, args.top))
+    .map(a => ({
+      name: a.name,
+      distinct_prs: a.distinct_prs.size,
+      count: a.count,
+      types: Array.from(a.types),
+    }));
+
+  return {
+    audit_count: verdicts.length,
+    verdict_distribution: verdictDist,
+    per_pr_verdict_sequences: byPr,
+    verdict_instability: {
+      pr_count: Object.keys(byPr).length,
+      pr_with_multiple_verdicts: multi,
+      pr_with_3plus: tri,
+    },
+    consensus,
+    kb: {
+      audit_lessons_rows: lessons.length,
+      audit_facts_rows: facts.length,
+      scrum_reviews_rows: reviews.length,
+      distinct_finding_signatures: lessonAgg.size,
+      distinct_entities_across_prs: entAgg.size,
+      entities_in_2plus_prs: seenInAtLeast(2),
+      entities_in_5plus_prs: seenInAtLeast(5),
+    },
+    fact_quality: {
+      verifier_verdict_distribution: totalVerdictDist,
+      facts_dropped_by_verifier_total: factsDroppedTotal,
+      extraction_success_rate: facts.length > 0 ? extractionsWithFacts / facts.length : 0,
+    },
+    top_entities: topEntities,
+    kb_by_source: sourceCount,
+  };
+}
+
+// Render the stats as the plain-text dashboard printed when --json is
+// not given. Returns the full text with lines joined by "\n" and no
+// trailing newline.
+//
+// Percentages guard against division by zero and fall back to 0. The
+// discrepancy rate always carries one decimal, including the no-audits
+// case. Verifier verdict labels beyond the four standard ones are
+// appended (alphabetically) so that no counted verdict is hidden.
+function renderHuman(s: Stats): string {
+  const inst = s.verdict_instability;
+  const lines: string[] = [];
+  lines.push("═══ KB STATS ═══");
+  lines.push("");
+  lines.push(`Audits: ${s.audit_count} total across ${inst.pr_count} distinct PRs`);
+  lines.push(`Verdicts: ${Object.entries(s.verdict_distribution).map(([k, v]) => `${k}=${v}`).join("  ")}`);
+  const multiplePct = inst.pr_count > 0
+    ? Math.round(100 * inst.pr_with_multiple_verdicts / inst.pr_count)
+    : 0;
+  lines.push(`Verdict instability: ${inst.pr_with_multiple_verdicts}/${inst.pr_count} PRs had 2+ distinct verdicts (${multiplePct}%) — 3+ distinct: ${inst.pr_with_3plus}`);
+  lines.push("");
+  lines.push("─── Consensus ───");
+  lines.push(`  discrepancies logged: ${s.consensus.discrepancy_count}`);
+  lines.push(`  tiebreaker used: ${s.consensus.tiebreaker_used}`);
+  lines.push(`  unresolved: ${s.consensus.unresolved}`);
+  const dRate = (s.audit_count > 0 ? 100 * s.consensus.discrepancy_count / s.audit_count : 0).toFixed(1);
+  lines.push(`  discrepancy rate: ${dRate}% of audits`);
+  lines.push("");
+  lines.push("─── KB size ───");
+  lines.push(`  audit_lessons.jsonl:     ${s.kb.audit_lessons_rows} rows, ${s.kb.distinct_finding_signatures} distinct signatures`);
+  lines.push(`  audit_facts.jsonl:       ${s.kb.audit_facts_rows} rows, ${s.kb.distinct_entities_across_prs} distinct entities`);
+  lines.push(`  scrum_reviews.jsonl:     ${s.kb.scrum_reviews_rows} rows`);
+  lines.push(`  entities in 2+ PRs:      ${s.kb.entities_in_2plus_prs}`);
+  lines.push(`  entities in 5+ PRs:      ${s.kb.entities_in_5plus_prs} ← strong cross-cutting signal`);
+  lines.push("");
+  lines.push("─── Fact quality ───");
+  const dist = s.fact_quality.verifier_verdict_distribution;
+  // The four standard labels keep their fixed order; anything else the
+  // verifier emitted follows, sorted for stable output.
+  const standardLabels = ["CORRECT", "UNVERIFIABLE", "UNCHECKED", "INCORRECT"];
+  const extraLabels = Object.keys(dist).filter(k => !standardLabels.includes(k)).sort();
+  const verdictCells = [...standardLabels, ...extraLabels].map(k => `${k}=${dist[k] ?? 0}`);
+  lines.push(`  verifier verdicts:  ${verdictCells.join("  ")}`);
+  lines.push(`  facts dropped by verifier: ${s.fact_quality.facts_dropped_by_verifier_total}`);
+  lines.push(`  extraction success rate: ${(s.fact_quality.extraction_success_rate * 100).toFixed(1)}%`);
+  lines.push("");
+  lines.push("─── KB sources ───");
+  for (const [src, n] of Object.entries(s.kb_by_source)) {
+    lines.push(`  ${src}: ${n}`);
+  }
+  lines.push("");
+  lines.push(`─── Top ${s.top_entities.length} recurring entities ───`);
+  for (const e of s.top_entities) {
+    lines.push(`  [${e.distinct_prs} PRs × ${e.count} obs]  ${e.name}  (${e.types.join(",")})`);
+  }
+  return lines.join("\n");
+}
+
+// Entry point: parse the CLI flags, collect the stats, then print them
+// either as pretty-printed JSON (--json) or as the text dashboard.
+async function main() {
+  const args = parseArgs(process.argv);
+  const stats = await collect(args);
+  // JSON has no Set type; should one appear in the stats, emit it as an array.
+  const setsAsArrays = (_key: string, value: unknown) =>
+    value instanceof Set ? Array.from(value) : value;
+  const output = args.json
+    ? JSON.stringify(stats, setsAsArrays, 2)
+    : renderHuman(stats);
+  console.log(output);
+}
+
+// Any failure is reported on stderr and turned into a non-zero exit code.
+main().catch(e => { console.error("[kb_stats] fatal:", e); process.exit(1); });