// check_evidence_health.ts — high-level audit of the materialized
// EvidenceRecord substrate. Answers two questions Phase 3 needs:
//
//   1. PROVENANCE ROUND-TRIP — sample N output rows, look up the
//      source row at the recorded (source_file, line_offset),
//      recompute canonicalSha256, confirm it matches provenance.sig_hash.
//      Hard pass/fail. If even one row fails, provenance is theater.
//
//   2. SCORE-READINESS COVERAGE — for each source, what fraction of
//      materialized rows carry the signals the Success Scorer will
//      need: model_role, success_markers, failure_markers,
//      observer_verdict, latency_ms, retrieved_context, text. Tells
//      Phase 3 which sources to read from for each gate.
//
// Output: markdown report to stdout + data/_kb/evidence_health.md.
//
// Run: bun run scripts/distillation/check_evidence_health.ts

import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs";
import { resolve } from "node:path";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";

// Base directory that every data/ path in this script is resolved against.
// Override with the LH_DISTILL_ROOT environment variable.
const ROOT = process.env.LH_DISTILL_ROOT ?? "/home/profit/lakehouse";

// Upper bound on the number of rows sampled for the provenance round-trip.
const SAMPLE_FOR_PROVENANCE = 30;

// Per-source tally: `total` rows seen, plus how many of them carry each signal.
interface CoverageBucket {
  source: string;
  total: number;
  with_model_role: number;
  with_model_name: number;
  with_success_markers: number;
  with_failure_markers: number;
  with_observer_verdict: number;
  with_latency_ms: number;
  with_retrieved_context: number;
  with_text: number;
  scoreable: number; // has at least ONE signal the scorer can use
}

// Outcome of the provenance round-trip: pass/fail counts plus one entry per
// failed row saying where it came from and why it failed.
interface ProvenanceCheck {
  passed: number;
  failed: number;
  failures: Array<{ output_path: string; line: number; reason: string }>;
}

// Collects every `*.jsonl` file found exactly three directory levels below
// `evidence_root` (the loops below walk them as yyyy/mm/dd).
//
// Returns absolute paths in a stable order: directories AND file names are
// sorted, because readdirSync gives no ordering guarantee and downstream
// sampling is only repeatable if this list is. Non-directory entries at the
// upper levels are skipped, as are `.jsonl`-named entries that are not regular
// files. Returns [] when `evidence_root` does not exist.
function listEvidenceFiles(evidence_root: string): string[] {
  const out: string[] = [];
  if (!existsSync(evidence_root)) return out;

  // Sorted absolute paths of the immediate sub-directories of `dir`.
  const subdirs = (dir: string): string[] =>
    readdirSync(dir)
      .sort()
      .map((name) => resolve(dir, name))
      .filter((path) => statSync(path).isDirectory());

  for (const ydir of subdirs(evidence_root)) {
    for (const mdir of subdirs(ydir)) {
      for (const ddir of subdirs(mdir)) {
        for (const f of readdirSync(ddir).sort()) {
          if (!f.endsWith(".jsonl")) continue;
          const filePath = resolve(ddir, f);
          // A directory that merely ends in ".jsonl" cannot be read as a file.
          if (statSync(filePath).isFile()) out.push(filePath);
        }
      }
    }
  }
  return out;
}

// Has at least one deterministic signal the Phase 3 scorer can act on.
// Order is generous: any of these counts as "scoreable", because the
// scorer combines multiple signals.
function isScoreable(row: any): boolean {
  // A parsed line that is not a JSON object (e.g. a literal `null`) carries no
  // signals; guard here so the property reads below cannot throw.
  if (row === null || typeof row !== "object") return false;
  if (Array.isArray(row.success_markers) && row.success_markers.length > 0) return true;
  if (Array.isArray(row.failure_markers) && row.failure_markers.length > 0) return true;
  if (typeof row.observer_verdict === "string") return true;
  if (row.validation_results && Object.keys(row.validation_results).length > 0) return true;
  if (Array.isArray(row.observer_notes) && row.observer_notes.length > 0) return true;
  return false;
}

// Fresh, all-zero coverage tally for one source label.
function bucketStart(source: string): CoverageBucket {
  return {
    source,
    total: 0,
    with_model_role: 0,
    with_model_name: 0,
    with_success_markers: 0,
    with_failure_markers: 0,
    with_observer_verdict: 0,
    with_latency_ms: 0,
    with_retrieved_context: 0,
    with_text: 0,
    scoreable: 0,
  };
}

// Formats n/total as a whole-number percentage; "—" when there is nothing to
// divide by.
function pct(n: number, total: number): string {
  if (total === 0) return "—";
  return Math.round(100 * n / total) + "%";
}

// One materialized evidence row plus where it was read from.
// `line` is the 0-based physical line index inside `output_path`.
interface OutputRow {
  output_path: string;
  line: number;
  row: any;
}

// Reads every evidence file, tallies per-source signal coverage and returns
// all parsed rows. Buckets are keyed by file basename without ".jsonl", so
// same-named files in different day directories share one bucket.
// Throws (with file and line context) on a line that is not valid JSON.
function scanEvidence(evidenceFiles: string[]): { buckets: Map<string, CoverageBucket>; rows: OutputRow[] } {
  const buckets = new Map<string, CoverageBucket>();
  const rows: OutputRow[] = [];

  for (const evPath of evidenceFiles) {
    const sourceLabel = evPath.split("/").pop()!.replace(/\.jsonl$/, "");
    const b = buckets.get(sourceLabel) ?? bucketStart(sourceLabel);
    const lines = readFileSync(evPath, "utf8").split("\n");

    for (const [lineIndex, rawLine] of lines.entries()) {
      // Skip blank lines in place (rather than filtering them out first) so
      // the recorded line number stays the real position in the file.
      if (rawLine.trim() === "") continue;

      let row: any;
      try {
        row = JSON.parse(rawLine);
      } catch (e) {
        throw new Error(`${evPath} line ${lineIndex}: evidence row is not valid JSON (${(e as Error).message})`);
      }

      // Non-object rows (e.g. `null`) are counted but carry no signals.
      const fields: any = row !== null && typeof row === "object" ? row : {};

      b.total++;
      if (fields.model_role) b.with_model_role++;
      if (fields.model_name) b.with_model_name++;
      if (Array.isArray(fields.success_markers) && fields.success_markers.length > 0) b.with_success_markers++;
      if (Array.isArray(fields.failure_markers) && fields.failure_markers.length > 0) b.with_failure_markers++;
      if (typeof fields.observer_verdict === "string") b.with_observer_verdict++;
      if (typeof fields.latency_ms === "number") b.with_latency_ms++;
      if (fields.retrieved_context && Object.keys(fields.retrieved_context).length > 0) b.with_retrieved_context++;
      if (typeof fields.text === "string" && fields.text.length > 0) b.with_text++;
      if (isScoreable(row)) b.scoreable++;

      rows.push({ output_path: evPath, line: lineIndex, row });
    }
    buckets.set(sourceLabel, b);
  }
  return { buckets, rows };
}

// Picks up to `maxSize` items at a fixed stride (indices 0, s, 2s, …) so the
// sample is repeatable for a given input order and is spread across sources
// instead of clustering at the start. Returns [] for empty input.
function strideSample<T>(items: readonly T[], maxSize: number): T[] {
  const size = Math.min(maxSize, items.length);
  if (size <= 0) return [];
  // floor(len / size) guarantees that `size` stride steps fit inside the array.
  const stride = Math.max(1, Math.floor(items.length / size));
  return items.filter((_, i) => i % stride === 0).slice(0, size);
}

// For each sampled row: locate the source line named by its provenance,
// recompute canonicalSha256 over the parsed source row and compare it with
// provenance.sig_hash. Every problem is recorded as a failure, never thrown.
async function verifyProvenance(sample: readonly OutputRow[]): Promise<ProvenanceCheck> {
  const result: ProvenanceCheck = { passed: 0, failed: 0, failures: [] };
  // Cache source-file lines so we don't re-read big files repeatedly.
  const sourceCache = new Map<string, string[]>();

  const fail = (entry: OutputRow, reason: string): void => {
    result.failed++;
    result.failures.push({ output_path: entry.output_path, line: entry.line, reason });
  };

  for (const entry of sample) {
    // `?.` because a row may be a literal `null`.
    const prov = entry.row?.provenance;
    if (!prov || !prov.source_file || prov.line_offset == null || !prov.sig_hash) {
      fail(entry, "missing provenance fields");
      continue;
    }

    const sourceAbs = resolve(ROOT, prov.source_file);
    let sourceLines = sourceCache.get(sourceAbs);
    if (sourceLines === undefined) {
      if (!existsSync(sourceAbs)) {
        fail(entry, `source missing: ${prov.source_file}`);
        continue;
      }
      sourceLines = readFileSync(sourceAbs, "utf8").split("\n");
      sourceCache.set(sourceAbs, sourceLines);
    }

    if (prov.line_offset >= sourceLines.length) {
      fail(entry, `line_offset ${prov.line_offset} past EOF (source has ${sourceLines.length} lines)`);
      continue;
    }

    let sourceRow: any;
    try {
      sourceRow = JSON.parse(sourceLines[prov.line_offset]);
    } catch (e) {
      fail(entry, `source line not JSON: ${(e as Error).message.slice(0, 60)}`);
      continue;
    }

    const recomputed = await canonicalSha256(sourceRow);
    if (recomputed !== prov.sig_hash) {
      // String(...) so a non-string sig_hash is reported instead of crashing on .slice.
      fail(entry, `sig_hash mismatch: prov=${String(prov.sig_hash).slice(0, 16)}… recomputed=${recomputed.slice(0, 16)}…`);
      continue;
    }
    result.passed++;
  }
  return result;
}

// Builds the markdown report: provenance results, per-source coverage table,
// aggregate line and the Phase 3 readiness verdict.
function renderReport(input: {
  evidenceFileCount: number;
  totalRows: number;
  sampleSize: number;
  provCheck: ProvenanceCheck;
  buckets: Map<string, CoverageBucket>;
}): string {
  const { evidenceFileCount, totalRows, sampleSize, provCheck, buckets } = input;
  const md: string[] = [];

  md.push("# Evidence Health — Phase 2 high-level audit");
  md.push("");
  md.push(`**Run:** ${new Date().toISOString()}`);
  md.push(`**Evidence files:** ${evidenceFileCount}`);
  md.push(`**Total records:** ${totalRows}`);
  md.push("");

  md.push("## 1. Provenance round-trip");
  md.push("");
  md.push(`Sample size: **${sampleSize}** rows (stride sample across all evidence).`);
  md.push("");
  md.push(`| Passed | Failed |`);
  md.push(`|---|---|`);
  md.push(`| ${provCheck.passed} | ${provCheck.failed} |`);
  md.push("");
  if (provCheck.failed > 0) {
    md.push("### Failures");
    // Only the first 20 are listed; the table above carries the full count.
    for (const f of provCheck.failures.slice(0, 20)) {
      md.push(`- \`${f.output_path.split("/").slice(-2).join("/")}\` line ${f.line}: ${f.reason}`);
    }
  } else if (sampleSize === 0) {
    // Nothing was checked, so do not claim that anything traced cleanly.
    md.push("**No rows were available to sample.**");
  } else {
    md.push("**All sampled rows traced cleanly back to source rows with matching canonical sig_hash.**");
  }
  md.push("");

  md.push("## 2. Score-readiness coverage");
  md.push("");
  md.push("Per source, fraction of materialized rows carrying each signal the Phase 3 Success Scorer will read.");
  md.push("");
  md.push("| Source | Rows | role | name | success | failure | obs.verdict | latency | retrieval | text | scoreable |");
  md.push("|---|---|---|---|---|---|---|---|---|---|---|");
  // Largest sources first.
  const sortedBuckets = Array.from(buckets.values()).sort((a, b) => b.total - a.total);
  for (const b of sortedBuckets) {
    md.push(`| ${b.source} | ${b.total} | ${pct(b.with_model_role, b.total)} | ${pct(b.with_model_name, b.total)} | ${pct(b.with_success_markers, b.total)} | ${pct(b.with_failure_markers, b.total)} | ${pct(b.with_observer_verdict, b.total)} | ${pct(b.with_latency_ms, b.total)} | ${pct(b.with_retrieved_context, b.total)} | ${pct(b.with_text, b.total)} | **${pct(b.scoreable, b.total)}** |`);
  }
  md.push("");

  // Aggregate totals row
  const totals = sortedBuckets.reduce(
    (acc, b) => ({
      total: acc.total + b.total,
      role: acc.role + b.with_model_role,
      name: acc.name + b.with_model_name,
      success: acc.success + b.with_success_markers,
      failure: acc.failure + b.with_failure_markers,
      obs: acc.obs + b.with_observer_verdict,
      lat: acc.lat + b.with_latency_ms,
      ret: acc.ret + b.with_retrieved_context,
      text: acc.text + b.with_text,
      score: acc.score + b.scoreable,
    }),
    { total: 0, role: 0, name: 0, success: 0, failure: 0, obs: 0, lat: 0, ret: 0, text: 0, score: 0 },
  );
  md.push(`**Aggregate:** ${totals.total} rows · role ${pct(totals.role, totals.total)} · name ${pct(totals.name, totals.total)} · success ${pct(totals.success, totals.total)} · failure ${pct(totals.failure, totals.total)} · obs.verdict ${pct(totals.obs, totals.total)} · latency ${pct(totals.lat, totals.total)} · retrieval ${pct(totals.ret, totals.total)} · text ${pct(totals.text, totals.total)} · scoreable **${pct(totals.score, totals.total)}**`);
  md.push("");

  md.push("## 3. Phase 3 readiness");
  md.push("");
  if (totals.total === 0) {
    // Zero rows would otherwise slip through both checks below and read as READY.
    md.push("**❌ NOT READY** — evidence files contain no records. Run scripts/distillation/build_evidence_index.ts first.");
  } else if (provCheck.failed > 0) {
    md.push("**❌ NOT READY** — provenance round-trip failed. Fix materializer or transforms before Phase 3.");
  } else if (totals.score < totals.total * 0.5) {
    md.push(`**⚠️ PARTIAL READINESS** — only ${pct(totals.score, totals.total)} of records are scoreable. Phase 3 will produce many \`needs_human_review\` until transforms enrich more sources with markers.`);
  } else {
    md.push(`**✓ READY** — provenance traces, ${pct(totals.score, totals.total)} of records carry scorer signals. Phase 3 can begin.`);
  }
  md.push("");

  return md.join("\n");
}

// Entry point: scan coverage, run the provenance round-trip on a stride
// sample, print the report and write it to data/_kb/evidence_health.md.
// Exits with status 1 when there is no evidence at all (no files, or files
// with zero rows) or when any sampled row fails the provenance check.
async function main() {
  const evidenceFiles = listEvidenceFiles(resolve(ROOT, "data/evidence"));
  if (evidenceFiles.length === 0) {
    console.error("No evidence files found. Run scripts/distillation/build_evidence_index.ts first.");
    process.exit(1);
  }

  // ── 1. Coverage scan ────────────────────────────────────────────
  const { buckets, rows } = scanEvidence(evidenceFiles);

  // ── 2. Provenance round-trip on a stride sample ─────────────────
  const sample = strideSample(rows, SAMPLE_FOR_PROVENANCE);
  const provCheck = await verifyProvenance(sample);

  // ── 3. Render markdown ──────────────────────────────────────────
  const out = renderReport({
    evidenceFileCount: evidenceFiles.length,
    totalRows: rows.length,
    sampleSize: sample.length,
    provCheck,
    buckets,
  });
  console.log(out);
  writeFileSync(resolve(ROOT, "data/_kb/evidence_health.md"), out);

  if (provCheck.failed > 0 || rows.length === 0) process.exit(1);
}

if (import.meta.main) {
  main().catch(e => {
    console.error(e);
    process.exit(1);
  });
}