// Integration test: score_runs.ts CLI pipeline. Synthesizes evidence
// records, runs scoreAll, asserts behavior on the materialized scored
// runs + receipt.

import { test, expect, beforeEach, afterEach } from "bun:test";
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
import { resolve } from "node:path";

import { scoreAll } from "../../scripts/distillation/score_runs";
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
import { validateReceipt } from "../../auditor/schemas/distillation/receipt";

/** Scratch root handed to scoreAll; deleted and rebuilt around every test. */
const TMP = "/tmp/distillation_test_phase3";

/** Fixed timestamp used for fixture records and passed to scoreAll as `recorded_at`. */
const RECORDED = "2026-04-26T22:30:00.000Z";

/** Placeholder signature hash: 64 zero characters. */
const SHA = "0".repeat(64);

/**
 * Builds a synthetic evidence record for the fixtures.
 *
 * Defaults are supplied for `run_id`, `task_id`, `timestamp`, `schema_version`
 * and `provenance`; `opts` is spread last, so any field the caller passes
 * (including a whole `provenance` object) replaces the default.
 *
 * @param opts Field overrides plus `source_stem`, which names the
 *   `data/_kb/<source_stem>.jsonl` file recorded in `provenance.source_file`.
 * @returns The assembled object, cast to `EvidenceRecord`.
 */
function makeEv(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
  return {
    run_id: opts.run_id ?? `run-${Math.random()}`,
    task_id: opts.task_id ?? "task-test",
    timestamp: opts.timestamp ?? RECORDED,
    schema_version: EVIDENCE_SCHEMA_VERSION,
    provenance: {
      source_file: `data/_kb/${opts.source_stem}.jsonl`,
      line_offset: 0,
      sig_hash: SHA,
      recorded_at: RECORDED,
    },
    // NOTE(review): this spread also copies `source_stem` onto the record, so
    // it is serialized into the evidence files. Left as-is because the
    // pipeline under test is not visible from here — confirm whether intended.
    ...opts,
  } as EvidenceRecord;
}

/**
 * Writes records as JSONL (one JSON document per line, trailing newline) to
 * `<TMP>/data/evidence/2026/04/27/<stem>.jsonl`, creating the directory first.
 *
 * @param ev Records to serialize, in order.
 * @param stem File name without the `.jsonl` extension.
 */
function writeEvidence(ev: EvidenceRecord[], stem: string): void {
  const partition = "2026/04/27";
  const dir = resolve(TMP, "data/evidence", partition);
  mkdirSync(dir, { recursive: true });

  const body = ev.map(r => JSON.stringify(r)).join("\n") + "\n";
  writeFileSync(resolve(dir, `${stem}.jsonl`), body);
}

/**
 * Resets TMP and writes the shared fixture: ten evidence records spread over
 * four source stems (3 scrum_reviews, 3 audits, 3 auto_apply, 1 distilled_facts).
 */
function setup(): void {
  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
  mkdirSync(resolve(TMP, "data/_kb"), { recursive: true });

  // Mix of every category across sources
  writeEvidence(
    [
      makeEv({ source_stem: "scrum_reviews", run_id: "s1", success_markers: ["accepted_on_attempt_1"] }),
      makeEv({ source_stem: "scrum_reviews", run_id: "s2", success_markers: ["accepted_on_attempt_3"] }),
      makeEv({ source_stem: "scrum_reviews", run_id: "s3" }), // no markers → human
    ],
    "scrum_reviews",
  );

  writeEvidence(
    [
      makeEv({ source_stem: "audits", run_id: "a1", success_markers: ["approved"] }),
      makeEv({ source_stem: "audits", run_id: "a2", failure_markers: ["blocked"] }),
      makeEv({ source_stem: "audits", run_id: "a3", failure_markers: ["request_changes"] }),
    ],
    "audits",
  );

  writeEvidence(
    [
      makeEv({ source_stem: "auto_apply", run_id: "ap1", success_markers: ["committed"] }),
      makeEv({ source_stem: "auto_apply", run_id: "ap2", failure_markers: ["build_red_reverted"] }),
      makeEv({ source_stem: "auto_apply", run_id: "ap3" }),
    ],
    "auto_apply",
  );

  writeEvidence(
    [makeEv({ source_stem: "distilled_facts", run_id: "df1", text: "extracted fact" })],
    "distilled_facts",
  );
}

beforeEach(setup);

// Remove the scratch tree so no state leaks between tests.
afterEach(() => {
  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
});

// All ten fixture records are read and written; none are skipped.
test("score_runs: emits ScoredRun for every EvidenceRecord", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });

  expect(r.totals.rows_read).toBe(10);
  expect(r.totals.rows_written).toBe(10);
  expect(r.totals.rows_skipped).toBe(0);
});

test("score_runs: category distribution matches expected per source", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });

  // 1 (s1) + 1 (a1) + 1 (ap1)  = 3 accepted
  // 1 (s2) + 1 (a3)            = 2 partial
  // 1 (a2) + 1 (ap2)           = 2 rejected
  // 1 (s3) + 1 (ap3) + 1 (df1) = 3 needs_human
  expect(r.totals.by_category.accepted).toBe(3);
  expect(r.totals.by_category.partially_accepted).toBe(2);
  expect(r.totals.by_category.rejected).toBe(2);
  expect(r.totals.by_category.needs_human_review).toBe(3);
});

test("score_runs: every output row validates against ScoredRun schema", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });

  const dir = resolve(TMP, "data/scored-runs/2026/04/27");
  for (const stem of ["scrum_reviews", "audits", "auto_apply", "distilled_facts"]) {
    const path = resolve(dir, `${stem}.jsonl`);
    expect(existsSync(path)).toBe(true);

    const lines = readFileSync(path, "utf8").trim().split("\n").filter(Boolean);
    // Guard against a vacuous pass: an existing but empty file would otherwise
    // skip the validation loop entirely.
    expect(lines.length).toBeGreaterThan(0);

    for (const line of lines) {
      const v = validateScoredRun(JSON.parse(line));
      expect(v.valid).toBe(true);
    }
  }
});

// Re-running over the same inputs must write nothing and count every row as deduped.
test("score_runs: idempotent — second run produces 0 new writes", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });
  const r2 = await scoreAll({ root: TMP, recorded_at: RECORDED });

  expect(r2.totals.rows_written).toBe(0);
  expect(r2.totals.rows_deduped).toBe(10);
});

test("score_runs: receipt validates and pins git_sha + record_counts + by_category", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });

  const v = validateReceipt(r.receipt);
  expect(v.valid).toBe(true);

  // 40 lowercase hex characters.
  expect(r.receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);

  expect(r.receipt.record_counts.in).toBe(10);
  expect(r.receipt.record_counts.out).toBe(10);
  expect(r.receipt.record_counts.cat_accepted).toBe(3);
  expect(r.receipt.record_counts.cat_partially_accepted).toBe(2);
  expect(r.receipt.record_counts.cat_rejected).toBe(2);
  expect(r.receipt.record_counts.cat_needs_human_review).toBe(3);

  expect(r.receipt.validation_pass).toBe(true); // 0 skips
});

test("score_runs: every output row carries provenance + reasons + scorer_version", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });

  const path = resolve(TMP, "data/scored-runs/2026/04/27/scrum_reviews.jsonl");
  const rows = readFileSync(path, "utf8").trim().split("\n").map(l => JSON.parse(l));

  for (const row of rows) {
    expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
    expect(row.reasons.length).toBeGreaterThan(0);
    expect(row.scorer_version).toBeTruthy();
  }
});

test("score_runs: malformed evidence row is skipped, valid rows still process", async () => {
  // Inject a malformed line into one of the evidence files
  const path = resolve(TMP, "data/evidence/2026/04/27/scrum_reviews.jsonl");
  const existing = readFileSync(path, "utf8");
  writeFileSync(path, existing + "{not valid json\n");

  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });

  expect(r.totals.rows_skipped).toBe(1);
  expect(r.totals.rows_written).toBe(10); // valid rows unaffected
  expect(r.receipt.validation_pass).toBe(false); // skips > 0

  // The skip must be recorded on disk with its reason.
  expect(existsSync(r.skips_path)).toBe(true);
  const skipBody = readFileSync(r.skips_path, "utf8");
  expect(skipBody).toContain("evidence not JSON");
});

test("score_runs: dry-run reports counts but writes no scored-runs", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED, dry_run: true });

  // The count is still reported, but the output directory must not exist.
  expect(r.totals.rows_written).toBe(10);
  const scoredDir = resolve(TMP, "data/scored-runs");
  expect(existsSync(scoredDir)).toBe(false);
});