// Unit tests on the pure scoreRecord function. No I/O, no fixtures —
// inline EvidenceRecord makers per source class. Each scoring rule
// gets a positive case + at least one boundary case.

import { test, expect } from "bun:test";
import { scoreRecord, SCORER_VERSION, buildScoredRun } from "../../scripts/distillation/scorer";
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord, type ModelRole } from "../../auditor/schemas/distillation/evidence_record";

// Fixed timestamp and placeholder hash so every record built here is reproducible.
const NOW = "2026-04-26T22:30:00.000Z";
const SHA = "0".repeat(64);

/**
 * Builds an in-memory EvidenceRecord for a single test case.
 *
 * Defaults are supplied for `run_id`, `task_id`, `timestamp`,
 * `schema_version` and `provenance`; `provenance.source_file` is set to
 * `data/_kb/<source_stem>.jsonl`.
 *
 * @param opts Field overrides plus the mandatory `source_stem`. Because
 *   `opts` is spread last, any field present in it (including
 *   `schema_version` or `provenance`) replaces the default, and
 *   `source_stem` itself is also copied onto the returned object.
 * @returns The assembled object, asserted to be an EvidenceRecord.
 */
function makeEvidence(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
  return {
    run_id: opts.run_id ?? "run-test",
    task_id: opts.task_id ?? "task-test",
    timestamp: opts.timestamp ?? NOW,
    schema_version: EVIDENCE_SCHEMA_VERSION,
    provenance: {
      source_file: `data/_kb/${opts.source_stem}.jsonl`,
      line_offset: 0,
      sig_hash: SHA,
      recorded_at: NOW,
    },
    // Spread last so explicit overrides win over the defaults above.
    ...opts,
  } as EvidenceRecord;
}

// ─── Class A: scrum_reviews ───────────────────────────────────────

test("scrum_reviews: accepted_on_attempt_1 → accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "scrum_reviews",
    success_markers: ["accepted_on_attempt_1"],
  }));
  expect(r.category).toBe("accepted");
  expect(r.sub_scores?.accepted_on_attempt).toBe(1);
  expect(r.reasons.some(x => x.includes("first attempt"))).toBe(true);
});

test("scrum_reviews: accepted_on_attempt_2 → partially_accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "scrum_reviews",
    success_markers: ["accepted_on_attempt_2"],
  }));
  expect(r.category).toBe("partially_accepted");
});

test("scrum_reviews: accepted_on_attempt_5 → partially_accepted with high-cost reason", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "scrum_reviews",
    success_markers: ["accepted_on_attempt_5"],
  }));
  expect(r.category).toBe("partially_accepted");
  expect(r.reasons.some(x => x.includes("5 attempts"))).toBe(true);
});

test("scrum_reviews: no success_markers → needs_human_review", () => {
  const r = scoreRecord(makeEvidence({ source_stem: "scrum_reviews" }));
  expect(r.category).toBe("needs_human_review");
});

// ─── Class A: observer_reviews ────────────────────────────────────

test("observer_reviews: accept → accepted", () => {
  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "accept" }));
  expect(r.category).toBe("accepted");
  expect(r.sub_scores?.observer_verdict).toBe("accept");
});

test("observer_reviews: reject → rejected", () => {
  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "reject" }));
  expect(r.category).toBe("rejected");
});

test("observer_reviews: cycle → partially_accepted", () => {
  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "cycle" }));
  expect(r.category).toBe("partially_accepted");
});

test("observer_reviews: missing verdict → needs_human_review", () => {
  const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews" }));
  expect(r.category).toBe("needs_human_review");
});

// ─── Class A: audits (per-finding stream, severity-based) ────────

test("audits: severity_info → accepted (minor finding)", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    success_markers: ["audit_severity_info"],
  }));
  expect(r.category).toBe("accepted");
});

test("audits: severity_low → accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    success_markers: ["audit_severity_low"],
  }));
  expect(r.category).toBe("accepted");
});

test("audits: severity_medium → partially_accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    failure_markers: ["audit_severity_medium"],
  }));
  expect(r.category).toBe("partially_accepted");
});

test("audits: severity_high → rejected", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    failure_markers: ["audit_severity_high"],
  }));
  expect(r.category).toBe("rejected");
});

test("audits: severity_critical → rejected", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    failure_markers: ["audit_severity_critical"],
  }));
  expect(r.category).toBe("rejected");
});

// Legacy markers preserved for back-compat with pre-fix data on disk
test("audits: legacy 'approved' still maps to accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    success_markers: ["approved"],
  }));
  expect(r.category).toBe("accepted");
});

test("audits: legacy 'blocked' still maps to rejected", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "audits",
    failure_markers: ["blocked"],
  }));
  expect(r.category).toBe("rejected");
});

// ─── Class A: contract_analyses ───────────────────────────────────

test("contract_analyses: observer_rejected failure marker → rejected", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "contract_analyses",
    failure_markers: ["observer_rejected"],
    observer_verdict: "reject",
  }));
  expect(r.category).toBe("rejected");
});

test("contract_analyses: observer accept → accepted", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "contract_analyses",
    observer_verdict: "accept",
  }));
  expect(r.category).toBe("accepted");
});

// ─── Class B: auto_apply ──────────────────────────────────────────

test("auto_apply: committed → accepted with cargo_green=true", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "auto_apply",
    success_markers: ["committed"],
  }));
  expect(r.category).toBe("accepted");
  expect(r.sub_scores?.cargo_green).toBe(true);
});

test("auto_apply: build_red_reverted → rejected with cargo_green=false", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "auto_apply",
    failure_markers: ["build_red_reverted"],
  }));
  expect(r.category).toBe("rejected");
  expect(r.sub_scores?.cargo_green).toBe(false);
});

test("auto_apply: warnings_increased_reverted → rejected", () => {
  const r = scoreRecord(makeEvidence({
    source_stem: "auto_apply",
    failure_markers: ["warnings_increased_reverted"],
  }));
  expect(r.category).toBe("rejected");
});
// Remaining auto_apply case: a record carrying neither success nor failure
// markers is expected to land in needs_human_review.
test("auto_apply: no markers → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "auto_apply" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

// ─── Class B: outcomes ────────────────────────────────────────────

test("outcomes: all_events_ok → accepted", () => {
  const evidence = makeEvidence({
    source_stem: "outcomes",
    success_markers: ["all_events_ok"],
  });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("accepted");
});

test("outcomes: gap_signals > 0 → partially_accepted", () => {
  const evidence = makeEvidence({
    source_stem: "outcomes",
    validation_results: { gap_signals: 3 },
  });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("partially_accepted");
});

// ─── Class B: mode_experiments ────────────────────────────────────

test("mode_experiments: empty text → rejected", () => {
  const evidence = makeEvidence({
    source_stem: "mode_experiments",
    text: "",
  });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("rejected");
});

test("mode_experiments: latency > 120s → partially_accepted", () => {
  const evidence = makeEvidence({
    source_stem: "mode_experiments",
    text: "valid response",
    latency_ms: 150_000,
  });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("partially_accepted");
});

test("mode_experiments: text + reasonable latency → needs_human_review (no native verdict yet)", () => {
  const evidence = makeEvidence({
    source_stem: "mode_experiments",
    text: "response present",
    latency_ms: 10_000,
  });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

// ─── Class C: extraction-class default ────────────────────────────

test("distilled_facts: no native verdict → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "distilled_facts", text: "extracted fact" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

test("distilled_procedures: no native verdict → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "distilled_procedures" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

test("audit_facts: extraction-class → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "audit_facts" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

test("observer_escalations: extraction-class → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "observer_escalations" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

test("unknown source: defaults to extraction class → needs_human_review", () => {
  const evidence = makeEvidence({ source_stem: "some_future_stream" });
  const verdict = scoreRecord(evidence);
  expect(verdict.category).toBe("needs_human_review");
});

// ─── Universal invariants ─────────────────────────────────────────

test("every score has at least one reason (reasons non-empty)", () => {
  // Score one marker-less record per listed source stem; each result must
  // carry at least one reason, and no reason may be an empty string.
  const stems = [
    "scrum_reviews",
    "observer_reviews",
    "audits",
    "contract_analyses",
    "auto_apply",
    "outcomes",
    "mode_experiments",
    "distilled_facts",
  ];
  for (const stem of stems) {
    const verdict = scoreRecord(makeEvidence({ source_stem: stem }));
    expect(verdict.reasons.length).toBeGreaterThanOrEqual(1);
    for (const why of verdict.reasons) {
      expect(why.length).toBeGreaterThan(0);
    }
  }
});

test("buildScoredRun stamps SCORER_VERSION + computes provenance.sig_hash", async () => {
  const evidence = makeEvidence({
    source_stem: "scrum_reviews",
    success_markers: ["accepted_on_attempt_1"],
  });
  const run = await buildScoredRun(evidence, "data/scored-runs/2026/04/27/scrum_reviews.jsonl", 0, NOW);

  // Version stamp and back-references to the originating evidence.
  expect(run.scorer_version).toBe(SCORER_VERSION);
  expect(run.evidence_run_id).toBe(evidence.run_id);
  expect(run.evidence_task_id).toBe(evidence.task_id);

  // Provenance: a 64-character lowercase hex hash plus the path passed in.
  expect(run.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
  expect(run.provenance.source_file).toBe("data/scored-runs/2026/04/27/scrum_reviews.jsonl");
});

test("buildScoredRun is deterministic — same input → same sig_hash", async () => {
  const evidence = makeEvidence({
    source_stem: "scrum_reviews",
    success_markers: ["accepted_on_attempt_1"],
  });
  // Two sequential builds with identical arguments must hash identically.
  const first = await buildScoredRun(evidence, "p", 0, NOW);
  const second = await buildScoredRun(evidence, "p", 0, NOW);
  expect(first.provenance.sig_hash).toBe(second.provenance.sig_hash);
});