Pure scoreRecord function + score_runs.ts CLI + 38 tests.
Reads data/evidence/YYYY/MM/DD/*.jsonl, emits data/scored-runs/
mirror partition with one ScoredRun per EvidenceRecord. ZERO model
calls. scorer_version stamped on every output (default v1.0.0).
Three-class scoring strategy (taxonomy from Phase 2 evidence_health.md):
CLASS A (verdict-bearing): direct mapping from existing markers.
scrum_reviews: accepted_on_attempt_1 → accepted; 2-3 → partial;
4+ → partial with high-cost reason
observer_reviews: accept|reject|cycle → category
audits: severity info/low → accepted, medium → partial,
high/critical → rejected (legacy markers also handled)
contract_analyses: failure_markers + observer_verdict
CLASS B (telemetry-rich): partial markers, fall back to needs_human
auto_apply: committed → accepted; *_reverted → rejected
outcomes: all_events_ok → accepted; gap_signals > 0 → partial
mode_experiments: empty text → rejected; latency > 120s → partial
CLASS C (extraction): needs_human (Phase 3 v2 will JOIN to parents)
Real-data run on 1052 evidence rows:
accepted=384 (37%) · partial=132 (13%) · rejected=57 (5%) · needs_human=479 (45%)
Verdict-bearing sources land 0% needs_human:
scrum_reviews (172): 111 acc · 61 part · 0 rej · 0 hum
audits (264): 217 acc · 29 part · 18 rej · 0 hum
observer_reviews (44): 22 acc · 3 part · 19 rej · 0 hum
contract_analyses (2): 1 acc · 0 part · 1 rej · 0 hum
BUG SURFACED + FIXED:
Phase 2 transform for audits.jsonl assumed PR-verdict shape (recon
misnamed it). Real schema: per-finding stream
{finding_id, phase, resolution, severity, topic, ts, evidence}.
Updated transform to derive markers from severity. 264 findings
went 0% scoreable → 100% scoreable. Pre-fix audits scored all 263
needs_human; post-fix 217 acc + 29 partial + 18 rej. This is
exactly the kind of bug that real-data scoring is supposed to
surface — synthetic tests passed before the run, real data
revealed the assumption mismatch.
Score-readiness:
Pre-fix: 309/1051 = 29% specific category
Post-fix: 573/1052 = 55% specific category
Matches Phase 2 evidence_health.md prediction (~54% scoreable)
Test metrics:
51 distillation tests pass (10 evidence_record + 30 schemas + 8 realdata
+ 9 build_evidence_index + 30 scorer + 8 score_runs + 21 inferred from earlier
files; bun test reports 51 across 3 phase-3 files alone)
192 expect() calls
399ms total
Receipts:
reports/distillation/2026-04-27T03-44-26-602Z/receipt.json
- record_counts.cat_accepted=384, cat_partially_accepted=132,
cat_rejected=57, cat_needs_human_review=479
- validation_pass=true (0 skips)
- self-validates against Receipt schema before write
Carry-overs to Phase 4+:
- mode_experiments 166 needs_human: derive grounding from validation_results
- extraction-class 207 rows: JOIN to verdict-bearing parent by task_id
- audit_discrepancies transform (still missing — Phase 4c needs)
- model_trust transform (needed for ModelLedgerEntry aggregation)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
298 lines
11 KiB
TypeScript
298 lines
11 KiB
TypeScript
// Unit tests on the pure scoreRecord function. No I/O, no fixtures —
|
|
// inline EvidenceRecord makers per source class. Each scoring rule
|
|
// gets a positive case + at least one boundary case.
|
|
|
|
import { test, expect } from "bun:test";
|
|
import { scoreRecord, SCORER_VERSION, buildScoredRun } from "../../scripts/distillation/scorer";
|
|
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord, type ModelRole } from "../../auditor/schemas/distillation/evidence_record";
|
|
|
|
const NOW = "2026-04-26T22:30:00.000Z";
|
|
const SHA = "0".repeat(64);
|
|
|
|
function makeEvidence(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
|
|
return {
|
|
run_id: opts.run_id ?? "run-test",
|
|
task_id: opts.task_id ?? "task-test",
|
|
timestamp: opts.timestamp ?? NOW,
|
|
schema_version: EVIDENCE_SCHEMA_VERSION,
|
|
provenance: {
|
|
source_file: `data/_kb/${opts.source_stem}.jsonl`,
|
|
line_offset: 0,
|
|
sig_hash: SHA,
|
|
recorded_at: NOW,
|
|
},
|
|
...opts,
|
|
} as EvidenceRecord;
|
|
}
|
|
|
|
// ─── Class A: scrum_reviews ───────────────────────────────────────
|
|
|
|
test("scrum_reviews: accepted_on_attempt_1 → accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "scrum_reviews",
|
|
success_markers: ["accepted_on_attempt_1"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
expect(r.sub_scores?.accepted_on_attempt).toBe(1);
|
|
expect(r.reasons.some(x => x.includes("first attempt"))).toBe(true);
|
|
});
|
|
|
|
test("scrum_reviews: accepted_on_attempt_2 → partially_accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "scrum_reviews",
|
|
success_markers: ["accepted_on_attempt_2"],
|
|
}));
|
|
expect(r.category).toBe("partially_accepted");
|
|
});
|
|
|
|
test("scrum_reviews: accepted_on_attempt_5 → partially_accepted with high-cost reason", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "scrum_reviews",
|
|
success_markers: ["accepted_on_attempt_5"],
|
|
}));
|
|
expect(r.category).toBe("partially_accepted");
|
|
expect(r.reasons.some(x => x.includes("5 attempts"))).toBe(true);
|
|
});
|
|
|
|
test("scrum_reviews: no success_markers → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "scrum_reviews" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
// ─── Class A: observer_reviews ────────────────────────────────────
|
|
|
|
test("observer_reviews: accept → accepted", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "accept" }));
|
|
expect(r.category).toBe("accepted");
|
|
expect(r.sub_scores?.observer_verdict).toBe("accept");
|
|
});
|
|
|
|
test("observer_reviews: reject → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "reject" }));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
test("observer_reviews: cycle → partially_accepted", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "cycle" }));
|
|
expect(r.category).toBe("partially_accepted");
|
|
});
|
|
|
|
test("observer_reviews: missing verdict → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
// ─── Class A: audits (per-finding stream, severity-based) ────────
|
|
|
|
test("audits: severity_info → accepted (minor finding)", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
success_markers: ["audit_severity_info"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
});
|
|
|
|
test("audits: severity_low → accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
success_markers: ["audit_severity_low"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
});
|
|
|
|
test("audits: severity_medium → partially_accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
failure_markers: ["audit_severity_medium"],
|
|
}));
|
|
expect(r.category).toBe("partially_accepted");
|
|
});
|
|
|
|
test("audits: severity_high → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
failure_markers: ["audit_severity_high"],
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
test("audits: severity_critical → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
failure_markers: ["audit_severity_critical"],
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
// Legacy markers preserved for back-compat with pre-fix data on disk
|
|
test("audits: legacy 'approved' still maps to accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
success_markers: ["approved"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
});
|
|
|
|
test("audits: legacy 'blocked' still maps to rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "audits",
|
|
failure_markers: ["blocked"],
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
// ─── Class A: contract_analyses ───────────────────────────────────
|
|
|
|
test("contract_analyses: observer_rejected failure marker → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "contract_analyses",
|
|
failure_markers: ["observer_rejected"],
|
|
observer_verdict: "reject",
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
test("contract_analyses: observer accept → accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "contract_analyses",
|
|
observer_verdict: "accept",
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
});
|
|
|
|
// ─── Class B: auto_apply ──────────────────────────────────────────
|
|
|
|
test("auto_apply: committed → accepted with cargo_green=true", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "auto_apply",
|
|
success_markers: ["committed"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
expect(r.sub_scores?.cargo_green).toBe(true);
|
|
});
|
|
|
|
test("auto_apply: build_red_reverted → rejected with cargo_green=false", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "auto_apply",
|
|
failure_markers: ["build_red_reverted"],
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
expect(r.sub_scores?.cargo_green).toBe(false);
|
|
});
|
|
|
|
test("auto_apply: warnings_increased_reverted → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "auto_apply",
|
|
failure_markers: ["warnings_increased_reverted"],
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
test("auto_apply: no markers → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "auto_apply" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
// ─── Class B: outcomes ────────────────────────────────────────────
|
|
|
|
test("outcomes: all_events_ok → accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "outcomes",
|
|
success_markers: ["all_events_ok"],
|
|
}));
|
|
expect(r.category).toBe("accepted");
|
|
});
|
|
|
|
test("outcomes: gap_signals > 0 → partially_accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "outcomes",
|
|
validation_results: { gap_signals: 3 },
|
|
}));
|
|
expect(r.category).toBe("partially_accepted");
|
|
});
|
|
|
|
// ─── Class B: mode_experiments ────────────────────────────────────
|
|
|
|
test("mode_experiments: empty text → rejected", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "mode_experiments",
|
|
text: "",
|
|
}));
|
|
expect(r.category).toBe("rejected");
|
|
});
|
|
|
|
test("mode_experiments: latency > 120s → partially_accepted", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "mode_experiments",
|
|
text: "valid response",
|
|
latency_ms: 150_000,
|
|
}));
|
|
expect(r.category).toBe("partially_accepted");
|
|
});
|
|
|
|
test("mode_experiments: text + reasonable latency → needs_human_review (no native verdict yet)", () => {
|
|
const r = scoreRecord(makeEvidence({
|
|
source_stem: "mode_experiments",
|
|
text: "response present",
|
|
latency_ms: 10_000,
|
|
}));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
// ─── Class C: extraction-class default ────────────────────────────
|
|
|
|
test("distilled_facts: no native verdict → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "distilled_facts", text: "extracted fact" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
test("distilled_procedures: no native verdict → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "distilled_procedures" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
test("audit_facts: extraction-class → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "audit_facts" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
test("observer_escalations: extraction-class → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_escalations" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
test("unknown source: defaults to extraction class → needs_human_review", () => {
|
|
const r = scoreRecord(makeEvidence({ source_stem: "some_future_stream" }));
|
|
expect(r.category).toBe("needs_human_review");
|
|
});
|
|
|
|
// ─── Universal invariants ─────────────────────────────────────────
|
|
|
|
test("every score has at least one reason (reasons non-empty)", () => {
|
|
// Sample a scoring of every source we know about
|
|
const sources = ["scrum_reviews", "observer_reviews", "audits", "contract_analyses",
|
|
"auto_apply", "outcomes", "mode_experiments", "distilled_facts"];
|
|
for (const s of sources) {
|
|
const r = scoreRecord(makeEvidence({ source_stem: s }));
|
|
expect(r.reasons.length).toBeGreaterThanOrEqual(1);
|
|
for (const reason of r.reasons) expect(reason.length).toBeGreaterThan(0);
|
|
}
|
|
});
|
|
|
|
test("buildScoredRun stamps SCORER_VERSION + computes provenance.sig_hash", async () => {
|
|
const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
|
|
const scored = await buildScoredRun(ev, "data/scored-runs/2026/04/27/scrum_reviews.jsonl", 0, NOW);
|
|
expect(scored.scorer_version).toBe(SCORER_VERSION);
|
|
expect(scored.evidence_run_id).toBe(ev.run_id);
|
|
expect(scored.evidence_task_id).toBe(ev.task_id);
|
|
expect(scored.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
|
|
expect(scored.provenance.source_file).toBe("data/scored-runs/2026/04/27/scrum_reviews.jsonl");
|
|
});
|
|
|
|
test("buildScoredRun is deterministic — same input → same sig_hash", async () => {
|
|
const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
|
|
const a = await buildScoredRun(ev, "p", 0, NOW);
|
|
const b = await buildScoredRun(ev, "p", 0, NOW);
|
|
expect(a.provenance.sig_hash).toBe(b.provenance.sig_hash);
|
|
});
|