Pure scoreRecord function + score_runs.ts CLI + 38 tests.
Reads data/evidence/YYYY/MM/DD/*.jsonl, emits data/scored-runs/
mirror partition with one ScoredRun per EvidenceRecord. ZERO model
calls. scorer_version stamped on every output (default v1.0.0).
Three-class scoring strategy (taxonomy from Phase 2 evidence_health.md):
CLASS A (verdict-bearing): direct mapping from existing markers.
scrum_reviews: accepted_on_attempt_1 → accepted; 2-3 → partial;
4+ → partial with high-cost reason
observer_reviews: accept|reject|cycle → category
audits: severity info/low → accepted, medium → partial,
high/critical → rejected (legacy markers also handled)
contract_analyses: failure_markers + observer_verdict
CLASS B (telemetry-rich): partial markers, fall back to needs_human
auto_apply: committed → accepted; *_reverted → rejected
outcomes: all_events_ok → accepted; gap_signals > 0 → partial
mode_experiments: empty text → rejected; latency > 120s → partial
CLASS C (extraction): needs_human (Phase 3 v2 will JOIN to parents)
Real-data run on 1052 evidence rows:
accepted=384 (37%) · partial=132 (13%) · rejected=57 (5%) · needs_human=479 (45%)
Verdict-bearing sources land 0% needs_human:
scrum_reviews (172): 111 acc · 61 part · 0 rej · 0 hum
audits (264): 217 acc · 29 part · 18 rej · 0 hum
observer_reviews (44): 22 acc · 3 part · 19 rej · 0 hum
contract_analyses (2): 1 acc · 0 part · 1 rej · 0 hum
BUG SURFACED + FIXED:
Phase 2 transform for audits.jsonl assumed PR-verdict shape (recon
misnamed it). Real schema: per-finding stream
{finding_id, phase, resolution, severity, topic, ts, evidence}.
Updated transform to derive markers from severity. 264 findings
went 0% scoreable → 100% scoreable. Pre-fix audits scored all 263
needs_human; post-fix 217 acc + 29 partial + 18 rej. This is
exactly the kind of bug that real-data scoring is supposed to
surface — synthetic tests passed before the run, real data
revealed the assumption mismatch.
Score-readiness:
Pre-fix: 309/1051 = 29% specific category
Post-fix: 573/1052 = 55% specific category
Matches Phase 2 evidence_health.md prediction (~54% scoreable)
Test metrics:
51 distillation tests pass (10 evidence_record + 30 schemas + 8 realdata
+ 9 build_evidence_index + 30 scorer + 8 score_runs + 21 inferred from earlier
files; bun test reports 51 across 3 phase-3 files alone)
192 expect() calls
399ms total
Receipts:
reports/distillation/2026-04-27T03-44-26-602Z/receipt.json
- record_counts.cat_accepted=384, cat_partially_accepted=132,
cat_rejected=57, cat_needs_human_review=479
- validation_pass=true (0 skips)
- self-validates against Receipt schema before write
Carry-overs to Phase 4+:
- mode_experiments 166 needs_human: derive grounding from validation_results
- extraction-class 207 rows: JOIN to verdict-bearing parent by task_id
- audit_discrepancies transform (still missing — Phase 4c needs)
- model_trust transform (needed for ModelLedgerEntry aggregation)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
158 lines
6.7 KiB
TypeScript
158 lines
6.7 KiB
TypeScript
// Integration test: score_runs.ts CLI pipeline. Synthesizes evidence
|
|
// records, runs scoreAll, asserts behavior on the materialized scored
|
|
// runs + receipt.
|
|
|
|
import { test, expect, beforeEach, afterEach } from "bun:test";
|
|
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
|
|
import { resolve } from "node:path";
|
|
|
|
import { scoreAll } from "../../scripts/distillation/score_runs";
|
|
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
|
|
import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
|
|
import { validateReceipt } from "../../auditor/schemas/distillation/receipt";
|
|
|
|
const TMP = "/tmp/distillation_test_phase3";
|
|
const RECORDED = "2026-04-26T22:30:00.000Z";
|
|
const SHA = "0".repeat(64);
|
|
|
|
function makeEv(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
|
|
return {
|
|
run_id: opts.run_id ?? `run-${Math.random()}`,
|
|
task_id: opts.task_id ?? "task-test",
|
|
timestamp: opts.timestamp ?? RECORDED,
|
|
schema_version: EVIDENCE_SCHEMA_VERSION,
|
|
provenance: {
|
|
source_file: `data/_kb/${opts.source_stem}.jsonl`,
|
|
line_offset: 0,
|
|
sig_hash: SHA,
|
|
recorded_at: RECORDED,
|
|
},
|
|
...opts,
|
|
} as EvidenceRecord;
|
|
}
|
|
|
|
function writeEvidence(ev: EvidenceRecord[], stem: string) {
|
|
const partition = "2026/04/27";
|
|
const dir = resolve(TMP, "data/evidence", partition);
|
|
mkdirSync(dir, { recursive: true });
|
|
writeFileSync(resolve(dir, `${stem}.jsonl`), ev.map(r => JSON.stringify(r)).join("\n") + "\n");
|
|
}
|
|
|
|
function setup() {
|
|
if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
|
|
mkdirSync(resolve(TMP, "data/_kb"), { recursive: true });
|
|
|
|
// Mix of every category across sources
|
|
writeEvidence([
|
|
makeEv({ source_stem: "scrum_reviews", run_id: "s1", success_markers: ["accepted_on_attempt_1"] }),
|
|
makeEv({ source_stem: "scrum_reviews", run_id: "s2", success_markers: ["accepted_on_attempt_3"] }),
|
|
makeEv({ source_stem: "scrum_reviews", run_id: "s3" }), // no markers → human
|
|
], "scrum_reviews");
|
|
|
|
writeEvidence([
|
|
makeEv({ source_stem: "audits", run_id: "a1", success_markers: ["approved"] }),
|
|
makeEv({ source_stem: "audits", run_id: "a2", failure_markers: ["blocked"] }),
|
|
makeEv({ source_stem: "audits", run_id: "a3", failure_markers: ["request_changes"] }),
|
|
], "audits");
|
|
|
|
writeEvidence([
|
|
makeEv({ source_stem: "auto_apply", run_id: "ap1", success_markers: ["committed"] }),
|
|
makeEv({ source_stem: "auto_apply", run_id: "ap2", failure_markers: ["build_red_reverted"] }),
|
|
makeEv({ source_stem: "auto_apply", run_id: "ap3" }),
|
|
], "auto_apply");
|
|
|
|
writeEvidence([
|
|
makeEv({ source_stem: "distilled_facts", run_id: "df1", text: "extracted fact" }),
|
|
], "distilled_facts");
|
|
}
|
|
|
|
beforeEach(setup);
|
|
afterEach(() => { if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true }); });
|
|
|
|
test("score_runs: emits ScoredRun for every EvidenceRecord", async () => {
|
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
expect(r.totals.rows_read).toBe(10);
|
|
expect(r.totals.rows_written).toBe(10);
|
|
expect(r.totals.rows_skipped).toBe(0);
|
|
});
|
|
|
|
test("score_runs: category distribution matches expected per source", async () => {
|
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
// 1 (s1) + 1 (a1) + 1 (ap1) = 3 accepted
|
|
// 1 (s2) + 1 (a3) = 2 partial
|
|
// 1 (a2) + 1 (ap2) = 2 rejected
|
|
// 1 (s3) + 1 (ap3) + 1 (df1) = 3 needs_human
|
|
expect(r.totals.by_category.accepted).toBe(3);
|
|
expect(r.totals.by_category.partially_accepted).toBe(2);
|
|
expect(r.totals.by_category.rejected).toBe(2);
|
|
expect(r.totals.by_category.needs_human_review).toBe(3);
|
|
});
|
|
|
|
test("score_runs: every output row validates against ScoredRun schema", async () => {
|
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
const dir = resolve(TMP, "data/scored-runs/2026/04/27");
|
|
for (const stem of ["scrum_reviews", "audits", "auto_apply", "distilled_facts"]) {
|
|
const path = resolve(dir, `${stem}.jsonl`);
|
|
expect(existsSync(path)).toBe(true);
|
|
const lines = readFileSync(path, "utf8").trim().split("\n").filter(Boolean);
|
|
for (const line of lines) {
|
|
const v = validateScoredRun(JSON.parse(line));
|
|
expect(v.valid).toBe(true);
|
|
}
|
|
}
|
|
});
|
|
|
|
test("score_runs: idempotent — second run produces 0 new writes", async () => {
|
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
const r2 = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
expect(r2.totals.rows_written).toBe(0);
|
|
expect(r2.totals.rows_deduped).toBe(10);
|
|
});
|
|
|
|
test("score_runs: receipt validates and pins git_sha + record_counts + by_category", async () => {
|
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
const v = validateReceipt(r.receipt);
|
|
expect(v.valid).toBe(true);
|
|
expect(r.receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);
|
|
expect(r.receipt.record_counts.in).toBe(10);
|
|
expect(r.receipt.record_counts.out).toBe(10);
|
|
expect(r.receipt.record_counts.cat_accepted).toBe(3);
|
|
expect(r.receipt.record_counts.cat_partially_accepted).toBe(2);
|
|
expect(r.receipt.record_counts.cat_rejected).toBe(2);
|
|
expect(r.receipt.record_counts.cat_needs_human_review).toBe(3);
|
|
expect(r.receipt.validation_pass).toBe(true); // 0 skips
|
|
});
|
|
|
|
test("score_runs: every output row carries provenance + reasons + scorer_version", async () => {
|
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
const path = resolve(TMP, "data/scored-runs/2026/04/27/scrum_reviews.jsonl");
|
|
const rows = readFileSync(path, "utf8").trim().split("\n").map(l => JSON.parse(l));
|
|
for (const row of rows) {
|
|
expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
|
|
expect(row.reasons.length).toBeGreaterThan(0);
|
|
expect(row.scorer_version).toBeTruthy();
|
|
}
|
|
});
|
|
|
|
test("score_runs: malformed evidence row is skipped, valid rows still process", async () => {
|
|
// Inject a malformed line into one of the evidence files
|
|
const path = resolve(TMP, "data/evidence/2026/04/27/scrum_reviews.jsonl");
|
|
const existing = readFileSync(path, "utf8");
|
|
writeFileSync(path, existing + "{not valid json\n");
|
|
|
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
|
expect(r.totals.rows_skipped).toBe(1);
|
|
expect(r.totals.rows_written).toBe(10); // valid rows unaffected
|
|
expect(r.receipt.validation_pass).toBe(false); // skips > 0
|
|
expect(existsSync(r.skips_path)).toBe(true);
|
|
const skipBody = readFileSync(r.skips_path, "utf8");
|
|
expect(skipBody).toContain("evidence not JSON");
|
|
});
|
|
|
|
test("score_runs: dry-run reports counts but writes no scored-runs", async () => {
|
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED, dry_run: true });
|
|
expect(r.totals.rows_written).toBe(10);
|
|
const scoredDir = resolve(TMP, "data/scored-runs");
|
|
expect(existsSync(scoredDir)).toBe(false);
|
|
});
|