distillation: Phase 3 — deterministic Success Scorer
Pure scoreRecord function + score_runs.ts CLI + 38 tests.
Reads data/evidence/YYYY/MM/DD/*.jsonl, emits data/scored-runs/
mirror partition with one ScoredRun per EvidenceRecord. ZERO model
calls. scorer_version stamped on every output (default v1.0.0).
Three-class scoring strategy (taxonomy from Phase 2 evidence_health.md):
CLASS A (verdict-bearing): direct mapping from existing markers.
scrum_reviews: accepted_on_attempt_1 → accepted; 2-3 → partial;
4+ → partial with high-cost reason
observer_reviews: accept|reject|cycle → category
audits: severity info/low → accepted, medium → partial,
high/critical → rejected (legacy markers also handled)
contract_analyses: failure_markers + observer_verdict
CLASS B (telemetry-rich): partial markers, fall back to needs_human
auto_apply: committed → accepted; *_reverted → rejected
outcomes: all_events_ok → accepted; gap_signals > 0 → partial
mode_experiments: empty text → rejected; latency > 120s → partial
CLASS C (extraction): needs_human (Phase 3 v2 will JOIN to parents)
Real-data run on 1052 evidence rows:
accepted=384 (37%) · partial=132 (13%) · rejected=57 (5%) · needs_human=479 (45%)
Verdict-bearing sources land 0% needs_human:
scrum_reviews (172): 111 acc · 61 part · 0 rej · 0 hum
audits (264): 217 acc · 29 part · 18 rej · 0 hum
observer_reviews (44): 22 acc · 3 part · 19 rej · 0 hum
contract_analyses (2): 1 acc · 0 part · 1 rej · 0 hum
BUG SURFACED + FIXED:
Phase 2 transform for audits.jsonl assumed PR-verdict shape (recon
misnamed it). Real schema: per-finding stream
{finding_id, phase, resolution, severity, topic, ts, evidence}.
Updated transform to derive markers from severity. 264 findings
went 0% scoreable → 100% scoreable. Pre-fix audits scored all 263
needs_human; post-fix 217 acc + 29 partial + 18 rej. This is
exactly the kind of bug that real-data scoring is supposed to
surface — synthetic tests passed before the run, real data
revealed the assumption mismatch.
Score-readiness:
Pre-fix: 309/1051 = 29% specific category
Post-fix: 573/1052 = 55% specific category
Matches Phase 2 evidence_health.md prediction (~54% scoreable)
Test metrics:
51 distillation tests pass across the 3 phase-3 files (per bun test);
earlier-phase suites (evidence_record, schemas, realdata,
build_evidence_index) also remain green
192 expect() calls
399ms total
Receipts:
reports/distillation/2026-04-27T03-44-26-602Z/receipt.json
- record_counts.cat_accepted=384, cat_partially_accepted=132,
cat_rejected=57, cat_needs_human_review=479
- validation_pass=true (0 skips)
- self-validates against Receipt schema before write
Carry-overs to Phase 4+:
- mode_experiments 166 needs_human: derive grounding from validation_results
- extraction-class 207 rows: JOIN to verdict-bearing parent by task_id
- audit_discrepancies transform (still missing — Phase 4c needs)
- model_trust transform (needed for ModelLedgerEntry aggregation)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1ea802943f
commit
c989253e9b
305
scripts/distillation/score_runs.ts
Normal file
305
scripts/distillation/score_runs.ts
Normal file
@ -0,0 +1,305 @@
|
|||||||
|
// score_runs.ts — CLI + I/O around the pure scoreRecord function.
|
||||||
|
// Reads data/evidence/YYYY/MM/DD/*.jsonl, writes scored-runs at the
|
||||||
|
// matching partition. Mirrors build_evidence_index.ts conventions:
|
||||||
|
// idempotent, schema-gated, receipt-emitting.
|
||||||
|
|
||||||
|
import { spawnSync } from "node:child_process";
import { createHash } from "node:crypto";
import { existsSync, readFileSync, mkdirSync, writeFileSync, readdirSync, statSync, appendFileSync } from "node:fs";
import { resolve, dirname } from "node:path";

import { buildScoredRun } from "./scorer";
import { validateEvidenceRecord, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
import { RECEIPT_SCHEMA_VERSION, validateReceipt, type Receipt, type FileReference } from "../../auditor/schemas/distillation/receipt";
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";
|
||||||
|
|
||||||
|
// All I/O is rooted here; LH_DISTILL_ROOT overrides it (tests / sandboxes).
const DEFAULT_ROOT = process.env.LH_DISTILL_ROOT ?? "/home/profit/lakehouse";
|
||||||
|
|
||||||
|
/** Inputs for one scoring pass over data/evidence. */
export interface ScoreOptions {
  /** Repo root containing data/ and reports/ (see DEFAULT_ROOT). */
  root: string;
  /** ISO timestamp stamped on every ScoredRun, skip row, and receipt started_at. */
  recorded_at: string;
  /** When true, score in memory but write no files. */
  dry_run?: boolean;
}
|
||||||
|
|
||||||
|
/** Per-evidence-file tally produced by processEvidenceFile. */
export interface ScoreSourceResult {
  /** Evidence file path, relative to the root. */
  source_file: string;
  /** Non-blank lines read (rows that later fail parsing are still counted). */
  rows_read: number;
  /** ScoredRuns appended to the output partition. */
  rows_written: number;
  /** Rows dropped by JSON parsing or schema validation (logged to the skips file). */
  rows_skipped: number;
  /** Rows whose sig_hash already existed in the output file (idempotent re-runs). */
  rows_deduped: number;
  /** Written-row counts keyed by score category. */
  by_category: Record<string, number>;
  /** Absolute output paths actually appended to (empty on dry runs). */
  output_files: string[];
}
|
||||||
|
|
||||||
|
/** Aggregate result of scoreAll across every evidence partition. */
export interface ScoreResult {
  /** One entry per evidence .jsonl processed, in walk order. */
  sources: ScoreSourceResult[];
  /** Sums of the per-source tallies. */
  totals: {
    rows_read: number;
    rows_written: number;
    rows_skipped: number;
    rows_deduped: number;
    by_category: Record<string, number>;
  };
  /** Self-validated run receipt (also written under reports/distillation). */
  receipt: Receipt;
  /** Where the receipt was written (or would have been, on a dry run). */
  receipt_path: string;
  /** Root of the data/scored-runs mirror partition. */
  scored_dir: string;
  /** JSONL file accumulating skipped-row diagnostics. */
  skips_path: string;
}
|
||||||
|
|
||||||
|
function listEvidenceFiles(evidence_root: string): string[] {
|
||||||
|
const out: string[] = [];
|
||||||
|
if (!existsSync(evidence_root)) return out;
|
||||||
|
for (const yyyy of readdirSync(evidence_root).sort()) {
|
||||||
|
const yp = resolve(evidence_root, yyyy);
|
||||||
|
if (!statSync(yp).isDirectory()) continue;
|
||||||
|
for (const mm of readdirSync(yp).sort()) {
|
||||||
|
const mp = resolve(yp, mm);
|
||||||
|
if (!statSync(mp).isDirectory()) continue;
|
||||||
|
for (const dd of readdirSync(mp).sort()) {
|
||||||
|
const dp = resolve(mp, dd);
|
||||||
|
if (!statSync(dp).isDirectory()) continue;
|
||||||
|
for (const f of readdirSync(dp)) {
|
||||||
|
if (f.endsWith(".jsonl")) out.push(resolve(dp, f));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
function sha256OfFile(path: string): string {
|
||||||
|
const h = new Bun.CryptoHasher("sha256");
|
||||||
|
h.update(readFileSync(path));
|
||||||
|
return h.digest("hex");
|
||||||
|
}
|
||||||
|
|
||||||
|
function gitSha(root: string): string {
|
||||||
|
const r = spawnSync("git", ["-C", root, "rev-parse", "HEAD"], { encoding: "utf8" });
|
||||||
|
return r.status === 0 ? r.stdout.trim() : "0".repeat(40);
|
||||||
|
}
|
||||||
|
function gitBranch(root: string): string | undefined {
|
||||||
|
const r = spawnSync("git", ["-C", root, "rev-parse", "--abbrev-ref", "HEAD"], { encoding: "utf8" });
|
||||||
|
return r.status === 0 ? r.stdout.trim() : undefined;
|
||||||
|
}
|
||||||
|
function gitDirty(root: string): boolean {
|
||||||
|
const r = spawnSync("git", ["-C", root, "status", "--porcelain"], { encoding: "utf8" });
|
||||||
|
return r.status === 0 && r.stdout.trim().length > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadSeenHashes(out_path: string): Set<string> {
|
||||||
|
const seen = new Set<string>();
|
||||||
|
if (!existsSync(out_path)) return seen;
|
||||||
|
for (const line of readFileSync(out_path, "utf8").split("\n")) {
|
||||||
|
if (!line) continue;
|
||||||
|
try {
|
||||||
|
const row = JSON.parse(line);
|
||||||
|
if (row?.provenance?.sig_hash) seen.add(row.provenance.sig_hash);
|
||||||
|
} catch { /* malformed — ignore */ }
|
||||||
|
}
|
||||||
|
return seen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Score every row of one evidence .jsonl and append results to the
 * mirrored data/scored-runs partition.
 *
 * Idempotent: rows whose EvidenceRecord sig_hash already exists in the
 * output file count as deduped and are not re-appended. Rows failing JSON
 * parsing or schema validation are skipped and logged to skips_path.
 *
 * @param ev_path     absolute path of the evidence file
 * @param opts        root / recorded_at / dry_run
 * @param scored_dir  absolute root of data/scored-runs
 * @param skips_path  absolute path of the skip-log JSONL
 * @returns per-file tallies plus the output files actually touched
 */
async function processEvidenceFile(
  ev_path: string,
  opts: ScoreOptions,
  scored_dir: string,
  skips_path: string,
): Promise<ScoreSourceResult> {
  // Output mirrors the input partition (YYYY/MM/DD/<source-stem>.jsonl);
  // anything outside data/evidence/YYYY/MM/DD/ lands in "unpartitioned".
  const partition = ev_path.match(/data\/evidence\/(\d{4}\/\d{2}\/\d{2})\//)?.[1] ?? "unpartitioned";
  const stem = ev_path.split("/").pop()!.replace(/\.jsonl$/, "");
  const out_path = resolve(scored_dir, partition, `${stem}.jsonl`);
  // Relative path recorded in each ScoredRun's provenance.
  const out_relpath = `data/scored-runs/${partition}/${stem}.jsonl`;

  const result: ScoreSourceResult = {
    source_file: ev_path.replace(opts.root + "/", ""),
    rows_read: 0,
    rows_written: 0,
    rows_skipped: 0,
    rows_deduped: 0,
    by_category: { accepted: 0, partially_accepted: 0, rejected: 0, needs_human_review: 0 },
    output_files: [],
  };

  if (!opts.dry_run) mkdirSync(dirname(out_path), { recursive: true });

  // Hashes already on disk → dedupe key for idempotent re-runs.
  const seen = loadSeenHashes(out_path);
  const lines = readFileSync(ev_path, "utf8").split("\n").filter(Boolean);
  const rowsToWrite: string[] = [];
  const skipsToWrite: string[] = [];

  // NOTE(review): `i` indexes the blank-line-filtered list, so the `line`
  // field in skip rows (and line_offset in provenance) can drift from raw
  // file line numbers when blank lines exist — confirm this is acceptable.
  for (let i = 0; i < lines.length; i++) {
    result.rows_read++;
    let evRow: any;
    try { evRow = JSON.parse(lines[i]); }
    catch (e) {
      result.rows_skipped++;
      skipsToWrite.push(JSON.stringify({
        evidence_file: result.source_file, line: i,
        errors: ["evidence not JSON: " + (e as Error).message.slice(0, 200)],
        recorded_at: opts.recorded_at,
      }));
      continue;
    }

    // Re-validate the evidence row before scoring — defensive; if a
    // malformed row slipped past Phase 2 it shouldn't poison Phase 3.
    const ev = validateEvidenceRecord(evRow);
    if (!ev.valid) {
      result.rows_skipped++;
      skipsToWrite.push(JSON.stringify({
        evidence_file: result.source_file, line: i,
        errors: ev.errors,
        recorded_at: opts.recorded_at,
      }));
      continue;
    }

    // Score, then dedupe on the record-content hash before validating output.
    const scored = await buildScoredRun(ev.value as EvidenceRecord, out_relpath, i, opts.recorded_at);
    if (seen.has(scored.provenance.sig_hash)) {
      result.rows_deduped++;
      continue;
    }
    seen.add(scored.provenance.sig_hash);

    // Schema-gate the output row too; failures are logged, never written.
    const sv = validateScoredRun(scored);
    if (!sv.valid) {
      result.rows_skipped++;
      skipsToWrite.push(JSON.stringify({
        evidence_file: result.source_file, line: i,
        errors: sv.errors.map(e => "scored_run schema: " + e),
        recorded_at: opts.recorded_at,
      }));
      continue;
    }

    rowsToWrite.push(JSON.stringify(sv.value));
    result.rows_written++;
    result.by_category[sv.value.category] = (result.by_category[sv.value.category] ?? 0) + 1;
  }

  // Batched appends: one write per file per run, nothing at all on dry runs.
  if (!opts.dry_run) {
    if (rowsToWrite.length > 0) {
      appendFileSync(out_path, rowsToWrite.join("\n") + "\n");
      result.output_files.push(out_path);
    }
    if (skipsToWrite.length > 0) {
      mkdirSync(dirname(skips_path), { recursive: true });
      appendFileSync(skips_path, skipsToWrite.join("\n") + "\n");
    }
  }

  return result;
}
|
||||||
|
|
||||||
|
/**
 * Score every evidence partition under root and emit a self-validated
 * receipt. Pure orchestration: all per-row work happens in
 * processEvidenceFile; this function aggregates totals, hashes the
 * inputs/outputs, and writes reports/distillation/<stamp>/receipt.json
 * (skipped on dry runs).
 */
export async function scoreAll(opts: ScoreOptions): Promise<ScoreResult> {
  const evidence_root = resolve(opts.root, "data/evidence");
  const scored_dir = resolve(opts.root, "data/scored-runs");
  const skips_path = resolve(opts.root, "data/_kb/scoring_skips.jsonl");
  const reports_dir = resolve(opts.root, "reports/distillation");

  const started_ms = Date.now();
  const ev_files = listEvidenceFiles(evidence_root);
  const sources: ScoreSourceResult[] = [];

  // Sequential on purpose: processEvidenceFile appends to a shared skips
  // file, and ordering keeps the receipt deterministic.
  for (const ev of ev_files) {
    sources.push(await processEvidenceFile(ev, opts, scored_dir, skips_path));
  }

  // NOTE(review): with zero evidence files, totals.by_category stays {} and
  // the cli prints "accepted=undefined" — confirm that's acceptable.
  const totals = sources.reduce((acc, s) => ({
    rows_read: acc.rows_read + s.rows_read,
    rows_written: acc.rows_written + s.rows_written,
    rows_skipped: acc.rows_skipped + s.rows_skipped,
    rows_deduped: acc.rows_deduped + s.rows_deduped,
    by_category: {
      accepted: (acc.by_category.accepted ?? 0) + (s.by_category.accepted ?? 0),
      partially_accepted: (acc.by_category.partially_accepted ?? 0) + (s.by_category.partially_accepted ?? 0),
      rejected: (acc.by_category.rejected ?? 0) + (s.by_category.rejected ?? 0),
      needs_human_review: (acc.by_category.needs_human_review ?? 0) + (s.by_category.needs_human_review ?? 0),
    },
  }), { rows_read: 0, rows_written: 0, rows_skipped: 0, rows_deduped: 0, by_category: {} as Record<string, number> });

  const ended_at = new Date().toISOString();
  const duration_ms = Date.now() - started_ms;

  // Hash inputs and outputs so the receipt pins exactly what was processed.
  const input_files: FileReference[] = ev_files.map(p => ({
    path: p.replace(opts.root + "/", ""),
    sha256: sha256OfFile(p),
    bytes: statSync(p).size,
  }));
  const output_files: FileReference[] = [];
  for (const s of sources) {
    for (const out_path of s.output_files) {
      try {
        output_files.push({
          path: out_path.replace(opts.root + "/", ""),
          sha256: sha256OfFile(out_path),
          bytes: statSync(out_path).size,
        });
      } catch { /* dry-run path */ }
    }
  }

  // errors stays empty unless the receipt itself fails its own schema below;
  // skips surface as warnings, and flip validation_pass via rows_skipped.
  const errors: string[] = [];
  const warnings: string[] = [];
  for (const s of sources) {
    if (s.rows_skipped > 0) warnings.push(`${s.source_file}: ${s.rows_skipped} skipped`);
  }

  const receipt: Receipt = {
    schema_version: RECEIPT_SCHEMA_VERSION,
    command: "bun run scripts/distillation/score_runs.ts" + (opts.dry_run ? " --dry-run" : ""),
    git_sha: gitSha(opts.root),
    git_branch: gitBranch(opts.root),
    git_dirty: gitDirty(opts.root),
    // started_at is the caller-supplied recorded_at, not a fresh clock read.
    started_at: opts.recorded_at,
    ended_at,
    duration_ms,
    input_files,
    output_files,
    record_counts: {
      in: totals.rows_read,
      out: totals.rows_written,
      skipped: totals.rows_skipped,
      deduped: totals.rows_deduped,
      cat_accepted: totals.by_category.accepted ?? 0,
      cat_partially_accepted: totals.by_category.partially_accepted ?? 0,
      cat_rejected: totals.by_category.rejected ?? 0,
      cat_needs_human_review: totals.by_category.needs_human_review ?? 0,
    },
    validation_pass: totals.rows_skipped === 0,
    errors,
    warnings,
  };
  // Self-validate against the Receipt schema before write; a failing
  // receipt is still written, but flagged so the cli exits non-zero.
  const rv = validateReceipt(receipt);
  if (!rv.valid) {
    receipt.errors.push(...rv.errors.map(e => "receipt schema: " + e));
    receipt.validation_pass = false;
  }

  // Filesystem-safe timestamp directory, e.g. 2026-04-27T03-44-26-602Z.
  const stamp = ended_at.replace(/[:.]/g, "-");
  const receipt_path = resolve(reports_dir, stamp, "receipt.json");
  if (!opts.dry_run) {
    mkdirSync(dirname(receipt_path), { recursive: true });
    writeFileSync(receipt_path, JSON.stringify(receipt, null, 2) + "\n");
  }

  return { sources, totals, receipt, receipt_path, scored_dir, skips_path };
}
|
||||||
|
|
||||||
|
async function cli() {
|
||||||
|
const dry_run = process.argv.includes("--dry-run");
|
||||||
|
const recorded_at = new Date().toISOString();
|
||||||
|
const r = await scoreAll({ root: DEFAULT_ROOT, recorded_at, dry_run });
|
||||||
|
|
||||||
|
console.log(`[score_runs] ${r.totals.rows_read} read · ${r.totals.rows_written} written · ${r.totals.rows_skipped} skipped · ${r.totals.rows_deduped} deduped${dry_run ? " (DRY RUN)" : ""}`);
|
||||||
|
console.log(`[score_runs] categories: accepted=${r.totals.by_category.accepted} partial=${r.totals.by_category.partially_accepted} rejected=${r.totals.by_category.rejected} needs_human=${r.totals.by_category.needs_human_review}`);
|
||||||
|
for (const s of r.sources) {
|
||||||
|
const c = s.by_category;
|
||||||
|
console.log(` ${s.source_file}: read=${s.rows_read} wrote=${s.rows_written} acc=${c.accepted ?? 0} part=${c.partially_accepted ?? 0} rej=${c.rejected ?? 0} hum=${c.needs_human_review ?? 0}`);
|
||||||
|
}
|
||||||
|
if (!dry_run) console.log(`[score_runs] receipt: ${r.receipt_path}`);
|
||||||
|
if (!r.receipt.validation_pass) process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (import.meta.main) cli().catch(e => { console.error(e); process.exit(1); });
|
||||||
303
scripts/distillation/scorer.ts
Normal file
303
scripts/distillation/scorer.ts
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
// scorer.ts — pure deterministic Success Scorer.
|
||||||
|
//
|
||||||
|
// Takes one EvidenceRecord, returns category + reasons + sub_scores.
|
||||||
|
// NO I/O, NO LLM, NO clock reads, NO mutable state. The only randomness
|
||||||
|
// allowed is none. Identical input → identical output forever.
|
||||||
|
//
|
||||||
|
// Three-class strategy (see docs/recon/local-distillation-recon.md +
|
||||||
|
// data/_kb/evidence_health.md for the source taxonomy):
|
||||||
|
//
|
||||||
|
// CLASS A — verdict-bearing
|
||||||
|
// scrum_reviews, observer_reviews, audits, contract_analyses
|
||||||
|
// Direct scoring from existing markers/observer_verdict
|
||||||
|
//
|
||||||
|
// CLASS B — telemetry-rich
|
||||||
|
// auto_apply, outcomes, mode_experiments
|
||||||
|
// Markers exist but partial; needs_human_review fills the gap
|
||||||
|
//
|
||||||
|
// CLASS C — pure-extraction (no native scoring signal)
|
||||||
|
// distilled_*, audit_facts, observer_escalations
|
||||||
|
// Default needs_human_review; v2 will JOIN to parent verdict
|
||||||
|
//
|
||||||
|
// scorer_version is stamped on every output. Bumping it lets a
|
||||||
|
// downstream re-scoring detect drift between historical runs.
|
||||||
|
|
||||||
|
import type { EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
|
||||||
|
import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillation/scored_run";
|
||||||
|
import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run";
|
||||||
|
import { canonicalSha256 } from "../../auditor/schemas/distillation/types";
|
||||||
|
|
||||||
|
// Stamped on every ScoredRun; bump it (or override via LH_SCORER_VERSION)
// so downstream re-scoring can detect drift between historical runs.
export const SCORER_VERSION = process.env.LH_SCORER_VERSION ?? "v1.0.0";
|
||||||
|
|
||||||
|
/** What scoreRecord returns: the verdict plus its audit trail. */
export interface ScoreOutput {
  /** Final four-way score category. */
  category: ScoreCategory;
  /** Human-readable justifications (every code path emits at least one). */
  reasons: string[];
  /** Optional structured signals (attempt number, observer verdict, cargo_green, …). */
  sub_scores: ScoredRun["sub_scores"];
}
|
||||||
|
|
||||||
|
// Map source_file (from provenance) → source class. Centralized so
|
||||||
|
// adding a new source is one-line.
|
||||||
|
type SourceClass = "verdict" | "telemetry" | "extraction";
|
||||||
|
|
||||||
|
function sourceClassFor(source_file: string): SourceClass {
|
||||||
|
// Strip data/_kb/ prefix and .jsonl suffix to compare by stem
|
||||||
|
const stem = source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
|
||||||
|
switch (stem) {
|
||||||
|
case "scrum_reviews":
|
||||||
|
case "observer_reviews":
|
||||||
|
case "audits":
|
||||||
|
case "contract_analyses":
|
||||||
|
return "verdict";
|
||||||
|
case "auto_apply":
|
||||||
|
case "outcomes":
|
||||||
|
case "mode_experiments":
|
||||||
|
return "telemetry";
|
||||||
|
case "distilled_facts":
|
||||||
|
case "distilled_procedures":
|
||||||
|
case "distilled_config_hints":
|
||||||
|
case "audit_facts":
|
||||||
|
case "observer_escalations":
|
||||||
|
return "extraction";
|
||||||
|
default:
|
||||||
|
// Unknown source — route to extraction (most conservative —
|
||||||
|
// forces needs_human_review until a transform is added).
|
||||||
|
return "extraction";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Class A: verdict-bearing ─────────────────────────────────────
|
||||||
|
|
||||||
|
function scoreScrumReview(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
|
||||||
|
const successMarker = (r.success_markers ?? []).find(m => m.startsWith("accepted_on_attempt_"));
|
||||||
|
if (!successMarker) {
|
||||||
|
reasons.push("scrum_review missing accepted_on_attempt_* success marker");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
const attempt = Number(successMarker.replace("accepted_on_attempt_", ""));
|
||||||
|
subs.accepted_on_attempt = attempt;
|
||||||
|
if (attempt === 1) {
|
||||||
|
reasons.push("scrum: accepted on first attempt");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (attempt <= 3) {
|
||||||
|
reasons.push(`scrum: accepted after ${attempt} attempts`);
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
reasons.push(`scrum: accepted only after ${attempt} attempts (high-cost path)`);
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreObserverReview(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
const v = r.observer_verdict;
|
||||||
|
if (v === "accept") {
|
||||||
|
subs.observer_verdict = "accept";
|
||||||
|
reasons.push("observer accepted the reviewed attempt");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (v === "reject") {
|
||||||
|
subs.observer_verdict = "reject";
|
||||||
|
reasons.push("observer rejected the reviewed attempt");
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (v === "cycle") {
|
||||||
|
subs.observer_verdict = "cycle";
|
||||||
|
reasons.push("observer flagged the attempt as cycling — partial signal");
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
reasons.push(`observer_verdict missing or unrecognized: ${JSON.stringify(v ?? null)}`);
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreAudit(r: EvidenceRecord): ScoreOutput {
|
||||||
|
// audits.jsonl is the auditor's per-finding stream (not PR verdicts).
|
||||||
|
// Phase 2 transform encodes severity into markers:
|
||||||
|
// audit_severity_{info,low} → accepted (minor finding)
|
||||||
|
// audit_severity_medium → partially_accepted
|
||||||
|
// audit_severity_{high,critical} → rejected (real problem)
|
||||||
|
// Older "approved"/"blocked"/"request_changes" markers also handled
|
||||||
|
// for back-compat with any pre-fix materializations on disk.
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
const succ = r.success_markers ?? [];
|
||||||
|
const fail = r.failure_markers ?? [];
|
||||||
|
|
||||||
|
if (succ.includes("approved")) {
|
||||||
|
reasons.push("audit overall=approved (legacy marker)");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (fail.includes("blocked")) {
|
||||||
|
reasons.push("audit overall=block (legacy marker)");
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (fail.includes("request_changes")) {
|
||||||
|
reasons.push("audit overall=request_changes (legacy marker)");
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Severity-derived markers (current Phase 2 transform):
|
||||||
|
const sevSucc = succ.find(m => m.startsWith("audit_severity_"));
|
||||||
|
const sevFail = fail.find(m => m.startsWith("audit_severity_"));
|
||||||
|
if (sevSucc) {
|
||||||
|
reasons.push(`${sevSucc} → minor finding`);
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (sevFail === "audit_severity_medium") {
|
||||||
|
reasons.push("audit_severity_medium → finding warrants review");
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (sevFail === "audit_severity_high" || sevFail === "audit_severity_critical") {
|
||||||
|
reasons.push(`${sevFail} → blocking finding`);
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
reasons.push("audit row has no severity or overall marker");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreContractAnalysis(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
const v = r.observer_verdict;
|
||||||
|
// failure_markers takes precedence: explicit rejection beats absent verdict
|
||||||
|
if ((r.failure_markers ?? []).includes("observer_rejected") || v === "reject") {
|
||||||
|
subs.observer_verdict = "reject";
|
||||||
|
reasons.push("contract analysis: observer rejected");
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (v === "accept") {
|
||||||
|
subs.observer_verdict = "accept";
|
||||||
|
reasons.push("contract analysis: observer accepted");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (v === "cycle") {
|
||||||
|
subs.observer_verdict = "cycle";
|
||||||
|
reasons.push("contract analysis: observer cycled (partial)");
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
reasons.push("contract analysis: no observer verdict signal");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Class B: telemetry-rich ──────────────────────────────────────
|
||||||
|
|
||||||
|
function scoreAutoApply(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
if ((r.success_markers ?? []).includes("committed")) {
|
||||||
|
subs.cargo_green = true;
|
||||||
|
reasons.push("auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
const failures = (r.failure_markers ?? []);
|
||||||
|
const reverted = failures.find(f => f.includes("reverted"));
|
||||||
|
if (reverted) {
|
||||||
|
if (reverted.includes("build_red")) subs.cargo_green = false;
|
||||||
|
reasons.push(`auto_apply: ${reverted}`);
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
// no_patches / dry_run / all_rejected — not a failure of code, but no commit either
|
||||||
|
reasons.push("auto_apply: no commit + no revert (likely no_patches or dry_run)");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreOutcomes(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
if ((r.success_markers ?? []).includes("all_events_ok")) {
|
||||||
|
reasons.push("outcomes: all events ok");
|
||||||
|
return { category: "accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
// Validation results may carry partial signal
|
||||||
|
const gap = r.validation_results?.gap_signals as number | undefined;
|
||||||
|
if (typeof gap === "number" && gap > 0) {
|
||||||
|
reasons.push(`outcomes: ${gap} gap signal(s) detected`);
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
reasons.push("outcomes: no decisive marker — defer to human");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
function scoreModeExperiment(r: EvidenceRecord): ScoreOutput {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
const subs: ScoredRun["sub_scores"] = {};
|
||||||
|
// mode_experiments at Phase 2 lacks markers (transform doesn't derive
|
||||||
|
// them yet). v1 derivation: a non-empty response with reasonable
|
||||||
|
// latency is at least partially_accepted; otherwise needs_human_review.
|
||||||
|
// Anything stronger needs the grounding-from-mode_compare hook in
|
||||||
|
// Phase 4 / re-scoring.
|
||||||
|
if (typeof r.text !== "string" || r.text.trim().length === 0) {
|
||||||
|
reasons.push("mode_experiment: empty response text");
|
||||||
|
return { category: "rejected", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
if (typeof r.latency_ms === "number" && r.latency_ms > 120_000) {
|
||||||
|
reasons.push(`mode_experiment: latency ${r.latency_ms}ms exceeds 2-minute soft cap`);
|
||||||
|
return { category: "partially_accepted", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
reasons.push("mode_experiment: response present, latency within bounds; verdict not yet wired");
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: subs };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Class C: pure-extraction ────────────────────────────────────
|
||||||
|
|
||||||
|
function scoreExtraction(r: EvidenceRecord): ScoreOutput {
|
||||||
|
// Phase 3 v1: extraction-class records have no native scoring
|
||||||
|
// signal. Default to needs_human_review with an explicit reason.
|
||||||
|
// Phase 3 v2 will JOIN to a parent verdict-bearing record.
|
||||||
|
const reasons = ["extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"];
|
||||||
|
return { category: "needs_human_review", reasons, sub_scores: {} };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Dispatch ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export function scoreRecord(record: EvidenceRecord): ScoreOutput {
|
||||||
|
const cls = sourceClassFor(record.provenance.source_file);
|
||||||
|
const stem = record.provenance.source_file.replace(/^data\/_kb\//, "").replace(/\.jsonl$/, "");
|
||||||
|
|
||||||
|
if (cls === "verdict") {
|
||||||
|
if (stem === "scrum_reviews") return scoreScrumReview(record);
|
||||||
|
if (stem === "observer_reviews") return scoreObserverReview(record);
|
||||||
|
if (stem === "audits") return scoreAudit(record);
|
||||||
|
if (stem === "contract_analyses") return scoreContractAnalysis(record);
|
||||||
|
}
|
||||||
|
if (cls === "telemetry") {
|
||||||
|
if (stem === "auto_apply") return scoreAutoApply(record);
|
||||||
|
if (stem === "outcomes") return scoreOutcomes(record);
|
||||||
|
if (stem === "mode_experiments") return scoreModeExperiment(record);
|
||||||
|
}
|
||||||
|
return scoreExtraction(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a complete ScoredRun. Caller supplies recorded_at + the
|
||||||
|
// source file / line offset to populate provenance.
|
||||||
|
export async function buildScoredRun(
|
||||||
|
record: EvidenceRecord,
|
||||||
|
source_file_relpath: string,
|
||||||
|
line_offset: number,
|
||||||
|
recorded_at: string,
|
||||||
|
): Promise<ScoredRun> {
|
||||||
|
const out = scoreRecord(record);
|
||||||
|
// Compute provenance.sig_hash over the EvidenceRecord (not raw source);
|
||||||
|
// ScoredRun traces to the materialized evidence row, not the raw stream.
|
||||||
|
const sig_hash = await canonicalSha256(record);
|
||||||
|
return {
|
||||||
|
schema_version: SCORED_RUN_SCHEMA_VERSION,
|
||||||
|
evidence_run_id: record.run_id,
|
||||||
|
evidence_task_id: record.task_id,
|
||||||
|
category: out.category,
|
||||||
|
reasons: out.reasons,
|
||||||
|
scored_at: recorded_at,
|
||||||
|
scorer_version: SCORER_VERSION,
|
||||||
|
sub_scores: out.sub_scores,
|
||||||
|
provenance: {
|
||||||
|
source_file: source_file_relpath,
|
||||||
|
line_offset,
|
||||||
|
sig_hash,
|
||||||
|
recorded_at,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
@ -211,19 +211,30 @@ export const TRANSFORMS: TransformDef[] = [
|
|||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
// 2026-04-26 correction: data/_kb/audits.jsonl is the auditor's
|
||||||
|
// per-FINDING stream (recon misnamed it "PR verdicts"). Schema:
|
||||||
|
// {embedding, evidence, finding_id, phase, resolution, severity, topic, ts}
|
||||||
|
// The actual per-PR verdicts live in data/_auditor/verdicts/*.json,
|
||||||
|
// not in this JSONL. So we score by severity here: info/low →
|
||||||
|
// accepted (audit found minor issue), medium → partially_accepted,
|
||||||
|
// high/critical → rejected (real problem in the audited code).
|
||||||
source_file_relpath: "data/_kb/audits.jsonl",
|
source_file_relpath: "data/_kb/audits.jsonl",
|
||||||
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => ({
|
transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => {
|
||||||
run_id: `audit:${row.head_sha ?? line_offset}`,
|
const sev = String(row.severity ?? "unknown").toLowerCase();
|
||||||
task_id: `pr:${row.pr_number}`,
|
const minor = sev === "info" || sev === "low";
|
||||||
timestamp: row.audited_at ?? new Date().toISOString(),
|
const blocking = sev === "high" || sev === "critical";
|
||||||
|
return {
|
||||||
|
run_id: `audit_finding:${row.finding_id ?? line_offset}`,
|
||||||
|
task_id: row.phase ? `phase:${row.phase}` : "audit_finding",
|
||||||
|
timestamp: row.ts ?? new Date().toISOString(),
|
||||||
schema_version: EVIDENCE_SCHEMA_VERSION,
|
schema_version: EVIDENCE_SCHEMA_VERSION,
|
||||||
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
|
provenance: provenance({ row, line_offset, source_file_relpath, recorded_at, sig_hash }),
|
||||||
model_role: "reviewer" as ModelRole,
|
model_role: "reviewer" as ModelRole,
|
||||||
success_markers: row.overall === "approve" ? ["approved"] : undefined,
|
success_markers: minor ? [`audit_severity_${sev}`] : undefined,
|
||||||
failure_markers: row.overall === "block" ? ["blocked"] : (row.overall === "request_changes" ? ["request_changes"] : undefined),
|
failure_markers: blocking ? [`audit_severity_${sev}`] : (sev === "medium" ? ["audit_severity_medium"] : undefined),
|
||||||
validation_results: { schema_valid: true, [`overall_${row.overall ?? "?"}`]: true },
|
text: typeof row.evidence === "string" ? row.evidence : (row.resolution ?? ""),
|
||||||
text: row.one_liner ?? "",
|
};
|
||||||
}),
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
source_file_relpath: "data/_kb/outcomes.jsonl",
|
source_file_relpath: "data/_kb/outcomes.jsonl",
|
||||||
|
|||||||
157
tests/distillation/score_runs.test.ts
Normal file
157
tests/distillation/score_runs.test.ts
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
// Integration test: score_runs.ts CLI pipeline. Synthesizes evidence
|
||||||
|
// records, runs scoreAll, asserts behavior on the materialized scored
|
||||||
|
// runs + receipt.
|
||||||
|
|
||||||
|
import { test, expect, beforeEach, afterEach } from "bun:test";
|
||||||
|
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
|
||||||
|
import { resolve } from "node:path";
|
||||||
|
|
||||||
|
import { scoreAll } from "../../scripts/distillation/score_runs";
|
||||||
|
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
|
||||||
|
import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
|
||||||
|
import { validateReceipt } from "../../auditor/schemas/distillation/receipt";
|
||||||
|
|
||||||
|
const TMP = "/tmp/distillation_test_phase3";
|
||||||
|
const RECORDED = "2026-04-26T22:30:00.000Z";
|
||||||
|
const SHA = "0".repeat(64);
|
||||||
|
|
||||||
|
function makeEv(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
|
||||||
|
return {
|
||||||
|
run_id: opts.run_id ?? `run-${Math.random()}`,
|
||||||
|
task_id: opts.task_id ?? "task-test",
|
||||||
|
timestamp: opts.timestamp ?? RECORDED,
|
||||||
|
schema_version: EVIDENCE_SCHEMA_VERSION,
|
||||||
|
provenance: {
|
||||||
|
source_file: `data/_kb/${opts.source_stem}.jsonl`,
|
||||||
|
line_offset: 0,
|
||||||
|
sig_hash: SHA,
|
||||||
|
recorded_at: RECORDED,
|
||||||
|
},
|
||||||
|
...opts,
|
||||||
|
} as EvidenceRecord;
|
||||||
|
}
|
||||||
|
|
||||||
|
function writeEvidence(ev: EvidenceRecord[], stem: string) {
|
||||||
|
const partition = "2026/04/27";
|
||||||
|
const dir = resolve(TMP, "data/evidence", partition);
|
||||||
|
mkdirSync(dir, { recursive: true });
|
||||||
|
writeFileSync(resolve(dir, `${stem}.jsonl`), ev.map(r => JSON.stringify(r)).join("\n") + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
function setup() {
|
||||||
|
if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
|
||||||
|
mkdirSync(resolve(TMP, "data/_kb"), { recursive: true });
|
||||||
|
|
||||||
|
// Mix of every category across sources
|
||||||
|
writeEvidence([
|
||||||
|
makeEv({ source_stem: "scrum_reviews", run_id: "s1", success_markers: ["accepted_on_attempt_1"] }),
|
||||||
|
makeEv({ source_stem: "scrum_reviews", run_id: "s2", success_markers: ["accepted_on_attempt_3"] }),
|
||||||
|
makeEv({ source_stem: "scrum_reviews", run_id: "s3" }), // no markers → human
|
||||||
|
], "scrum_reviews");
|
||||||
|
|
||||||
|
writeEvidence([
|
||||||
|
makeEv({ source_stem: "audits", run_id: "a1", success_markers: ["approved"] }),
|
||||||
|
makeEv({ source_stem: "audits", run_id: "a2", failure_markers: ["blocked"] }),
|
||||||
|
makeEv({ source_stem: "audits", run_id: "a3", failure_markers: ["request_changes"] }),
|
||||||
|
], "audits");
|
||||||
|
|
||||||
|
writeEvidence([
|
||||||
|
makeEv({ source_stem: "auto_apply", run_id: "ap1", success_markers: ["committed"] }),
|
||||||
|
makeEv({ source_stem: "auto_apply", run_id: "ap2", failure_markers: ["build_red_reverted"] }),
|
||||||
|
makeEv({ source_stem: "auto_apply", run_id: "ap3" }),
|
||||||
|
], "auto_apply");
|
||||||
|
|
||||||
|
writeEvidence([
|
||||||
|
makeEv({ source_stem: "distilled_facts", run_id: "df1", text: "extracted fact" }),
|
||||||
|
], "distilled_facts");
|
||||||
|
}
|
||||||
|
|
||||||
|
beforeEach(setup);
|
||||||
|
afterEach(() => { if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true }); });
|
||||||
|
|
||||||
|
test("score_runs: emits ScoredRun for every EvidenceRecord", async () => {
|
||||||
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
expect(r.totals.rows_read).toBe(10);
|
||||||
|
expect(r.totals.rows_written).toBe(10);
|
||||||
|
expect(r.totals.rows_skipped).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: category distribution matches expected per source", async () => {
|
||||||
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
// 1 (s1) + 1 (a1) + 1 (ap1) = 3 accepted
|
||||||
|
// 1 (s2) + 1 (a3) = 2 partial
|
||||||
|
// 1 (a2) + 1 (ap2) = 2 rejected
|
||||||
|
// 1 (s3) + 1 (ap3) + 1 (df1) = 3 needs_human
|
||||||
|
expect(r.totals.by_category.accepted).toBe(3);
|
||||||
|
expect(r.totals.by_category.partially_accepted).toBe(2);
|
||||||
|
expect(r.totals.by_category.rejected).toBe(2);
|
||||||
|
expect(r.totals.by_category.needs_human_review).toBe(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: every output row validates against ScoredRun schema", async () => {
|
||||||
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
const dir = resolve(TMP, "data/scored-runs/2026/04/27");
|
||||||
|
for (const stem of ["scrum_reviews", "audits", "auto_apply", "distilled_facts"]) {
|
||||||
|
const path = resolve(dir, `${stem}.jsonl`);
|
||||||
|
expect(existsSync(path)).toBe(true);
|
||||||
|
const lines = readFileSync(path, "utf8").trim().split("\n").filter(Boolean);
|
||||||
|
for (const line of lines) {
|
||||||
|
const v = validateScoredRun(JSON.parse(line));
|
||||||
|
expect(v.valid).toBe(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: idempotent — second run produces 0 new writes", async () => {
|
||||||
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
const r2 = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
expect(r2.totals.rows_written).toBe(0);
|
||||||
|
expect(r2.totals.rows_deduped).toBe(10);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: receipt validates and pins git_sha + record_counts + by_category", async () => {
|
||||||
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
const v = validateReceipt(r.receipt);
|
||||||
|
expect(v.valid).toBe(true);
|
||||||
|
expect(r.receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);
|
||||||
|
expect(r.receipt.record_counts.in).toBe(10);
|
||||||
|
expect(r.receipt.record_counts.out).toBe(10);
|
||||||
|
expect(r.receipt.record_counts.cat_accepted).toBe(3);
|
||||||
|
expect(r.receipt.record_counts.cat_partially_accepted).toBe(2);
|
||||||
|
expect(r.receipt.record_counts.cat_rejected).toBe(2);
|
||||||
|
expect(r.receipt.record_counts.cat_needs_human_review).toBe(3);
|
||||||
|
expect(r.receipt.validation_pass).toBe(true); // 0 skips
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: every output row carries provenance + reasons + scorer_version", async () => {
|
||||||
|
await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
const path = resolve(TMP, "data/scored-runs/2026/04/27/scrum_reviews.jsonl");
|
||||||
|
const rows = readFileSync(path, "utf8").trim().split("\n").map(l => JSON.parse(l));
|
||||||
|
for (const row of rows) {
|
||||||
|
expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
|
||||||
|
expect(row.reasons.length).toBeGreaterThan(0);
|
||||||
|
expect(row.scorer_version).toBeTruthy();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: malformed evidence row is skipped, valid rows still process", async () => {
|
||||||
|
// Inject a malformed line into one of the evidence files
|
||||||
|
const path = resolve(TMP, "data/evidence/2026/04/27/scrum_reviews.jsonl");
|
||||||
|
const existing = readFileSync(path, "utf8");
|
||||||
|
writeFileSync(path, existing + "{not valid json\n");
|
||||||
|
|
||||||
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
|
||||||
|
expect(r.totals.rows_skipped).toBe(1);
|
||||||
|
expect(r.totals.rows_written).toBe(10); // valid rows unaffected
|
||||||
|
expect(r.receipt.validation_pass).toBe(false); // skips > 0
|
||||||
|
expect(existsSync(r.skips_path)).toBe(true);
|
||||||
|
const skipBody = readFileSync(r.skips_path, "utf8");
|
||||||
|
expect(skipBody).toContain("evidence not JSON");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("score_runs: dry-run reports counts but writes no scored-runs", async () => {
|
||||||
|
const r = await scoreAll({ root: TMP, recorded_at: RECORDED, dry_run: true });
|
||||||
|
expect(r.totals.rows_written).toBe(10);
|
||||||
|
const scoredDir = resolve(TMP, "data/scored-runs");
|
||||||
|
expect(existsSync(scoredDir)).toBe(false);
|
||||||
|
});
|
||||||
297
tests/distillation/scorer.test.ts
Normal file
297
tests/distillation/scorer.test.ts
Normal file
@ -0,0 +1,297 @@
|
|||||||
|
// Unit tests on the pure scoreRecord function. No I/O, no fixtures —
|
||||||
|
// inline EvidenceRecord makers per source class. Each scoring rule
|
||||||
|
// gets a positive case + at least one boundary case.
|
||||||
|
|
||||||
|
import { test, expect } from "bun:test";
|
||||||
|
import { scoreRecord, SCORER_VERSION, buildScoredRun } from "../../scripts/distillation/scorer";
|
||||||
|
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord, type ModelRole } from "../../auditor/schemas/distillation/evidence_record";
|
||||||
|
|
||||||
|
const NOW = "2026-04-26T22:30:00.000Z";
|
||||||
|
const SHA = "0".repeat(64);
|
||||||
|
|
||||||
|
function makeEvidence(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
|
||||||
|
return {
|
||||||
|
run_id: opts.run_id ?? "run-test",
|
||||||
|
task_id: opts.task_id ?? "task-test",
|
||||||
|
timestamp: opts.timestamp ?? NOW,
|
||||||
|
schema_version: EVIDENCE_SCHEMA_VERSION,
|
||||||
|
provenance: {
|
||||||
|
source_file: `data/_kb/${opts.source_stem}.jsonl`,
|
||||||
|
line_offset: 0,
|
||||||
|
sig_hash: SHA,
|
||||||
|
recorded_at: NOW,
|
||||||
|
},
|
||||||
|
...opts,
|
||||||
|
} as EvidenceRecord;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Class A: scrum_reviews ───────────────────────────────────────
|
||||||
|
|
||||||
|
test("scrum_reviews: accepted_on_attempt_1 → accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "scrum_reviews",
|
||||||
|
success_markers: ["accepted_on_attempt_1"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
expect(r.sub_scores?.accepted_on_attempt).toBe(1);
|
||||||
|
expect(r.reasons.some(x => x.includes("first attempt"))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("scrum_reviews: accepted_on_attempt_2 → partially_accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "scrum_reviews",
|
||||||
|
success_markers: ["accepted_on_attempt_2"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("scrum_reviews: accepted_on_attempt_5 → partially_accepted with high-cost reason", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "scrum_reviews",
|
||||||
|
success_markers: ["accepted_on_attempt_5"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
expect(r.reasons.some(x => x.includes("5 attempts"))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("scrum_reviews: no success_markers → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "scrum_reviews" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class A: observer_reviews ────────────────────────────────────
|
||||||
|
|
||||||
|
test("observer_reviews: accept → accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "accept" }));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
expect(r.sub_scores?.observer_verdict).toBe("accept");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("observer_reviews: reject → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "reject" }));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("observer_reviews: cycle → partially_accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews", observer_verdict: "cycle" }));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("observer_reviews: missing verdict → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_reviews" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class A: audits (per-finding stream, severity-based) ────────
|
||||||
|
|
||||||
|
test("audits: severity_info → accepted (minor finding)", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
success_markers: ["audit_severity_info"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audits: severity_low → accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
success_markers: ["audit_severity_low"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audits: severity_medium → partially_accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
failure_markers: ["audit_severity_medium"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audits: severity_high → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
failure_markers: ["audit_severity_high"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audits: severity_critical → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
failure_markers: ["audit_severity_critical"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
// Legacy markers preserved for back-compat with pre-fix data on disk
|
||||||
|
test("audits: legacy 'approved' still maps to accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
success_markers: ["approved"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audits: legacy 'blocked' still maps to rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "audits",
|
||||||
|
failure_markers: ["blocked"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class A: contract_analyses ───────────────────────────────────
|
||||||
|
|
||||||
|
test("contract_analyses: observer_rejected failure marker → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "contract_analyses",
|
||||||
|
failure_markers: ["observer_rejected"],
|
||||||
|
observer_verdict: "reject",
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("contract_analyses: observer accept → accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "contract_analyses",
|
||||||
|
observer_verdict: "accept",
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class B: auto_apply ──────────────────────────────────────────
|
||||||
|
|
||||||
|
test("auto_apply: committed → accepted with cargo_green=true", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "auto_apply",
|
||||||
|
success_markers: ["committed"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
expect(r.sub_scores?.cargo_green).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("auto_apply: build_red_reverted → rejected with cargo_green=false", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "auto_apply",
|
||||||
|
failure_markers: ["build_red_reverted"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
expect(r.sub_scores?.cargo_green).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("auto_apply: warnings_increased_reverted → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "auto_apply",
|
||||||
|
failure_markers: ["warnings_increased_reverted"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("auto_apply: no markers → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "auto_apply" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class B: outcomes ────────────────────────────────────────────
|
||||||
|
|
||||||
|
test("outcomes: all_events_ok → accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "outcomes",
|
||||||
|
success_markers: ["all_events_ok"],
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("outcomes: gap_signals > 0 → partially_accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "outcomes",
|
||||||
|
validation_results: { gap_signals: 3 },
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class B: mode_experiments ────────────────────────────────────
|
||||||
|
|
||||||
|
test("mode_experiments: empty text → rejected", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "mode_experiments",
|
||||||
|
text: "",
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("rejected");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("mode_experiments: latency > 120s → partially_accepted", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "mode_experiments",
|
||||||
|
text: "valid response",
|
||||||
|
latency_ms: 150_000,
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("partially_accepted");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("mode_experiments: text + reasonable latency → needs_human_review (no native verdict yet)", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({
|
||||||
|
source_stem: "mode_experiments",
|
||||||
|
text: "response present",
|
||||||
|
latency_ms: 10_000,
|
||||||
|
}));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Class C: extraction-class default ────────────────────────────
|
||||||
|
|
||||||
|
test("distilled_facts: no native verdict → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "distilled_facts", text: "extracted fact" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("distilled_procedures: no native verdict → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "distilled_procedures" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("audit_facts: extraction-class → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "audit_facts" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("observer_escalations: extraction-class → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "observer_escalations" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("unknown source: defaults to extraction class → needs_human_review", () => {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: "some_future_stream" }));
|
||||||
|
expect(r.category).toBe("needs_human_review");
|
||||||
|
});
|
||||||
|
|
||||||
|
// ─── Universal invariants ─────────────────────────────────────────
|
||||||
|
|
||||||
|
test("every score has at least one reason (reasons non-empty)", () => {
|
||||||
|
// Sample a scoring of every source we know about
|
||||||
|
const sources = ["scrum_reviews", "observer_reviews", "audits", "contract_analyses",
|
||||||
|
"auto_apply", "outcomes", "mode_experiments", "distilled_facts"];
|
||||||
|
for (const s of sources) {
|
||||||
|
const r = scoreRecord(makeEvidence({ source_stem: s }));
|
||||||
|
expect(r.reasons.length).toBeGreaterThanOrEqual(1);
|
||||||
|
for (const reason of r.reasons) expect(reason.length).toBeGreaterThan(0);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("buildScoredRun stamps SCORER_VERSION + computes provenance.sig_hash", async () => {
|
||||||
|
const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
|
||||||
|
const scored = await buildScoredRun(ev, "data/scored-runs/2026/04/27/scrum_reviews.jsonl", 0, NOW);
|
||||||
|
expect(scored.scorer_version).toBe(SCORER_VERSION);
|
||||||
|
expect(scored.evidence_run_id).toBe(ev.run_id);
|
||||||
|
expect(scored.evidence_task_id).toBe(ev.task_id);
|
||||||
|
expect(scored.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
|
||||||
|
expect(scored.provenance.source_file).toBe("data/scored-runs/2026/04/27/scrum_reviews.jsonl");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("buildScoredRun is deterministic — same input → same sig_hash", async () => {
|
||||||
|
const ev = makeEvidence({ source_stem: "scrum_reviews", success_markers: ["accepted_on_attempt_1"] });
|
||||||
|
const a = await buildScoredRun(ev, "p", 0, NOW);
|
||||||
|
const b = await buildScoredRun(ev, "p", 0, NOW);
|
||||||
|
expect(a.provenance.sig_hash).toBe(b.provenance.sig_hash);
|
||||||
|
});
|
||||||
Loading…
x
Reference in New Issue
Block a user