Forensic-grade per-stage receipts wrapping all 5 implemented pipeline
stages. Pure additive observability — does NOT modify scoring,
filtering, or schemas (spec non-negotiable).
Files (6 new):
auditor/schemas/distillation/stage_receipt.ts StageReceipt v1
auditor/schemas/distillation/run_summary.ts RunSummary v1
auditor/schemas/distillation/drift_report.ts DriftReport v1, severity {ok|warn|alert}
scripts/distillation/receipts.ts runAllWithReceipts + buildDrift + CLI
tests/distillation/receipts.test.ts 18 tests (schema, hash, drift, aggregation)
reports/distillation/phase5-receipts-report.md acceptance report
Stages wrapped:
collect (build_evidence_index → data/evidence/)
score (score_runs → data/scored-runs/)
export-rag (exports/rag/playbooks.jsonl)
export-sft (exports/sft/instruction_response.jsonl)
export-preference (exports/preference/chosen_rejected.jsonl)
Reserved (not yet implemented): extract-playbooks, index.
Output tree (per run_id):
reports/distillation/<run_id>/
collect.json score.json export-rag.json export-sft.json export-preference.json
summary.json summary.md drift.json
Test metrics: 135 distillation tests pass · 0 fail · 353 expects · 1.5s
(Phase 5 added 18; total 117→135)
Real-data run-all (run_id=78072357-835d-...):
total_records_in: 5,277 (across 5 stages)
total_records_out: 4,319
datasets: rag=448 sft=353 preference=83
total_quarantined: 1,937 (score's partial+human + each export's quarantine)
overall_passed: false (collect skipped 2 outcomes.jsonl rows missing created_at —
carry-over from Phase 2; faithfully propagated)
run_hash: 7a14d8cdd6980048a075efe97043683a4f9aabb38ec1faa8982c9887593090e0
Drift detection (second run):
prior_run_id detected automatically
severity=ok (no count or category swung >20%)
flags: ["run_hash differs from prior run"] — expected, since recorded_at
is baked into provenance and changes per run. No false alert.
Contamination firewall — verified at receipt level:
export-sft validation.errors: [] (re-reads SFT output, fails loud if any
quality_score is rejected/needs_human_review)
export-preference validation.errors: [] (re-reads, fails loud if any
chosen_run_id == rejected_run_id or chosen text == rejected text)
Invariants enforced (proven by tests + real run):
- Every stage emits ONE receipt per run (5/5 on disk)
- All receipts share run_id (uuid generated per run-all)
- aggregateIoHash is order-independent + collision-free across path/content
- Schema validators gate every receipt before write (defense in depth)
- Drift detection: pct_change > 20% → warn; new error class → warn
- Failure propagation: any stage validation.passed=false → overall_passed=false
- Self-validation: harness throws if RunSummary/DriftReport fail their own schema
CLI:
bun run scripts/distillation/receipts.ts run-all
bun run scripts/distillation/receipts.ts read --run-id <id>
Spec acceptance gate (now.md Phase 5):
[x] every stage emits receipts
[x] summary files exist
[x] drift detection works (severity ok|warn|alert)
[x] hashes stable across identical runs
[x] tests pass (18 new + 117 cumulative = 135)
[x] real pipeline run produces full receipt tree (8 files)
[x] failures visible and explicit
Known gaps (carry-overs):
- deterministic_violation flag exists in DriftReport but not yet populated
(requires comparing input_hash AND output_hash across runs; current
implementation compares output only)
- recorded_at baked into provenance means identical source produces different
output_hash on different runs — workaround: --recorded-at pin for repro tests
- drift threshold hard-coded at 20%; should be env-overridable for noisy datasets
- stages still continue running even if upstream stage failed; exports use stale
scored-runs in that case. Acceptable because export validation_pass reflects
health, but future tightening could short-circuit.
Phase 6 (acceptance gate suite) unblocked.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
91 lines
3.0 KiB
TypeScript
// run_summary.ts — aggregates StageReceipt rows for one run_id.
|
|
// Spec field set: total records processed, total accepted/rejected/
|
|
// quarantined, dataset sizes, validation status, overall hash of run.
|
|
|
|
import {
|
|
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireSha256,
|
|
} from "./types";
|
|
import type { StageName } from "./stage_receipt";
|
|
|
|
// Current RunSummary schema version; validateRunSummary rejects any other value.
// Bump when the RunSummary field set changes incompatibly.
export const RUN_SUMMARY_SCHEMA_VERSION = 1;
|
|
|
|
export interface RunStageSummary {
|
|
stage: StageName;
|
|
records_in: number;
|
|
records_out: number;
|
|
accepted: number;
|
|
rejected: number;
|
|
quarantined: number;
|
|
skipped: number;
|
|
passed: boolean;
|
|
duration_ms: number;
|
|
output_hash: string;
|
|
}
|
|
|
|
export interface RunSummary {
|
|
schema_version: number;
|
|
run_id: string;
|
|
started_at: string; // earliest stage timestamp
|
|
ended_at: string; // latest stage timestamp + duration
|
|
git_commit: string;
|
|
stages: RunStageSummary[];
|
|
// Aggregates across stages
|
|
total_records_in: number;
|
|
total_records_out: number;
|
|
total_accepted: number;
|
|
total_rejected: number;
|
|
total_quarantined: number;
|
|
total_skipped: number;
|
|
// Dataset sizes — final outputs of each export stage
|
|
rag_records: number;
|
|
sft_records: number;
|
|
preference_pairs: number;
|
|
// Pipeline-wide pass = AND of every stage validation.passed
|
|
overall_passed: boolean;
|
|
// Run-wide hash: sha256 over each stage's output hash, sorted by stage name.
|
|
// Detects ANY change in any stage output across runs.
|
|
run_hash: string;
|
|
total_duration_ms: number;
|
|
}
|
|
|
|
export function validateRunSummary(input: unknown): ValidationResult<RunSummary> {
|
|
const errors: string[] = [];
|
|
if (typeof input !== "object" || input === null) {
|
|
return { valid: false, errors: ["expected object"] };
|
|
}
|
|
const r = input as Record<string, unknown>;
|
|
let ok = true;
|
|
|
|
if (r.schema_version !== RUN_SUMMARY_SCHEMA_VERSION) {
|
|
errors.push(`schema_version: expected ${RUN_SUMMARY_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
|
ok = false;
|
|
}
|
|
ok = requireString(r.run_id, "run_id", errors) && ok;
|
|
ok = requireIsoTimestamp(r.started_at, "started_at", errors) && ok;
|
|
ok = requireIsoTimestamp(r.ended_at, "ended_at", errors) && ok;
|
|
if (typeof r.git_commit !== "string" || !/^[0-9a-f]{40}$/.test(r.git_commit as string)) {
|
|
errors.push("git_commit: must be 40-char hex");
|
|
ok = false;
|
|
}
|
|
if (typeof r.overall_passed !== "boolean") {
|
|
errors.push("overall_passed: must be boolean");
|
|
ok = false;
|
|
}
|
|
ok = requireSha256(r.run_hash, "run_hash", errors) && ok;
|
|
for (const k of ["total_records_in", "total_records_out", "total_accepted", "total_rejected",
|
|
"total_quarantined", "total_skipped", "rag_records", "sft_records",
|
|
"preference_pairs", "total_duration_ms"]) {
|
|
if (typeof (r as any)[k] !== "number") {
|
|
errors.push(`${k}: expected number`);
|
|
ok = false;
|
|
}
|
|
}
|
|
if (!Array.isArray(r.stages)) {
|
|
errors.push("stages: expected array");
|
|
ok = false;
|
|
}
|
|
|
|
if (!ok) return { valid: false, errors };
|
|
return { valid: true, value: r as unknown as RunSummary };
|
|
}
|