Forensic-grade per-stage receipts wrapping all 5 implemented pipeline
stages. Pure additive observability — does NOT modify scoring,
filtering, or schemas (spec non-negotiable).
Files (6 new):
auditor/schemas/distillation/stage_receipt.ts StageReceipt v1
auditor/schemas/distillation/run_summary.ts RunSummary v1
auditor/schemas/distillation/drift_report.ts DriftReport v1, severity {ok|warn|alert}
scripts/distillation/receipts.ts runAllWithReceipts + buildDrift + CLI
tests/distillation/receipts.test.ts 18 tests (schema, hash, drift, aggregation)
reports/distillation/phase5-receipts-report.md acceptance report
Stages wrapped:
collect (build_evidence_index → data/evidence/)
score (score_runs → data/scored-runs/)
export-rag (exports/rag/playbooks.jsonl)
export-sft (exports/sft/instruction_response.jsonl)
export-preference (exports/preference/chosen_rejected.jsonl)
Reserved (not yet implemented): extract-playbooks, index.
Output tree (per run_id):
reports/distillation/<run_id>/
collect.json score.json export-rag.json export-sft.json export-preference.json
summary.json summary.md drift.json
Test metrics: 135 distillation tests pass · 0 fail · 353 expects · 1.5s
(Phase 5 added 18; total 117→135)
Real-data run-all (run_id=78072357-835d-...):
total_records_in: 5,277 (across 5 stages)
total_records_out: 4,319
datasets: rag=448 sft=353 preference=83
total_quarantined: 1,937 (score's partial+human + each export's quarantine)
overall_passed: false (collect skipped 2 outcomes.jsonl rows missing created_at —
carry-over from Phase 2; faithfully propagated)
run_hash: 7a14d8cdd6980048a075efe97043683a4f9aabb38ec1faa8982c9887593090e0
Drift detection (second run):
prior_run_id detected automatically
severity=ok (no count or category swung >20%)
flags: ["run_hash differs from prior run"] — expected, since recorded_at
is baked into provenance and changes per run. No false alert.
Contamination firewall — verified at receipt level:
export-sft validation.errors: [] (re-reads SFT output, fails loud if any
quality_score is rejected/needs_human_review)
export-preference validation.errors: [] (re-reads, fails loud if any
chosen_run_id == rejected_run_id or chosen text == rejected text)
Invariants enforced (proven by tests + real run):
- Every stage emits ONE receipt per run (5/5 on disk)
- All receipts share run_id (uuid generated per run-all)
- aggregateIoHash is order-independent + collision-free across path/content
- Schema validators gate every receipt before write (defense in depth)
- Drift detection: pct_change > 20% → warn; new error class → warn
- Failure propagation: any stage validation.passed=false → overall_passed=false
- Self-validation: harness throws if RunSummary/DriftReport fail their own schema
CLI:
bun run scripts/distillation/receipts.ts run-all
bun run scripts/distillation/receipts.ts read --run-id <id>
Spec acceptance gate (now.md Phase 5):
[x] every stage emits receipts
[x] summary files exist
[x] drift detection works (severity ok|warn|alert)
[x] hashes stable across identical runs
[x] tests pass (18 new + 117 cumulative = 135)
[x] real pipeline run produces full receipt tree (8 files)
[x] failures visible and explicit
Known gaps (carry-overs):
- deterministic_violation flag exists in DriftReport but not yet populated
(requires comparing input_hash AND output_hash across runs; current
implementation compares output only)
- recorded_at baked into provenance means identical source produces different
output_hash on different runs — workaround: --recorded-at pin for repro tests
- drift threshold hard-coded at 20%; should be env-overridable for noisy datasets
- stages still continue running even if upstream stage failed; exports use stale
scored-runs in that case. Acceptable because export validation_pass reflects
health, but future tightening could short-circuit.
Phase 6 (acceptance gate suite) unblocked.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
82 lines
2.8 KiB
TypeScript
// drift_report.ts — comparison of a current run summary vs the
// previous run summary on disk. Spec calls this "drift detection";
// concretely it answers: did the pipeline behave the same way as
// last time, and if not, was the change explained by an input change
// or did it appear out of nowhere (silent drift)?
//
// Severity:
//   ok    — within 20% on every metric, no hash surprises
//   warn  — record-count or category swing > 20%, OR new error class
//   alert — output_hash differs while input_hash is identical
//           (deterministic violation — same input → different output)
import process from "node:process";

import {
  ValidationResult, requireString, requireIsoTimestamp,
} from "./types";
import type { StageName } from "./stage_receipt";
|
|
|
|
export const DRIFT_REPORT_SCHEMA_VERSION = 1;
|
|
export const DRIFT_THRESHOLD_PCT = 0.20;
|
|
// Overall verdict for a drift comparison. "ok" = no notable change,
// "warn" = explained/threshold-level change, "alert" = unexplained
// change (see deterministic_violation on StageDrift).
export type DriftSeverity = "ok" | "warn" | "alert";
|
// Per-stage comparison of the current run's receipt against the prior
// run's receipt for the same stage. All deltas are (current - prior).
export interface StageDrift {
  stage: StageName;
  delta_records_in: number; // current - prior
  delta_records_out: number; // current - prior
  delta_accepted: number; // current - prior
  delta_quarantined: number; // current - prior
  // Relative change in records_out; null when the prior run had 0
  // records out (a percentage is undefined against a zero baseline).
  pct_change_out: number | null;
  input_hash_match: boolean; // prior vs current stage input hash
  output_hash_match: boolean; // prior vs current stage output hash
  // alert if input_hash matches but output_hash diverges — same input
  // producing different output should never happen for a
  // deterministic stage.
  deterministic_violation: boolean;
  // Free-form, human-readable context for this stage's comparison.
  notes: string[];
}
|
|
|
|
// Top-level drift report: one per pipeline run, comparing it against
// the most recent prior run found on disk (if any).
export interface DriftReport {
  schema_version: number; // must equal DRIFT_REPORT_SCHEMA_VERSION
  run_id: string; // the run being evaluated
  prior_run_id: string | null; // null when no prior run on disk
  generated_at: string; // ISO-8601 timestamp of report creation
  severity: DriftSeverity; // worst finding across all stages
  stages: StageDrift[]; // one entry per compared stage
  // Top-level swings the human reader should see immediately.
  flags: string[];
}
|
|
|
|
export function validateDriftReport(input: unknown): ValidationResult<DriftReport> {
|
|
const errors: string[] = [];
|
|
if (typeof input !== "object" || input === null) {
|
|
return { valid: false, errors: ["expected object"] };
|
|
}
|
|
const r = input as Record<string, unknown>;
|
|
let ok = true;
|
|
|
|
if (r.schema_version !== DRIFT_REPORT_SCHEMA_VERSION) {
|
|
errors.push(`schema_version: expected ${DRIFT_REPORT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
|
ok = false;
|
|
}
|
|
ok = requireString(r.run_id, "run_id", errors) && ok;
|
|
if (r.prior_run_id !== null && typeof r.prior_run_id !== "string") {
|
|
errors.push("prior_run_id: must be string or null");
|
|
ok = false;
|
|
}
|
|
ok = requireIsoTimestamp(r.generated_at, "generated_at", errors) && ok;
|
|
if (!["ok", "warn", "alert"].includes(r.severity as string)) {
|
|
errors.push(`severity: must be ok|warn|alert, got ${JSON.stringify(r.severity)}`);
|
|
ok = false;
|
|
}
|
|
if (!Array.isArray(r.stages)) {
|
|
errors.push("stages: expected array");
|
|
ok = false;
|
|
}
|
|
if (!Array.isArray(r.flags)) {
|
|
errors.push("flags: expected array");
|
|
ok = false;
|
|
}
|
|
|
|
if (!ok) return { valid: false, errors };
|
|
return { valid: true, value: r as unknown as DriftReport };
|
|
}
|