diff --git a/auditor/schemas/distillation/drift_report.ts b/auditor/schemas/distillation/drift_report.ts index f05e71f..9f979f2 100644 --- a/auditor/schemas/distillation/drift_report.ts +++ b/auditor/schemas/distillation/drift_report.ts @@ -15,7 +15,7 @@ import { } from "./types"; import type { StageName } from "./stage_receipt"; -export const DRIFT_REPORT_SCHEMA_VERSION = 1; +export const DRIFT_REPORT_SCHEMA_VERSION = 2; export const DRIFT_THRESHOLD_PCT = 0.20; export type DriftSeverity = "ok" | "warn" | "alert"; @@ -27,7 +27,11 @@ export interface StageDrift { delta_accepted: number; delta_quarantined: number; pct_change_out: number | null; // null when prior had 0 records - input_hash_match: boolean; + // null when input_hash isn't materialized into the stage summary — + // schema v1 lied and reported `true` here. v2 is honest: callers + // that want determinism enforcement must read the full StageReceipt + // off disk and compute input_hash equality there. + input_hash_match: boolean | null; output_hash_match: boolean; // alert if input_hash matches but output_hash diverges deterministic_violation: boolean; diff --git a/auditor/schemas/distillation/evidence_record.ts b/auditor/schemas/distillation/evidence_record.ts index 62bbe39..6730646 100644 --- a/auditor/schemas/distillation/evidence_record.ts +++ b/auditor/schemas/distillation/evidence_record.ts @@ -121,6 +121,14 @@ export interface EvidenceRecord { // and have no text payload. Present for distilled_*, contract_analyses, // mode_experiments, scrum_reviews etc. text?: string; + + // ── Domain-specific metadata bucket ── + // Source-specific fields that don't earn a top-level slot. e.g. + // contract_analyses rows carry `contractor` here; mode_experiments + // could carry `corpus_set`. Typed scalar values only — keep this + // small or it becomes a junk drawer. Added 2026-04-27 (Kimi audit + // flagged `(ev as any).contractor` schema bypass at export_sft.ts:126). + metadata?: Record; } export function validateEvidenceRecord(input: unknown): ValidationResult { diff --git a/crates/gateway/src/v1/mode.rs b/crates/gateway/src/v1/mode.rs index 29ead20..4123b6e 100644 --- a/crates/gateway/src/v1/mode.rs +++ b/crates/gateway/src/v1/mode.rs @@ -1032,14 +1032,14 @@ mod tests { preferred_mode: "codereview".into(), fallback_modes: vec!["consensus".into()], default_model: "qwen3-coder:480b".into(), - matrix_corpus: Some("distilled_procedural_v1".into()), + matrix_corpus: vec!["distilled_procedural_v1".into()], }, TaskClassEntry { name: "broken".into(), preferred_mode: "nonsense_mode".into(), fallback_modes: vec!["consensus".into()], default_model: "x".into(), - matrix_corpus: None, + matrix_corpus: vec![], }, ], default: DefaultEntry { diff --git a/scripts/distillation/export_sft.ts b/scripts/distillation/export_sft.ts index 2a6c592..c6192fe 100644 --- a/scripts/distillation/export_sft.ts +++ b/scripts/distillation/export_sft.ts @@ -122,9 +122,18 @@ function synthesizeSft( case "observer_reviews": instruction = `Observer-review the latest attempt on '${ev.source_files?.[0] ?? ""}'. Verdict: accept | reject | cycle.`; break; - case "contract_analyses": - instruction = `Analyze contractor '${(ev as any).contractor ?? ""}' for permit '${ev.task_id.replace(/^permit:/, "")}'. Recommend with risk markers.`; + case "contract_analyses": { + // Read contractor from the typed metadata bucket (populated in + // transforms.ts for contract_analyses rows). Pre-2026-04-27 this + // used `(ev as any).contractor` and silently emitted "" + // for every row because EvidenceRecord didn't carry the field. + const contractor = typeof ev.metadata?.contractor === "string" ? ev.metadata.contractor : null; + const permit = ev.task_id.replace(/^permit:/, ""); + instruction = contractor + ? `Analyze contractor '${contractor}' for permit '${permit}'. Recommend with risk markers.` + : `Analyze permit '${permit}'. Recommend with risk markers.`; break; + } case "outcomes": instruction = `Run scenario; report per-event outcome with citations.`; break; diff --git a/scripts/distillation/receipts.ts b/scripts/distillation/receipts.ts index c3b11bc..43b99d7 100644 --- a/scripts/distillation/receipts.ts +++ b/scripts/distillation/receipts.ts @@ -451,7 +451,7 @@ export function buildDrift(current: RunSummary, prior: RunSummary | null): Drift delta_accepted: cur.accepted, delta_quarantined: cur.quarantined, pct_change_out: null, - input_hash_match: false, + input_hash_match: null, // no prior stage to compare output_hash_match: false, deterministic_violation: false, notes: ["stage not present in prior run"], @@ -461,12 +461,12 @@ export function buildDrift(current: RunSummary, prior: RunSummary | null): Drift } const pct = pctChange(pri.records_out, cur.records_out); const out_match = pri.output_hash === cur.output_hash; - const inp_match = (current.stages.find(s => s.stage === cur.stage)?.output_hash ?? "") - !== "" /* placeholder */; - // We have output_hash on stage summaries but not input_hash — - // input_hash lives on the full StageReceipt, which we can re-read - // from the run dir if needed. For simplicity, drift compares the - // OUTPUT hashes (what really changed). + // input_hash is NOT materialized into stage summaries (lives on the + // per-stage StageReceipt files on disk). We don't load them here, so + // we honestly report null. Schema v2 makes this explicit; v1 returned + // `true` unconditionally which made deterministic_violation always + // false even when it should have alerted. Cross-run determinism + // enforcement is its own pass — see ./scripts/distill audit-full. const notes: string[] = []; if (pct !== null && Math.abs(pct) > DRIFT_THRESHOLD_PCT) { const dir = pct > 0 ? "spike" : "drop"; @@ -492,9 +492,9 @@ export function buildDrift(current: RunSummary, prior: RunSummary | null): Drift delta_accepted: cur.accepted - pri.accepted, delta_quarantined: cur.quarantined - pri.quarantined, pct_change_out: pct, - input_hash_match: true, // simplified — see comment above + input_hash_match: null, // not computed at this layer; see comment above output_hash_match: out_match, - deterministic_violation: false, // requires input_hash match, see future tightening + deterministic_violation: false, // requires input_hash match — null means "unknown", not "verified" notes, }); } diff --git a/scripts/distillation/replay.ts b/scripts/distillation/replay.ts index 88a5ce3..0a6559f 100644 --- a/scripts/distillation/replay.ts +++ b/scripts/distillation/replay.ts @@ -375,7 +375,12 @@ export async function replay(opts: ReplayRequest, root = DEFAULT_ROOT): Promise< } } - const recorded_run_id = `replay:${task_hash.slice(0, 16)}:${Date.now()}`; + // Stable derivation from task_hash + recorded_at (already an ISO + // timestamp captured at start of the call). Avoids a second wall-clock + // read and makes run_id reproducible given a fixed recorded_at — useful + // for fixture-driven tests + acceptance gates. Replaces Date.now()-based + // id post-Kimi-audit 2026-04-27. + const recorded_run_id = `replay:${task_hash.slice(0, 16)}:${(await canonicalSha256(recorded_at)).slice(0, 12)}`; const result: ReplayResult = { input_task: opts.task, task_hash, diff --git a/scripts/distillation/score_runs.ts b/scripts/distillation/score_runs.ts index edd8bfa..45f21b4 100644 --- a/scripts/distillation/score_runs.ts +++ b/scripts/distillation/score_runs.ts @@ -86,6 +86,17 @@ function gitDirty(root: string): boolean { return r.status === 0 && r.stdout.trim().length > 0; } +// Composite dedup key — `sig_hash:scorer_version`. Keying on sig_hash +// alone made scorer-rule bumps invisible: a bumped SCORER_VERSION +// produced different scoring categories, but pre-existing rows on disk +// (with the OLD version) still matched the new sig_hash and the new +// scoring was silently skipped. Compositing version forces re-scoring +// when the version changes. Caller tags `scorer_version` on the +// ScoredRun row, which we read alongside sig_hash. +function dedupKey(sig_hash: string, scorer_version: string): string { + return `${sig_hash}:${scorer_version}`; +} + function loadSeenHashes(out_path: string): Set { const seen = new Set(); if (!existsSync(out_path)) return seen; @@ -93,7 +104,9 @@ function loadSeenHashes(out_path: string): Set { if (!line) continue; try { const row = JSON.parse(line); - if (row?.provenance?.sig_hash) seen.add(row.provenance.sig_hash); + const sh = row?.provenance?.sig_hash; + const sv = row?.scorer_version; + if (sh && sv) seen.add(dedupKey(sh, sv)); } catch { /* malformed — ignore */ } } return seen; @@ -156,11 +169,12 @@ async function processEvidenceFile( } const scored = await buildScoredRun(ev.value as EvidenceRecord, out_relpath, i, opts.recorded_at); - if (seen.has(scored.provenance.sig_hash)) { + const key = dedupKey(scored.provenance.sig_hash, scored.scorer_version); + if (seen.has(key)) { result.rows_deduped++; continue; } - seen.add(scored.provenance.sig_hash); + seen.add(key); const sv = validateScoredRun(scored); if (!sv.valid) { diff --git a/scripts/distillation/scorer.ts b/scripts/distillation/scorer.ts index 5cd292d..2ad7317 100644 --- a/scripts/distillation/scorer.ts +++ b/scripts/distillation/scorer.ts @@ -27,7 +27,11 @@ import type { ScoreCategory, ScoredRun } from "../../auditor/schemas/distillatio import { SCORED_RUN_SCHEMA_VERSION } from "../../auditor/schemas/distillation/scored_run"; import { canonicalSha256 } from "../../auditor/schemas/distillation/types"; -export const SCORER_VERSION = process.env.LH_SCORER_VERSION ?? "v1.0.0"; +// Hardcoded — the deterministic-output contract requires this. Bump the +// literal in the same commit as any scoring-rule change so the version +// stamp moves atomically with logic. Env override removed 2026-04-27 +// after Kimi audit flagged identical-input-different-version drift. +export const SCORER_VERSION = "v1.0.0"; export interface ScoreOutput { category: ScoreCategory; diff --git a/scripts/distillation/transforms.ts b/scripts/distillation/transforms.ts index 6146618..da5de77 100644 --- a/scripts/distillation/transforms.ts +++ b/scripts/distillation/transforms.ts @@ -100,6 +100,9 @@ export const TRANSFORMS: TransformDef[] = [ cost_usd: typeof row.cost === "number" ? row.cost / 1_000_000 : undefined, latency_ms: row.duration_ms, text: row.analysis, + metadata: typeof row.contractor === "string" && row.contractor.length > 0 + ? { contractor: row.contractor } + : undefined, }), }, { @@ -178,7 +181,11 @@ export const TRANSFORMS: TransformDef[] = [ // even though the text field is empty. source_file_relpath: "data/_kb/auto_apply.jsonl", transform: ({ row, line_offset, source_file_relpath, recorded_at, sig_hash }) => { - const ts: string = row.ts ?? new Date().toISOString(); + // Deterministic fallback: use the source-file's recorded_at when + // the row itself lacks a ts. Wall-clock (new Date()) leaked here + // pre-2026-04-27 — broke bit-identical reproducibility on rows + // that historically wrote without a ts field. + const ts: string = row.ts ?? recorded_at; const action = String(row.action ?? "unknown"); const success = action === "committed"; const reverted = action.includes("reverted");