lakehouse/auditor/schemas/distillation/drift_report.ts

// drift_report.ts — comparison of a current run summary vs the
// previous run summary on disk. Spec calls this "drift detection";
// concretely it answers: did the pipeline behave the same way as
// last time, and if not, was the change explained by an input change
// or did it appear out of nowhere (silent drift)?
//
// Severity:
//   ok    — within 20% on every metric, no hash surprises
//   warn  — record-count or category swing > 20%, OR new error class
//   alert — output_hash differs while input_hash is identical
//           (deterministic violation — same input → different output)

import {
  ValidationResult, requireString, requireIsoTimestamp,
} from "./types";
import type { StageName } from "./stage_receipt";

/** Value that `validateDriftReport` requires in a report's `schema_version` field. */
export const DRIFT_REPORT_SCHEMA_VERSION = 2;
/**
 * Swing threshold stored as a fraction (0.20), presumably the 20% limit
 * named in the severity notes at the top of this file.
 * NOTE(review): the `_PCT` suffix suggests a percentage, but the value is a
 * fraction — confirm that callers compare it against fractional changes.
 */
export const DRIFT_THRESHOLD_PCT = 0.20;

/**
 * Overall verdict of a drift comparison. The meaning of each level
 * (ok / warn / alert) is described in the severity notes at the top of
 * this file.
 */
export type DriftSeverity = "ok" | "warn" | "alert";

/**
 * Drift figures for a single pipeline stage: how the current run's
 * counters and hashes compare with the prior run's.
 */
export interface StageDrift {
  // Which stage this entry describes.
  stage: StageName;
  // Counter deltas, all computed as current - prior.
  delta_records_in: number;       // current - prior
  delta_records_out: number;
  delta_accepted: number;
  delta_quarantined: number;
  // Relative change of the output record count.
  // NOTE(review): presumably a fraction comparable with DRIFT_THRESHOLD_PCT — confirm.
  pct_change_out: number | null;  // null when prior had 0 records
  // null when input_hash isn't materialized into the stage summary —
  // schema v1 lied and reported `true` here. v2 is honest: callers
  // that want determinism enforcement must read the full StageReceipt
  // off disk and compute input_hash equality there.
  input_hash_match: boolean | null;
  // Whether the output hash is the same in the current and prior run.
  output_hash_match: boolean;
  // alert if input_hash matches but output_hash diverges
  // NOTE(review): how this is derived when input_hash_match is null is
  // not visible in this file — confirm against the producer.
  deterministic_violation: boolean;
  // Free-form remarks attached to this stage's comparison.
  notes: string[];
}

/**
 * Result of comparing the current run summary against the previous one:
 * an overall severity, one StageDrift entry per compared stage, and a
 * list of headline flags.
 */
export interface DriftReport {
  // Must equal DRIFT_REPORT_SCHEMA_VERSION to pass validateDriftReport.
  schema_version: number;
  // Identifier of the current run.
  run_id: string;
  prior_run_id: string | null;    // null when no prior run on disk
  // Creation time of the report; validated with requireIsoTimestamp.
  generated_at: string;
  // Overall verdict for the whole report.
  severity: DriftSeverity;
  // Per-stage comparison entries.
  stages: StageDrift[];
  // Top-level swings the human reader should see immediately.
  flags: string[];
}

/**
 * Checks a single element of `stages` against the StageDrift field layout.
 *
 * Every problem found is appended to `errors`, prefixed with `path`
 * (e.g. `stages[2].notes`), so one pass reports all defects of the entry.
 *
 * @param entry  Candidate stage entry of unknown shape.
 * @param path   Prefix used in error messages to locate the entry.
 * @param errors Accumulator that receives one message per failed check.
 * @returns `true` when the entry passed every check, `false` otherwise.
 */
function validateStageDriftEntry(entry: unknown, path: string, errors: string[]): boolean {
  if (typeof entry !== "object" || entry === null || Array.isArray(entry)) {
    errors.push(`${path}: expected object`);
    return false;
  }
  const s = entry as Record<string, unknown>;
  const errorsBefore = errors.length;
  const isFiniteNumber = (v: unknown): boolean =>
    typeof v === "number" && Number.isFinite(v);

  if (typeof s.stage !== "string") {
    errors.push(`${path}.stage: expected string`);
  }
  for (const key of ["delta_records_in", "delta_records_out", "delta_accepted", "delta_quarantined"]) {
    if (!isFiniteNumber(s[key])) {
      errors.push(`${path}.${key}: expected finite number`);
    }
  }
  // null is legal here: it marks "prior had 0 records".
  if (s.pct_change_out !== null && !isFiniteNumber(s.pct_change_out)) {
    errors.push(`${path}.pct_change_out: expected finite number or null`);
  }
  // null is legal here: schema v2 reports null when input_hash is not
  // materialized into the stage summary.
  if (s.input_hash_match !== null && typeof s.input_hash_match !== "boolean") {
    errors.push(`${path}.input_hash_match: expected boolean or null`);
  }
  for (const key of ["output_hash_match", "deterministic_violation"]) {
    if (typeof s[key] !== "boolean") {
      errors.push(`${path}.${key}: expected boolean`);
    }
  }
  const notes = s.notes;
  if (!Array.isArray(notes) || !notes.every((n: unknown) => typeof n === "string")) {
    errors.push(`${path}.notes: expected array of strings`);
  }

  return errors.length === errorsBefore;
}

/**
 * Validates that `input` has the shape of a DriftReport.
 *
 * All checks run even after the first failure so the caller receives the
 * complete list of problems. Besides the top-level fields, every element
 * of `stages` is checked field by field and every element of `flags` must
 * be a string — a report whose arrays hold malformed entries is rejected
 * instead of being handed back typed as a DriftReport.
 *
 * @param input Value of unknown shape, e.g. parsed JSON.
 * @returns `{ valid: true, value }` carrying the same object reference when
 *          every check passes; otherwise `{ valid: false, errors }` with one
 *          message per failed check.
 */
export function validateDriftReport(input: unknown): ValidationResult<DriftReport> {
  const errors: string[] = [];
  if (typeof input !== "object" || input === null) {
    return { valid: false, errors: ["expected object"] };
  }
  const r = input as Record<string, unknown>;
  let ok = true;

  if (r.schema_version !== DRIFT_REPORT_SCHEMA_VERSION) {
    errors.push(`schema_version: expected ${DRIFT_REPORT_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
    ok = false;
  }
  ok = requireString(r.run_id, "run_id", errors) && ok;
  if (r.prior_run_id !== null && typeof r.prior_run_id !== "string") {
    errors.push("prior_run_id: must be string or null");
    ok = false;
  }
  ok = requireIsoTimestamp(r.generated_at, "generated_at", errors) && ok;
  if (!["ok", "warn", "alert"].includes(r.severity as string)) {
    errors.push(`severity: must be ok|warn|alert, got ${JSON.stringify(r.severity)}`);
    ok = false;
  }

  const stages = r.stages;
  if (!Array.isArray(stages)) {
    errors.push("stages: expected array");
    ok = false;
  } else {
    // Index loop (not forEach) so holes in a sparse array are checked too.
    for (let i = 0; i < stages.length; i++) {
      ok = validateStageDriftEntry(stages[i], `stages[${i}]`, errors) && ok;
    }
  }

  const flags = r.flags;
  if (!Array.isArray(flags)) {
    errors.push("flags: expected array");
    ok = false;
  } else {
    for (let i = 0; i < flags.length; i++) {
      if (typeof flags[i] !== "string") {
        errors.push(`flags[${i}]: expected string`);
        ok = false;
      }
    }
  }

  if (!ok) return { valid: false, errors };
  // The cast is backed by the runtime checks above; the compiler cannot
  // follow them through Record<string, unknown>.
  return { valid: true, value: r as unknown as DriftReport };
}