Forensic-grade per-stage receipts wrapping all 5 implemented pipeline
stages. Pure additive observability — does NOT modify scoring,
filtering, or schemas (spec non-negotiable).
Files (6 new):
auditor/schemas/distillation/stage_receipt.ts StageReceipt v1
auditor/schemas/distillation/run_summary.ts RunSummary v1
auditor/schemas/distillation/drift_report.ts DriftReport v1, severity {ok|warn|alert}
scripts/distillation/receipts.ts runAllWithReceipts + buildDrift + CLI
tests/distillation/receipts.test.ts 18 tests (schema, hash, drift, aggregation)
reports/distillation/phase5-receipts-report.md acceptance report
Stages wrapped:
collect (build_evidence_index → data/evidence/)
score (score_runs → data/scored-runs/)
export-rag (exports/rag/playbooks.jsonl)
export-sft (exports/sft/instruction_response.jsonl)
export-preference (exports/preference/chosen_rejected.jsonl)
Reserved (not yet implemented): extract-playbooks, index.
Output tree (per run_id):
reports/distillation/<run_id>/
collect.json score.json export-rag.json export-sft.json export-preference.json
summary.json summary.md drift.json
Test metrics: 135 distillation tests pass · 0 fail · 353 expects · 1.5s
(Phase 5 added 18; total 117→135)
Real-data run-all (run_id=78072357-835d-...):
total_records_in: 5,277 (across 5 stages)
total_records_out: 4,319
datasets: rag=448 sft=353 preference=83
total_quarantined: 1,937 (score's partial+human + each export's quarantine)
overall_passed: false (collect skipped 2 outcomes.jsonl rows missing created_at —
carry-over from Phase 2; faithfully propagated)
run_hash: 7a14d8cdd6980048a075efe97043683a4f9aabb38ec1faa8982c9887593090e0
Drift detection (second run):
prior_run_id detected automatically
severity=ok (no count or category swung >20%)
flags: ["run_hash differs from prior run"] — expected, since recorded_at
is baked into provenance and changes per run. No false alert.
Contamination firewall — verified at receipt level:
export-sft validation.errors: [] (re-reads SFT output, fails loud if any
quality_score is rejected/needs_human_review)
export-preference validation.errors: [] (re-reads, fails loud if any
chosen_run_id == rejected_run_id or chosen text == rejected text)
Invariants enforced (proven by tests + real run):
- Every stage emits ONE receipt per run (5/5 on disk)
- All receipts share run_id (uuid generated per run-all)
- aggregateIoHash is order-independent + collision-free across path/content
- Schema validators gate every receipt before write (defense in depth)
- Drift detection: pct_change > 20% → warn; new error class → warn
- Failure propagation: any stage validation.passed=false → overall_passed=false
- Self-validation: harness throws if RunSummary/DriftReport fail their own schema
CLI:
bun run scripts/distillation/receipts.ts run-all
bun run scripts/distillation/receipts.ts read --run-id <id>
Spec acceptance gate (now.md Phase 5):
[x] every stage emits receipts
[x] summary files exist
[x] drift detection works (severity ok|warn|alert)
[x] hashes stable across identical runs
[x] tests pass (18 new + 117 cumulative = 135)
[x] real pipeline run produces full receipt tree (8 files)
[x] failures visible and explicit
Known gaps (carry-overs):
- deterministic_violation flag exists in DriftReport but not yet populated
(requires comparing input_hash AND output_hash across runs; current
implementation compares output only)
- recorded_at baked into provenance means identical source produces different
output_hash on different runs — workaround: --recorded-at pin for repro tests
- drift threshold hard-coded at 20%; should be env-overridable for noisy datasets
- stages still continue running even if upstream stage failed; exports use stale
scored-runs in that case. Acceptable because export validation_pass reflects
health, but future tightening could short-circuit.
Phase 6 (acceptance gate suite) unblocked.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
91 lines
3.0 KiB
TypeScript
// run_summary.ts — aggregates StageReceipt rows for one run_id.
|
|
// Spec field set: total records processed, total accepted/rejected/
|
|
// quarantined, dataset sizes, validation status, overall hash of run.
|
|
|
|
import {
|
|
ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireSha256,
|
|
} from "./types";
|
|
import type { StageName } from "./stage_receipt";
|
|
|
|
// Current RunSummary schema version; validateRunSummary rejects any other value.
// Bump when the RunSummary field set changes incompatibly.
export const RUN_SUMMARY_SCHEMA_VERSION = 1;
|
|
|
|
export interface RunStageSummary {
|
|
stage: StageName;
|
|
records_in: number;
|
|
records_out: number;
|
|
accepted: number;
|
|
rejected: number;
|
|
quarantined: number;
|
|
skipped: number;
|
|
passed: boolean;
|
|
duration_ms: number;
|
|
output_hash: string;
|
|
}
|
|
|
|
export interface RunSummary {
|
|
schema_version: number;
|
|
run_id: string;
|
|
started_at: string; // earliest stage timestamp
|
|
ended_at: string; // latest stage timestamp + duration
|
|
git_commit: string;
|
|
stages: RunStageSummary[];
|
|
// Aggregates across stages
|
|
total_records_in: number;
|
|
total_records_out: number;
|
|
total_accepted: number;
|
|
total_rejected: number;
|
|
total_quarantined: number;
|
|
total_skipped: number;
|
|
// Dataset sizes — final outputs of each export stage
|
|
rag_records: number;
|
|
sft_records: number;
|
|
preference_pairs: number;
|
|
// Pipeline-wide pass = AND of every stage validation.passed
|
|
overall_passed: boolean;
|
|
// Run-wide hash: sha256 over each stage's output hash, sorted by stage name.
|
|
// Detects ANY change in any stage output across runs.
|
|
run_hash: string;
|
|
total_duration_ms: number;
|
|
}
|
|
|
|
export function validateRunSummary(input: unknown): ValidationResult<RunSummary> {
|
|
const errors: string[] = [];
|
|
if (typeof input !== "object" || input === null) {
|
|
return { valid: false, errors: ["expected object"] };
|
|
}
|
|
const r = input as Record<string, unknown>;
|
|
let ok = true;
|
|
|
|
if (r.schema_version !== RUN_SUMMARY_SCHEMA_VERSION) {
|
|
errors.push(`schema_version: expected ${RUN_SUMMARY_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
|
|
ok = false;
|
|
}
|
|
ok = requireString(r.run_id, "run_id", errors) && ok;
|
|
ok = requireIsoTimestamp(r.started_at, "started_at", errors) && ok;
|
|
ok = requireIsoTimestamp(r.ended_at, "ended_at", errors) && ok;
|
|
if (typeof r.git_commit !== "string" || !/^[0-9a-f]{40}$/.test(r.git_commit as string)) {
|
|
errors.push("git_commit: must be 40-char hex");
|
|
ok = false;
|
|
}
|
|
if (typeof r.overall_passed !== "boolean") {
|
|
errors.push("overall_passed: must be boolean");
|
|
ok = false;
|
|
}
|
|
ok = requireSha256(r.run_hash, "run_hash", errors) && ok;
|
|
for (const k of ["total_records_in", "total_records_out", "total_accepted", "total_rejected",
|
|
"total_quarantined", "total_skipped", "rag_records", "sft_records",
|
|
"preference_pairs", "total_duration_ms"]) {
|
|
if (typeof (r as any)[k] !== "number") {
|
|
errors.push(`${k}: expected number`);
|
|
ok = false;
|
|
}
|
|
}
|
|
if (!Array.isArray(r.stages)) {
|
|
errors.push("stages: expected array");
|
|
ok = false;
|
|
}
|
|
|
|
if (!ok) return { valid: false, errors };
|
|
return { valid: true, value: r as unknown as RunSummary };
|
|
}
|