// lakehouse/auditor/schemas/distillation/sft_sample.ts

// SftSample — entry in exports/sft/instruction_response.jsonl. Spec
// non-negotiable: ONLY accepted runs, never partial/rejected/needs_human.
// Validator enforces that invariant — exporters can't bypass.
import {
  ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireNumber,
} from "./types";

/**
 * Schema version that SFT sample records must carry.
 * `validateSftSample` rejects any record whose `schema_version` is not
 * strictly equal to this value.
 */
export const SFT_SAMPLE_SCHEMA_VERSION = 1;

/**
 * One instruction/response pair destined for supervised fine-tuning export.
 *
 * Values of this type are expected to come out of `validateSftSample`, which
 * is where the runtime constraints noted on the fields below are enforced.
 */
export interface SftSample {
  /** Must equal `SFT_SAMPLE_SCHEMA_VERSION` to pass validation. */
  schema_version: number;

  /** The prompt / user message. Whitespace-only values are rejected by the validator. */
  instruction: string;

  /** Optional retrieved context that was visible. */
  context?: string;

  /** The model output that was accepted. Whitespace-only values are rejected by the validator. */
  response: string;

  /** Presumably the id of the run this sample was taken from — confirm against the exporter. */
  source_run_id: string;

  /** Hard-pinned to the literal "accepted" — see validator. */
  quality_score: "accepted";

  /** Checked by the validator via `requireIsoTimestamp`. */
  exported_at: string;

  /** Checked by the validator via `requireProvenance`; field semantics are defined by that helper. */
  provenance: {
    source_file: string;
    line_offset?: number;
    sig_hash: string;
    recorded_at: string;
  };
}

/**
 * Validates an untrusted value as an {@link SftSample}.
 *
 * A non-object (or `null`) input is rejected immediately with a single
 * "expected object" error. For object inputs every check runs regardless of
 * earlier failures, so `errors` reports all problems found, in check order.
 *
 * @param input - Candidate record of unknown shape.
 * @returns `{ valid: true, value }` — where `value` is the same object
 *   reference as `input` — when every check passes; otherwise
 *   `{ valid: false, errors }`.
 */
export function validateSftSample(input: unknown): ValidationResult<SftSample> {
  if (input === null || typeof input !== "object") {
    return { valid: false, errors: ["expected object"] };
  }

  const record = input as Record<string, unknown>;
  const errors: string[] = [];

  // Records a failure message and yields `false` so it can sit on the
  // right-hand side of `condition || reject(...)`.
  const reject = (message: string): false => {
    errors.push(message);
    return false;
  };
  const isBlank = (candidate: unknown): boolean =>
    typeof candidate === "string" && candidate.trim() === "";

  // Array elements are evaluated in order, so the sequence of pushed error
  // messages follows the listing below. No entry short-circuits another.
  const outcomes: boolean[] = [
    record.schema_version === SFT_SAMPLE_SCHEMA_VERSION ||
      reject(`schema_version: expected ${SFT_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(record.schema_version)}`),

    requireString(record.instruction, "instruction", errors),
    requireString(record.response, "response", errors),
    requireString(record.source_run_id, "source_run_id", errors),
    requireIsoTimestamp(record.exported_at, "exported_at", errors),
    requireProvenance(record.provenance, "provenance", errors),

    // Empty pair guard: a string made only of whitespace is not a usable
    // instruction or response.
    !isBlank(record.instruction) ||
      reject("instruction: must be non-whitespace (no empty pairs)"),
    !isBlank(record.response) ||
      reject("response: must be non-whitespace (no empty pairs)"),

    // The non-negotiable: only accepted runs may become SFT samples.
    // Any other quality_score is a leak from rejected/partial/needs_human.
    record.quality_score === "accepted" ||
      reject(`quality_score: must be 'accepted' (no rejected/partial/needs_human leak into SFT — spec non-negotiable). Got ${JSON.stringify(record.quality_score)}`),

    // context is optional, but must be a string when supplied.
    record.context === undefined ||
      typeof record.context === "string" ||
      reject("context: expected string when present"),
  ];

  if (outcomes.every(Boolean)) {
    return { valid: true, value: record as unknown as SftSample };
  }
  return { valid: false, errors };
}