// lakehouse/auditor/schemas/distillation/sft_sample.ts

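// SftSample — one entry in exports/sft/instruction_response.jsonl. Spec
// non-negotiable: rejected/needs_human_review runs NEVER ship. Only accepted
// runs ship by default; partially_accepted ships only with --include-partial.
// The validator enforces the quality_score allow-list so exporters can't
// bypass it. It accepts both allowed scores and does not check the flag itself.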
import {
  ValidationResult, requireString, requireIsoTimestamp, requireProvenance, requireNumber,
} from "./types";

/**
 * Schema version expected in `SftSample.schema_version`.
 * `validateSftSample` rejects any record whose `schema_version` is not
 * strictly equal to this value.
 */
export const SFT_SAMPLE_SCHEMA_VERSION = 1;

// Allow-list of quality scores that may appear on an SFT sample.
// SFT default: only `accepted` ships. With the --include-partial CLI flag,
// `partially_accepted` becomes legal. `rejected` and `needs_human_review`
// NEVER ship to SFT — that's the contamination firewall.
// NOTE(review): this list contains BOTH allowed scores, and the validator in
// this file only checks membership in it. Nothing here looks at the
// --include-partial flag, so default-mode filtering of `partially_accepted`
// presumably happens in the exporter — TODO confirm.
export const SFT_QUALITY_SCORES = ["accepted", "partially_accepted"] as const;
// Union of the literal members of SFT_QUALITY_SCORES.
export type SftQualityScore = (typeof SFT_QUALITY_SCORES)[number];

/**
 * Shape of a single supervised-fine-tuning sample (one instruction/response
 * pair plus bookkeeping). `validateSftSample` is the runtime check for this
 * shape; the per-field notes below describe what that validator requires.
 */
export interface SftSample {
  // Must equal SFT_SAMPLE_SCHEMA_VERSION to pass validation.
  schema_version: number;
  // Checked with requireString.
  id: string;
  // Checked with requireString and additionally must not be whitespace-only.
  instruction: string;             // the prompt / user message
  context: string;                 // retrieved context that was visible (empty string allowed; null/undefined not)
  // Checked with requireString and additionally must not be whitespace-only.
  response: string;                // the model output that was accepted
  // Checked with requireString. Presumably the id of the run this sample was
  // derived from — inferred from the name; confirm against the exporter.
  source_run_id: string;
  // Must be a member of SFT_QUALITY_SCORES; anything else fails validation.
  quality_score: SftQualityScore;
  // Checked with requireIsoTimestamp (helper defined in ./types).
  created_at: string;
  // Checked with requireProvenance (helper defined in ./types).
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}

/**
 * Validates an untrusted value against the `SftSample` shape.
 *
 * Every check runs on each call — there is no bail-out after the first
 * failure — so the returned `errors` array lists all problems found, in
 * the order the checks are written below.
 *
 * Both members of `SFT_QUALITY_SCORES` pass the quality gate here,
 * `partially_accepted` included; this function takes no flag to narrow it.
 *
 * @param input - Candidate value of unknown shape.
 * @returns `{ valid: true, value }` when every check passes (the value is
 *   the same object, re-typed, not a copy); otherwise
 *   `{ valid: false, errors }`. Non-object and `null` inputs yield the
 *   single error `"expected object"`.
 */
export function validateSftSample(input: unknown): ValidationResult<SftSample> {
  // Guard clause: nothing else can be inspected unless we have an object.
  if (input === null || typeof input !== "object") {
    return { valid: false, errors: ["expected object"] };
  }

  const rec = input as Record<string, unknown>;
  const errors: string[] = [];
  let passed = true;

  // True only for strings that are empty or made up solely of whitespace.
  const isBlankString = (value: unknown): boolean =>
    typeof value === "string" && value.trim().length === 0;

  if (rec.schema_version !== SFT_SAMPLE_SCHEMA_VERSION) {
    errors.push(`schema_version: expected ${SFT_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(rec.schema_version)}`);
    passed = false;
  }

  // Shared field helpers. Array-literal elements are evaluated in order and
  // unconditionally, so each helper gets to report its own error even when
  // an earlier one has already failed.
  const helperOutcomes = [
    requireString(rec.id, "id", errors),
    requireString(rec.instruction, "instruction", errors),
    requireString(rec.response, "response", errors),
    requireString(rec.source_run_id, "source_run_id", errors),
    requireIsoTimestamp(rec.created_at, "created_at", errors),
    requireProvenance(rec.provenance, "provenance", errors),
  ];
  if (helperOutcomes.some((outcome) => !outcome)) {
    passed = false;
  }

  // Empty-pair guard: a whitespace-only prompt or answer is not a usable pair.
  if (isBlankString(rec.instruction)) {
    errors.push("instruction: must be non-whitespace (no empty pairs)");
    passed = false;
  }
  if (isBlankString(rec.response)) {
    errors.push("response: must be non-whitespace (no empty pairs)");
    passed = false;
  }

  // Context has to be a string, but "" is fine: some samples are plain
  // instruction→response with no retrieval context at all.
  if (typeof rec.context !== "string") {
    errors.push("context: expected string (use empty string for no-context samples)");
    passed = false;
  }

  // The non-negotiable gate: quality_score must be on the allow-list.
  // Any other value is treated as a leak.
  const scoreAllowed = SFT_QUALITY_SCORES.some((allowed) => allowed === rec.quality_score);
  if (!scoreAllowed) {
    errors.push(`quality_score: must be one of ${SFT_QUALITY_SCORES.join("|")} (no rejected/needs_human leak into SFT — spec non-negotiable). Got ${JSON.stringify(rec.quality_score)}`);
    passed = false;
  }

  if (passed) {
    return { valid: true, value: rec as unknown as SftSample };
  }
  return { valid: false, errors };
}