// SftSample — entry in exports/sft/instruction_response.jsonl.
//
// Spec non-negotiable: `rejected` and `needs_human_review` runs never reach
// SFT. validateSftSample() enforces this by rejecting any quality_score that
// is not listed in SFT_QUALITY_SCORES.
//
// NOTE(review): the spec text says only `accepted` ships by default, but this
// validator also admits `partially_accepted`. Presumably the exporter applies
// the accepted-only default unless --include-partial is set — confirm against
// the exporter, since nothing in this module enforces it.

import {
  ValidationResult,
  requireString,
  requireIsoTimestamp,
  requireProvenance,
  requireNumber,
} from "./types";

/** Schema version every SftSample must carry; the validator rejects any other value. */
export const SFT_SAMPLE_SCHEMA_VERSION = 1;

/**
 * Quality scores the validator lets through.
 *
 * SFT default: only `accepted` ships. With the --include-partial CLI flag,
 * `partially_accepted` becomes legal. `rejected` and `needs_human_review`
 * NEVER ship to SFT — that's the contamination firewall.
 */
export const SFT_QUALITY_SCORES = ["accepted", "partially_accepted"] as const;

/** Union of the literal values in SFT_QUALITY_SCORES. */
export type SftQualityScore = (typeof SFT_QUALITY_SCORES)[number];

/** One instruction/response training pair together with its provenance. */
export interface SftSample {
  schema_version: number;
  id: string;
  /** The prompt / user message. Must contain non-whitespace text. */
  instruction: string;
  /** Retrieved context that was visible (empty string allowed; null/undefined not). */
  context: string;
  /** The model output that was accepted. Must contain non-whitespace text. */
  response: string;
  source_run_id: string;
  quality_score: SftQualityScore;
  created_at: string;
  provenance: {
    source_file: string;
    line_offset?: number;
    sig_hash: string;
    recorded_at: string;
  };
}

/**
 * Validates an untrusted value as an SftSample.
 *
 * All checks run even after the first failure, so `errors` lists every
 * problem found rather than just the first one.
 *
 * @param input - Candidate value, e.g. one parsed JSONL row.
 * @returns `{ valid: true, value }` when every check passes; `value` is the
 *   same object as `input` (cast, not copied). Otherwise `{ valid: false, errors }`.
 */
export function validateSftSample(input: unknown): ValidationResult {
  if (typeof input !== "object" || input === null) {
    return { valid: false, errors: ["expected object"] };
  }

  const errors: string[] = [];
  const r = input as Record<string, unknown>;
  let ok = true;

  if (r.schema_version !== SFT_SAMPLE_SCHEMA_VERSION) {
    errors.push(
      `schema_version: expected ${SFT_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`,
    );
    ok = false;
  }

  // The helper call is the left operand of `&&` so it always runs and can
  // record its own error, even when an earlier check has already failed.
  ok = requireString(r.id, "id", errors) && ok;
  ok = requireString(r.instruction, "instruction", errors) && ok;
  ok = requireString(r.response, "response", errors) && ok;
  ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
  ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
  ok = requireProvenance(r.provenance, "provenance", errors) && ok;

  // Empty pair guard: a whitespace-only instruction or response is rejected.
  // Non-string values are skipped here; requireString above has seen them.
  const instruction = r.instruction;
  if (typeof instruction === "string" && instruction.trim().length === 0) {
    errors.push("instruction: must be non-whitespace (no empty pairs)");
    ok = false;
  }
  const response = r.response;
  if (typeof response === "string" && response.trim().length === 0) {
    errors.push("response: must be non-whitespace (no empty pairs)");
    ok = false;
  }

  // Context is required-string but empty is allowed (some SFT samples
  // are pure instruction→response with no retrieval context).
  if (typeof r.context !== "string") {
    errors.push("context: expected string (use empty string for no-context samples)");
    ok = false;
  }

  // The non-negotiable: SFT samples MUST have quality_score in
  // SFT_QUALITY_SCORES. Anything else is a leak.
  if (!SFT_QUALITY_SCORES.includes(r.quality_score as SftQualityScore)) {
    errors.push(
      `quality_score: must be one of ${SFT_QUALITY_SCORES.join("|")} (no rejected/needs_human leak into SFT — spec non-negotiable). Got ${JSON.stringify(r.quality_score)}`,
    );
    ok = false;
  }

  if (!ok) return { valid: false, errors };
  return { valid: true, value: r as unknown as SftSample };
}