Build the contamination firewall: RAG, SFT, and Preference exporters
that turn scored evidence into clean training datasets without
leaking rejected, unvalidated, hallucinated, or provenance-free
records.
Files (8 new + 4 schema updates):
scripts/distillation/quarantine.ts shared QuarantineWriter, 11-reason taxonomy
scripts/distillation/export_rag.ts RAG exporter (--include-review opt-in)
scripts/distillation/export_sft.ts SFT exporter (--include-partial opt-in, SFT_NEVER constant)
scripts/distillation/export_preference.ts preference exporter, same task_id pairing
scripts/distillation/distill.ts CLI dispatcher (build-evidence/score/export-*)
tests/distillation/exports.test.ts 15 contamination-firewall tests
reports/distillation/phase4-export-report.md acceptance report
Schema field-name alignment with now.md:
rag_sample.ts +source_category, exported_at→created_at
sft_sample.ts +id, exported_at→created_at, partially_accepted allowed at the schema level (the CLI gates it)
preference_sample.ts +id, source_run_ids→chosen_run_id+rejected_run_id, +created_at
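The field renames above can be sketched as shapes; this is illustration only, with field names taken from the list and all types, optionality, and helper names assumed:

```typescript
// Sketch of the aligned shapes after the rename. Field names come from the
// alignment list; exact types and any omitted fields are assumptions.
interface SftSampleAligned {
  id: string;         // +id (new)
  created_at: string; // was exported_at
  // partially_accepted is representable at the schema level; the CLI gates it.
  quality_score: "accepted" | "partially_accepted";
}

interface PreferenceSampleAligned {
  id: string;              // +id (new)
  chosen_run_id: string;   // split from source_run_ids
  rejected_run_id: string; // split from source_run_ids
  created_at: string;      // +created_at (new)
}

// The split into two run-id fields makes the no-self-pair invariant checkable:
function isValidPair(p: PreferenceSampleAligned): boolean {
  return p.chosen_run_id !== p.rejected_run_id;
}
```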
Test metrics: 117 distillation tests pass · 0 fail · 315 expects · 327ms
Real-data export run (1052 scored input rows):
RAG: 446 exported (351 accepted + 95 partial), 606 quarantined
SFT: 351 exported (all 'accepted'), 701 quarantined
Preference: 83 pairs exported, 16 quarantined
CONTAMINATION FIREWALL — verified held on real data:
- SFT output: 351/351 quality_score='accepted' (ZERO leaked)
- RAG output: 351 accepted + 95 partial (ZERO rejected leaked)
- Preference: 0 self-pairs (chosen_run_id != rejected_run_id)
- 536 rejected+needs_human_review records caught at unsafe_sft_category
gate, exact match to scored-runs forbidden-category total
Defense in depth (the firewall is two layers, not one):
1. Schema layer (Phase 1): SftSample.quality_score enum forbids
rejected/needs_human_review at write time
2. Exporter layer: SFT_NEVER constant in export_sft.ts checks
category before synthesis. Even if synthesis produced a row
with quality_score=rejected, validateSftSample would reject it.
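The exporter-layer gate can be sketched like this; SFT_NEVER is named in the text, but the category list, function name, and flag handling here are assumptions:

```typescript
// Exporter-layer gate (layer 2 of the firewall). If this check were somehow
// bypassed, the schema enum (layer 1) would still reject the row at validation.
const SFT_NEVER = ["rejected", "needs_human_review"] as const;
type Category = "accepted" | "partially_accepted" | "rejected" | "needs_human_review";

// Decide, before synthesis runs, whether a scored run may enter SFT output.
// partially_accepted is quarantined unless --include-partial was passed.
function gateForSft(category: Category, includePartial: boolean): "export" | "quarantine" {
  if ((SFT_NEVER as readonly string[]).includes(category)) return "quarantine";
  if (category === "partially_accepted" && !includePartial) return "quarantine";
  return "export";
}
```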
Quarantine reasons (11): missing_provenance, missing_source_run_id,
empty_content, schema_violation, unsafe_sft_category,
unsafe_rag_category, invalid_preference_pairing,
hallucinated_file_path, duplicate_id, self_pairing,
category_disallowed.
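The taxonomy maps naturally onto a closed union type, so a typo'd reason fails at compile time instead of polluting quarantine output; the record shape around it is assumed:

```typescript
// All 11 reasons from the taxonomy above, as a const tuple + derived union.
const QUARANTINE_REASONS = [
  "missing_provenance", "missing_source_run_id", "empty_content",
  "schema_violation", "unsafe_sft_category", "unsafe_rag_category",
  "invalid_preference_pairing", "hallucinated_file_path",
  "duplicate_id", "self_pairing", "category_disallowed",
] as const;
type QuarantineReason = (typeof QUARANTINE_REASONS)[number];

// Illustrative record shape; the real QuarantineWriter fields are assumptions.
interface QuarantineRecord {
  reason: QuarantineReason;
  source_run_id?: string;
  detail: string;
}
```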
Bug surfaced and fixed during testing: a module-level evidenceCache
shared state across test runs (tests wipe TMP, but the cache held a
stale empty Map). Moved the cache to per-call scope. The Phase 2
materializer would have hit the same pattern if its tests had multiple
runs sharing state, so this doubles as a preventive fix.
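The fix, in miniature (function and loader names here are hypothetical stand-ins for the real code):

```typescript
// BEFORE (buggy pattern): a module-level cache survives across test runs.
// After a test wipes TMP, the Map still answers with stale (empty) entries.
//   const evidenceCache = new Map<string, string[]>();

// AFTER: the cache is scoped to the call, so every invocation re-reads
// fresh state while still deduplicating loads within that invocation.
function buildEvidenceIndex(
  runIds: string[],
  loadEvidence: (id: string) => string[], // stand-in for the real loader
): Map<string, string[]> {
  const cache = new Map<string, string[]>(); // per-call, dies with the call
  for (const id of runIds) {
    if (!cache.has(id)) cache.set(id, loadEvidence(id));
  }
  return cache;
}
```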
Pairing logic v1: same task_id with category gap. accepted×rejected
preferred, accepted×partially_accepted as fallback. MAX_PAIRS_PER_TASK=5
cap prevents one hot task from dominating. Future: cross-source
pairing (scrum_reviews chosen vs observer_reviews rejected on the same
file) to grow the dataset beyond 83 pairs.
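The v1 pairing rule above can be sketched as follows; the record shape and function name are assumptions, while the gap preference and the MAX_PAIRS_PER_TASK cap come from the text:

```typescript
type Cat = "accepted" | "partially_accepted" | "rejected";
interface Run { run_id: string; task_id: string; category: Cat }
const MAX_PAIRS_PER_TASK = 5;

// Pair runs sharing a task_id across a category gap: accepted×rejected
// preferred, accepted×partially_accepted only as fallback, capped per task.
function pairByTask(runs: Run[]): Array<{ chosen: string; rejected: string }> {
  const byTask = new Map<string, Run[]>();
  for (const r of runs) {
    const bucket = byTask.get(r.task_id) ?? [];
    bucket.push(r);
    byTask.set(r.task_id, bucket);
  }
  const pairs: Array<{ chosen: string; rejected: string }> = [];
  for (const bucket of byTask.values()) {
    const accepted = bucket.filter((r) => r.category === "accepted");
    const rejected = bucket.filter((r) => r.category === "rejected");
    const partial = bucket.filter((r) => r.category === "partially_accepted");
    const losers = rejected.length > 0 ? rejected : partial; // gap preference
    let count = 0;
    for (const a of accepted) {
      for (const l of losers) {
        if (count >= MAX_PAIRS_PER_TASK) break;
        pairs.push({ chosen: a.run_id, rejected: l.run_id });
        count++;
      }
    }
  }
  return pairs;
}
```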
CLI: ./scripts/distill.ts {build-evidence|score|export-rag|export-sft|export-preference|export-all|health}
Flags: --dry-run, --include-partial (SFT only), --include-review (RAG only)
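A minimal dispatcher sketch for the CLI surface above; the subcommand names and flags are from the text, while the parsing shape is an assumption:

```typescript
const SUBCOMMANDS = [
  "build-evidence", "score", "export-rag", "export-sft",
  "export-preference", "export-all", "health",
] as const;
type Subcommand = (typeof SUBCOMMANDS)[number];

// Flags are scoped in the real CLI (--include-partial is SFT-only,
// --include-review is RAG-only); --dry-run applies everywhere.
function parseCli(argv: string[]): { cmd: Subcommand; dryRun: boolean } | { error: string } {
  const [cmd = "", ...rest] = argv;
  if (!(SUBCOMMANDS as readonly string[]).includes(cmd)) {
    return { error: `unknown subcommand: ${cmd}` };
  }
  return { cmd: cmd as Subcommand, dryRun: rest.includes("--dry-run") };
}
```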
Carry-overs to Phase 5 (Receipts Harness):
- Each exporter currently writes results but no per-stage receipt.json.
Phase 5 wraps build_evidence_index + score_runs + export_* in a
withReceipt() helper that captures git_sha + sha256 of inputs/outputs
+ record_counts + validation_pass.
- reports/distillation/latest.md aggregating most-recent run of each stage.
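A rough sketch of what the Phase 5 withReceipt() wrapper could look like; the receipt fields are the ones listed above, and everything else (signatures, how git_sha is obtained, serialization of inputs) is an assumption:

```typescript
import { createHash } from "crypto";

interface Receipt {
  stage: string;
  git_sha: string;
  input_sha256: string;
  output_sha256: string;
  record_counts: { in: number; out: number };
  validation_pass: boolean;
}

const sha256 = (s: string) => createHash("sha256").update(s).digest("hex");

// Wrap a stage so every run emits a receipt alongside its output.
// gitSha would come from `git rev-parse HEAD` in the real harness.
function withReceipt(
  stage: string,
  gitSha: string,
  input: string[],
  run: (input: string[]) => { output: string[]; validationPass: boolean },
): { output: string[]; receipt: Receipt } {
  const { output, validationPass } = run(input);
  return {
    output,
    receipt: {
      stage,
      git_sha: gitSha,
      input_sha256: sha256(input.join("\n")),
      output_sha256: sha256(output.join("\n")),
      record_counts: { in: input.length, out: output.length },
      validation_pass: validationPass,
    },
  };
}
```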
Carry-overs to Phase 3 v2:
- mode_experiments scoring (168 needs_human_review): derive markers from
validation_results.grounded_fraction
- extraction-class JOIN: distilled_*/audit_facts/observer_escalations
→ JOIN to verdict-bearing parent by task_id
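The extraction-class JOIN could look roughly like this; the record shapes and function name are illustrative, with only the task_id join key taken from the text:

```typescript
// Extraction-class rows (distilled_*, audit_facts, observer_escalations)
// carry no verdict of their own; borrow one from a verdict-bearing parent
// that shares the same task_id. Unmatched rows stay unscored.
interface ExtractionRow { id: string; task_id: string }
interface ParentVerdict { task_id: string; category: string }

function joinToVerdict(
  rows: ExtractionRow[],
  parents: ParentVerdict[],
): Array<ExtractionRow & { category: string }> {
  const verdictByTask = new Map(parents.map((p) => [p.task_id, p.category]));
  const joined: Array<ExtractionRow & { category: string }> = [];
  for (const row of rows) {
    const category = verdictByTask.get(row.task_id);
    if (category !== undefined) joined.push({ ...row, category });
  }
  return joined;
}
```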
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
73 lines
3.2 KiB
TypeScript
// RagSample — entry in exports/rag/playbooks.jsonl. Spec shape exactly,
// plus provenance + success_score (so the index can re-rank by quality).
import {
  ValidationResult, requireString, requireNumber, requireIsoTimestamp, requireProvenance, requireStringArray,
} from "./types";

export const RAG_SAMPLE_SCHEMA_VERSION = 1;

// Allowed source_category values. RAG accepts accepted/partial freely;
// needs_human_review is opt-in (must be tagged so consumers can filter
// it out for SFT).
export const RAG_ALLOWED_CATEGORIES = ["accepted", "partially_accepted", "needs_human_review"] as const;
export type RagSourceCategory = (typeof RAG_ALLOWED_CATEGORIES)[number];

export interface RagSample {
  schema_version: number;
  id: string;
  title: string;
  content: string;
  tags: string[];
  source_run_id: string;
  // Snapshot of the score the source carried at export time. Lets a
  // consumer see "this was partial" without re-reading scored-runs.
  success_score: RagSourceCategory;
  // Same value as success_score by spec (now.md asks for both fields).
  // Kept distinct so future schemas can diverge them (e.g. an
  // "is_review_material" flag) without breaking old consumers.
  source_category: RagSourceCategory;
  embedding_text: string; // the text to embed (often == content but can be shorter)
  created_at: string;
  provenance: { source_file: string; line_offset?: number; sig_hash: string; recorded_at: string };
}

export function validateRagSample(input: unknown): ValidationResult<RagSample> {
  const errors: string[] = [];
  if (typeof input !== "object" || input === null) return { valid: false, errors: ["expected object"] };
  const r = input as Record<string, unknown>;
  let ok = true;

  if (r.schema_version !== RAG_SAMPLE_SCHEMA_VERSION) {
    errors.push(`schema_version: expected ${RAG_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`);
    ok = false;
  }
  ok = requireString(r.id, "id", errors) && ok;
  ok = requireString(r.title, "title", errors) && ok;
  ok = requireString(r.content, "content", errors) && ok;
  ok = requireString(r.embedding_text, "embedding_text", errors) && ok;
  ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
  ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
  ok = requireStringArray(r.tags, "tags", errors) && ok;
  ok = requireProvenance(r.provenance, "provenance", errors) && ok;

  if (!RAG_ALLOWED_CATEGORIES.includes(r.success_score as RagSourceCategory)) {
    errors.push(`success_score: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")} (rejected never enters RAG)`);
    ok = false;
  }
  if (!RAG_ALLOWED_CATEGORIES.includes(r.source_category as RagSourceCategory)) {
    errors.push(`source_category: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")}`);
    ok = false;
  }
  if (r.success_score !== r.source_category) {
    errors.push("success_score and source_category must match (mirrored fields per spec)");
    ok = false;
  }
  if (typeof r.content === "string" && r.content.trim().length === 0) {
    errors.push("content: must be non-whitespace");
    ok = false;
  }

  if (!ok) return { valid: false, errors };
  return { valid: true, value: r as unknown as RagSample };
}