// RagSample — entry in exports/rag/playbooks.jsonl. Spec shape exactly,
// plus provenance + success_score (so the index can re-rank by quality).
import {
  ValidationResult,
  requireString,
  requireNumber,
  requireIsoTimestamp,
  requireProvenance,
  requireStringArray,
} from "./types";

/** Schema version that `validateRagSample` accepts; any other value is rejected. */
export const RAG_SAMPLE_SCHEMA_VERSION = 1;

/**
 * Allowed `source_category` / `success_score` values.
 *
 * RAG accepts accepted/partial freely; `needs_human_review` is opt-in (it must
 * be tagged so consumers can filter it out for SFT). "rejected" is
 * deliberately absent: rejected runs never enter RAG.
 */
export const RAG_ALLOWED_CATEGORIES = [
  "accepted",
  "partially_accepted",
  "needs_human_review",
] as const;

/** Union of the literal strings listed in `RAG_ALLOWED_CATEGORIES`. */
export type RagSourceCategory = (typeof RAG_ALLOWED_CATEGORIES)[number];

/** One record of the RAG playbook export. */
export interface RagSample {
  schema_version: number;
  id: string;
  title: string;
  content: string;
  tags: string[];
  source_run_id: string;
  // Snapshot of the score the source carried at export time. Lets a
  // consumer see "this was partial" without re-reading scored-runs.
  success_score: RagSourceCategory;
  // Same value as success_score by spec (now.md asks for both fields).
  // Kept distinct so future schemas can diverge them (e.g. an
  // "is_review_material" flag) without breaking old consumers.
  source_category: RagSourceCategory;
  // The text to embed (often == content but can be shorter).
  embedding_text: string;
  created_at: string;
  provenance: {
    source_file: string;
    line_offset?: number;
    sig_hash: string;
    recorded_at: string;
  };
}

/**
 * Validates an untrusted value against the `RagSample` shape.
 *
 * Every check below runs even after an earlier one has failed, so the
 * returned `errors` array describes all problems found by this function in
 * one pass rather than just the first.
 *
 * @param input - Arbitrary value, e.g. one parsed JSONL line.
 * @returns `{ valid: true, value }` when every check passes, otherwise
 *   `{ valid: false, errors }`. A non-object or `null` input short-circuits
 *   with the single error `"expected object"`.
 */
export function validateRagSample(input: unknown): ValidationResult {
  const errors: string[] = [];
  if (typeof input !== "object" || input === null) {
    return { valid: false, errors: ["expected object"] };
  }

  // `Record` needs both type arguments; values stay `unknown` so each field
  // has to be checked before it is trusted.
  const r = input as Record<string, unknown>;
  let ok = true;

  if (r.schema_version !== RAG_SAMPLE_SCHEMA_VERSION) {
    errors.push(
      `schema_version: expected ${RAG_SAMPLE_SCHEMA_VERSION}, got ${JSON.stringify(r.schema_version)}`,
    );
    ok = false;
  }

  // The helper call is written on the left of `&&` on purpose: it is always
  // evaluated, even when `ok` is already false.
  // NOTE(review): the require* helpers live in ./types; presumably each one
  // appends its own message to `errors` on failure — confirm there.
  ok = requireString(r.id, "id", errors) && ok;
  ok = requireString(r.title, "title", errors) && ok;
  ok = requireString(r.content, "content", errors) && ok;
  ok = requireString(r.embedding_text, "embedding_text", errors) && ok;
  ok = requireString(r.source_run_id, "source_run_id", errors) && ok;
  ok = requireIsoTimestamp(r.created_at, "created_at", errors) && ok;
  ok = requireStringArray(r.tags, "tags", errors) && ok;
  ok = requireProvenance(r.provenance, "provenance", errors) && ok;

  if (!RAG_ALLOWED_CATEGORIES.includes(r.success_score as RagSourceCategory)) {
    errors.push(
      `success_score: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")} (rejected never enters RAG)`,
    );
    ok = false;
  }
  if (!RAG_ALLOWED_CATEGORIES.includes(r.source_category as RagSourceCategory)) {
    errors.push(`source_category: must be one of ${RAG_ALLOWED_CATEGORIES.join("|")}`);
    ok = false;
  }

  // The two fields are mirrors of each other in this schema version.
  if (r.success_score !== r.source_category) {
    errors.push("success_score and source_category must match (mirrored fields per spec)");
    ok = false;
  }

  // Reject content that is a string but contains nothing except whitespace.
  const content = r.content;
  if (typeof content === "string" && content.trim().length === 0) {
    errors.push("content: must be non-whitespace");
    ok = false;
  }

  if (!ok) return { valid: false, errors };

  // The compiler cannot follow the runtime checks above, hence the cast.
  return { valid: true, value: r as unknown as RagSample };
}