// quarantine.ts — shared sink for records the exporters refuse to emit.
//
// Every exporter routes skipped records here with a structured reason
// + the original record + provenance back to the source. Spec
// non-negotiable: no silent drops. If a record can't ship, it must be
// observable here.
//
// Path: <root>/exports/quarantine/<exporter>.jsonl (one file per exporter,
// append-mode, JSONL lines).

import { mkdirSync, appendFileSync, existsSync, readFileSync } from "node:fs";
import { resolve, dirname } from "node:path";

/** Closed list of reasons a record may be quarantined. */
export const QUARANTINE_REASONS = [
  "missing_provenance",
  "missing_source_run_id",
  "empty_content",
  "schema_violation",
  "unsafe_sft_category", // rejected/needs_human_review tried to enter SFT
  "unsafe_rag_category", // rejected tried to enter RAG
  "invalid_preference_pairing", // pair shares no comparable signal
  "hallucinated_file_path", // referenced file doesn't exist on disk
  "duplicate_id", // id collision within the same export
  "self_pairing", // chosen == rejected (preference)
  "category_disallowed", // exporter-specific category gate
] as const;

/** Union of the string literals listed in {@link QUARANTINE_REASONS}. */
export type QuarantineReason = (typeof QUARANTINE_REASONS)[number];

/** One quarantined record, serialized as a single JSONL line. */
export interface QuarantineEntry {
  exporter: "rag" | "sft" | "preference";
  reason: QuarantineReason;
  source_record: Record<string, unknown>; // the scored-run that was rejected
  errors: string[]; // detailed error list (from validators or pairing logic)
  recorded_at: string; // ISO 8601
  // Provenance carried over from the source so the quarantine row can
  // be traced back to the underlying evidence/scored-run.
  source_provenance?: {
    source_file?: string;
    line_offset?: number;
    sig_hash?: string;
  };
}

/** Exporter identifiers accepted by {@link QuarantineWriter}. */
export type QuarantineExporter = QuarantineEntry["exporter"];

/** Builds a counter map holding 0 for every known quarantine reason. */
function zeroedCounts(): Record<QuarantineReason, number> {
  const counts = {} as Record<QuarantineReason, number>;
  for (const reason of QUARANTINE_REASONS) counts[reason] = 0;
  return counts;
}

/**
 * Appends quarantine entries for a single exporter to its JSONL file, or
 * buffers them in memory when constructed with `dry_run = true`.
 */
export class QuarantineWriter {
  private readonly root: string;
  private readonly exporter: QuarantineExporter;
  private readonly path: string;
  private readonly dry_run: boolean;

  // Counts by reason so the exporter can emit a summary without reading
  // the file back.
  public readonly counts: Record<QuarantineReason, number> = zeroedCounts();
  public total = 0;

  // Buffer in dry_run so callers can still see what would have been
  // quarantined.
  public readonly buffered: QuarantineEntry[] = [];

  /**
   * @param root Directory under which `exports/quarantine/` is resolved.
   * @param exporter Exporter name; also the output file's base name.
   * @param dry_run When true, nothing is written to disk; entries are
   *   collected in `buffered` instead.
   */
  constructor(root: string, exporter: QuarantineExporter, dry_run = false) {
    this.root = root;
    this.exporter = exporter;
    this.path = resolve(root, "exports/quarantine", `${exporter}.jsonl`);
    this.dry_run = dry_run;
  }

  /**
   * Records one quarantined entry: bumps the per-reason and total counters,
   * then either buffers the entry (dry run) or appends it as one JSON line.
   * The `exporter` field is always taken from this writer.
   *
   * @param entry Entry fields other than `exporter`.
   * @throws Whatever `mkdirSync` / `appendFileSync` throw on I/O failure
   *   (non-dry-run only); the counters have already been incremented.
   */
  add(
    entry: Omit<QuarantineEntry, "exporter" | "recorded_at"> & {
      recorded_at: string;
    },
  ): void {
    const full: QuarantineEntry = {
      exporter: this.exporter,
      reason: entry.reason,
      source_record: entry.source_record,
      errors: entry.errors,
      recorded_at: entry.recorded_at,
      source_provenance: entry.source_provenance,
    };

    this.counts[full.reason]++;
    this.total++;

    if (this.dry_run) {
      this.buffered.push(full);
      return;
    }

    // `recursive: true` makes this a no-op when the directory already exists.
    mkdirSync(dirname(this.path), { recursive: true });
    appendFileSync(this.path, JSON.stringify(full) + "\n");
  }

  /**
   * Summary string useful for CLI output / reports.
   *
   * @returns `"0 quarantined"` when nothing was added, otherwise
   *   `"<total> quarantined (<reason>=<n> ...)"` listing only non-zero
   *   reasons in `QUARANTINE_REASONS` order.
   */
  summary(): string {
    if (this.total === 0) return "0 quarantined";
    const parts = Object.entries(this.counts)
      .filter(([, n]) => n > 0)
      .map(([r, n]) => `${r}=${n}`)
      .join(" ");
    return `${this.total} quarantined (${parts})`;
  }

  /** Absolute path of the JSONL file this writer appends to. */
  outputPath(): string {
    return this.path;
  }
}

/**
 * Pulls `source_provenance.sig_hash` out of an already-parsed JSONL row.
 * Returns undefined unless the value is a non-empty string.
 */
function sigHashOf(row: unknown): string | undefined {
  if (typeof row !== "object" || row === null) return undefined;
  const provenance = (row as { source_provenance?: unknown }).source_provenance;
  if (typeof provenance !== "object" || provenance === null) return undefined;
  const sig = (provenance as { sig_hash?: unknown }).sig_hash;
  return typeof sig === "string" && sig.length > 0 ? sig : undefined;
}

/**
 * Helper: load existing quarantine entries to dedupe by sig_hash on
 * re-runs. Only used when the caller wants per-record idempotency.
 *
 * @param quarantine_path Path to a quarantine JSONL file.
 * @returns The set of string `sig_hash` values found; empty when the file
 *   does not exist. Blank lines, malformed JSON and rows without a string
 *   `sig_hash` are skipped (best-effort read).
 */
export function loadQuarantinedSigs(quarantine_path: string): Set<string> {
  const seen = new Set<string>();
  if (!existsSync(quarantine_path)) return seen;

  for (const line of readFileSync(quarantine_path, "utf8").split("\n")) {
    if (!line) continue;

    let row: unknown;
    try {
      row = JSON.parse(line);
    } catch {
      // Malformed line — deliberately skipped so one bad row cannot block
      // a re-run.
      continue;
    }

    const sig = sigHashOf(row);
    if (sig !== undefined) seen.add(sig);
  }
  return seen;
}