lakehouse/scripts/distillation/quarantine.ts

// quarantine.ts — shared sink for records the exporters refuse to emit.
//
// Every exporter routes skipped records here with a structured reason
// + the original record + provenance back to the source. Spec
// non-negotiable: no silent drops. If a record can't ship, it must be
// observable here.
//
// Path: exports/quarantine/<exporter>.jsonl (one file per exporter,
// append-mode, JSONL lines).

import { mkdirSync, appendFileSync, existsSync, readFileSync } from "node:fs";
import { resolve, dirname } from "node:path";

export const QUARANTINE_REASONS = [
  "missing_provenance",
  "missing_source_run_id",
  "empty_content",
  "schema_violation",
  "unsafe_sft_category",            // rejected/needs_human_review tried to enter SFT
  "unsafe_rag_category",            // rejected tried to enter RAG
  "invalid_preference_pairing",     // pair shares no comparable signal
  "hallucinated_file_path",         // referenced file doesn't exist on disk
  "duplicate_id",                   // id collision within the same export
  "self_pairing",                   // chosen == rejected (preference)
  "category_disallowed",            // exporter-specific category gate
] as const;
export type QuarantineReason = (typeof QUARANTINE_REASONS)[number];

export interface QuarantineEntry {
  exporter: "rag" | "sft" | "preference";
  reason: QuarantineReason;
  source_record: Record<string, unknown>;   // the scored-run that was rejected
  errors: string[];                          // detailed error list (from validators or pairing logic)
  recorded_at: string;                       // ISO 8601
  // Provenance carried over from the source so the quarantine row can
  // be traced back to the underlying evidence/scored-run.
  source_provenance?: {
    source_file?: string;
    line_offset?: number;
    sig_hash?: string;
  };
}

export class QuarantineWriter {
  private root: string;
  private exporter: "rag" | "sft" | "preference";
  private path: string;
  private dry_run: boolean;
  // Counts by reason so the exporter can emit a summary without reading
  // the file back.
  public readonly counts: Record<QuarantineReason, number> = QUARANTINE_REASONS.reduce(
    (acc, r) => { acc[r] = 0; return acc; },
    {} as Record<QuarantineReason, number>,
  );
  public total = 0;
  // Buffer in dry_run so callers can still see what would have been
  // quarantined.
  public readonly buffered: QuarantineEntry[] = [];

  constructor(root: string, exporter: "rag" | "sft" | "preference", dry_run = false) {
    this.root = root;
    this.exporter = exporter;
    this.path = resolve(root, "exports/quarantine", `${exporter}.jsonl`);
    this.dry_run = dry_run;
  }

  add(entry: Omit<QuarantineEntry, "recorded_at" | "exporter"> & { recorded_at: string }) {
    const full: QuarantineEntry = {
      exporter: this.exporter,
      reason: entry.reason,
      source_record: entry.source_record,
      errors: entry.errors,
      recorded_at: entry.recorded_at,
      source_provenance: entry.source_provenance,
    };
    this.counts[full.reason]++;
    this.total++;
    if (this.dry_run) {
      this.buffered.push(full);
    } else {
      mkdirSync(dirname(this.path), { recursive: true });
      appendFileSync(this.path, JSON.stringify(full) + "\n");
    }
  }

  // Summary string useful for CLI output / reports.
  summary(): string {
    if (this.total === 0) return "0 quarantined";
    const parts = Object.entries(this.counts)
      .filter(([, n]) => n > 0)
      .map(([r, n]) => `${r}=${n}`)
      .join(" ");
    return `${this.total} quarantined (${parts})`;
  }

  outputPath(): string {
    return this.path;
  }
}

// Helper: load existing quarantine entries to dedupe by sig_hash on
// re-runs. Only used when the caller wants per-record idempotency.
export function loadQuarantinedSigs(quarantine_path: string): Set<string> {
  const seen = new Set<string>();
  if (!existsSync(quarantine_path)) return seen;
  for (const line of readFileSync(quarantine_path, "utf8").split("\n")) {
    if (!line) continue;
    try {
      const e = JSON.parse(line) as QuarantineEntry;
      if (e.source_provenance?.sig_hash) seen.add(e.source_provenance.sig_hash);
    } catch { /* malformed — skip */ }
  }
  return seen;
}