// lakehouse/scripts/distillation/distill.ts

// distill.ts — single-entry CLI dispatcher for the distillation
// pipeline. Mirrors the spec's `./scripts/distill <command>` shape.
//
// USAGE
//   bun run scripts/distillation/distill.ts <command> [flags]
//
// COMMANDS
//   build-evidence     materialize EvidenceRecord rows from data/_kb/*.jsonl
//   score              run deterministic Success Scorer
//   export-rag         RAG export (--include-review opt-in)
//   export-sft         SFT export (--include-partial opt-in)
//   export-preference  preference export
//   export-all         RAG + SFT + preference (no opt-ins by default)
//   health             evidence health audit (no audit is wired into this dispatcher yet)
//
// All commands accept --dry-run.

import { materializeAll } from "./build_evidence_index";
import { scoreAll } from "./score_runs";
import { exportRag } from "./export_rag";
import { exportSft } from "./export_sft";
import { exportPreference } from "./export_preference";
import { TRANSFORMS } from "./transforms";

// Root directory handed to every pipeline step. The LH_DISTILL_ROOT
// environment variable wins whenever it is defined (an empty string is kept
// as-is); otherwise the hard-coded checkout path below is used.
const { LH_DISTILL_ROOT } = process.env;
const DEFAULT_ROOT = LH_DISTILL_ROOT ?? "/home/profit/lakehouse";

/** Prints the command and flag summary to stdout. */
function printUsage(): void {
  console.log("Usage: bun run scripts/distillation/distill.ts <command> [flags]");
  console.log("");
  console.log("Commands:");
  console.log("  build-evidence     materialize EvidenceRecord rows");
  console.log("  score              run deterministic Success Scorer");
  console.log("  export-rag         RAG export (--include-review opt-in)");
  console.log("  export-sft         SFT export (--include-partial opt-in)");
  console.log("  export-preference  preference export");
  console.log("  export-all         RAG + SFT + preference");
  console.log("");
  console.log("Flags: --dry-run, --include-partial, --include-review");
}

/**
 * CLI entry point.
 *
 * Reads the command from `process.argv[2]`, derives the boolean flags from
 * the presence of `--dry-run`, `--include-partial` and `--include-review`
 * anywhere in argv, and dispatches to the matching pipeline step. Every step
 * invoked by one run receives the same `recorded_at` ISO timestamp.
 *
 * Exit codes set explicitly here:
 *   1 — `build-evidence` finished with `receipt.validation_pass` falsy
 *   2 — unknown command, or `health` (no audit is wired into this file)
 *
 * Rejections from the pipeline steps are not caught here; they propagate to
 * whoever awaits the returned promise.
 */
async function main(): Promise<void> {
  const cmd = process.argv[2];
  const dry_run = process.argv.includes("--dry-run");
  const include_partial = process.argv.includes("--include-partial");
  const include_review = process.argv.includes("--include-review");
  // One timestamp per invocation so all steps of a run share it.
  const recorded_at = new Date().toISOString();

  switch (cmd) {
    case "build-evidence": {
      const r = await materializeAll({ root: DEFAULT_ROOT, transforms: TRANSFORMS, recorded_at, dry_run });
      console.log(`[build-evidence] in=${r.totals.rows_read} out=${r.totals.rows_written} skip=${r.totals.rows_skipped} dedup=${r.totals.rows_deduped}`);
      // The receipt path is only reported for real (non-dry) runs.
      if (!dry_run) console.log(`[build-evidence] receipt: ${r.receipt_path}`);
      // Surface a failed validation to the calling shell.
      if (!r.receipt.validation_pass) process.exit(1);
      break;
    }
    case "score": {
      const r = await scoreAll({ root: DEFAULT_ROOT, recorded_at, dry_run });
      const c = r.totals.by_category;
      // Categories missing from the totals are printed as 0.
      console.log(`[score] in=${r.totals.rows_read} out=${r.totals.rows_written} acc=${c.accepted ?? 0} part=${c.partially_accepted ?? 0} rej=${c.rejected ?? 0} hum=${c.needs_human_review ?? 0}`);
      if (!dry_run) console.log(`[score] receipt: ${r.receipt_path}`);
      break;
    }
    case "export-rag": {
      const r = await exportRag({ root: DEFAULT_ROOT, recorded_at, include_review, dry_run });
      console.log(`[export-rag] in=${r.records_read} out=${r.records_exported} ${r.quarantine_summary}`);
      console.log(`[export-rag] output: ${r.output_path}${include_review ? " (review included)" : ""}`);
      break;
    }
    case "export-sft": {
      const r = await exportSft({ root: DEFAULT_ROOT, recorded_at, include_partial, dry_run });
      console.log(`[export-sft] in=${r.records_read} out=${r.records_exported} ${r.quarantine_summary}`);
      console.log(`[export-sft] output: ${r.output_path}${include_partial ? " (partial included)" : ""}`);
      break;
    }
    case "export-preference": {
      const r = await exportPreference({ root: DEFAULT_ROOT, recorded_at, dry_run });
      console.log(`[export-preference] in=${r.records_read} pairs=${r.pairs_exported} task_ids_paired=${r.task_ids_with_pairs} ${r.quarantine_summary}`);
      console.log(`[export-preference] output: ${r.output_path}`);
      break;
    }
    case "export-all": {
      // Runs the three exports one after another, then prints a combined summary.
      const rRag = await exportRag({ root: DEFAULT_ROOT, recorded_at, include_review, dry_run });
      const rSft = await exportSft({ root: DEFAULT_ROOT, recorded_at, include_partial, dry_run });
      const rPref = await exportPreference({ root: DEFAULT_ROOT, recorded_at, dry_run });
      console.log("");
      console.log("─── export-all summary ───");
      console.log(`  RAG:        in=${rRag.records_read} out=${rRag.records_exported} ${rRag.quarantine_summary}`);
      console.log(`  SFT:        in=${rSft.records_read} out=${rSft.records_exported} ${rSft.quarantine_summary}`);
      console.log(`  Preference: in=${rPref.records_read} pairs=${rPref.pairs_exported} ${rPref.quarantine_summary}`);
      break;
    }
    case "health": {
      // The file header lists `health`, but this dispatcher neither imports
      // nor calls any audit. Previously this case shared the help branch and
      // exited 0, which a script could mistake for a completed audit. Fail
      // loudly instead, using the same exit code as an unknown command.
      console.error("health: not implemented in this dispatcher. Try 'help'.");
      process.exit(2); // never returns, so there is no fall-through into "help"
    }
    case "help":
    case undefined: {
      printUsage();
      break;
    }
    default:
      console.error(`unknown command: ${cmd}. Try 'help'.`);
      process.exit(2);
  }
}

// Start the CLI. Any rejection from main() is printed to stderr and turned
// into exit status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});