lakehouse/tests/distillation/score_runs.test.ts

// Integration test: score_runs.ts CLI pipeline. Synthesizes evidence
// records, runs scoreAll, asserts behavior on the materialized scored
// runs + receipt.

import { test, expect, beforeEach, afterEach } from "bun:test";
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
import { resolve } from "node:path";

import { scoreAll } from "../../scripts/distillation/score_runs";
import { EVIDENCE_SCHEMA_VERSION, type EvidenceRecord } from "../../auditor/schemas/distillation/evidence_record";
import { validateScoredRun } from "../../auditor/schemas/distillation/scored_run";
import { validateReceipt } from "../../auditor/schemas/distillation/receipt";

const TMP = "/tmp/distillation_test_phase3";
const RECORDED = "2026-04-26T22:30:00.000Z";
const SHA = "0".repeat(64);

function makeEv(opts: Partial<EvidenceRecord> & { source_stem: string }): EvidenceRecord {
  return {
    run_id: opts.run_id ?? `run-${Math.random()}`,
    task_id: opts.task_id ?? "task-test",
    timestamp: opts.timestamp ?? RECORDED,
    schema_version: EVIDENCE_SCHEMA_VERSION,
    provenance: {
      source_file: `data/_kb/${opts.source_stem}.jsonl`,
      line_offset: 0,
      sig_hash: SHA,
      recorded_at: RECORDED,
    },
    ...opts,
  } as EvidenceRecord;
}

function writeEvidence(ev: EvidenceRecord[], stem: string) {
  const partition = "2026/04/27";
  const dir = resolve(TMP, "data/evidence", partition);
  mkdirSync(dir, { recursive: true });
  writeFileSync(resolve(dir, `${stem}.jsonl`), ev.map(r => JSON.stringify(r)).join("\n") + "\n");
}

function setup() {
  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
  mkdirSync(resolve(TMP, "data/_kb"), { recursive: true });

  // Mix of every category across sources
  writeEvidence([
    makeEv({ source_stem: "scrum_reviews", run_id: "s1", success_markers: ["accepted_on_attempt_1"] }),
    makeEv({ source_stem: "scrum_reviews", run_id: "s2", success_markers: ["accepted_on_attempt_3"] }),
    makeEv({ source_stem: "scrum_reviews", run_id: "s3" }),  // no markers → human
  ], "scrum_reviews");

  writeEvidence([
    makeEv({ source_stem: "audits", run_id: "a1", success_markers: ["approved"] }),
    makeEv({ source_stem: "audits", run_id: "a2", failure_markers: ["blocked"] }),
    makeEv({ source_stem: "audits", run_id: "a3", failure_markers: ["request_changes"] }),
  ], "audits");

  writeEvidence([
    makeEv({ source_stem: "auto_apply", run_id: "ap1", success_markers: ["committed"] }),
    makeEv({ source_stem: "auto_apply", run_id: "ap2", failure_markers: ["build_red_reverted"] }),
    makeEv({ source_stem: "auto_apply", run_id: "ap3" }),
  ], "auto_apply");

  writeEvidence([
    makeEv({ source_stem: "distilled_facts", run_id: "df1", text: "extracted fact" }),
  ], "distilled_facts");
}

beforeEach(setup);
afterEach(() => { if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true }); });

test("score_runs: emits ScoredRun for every EvidenceRecord", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
  expect(r.totals.rows_read).toBe(10);
  expect(r.totals.rows_written).toBe(10);
  expect(r.totals.rows_skipped).toBe(0);
});

test("score_runs: category distribution matches expected per source", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
  // 1 (s1) + 1 (a1) + 1 (ap1) = 3 accepted
  // 1 (s2) + 1 (a3) = 2 partial
  // 1 (a2) + 1 (ap2) = 2 rejected
  // 1 (s3) + 1 (ap3) + 1 (df1) = 3 needs_human
  expect(r.totals.by_category.accepted).toBe(3);
  expect(r.totals.by_category.partially_accepted).toBe(2);
  expect(r.totals.by_category.rejected).toBe(2);
  expect(r.totals.by_category.needs_human_review).toBe(3);
});

test("score_runs: every output row validates against ScoredRun schema", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });
  const dir = resolve(TMP, "data/scored-runs/2026/04/27");
  for (const stem of ["scrum_reviews", "audits", "auto_apply", "distilled_facts"]) {
    const path = resolve(dir, `${stem}.jsonl`);
    expect(existsSync(path)).toBe(true);
    const lines = readFileSync(path, "utf8").trim().split("\n").filter(Boolean);
    for (const line of lines) {
      const v = validateScoredRun(JSON.parse(line));
      expect(v.valid).toBe(true);
    }
  }
});

test("score_runs: idempotent — second run produces 0 new writes", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });
  const r2 = await scoreAll({ root: TMP, recorded_at: RECORDED });
  expect(r2.totals.rows_written).toBe(0);
  expect(r2.totals.rows_deduped).toBe(10);
});

test("score_runs: receipt validates and pins git_sha + record_counts + by_category", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
  const v = validateReceipt(r.receipt);
  expect(v.valid).toBe(true);
  expect(r.receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);
  expect(r.receipt.record_counts.in).toBe(10);
  expect(r.receipt.record_counts.out).toBe(10);
  expect(r.receipt.record_counts.cat_accepted).toBe(3);
  expect(r.receipt.record_counts.cat_partially_accepted).toBe(2);
  expect(r.receipt.record_counts.cat_rejected).toBe(2);
  expect(r.receipt.record_counts.cat_needs_human_review).toBe(3);
  expect(r.receipt.validation_pass).toBe(true);  // 0 skips
});

test("score_runs: every output row carries provenance + reasons + scorer_version", async () => {
  await scoreAll({ root: TMP, recorded_at: RECORDED });
  const path = resolve(TMP, "data/scored-runs/2026/04/27/scrum_reviews.jsonl");
  const rows = readFileSync(path, "utf8").trim().split("\n").map(l => JSON.parse(l));
  for (const row of rows) {
    expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
    expect(row.reasons.length).toBeGreaterThan(0);
    expect(row.scorer_version).toBeTruthy();
  }
});

test("score_runs: malformed evidence row is skipped, valid rows still process", async () => {
  // Inject a malformed line into one of the evidence files
  const path = resolve(TMP, "data/evidence/2026/04/27/scrum_reviews.jsonl");
  const existing = readFileSync(path, "utf8");
  writeFileSync(path, existing + "{not valid json\n");

  const r = await scoreAll({ root: TMP, recorded_at: RECORDED });
  expect(r.totals.rows_skipped).toBe(1);
  expect(r.totals.rows_written).toBe(10);  // valid rows unaffected
  expect(r.receipt.validation_pass).toBe(false);  // skips > 0
  expect(existsSync(r.skips_path)).toBe(true);
  const skipBody = readFileSync(r.skips_path, "utf8");
  expect(skipBody).toContain("evidence not JSON");
});

test("score_runs: dry-run reports counts but writes no scored-runs", async () => {
  const r = await scoreAll({ root: TMP, recorded_at: RECORDED, dry_run: true });
  expect(r.totals.rows_written).toBe(10);
  const scoredDir = resolve(TMP, "data/scored-runs");
  expect(existsSync(scoredDir)).toBe(false);
});