// lakehouse/tests/distillation/build_evidence_index.test.ts

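// Phase 2 acceptance tests — pin the materializer's invariants:
//   1. Valid rows materialize; invalid rows go to skips with errors
//   2. Idempotency: re-running on same source yields zero new writes
//   3. Stability: identical input → byte-identical output (canonical hash)
//   4. Schema gating: rows that fail validateEvidenceRecord NEVER reach
//      data/evidence/<YYYY>/<MM>/<DD>/*.jsonl, only skips
//   5. Receipt: substantive (git_sha + sha256 + record_counts +
//      validation_pass), conforms to Receipt schema
//   6. JSON-parse failures handled gracefully
//   7. Clean run: all-valid sources yield validation_pass=true, zero skips
//   8. Dry run: counts are reported but no evidence or skips files are written
//   9. Missing source file: reported as rows_present=false, no crash
//  10. Provenance: every output row traces back to its source row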
//
// All tests run against a temp repo root with synthetic source jsonls
// and a custom TRANSFORMS list pointing at them. No live JSONLs touched.
//
// Run: bun test tests/distillation/build_evidence_index.test.ts

import { test, expect, beforeEach, afterEach } from "bun:test";
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync, readdirSync, statSync } from "node:fs";
import { tmpdir } from "node:os";
import { resolve } from "node:path";

import { materializeAll, type MaterializeOptions } from "../../scripts/distillation/build_evidence_index";
import type { TransformDef } from "../../scripts/distillation/transforms";
import { EVIDENCE_SCHEMA_VERSION, type ModelRole } from "../../auditor/schemas/distillation/evidence_record";
import { validateReceipt } from "../../auditor/schemas/distillation/receipt";

// Scratch repo root used by every test in this file. Anchored at the
// platform temp directory instead of a hard-coded "/tmp" so the suite
// also runs where "/tmp" is missing or not writable.
const TMP_ROOT = resolve(tmpdir(), "distillation_test_phase2");
// Fixed recorded_at handed to every materializeAll call below, so the
// provenance written to output never depends on the wall clock.
const RECORDED = "2026-04-26T22:30:00.000Z";

// Two synthetic transforms, one per fixture file written by setupRoot.
// Each maps a raw source row onto the EvidenceRecord shape. Source A can
// also emit a deliberately incomplete record so the validator has
// something to reject.
const TEST_TRANSFORMS: TransformDef[] = [
  {
    source_file_relpath: "data/_kb/synthetic_a.jsonl",
    transform: (ctx) => {
      const { row, line_offset, source_file_relpath, recorded_at, sig_hash } = ctx;
      const provenance = { source_file: source_file_relpath, line_offset, sig_hash, recorded_at };

      // Ordinary rows become complete executor records.
      if (!row.bad) {
        return {
          run_id: row.run_id,
          task_id: row.task_id,
          timestamp: row.ts,
          schema_version: EVIDENCE_SCHEMA_VERSION,
          provenance,
          text: row.text,
          model_role: "executor" as ModelRole,
        };
      }

      // Fixture rows flagged `bad: true` still yield a partial record, but
      // run_id (required) is left out on purpose so that
      // validateEvidenceRecord rejects it downstream.
      return {
        task_id: row.task_id,
        timestamp: row.ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance,
      } as any;
    },
  },
  {
    source_file_relpath: "data/_kb/synthetic_b.jsonl",
    transform: (ctx) => {
      const { row, line_offset, source_file_relpath, recorded_at, sig_hash } = ctx;
      const provenance = { source_file: source_file_relpath, line_offset, sig_hash, recorded_at };

      // Source B only ever carries valid rows; tag them as extractor output.
      return {
        run_id: row.run_id,
        task_id: row.task_id,
        timestamp: row.ts,
        schema_version: EVIDENCE_SCHEMA_VERSION,
        provenance,
        text: row.text,
        model_role: "extractor" as ModelRole,
      };
    },
  },
];

/**
 * Rebuilds the scratch repo under TMP_ROOT from nothing and writes the two
 * synthetic source files the test transforms point at.
 *
 * - synthetic_a.jsonl: 3 valid rows, 1 row flagged `bad` (no run_id), and
 *   one trailing line that is not valid JSON.
 * - synthetic_b.jsonl: 2 valid rows.
 */
function setupRoot() {
  // Drop anything a previous test (or an aborted run) left behind.
  if (existsSync(TMP_ROOT)) {
    rmSync(TMP_ROOT, { recursive: true, force: true });
  }
  mkdirSync(resolve(TMP_ROOT, "data/_kb"), { recursive: true });

  // One JSON document per line, without a trailing newline.
  const joinAsJsonLines = (rows: readonly object[]): string =>
    rows.map((entry) => JSON.stringify(entry)).join("\n");

  // Source A: 3 valid + 1 invalid, followed by 1 malformed JSON line.
  const sourceA = joinAsJsonLines([
    { run_id: "a1", task_id: "task1", ts: "2026-04-26T20:00:00.000Z", text: "first" },
    { run_id: "a2", task_id: "task2", ts: "2026-04-26T20:01:00.000Z", text: "second" },
    { run_id: "a3", task_id: "task3", ts: "2026-04-26T20:02:00.000Z", text: "third" },
    { bad: true, task_id: "fail-row", ts: "2026-04-26T20:03:00.000Z" },
  ]);
  writeFileSync(resolve(TMP_ROOT, "data/_kb/synthetic_a.jsonl"), sourceA + "\n{not valid json\n");

  // Source B: 2 valid rows.
  const sourceB = joinAsJsonLines([
    { run_id: "b1", task_id: "btask1", ts: "2026-04-26T20:10:00.000Z", text: "alpha" },
    { run_id: "b2", task_id: "btask2", ts: "2026-04-26T20:11:00.000Z", text: "beta" },
  ]);
  writeFileSync(resolve(TMP_ROOT, "data/_kb/synthetic_b.jsonl"), sourceB + "\n");
}

// Every test starts from a freshly built fixture tree...
beforeEach(() => {
  setupRoot();
});
// ...and removes it again afterwards so nothing leaks between tests.
afterEach(() => {
  const leftBehind = existsSync(TMP_ROOT);
  if (!leftBehind) return;
  rmSync(TMP_ROOT, { recursive: true, force: true });
});

// ─── Acceptance Test 1: valid rows materialize, invalid go to skips ──

test("materializer: 3 valid rows from source A reach evidence/, 1 invalid + 1 malformed go to skips", async () => {
  // One full pass over both fixture sources.
  const result = await materializeAll({
    root: TMP_ROOT,
    transforms: TEST_TRANSFORMS,
    recorded_at: RECORDED,
  });
  // Looks up the per-source tally by fixture file name.
  const tallyFor = (fileName: string) =>
    result.sources.find(entry => entry.source_file_relpath.endsWith(fileName))!;

  // Source A holds 5 lines: 3 good rows, 1 row without run_id, 1 line of broken JSON.
  const tallyA = tallyFor("synthetic_a.jsonl");
  expect(tallyA.rows_read).toBe(5);
  expect(tallyA.rows_written).toBe(3);
  expect(tallyA.rows_skipped).toBe(2);

  // Source B holds 2 good rows and nothing else.
  const tallyB = tallyFor("synthetic_b.jsonl");
  expect(tallyB.rows_read).toBe(2);
  expect(tallyB.rows_written).toBe(2);

  // Both rejection reasons must show up in the skips file.
  const skipsText = readFileSync(result.skips_path, "utf8");
  expect(skipsText).toContain("run_id");      // missing required field
  expect(skipsText).toContain("JSON.parse");  // malformed JSON

  // One evidence file per source is expected under the 2026/04/26 day partition.
  const dayDir = resolve(TMP_ROOT, "data/evidence", "2026/04/26");
  const outA = resolve(dayDir, "synthetic_a.jsonl");
  const outB = resolve(dayDir, "synthetic_b.jsonl");
  expect(existsSync(outA)).toBe(true);
  expect(existsSync(outB)).toBe(true);

  // The number of lines on disk equals rows_written, and each line carries
  // the schema version plus provenance with a 64-character sig_hash.
  const writtenLines = readFileSync(outA, "utf8").trim().split("\n");
  expect(writtenLines.length).toBe(3);
  writtenLines.forEach(raw => {
    const record = JSON.parse(raw);
    expect(record.schema_version).toBe(EVIDENCE_SCHEMA_VERSION);
    const prov = record.provenance;
    expect(prov.source_file).toBe("data/_kb/synthetic_a.jsonl");
    expect(typeof prov.sig_hash).toBe("string");
    expect(prov.sig_hash.length).toBe(64);
  });
});

// ─── Acceptance Test 2: idempotency ──────────────────────────────────

test("materializer: re-running on same source produces 0 new writes (idempotent)", async () => {
  // Build a fresh options object per call so the two runs share nothing
  // but the on-disk state.
  const sameInputs = (): MaterializeOptions => ({
    root: TMP_ROOT,
    transforms: TEST_TRANSFORMS,
    recorded_at: RECORDED,
  });

  await materializeAll(sameInputs());
  const rerun = await materializeAll(sameInputs());

  // The rerun sees the same rows again; all 3 valid ones from source A
  // must be reported as deduped rather than written a second time.
  const tallyA = rerun.sources.find(entry => entry.source_file_relpath.endsWith("synthetic_a.jsonl"))!;
  expect(tallyA.rows_written).toBe(0);
  expect(tallyA.rows_deduped).toBe(3);
});

// ─── Acceptance Test 3: stable sig_hash → byte-identical output ──────

test("materializer: identical input produces byte-identical output across runs", async () => {
  // First run: only the bytes on disk matter here, so the returned result
  // is intentionally not bound to a local.
  await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });
  const aPath = resolve(TMP_ROOT, "data/evidence/2026/04/26/synthetic_a.jsonl");
  const aBeforeBytes = readFileSync(aPath);

  // Wipe source A's output file and re-run with the same inputs, so the
  // materializer has to regenerate that file from scratch.
  rmSync(aPath);
  await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });
  const aAfterBytes = readFileSync(aPath);

  // Raw Buffer comparison: any difference in content or ordering fails.
  expect(aBeforeBytes.equals(aAfterBytes)).toBe(true);
});

// ─── Acceptance Test 4: schema gating ────────────────────────────────

test("materializer: rows failing validateEvidenceRecord NEVER reach evidence/, only skips", async () => {
  // Only the files on disk are inspected, so the run result is not kept.
  await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });

  const aOut = resolve(TMP_ROOT, "data/evidence/2026/04/26/synthetic_a.jsonl");
  const aRows = readFileSync(aOut, "utf8").trim().split("\n").filter(Boolean).map(l => JSON.parse(l));

  // Every output row has a non-empty run_id (the invalid row had no
  // run_id, so it MUST be absent from output).
  for (const row of aRows) {
    expect(typeof row.run_id).toBe("string");
    expect(row.run_id.length).toBeGreaterThan(0);
  }
  // Specifically: no row carries the failing fixture's task_id "fail-row"
  expect(aRows.find(row => row.task_id === "fail-row")).toBeUndefined();
});

// ─── Acceptance Test 5: receipt is substantive + schema-conforming ───

test("materializer: receipt has git_sha + sha256(input) + sha256(output) + record_counts and validates", async () => {
  const outcome = await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });
  const receipt = outcome.receipt;
  const sha256Hex = /^[0-9a-f]{64}$/;

  // The receipt must pass its own schema validator.
  const verdict = validateReceipt(receipt);
  expect(verdict.valid).toBe(true);

  // git_sha: exactly 40 lowercase hex characters.
  expect(receipt.git_sha).toMatch(/^[0-9a-f]{40}$/);

  // Inputs: one entry per source file, each with a digest and a positive size.
  expect(receipt.input_files.length).toBe(2);
  receipt.input_files.forEach(entry => {
    expect(entry.sha256).toMatch(sha256Hex);
    expect(typeof entry.bytes).toBe("number");
    expect(entry.bytes).toBeGreaterThan(0);
  });

  // Outputs: one entry per evidence file, each with a digest.
  expect(receipt.output_files.length).toBe(2);
  receipt.output_files.forEach(entry => {
    expect(entry.sha256).toMatch(sha256Hex);
  });

  // Totals across both sources.
  const counts = receipt.record_counts;
  expect(counts.in).toBe(7);    // 5 from A + 2 from B
  expect(counts.out).toBe(5);   // 3 + 2
  expect(counts.skipped).toBe(2); // both from A

  // validation_pass must be an explicit boolean, and false whenever
  // anything was skipped.
  expect(typeof receipt.validation_pass).toBe("boolean");
  expect(receipt.validation_pass).toBe(false);

  // The receipt was also persisted to disk.
  expect(existsSync(outcome.receipt_path)).toBe(true);
});

// ─── Acceptance Test 6: clean run sets validation_pass=true ──────────

test("materializer: with all-valid sources, validation_pass=true and skips=0", async () => {
  // Overwrite source A with a version that has neither the bad row nor
  // the malformed JSON line.
  const cleanJsonl = [
    { run_id: "c1", task_id: "ct1", ts: "2026-04-26T22:00:00.000Z", text: "clean" },
    { run_id: "c2", task_id: "ct2", ts: "2026-04-26T22:01:00.000Z", text: "clean2" },
  ]
    .map(entry => JSON.stringify(entry))
    .join("\n") + "\n";
  writeFileSync(resolve(TMP_ROOT, "data/_kb/synthetic_a.jsonl"), cleanJsonl);

  const outcome = await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });

  // Nothing to skip, so the receipt must report a passing validation.
  const { receipt } = outcome;
  expect(receipt.record_counts.skipped).toBe(0);
  expect(receipt.validation_pass).toBe(true);
});

// ─── Acceptance Test 7: dry-run does not write ───────────────────────

test("materializer: --dry-run reports counts but writes no evidence files", async () => {
  const outcome = await materializeAll({
    root: TMP_ROOT,
    transforms: TEST_TRANSFORMS,
    recorded_at: RECORDED,
    dry_run: true,
  });

  // The totals are still computed as in a real run.
  const { totals } = outcome;
  expect(totals.rows_read).toBe(7);
  expect(totals.rows_written).toBe(5);

  // Neither the evidence tree nor the skips file may exist afterwards.
  const mustNotExist = [
    resolve(TMP_ROOT, "data/evidence"),
    resolve(TMP_ROOT, "data/_kb/distillation_skips.jsonl"),
  ];
  for (const candidate of mustNotExist) {
    expect(existsSync(candidate)).toBe(false);
  }
});

// ─── Acceptance Test 8: missing source file does not crash ───────────

test("materializer: missing source file is tallied as rows_present=false, no error", async () => {
  // Remove source B before the run; source A stays as set up.
  rmSync(resolve(TMP_ROOT, "data/_kb/synthetic_b.jsonl"));

  const outcome = await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });
  const tallyFor = (fileName: string) =>
    outcome.sources.find(entry => entry.source_file_relpath.endsWith(fileName))!;

  // The absent file is reported as not present, with nothing read.
  const missing = tallyFor("synthetic_b.jsonl");
  expect(missing.rows_present).toBe(false);
  expect(missing.rows_read).toBe(0);

  // The remaining source is unaffected by its sibling's absence.
  const present = tallyFor("synthetic_a.jsonl");
  expect(present.rows_present).toBe(true);
  expect(present.rows_written).toBe(3);
});

// ─── Acceptance Test 9: provenance preserved on every row ────────────

test("materializer: every output row has provenance traceable to a source row", async () => {
  const r = await materializeAll({ root: TMP_ROOT, transforms: TEST_TRANSFORMS, recorded_at: RECORDED });

  // Counts the rows actually inspected, so the nested loops cannot pass
  // vacuously when no output files (or only empty ones) are listed.
  let rowsChecked = 0;

  for (const s of r.sources) {
    for (const out_path of s.output_files) {
      const lines = readFileSync(out_path, "utf8").trim().split("\n").filter(Boolean);
      for (const line of lines) {
        const row = JSON.parse(line);
        // Provenance must exist and point back at the source this output
        // file is listed under.
        expect(row.provenance).toBeTruthy();
        expect(row.provenance.source_file).toBe(s.source_file_relpath);
        expect(typeof row.provenance.line_offset).toBe("number");
        expect(row.provenance.sig_hash).toMatch(/^[0-9a-f]{64}$/);
        // recorded_at is the value passed into this run.
        expect(row.provenance.recorded_at).toBe(RECORDED);
        rowsChecked += 1;
      }
    }
  }

  // The fixtures contain valid rows, so at least one row must have been checked.
  expect(rowsChecked).toBeGreaterThan(0);
});