lakehouse/tests/distillation/replay.test.ts
distillation: Phase 7 — replay-driven local model bootstrapping
Runtime layer that takes a task → retrieves matching playbooks/RAG
records → builds a structured context bundle → feeds it to a LOCAL
model (qwen3.5:latest, ~7B class) → validates output → escalates only
when needed → logs the full run as new evidence. NOT model training.
Pure runtime behavior shaping via retrieval against the Phase 0-6
distillation substrate.
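
A minimal sketch of the run record this layer appears to produce, inferred from the test assertions below (the ContextBundle sub-shapes are assumptions, not the shipped replay.ts types):

```typescript
// Sketch only: run-record shape inferred from replay.test.ts assertions.
type ContextBundle = {
  retrieved_playbooks: unknown[];
  prior_successful_outputs: { success_score: string }[];
  failure_patterns: { success_score: string }[];
  validation_steps: string[];                 // extracted verify/check/assert lines
};

type ReplayRun = {
  task_hash: string;                          // canonical sha256 of the task input
  retrieved_artifacts: { rag_ids: string[] }; // provenance: which records grounded the run
  context_bundle: ContextBundle | null;       // null under --no-retrieval
  validation_result: { passed: boolean; reasons: string[] };
  escalation_path: string[];                  // models tried, in order
};
```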

Files (3 new + 1 modified):
  scripts/distillation/replay.ts             ~370 lines
  tests/distillation/replay.test.ts          10 tests, 19 expects
  scripts/distillation/distill.ts            +replay subcommand
  reports/distillation/phase7-replay-report.md

Test metrics: 145 cumulative distillation tests pass · 0 fail · 372 expects · 618ms

Real-data A/B on 3 tasks (same qwen3.5:latest local model, with vs
without retrieval) — proves the spec claim "local model improves
with retrieval":

Task 1 "Audit phase 38 provider routing":
  WITH retrieval:    cited V1State, openrouter, /v1/chat, ProviderAdapter,
                      PRD.md line ranges — REAL Lakehouse internals
  WITHOUT retrieval: invented "P99999, Z99999 placeholder codes" and
                      "production routing table" — pure fabrication

Task 2 "Verify pr_audit mode wired":
  WITH:    correct crates/gateway/src/main.rs path + lakehouse_answers_v1
  WITHOUT: same conclusion, asserted confidently with no supporting evidence

Task 3 "Audit phase 40 PRD circuit breaker drift":
  WITH:    anchored on the actual audit finding "no breaker class found"
  WITHOUT: invented "0.0% failure rate vs 5.0% threshold" and signed
            off as PASS on broken code — exact failure mode the
            distillation pipeline was built to prevent

Both runs passed the structural validation gate (length, no hedges,
checklist token overlap) — the difference is grounding, supplied by
the retrieval layer pulling from exports/rag/playbooks.jsonl (446
records from earlier Phase 4 export).
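
A minimal sketch of the kind of deterministic structural gate described above; the length threshold, hedge patterns, and 50% overlap cutoff are illustrative assumptions, not the shipped gate:

```typescript
// Illustrative structural gate: plain deterministic code, never an LLM.
// Thresholds and hedge list are assumptions for this sketch.
const HEDGES = [/\bmight\b/i, /\bperhaps\b/i, /\bi think\b/i, /\bnot sure\b/i];

function validateStructurally(
  output: string,
  checklist: string[], // tokens the output is expected to mention
): { passed: boolean; reasons: string[] } {
  const reasons: string[] = [];
  if (output.trim().length < 80) reasons.push("too short / empty response");
  if (HEDGES.some(h => h.test(output))) reasons.push("hedging language");
  // Checklist token overlap against a tokenized copy of the output
  const out = new Set(output.toLowerCase().split(/[^a-z0-9/._-]+/));
  const hit = checklist.filter(t => out.has(t.toLowerCase())).length;
  if (checklist.length > 0 && hit / checklist.length < 0.5)
    reasons.push("checklist token overlap below 50%");
  return { passed: reasons.length === 0, reasons };
}
```

Note how this catches hedges and empty output but not plausible-wrong content, which is exactly the grounding gap the retrieval layer fills.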

Architecture:
  jaccard token overlap against rag corpus → top-K (default 8) split
  into accepted exemplars (top 3) + partial-warnings (top 2) + extracted
  validation_steps (lines starting verify|check|assert|ensure|confirm)
  → prompt assembly → qwen3.5:latest via /v1/chat (or OpenRouter
  for namespaced/free models) → deterministic validation gate →
  escalation to deepseek-v3.1:671b on fail with --allow-escalation
  → log to data/_kb/replay_runs.jsonl
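
The jaccard retrieval step can be sketched in a few lines (a minimal sketch, not the shipped replay.ts; RagRecord mirrors the test fixture shape below):

```typescript
// Illustrative jaccard token-overlap ranking over the RAG corpus.
type RagRecord = { id: string; title: string; content: string };

function tokens(s: string): Set<string> {
  return new Set(s.toLowerCase().split(/[^a-z0-9]+/).filter(Boolean));
}

function jaccard(a: Set<string>, b: Set<string>): number {
  let inter = 0;
  for (const t of a) if (b.has(t)) inter++;
  const union = a.size + b.size - inter;
  return union === 0 ? 0 : inter / union;
}

function topK(task: string, corpus: RagRecord[], k = 8): RagRecord[] {
  const q = tokens(task);
  return corpus
    .map(r => ({ r, score: jaccard(q, tokens(r.title + " " + r.content)) }))
    .sort((x, y) => y.score - x.score)
    .slice(0, k)
    .map(x => x.r);
}
```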

Spec invariants enforced:
  - never bypass retrieval (--no-retrieval is explicit baseline, not default)
  - never discard provenance (task_hash + rag_ids + full bundle logged)
  - never allow free-form hallucinated output (validation gate is
    deterministic code, never an LLM)
  - log every run as new evidence (replay_run.v1 schema, append-only
    to data/_kb/replay_runs.jsonl)
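
The append-only evidence log can be sketched as follows; the field names follow the replay_run.v1 assertions in the tests below, but appendReplayRun itself is a hypothetical helper name:

```typescript
// Illustrative append-only provenance log (replay_run.v1-shaped record).
import { appendFileSync, mkdirSync } from "node:fs";
import { createHash } from "node:crypto";
import { dirname, resolve } from "node:path";

function appendReplayRun(
  root: string,
  task: string,
  ragIds: string[],
  escalationPath: string[],
): void {
  const record = {
    schema: "replay_run.v1",
    recorded_run_id: `replay:${Date.now()}`,
    task_hash: createHash("sha256").update(task).digest("hex"), // 64-char hex
    rag_ids: ragIds,                                            // provenance kept
    escalation_path: escalationPath,
    recorded_at: new Date().toISOString(),
  };
  const path = resolve(root, "data/_kb/replay_runs.jsonl");
  mkdirSync(dirname(path), { recursive: true });
  appendFileSync(path, JSON.stringify(record) + "\n"); // append-only, never rewrite
}
```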

CLI:
  ./scripts/distill replay --task "<input>" [--local-only]
                                            [--allow-escalation]
                                            [--no-retrieval]

What this unlocks:
  The substrate for "small-model bootstrapping" and "local inference
  dominance" J flagged after Phase 5. Phase 8+ closes the loop:
  schedule replay runs on common tasks, score outputs, feed accepted
  ones back into corpus, measure escalation rate decreasing over time.

Known limitations (documented in report):
  - Validation gate is structural, not semantic (it catches hedges and
    empty output, but not plausible-wrong answers). Phase 13 wiring: run
    the auditor against every replay output.
  - Retrieval is jaccard keyword overlap. It works at the current
    446-record corpus; move to /vectors/search HNSW retrieval once the
    corpus crosses ~10k records.
  - Convergence claim is architectural (deterministic retrieval +
    low-temp call); longitudinal empirical study is Phase 8+.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 23:42:58 -05:00

208 lines · 7.9 KiB · TypeScript

// Phase 7 replay-layer tests. Pin the deterministic primitives
// (retrieval, context-bundle, validation) without making real LLM
// calls — those are exercised by the report's real-data run.
import { test, expect } from "bun:test";
import { mkdirSync, writeFileSync, rmSync, existsSync } from "node:fs";
import { resolve } from "node:path";
import { replay } from "../../scripts/distillation/replay";
const TMP = "/tmp/distillation_test_phase7";
function setupCorpus() {
  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
  mkdirSync(resolve(TMP, "exports/rag"), { recursive: true });
  // Synthetic RAG corpus covering the test queries
  const samples = [
    {
      id: "rag-001",
      title: "Audit phase 38 provider routing",
      content:
        "Verify that /v1/chat correctly resolves provider via routing.toml.\n- check that openai/gpt-* routes through OpenAI direct.\n- assert no kimi-k2 fallthrough on cloud quota exhaustion.\nPhase 38 acceptance was wired in commit 21fd3b9.",
      tags: ["task:scrum_review", "category:accepted", "phase:38"],
      source_run_id: "scrum:1:foo",
      success_score: "accepted",
      source_category: "accepted",
    },
    {
      id: "rag-002",
      title: "Phase 40 circuit breaker drift",
      content:
        "PRD §40.4 claims a circuit breaker is shipped but no breaker class found in mcp-server/. Verify the breaker exists before approving.\nensure observer escalation has fallback path.",
      tags: ["task:audit_finding", "phase:40", "drift"],
      source_run_id: "audit:abc",
      success_score: "accepted",
      source_category: "accepted",
    },
    {
      id: "rag-003",
      title: "Partially accepted scrum review",
      content: "Review took 3 attempts to land. Output less precise than first-attempt runs.",
      tags: ["task:scrum_review", "category:partially_accepted"],
      source_run_id: "scrum:2:bar",
      success_score: "partially_accepted",
      source_category: "partially_accepted",
    },
    {
      id: "rag-004",
      title: "Unrelated staffing fill",
      content: "Welder × 2 in Toledo OH. 5 candidates within 30mi. Acceptance: all 5 confirmed by EOD.",
      tags: ["task:staffing_fill"],
      source_run_id: "staffing:1",
      success_score: "accepted",
      source_category: "accepted",
    },
  ];
  writeFileSync(
    resolve(TMP, "exports/rag/playbooks.jsonl"),
    samples.map(s => JSON.stringify(s)).join("\n") + "\n",
  );
}
test("replay: retrieval surfaces phase-38 playbook for phase-38 task", async () => {
  setupCorpus();
  // dry_run skips the real model call; the retrieval path is still
  // exercised, so ranking can be asserted without a live gateway.
  const r = await replay({
    task: "Audit phase 38 provider routing for placeholder code",
    local_only: true,
    dry_run: true,
    no_retrieval: false,
  }, TMP);
  // The phase-38 playbook should be the top-ranked retrieval
  expect(r.retrieved_artifacts.rag_ids[0]).toBe("rag-001");
  // The unrelated staffing record must not outrank the phase-38 playbook
  const ranks = new Map(r.retrieved_artifacts.rag_ids.map((id, i) => [id, i]));
  if (ranks.has("rag-004") && ranks.has("rag-001")) {
    expect(ranks.get("rag-001")! < ranks.get("rag-004")!).toBe(true);
  }
});
test("replay: --no-retrieval produces empty context_bundle", async () => {
  setupCorpus();
  const r = await replay({
    task: "Audit phase 38 provider routing",
    local_only: true,
    dry_run: true,
    no_retrieval: true,
  }, TMP);
  expect(r.context_bundle).toBeNull();
  expect(r.retrieved_artifacts.rag_ids.length).toBe(0);
});

test("replay: prior_successful_outputs only contains accepted samples", async () => {
  setupCorpus();
  const r = await replay({
    task: "scrum review accepted",
    local_only: true,
    dry_run: true,
  }, TMP);
  if (r.context_bundle) {
    for (const p of r.context_bundle.prior_successful_outputs) {
      expect(p.success_score).toBe("accepted");
    }
  }
});

test("replay: failure_patterns only contains partially_accepted samples", async () => {
  setupCorpus();
  const r = await replay({
    task: "scrum review",
    local_only: true,
    dry_run: true,
  }, TMP);
  if (r.context_bundle) {
    for (const p of r.context_bundle.failure_patterns) {
      expect(p.success_score).toBe("partially_accepted");
    }
  }
});
test("replay: validation_steps extracted from accepted-record content lines", async () => {
  setupCorpus();
  const r = await replay({
    task: "phase 38 routing audit",
    local_only: true,
    dry_run: true,
  }, TMP);
  if (r.context_bundle) {
    // The fixture's rag-001 contains "Verify that /v1/chat..." which should land in validation_steps
    const matched = r.context_bundle.validation_steps.some(s => /verify|check|assert|ensure/i.test(s));
    expect(matched).toBe(true);
  }
});

test("replay: empty corpus produces empty bundle, no crash", async () => {
  if (existsSync(TMP)) rmSync(TMP, { recursive: true, force: true });
  mkdirSync(resolve(TMP, "exports/rag"), { recursive: true });
  writeFileSync(resolve(TMP, "exports/rag/playbooks.jsonl"), "");
  const r = await replay({
    task: "any task",
    local_only: true,
    dry_run: true,
  }, TMP);
  expect(r.retrieved_artifacts.rag_ids.length).toBe(0);
  if (r.context_bundle) {
    expect(r.context_bundle.retrieved_playbooks.length).toBe(0);
  }
});
test("replay: every run gets logged to data/_kb/replay_runs.jsonl with provenance", async () => {
  setupCorpus();
  await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);
  const logPath = resolve(TMP, "data/_kb/replay_runs.jsonl");
  expect(existsSync(logPath)).toBe(true);
  const { readFileSync } = await import("node:fs");
  const lines = readFileSync(logPath, "utf8").split("\n").filter(Boolean);
  const last = JSON.parse(lines[lines.length - 1]);
  expect(last.schema).toBe("replay_run.v1");
  expect(typeof last.recorded_run_id).toBe("string");
  expect(typeof last.task_hash).toBe("string");
  expect(typeof last.recorded_at).toBe("string");
  expect(Array.isArray(last.escalation_path)).toBe(true);
});

test("replay: task_hash is deterministic for same task input", async () => {
  setupCorpus();
  const r1 = await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);
  const r2 = await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);
  // task_hash is the load-bearing assertion (canonical sha256 of task)
  expect(r1.task_hash).toBe(r2.task_hash);
  // task_hash is 64-char hex
  expect(r1.task_hash).toMatch(/^[0-9a-f]{64}$/);
  // recorded_run_id includes Date.now(); same-ms call may collide — that's OK
});

test("replay: --local-only does NOT escalate even if validation fails", async () => {
  setupCorpus();
  // qwen3.5:latest may or may not be available — either way, with
  // local_only=true, escalation_path must contain only the local model.
  const r = await replay({
    task: "deliberately weird task to maybe fail validation",
    local_only: true,
    dry_run: true,
  }, TMP);
  expect(r.escalation_path.length).toBe(1);
});
test("replay: validation gate blocks unreachable-gateway calls (deterministic failure path)", async () => {
  // No dry_run here — exercise the real callModel against an
  // unreachable gateway. Should fail closed almost immediately: a
  // connection to a closed local port is refused at once, well before
  // the 180s default AbortSignal timeout.
  const oldGateway = process.env.LH_GATEWAY_URL;
  process.env.LH_GATEWAY_URL = "http://127.0.0.1:1"; // closed port
  try {
    setupCorpus();
    const r = await replay({
      task: "phase 38 audit",
      local_only: true,
    }, TMP);
    expect(r.validation_result.passed).toBe(false);
    const txt = r.validation_result.reasons.join(" ");
    expect(/empty response|local call failed/.test(txt)).toBe(true);
  } finally {
    if (oldGateway) process.env.LH_GATEWAY_URL = oldGateway;
    else delete process.env.LH_GATEWAY_URL;
  }
}, 30_000);