// Phase 7 replay-layer tests. Pin the deterministic primitives
// (retrieval, context-bundle, validation) without making real LLM
// calls — those are exercised by the report's real-data run.

import { test, expect } from "bun:test";
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from "node:fs";
import { resolve } from "node:path";

import { replay } from "../../scripts/distillation/replay";

// Scratch root handed to replay() as its second argument. Every test
// rebuilds it from scratch, so state never leaks between cases.
const TMP = "/tmp/distillation_test_phase7";

// Synthetic RAG corpus covering the test queries: two accepted
// phase-specific playbooks, one partially accepted review, and one
// unrelated staffing record that should never outrank the others.
const SAMPLE_PLAYBOOKS = [
  {
    id: "rag-001",
    title: "Audit phase 38 provider routing",
    content:
      "Verify that /v1/chat correctly resolves provider via routing.toml.\n- check that openai/gpt-* routes through OpenAI direct.\n- assert no kimi-k2 fallthrough on cloud quota exhaustion.\nPhase 38 acceptance was wired in commit 21fd3b9.",
    tags: ["task:scrum_review", "category:accepted", "phase:38"],
    source_run_id: "scrum:1:foo",
    success_score: "accepted",
    source_category: "accepted",
  },
  {
    id: "rag-002",
    title: "Phase 40 circuit breaker drift",
    content:
      "PRD §40.4 claims a circuit breaker is shipped but no breaker class found in mcp-server/. Verify the breaker exists before approving.\nensure observer escalation has fallback path.",
    tags: ["task:audit_finding", "phase:40", "drift"],
    source_run_id: "audit:abc",
    success_score: "accepted",
    source_category: "accepted",
  },
  {
    id: "rag-003",
    title: "Partially accepted scrum review",
    content: "Review took 3 attempts to land. Output less precise than first-attempt runs.",
    tags: ["task:scrum_review", "category:partially_accepted"],
    source_run_id: "scrum:2:bar",
    success_score: "partially_accepted",
    source_category: "partially_accepted",
  },
  {
    id: "rag-004",
    title: "Unrelated staffing fill",
    content: "Welder × 2 in Toledo OH. 5 candidates within 30mi. Acceptance: all 5 confirmed by EOD.",
    tags: ["task:staffing_fill"],
    source_run_id: "staffing:1",
    success_score: "accepted",
    source_category: "accepted",
  },
];

/**
 * Recreate the scratch directory and write the RAG corpus as JSONL to
 * `<TMP>/exports/rag/playbooks.jsonl`.
 *
 * @param samples Records to serialize, one JSON object per line. Defaults
 *   to SAMPLE_PLAYBOOKS. An empty array produces a zero-byte file.
 */
function setupCorpus(samples: ReadonlyArray<Record<string, unknown>> = SAMPLE_PLAYBOOKS): void {
  // force: true makes rmSync a no-op when TMP does not exist yet.
  rmSync(TMP, { recursive: true, force: true });
  mkdirSync(resolve(TMP, "exports/rag"), { recursive: true });

  // Terminate every record with "\n" (rather than join + trailing "\n")
  // so that an empty corpus yields "" instead of a lone blank line.
  const jsonl = samples.map((sample) => `${JSON.stringify(sample)}\n`).join("");
  writeFileSync(resolve(TMP, "exports/rag/playbooks.jsonl"), jsonl);
}

test("replay: retrieval surfaces phase-38 playbook for phase-38 task", async () => {
  setupCorpus();
  // dry_run keeps this off the real model (see the file header);
  // no_retrieval: false means the retrieval layer is still exercised.
  const r = await replay({
    task: "Audit phase 38 provider routing for placeholder code",
    local_only: true,
    dry_run: true,
    no_retrieval: false,
  }, TMP);

  // The phase-38 playbook should be the top-ranked retrieval
  expect(r.retrieved_artifacts.rag_ids[0]).toBe("rag-001");

  // The unrelated staffing record should NOT be in top-K (or should rank lower)
  const ranks = new Map(r.retrieved_artifacts.rag_ids.map((id, i) => [id, i]));
  if (ranks.has("rag-004") && ranks.has("rag-001")) {
    expect(ranks.get("rag-001")!).toBeLessThan(ranks.get("rag-004")!);
  }
});

test("replay: --no-retrieval produces empty context_bundle", async () => {
  setupCorpus();
  const r = await replay({
    task: "Audit phase 38 provider routing",
    local_only: true,
    dry_run: true,
    no_retrieval: true,
  }, TMP);

  expect(r.context_bundle).toBeNull();
  expect(r.retrieved_artifacts.rag_ids).toHaveLength(0);
});

test("replay: prior_successful_outputs only contains accepted samples", async () => {
  setupCorpus();
  const r = await replay({
    task: "scrum review accepted",
    local_only: true,
    dry_run: true,
  }, TMP);

  // NOTE: the guard makes this conditional — a null bundle passes vacuously.
  if (r.context_bundle) {
    for (const p of r.context_bundle.prior_successful_outputs) {
      expect(p.success_score).toBe("accepted");
    }
  }
});

test("replay: failure_patterns only contains partially_accepted samples", async () => {
  setupCorpus();
  const r = await replay({
    task: "scrum review",
    local_only: true,
    dry_run: true,
  }, TMP);

  // NOTE: the guard makes this conditional — a null bundle passes vacuously.
  if (r.context_bundle) {
    for (const p of r.context_bundle.failure_patterns) {
      expect(p.success_score).toBe("partially_accepted");
    }
  }
});

test("replay: validation_steps extracted from accepted-record content lines", async () => {
  setupCorpus();
  const r = await replay({
    task: "phase 38 routing audit",
    local_only: true,
    dry_run: true,
  }, TMP);

  if (r.context_bundle) {
    // The fixture's rag-001 contains "Verify that /v1/chat..." which
    // should land in validation_steps
    const matched = r.context_bundle.validation_steps.some((s) =>
      /verify|check|assert|ensure/i.test(s),
    );
    expect(matched).toBe(true);
  }
});

test("replay: empty corpus produces empty bundle, no crash", async () => {
  // Zero records: playbooks.jsonl exists but is empty.
  setupCorpus([]);
  const r = await replay({
    task: "any task",
    local_only: true,
    dry_run: true,
  }, TMP);

  expect(r.retrieved_artifacts.rag_ids).toHaveLength(0);
  if (r.context_bundle) {
    expect(r.context_bundle.retrieved_playbooks).toHaveLength(0);
  }
});

test("replay: every run gets logged to data/_kb/replay_runs.jsonl with provenance", async () => {
  setupCorpus();
  await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);

  const logPath = resolve(TMP, "data/_kb/replay_runs.jsonl");
  expect(existsSync(logPath)).toBe(true);

  const lines = readFileSync(logPath, "utf8").split("\n").filter(Boolean);
  // Fail with a readable message instead of a JSON.parse(undefined) error.
  expect(lines.length).toBeGreaterThan(0);

  const last = JSON.parse(lines[lines.length - 1]);
  expect(last.schema).toBe("replay_run.v1");
  expect(typeof last.recorded_run_id).toBe("string");
  expect(typeof last.task_hash).toBe("string");
  expect(typeof last.recorded_at).toBe("string");
  expect(Array.isArray(last.escalation_path)).toBe(true);
});

test("replay: task_hash is deterministic for same task input", async () => {
  setupCorpus();
  const r1 = await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);
  const r2 = await replay({ task: "Audit phase 38", local_only: true, dry_run: true }, TMP);

  // task_hash is the load-bearing assertion (canonical sha256 of task)
  expect(r1.task_hash).toBe(r2.task_hash);
  // task_hash is 64-char hex
  expect(r1.task_hash).toMatch(/^[0-9a-f]{64}$/);
  // recorded_run_id includes Date.now(); same-ms call may collide — that's OK
});

test("replay: --local-only does NOT escalate even if validation fails", async () => {
  setupCorpus();
  // qwen3.5:latest may or may not be available — either way, with
  // local_only=true, escalation_path must contain only the local model.
  const r = await replay({
    task: "deliberately weird task to maybe fail validation",
    local_only: true,
    dry_run: true,
  }, TMP);

  expect(r.escalation_path).toHaveLength(1);
});

test("replay: validation gate blocks unreachable-gateway calls (deterministic failure path)", async () => {
  // No dry_run here — exercise the real callModel against an
  // unreachable gateway. Should fail-closed within ~1s (AbortSignal
  // timeout fires well before the 180s default since DNS resolves
  // immediately to a closed port).
  const previousGateway = process.env.LH_GATEWAY_URL;
  process.env.LH_GATEWAY_URL = "http://127.0.0.1:1"; // closed port
  try {
    setupCorpus();
    const r = await replay({
      task: "phase 38 audit",
      local_only: true,
    }, TMP);

    expect(r.validation_result.passed).toBe(false);
    const txt = r.validation_result.reasons.join(" ");
    expect(txt).toMatch(/empty response|local call failed/);
  } finally {
    // Compare against undefined, not truthiness: a pre-existing empty
    // string must be restored, not deleted. Assigning undefined to
    // process.env would store the string "undefined", hence the delete.
    if (previousGateway === undefined) {
      delete process.env.LH_GATEWAY_URL;
    } else {
      process.env.LH_GATEWAY_URL = previousGateway;
    }
  }
}, 30_000);