root e8cf113af8 gauntlet 2026-05-02: smoke chain + per-component scrum + parity probe
Production-readiness gauntlet exploiting the dual Rust/Go
implementation as a measurement instrument.

## Phase 1 — Full smoke chain
21/21 PASS in ~60s. Substrate intact across the full service surface.

## Phase 2 — Per-component scrum (token-volume fix)
Prior wave (165KB diff): Kimi 62 tokens out, Qwen 297 → no useful
analysis. This wave splits today's commits into 4 focused bundles
(36-71KB each):
  c1 validatord (46KB) → 0 convergent / 11 distinct
  c2 vectord substrate (36KB) → 0 convergent / 10 distinct
  c3 materializer (71KB) → 0 convergent / 6 distinct (Opus emitted
                           a BLOCK then self-retracted in the same response)
  c4 replay (45KB) → 0 convergent / 10 distinct

Reviewer engagement vs prior wave: Kimi went 62 → ~250 tokens out
once bundles dropped below 60KB.

scripts/scrum_review.sh hardening:
  * Diff-size guard (warn >60KB, hard-fail >100KB,
    SCRUM_FORCE_OVERSIZE=1 override)
  * Tightened prompt — file path must appear EXACTLY as in diff
    so post-processor can grep WHERE: lines reliably
  * Auto-tally step dedupes by (reviewer, location); convergence
    counts distinct lineages (closes the prior `opus+opus+opus`
    false-convergence bug)
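The dedupe-and-convergence rule can be sketched in Go (struct and function names are hypothetical; the real logic lives in scripts/scrum_review.sh's auto-tally step):

```go
package main

import "fmt"

// Finding is one WHERE: line extracted from a reviewer's output.
// Field names are illustrative, not the script's actual schema.
type Finding struct {
	Reviewer string // model lineage, e.g. "opus", "kimi", "qwen"
	Location string // exact file:line as it appears in the diff
}

// tally dedupes by (reviewer, location) and counts a location as
// convergent only when distinct lineages flagged it; a single lineage
// repeating itself (the opus+opus+opus case) stays distinct.
func tally(findings []Finding) (convergent, distinct int) {
	seen := map[Finding]bool{}
	lineages := map[string]map[string]bool{} // location → set of reviewers
	for _, f := range findings {
		if seen[f] {
			continue // same reviewer re-flagging the same location
		}
		seen[f] = true
		if lineages[f.Location] == nil {
			lineages[f.Location] = map[string]bool{}
		}
		lineages[f.Location][f.Reviewer] = true
	}
	for _, revs := range lineages {
		if len(revs) >= 2 {
			convergent++
		} else {
			distinct++
		}
	}
	return convergent, distinct
}

func main() {
	c, d := tally([]Finding{
		{"opus", "a.go:10"}, {"opus", "a.go:10"}, // dupe, counts once
		{"kimi", "a.go:10"},                      // second lineage → convergent
		{"qwen", "b.go:5"},
	})
	fmt.Println(c, d) // prints "1 1"
}
```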

## Phase 3 — Cross-runtime validator parity probe (the headline finding)
scripts/cutover/parity/validator_parity.sh sends 6 identical
/v1/validate cases to Rust :3100 AND Go :4110, compares status+body.

Result: **6/6 status codes match · 5/6 body shapes diverge.**

Rust returns serde-tagged enum:   {"Schema":{"field":"x","reason":"y"}}
Go returns flat exported-fields:  {"Kind":"schema","Field":"x","Reason":"y"}

Both round-trip inside their own runtime; a caller swapping one for
the other would break parsing silently. Captured as a new _open_ row
in the docs/ARCHITECTURE_COMPARISON.md decisions tracker.

This is the "use the dual-implementation as a measurement instrument"
return — single-repo scrums can't catch this class of cross-runtime
drift.

## Phase 4 — Production assessment
Verdict: ship-with-known-gap. The validator wire-format gap is
documented, not a regression. The future fix is ~50 LOC on the Go side
(a custom MarshalJSON on ValidationError to match Rust's serde shape).

Persistent stack config (/tmp/lakehouse-persistent.toml) gains
validatord on :3221 + persistent-validatord binary so operators
bringing up the persistent stack get the new daemon automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 04:05:18 -05:00

commit 89ca72d4718fcb20ba9dcc03110e090890a0736e
Author: root <root@island37.com>
Date: Sat May 2 03:31:02 2026 -0500
materializer + replay ports + vectord substrate fix verified at scale
Three threads land together — the doc edits interleave, so they ship
in a single commit.
1. **vectord substrate fix verified at original scale** (closes the
2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211
scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix).
Throughput dropped 1,115 → 438/sec because previously-broken
scenarios now do real HNSW Add work — honest cost of correctness.
The fix (i.vectors side-store + safeGraphAdd recover wrappers +
smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the
footprint that originally surfaced the bug.
2. **Materializer port** — internal/materializer + cmd/materializer +
scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts
(12 transforms) + build_evidence_index.ts (idempotency, day-partition,
receipt). On-wire JSON shape matches TS so Bun and Go runs are
interchangeable. 14 tests green.
3. **Replay port** — internal/replay + cmd/replay +
scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts
(retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL
phase 7 live invocation on the Go side. Both runtimes append to the
same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green.
Side effect on internal/distillation/types.go: EvidenceRecord gained
prompt_tokens, completion_tokens, and metadata fields to mirror the TS
shape the materializer transforms produce.
STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions
tracker moves the materializer + replay items from _open_ to DONE and
adds the substrate-fix scale verification row.
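The safeGraphAdd recover wrapper named in item 1 can be sketched as follows; the function and type names here are illustrative, not vectord's actual API:

```go
package main

import "fmt"

// addFn stands in for the underlying HNSW graph Add; vectord's real
// index types are not part of this diff.
type addFn func(id uint64, vec []float32) error

// safeAdd wraps a graph insert so a panic inside the index (the
// failure class the substrate fix targets) is recovered into an error
// the caller can retry or route to a rebuild, instead of killing the
// daemon.
func safeAdd(add addFn, id uint64, vec []float32) (err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("hnsw add panicked for id=%d: %v", id, r)
		}
	}()
	return add(id, vec)
}

func main() {
	panicky := func(uint64, []float32) error { panic("corrupt neighbor list") }
	fmt.Println(safeAdd(panicky, 42, []float32{0.1}) != nil) // prints true
	ok := func(uint64, []float32) error { return nil }
	fmt.Println(safeAdd(ok, 1, nil) == nil) // prints true
}
```

The named return is what lets the deferred recover overwrite err; a plain `func(...) error` would silently drop the recovered failure.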
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/cmd/replay/main.go b/cmd/replay/main.go
new file mode 100644
index 0000000..f73d3b6
--- /dev/null
+++ b/cmd/replay/main.go
@@ -0,0 +1,87 @@
+// replay — Go-side distillation replay runner. Closes audit-FULL
+// phase 7 live invocation on the Go side. Mirrors
+// scripts/distillation/replay.ts; both runtimes append to the same
+// `data/_kb/replay_runs.jsonl` shape (schema=replay_run.v1).
+//
+// Usage:
+//
+// replay -task "rebuild evidence index"
+// replay -task "..." -allow-escalation
+// replay -task "..." -no-retrieval # baseline mode
+// replay -task "..." -dry-run # synthetic, no LLM
+// replay -task "..." -root /home/profit/lakehouse # custom repo root
+package main
+
+import (
+ "context"
+ "flag"
+ "fmt"
+ "os"
+ "strings"
+
+ "git.agentview.dev/profit/golangLAKEHOUSE/internal/replay"
+)
+
+func main() {
+ task := flag.String("task", "", "input task to replay")
+ localOnly := flag.Bool("local-only", false, "never escalate; record validation result only")
+ allowEscalation := flag.Bool("allow-escalation", false, "fall back to the bigger model when local validation fails")
+ noRetrieval := flag.Bool("no-retrieval", false, "baseline mode: skip retrieval bundle (still logs)")
+ dryRun := flag.Bool("dry-run", false, "synthesize a deterministic response — no LLM call")
+ root := flag.String("root", replay.DefaultRoot(), "lakehouse repo root (defaults to $LH_DISTILL_ROOT or cwd)")
+ gateway := flag.String("gateway", "", "override gateway URL (default: $LH_GATEWAY_URL or http://localhost:3110)")
+ localModel := flag.String("local-model", "", "override local model name")
+ escalationModel := flag.String("escalation-model", "", "override escalation model name")
+ flag.Parse()
+
+ if *task == "" {
+ fmt.Fprintln(os.Stderr, `usage: replay -task "<input>" [-local-only] [-allow-escalation] [-no-retrieval] [-dry-run]`)
+ os.Exit(2)
+ }
+
+ res, err := replay.Replay(context.Background(), replay.ReplayRequest{
+ Task: *task,
+ LocalOnly: *localOnly,
+ AllowEscalation: *allowEscalation,
+ NoRetrieval: *noRetrieval,
+ DryRun: *dryRun,
+ GatewayURL: *gateway,
+ LocalModel: *localModel,
+ EscalationModel: *escalationModel,
+ }, *root)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "replay: %v\n", err)
+ os.Exit(1)
+ }
+
+ fmt.Printf("[replay] run_id=%s\n", res.RecordedRunID)
+ if res.ContextBundle == nil {
+ fmt.Println("[replay] retrieval: DISABLED")
+ } else {
+ fmt.Printf("[replay] retrieval: %d playbooks\n", len(res.ContextBundle.RetrievedPlaybooks))
+ }
+ fmt.Printf("[replay] escalation_path: %s\n", strings.Join(res.EscalationPath, " → "))
+ fmt.Printf("[replay] model_used: %s · %dms\n", res.ModelUsed, res.DurationMs)
+ verdict := "PASS"
+ if !res.ValidationResult.Passed {
+ verdict = "FAIL"
+ }
+ suffix := ""
+ if len(res.ValidationResult.Reasons) > 0 {
+ suffix = " (" + strings.Join(res.ValidationResult.Reasons, "; ") + ")"
+ }
+ fmt.Printf("[replay] validation: %s%s\n", verdict, suffix)
+ fmt.Println()
+ fmt.Println("─── response ───")
+ body := res.ModelResponse
+ if len(body) > 1500 {
+ fmt.Println(body[:1500])
+ fmt.Printf("... [%d more chars]\n", len(body)-1500)
+ } else {
+ fmt.Println(body)
+ }
+
+ if !res.ValidationResult.Passed {
+ os.Exit(1)
+ }
+}
diff --git a/internal/replay/model.go b/internal/replay/model.go
new file mode 100644
index 0000000..cbad676
--- /dev/null
+++ b/internal/replay/model.go
@@ -0,0 +1,131 @@
+package replay
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+)
+
+// callModelResult is what the gateway round-trip returns.
+type callModelResult struct {
+ Content string
+ OK bool
+ Error string
+}
+
+// ModelCaller is the seam tests use to swap out HTTP. Production
+// supplies httpModelCaller; tests can supply scripted responses.
+type ModelCaller func(ctx context.Context, model, system, user string) callModelResult
+
+// httpModelCaller posts to ${gatewayURL}/v1/chat with provider derived
+// from model name. Mirrors replay.ts:callModel.
+func httpModelCaller(gatewayURL string) ModelCaller {
+ client := &http.Client{Timeout: 180 * time.Second}
+ return func(ctx context.Context, model, system, user string) callModelResult {
+ provider := inferProvider(model)
+ body, err := json.Marshal(map[string]any{
+ "provider": provider,
+ "model": model,
+ "messages": []map[string]string{
+ {"role": "system", "content": system},
+ {"role": "user", "content": user},
+ },
+ "max_tokens": 1500,
+ "temperature": 0.1,
+ })
+ if err != nil {
+ return callModelResult{Error: "marshal request: " + err.Error()}
+ }
+ req, err := http.NewRequestWithContext(ctx, "POST", gatewayURL+"/v1/chat", bytes.NewReader(body))
+ if err != nil {
+ return callModelResult{Error: "build request: " + err.Error()}
+ }
+ req.Header.Set("Content-Type", "application/json")
+ resp, err := client.Do(req)
+ if err != nil {
+ return callModelResult{Error: trim(err.Error(), 240)}
+ }
+ defer resp.Body.Close()
+ buf, _ := io.ReadAll(resp.Body)
+ if resp.StatusCode >= 400 {
+ return callModelResult{Error: fmt.Sprintf("HTTP %d: %s", resp.StatusCode, trim(string(buf), 240))}
+ }
+ var parsed struct {
+ Choices []struct {
+ Message struct {
+ Content string `json:"content"`
+ } `json:"message"`
+ } `json:"choices"`
+ }
+ if err := json.Unmarshal(buf, &parsed); err != nil {
+ return callModelResult{Error: "parse response: " + err.Error()}
+ }
+ content := ""
+ if len(parsed.Choices) > 0 {
+ content = parsed.Choices[0].Message.Content
+ }
+ return callModelResult{Content: content, OK: true}
+ }
+}
+
+// inferProvider picks the right /v1/chat provider for a given model
+// name. Mirrors replay.ts:callModel's branching exactly so the gateway
+// sees the same request shape regardless of caller runtime.
+//
+// "/" in name → openrouter
+// kimi-/qwen3-coder/... → ollama_cloud
+// else → ollama (local)
+func inferProvider(model string) string {
+ if strings.Contains(model, "/") {
+ return "openrouter"
+ }
+ switch {
+ case strings.HasPrefix(model, "kimi-"),
+ strings.HasPrefix(model, "qwen3-coder"),
+ strings.HasPrefix(model, "deepseek-v"),
+ strings.HasPrefix(model, "mistral-large"),
+ model == "gpt-oss:120b",
+ model == "qwen3.5:397b":
+ return "ollama_cloud"
+ }
+ return "ollama"
+}
+
+// dryRunSynthesize produces a deterministic synthetic response that
+// echoes context-bundle signals. Used by tests + dry-run mode to
+// exercise retrieval + validation without a live LLM.
+func dryRunSynthesize(task string, bundle *ContextBundle) string {
+ parts := []string{
+ "Synthetic dry-run response for task: " + trim(task, 120),
+ "",
+ }
+ if bundle != nil {
+ parts = append(parts, fmt.Sprintf(
+ "Retrieved %d playbooks; %d accepted, %d partial.",
+ len(bundle.RetrievedPlaybooks),
+ len(bundle.PriorSuccessfulOutputs),
+ len(bundle.FailurePatterns),
+ ))
+ if len(bundle.ValidationSteps) > 0 {
+ parts = append(parts, "Following validation checklist:")
+ for i, s := range bundle.ValidationSteps {
+ if i >= 3 {
+ break
+ }
+ parts = append(parts, "- "+s)
+ }
+ }
+ if len(bundle.PriorSuccessfulOutputs) > 0 {
+ parts = append(parts, "")
+ parts = append(parts, "Anchored on prior accepted: "+bundle.PriorSuccessfulOutputs[0].Title)
+ }
+ } else {
+ parts = append(parts, "No retrieval context — answering from task alone. Verify and check produced output before approving.")
+ }
+ return strings.Join(parts, "\n")
+}
diff --git a/internal/replay/prompt.go b/internal/replay/prompt.go
new file mode 100644
index 0000000..f86eee4
--- /dev/null
+++ b/internal/replay/prompt.go
@@ -0,0 +1,64 @@
+package replay
+
+import "strings"
+
+// PromptParts captures the two roles the prompt assembly produces.
+type PromptParts struct {
+ System string
+ User string
+}
+
+const systemPrompt = "You are a Lakehouse task executor. Stay grounded — only assert what you can derive from the prior successful patterns or the task itself. " +
+ "Do NOT hedge. Do NOT say 'as an AI'. Produce a concrete actionable answer. " +
+ "When prior successful outputs are provided, follow their style and format."
+
+// BuildPrompt assembles the system + user messages for a model call.
+// When bundle is nil (NoRetrieval mode), the user message is just the
+// task — same wording as replay.ts so completions stay comparable.
+func BuildPrompt(task string, bundle *ContextBundle) PromptParts {
+ if bundle == nil {
+ return PromptParts{
+ System: systemPrompt,
+ User: "Task: " + task + "\n\nProduce the answer.",
+ }
+ }
+
+ var b strings.Builder
+ if len(bundle.PriorSuccessfulOutputs) > 0 {
+ b.WriteString("## Prior successful runs on similar tasks\n\n")
+ for _, r := range bundle.PriorSuccessfulOutputs {
+ b.WriteString("### ")
+ b.WriteString(r.Title)
+ b.WriteString(" (score: ")
+ b.WriteString(r.SuccessScore)
+ b.WriteString(")\n")
+ b.WriteString(r.ContentPreview)
+ b.WriteString("\n\n")
+ }
+ }
+ if len(bundle.FailurePatterns) > 0 {
+ b.WriteString("## Patterns that produced PARTIAL results — avoid these failure modes\n\n")
+ for _, r := range bundle.FailurePatterns {
+ b.WriteString("- ")
+ b.WriteString(r.Title)
+ b.WriteString(": ")
+ b.WriteString(trim(r.ContentPreview, 160))
+ b.WriteByte('\n')
+ }
+ b.WriteByte('\n')
+ }
+ if len(bundle.ValidationSteps) > 0 {
+ b.WriteString("## Validation checklist (from accepted runs)\n")
+ for _, s := range bundle.ValidationSteps {
+ b.WriteString("- ")
+ b.WriteString(s)
+ b.WriteByte('\n')
+ }
+ b.WriteByte('\n')
+ }
+ b.WriteString("## Task\n")
+ b.WriteString(task)
+ b.WriteString("\n\nProduce the answer following the style of the prior successful runs above.")
+
+ return PromptParts{System: systemPrompt, User: b.String()}
+}
diff --git a/internal/replay/replay.go b/internal/replay/replay.go
new file mode 100644
index 0000000..3ac74e6
--- /dev/null
+++ b/internal/replay/replay.go
@@ -0,0 +1,193 @@
+package replay
+
+import (
+ "context"
+ "crypto/sha256"
+ "encoding/hex"
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "time"
+)
+
+// DefaultRoot is what the CLI uses when --root isn't passed.
+func DefaultRoot() string {
+ if r := os.Getenv("LH_DISTILL_ROOT"); r != "" {
+ return r
+ }
+ if cwd, err := os.Getwd(); err == nil {
+ return cwd
+ }
+ return "/home/profit/lakehouse"
+}
+
+// Replay runs the retrieve→prompt→model→validate→log pipeline.
+// Returns a ReplayResult that's already been appended to
+// data/_kb/replay_runs.jsonl unless DryRun + the file is read-only.
+//
+// Errors here are *infrastructure* failures (corpus unreadable, log
+// write failed). A failed model call OR a failed validation gate is
+// captured in ReplayResult.ValidationResult, not returned as error —
+// callers can branch on Passed / EscalationPath.
+func Replay(ctx context.Context, opts ReplayRequest, root string) (ReplayResult, error) {
+ t0 := time.Now()
+ recordedAt := time.Now().UTC().Format(time.RFC3339Nano)
+
+ taskHash := sha256Hex(opts.Task)
+
+ corpus, err := LoadRagCorpus(root)
+ if err != nil {
+ return ReplayResult{}, fmt.Errorf("load rag corpus: %w", err)
+ }
+
+ var bundle *ContextBundle
+ if !opts.NoRetrieval {
+ bundle = BuildContextBundle(corpus, opts.Task)
+ }
+ prompt := BuildPrompt(opts.Task, bundle)
+
+ localModel := orDefault(opts.LocalModel, DefaultLocalModel)
+ escalationModel := orDefault(opts.EscalationModel, DefaultEscalationModel)
+ gatewayURL := orDefault(opts.GatewayURL, gatewayFromEnv())
+
+ caller := httpModelCaller(gatewayURL)
+ if opts.DryRun {
+ caller = dryRunCaller(opts.Task, bundle)
+ }
+
+ escalation := []string{localModel}
+ modelUsed := localModel
+ var modelResponse string
+ var validation ValidationResult
+
+ localCall := caller(ctx, localModel, prompt.System, prompt.User)
+ if localCall.OK {
+ modelResponse = localCall.Content
+ validation = ValidateResponse(modelResponse, bundle)
+ } else {
+ validation = ValidationResult{
+ Passed: false,
+ Reasons: []string{"local call failed: " + localCall.Error},
+ }
+ }
+
+ if !validation.Passed && opts.AllowEscalation && !opts.LocalOnly {
+ escalation = append(escalation, escalationModel)
+ escalCall := caller(ctx, escalationModel, prompt.System, prompt.User)
+ if escalCall.OK {
+ modelResponse = escalCall.Content
+ modelUsed = escalationModel
+ validation = ValidateResponse(modelResponse, bundle)
+ if validation.Passed {
+ validation.Reasons = append([]string{"recovered via escalation to " + escalationModel}, validation.Reasons...)
+ }
+ } else {
+ validation.Reasons = append(validation.Reasons, "escalation also failed: "+escalCall.Error)
+ }
+ }
+
+ recordedRunID := fmt.Sprintf("replay:%s:%s",
+ taskHash[:16],
+ sha256Hex(recordedAt)[:12],
+ )
+ result := ReplayResult{
+ InputTask: opts.Task,
+ TaskHash: taskHash,
+ RetrievedArtifacts: RetrievedIDs{RagIDs: ragIDs(bundle)},
+ ContextBundle: bundle,
+ ModelResponse: modelResponse,
+ ModelUsed: modelUsed,
+ EscalationPath: escalation,
+ ValidationResult: validation,
+ RecordedRunID: recordedRunID,
+ RecordedAt: recordedAt,
+ DurationMs: time.Since(t0).Milliseconds(),
+ }
+
+ if err := logReplayEvidence(root, result); err != nil {
+ // Logging failure is real — surface it. The caller still gets the
+ // in-memory result so they can inspect what happened.
+ return result, fmt.Errorf("log replay evidence: %w", err)
+ }
+ return result, nil
+}
+
+// dryRunCaller wraps dryRunSynthesize as a ModelCaller. The escalation
+// branch in Replay calls the caller a second time; for parity with TS,
+// we return the same content suffixed with [ESCALATED] so a smoke can
+// detect escalation in dry-run mode.
+func dryRunCaller(task string, bundle *ContextBundle) ModelCaller {
+ calls := 0
+ return func(_ context.Context, _ string, _ string, _ string) callModelResult {
+ calls++
+ content := dryRunSynthesize(task, bundle)
+ if calls >= 2 {
+ content += "\n\n[ESCALATED]"
+ }
+ return callModelResult{Content: content, OK: true}
+ }
+}
+
+// logReplayEvidence appends one row to data/_kb/replay_runs.jsonl.
+// model_response is truncated to 4000 chars in the persisted log to
+// keep the file lean (matches TS behavior).
+func logReplayEvidence(root string, result ReplayResult) error {
+ path := filepath.Join(root, "data", "_kb", "replay_runs.jsonl")
+ if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+ return err
+ }
+
+ persist := struct {
+ Schema string `json:"schema"`
+ ReplayResult
+ }{
+ Schema: "replay_run.v1",
+ ReplayResult: result,
+ }
+ persist.ReplayResult.ModelResponse = trim(persist.ReplayResult.ModelResponse, 4000)
+
+ buf, err := json.Marshal(persist)
+ if err != nil {
+ return err
+ }
+ buf = append(buf, '\n')
+
+ f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ _, err = f.Write(buf)
+ return err
+}
+
+func ragIDs(bundle *ContextBundle) []string {
+ if bundle == nil {
+ return []string{}
+ }
+ out := make([]string, 0, len(bundle.RetrievedPlaybooks))
+ for _, p := range bundle.RetrievedPlaybooks {
+ out = append(out, p.RagID)
+ }
+ return out
+}
+
+func sha256Hex(s string) string {
+ h := sha256.Sum256([]byte(s))
+ return hex.EncodeToString(h[:])
+}
+
+func gatewayFromEnv() string {
+ if u := os.Getenv("LH_GATEWAY_URL"); u != "" {
+ return u
+ }
+ return DefaultGatewayURL
+}
+
+func orDefault(v, fallback string) string {
+ if v == "" {
+ return fallback
+ }
+ return v
+}
diff --git a/internal/replay/replay_test.go b/internal/replay/replay_test.go
new file mode 100644
index 0000000..4e1eedd
--- /dev/null
+++ b/internal/replay/replay_test.go
@@ -0,0 +1,283 @@
+package replay
+
+import (
+ "context"
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+// ─── Tokenization + retrieval primitives ───────────────────────────
+
+func TestTokenize_FiltersShortAndLowercase(t *testing.T) {
+ got := tokenize("Hello, World! Foo BAR baz x12 a")
+ want := map[string]bool{"hello": true, "world": true, "foo": true, "bar": true, "baz": true, "x12": true}
+ for k := range want {
+ if _, ok := got[k]; !ok {
+ t.Errorf("missing token %q", k)
+ }
+ }
+ if _, ok := got["a"]; ok {
+ t.Errorf("len=1 token should be filtered: a")
+ }
+}
+
+func TestJaccard_EdgeCases(t *testing.T) {
+ a := map[string]struct{}{"x": {}, "y": {}, "z": {}}
+ b := map[string]struct{}{"y": {}, "z": {}, "w": {}}
+ got := jaccard(a, b)
+ want := 2.0 / 4.0 // |A∩B|=2 (y,z); |A∪B|=4 (x,y,z,w)
+ if got != want {
+ t.Errorf("jaccard = %v, want %v", got, want)
+ }
+ if jaccard(map[string]struct{}{}, b) != 0 {
+ t.Error("empty set should produce 0")
+ }
+}
+
+// ─── Retrieval ───────────────────────────────────────────────────
+
+func TestRetrieveRag_ScoresAndCaps(t *testing.T) {
+ corpus := []RagSample{
+ {ID: "p1", Title: "validate scrum", Content: "verify the build, check tests", Tags: []string{"scrum"}, SuccessScore: "accepted"},
+ {ID: "p2", Title: "irrelevant cooking notes", Content: "boil pasta longer than ten minutes", Tags: []string{"food"}, SuccessScore: "accepted"},
+ {ID: "p3", Title: "build verification ladder", Content: "verify build steps, assert green", Tags: []string{"build"}, SuccessScore: "partially_accepted"},
+ }
+ got := retrieveRag(corpus, "verify the build assert green", 3)
+ if len(got) == 0 {
+ t.Fatal("expected at least one result")
+ }
+ for _, a := range got {
+ if a.RagID == "p2" {
+ t.Errorf("irrelevant sample p2 should not surface, got: %+v", got)
+ }
+ }
+}
+
+func TestBuildContextBundle_SplitsAcceptedAndPartial(t *testing.T) {
+ corpus := []RagSample{
+ {ID: "a1", Title: "A1", Content: "verify build assert green check tests", SuccessScore: "accepted"},
+ {ID: "p1", Title: "P1", Content: "verify build sometimes fails to assert", SuccessScore: "partially_accepted"},
+ }
+ b := BuildContextBundle(corpus, "verify build assert tests")
+ if b == nil {
+ t.Fatal("nil bundle")
+ }
+ if len(b.PriorSuccessfulOutputs) != 1 || b.PriorSuccessfulOutputs[0].RagID != "a1" {
+ t.Errorf("accepted bucket wrong: %+v", b.PriorSuccessfulOutputs)
+ }
+ if len(b.FailurePatterns) != 1 || b.FailurePatterns[0].RagID != "p1" {
+ t.Errorf("partially_accepted bucket wrong: %+v", b.FailurePatterns)
+ }
+ if len(b.ValidationSteps) == 0 {
+ t.Errorf("expected validation_steps from accepted sample, got none")
+ }
+}
+
+// ─── Prompt assembly ─────────────────────────────────────────────
+
+func TestBuildPrompt_NoBundleIsCompact(t *testing.T) {
+ p := BuildPrompt("rebuild evidence index", nil)
+ if !strings.Contains(p.User, "Task: rebuild evidence index") {
+ t.Errorf("user prompt missing task: %q", p.User)
+ }
+ if strings.Contains(p.User, "## Prior successful runs") {
+ t.Error("no-bundle prompt should not include retrieval headers")
+ }
+}
+
+func TestBuildPrompt_WithBundleIncludesAllSections(t *testing.T) {
+ bundle := &ContextBundle{
+ PriorSuccessfulOutputs: []RetrievedArtifact{{RagID: "a1", Title: "A1", ContentPreview: "verified", SuccessScore: "accepted"}},
+ FailurePatterns: []RetrievedArtifact{{RagID: "p1", Title: "P1", ContentPreview: "partial result", SuccessScore: "partially_accepted"}},
+ ValidationSteps: []string{"verify the build"},
+ }
+ p := BuildPrompt("task X", bundle)
+ for _, marker := range []string{
+ "## Prior successful runs",
+ "## Patterns that produced PARTIAL results",
+ "## Validation checklist",
+ "## Task",
+ "task X",
+ } {
+ if !strings.Contains(p.User, marker) {
+ t.Errorf("user prompt missing marker %q in:\n%s", marker, p.User)
+ }
+ }
+}
+
+// ─── Validation gate ─────────────────────────────────────────────
+
+func TestValidateResponse_FailsOnEmptyAndShort(t *testing.T) {
+ if got := ValidateResponse("", nil); got.Passed {
+ t.Error("empty should fail")
+ }
+ if got := ValidateResponse("too short", nil); got.Passed {
+ t.Error("too-short should fail")
+ }
+}
+
+func TestValidateResponse_FailsOnFiller(t *testing.T) {
+ resp := strings.Repeat("This is a real long response that meets the eighty character minimum for the gate. ", 2) +
+ " As an AI, I cannot help."
+ got := ValidateResponse(resp, nil)
+ if got.Passed {
+ t.Errorf("response with hedge phrase should fail, reasons=%v", got.Reasons)
+ }
+}
+
+func TestValidateResponse_PassesWhenChecklistOverlaps(t *testing.T) {
+ bundle := &ContextBundle{ValidationSteps: []string{"verify the build is green"}}
+ resp := "I followed the procedure and verified that the build is green and tests passed before merging the change."
+ got := ValidateResponse(resp, bundle)
+ if !got.Passed {
+ t.Errorf("expected pass, got reasons=%v", got.Reasons)
+ }
+}
+
+func TestValidateResponse_FailsWhenChecklistOrthogonal(t *testing.T) {
+ bundle := &ContextBundle{ValidationSteps: []string{"verify mango ripeness"}}
+ resp := "I followed completely unrelated steps about Quantum Tax compliance — I did not look at any fruit at all and that's the point."
+ got := ValidateResponse(resp, bundle)
+ if got.Passed {
+ t.Errorf("expected fail because no checklist token overlap, got pass")
+ }
+}
+
+// ─── End-to-end (dry-run, no LLM) ────────────────────────────────
+
+func TestReplay_DryRun_LogsResult(t *testing.T) {
+ root := t.TempDir()
+ mustWriteRagFixture(t, root, []RagSample{
+ {ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
+ Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
+ })
+
+ res, err := Replay(context.Background(), ReplayRequest{
+ Task: "verify the build before merging",
+ DryRun: true,
+ }, root)
+ if err != nil {
+ t.Fatalf("Replay: %v", err)
+ }
+ if res.RecordedRunID == "" {
+ t.Error("expected recorded_run_id")
+ }
+ if !strings.HasPrefix(res.RecordedRunID, "replay:") {
+ t.Errorf("run_id shape: %s", res.RecordedRunID)
+ }
+ if res.ContextBundle == nil {
+ t.Fatal("expected retrieval to fire by default")
+ }
+ if len(res.ContextBundle.RetrievedPlaybooks) == 0 {
+ t.Errorf("expected at least one retrieved playbook")
+ }
+
+ logPath := filepath.Join(root, "data/_kb/replay_runs.jsonl")
+ body, err := os.ReadFile(logPath)
+ if err != nil {
+ t.Fatalf("read log: %v", err)
+ }
+ var row map[string]any
+ if err := json.Unmarshal([]byte(strings.TrimSpace(string(body))), &row); err != nil {
+ t.Fatalf("parse log row: %v", err)
+ }
+ if row["schema"] != "replay_run.v1" {
+ t.Errorf("schema field: %v", row["schema"])
+ }
+}
+
+func TestReplay_NoRetrievalSkipsCorpus(t *testing.T) {
+ root := t.TempDir()
+ mustWriteRagFixture(t, root, []RagSample{
+ {ID: "p1", Title: "would match", Content: "verify build assert", SuccessScore: "accepted"},
+ })
+
+ res, err := Replay(context.Background(), ReplayRequest{
+ Task: "verify build assert",
+ DryRun: true,
+ NoRetrieval: true,
+ }, root)
+ if err != nil {
+ t.Fatalf("Replay: %v", err)
+ }
+ if res.ContextBundle != nil {
+ t.Errorf("expected nil bundle in NoRetrieval mode")
+ }
+ if len(res.RetrievedArtifacts.RagIDs) != 0 {
+ t.Errorf("expected empty rag_ids, got %v", res.RetrievedArtifacts.RagIDs)
+ }
+}
+
+func TestReplay_EscalationFiresOnFailedValidation(t *testing.T) {
+ root := t.TempDir()
+ // Trick: the dry-run synthesizer copies validation_steps verbatim
+ // into its output. If a checklist step contains a hedge phrase, the
+ // synthesized response will contain it too — triggering the
+ // filler-pattern guard in ValidateResponse and forcing escalation.
+ mustWriteRagFixture(t, root, []RagSample{
+ {ID: "p1", Title: "demo step", Content: "verify the build then i cannot proceed without approval", SuccessScore: "accepted"},
+ })
+
+ res, err := Replay(context.Background(), ReplayRequest{
+ Task: "verify the build then proceed",
+ DryRun: true,
+ AllowEscalation: true,
+ }, root)
+ if err != nil {
+ t.Fatalf("Replay: %v", err)
+ }
+ if len(res.EscalationPath) < 2 {
+ t.Errorf("expected escalation, path=%v reasons=%v", res.EscalationPath, res.ValidationResult.Reasons)
+ }
+ if !strings.Contains(res.ModelResponse, "[ESCALATED]") {
+ t.Errorf("expected escalated marker in response, got: %q", res.ModelResponse)
+ }
+}
+
+func TestReplay_NoEscalationWhenValidationPasses(t *testing.T) {
+ root := t.TempDir()
+ mustWriteRagFixture(t, root, []RagSample{
+ {ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
+ Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
+ })
+
+ res, err := Replay(context.Background(), ReplayRequest{
+ Task: "verify the build before merging",
+ DryRun: true,
+ AllowEscalation: true,
+ }, root)
+ if err != nil {
+ t.Fatalf("Replay: %v", err)
+ }
+ if len(res.EscalationPath) != 1 {
+ t.Errorf("expected single-step path on validation pass, got %v", res.EscalationPath)
+ }
+ if !res.ValidationResult.Passed {
+ t.Errorf("expected pass, got reasons=%v", res.ValidationResult.Reasons)
+ }
+}
+
+// ─── Helpers ────────────────────────────────────────────────────
+
+func mustWriteRagFixture(t *testing.T, root string, samples []RagSample) {
+ t.Helper()
+ path := filepath.Join(root, "exports/rag/playbooks.jsonl")
+ if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+ t.Fatalf("mkdir: %v", err)
+ }
+ var buf strings.Builder
+ for _, s := range samples {
+ b, err := json.Marshal(s)
+ if err != nil {
+ t.Fatalf("marshal sample: %v", err)
+ }
+ buf.Write(b)
+ buf.WriteByte('\n')
+ }
+ if err := os.WriteFile(path, []byte(buf.String()), 0o644); err != nil {
+ t.Fatalf("write fixture: %v", err)
+ }
+}
diff --git a/internal/replay/retrieval.go b/internal/replay/retrieval.go
new file mode 100644
index 0000000..62e7575
--- /dev/null
+++ b/internal/replay/retrieval.go
@@ -0,0 +1,215 @@
+package replay
+
+import (
+ "bufio"
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+)
+
+// tokenize lowercases and splits on non-[a-z0-9_] runs, keeping tokens
+// of length ≥3. Matches replay.ts so retrieval scoring is consistent
+// across runtimes.
+func tokenize(text string) map[string]struct{} {
+ out := map[string]struct{}{}
+ if text == "" {
+ return out
+ }
+ lower := strings.ToLower(text)
+ var b strings.Builder
+ flush := func() {
+ if b.Len() >= 3 {
+ out[b.String()] = struct{}{}
+ }
+ b.Reset()
+ }
+ for _, r := range lower {
+ if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' {
+ b.WriteRune(r)
+ } else {
+ flush()
+ }
+ }
+ flush()
+ return out
+}
+
+// jaccard returns |A ∩ B| / |A ∪ B| over token sets.
+func jaccard(a, b map[string]struct{}) float64 {
+ if len(a) == 0 || len(b) == 0 {
+ return 0
+ }
+ inter := 0
+ for t := range a {
+ if _, ok := b[t]; ok {
+ inter++
+ }
+ }
+ union := len(a) + len(b) - inter
+ if union == 0 {
+ return 0
+ }
+ return float64(inter) / float64(union)
+}
+
+// LoadRagCorpus reads `exports/rag/playbooks.jsonl` under root.
+// Returns empty slice when the file is missing — callers fall back to
+// a context-less prompt rather than failing.
+func LoadRagCorpus(root string) ([]RagSample, error) {
+ path := filepath.Join(root, "exports", "rag", "playbooks.jsonl")
+ f, err := os.Open(path)
+ if err != nil {
+ if os.IsNotExist(err) {
+ return nil, nil
+ }
+ return nil, err
+ }
+ defer f.Close()
+ var corpus []RagSample
+ sc := bufio.NewScanner(f)
+ sc.Buffer(make([]byte, 0, 1<<16), 1<<24)
+ for sc.Scan() {
+ line := sc.Bytes()
+ if len(line) == 0 {
+ continue
+ }
+ var rec RagSample
+ if err := json.Unmarshal(line, &rec); err != nil {
+ continue // malformed line — skip, matches TS behavior
+ }
+ corpus = append(corpus, rec)
+ }
+ return corpus, sc.Err()
+}
+
+// retrieveRag returns up to topK playbooks with non-zero overlap.
+// Sorted by score descending. Matches replay.ts.
+func retrieveRag(corpus []RagSample, task string, topK int) []RetrievedArtifact {
+ taskTokens := tokenize(task)
+ type scored struct {
+ rec RagSample
+ score float64
+ }
+ all := make([]scored, 0, len(corpus))
+ for _, r := range corpus {
+ text := r.Title + " " + r.Content + " " + strings.Join(r.Tags, " ")
+ all = append(all, scored{rec: r, score: jaccard(taskTokens, tokenize(text))})
+ }
+ sort.SliceStable(all, func(i, j int) bool { return all[i].score > all[j].score })
+
+ out := make([]RetrievedArtifact, 0, topK)
+ for _, s := range all {
+ if len(out) >= topK {
+ break
+ }
+ if s.score <= 0 {
+ break
+ }
+ out = append(out, RetrievedArtifact{
+ RagID: s.rec.ID,
+ SourceRunID: s.rec.SourceRunID,
+ Title: s.rec.Title,
+ ContentPreview: trim(s.rec.Content, 240),
+ SuccessScore: s.rec.SuccessScore,
+ Tags: tagsOrEmpty(s.rec.Tags),
+ Score: s.score,
+ })
+ }
+ return out
+}
+
+var validationLineRE = regexp.MustCompile(`(?i)^[-*]\s*(verify|check|assert|confirm|ensure)\b|^\s*(verify|check|assert|confirm|ensure)\s`)
+
+// extractValidationSteps pulls verify/check/assert/confirm/ensure
+// lines from accepted samples. Used as a soft-anchor in the
+// validation gate (response should touch at least one of these
+// tokens) and surfaced into the prompt.
+func extractValidationSteps(samples []RetrievedArtifact, corpus []RagSample) []string {
+ ids := map[string]struct{}{}
+ for _, s := range samples {
+ ids[s.RagID] = struct{}{}
+ }
+ var steps []string
+ for _, r := range corpus {
+ if _, ok := ids[r.ID]; !ok {
+ continue
+ }
+ for _, line := range strings.Split(r.Content, "\n") {
+ t := strings.TrimSpace(line)
+ if validationLineRE.MatchString(t) {
+ steps = append(steps, trim(t, 200))
+ if len(steps) >= 6 {
+ return steps
+ }
+ }
+ }
+ }
+ return steps
+}
+
+// BuildContextBundle assembles a ContextBundle from a corpus + task.
+// Top 8 retrieved → split by success_score → at most 3 accepted, 2
+// warnings → extract validation steps → estimate token cost.
+func BuildContextBundle(corpus []RagSample, task string) *ContextBundle {
+ top := retrieveRag(corpus, task, 8)
+ accepted := filterByScore(top, "accepted", 3)
+ warnings := filterByScore(top, "partially_accepted", 2)
+ steps := extractValidationSteps(accepted, corpus)
+
+ totalChars := 0
+ for _, r := range accepted {
+ totalChars += len(r.ContentPreview) + len(r.Title)
+ }
+ for _, r := range warnings {
+ totalChars += len(r.ContentPreview) + len(r.Title)
+ }
+ for _, s := range steps {
+ totalChars += len(s)
+ }
+ tokenEstimate := (totalChars + 3) / 4 // ceil(chars/4)
+
+ return &ContextBundle{
+ RetrievedPlaybooks: top,
+ PriorSuccessfulOutputs: accepted,
+ FailurePatterns: warnings,
+ ValidationSteps: stepsOrEmpty(steps),
+ BundleTokenEstimate: tokenEstimate,
+ }
+}
+
+func filterByScore(arts []RetrievedArtifact, score string, max int) []RetrievedArtifact {
+ out := make([]RetrievedArtifact, 0, max)
+ for _, a := range arts {
+ if a.SuccessScore == score {
+ out = append(out, a)
+ if len(out) >= max {
+ break
+ }
+ }
+ }
+ return out
+}
+
+func tagsOrEmpty(t []string) []string {
+ if t == nil {
+ return []string{}
+ }
+ return t
+}
+
+func stepsOrEmpty(s []string) []string {
+ if s == nil {
+ return []string{}
+ }
+ return s
+}
+
+func trim(s string, n int) string {
+ if len(s) <= n {
+ return s
+ }
+ return s[:n]
+}
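
For a quick sanity reference on the retrieval scoring, here is a self-contained sketch that duplicates the tokenize/jaccard pair (the copies below are illustrative, not the package API; the sample strings are made up):

```go
package main

import (
	"fmt"
	"strings"
)

// tokenize mirrors the package logic: lowercase, split on
// non-[a-z0-9_] runs, keep tokens of length >= 3.
func tokenize(text string) map[string]struct{} {
	out := map[string]struct{}{}
	var b strings.Builder
	flush := func() {
		if b.Len() >= 3 {
			out[b.String()] = struct{}{}
		}
		b.Reset()
	}
	for _, r := range strings.ToLower(text) {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' {
			b.WriteRune(r)
		} else {
			flush()
		}
	}
	flush()
	return out
}

// jaccard is set overlap over union, as in the package.
func jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 || len(b) == 0 {
		return 0
	}
	inter := 0
	for t := range a {
		if _, ok := b[t]; ok {
			inter++
		}
	}
	return float64(inter) / float64(len(a)+len(b)-inter)
}

func main() {
	task := tokenize("verify the build before merging")
	doc := tokenize("Build verification: verify the build, check tests")
	// 3 shared tokens (verify, the, build) over an 8-token union.
	fmt.Printf("%.3f\n", jaccard(task, doc)) // 0.375
}
```

Note the score is pure set overlap: no TF-IDF, no stemming, so "merging" and "merge" do not match, which is the same trade-off replay.ts makes.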
diff --git a/internal/replay/types.go b/internal/replay/types.go
new file mode 100644
index 0000000..9048323
--- /dev/null
+++ b/internal/replay/types.go
@@ -0,0 +1,98 @@
+// Package replay ports scripts/distillation/replay.ts to Go.
+//
+// Replay takes a task → retrieves matching playbooks/RAG records →
+// builds a context bundle → calls a LOCAL model via the gateway's
+// /v1/chat → validates → escalates to a stronger model if needed →
+// logs the run as new evidence in `data/_kb/replay_runs.jsonl`.
+//
+// Spec invariants (carry over from replay.ts):
+// - never bypass retrieval (unless caller passes NoRetrieval)
+// - never discard provenance
+// - never allow free-form hallucinated output (validation gate)
+// - log every run as new evidence
+//
+// This is NOT training — it's runtime behavior shaping via retrieval.
+package replay
+
+// ReplayRequest mirrors the TS interface. NoRetrieval skips the
+// context bundle entirely (baseline mode for A/B tests). DryRun returns
+// a deterministic synthetic response without calling the gateway —
+// used by tests to exercise retrieval/validation without an LLM.
+type ReplayRequest struct {
+ Task string
+ LocalOnly bool
+ AllowEscalation bool
+ NoRetrieval bool
+ DryRun bool
+ GatewayURL string // overrides $LH_GATEWAY_URL
+ LocalModel string // overrides default
+ EscalationModel string // overrides default
+}
+
+// RagSample is one record in exports/rag/playbooks.jsonl.
+type RagSample struct {
+ ID string `json:"id"`
+ Title string `json:"title"`
+ Content string `json:"content"`
+ Tags []string `json:"tags"`
+ SourceRunID string `json:"source_run_id"`
+ SuccessScore string `json:"success_score"`
+ SourceCategory string `json:"source_category"`
+}
+
+// RetrievedArtifact is one playbook surfaced into a ContextBundle.
+type RetrievedArtifact struct {
+ RagID string `json:"rag_id"`
+ SourceRunID string `json:"source_run_id"`
+ Title string `json:"title"`
+ ContentPreview string `json:"content_preview"` // first 240 chars
+ SuccessScore string `json:"success_score"`
+ Tags []string `json:"tags"`
+ Score float64 `json:"score"`
+}
+
+// ContextBundle is what the prompt builder consumes. Empty bundles
+// (no retrieved playbooks) still pass through — buildPrompt downgrades
+// to a no-context prompt when both accepted and warnings are empty.
+type ContextBundle struct {
+ RetrievedPlaybooks []RetrievedArtifact `json:"retrieved_playbooks"`
+ PriorSuccessfulOutputs []RetrievedArtifact `json:"prior_successful_outputs"`
+ FailurePatterns []RetrievedArtifact `json:"failure_patterns"`
+ ValidationSteps []string `json:"validation_steps"`
+ BundleTokenEstimate int `json:"bundle_token_estimate"`
+}
+
+// ValidationResult is the deterministic gate's verdict. Reasons is
+// always non-nil so JSON consumers can iterate without a nil check.
+type ValidationResult struct {
+ Passed bool `json:"passed"`
+ Reasons []string `json:"reasons"`
+}
+
+// ReplayResult is what Replay returns. Mirrors the TS type one-to-one
+// so JSONL emitted by either runtime parses identically.
+type ReplayResult struct {
+ InputTask string `json:"input_task"`
+ TaskHash string `json:"task_hash"`
+ RetrievedArtifacts RetrievedIDs `json:"retrieved_artifacts"`
+ ContextBundle *ContextBundle `json:"context_bundle"`
+ ModelResponse string `json:"model_response"`
+ ModelUsed string `json:"model_used"`
+ EscalationPath []string `json:"escalation_path"`
+ ValidationResult ValidationResult `json:"validation_result"`
+ RecordedRunID string `json:"recorded_run_id"`
+ RecordedAt string `json:"recorded_at"`
+ DurationMs int64 `json:"duration_ms"`
+}
+
+// RetrievedIDs is the {rag_ids} envelope the TS shape uses.
+type RetrievedIDs struct {
+ RagIDs []string `json:"rag_ids"`
+}
+
+// Defaults match replay.ts. Override via env or ReplayRequest fields.
+const (
+ DefaultLocalModel = "qwen3.5:latest"
+ DefaultEscalationModel = "deepseek-v3.1:671b"
+ DefaultGatewayURL = "http://localhost:3110"
+)
diff --git a/internal/replay/validate.go b/internal/replay/validate.go
new file mode 100644
index 0000000..7fb217e
--- /dev/null
+++ b/internal/replay/validate.go
@@ -0,0 +1,66 @@
+package replay
+
+import (
+ "fmt"
+ "regexp"
+ "strings"
+)
+
+// fillerPatterns are the hedge phrases the spec rejects. Compiled once
+// at package init, since the gate runs on every replay call.
+var fillerPatterns = []*regexp.Regexp{
+ regexp.MustCompile(`(?i)as an ai`),
+ regexp.MustCompile(`(?i)i cannot`),
+ regexp.MustCompile(`(?i)i'?m sorry, but`),
+ regexp.MustCompile(`(?i)i don'?t have access`),
+ regexp.MustCompile(`(?i)i am unable to`),
+}
+
+// ValidateResponse runs the deterministic gate on a model response.
+// Empty / too-short / hedge-bearing / context-disconnected responses
+// fail. Matches replay.ts:validateResponse one-to-one.
+func ValidateResponse(response string, bundle *ContextBundle) ValidationResult {
+ trimmed := strings.TrimSpace(response)
+ var reasons []string
+
+ if len(trimmed) == 0 {
+ return ValidationResult{Passed: false, Reasons: []string{"empty response"}}
+ }
+ if len(trimmed) < 80 {
+ reasons = append(reasons, fmt.Sprintf("response too short (%d chars; min 80)", len(trimmed)))
+ }
+ for _, re := range fillerPatterns {
+ if re.MatchString(trimmed) {
+ reasons = append(reasons, fmt.Sprintf("filler/hedge phrase detected: %s", re.String()))
+ }
+ }
+ // Soft anchor: if a validation checklist was supplied, the response
+ // should share at least one token with it (≥3 chars per tokenize()).
+ if bundle != nil && len(bundle.ValidationSteps) > 0 {
+ checklistTokens := map[string]struct{}{}
+ for _, s := range bundle.ValidationSteps {
+ for t := range tokenize(s) {
+ checklistTokens[t] = struct{}{}
+ }
+ }
+ respTokens := tokenize(trimmed)
+ overlap := 0
+ for t := range checklistTokens {
+ if _, ok := respTokens[t]; ok {
+ overlap++
+ }
+ }
+ if len(checklistTokens) > 0 && overlap == 0 {
+ reasons = append(reasons, "response shares no tokens with validation checklist (may not have followed prior patterns)")
+ }
+ }
+
+ return ValidationResult{Passed: len(reasons) == 0, Reasons: reasonsOrEmpty(reasons)}
+}
+
+func reasonsOrEmpty(r []string) []string {
+ if r == nil {
+ return []string{}
+ }
+ return r
+}
diff --git a/scripts/replay_smoke.sh b/scripts/replay_smoke.sh
new file mode 100755
index 0000000..1274f2b
--- /dev/null
+++ b/scripts/replay_smoke.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+# replay smoke — exercises the Go port of scripts/distillation/replay.ts.
+# Validates that the replay tool:
+# - Builds a context bundle from a synthetic playbooks corpus
+# - Runs --dry-run end-to-end without an LLM
+# - Logs a row to data/_kb/replay_runs.jsonl with schema=replay_run.v1
+# - Honors --no-retrieval (no bundle, empty rag_ids)
+# - Exits non-zero when validation fails
+
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+export PATH="$PATH:/usr/local/go/bin"
+
+echo "[replay-smoke] building bin/replay..."
+go build -o bin/replay ./cmd/replay
+
+ROOT="$(mktemp -d)"
+trap 'rm -rf "$ROOT"' EXIT INT TERM
+
+mkdir -p "$ROOT/exports/rag"
+cat > "$ROOT/exports/rag/playbooks.jsonl" <<'EOF'
+{"id":"p1","title":"build verification","content":"verify the build, check tests pass before merge\nensure no regressions in suites","tags":["scrum"],"source_run_id":"r-1","success_score":"accepted","source_category":"scrum_review"}
+{"id":"p2","title":"merge cleanup","content":"verify the build, then assert tests passed, then merge","tags":["scrum"],"source_run_id":"r-2","success_score":"accepted","source_category":"scrum_review"}
+{"id":"p3","title":"partial fix","content":"verify the build, sometimes assert tests passed","tags":["scrum"],"source_run_id":"r-3","success_score":"partially_accepted","source_category":"scrum_review"}
+EOF
+
+echo "[replay-smoke] dry-run (with retrieval)"
+./bin/replay -task "verify the build before merging" -dry-run -root "$ROOT" > /tmp/replay_smoke_a.txt 2>&1 || true
+grep -q "retrieval: " /tmp/replay_smoke_a.txt || {
+ echo "missing retrieval line"; cat /tmp/replay_smoke_a.txt; exit 1;
+}
+grep -q "escalation_path: qwen3.5:latest" /tmp/replay_smoke_a.txt || {
+ echo "missing escalation_path line"; cat /tmp/replay_smoke_a.txt; exit 1;
+}
+
+LOG="$ROOT/data/_kb/replay_runs.jsonl"
+[ -s "$LOG" ] || { echo "expected $LOG to be written"; exit 1; }
+grep -q "replay_run.v1" "$LOG" || {
+ echo "schema=replay_run.v1 missing in log";
+ cat "$LOG";
+ exit 1;
+}
+
+echo "[replay-smoke] dry-run (no retrieval)"
+./bin/replay -task "verify build" -dry-run -no-retrieval -root "$ROOT" > /tmp/replay_smoke_b.txt 2>&1 || true
+grep -q "retrieval: DISABLED" /tmp/replay_smoke_b.txt || {
+ echo "expected retrieval: DISABLED";
+ cat /tmp/replay_smoke_b.txt;
+ exit 1;
+}
+
+LINES_BEFORE=$(wc -l < "$LOG")
+
+echo "[replay-smoke] forced-fail with escalation"
+# Force validation failure by putting a hedge phrase as the FIRST
+# accepted sample's first verify line. extractValidationSteps walks
+# corpus order, and the dry-run synthesizer surfaces the first 3 steps,
+# so the hedge phrase needs to be in an early-corpus accepted sample.
+cat > "$ROOT/exports/rag/playbooks.jsonl" <<'EOF'
+{"id":"p9","title":"hedged step","content":"verify auth as an AI and proceed without checking","tags":["security"],"source_run_id":"r-9","success_score":"accepted","source_category":"audit"}
+{"id":"p1","title":"build verification","content":"verify the build, check tests pass before merge","tags":["scrum"],"source_run_id":"r-1","success_score":"accepted","source_category":"scrum_review"}
+EOF
+./bin/replay -task "verify auth proceed" -dry-run -allow-escalation -root "$ROOT" > /tmp/replay_smoke_c.txt 2>&1 || true
+grep -q "escalation_path: qwen3.5:latest → deepseek-v3.1:671b" /tmp/replay_smoke_c.txt || {
+ echo "expected escalation path to deepseek when validation fails";
+ cat /tmp/replay_smoke_c.txt;
+ exit 1;
+}
+
+LINES_AFTER=$(wc -l < "$LOG")
+[ "$LINES_AFTER" -gt "$LINES_BEFORE" ] || {
+ echo "expected log file to grow: before=$LINES_BEFORE after=$LINES_AFTER";
+ exit 1;
+}
+
+echo "[replay-smoke] PASS"