commit 89ca72d4718fcb20ba9dcc03110e090890a0736e Author: root Date: Sat May 2 03:31:02 2026 -0500 materializer + replay ports + vectord substrate fix verified at scale Three threads landing together — the doc edits interleave so they ship in a single commit. 1. **vectord substrate fix verified at original scale** (closes the 2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211 scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix). Throughput dropped 1,115 → 438/sec because previously-broken scenarios now do real HNSW Add work — honest cost of correctness. The fix (i.vectors side-store + safeGraphAdd recover wrappers + smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the footprint that originally surfaced the bug. 2. **Materializer port** — internal/materializer + cmd/materializer + scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts (12 transforms) + build_evidence_index.ts (idempotency, day-partition, receipt). On-wire JSON shape matches TS so Bun and Go runs are interchangeable. 14 tests green. 3. **Replay port** — internal/replay + cmd/replay + scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts (retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL phase 7 live invocation on the Go side. Both runtimes append to the same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green. Side effect on internal/distillation/types.go: EvidenceRecord gained prompt_tokens, completion_tokens, and metadata fields to mirror the TS shape the materializer transforms produce. STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions tracker moves the materializer + replay items from _open_ to DONE and adds the substrate-fix scale verification row. 
Co-Authored-By: Claude Opus 4.7 (1M context) diff --git a/cmd/replay/main.go b/cmd/replay/main.go new file mode 100644 index 0000000..f73d3b6 --- /dev/null +++ b/cmd/replay/main.go @@ -0,0 +1,87 @@ +// replay — Go-side distillation replay runner. Closes audit-FULL +// phase 7 live invocation on the Go side. Mirrors +// scripts/distillation/replay.ts; both runtimes append to the same +// `data/_kb/replay_runs.jsonl` shape (schema=replay_run.v1). +// +// Usage: +// +// replay -task "rebuild evidence index" +// replay -task "..." -allow-escalation +// replay -task "..." -no-retrieval # baseline mode +// replay -task "..." -dry-run # synthetic, no LLM +// replay -task "..." -root /home/profit/lakehouse # custom repo root +package main + +import ( + "context" + "flag" + "fmt" + "os" + "strings" + + "git.agentview.dev/profit/golangLAKEHOUSE/internal/replay" +) + +func main() { + task := flag.String("task", "", "input task to replay") + localOnly := flag.Bool("local-only", false, "never escalate; record validation result only") + allowEscalation := flag.Bool("allow-escalation", false, "fall back to the bigger model when local validation fails") + noRetrieval := flag.Bool("no-retrieval", false, "baseline mode: skip retrieval bundle (still logs)") + dryRun := flag.Bool("dry-run", false, "synthesize a deterministic response — no LLM call") + root := flag.String("root", replay.DefaultRoot(), "lakehouse repo root (defaults to $LH_DISTILL_ROOT or cwd)") + gateway := flag.String("gateway", "", "override gateway URL (default: $LH_GATEWAY_URL or http://localhost:3110)") + localModel := flag.String("local-model", "", "override local model name") + escalationModel := flag.String("escalation-model", "", "override escalation model name") + flag.Parse() + + if *task == "" { + fmt.Fprintln(os.Stderr, `usage: replay -task "" [-local-only] [-allow-escalation] [-no-retrieval] [-dry-run]`) + os.Exit(2) + } + + res, err := replay.Replay(context.Background(), replay.ReplayRequest{ + Task: 
*task, + LocalOnly: *localOnly, + AllowEscalation: *allowEscalation, + NoRetrieval: *noRetrieval, + DryRun: *dryRun, + GatewayURL: *gateway, + LocalModel: *localModel, + EscalationModel: *escalationModel, + }, *root) + if err != nil { + fmt.Fprintf(os.Stderr, "replay: %v\n", err) + os.Exit(1) + } + + fmt.Printf("[replay] run_id=%s\n", res.RecordedRunID) + if res.ContextBundle == nil { + fmt.Println("[replay] retrieval: DISABLED") + } else { + fmt.Printf("[replay] retrieval: %d playbooks\n", len(res.ContextBundle.RetrievedPlaybooks)) + } + fmt.Printf("[replay] escalation_path: %s\n", strings.Join(res.EscalationPath, " → ")) + fmt.Printf("[replay] model_used: %s · %dms\n", res.ModelUsed, res.DurationMs) + verdict := "PASS" + if !res.ValidationResult.Passed { + verdict = "FAIL" + } + suffix := "" + if len(res.ValidationResult.Reasons) > 0 { + suffix = " (" + strings.Join(res.ValidationResult.Reasons, "; ") + ")" + } + fmt.Printf("[replay] validation: %s%s\n", verdict, suffix) + fmt.Println() + fmt.Println("─── response ───") + body := res.ModelResponse + if len(body) > 1500 { + fmt.Println(body[:1500]) + fmt.Printf("... [%d more chars]\n", len(body)-1500) + } else { + fmt.Println(body) + } + + if !res.ValidationResult.Passed { + os.Exit(1) + } +} diff --git a/internal/replay/model.go b/internal/replay/model.go new file mode 100644 index 0000000..cbad676 --- /dev/null +++ b/internal/replay/model.go @@ -0,0 +1,131 @@ +package replay + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// callModelResult is what the gateway round-trip returns. +type callModelResult struct { + Content string + OK bool + Error string +} + +// ModelCaller is the seam tests use to swap out HTTP. Production +// supplies httpModelCaller; tests can supply scripted responses. 
+type ModelCaller func(ctx context.Context, model, system, user string) callModelResult + +// httpModelCaller posts to ${gatewayURL}/v1/chat with provider derived +// from model name. Mirrors replay.ts:callModel. +func httpModelCaller(gatewayURL string) ModelCaller { + client := &http.Client{Timeout: 180 * time.Second} + return func(ctx context.Context, model, system, user string) callModelResult { + provider := inferProvider(model) + body, err := json.Marshal(map[string]any{ + "provider": provider, + "model": model, + "messages": []map[string]string{ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + }, + "max_tokens": 1500, + "temperature": 0.1, + }) + if err != nil { + return callModelResult{Error: "marshal request: " + err.Error()} + } + req, err := http.NewRequestWithContext(ctx, "POST", gatewayURL+"/v1/chat", bytes.NewReader(body)) + if err != nil { + return callModelResult{Error: "build request: " + err.Error()} + } + req.Header.Set("Content-Type", "application/json") + resp, err := client.Do(req) + if err != nil { + return callModelResult{Error: trim(err.Error(), 240)} + } + defer resp.Body.Close() + buf, _ := io.ReadAll(resp.Body) + if resp.StatusCode >= 400 { + return callModelResult{Error: fmt.Sprintf("HTTP %d: %s", resp.StatusCode, trim(string(buf), 240))} + } + var parsed struct { + Choices []struct { + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` + } + if err := json.Unmarshal(buf, &parsed); err != nil { + return callModelResult{Error: "parse response: " + err.Error()} + } + content := "" + if len(parsed.Choices) > 0 { + content = parsed.Choices[0].Message.Content + } + return callModelResult{Content: content, OK: true} + } +} + +// inferProvider picks the right /v1/chat provider for a given model +// name. Mirrors replay.ts:callModel's branching exactly so the gateway +// sees the same request shape regardless of caller runtime. 
+// +// "/" in name → openrouter +// kimi-/qwen3-coder/... → ollama_cloud +// else → ollama (local) +func inferProvider(model string) string { + if strings.Contains(model, "/") { + return "openrouter" + } + switch { + case strings.HasPrefix(model, "kimi-"), + strings.HasPrefix(model, "qwen3-coder"), + strings.HasPrefix(model, "deepseek-v"), + strings.HasPrefix(model, "mistral-large"), + model == "gpt-oss:120b", + model == "qwen3.5:397b": + return "ollama_cloud" + } + return "ollama" +} + +// dryRunSynthesize produces a deterministic synthetic response that +// echoes context-bundle signals. Used by tests + dry-run mode to +// exercise retrieval + validation without a live LLM. +func dryRunSynthesize(task string, bundle *ContextBundle) string { + parts := []string{ + "Synthetic dry-run response for task: " + trim(task, 120), + "", + } + if bundle != nil { + parts = append(parts, fmt.Sprintf( + "Retrieved %d playbooks; %d accepted, %d partial.", + len(bundle.RetrievedPlaybooks), + len(bundle.PriorSuccessfulOutputs), + len(bundle.FailurePatterns), + )) + if len(bundle.ValidationSteps) > 0 { + parts = append(parts, "Following validation checklist:") + for i, s := range bundle.ValidationSteps { + if i >= 3 { + break + } + parts = append(parts, "- "+s) + } + } + if len(bundle.PriorSuccessfulOutputs) > 0 { + parts = append(parts, "") + parts = append(parts, "Anchored on prior accepted: "+bundle.PriorSuccessfulOutputs[0].Title) + } + } else { + parts = append(parts, "No retrieval context — answering from task alone. Verify and check produced output before approving.") + } + return strings.Join(parts, "\n") +} diff --git a/internal/replay/prompt.go b/internal/replay/prompt.go new file mode 100644 index 0000000..f86eee4 --- /dev/null +++ b/internal/replay/prompt.go @@ -0,0 +1,64 @@ +package replay + +import "strings" + +// PromptParts captures the two roles the prompt assembly produces. 
+type PromptParts struct { + System string + User string +} + +const systemPrompt = "You are a Lakehouse task executor. Stay grounded — only assert what you can derive from the prior successful patterns or the task itself. " + + "Do NOT hedge. Do NOT say 'as an AI'. Produce a concrete actionable answer. " + + "When prior successful outputs are provided, follow their style and format." + +// BuildPrompt assembles the system + user messages for a model call. +// When bundle is nil (NoRetrieval mode), the user message is just the +// task — same wording as replay.ts so completions stay comparable. +func BuildPrompt(task string, bundle *ContextBundle) PromptParts { + if bundle == nil { + return PromptParts{ + System: systemPrompt, + User: "Task: " + task + "\n\nProduce the answer.", + } + } + + var b strings.Builder + if len(bundle.PriorSuccessfulOutputs) > 0 { + b.WriteString("## Prior successful runs on similar tasks\n\n") + for _, r := range bundle.PriorSuccessfulOutputs { + b.WriteString("### ") + b.WriteString(r.Title) + b.WriteString(" (score: ") + b.WriteString(r.SuccessScore) + b.WriteString(")\n") + b.WriteString(r.ContentPreview) + b.WriteString("\n\n") + } + } + if len(bundle.FailurePatterns) > 0 { + b.WriteString("## Patterns that produced PARTIAL results — avoid these failure modes\n\n") + for _, r := range bundle.FailurePatterns { + b.WriteString("- ") + b.WriteString(r.Title) + b.WriteString(": ") + b.WriteString(trim(r.ContentPreview, 160)) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(bundle.ValidationSteps) > 0 { + b.WriteString("## Validation checklist (from accepted runs)\n") + for _, s := range bundle.ValidationSteps { + b.WriteString("- ") + b.WriteString(s) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + b.WriteString("## Task\n") + b.WriteString(task) + b.WriteString("\n\nProduce the answer following the style of the prior successful runs above.") + + return PromptParts{System: systemPrompt, User: b.String()} +} diff --git 
a/internal/replay/replay.go b/internal/replay/replay.go new file mode 100644 index 0000000..3ac74e6 --- /dev/null +++ b/internal/replay/replay.go @@ -0,0 +1,193 @@ +package replay + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" +) + +// DefaultRoot is what the CLI uses when --root isn't passed. +func DefaultRoot() string { + if r := os.Getenv("LH_DISTILL_ROOT"); r != "" { + return r + } + if cwd, err := os.Getwd(); err == nil { + return cwd + } + return "/home/profit/lakehouse" +} + +// Replay runs the retrieve→prompt→model→validate→log pipeline. +// Returns a ReplayResult that's already been appended to +// data/_kb/replay_runs.jsonl unless DryRun + the file is read-only. +// +// Errors here are *infrastructure* failures (corpus unreadable, log +// write failed). A failed model call OR a failed validation gate is +// captured in ReplayResult.ValidationResult, not returned as error — +// callers can branch on Passed / EscalationPath. 
+func Replay(ctx context.Context, opts ReplayRequest, root string) (ReplayResult, error) { + t0 := time.Now() + recordedAt := time.Now().UTC().Format(time.RFC3339Nano) + + taskHash := sha256Hex(opts.Task) + + corpus, err := LoadRagCorpus(root) + if err != nil { + return ReplayResult{}, fmt.Errorf("load rag corpus: %w", err) + } + + var bundle *ContextBundle + if !opts.NoRetrieval { + bundle = BuildContextBundle(corpus, opts.Task) + } + prompt := BuildPrompt(opts.Task, bundle) + + localModel := orDefault(opts.LocalModel, DefaultLocalModel) + escalationModel := orDefault(opts.EscalationModel, DefaultEscalationModel) + gatewayURL := orDefault(opts.GatewayURL, gatewayFromEnv()) + + caller := httpModelCaller(gatewayURL) + if opts.DryRun { + caller = dryRunCaller(opts.Task, bundle) + } + + escalation := []string{localModel} + modelUsed := localModel + var modelResponse string + var validation ValidationResult + + localCall := caller(ctx, localModel, prompt.System, prompt.User) + if localCall.OK { + modelResponse = localCall.Content + validation = ValidateResponse(modelResponse, bundle) + } else { + validation = ValidationResult{ + Passed: false, + Reasons: []string{"local call failed: " + localCall.Error}, + } + } + + if !validation.Passed && opts.AllowEscalation && !opts.LocalOnly { + escalation = append(escalation, escalationModel) + escalCall := caller(ctx, escalationModel, prompt.System, prompt.User) + if escalCall.OK { + modelResponse = escalCall.Content + modelUsed = escalationModel + validation = ValidateResponse(modelResponse, bundle) + if validation.Passed { + validation.Reasons = append([]string{"recovered via escalation to " + escalationModel}, validation.Reasons...) 
+ } + } else { + validation.Reasons = append(validation.Reasons, "escalation also failed: "+escalCall.Error) + } + } + + recordedRunID := fmt.Sprintf("replay:%s:%s", + taskHash[:16], + sha256Hex(recordedAt)[:12], + ) + result := ReplayResult{ + InputTask: opts.Task, + TaskHash: taskHash, + RetrievedArtifacts: RetrievedIDs{RagIDs: ragIDs(bundle)}, + ContextBundle: bundle, + ModelResponse: modelResponse, + ModelUsed: modelUsed, + EscalationPath: escalation, + ValidationResult: validation, + RecordedRunID: recordedRunID, + RecordedAt: recordedAt, + DurationMs: time.Since(t0).Milliseconds(), + } + + if err := logReplayEvidence(root, result); err != nil { + // Logging failure is real — surface it. The caller still gets the + // in-memory result so they can inspect what happened. + return result, fmt.Errorf("log replay evidence: %w", err) + } + return result, nil +} + +// dryRunCaller wraps dryRunSynthesize as a ModelCaller. The escalation +// branch in Replay calls the caller a second time; for parity with TS, +// we return the same content suffixed with [ESCALATED] so a smoke can +// detect escalation in dry-run mode. +func dryRunCaller(task string, bundle *ContextBundle) ModelCaller { + calls := 0 + return func(_ context.Context, _ string, _ string, _ string) callModelResult { + calls++ + content := dryRunSynthesize(task, bundle) + if calls >= 2 { + content += "\n\n[ESCALATED]" + } + return callModelResult{Content: content, OK: true} + } +} + +// logReplayEvidence appends one row to data/_kb/replay_runs.jsonl. +// model_response is truncated to 4000 chars in the persisted log to +// keep the file lean (matches TS behavior). 
+func logReplayEvidence(root string, result ReplayResult) error { + path := filepath.Join(root, "data", "_kb", "replay_runs.jsonl") + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + + persist := struct { + Schema string `json:"schema"` + ReplayResult + }{ + Schema: "replay_run.v1", + ReplayResult: result, + } + persist.ReplayResult.ModelResponse = trim(persist.ReplayResult.ModelResponse, 4000) + + buf, err := json.Marshal(persist) + if err != nil { + return err + } + buf = append(buf, '\n') + + f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil { + return err + } + defer f.Close() + _, err = f.Write(buf) + return err +} + +func ragIDs(bundle *ContextBundle) []string { + if bundle == nil { + return []string{} + } + out := make([]string, 0, len(bundle.RetrievedPlaybooks)) + for _, p := range bundle.RetrievedPlaybooks { + out = append(out, p.RagID) + } + return out +} + +func sha256Hex(s string) string { + h := sha256.Sum256([]byte(s)) + return hex.EncodeToString(h[:]) +} + +func gatewayFromEnv() string { + if u := os.Getenv("LH_GATEWAY_URL"); u != "" { + return u + } + return DefaultGatewayURL +} + +func orDefault(v, fallback string) string { + if v == "" { + return fallback + } + return v +} diff --git a/internal/replay/replay_test.go b/internal/replay/replay_test.go new file mode 100644 index 0000000..4e1eedd --- /dev/null +++ b/internal/replay/replay_test.go @@ -0,0 +1,283 @@ +package replay + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" +) + +// ─── Tokenization + retrieval primitives ─────────────────────────── + +func TestTokenize_FiltersShortAndLowercase(t *testing.T) { + got := tokenize("Hello, World! 
Foo BAR baz x12 a") + want := map[string]bool{"hello": true, "world": true, "foo": true, "bar": true, "baz": true, "x12": true} + for k := range want { + if _, ok := got[k]; !ok { + t.Errorf("missing token %q", k) + } + } + if _, ok := got["a"]; ok { + t.Errorf("len=1 token should be filtered: a") + } +} + +func TestJaccard_EdgeCases(t *testing.T) { + a := map[string]struct{}{"x": {}, "y": {}, "z": {}} + b := map[string]struct{}{"y": {}, "z": {}, "w": {}} + got := jaccard(a, b) + want := 2.0 / 4.0 // |A∩B|=2 (y,z); |A∪B|=4 (x,y,z,w) + if got != want { + t.Errorf("jaccard = %v, want %v", got, want) + } + if jaccard(map[string]struct{}{}, b) != 0 { + t.Error("empty set should produce 0") + } +} + +// ─── Retrieval ─────────────────────────────────────────────────── + +func TestRetrieveRag_ScoresAndCaps(t *testing.T) { + corpus := []RagSample{ + {ID: "p1", Title: "validate scrum", Content: "verify the build, check tests", Tags: []string{"scrum"}, SuccessScore: "accepted"}, + {ID: "p2", Title: "irrelevant cooking notes", Content: "boil pasta longer than ten minutes", Tags: []string{"food"}, SuccessScore: "accepted"}, + {ID: "p3", Title: "build verification ladder", Content: "verify build steps, assert green", Tags: []string{"build"}, SuccessScore: "partially_accepted"}, + } + got := retrieveRag(corpus, "verify the build assert green", 3) + if len(got) == 0 { + t.Fatal("expected at least one result") + } + for _, a := range got { + if a.RagID == "p2" { + t.Errorf("irrelevant sample p2 should not surface, got: %+v", got) + } + } +} + +func TestBuildContextBundle_SplitsAcceptedAndPartial(t *testing.T) { + corpus := []RagSample{ + {ID: "a1", Title: "A1", Content: "verify build assert green check tests", SuccessScore: "accepted"}, + {ID: "p1", Title: "P1", Content: "verify build sometimes fails to assert", SuccessScore: "partially_accepted"}, + } + b := BuildContextBundle(corpus, "verify build assert tests") + if b == nil { + t.Fatal("nil bundle") + } + if 
len(b.PriorSuccessfulOutputs) != 1 || b.PriorSuccessfulOutputs[0].RagID != "a1" { + t.Errorf("accepted bucket wrong: %+v", b.PriorSuccessfulOutputs) + } + if len(b.FailurePatterns) != 1 || b.FailurePatterns[0].RagID != "p1" { + t.Errorf("partially_accepted bucket wrong: %+v", b.FailurePatterns) + } + if len(b.ValidationSteps) == 0 { + t.Errorf("expected validation_steps from accepted sample, got none") + } +} + +// ─── Prompt assembly ───────────────────────────────────────────── + +func TestBuildPrompt_NoBundleIsCompact(t *testing.T) { + p := BuildPrompt("rebuild evidence index", nil) + if !strings.Contains(p.User, "Task: rebuild evidence index") { + t.Errorf("user prompt missing task: %q", p.User) + } + if strings.Contains(p.User, "## Prior successful runs") { + t.Error("no-bundle prompt should not include retrieval headers") + } +} + +func TestBuildPrompt_WithBundleIncludesAllSections(t *testing.T) { + bundle := &ContextBundle{ + PriorSuccessfulOutputs: []RetrievedArtifact{{RagID: "a1", Title: "A1", ContentPreview: "verified", SuccessScore: "accepted"}}, + FailurePatterns: []RetrievedArtifact{{RagID: "p1", Title: "P1", ContentPreview: "partial result", SuccessScore: "partially_accepted"}}, + ValidationSteps: []string{"verify the build"}, + } + p := BuildPrompt("task X", bundle) + for _, marker := range []string{ + "## Prior successful runs", + "## Patterns that produced PARTIAL results", + "## Validation checklist", + "## Task", + "task X", + } { + if !strings.Contains(p.User, marker) { + t.Errorf("user prompt missing marker %q in:\n%s", marker, p.User) + } + } +} + +// ─── Validation gate ───────────────────────────────────────────── + +func TestValidateResponse_FailsOnEmptyAndShort(t *testing.T) { + if got := ValidateResponse("", nil); got.Passed { + t.Error("empty should fail") + } + if got := ValidateResponse("too short", nil); got.Passed { + t.Error("too-short should fail") + } +} + +func TestValidateResponse_FailsOnFiller(t *testing.T) { + resp := 
strings.Repeat("This is a real long response that meets the eighty character minimum for the gate. ", 2) + + " As an AI, I cannot help." + got := ValidateResponse(resp, nil) + if got.Passed { + t.Errorf("response with hedge phrase should fail, reasons=%v", got.Reasons) + } +} + +func TestValidateResponse_PassesWhenChecklistOverlaps(t *testing.T) { + bundle := &ContextBundle{ValidationSteps: []string{"verify the build is green"}} + resp := "I followed the procedure and verified that the build is green and tests passed before merging the change." + got := ValidateResponse(resp, bundle) + if !got.Passed { + t.Errorf("expected pass, got reasons=%v", got.Reasons) + } +} + +func TestValidateResponse_FailsWhenChecklistOrthogonal(t *testing.T) { + bundle := &ContextBundle{ValidationSteps: []string{"verify mango ripeness"}} + resp := "I followed completely unrelated steps about Quantum Tax compliance — I did not look at any fruit at all and that's the point." + got := ValidateResponse(resp, bundle) + if got.Passed { + t.Errorf("expected fail because no checklist token overlap, got pass") + } +} + +// ─── End-to-end (dry-run, no LLM) ──────────────────────────────── + +func TestReplay_DryRun_LogsResult(t *testing.T) { + root := t.TempDir() + mustWriteRagFixture(t, root, []RagSample{ + {ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge", + Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"}, + }) + + res, err := Replay(context.Background(), ReplayRequest{ + Task: "verify the build before merging", + DryRun: true, + }, root) + if err != nil { + t.Fatalf("Replay: %v", err) + } + if res.RecordedRunID == "" { + t.Error("expected recorded_run_id") + } + if !strings.HasPrefix(res.RecordedRunID, "replay:") { + t.Errorf("run_id shape: %s", res.RecordedRunID) + } + if res.ContextBundle == nil { + t.Fatal("expected retrieval to fire by default") + } + if len(res.ContextBundle.RetrievedPlaybooks) == 0 { + 
t.Errorf("expected at least one retrieved playbook") + } + + logPath := filepath.Join(root, "data/_kb/replay_runs.jsonl") + body, err := os.ReadFile(logPath) + if err != nil { + t.Fatalf("read log: %v", err) + } + var row map[string]any + if err := json.Unmarshal([]byte(strings.TrimSpace(string(body))), &row); err != nil { + t.Fatalf("parse log row: %v", err) + } + if row["schema"] != "replay_run.v1" { + t.Errorf("schema field: %v", row["schema"]) + } +} + +func TestReplay_NoRetrievalSkipsCorpus(t *testing.T) { + root := t.TempDir() + mustWriteRagFixture(t, root, []RagSample{ + {ID: "p1", Title: "would match", Content: "verify build assert", SuccessScore: "accepted"}, + }) + + res, err := Replay(context.Background(), ReplayRequest{ + Task: "verify build assert", + DryRun: true, + NoRetrieval: true, + }, root) + if err != nil { + t.Fatalf("Replay: %v", err) + } + if res.ContextBundle != nil { + t.Errorf("expected nil bundle in NoRetrieval mode") + } + if len(res.RetrievedArtifacts.RagIDs) != 0 { + t.Errorf("expected empty rag_ids, got %v", res.RetrievedArtifacts.RagIDs) + } +} + +func TestReplay_EscalationFiresOnFailedValidation(t *testing.T) { + root := t.TempDir() + // Trick: the dry-run synthesizer copies validation_steps verbatim + // into its output. If a checklist step contains a hedge phrase, the + // synthesized response will contain it too — triggering the + // filler-pattern guard in ValidateResponse and forcing escalation. 
+ mustWriteRagFixture(t, root, []RagSample{ + {ID: "p1", Title: "demo step", Content: "verify the build then i cannot proceed without approval", SuccessScore: "accepted"}, + }) + + res, err := Replay(context.Background(), ReplayRequest{ + Task: "verify the build then proceed", + DryRun: true, + AllowEscalation: true, + }, root) + if err != nil { + t.Fatalf("Replay: %v", err) + } + if len(res.EscalationPath) < 2 { + t.Errorf("expected escalation, path=%v reasons=%v", res.EscalationPath, res.ValidationResult.Reasons) + } + if !strings.Contains(res.ModelResponse, "[ESCALATED]") { + t.Errorf("expected escalated marker in response, got: %q", res.ModelResponse) + } +} + +func TestReplay_NoEscalationWhenValidationPasses(t *testing.T) { + root := t.TempDir() + mustWriteRagFixture(t, root, []RagSample{ + {ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge", + Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"}, + }) + + res, err := Replay(context.Background(), ReplayRequest{ + Task: "verify the build before merging", + DryRun: true, + AllowEscalation: true, + }, root) + if err != nil { + t.Fatalf("Replay: %v", err) + } + if len(res.EscalationPath) != 1 { + t.Errorf("expected single-step path on validation pass, got %v", res.EscalationPath) + } + if !res.ValidationResult.Passed { + t.Errorf("expected pass, got reasons=%v", res.ValidationResult.Reasons) + } +} + +// ─── Helpers ──────────────────────────────────────────────────── + +func mustWriteRagFixture(t *testing.T, root string, samples []RagSample) { + t.Helper() + path := filepath.Join(root, "exports/rag/playbooks.jsonl") + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + var buf strings.Builder + for _, s := range samples { + b, err := json.Marshal(s) + if err != nil { + t.Fatalf("marshal sample: %v", err) + } + buf.Write(b) + buf.WriteByte('\n') + } + if err := os.WriteFile(path, []byte(buf.String()), 
0o644); err != nil { + t.Fatalf("write fixture: %v", err) + } +} diff --git a/internal/replay/retrieval.go b/internal/replay/retrieval.go new file mode 100644 index 0000000..62e7575 --- /dev/null +++ b/internal/replay/retrieval.go @@ -0,0 +1,215 @@ +package replay + +import ( + "bufio" + "encoding/json" + "os" + "path/filepath" + "regexp" + "sort" + "strings" +) + +// tokenize lowercases and splits on non-[a-z0-9_] runs, keeping tokens +// of length ≥3. Matches replay.ts so retrieval scoring is consistent +// across runtimes. +func tokenize(text string) map[string]struct{} { + out := map[string]struct{}{} + if text == "" { + return out + } + lower := strings.ToLower(text) + var b strings.Builder + flush := func() { + if b.Len() >= 3 { + out[b.String()] = struct{}{} + } + b.Reset() + } + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_' { + b.WriteRune(r) + } else { + flush() + } + } + flush() + return out +} + +// jaccard returns |A ∩ B| / |A ∪ B| over token sets. +func jaccard(a, b map[string]struct{}) float64 { + if len(a) == 0 || len(b) == 0 { + return 0 + } + inter := 0 + for t := range a { + if _, ok := b[t]; ok { + inter++ + } + } + union := len(a) + len(b) - inter + if union == 0 { + return 0 + } + return float64(inter) / float64(union) +} + +// LoadRagCorpus reads `exports/rag/playbooks.jsonl` under root. +// Returns empty slice when the file is missing — callers fall back to +// a context-less prompt rather than failing. 
+func LoadRagCorpus(root string) ([]RagSample, error) { + path := filepath.Join(root, "exports", "rag", "playbooks.jsonl") + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, err + } + defer f.Close() + var corpus []RagSample + sc := bufio.NewScanner(f) + sc.Buffer(make([]byte, 0, 1<<16), 1<<24) + for sc.Scan() { + line := sc.Bytes() + if len(line) == 0 { + continue + } + var rec RagSample + if err := json.Unmarshal(line, &rec); err != nil { + continue // malformed line — skip, matches TS behavior + } + corpus = append(corpus, rec) + } + return corpus, sc.Err() +} + +// retrieveRag returns up to topK playbooks with non-zero overlap. +// Sorted by score descending. Matches replay.ts. +func retrieveRag(corpus []RagSample, task string, topK int) []RetrievedArtifact { + taskTokens := tokenize(task) + type scored struct { + rec RagSample + score float64 + } + all := make([]scored, 0, len(corpus)) + for _, r := range corpus { + text := r.Title + " " + r.Content + " " + strings.Join(r.Tags, " ") + all = append(all, scored{rec: r, score: jaccard(taskTokens, tokenize(text))}) + } + sort.SliceStable(all, func(i, j int) bool { return all[i].score > all[j].score }) + + out := make([]RetrievedArtifact, 0, topK) + for _, s := range all { + if len(out) >= topK { + break + } + if s.score <= 0 { + break + } + out = append(out, RetrievedArtifact{ + RagID: s.rec.ID, + SourceRunID: s.rec.SourceRunID, + Title: s.rec.Title, + ContentPreview: trim(s.rec.Content, 240), + SuccessScore: s.rec.SuccessScore, + Tags: tagsOrEmpty(s.rec.Tags), + Score: s.score, + }) + } + return out +} + +var validationLineRE = regexp.MustCompile(`(?i)^[-*]\s*(verify|check|assert|confirm|ensure)\b|^\s*(verify|check|assert|confirm|ensure)\s`) + +// extractValidationSteps pulls verify/check/assert/confirm/ensure +// lines from accepted samples. 
Used as a soft-anchor in the +// validation gate (response should touch at least one of these +// tokens) and surfaced into the prompt. +func extractValidationSteps(samples []RetrievedArtifact, corpus []RagSample) []string { + ids := map[string]struct{}{} + for _, s := range samples { + ids[s.RagID] = struct{}{} + } + var steps []string + for _, r := range corpus { + if _, ok := ids[r.ID]; !ok { + continue + } + for _, line := range strings.Split(r.Content, "\n") { + t := strings.TrimSpace(line) + if validationLineRE.MatchString(t) { + steps = append(steps, trim(t, 200)) + if len(steps) >= 6 { + return steps + } + } + } + } + return steps +} + +// BuildContextBundle assembles a ContextBundle from a corpus + task. +// Top 8 retrieved → split by success_score → at most 3 accepted, 2 +// warnings → extract validation steps → estimate token cost. +func BuildContextBundle(corpus []RagSample, task string) *ContextBundle { + top := retrieveRag(corpus, task, 8) + accepted := filterByScore(top, "accepted", 3) + warnings := filterByScore(top, "partially_accepted", 2) + steps := extractValidationSteps(accepted, corpus) + + totalChars := 0 + for _, r := range accepted { + totalChars += len(r.ContentPreview) + len(r.Title) + } + for _, r := range warnings { + totalChars += len(r.ContentPreview) + len(r.Title) + } + for _, s := range steps { + totalChars += len(s) + } + tokenEstimate := (totalChars + 3) / 4 // ceil(chars/4) + + return &ContextBundle{ + RetrievedPlaybooks: top, + PriorSuccessfulOutputs: accepted, + FailurePatterns: warnings, + ValidationSteps: stepsOrEmpty(steps), + BundleTokenEstimate: tokenEstimate, + } +} + +func filterByScore(arts []RetrievedArtifact, score string, max int) []RetrievedArtifact { + out := make([]RetrievedArtifact, 0, max) + for _, a := range arts { + if a.SuccessScore == score { + out = append(out, a) + if len(out) >= max { + break + } + } + } + return out +} + +func tagsOrEmpty(t []string) []string { + if t == nil { + return []string{} + } + 
return t +} + +func stepsOrEmpty(s []string) []string { + if s == nil { + return []string{} + } + return s +} + +func trim(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] +} diff --git a/internal/replay/types.go b/internal/replay/types.go new file mode 100644 index 0000000..9048323 --- /dev/null +++ b/internal/replay/types.go @@ -0,0 +1,98 @@ +// Package replay ports scripts/distillation/replay.ts to Go. +// +// Replay takes a task → retrieves matching playbooks/RAG records → +// builds a context bundle → calls a LOCAL model via the gateway's +// /v1/chat → validates → escalates to a stronger model if needed → +// logs the run as new evidence in `data/_kb/replay_runs.jsonl`. +// +// Spec invariants (carry over from replay.ts): +// - never bypass retrieval (unless caller passes NoRetrieval) +// - never discard provenance +// - never allow free-form hallucinated output (validation gate) +// - log every run as new evidence +// +// This is NOT training — it's runtime behavior shaping via retrieval. +package replay + +// ReplayRequest mirrors the TS interface. NoRetrieval skips the +// context bundle entirely (baseline mode for A/B tests). DryRun returns +// a deterministic synthetic response without calling the gateway — +// used by tests to exercise retrieval/validation without an LLM. +type ReplayRequest struct { + Task string + LocalOnly bool + AllowEscalation bool + NoRetrieval bool + DryRun bool + GatewayURL string // overrides $LH_GATEWAY_URL + LocalModel string // overrides default + EscalationModel string // overrides default +} + +// RagSample is one record in exports/rag/playbooks.jsonl. 
type RagSample struct {
	ID             string   `json:"id"`
	Title          string   `json:"title"`
	Content        string   `json:"content"` // scanned line by line ("\n"-separated) by extractValidationSteps
	Tags           []string `json:"tags"`
	SourceRunID    string   `json:"source_run_id"`
	SuccessScore   string   `json:"success_score"` // the smoke corpus uses "accepted" and "partially_accepted"
	SourceCategory string   `json:"source_category"`
}

// RetrievedArtifact is one playbook surfaced into a ContextBundle.
type RetrievedArtifact struct {
	RagID          string   `json:"rag_id"` // matched against RagSample.ID by extractValidationSteps
	SourceRunID    string   `json:"source_run_id"`
	Title          string   `json:"title"`
	ContentPreview string   `json:"content_preview"` // first 240 chars
	SuccessScore   string   `json:"success_score"`   // BuildContextBundle buckets on "accepted" / "partially_accepted"
	Tags           []string `json:"tags"`
	Score          float64  `json:"score"`
}

// ContextBundle is what the prompt builder consumes. Empty bundles
// (no retrieved playbooks) still pass through — buildPrompt downgrades
// to a no-context prompt when both accepted and warnings are empty.
//
// As populated by BuildContextBundle: RetrievedPlaybooks is the full
// retrieval result, PriorSuccessfulOutputs holds up to 3 "accepted"
// artifacts, FailurePatterns up to 2 "partially_accepted" ones, and
// BundleTokenEstimate is ceil(bytes/4) over their previews, titles and the
// validation steps.
type ContextBundle struct {
	RetrievedPlaybooks     []RetrievedArtifact `json:"retrieved_playbooks"`
	PriorSuccessfulOutputs []RetrievedArtifact `json:"prior_successful_outputs"`
	FailurePatterns        []RetrievedArtifact `json:"failure_patterns"`
	ValidationSteps        []string            `json:"validation_steps"`
	BundleTokenEstimate    int                 `json:"bundle_token_estimate"`
}

// ValidationResult is the deterministic gate's verdict. Reasons is
// always non-nil so JSON consumers can iterate without a nil check.
//
// NOTE(review): the non-nil guarantee comes from the constructor path
// (ValidateResponse routes through reasonsOrEmpty); a zero-value
// ValidationResult still carries a nil Reasons.
type ValidationResult struct {
	Passed  bool     `json:"passed"`
	Reasons []string `json:"reasons"`
}

// ReplayResult is what Replay returns. Mirrors the TS type one-to-one
// so JSONL emitted by either runtime parses identically.
type ReplayResult struct {
	InputTask          string           `json:"input_task"`
	TaskHash           string           `json:"task_hash"`
	RetrievedArtifacts RetrievedIDs     `json:"retrieved_artifacts"`
	ContextBundle      *ContextBundle   `json:"context_bundle"` // pointer: a nil bundle encodes as JSON null (presumably the NoRetrieval case — confirm in Replay)
	ModelResponse      string           `json:"model_response"`
	ModelUsed          string           `json:"model_used"`
	EscalationPath     []string         `json:"escalation_path"`
	ValidationResult   ValidationResult `json:"validation_result"`
	RecordedRunID      string           `json:"recorded_run_id"`
	RecordedAt         string           `json:"recorded_at"`
	DurationMs         int64            `json:"duration_ms"`
}

// RetrievedIDs is the {rag_ids} envelope the TS shape uses.
type RetrievedIDs struct {
	RagIDs []string `json:"rag_ids"`
}

// Defaults match replay.ts. Override via env or ReplayRequest fields.
const (
	DefaultLocalModel      = "qwen3.5:latest"
	DefaultEscalationModel = "deepseek-v3.1:671b"
	DefaultGatewayURL      = "http://localhost:3110"
)

// ---- internal/replay/validate.go (new file) ----

package replay

import (
	"fmt"
	"regexp"
	"strings"
)

// fillerPatterns are the hedge phrases the spec rejects. Compiled once
// per package — the gate runs on every replay call.
//
// Every pattern is case-insensitive ((?i)) and unanchored, so it matches
// anywhere in the response. ValidateResponse echoes the pattern source
// (re.String()) into the failure reason, so editing a pattern also changes
// the logged reason text.
//
// NOTE(review): there are no word boundaries, so `as an ai` also matches
// inside ordinary prose such as "has an aim". Confirm this mirrors
// replay.ts before tightening.
var fillerPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)as an ai`),
	regexp.MustCompile(`(?i)i cannot`),
	regexp.MustCompile(`(?i)i'?m sorry, but`),
	regexp.MustCompile(`(?i)i don'?t have access`),
	regexp.MustCompile(`(?i)i am unable to`),
}

// ValidateResponse runs the deterministic gate on a model response.
// Empty / too-short / hedge-bearing / context-disconnected responses
// fail. Matches replay.ts:validateResponse one-to-one.
+func ValidateResponse(response string, bundle *ContextBundle) ValidationResult { + trimmed := strings.TrimSpace(response) + var reasons []string + + if len(trimmed) == 0 { + return ValidationResult{Passed: false, Reasons: []string{"empty response"}} + } + if len(trimmed) < 80 { + reasons = append(reasons, fmt.Sprintf("response too short (%d chars; min 80)", len(trimmed))) + } + for _, re := range fillerPatterns { + if re.MatchString(trimmed) { + reasons = append(reasons, fmt.Sprintf("filler/hedge phrase detected: %s", re.String())) + } + } + // Soft anchor: if a validation checklist was supplied, the response + // should share at least one token with it (≥3 chars per tokenize()). + if bundle != nil && len(bundle.ValidationSteps) > 0 { + checklistTokens := map[string]struct{}{} + for _, s := range bundle.ValidationSteps { + for t := range tokenize(s) { + checklistTokens[t] = struct{}{} + } + } + respTokens := tokenize(trimmed) + overlap := 0 + for t := range checklistTokens { + if _, ok := respTokens[t]; ok { + overlap++ + } + } + if len(checklistTokens) > 0 && overlap == 0 { + reasons = append(reasons, "response shares no tokens with validation checklist (may not have followed prior patterns)") + } + } + + return ValidationResult{Passed: len(reasons) == 0, Reasons: reasonsOrEmpty(reasons)} +} + +func reasonsOrEmpty(r []string) []string { + if r == nil { + return []string{} + } + return r +} diff --git a/scripts/replay_smoke.sh b/scripts/replay_smoke.sh new file mode 100755 index 0000000..1274f2b --- /dev/null +++ b/scripts/replay_smoke.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# replay smoke — Go port of scripts/distillation/replay.ts. 
#!/usr/bin/env bash
# replay smoke — Go port of scripts/distillation/replay.ts.
# Validates that the replay tool:
#   - Builds a context bundle from a synthetic playbooks corpus
#   - Runs --dry-run end-to-end without an LLM
#   - Logs a row to data/_kb/replay_runs.jsonl with schema=replay_run.v1
#   - Honors --no-retrieval (no bundle, empty rag_ids)
#   - Escalates to the bigger model when validation fails
#
# NOTE: the exit status of ./bin/replay is deliberately ignored (`|| true`);
# each scenario is judged by grepping its captured output, so "exits
# non-zero when validation fails" is NOT asserted by this script.

set -euo pipefail
cd "$(dirname "$0")/.."

export PATH="$PATH:/usr/local/go/bin"

echo "[replay-smoke] building bin/replay..."
go build -o bin/replay ./cmd/replay

ROOT="$(mktemp -d)"
trap 'rm -rf "$ROOT"' EXIT INT TERM

# Captured output lives under the private temp root rather than at fixed
# /tmp paths: concurrent runs cannot clobber each other and the trap above
# removes the files on exit.
OUT_DIR="$ROOT/smoke_out"
mkdir -p "$OUT_DIR"
OUT_A="$OUT_DIR/replay_smoke_a.txt"
OUT_B="$OUT_DIR/replay_smoke_b.txt"
OUT_C="$OUT_DIR/replay_smoke_c.txt"

mkdir -p "$ROOT/exports/rag"
cat > "$ROOT/exports/rag/playbooks.jsonl" <<'EOF'
{"id":"p1","title":"build verification","content":"verify the build, check tests pass before merge\nensure no regressions in suites","tags":["scrum"],"source_run_id":"r-1","success_score":"accepted","source_category":"scrum_review"}
{"id":"p2","title":"merge cleanup","content":"verify the build, then assert tests passed, then merge","tags":["scrum"],"source_run_id":"r-2","success_score":"accepted","source_category":"scrum_review"}
{"id":"p3","title":"partial fix","content":"verify the build, sometimes assert tests passed","tags":["scrum"],"source_run_id":"r-3","success_score":"partially_accepted","source_category":"scrum_review"}
EOF

echo "[replay-smoke] dry-run (with retrieval)"
./bin/replay -task "verify the build before merging" -dry-run -root "$ROOT" > "$OUT_A" 2>&1 || true
grep -q "retrieval: " "$OUT_A" || {
  echo "missing retrieval line"; cat "$OUT_A"; exit 1;
}
grep -q "escalation_path: qwen3.5:latest" "$OUT_A" || {
  echo "missing escalation_path line"; cat "$OUT_A"; exit 1;
}

LOG="$ROOT/data/_kb/replay_runs.jsonl"
[ -s "$LOG" ] || { echo "expected $LOG to be written"; exit 1; }
grep -q "replay_run.v1" "$LOG" || {
  echo "schema=replay_run.v1 missing in log";
  cat "$LOG";
  exit 1;
}

echo "[replay-smoke] dry-run (no retrieval)"
./bin/replay -task "verify build" -dry-run -no-retrieval -root "$ROOT" > "$OUT_B" 2>&1 || true
grep -q "retrieval: DISABLED" "$OUT_B" || {
  echo "expected retrieval: DISABLED";
  cat "$OUT_B";
  exit 1;
}

LINES_BEFORE=$(wc -l < "$LOG")

echo "[replay-smoke] forced-fail with escalation"
# Force validation failure by putting a hedge phrase as the FIRST
# accepted sample's first verify line. extractValidationSteps walks
# corpus order, and the dry-run synthesizer surfaces the first 3 steps,
# so the hedge phrase needs to be in an early-corpus accepted sample.
cat > "$ROOT/exports/rag/playbooks.jsonl" <<'EOF'
{"id":"p9","title":"hedged step","content":"verify auth as an AI and proceed without checking","tags":["security"],"source_run_id":"r-9","success_score":"accepted","source_category":"audit"}
{"id":"p1","title":"build verification","content":"verify the build, check tests pass before merge","tags":["scrum"],"source_run_id":"r-1","success_score":"accepted","source_category":"scrum_review"}
EOF
./bin/replay -task "verify auth proceed" -dry-run -allow-escalation -root "$ROOT" > "$OUT_C" 2>&1 || true
grep -q "escalation_path: qwen3.5:latest → deepseek-v3.1:671b" "$OUT_C" || {
  echo "expected escalation path to deepseek when validation fails";
  cat "$OUT_C";
  exit 1;
}

# The escalation run must have appended at least one more row to the log.
LINES_AFTER=$(wc -l < "$LOG")
[ "$LINES_AFTER" -gt "$LINES_BEFORE" ] || {
  echo "expected log file to grow: before=$LINES_BEFORE after=$LINES_AFTER";
  exit 1;
}

echo "[replay-smoke] PASS"