Two threads landing together — the doc edits interleave so they ship in a single commit. 1. **vectord substrate fix verified at original scale** (closes the 2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211 scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix). Throughput dropped 1,115 → 438/sec because previously-broken scenarios now do real HNSW Add work — honest cost of correctness. The fix (i.vectors side-store + safeGraphAdd recover wrappers + smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the footprint that originally surfaced the bug. 2. **Materializer port** — internal/materializer + cmd/materializer + scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts (12 transforms) + build_evidence_index.ts (idempotency, day-partition, receipt). On-wire JSON shape matches TS so Bun and Go runs are interchangeable. 14 tests green. 3. **Replay port** — internal/replay + cmd/replay + scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts (retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL phase 7 live invocation on the Go side. Both runtimes append to the same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green. Side effect on internal/distillation/types.go: EvidenceRecord gained prompt_tokens, completion_tokens, and metadata fields to mirror the TS shape the materializer transforms produce. STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions tracker moves the materializer + replay items from _open_ to DONE and adds the substrate-fix scale verification row. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
284 lines
9.9 KiB
Go
284 lines
9.9 KiB
Go
package replay
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"os"
|
||
"path/filepath"
|
||
"strings"
|
||
"testing"
|
||
)
|
||
|
||
// ─── Tokenization + retrieval primitives ───────────────────────────
|
||
|
||
func TestTokenize_FiltersShortAndLowercase(t *testing.T) {
|
||
got := tokenize("Hello, World! Foo BAR baz x12 a")
|
||
want := map[string]bool{"hello": true, "world": true, "foo": true, "bar": true, "baz": true, "x12": true}
|
||
for k := range want {
|
||
if _, ok := got[k]; !ok {
|
||
t.Errorf("missing token %q", k)
|
||
}
|
||
}
|
||
if _, ok := got["a"]; ok {
|
||
t.Errorf("len=1 token should be filtered: a")
|
||
}
|
||
}
|
||
|
||
func TestJaccard_EdgeCases(t *testing.T) {
|
||
a := map[string]struct{}{"x": {}, "y": {}, "z": {}}
|
||
b := map[string]struct{}{"y": {}, "z": {}, "w": {}}
|
||
got := jaccard(a, b)
|
||
want := 2.0 / 4.0 // |A∩B|=2 (y,z); |A∪B|=4 (x,y,z,w)
|
||
if got != want {
|
||
t.Errorf("jaccard = %v, want %v", got, want)
|
||
}
|
||
if jaccard(map[string]struct{}{}, b) != 0 {
|
||
t.Error("empty set should produce 0")
|
||
}
|
||
}
|
||
|
||
// ─── Retrieval ───────────────────────────────────────────────────
|
||
|
||
func TestRetrieveRag_ScoresAndCaps(t *testing.T) {
|
||
corpus := []RagSample{
|
||
{ID: "p1", Title: "validate scrum", Content: "verify the build, check tests", Tags: []string{"scrum"}, SuccessScore: "accepted"},
|
||
{ID: "p2", Title: "irrelevant cooking notes", Content: "boil pasta longer than ten minutes", Tags: []string{"food"}, SuccessScore: "accepted"},
|
||
{ID: "p3", Title: "build verification ladder", Content: "verify build steps, assert green", Tags: []string{"build"}, SuccessScore: "partially_accepted"},
|
||
}
|
||
got := retrieveRag(corpus, "verify the build assert green", 3)
|
||
if len(got) == 0 {
|
||
t.Fatal("expected at least one result")
|
||
}
|
||
for _, a := range got {
|
||
if a.RagID == "p2" {
|
||
t.Errorf("irrelevant sample p2 should not surface, got: %+v", got)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestBuildContextBundle_SplitsAcceptedAndPartial(t *testing.T) {
|
||
corpus := []RagSample{
|
||
{ID: "a1", Title: "A1", Content: "verify build assert green check tests", SuccessScore: "accepted"},
|
||
{ID: "p1", Title: "P1", Content: "verify build sometimes fails to assert", SuccessScore: "partially_accepted"},
|
||
}
|
||
b := BuildContextBundle(corpus, "verify build assert tests")
|
||
if b == nil {
|
||
t.Fatal("nil bundle")
|
||
}
|
||
if len(b.PriorSuccessfulOutputs) != 1 || b.PriorSuccessfulOutputs[0].RagID != "a1" {
|
||
t.Errorf("accepted bucket wrong: %+v", b.PriorSuccessfulOutputs)
|
||
}
|
||
if len(b.FailurePatterns) != 1 || b.FailurePatterns[0].RagID != "p1" {
|
||
t.Errorf("partially_accepted bucket wrong: %+v", b.FailurePatterns)
|
||
}
|
||
if len(b.ValidationSteps) == 0 {
|
||
t.Errorf("expected validation_steps from accepted sample, got none")
|
||
}
|
||
}
|
||
|
||
// ─── Prompt assembly ─────────────────────────────────────────────
|
||
|
||
func TestBuildPrompt_NoBundleIsCompact(t *testing.T) {
|
||
p := BuildPrompt("rebuild evidence index", nil)
|
||
if !strings.Contains(p.User, "Task: rebuild evidence index") {
|
||
t.Errorf("user prompt missing task: %q", p.User)
|
||
}
|
||
if strings.Contains(p.User, "## Prior successful runs") {
|
||
t.Error("no-bundle prompt should not include retrieval headers")
|
||
}
|
||
}
|
||
|
||
func TestBuildPrompt_WithBundleIncludesAllSections(t *testing.T) {
|
||
bundle := &ContextBundle{
|
||
PriorSuccessfulOutputs: []RetrievedArtifact{{RagID: "a1", Title: "A1", ContentPreview: "verified", SuccessScore: "accepted"}},
|
||
FailurePatterns: []RetrievedArtifact{{RagID: "p1", Title: "P1", ContentPreview: "partial result", SuccessScore: "partially_accepted"}},
|
||
ValidationSteps: []string{"verify the build"},
|
||
}
|
||
p := BuildPrompt("task X", bundle)
|
||
for _, marker := range []string{
|
||
"## Prior successful runs",
|
||
"## Patterns that produced PARTIAL results",
|
||
"## Validation checklist",
|
||
"## Task",
|
||
"task X",
|
||
} {
|
||
if !strings.Contains(p.User, marker) {
|
||
t.Errorf("user prompt missing marker %q in:\n%s", marker, p.User)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── Validation gate ─────────────────────────────────────────────
|
||
|
||
func TestValidateResponse_FailsOnEmptyAndShort(t *testing.T) {
|
||
if got := ValidateResponse("", nil); got.Passed {
|
||
t.Error("empty should fail")
|
||
}
|
||
if got := ValidateResponse("too short", nil); got.Passed {
|
||
t.Error("too-short should fail")
|
||
}
|
||
}
|
||
|
||
func TestValidateResponse_FailsOnFiller(t *testing.T) {
|
||
resp := strings.Repeat("This is a real long response that meets the eighty character minimum for the gate. ", 2) +
|
||
" As an AI, I cannot help."
|
||
got := ValidateResponse(resp, nil)
|
||
if got.Passed {
|
||
t.Errorf("response with hedge phrase should fail, reasons=%v", got.Reasons)
|
||
}
|
||
}
|
||
|
||
func TestValidateResponse_PassesWhenChecklistOverlaps(t *testing.T) {
|
||
bundle := &ContextBundle{ValidationSteps: []string{"verify the build is green"}}
|
||
resp := "I followed the procedure and verified that the build is green and tests passed before merging the change."
|
||
got := ValidateResponse(resp, bundle)
|
||
if !got.Passed {
|
||
t.Errorf("expected pass, got reasons=%v", got.Reasons)
|
||
}
|
||
}
|
||
|
||
func TestValidateResponse_FailsWhenChecklistOrthogonal(t *testing.T) {
|
||
bundle := &ContextBundle{ValidationSteps: []string{"verify mango ripeness"}}
|
||
resp := "I followed completely unrelated steps about Quantum Tax compliance — I did not look at any fruit at all and that's the point."
|
||
got := ValidateResponse(resp, bundle)
|
||
if got.Passed {
|
||
t.Errorf("expected fail because no checklist token overlap, got pass")
|
||
}
|
||
}
|
||
|
||
// ─── End-to-end (dry-run, no LLM) ────────────────────────────────
|
||
|
||
func TestReplay_DryRun_LogsResult(t *testing.T) {
|
||
root := t.TempDir()
|
||
mustWriteRagFixture(t, root, []RagSample{
|
||
{ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
|
||
Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
|
||
})
|
||
|
||
res, err := Replay(context.Background(), ReplayRequest{
|
||
Task: "verify the build before merging",
|
||
DryRun: true,
|
||
}, root)
|
||
if err != nil {
|
||
t.Fatalf("Replay: %v", err)
|
||
}
|
||
if res.RecordedRunID == "" {
|
||
t.Error("expected recorded_run_id")
|
||
}
|
||
if !strings.HasPrefix(res.RecordedRunID, "replay:") {
|
||
t.Errorf("run_id shape: %s", res.RecordedRunID)
|
||
}
|
||
if res.ContextBundle == nil {
|
||
t.Fatal("expected retrieval to fire by default")
|
||
}
|
||
if len(res.ContextBundle.RetrievedPlaybooks) == 0 {
|
||
t.Errorf("expected at least one retrieved playbook")
|
||
}
|
||
|
||
logPath := filepath.Join(root, "data/_kb/replay_runs.jsonl")
|
||
body, err := os.ReadFile(logPath)
|
||
if err != nil {
|
||
t.Fatalf("read log: %v", err)
|
||
}
|
||
var row map[string]any
|
||
if err := json.Unmarshal([]byte(strings.TrimSpace(string(body))), &row); err != nil {
|
||
t.Fatalf("parse log row: %v", err)
|
||
}
|
||
if row["schema"] != "replay_run.v1" {
|
||
t.Errorf("schema field: %v", row["schema"])
|
||
}
|
||
}
|
||
|
||
func TestReplay_NoRetrievalSkipsCorpus(t *testing.T) {
|
||
root := t.TempDir()
|
||
mustWriteRagFixture(t, root, []RagSample{
|
||
{ID: "p1", Title: "would match", Content: "verify build assert", SuccessScore: "accepted"},
|
||
})
|
||
|
||
res, err := Replay(context.Background(), ReplayRequest{
|
||
Task: "verify build assert",
|
||
DryRun: true,
|
||
NoRetrieval: true,
|
||
}, root)
|
||
if err != nil {
|
||
t.Fatalf("Replay: %v", err)
|
||
}
|
||
if res.ContextBundle != nil {
|
||
t.Errorf("expected nil bundle in NoRetrieval mode")
|
||
}
|
||
if len(res.RetrievedArtifacts.RagIDs) != 0 {
|
||
t.Errorf("expected empty rag_ids, got %v", res.RetrievedArtifacts.RagIDs)
|
||
}
|
||
}
|
||
|
||
func TestReplay_EscalationFiresOnFailedValidation(t *testing.T) {
|
||
root := t.TempDir()
|
||
// Trick: the dry-run synthesizer copies validation_steps verbatim
|
||
// into its output. If a checklist step contains a hedge phrase, the
|
||
// synthesized response will contain it too — triggering the
|
||
// filler-pattern guard in ValidateResponse and forcing escalation.
|
||
mustWriteRagFixture(t, root, []RagSample{
|
||
{ID: "p1", Title: "demo step", Content: "verify the build then i cannot proceed without approval", SuccessScore: "accepted"},
|
||
})
|
||
|
||
res, err := Replay(context.Background(), ReplayRequest{
|
||
Task: "verify the build then proceed",
|
||
DryRun: true,
|
||
AllowEscalation: true,
|
||
}, root)
|
||
if err != nil {
|
||
t.Fatalf("Replay: %v", err)
|
||
}
|
||
if len(res.EscalationPath) < 2 {
|
||
t.Errorf("expected escalation, path=%v reasons=%v", res.EscalationPath, res.ValidationResult.Reasons)
|
||
}
|
||
if !strings.Contains(res.ModelResponse, "[ESCALATED]") {
|
||
t.Errorf("expected escalated marker in response, got: %q", res.ModelResponse)
|
||
}
|
||
}
|
||
|
||
func TestReplay_NoEscalationWhenValidationPasses(t *testing.T) {
|
||
root := t.TempDir()
|
||
mustWriteRagFixture(t, root, []RagSample{
|
||
{ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
|
||
Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
|
||
})
|
||
|
||
res, err := Replay(context.Background(), ReplayRequest{
|
||
Task: "verify the build before merging",
|
||
DryRun: true,
|
||
AllowEscalation: true,
|
||
}, root)
|
||
if err != nil {
|
||
t.Fatalf("Replay: %v", err)
|
||
}
|
||
if len(res.EscalationPath) != 1 {
|
||
t.Errorf("expected single-step path on validation pass, got %v", res.EscalationPath)
|
||
}
|
||
if !res.ValidationResult.Passed {
|
||
t.Errorf("expected pass, got reasons=%v", res.ValidationResult.Reasons)
|
||
}
|
||
}
|
||
|
||
// ─── Helpers ────────────────────────────────────────────────────
|
||
|
||
func mustWriteRagFixture(t *testing.T, root string, samples []RagSample) {
|
||
t.Helper()
|
||
path := filepath.Join(root, "exports/rag/playbooks.jsonl")
|
||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||
t.Fatalf("mkdir: %v", err)
|
||
}
|
||
var buf strings.Builder
|
||
for _, s := range samples {
|
||
b, err := json.Marshal(s)
|
||
if err != nil {
|
||
t.Fatalf("marshal sample: %v", err)
|
||
}
|
||
buf.Write(b)
|
||
buf.WriteByte('\n')
|
||
}
|
||
if err := os.WriteFile(path, []byte(buf.String()), 0o644); err != nil {
|
||
t.Fatalf("write fixture: %v", err)
|
||
}
|
||
}
|