root 89ca72d471 materializer + replay ports + vectord substrate fix verified at scale
Three changes landing together (the substrate-fix verification plus two
ports) — the doc edits interleave, so they ship in a single commit.

1. **vectord substrate fix verified at original scale** (closes the
   2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211
   scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix).
   Throughput dropped 1,115 → 438/sec because previously-broken
   scenarios now do real HNSW Add work — honest cost of correctness.
   The fix (i.vectors side-store + safeGraphAdd recover wrappers +
   smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the
   footprint that originally surfaced the bug.

2. **Materializer port** — internal/materializer + cmd/materializer +
   scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts
   (12 transforms) + build_evidence_index.ts (idempotency, day-partition,
   receipt). On-wire JSON shape matches TS so Bun and Go runs are
   interchangeable. 14 tests green.

3. **Replay port** — internal/replay + cmd/replay +
   scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts
   (retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL
   phase 7 live invocation on the Go side. Both runtimes append to the
   same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green.

Side effect on internal/distillation/types.go: EvidenceRecord gained
prompt_tokens, completion_tokens, and metadata fields to mirror the TS
shape the materializer transforms produce.

STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions
tracker moves the materializer + replay items from _open_ to DONE and
adds the substrate-fix scale verification row.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 03:31:02 -05:00

284 lines
9.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package replay
import (
"context"
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
)
// ─── Tokenization + retrieval primitives ───────────────────────────
// TestTokenize_FiltersShortAndLowercase checks that tokenize yields the
// lower-case spelling of each multi-character word in a mixed-case,
// punctuated input, and that the single-character word "a" is absent.
func TestTokenize_FiltersShortAndLowercase(t *testing.T) {
	tokens := tokenize("Hello, World! Foo BAR baz x12 a")

	// The expected tokens are all spelled in lower case, while several of
	// the input words are not.
	for _, tok := range []string{"hello", "world", "foo", "bar", "baz", "x12"} {
		if _, found := tokens[tok]; !found {
			t.Errorf("missing token %q", tok)
		}
	}

	if _, found := tokens["a"]; found {
		t.Errorf("len=1 token should be filtered: a")
	}
}
// TestJaccard_EdgeCases pins jaccard on two partially overlapping sets and
// on an empty left-hand set.
func TestJaccard_EdgeCases(t *testing.T) {
	left := map[string]struct{}{"x": {}, "y": {}, "z": {}}
	right := map[string]struct{}{"y": {}, "z": {}, "w": {}}

	// |A∩B| = 2 (y, z) and |A∪B| = 4 (x, y, z, w), so the score is 2/4.
	want := 2.0 / 4.0
	if got := jaccard(left, right); got != want {
		t.Errorf("jaccard = %v, want %v", got, want)
	}

	empty := map[string]struct{}{}
	if jaccard(empty, right) != 0 {
		t.Error("empty set should produce 0")
	}
}
// ─── Retrieval ───────────────────────────────────────────────────
// TestRetrieveRag_ScoresAndCaps runs retrieveRag over a three-sample corpus
// and requires a non-empty result that does not include "p2", the sample
// whose title, content and tags share no words with the query.
func TestRetrieveRag_ScoresAndCaps(t *testing.T) {
	samples := []RagSample{
		{ID: "p1", Title: "validate scrum", Content: "verify the build, check tests", Tags: []string{"scrum"}, SuccessScore: "accepted"},
		{ID: "p2", Title: "irrelevant cooking notes", Content: "boil pasta longer than ten minutes", Tags: []string{"food"}, SuccessScore: "accepted"},
		{ID: "p3", Title: "build verification ladder", Content: "verify build steps, assert green", Tags: []string{"build"}, SuccessScore: "partially_accepted"},
	}

	// NOTE(review): the trailing 3 is presumably the result cap — confirm
	// against retrieveRag's signature.
	hits := retrieveRag(samples, "verify the build assert green", 3)
	if len(hits) == 0 {
		t.Fatal("expected at least one result")
	}

	for i := range hits {
		if hits[i].RagID != "p2" {
			continue
		}
		t.Errorf("irrelevant sample p2 should not surface, got: %+v", hits)
	}
}
// TestBuildContextBundle_SplitsAcceptedAndPartial builds a bundle from one
// "accepted" and one "partially_accepted" sample and checks that each ends
// up in its own bucket (PriorSuccessfulOutputs vs FailurePatterns) and that
// at least one validation step is produced.
func TestBuildContextBundle_SplitsAcceptedAndPartial(t *testing.T) {
	samples := []RagSample{
		{ID: "a1", Title: "A1", Content: "verify build assert green check tests", SuccessScore: "accepted"},
		{ID: "p1", Title: "P1", Content: "verify build sometimes fails to assert", SuccessScore: "partially_accepted"},
	}

	bundle := BuildContextBundle(samples, "verify build assert tests")
	if bundle == nil {
		// Nothing below can be inspected without a bundle.
		t.Fatal("nil bundle")
	}

	accepted := bundle.PriorSuccessfulOutputs
	if len(accepted) != 1 || accepted[0].RagID != "a1" {
		t.Errorf("accepted bucket wrong: %+v", accepted)
	}

	partial := bundle.FailurePatterns
	if len(partial) != 1 || partial[0].RagID != "p1" {
		t.Errorf("partially_accepted bucket wrong: %+v", partial)
	}

	if len(bundle.ValidationSteps) == 0 {
		t.Errorf("expected validation_steps from accepted sample, got none")
	}
}
// ─── Prompt assembly ─────────────────────────────────────────────
// TestBuildPrompt_NoBundleIsCompact checks that a prompt built with a nil
// bundle still carries the task line but none of the retrieval headers.
func TestBuildPrompt_NoBundleIsCompact(t *testing.T) {
	user := BuildPrompt("rebuild evidence index", nil).User

	if !strings.Contains(user, "Task: rebuild evidence index") {
		t.Errorf("user prompt missing task: %q", user)
	}

	hasRetrievalHeader := strings.Contains(user, "## Prior successful runs")
	if hasRetrievalHeader {
		t.Error("no-bundle prompt should not include retrieval headers")
	}
}
func TestBuildPrompt_WithBundleIncludesAllSections(t *testing.T) {
bundle := &ContextBundle{
PriorSuccessfulOutputs: []RetrievedArtifact{{RagID: "a1", Title: "A1", ContentPreview: "verified", SuccessScore: "accepted"}},
FailurePatterns: []RetrievedArtifact{{RagID: "p1", Title: "P1", ContentPreview: "partial result", SuccessScore: "partially_accepted"}},
ValidationSteps: []string{"verify the build"},
}
p := BuildPrompt("task X", bundle)
for _, marker := range []string{
"## Prior successful runs",
"## Patterns that produced PARTIAL results",
"## Validation checklist",
"## Task",
"task X",
} {
if !strings.Contains(p.User, marker) {
t.Errorf("user prompt missing marker %q in:\n%s", marker, p.User)
}
}
}
// ─── Validation gate ─────────────────────────────────────────────
// TestValidateResponse_FailsOnEmptyAndShort checks that ValidateResponse,
// called without a bundle, rejects both an empty response and a very short
// one.
func TestValidateResponse_FailsOnEmptyAndShort(t *testing.T) {
	cases := []struct {
		resp    string // response handed to the gate
		failMsg string // reported when the gate wrongly passes
	}{
		{resp: "", failMsg: "empty should fail"},
		{resp: "too short", failMsg: "too-short should fail"},
	}

	for _, tc := range cases {
		if ValidateResponse(tc.resp, nil).Passed {
			t.Error(tc.failMsg)
		}
	}
}
// TestValidateResponse_FailsOnFiller checks that a response ending in the
// hedge phrase "As an AI, I cannot help." is rejected. The hedge follows
// two copies of a sentence intended (per its own wording) to satisfy the
// gate's minimum-length requirement.
func TestValidateResponse_FailsOnFiller(t *testing.T) {
	const sentence = "This is a real long response that meets the eighty character minimum for the gate. "
	resp := strings.Repeat(sentence, 2) + " As an AI, I cannot help."

	if verdict := ValidateResponse(resp, nil); verdict.Passed {
		t.Errorf("response with hedge phrase should fail, reasons=%v", verdict.Reasons)
	}
}
func TestValidateResponse_PassesWhenChecklistOverlaps(t *testing.T) {
bundle := &ContextBundle{ValidationSteps: []string{"verify the build is green"}}
resp := "I followed the procedure and verified that the build is green and tests passed before merging the change."
got := ValidateResponse(resp, bundle)
if !got.Passed {
t.Errorf("expected pass, got reasons=%v", got.Reasons)
}
}
func TestValidateResponse_FailsWhenChecklistOrthogonal(t *testing.T) {
bundle := &ContextBundle{ValidationSteps: []string{"verify mango ripeness"}}
resp := "I followed completely unrelated steps about Quantum Tax compliance — I did not look at any fruit at all and that's the point."
got := ValidateResponse(resp, bundle)
if got.Passed {
t.Errorf("expected fail because no checklist token overlap, got pass")
}
}
// ─── End-to-end (dry-run, no LLM) ────────────────────────────────
func TestReplay_DryRun_LogsResult(t *testing.T) {
root := t.TempDir()
mustWriteRagFixture(t, root, []RagSample{
{ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
})
res, err := Replay(context.Background(), ReplayRequest{
Task: "verify the build before merging",
DryRun: true,
}, root)
if err != nil {
t.Fatalf("Replay: %v", err)
}
if res.RecordedRunID == "" {
t.Error("expected recorded_run_id")
}
if !strings.HasPrefix(res.RecordedRunID, "replay:") {
t.Errorf("run_id shape: %s", res.RecordedRunID)
}
if res.ContextBundle == nil {
t.Fatal("expected retrieval to fire by default")
}
if len(res.ContextBundle.RetrievedPlaybooks) == 0 {
t.Errorf("expected at least one retrieved playbook")
}
logPath := filepath.Join(root, "data/_kb/replay_runs.jsonl")
body, err := os.ReadFile(logPath)
if err != nil {
t.Fatalf("read log: %v", err)
}
var row map[string]any
if err := json.Unmarshal([]byte(strings.TrimSpace(string(body))), &row); err != nil {
t.Fatalf("parse log row: %v", err)
}
if row["schema"] != "replay_run.v1" {
t.Errorf("schema field: %v", row["schema"])
}
}
func TestReplay_NoRetrievalSkipsCorpus(t *testing.T) {
root := t.TempDir()
mustWriteRagFixture(t, root, []RagSample{
{ID: "p1", Title: "would match", Content: "verify build assert", SuccessScore: "accepted"},
})
res, err := Replay(context.Background(), ReplayRequest{
Task: "verify build assert",
DryRun: true,
NoRetrieval: true,
}, root)
if err != nil {
t.Fatalf("Replay: %v", err)
}
if res.ContextBundle != nil {
t.Errorf("expected nil bundle in NoRetrieval mode")
}
if len(res.RetrievedArtifacts.RagIDs) != 0 {
t.Errorf("expected empty rag_ids, got %v", res.RetrievedArtifacts.RagIDs)
}
}
func TestReplay_EscalationFiresOnFailedValidation(t *testing.T) {
root := t.TempDir()
// Trick: the dry-run synthesizer copies validation_steps verbatim
// into its output. If a checklist step contains a hedge phrase, the
// synthesized response will contain it too — triggering the
// filler-pattern guard in ValidateResponse and forcing escalation.
mustWriteRagFixture(t, root, []RagSample{
{ID: "p1", Title: "demo step", Content: "verify the build then i cannot proceed without approval", SuccessScore: "accepted"},
})
res, err := Replay(context.Background(), ReplayRequest{
Task: "verify the build then proceed",
DryRun: true,
AllowEscalation: true,
}, root)
if err != nil {
t.Fatalf("Replay: %v", err)
}
if len(res.EscalationPath) < 2 {
t.Errorf("expected escalation, path=%v reasons=%v", res.EscalationPath, res.ValidationResult.Reasons)
}
if !strings.Contains(res.ModelResponse, "[ESCALATED]") {
t.Errorf("expected escalated marker in response, got: %q", res.ModelResponse)
}
}
func TestReplay_NoEscalationWhenValidationPasses(t *testing.T) {
root := t.TempDir()
mustWriteRagFixture(t, root, []RagSample{
{ID: "p1", Title: "build verification", Content: "verify the build, check tests pass before merge",
Tags: []string{"scrum"}, SuccessScore: "accepted", SourceRunID: "r-1"},
})
res, err := Replay(context.Background(), ReplayRequest{
Task: "verify the build before merging",
DryRun: true,
AllowEscalation: true,
}, root)
if err != nil {
t.Fatalf("Replay: %v", err)
}
if len(res.EscalationPath) != 1 {
t.Errorf("expected single-step path on validation pass, got %v", res.EscalationPath)
}
if !res.ValidationResult.Passed {
t.Errorf("expected pass, got reasons=%v", res.ValidationResult.Reasons)
}
}
// ─── Helpers ────────────────────────────────────────────────────
// mustWriteRagFixture writes samples as JSON Lines (one JSON object per
// line) to exports/rag/playbooks.jsonl under root, creating the parent
// directories first. Any failure aborts the calling test via t.Fatalf.
func mustWriteRagFixture(t *testing.T, root string, samples []RagSample) {
	t.Helper()

	target := filepath.Join(root, "exports/rag/playbooks.jsonl")
	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		t.Fatalf("mkdir: %v", err)
	}

	// Encoder.Encode emits the JSON encoding followed by a newline, which
	// is exactly the one-record-per-line layout wanted here.
	var out strings.Builder
	enc := json.NewEncoder(&out)
	for i := range samples {
		if err := enc.Encode(samples[i]); err != nil {
			t.Fatalf("marshal sample: %v", err)
		}
	}

	if err := os.WriteFile(target, []byte(out.String()), 0o644); err != nil {
		t.Fatalf("write fixture: %v", err)
	}
}