Two threads landing together — the doc edits interleave so they ship in a single commit. 1. **vectord substrate fix verified at original scale** (closes the 2026-05-01 thread). Re-ran multitier 5min @ conc=50: 132,211 scenarios at 438/sec, 6/6 classes at 0% failure (was 4/6 pre-fix). Throughput dropped 1,115 → 438/sec because previously-broken scenarios now do real HNSW Add work — honest cost of correctness. The fix (i.vectors side-store + safeGraphAdd recover wrappers + smallIndexRebuildThreshold=32 + saveTask coalescing) holds at the footprint that originally surfaced the bug. 2. **Materializer port** — internal/materializer + cmd/materializer + scripts/materializer_smoke.sh. Ports scripts/distillation/transforms.ts (12 transforms) + build_evidence_index.ts (idempotency, day-partition, receipt). On-wire JSON shape matches TS so Bun and Go runs are interchangeable. 14 tests green. 3. **Replay port** — internal/replay + cmd/replay + scripts/replay_smoke.sh. Ports scripts/distillation/replay.ts (retrieve → bundle → /v1/chat → validate → log). Closes audit-FULL phase 7 live invocation on the Go side. Both runtimes append to the same data/_kb/replay_runs.jsonl (schema=replay_run.v1). 14 tests green. Side effect on internal/distillation/types.go: EvidenceRecord gained prompt_tokens, completion_tokens, and metadata fields to mirror the TS shape the materializer transforms produce. STATE_OF_PLAY refreshed to 2026-05-02; ARCHITECTURE_COMPARISON decisions tracker moves the materializer + replay items from _open_ to DONE and adds the substrate-fix scale verification row. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
132 lines
3.9 KiB
Go
132 lines
3.9 KiB
Go
package replay
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// callModelResult is what the gateway round-trip returns.
type callModelResult struct {
	// Content is the first choice's message content; empty when the
	// response carried no choices or the call failed.
	Content string
	// OK is true only when the HTTP round-trip and JSON decode both
	// succeeded.
	OK bool
	// Error is a short failure description (trimmed to keep logs
	// readable); empty when OK is true.
	Error string
}
|
|
|
|
// ModelCaller is the seam tests use to swap out HTTP. Production
// supplies httpModelCaller; tests can supply scripted responses.
//
// ctx bounds the request; model selects the LLM; system and user are
// the two chat messages posted to the gateway.
type ModelCaller func(ctx context.Context, model, system, user string) callModelResult
|
|
|
|
// httpModelCaller posts to ${gatewayURL}/v1/chat with provider derived
|
|
// from model name. Mirrors replay.ts:callModel.
|
|
func httpModelCaller(gatewayURL string) ModelCaller {
|
|
client := &http.Client{Timeout: 180 * time.Second}
|
|
return func(ctx context.Context, model, system, user string) callModelResult {
|
|
provider := inferProvider(model)
|
|
body, err := json.Marshal(map[string]any{
|
|
"provider": provider,
|
|
"model": model,
|
|
"messages": []map[string]string{
|
|
{"role": "system", "content": system},
|
|
{"role": "user", "content": user},
|
|
},
|
|
"max_tokens": 1500,
|
|
"temperature": 0.1,
|
|
})
|
|
if err != nil {
|
|
return callModelResult{Error: "marshal request: " + err.Error()}
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, "POST", gatewayURL+"/v1/chat", bytes.NewReader(body))
|
|
if err != nil {
|
|
return callModelResult{Error: "build request: " + err.Error()}
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return callModelResult{Error: trim(err.Error(), 240)}
|
|
}
|
|
defer resp.Body.Close()
|
|
buf, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode >= 400 {
|
|
return callModelResult{Error: fmt.Sprintf("HTTP %d: %s", resp.StatusCode, trim(string(buf), 240))}
|
|
}
|
|
var parsed struct {
|
|
Choices []struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
} `json:"message"`
|
|
} `json:"choices"`
|
|
}
|
|
if err := json.Unmarshal(buf, &parsed); err != nil {
|
|
return callModelResult{Error: "parse response: " + err.Error()}
|
|
}
|
|
content := ""
|
|
if len(parsed.Choices) > 0 {
|
|
content = parsed.Choices[0].Message.Content
|
|
}
|
|
return callModelResult{Content: content, OK: true}
|
|
}
|
|
}
|
|
|
|
// inferProvider picks the right /v1/chat provider for a given model
// name. Mirrors replay.ts:callModel's branching exactly so the gateway
// sees the same request shape regardless of caller runtime.
//
// "/" in name → openrouter
// kimi-/qwen3-coder/... → ollama_cloud
// else → ollama (local)
func inferProvider(model string) string {
	// Slash-qualified names (org/model) always route via OpenRouter.
	if strings.Contains(model, "/") {
		return "openrouter"
	}
	// Known hosted-model families go to ollama_cloud: some match by
	// prefix, two only by exact name.
	cloudPrefixes := []string{"kimi-", "qwen3-coder", "deepseek-v", "mistral-large"}
	for _, prefix := range cloudPrefixes {
		if strings.HasPrefix(model, prefix) {
			return "ollama_cloud"
		}
	}
	if model == "gpt-oss:120b" || model == "qwen3.5:397b" {
		return "ollama_cloud"
	}
	// Everything else is assumed to be a local Ollama model.
	return "ollama"
}
|
|
|
|
// dryRunSynthesize produces a deterministic synthetic response that
|
|
// echoes context-bundle signals. Used by tests + dry-run mode to
|
|
// exercise retrieval + validation without a live LLM.
|
|
func dryRunSynthesize(task string, bundle *ContextBundle) string {
|
|
parts := []string{
|
|
"Synthetic dry-run response for task: " + trim(task, 120),
|
|
"",
|
|
}
|
|
if bundle != nil {
|
|
parts = append(parts, fmt.Sprintf(
|
|
"Retrieved %d playbooks; %d accepted, %d partial.",
|
|
len(bundle.RetrievedPlaybooks),
|
|
len(bundle.PriorSuccessfulOutputs),
|
|
len(bundle.FailurePatterns),
|
|
))
|
|
if len(bundle.ValidationSteps) > 0 {
|
|
parts = append(parts, "Following validation checklist:")
|
|
for i, s := range bundle.ValidationSteps {
|
|
if i >= 3 {
|
|
break
|
|
}
|
|
parts = append(parts, "- "+s)
|
|
}
|
|
}
|
|
if len(bundle.PriorSuccessfulOutputs) > 0 {
|
|
parts = append(parts, "")
|
|
parts = append(parts, "Anchored on prior accepted: "+bundle.PriorSuccessfulOutputs[0].Title)
|
|
}
|
|
} else {
|
|
parts = append(parts, "No retrieval context — answering from task alone. Verify and check produced output before approving.")
|
|
}
|
|
return strings.Join(parts, "\n")
|
|
}
|