Closes J's 2026-05-02 multi-call observability gap: a single
/v1/iterate session with N retries used to surface in Langfuse as
N+1 disconnected traces (one per /v1/chat hop + one for the iterate
request itself), with no parent/child linkage. Operators couldn't
scroll the retry chain in one trace tree to spot where grounding
failed.
## Wire-level change
- New header constant `shared.TraceIDHeader = "X-Lakehouse-Trace-Id"`
- `langfuseMiddleware` honors the header on inbound requests: if
set, reuses that trace id instead of minting a new one. Stashes
the trace id on the request context so handlers can attach
application-level child spans.
- `validatord.chatCaller` forwards the header to chatd. Every chat
hop in an iterate session lands as a child of the parent trace.
## Application-level spans
- `validator.IterateConfig` gains `Tracer` (optional callback).
When wired, each iteration attempt emits one Langfuse span
via `validator.AttemptSpan`:
Name: iterate.attempt[N]
Input: { iteration, model, provider, prompt }
Output: { verdict, raw, error }
Level: WARNING when verdict != accepted
- `validatord.iterTracer` is the production hook — bridges
`validator.Tracer` → `langfuse.Client.Span`.
- `IterateRequest`/`IterateResponse`/`IterateFailure` gain
`TraceID`; each `IterateAttempt` gains `SpanID`. The /v1/iterate
caller can pivot from the JSON response straight into the
Langfuse trace tree.
## What an operator sees post-cutover
GET /v1/iterate {kind=fill, prompt=...} → Trace TR-1
├─ http.request span (from middleware)
├─ iterate.attempt[0] span (validator.Iterate emit)
│ input: prompt+model
│ output: { verdict: validation_failed, error: ..., raw }
├─ chatd /v1/chat call (X-Lakehouse-Trace-Id: TR-1)
│ ├─ http.request span (chatd middleware)
│ └─ chatd-internal spans (existing)
├─ iterate.attempt[1] span
└─ ...
All in one Langfuse trace tree, not N+1 separate traces.
## Hallucinated-worker safety net is unchanged
The /v1/iterate flow's hard correctness gate is still
FillValidator + WorkerLookup. Phantom candidate IDs raise
ValidationError::Consistency which 422s and forces the iteration
loop to retry. The trace-id propagation is the OBSERVABILITY layer
on top — it makes the existing safety net's outcomes visible per-call,
not a replacement for it.
## Verification
- internal/validator: 4 new tests
- TestIterate_TracerEmitsSpanPerAttempt — span/attempt count + SpanID
- TestIterate_NoTraceIDSkipsTracer — no orphan spans without trace_id
- TestIterate_ChatCallerReceivesTraceID — propagation contract
- (existing iterate tests updated for new ChatCaller signature)
- internal/shared: 1 new test
- TestLangfuseMiddleware_HonorsTraceIDHeader — cross-service linkage
- cmd/validatord: existing HTTP tests still PASS via the dual-shape
UnmarshalJSON contract.
- validatord_smoke.sh: 5/5 PASS through gateway :3110 (unchanged).
- Full go test ./... green across 33 packages.
## Architecture invariant added
STATE_OF_PLAY "DO NOT RELITIGATE" gains a paragraph documenting
the X-Lakehouse-Trace-Id header contract + the iterate.attempt[N]
span emission. Future-Claude won't re-propose "wire trace-id
propagation" — the header IS the wiring.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
279 lines
9.0 KiB
Go
279 lines
9.0 KiB
Go
package validator
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"testing"
|
|
)
|
|
|
|
func TestExtractJSON_FromFencedBlock(t *testing.T) {
|
|
raw := "Here's my answer:\n```json\n{\"fills\": [{\"candidate_id\": \"W-1\"}]}\n```\nDone."
|
|
v := ExtractJSON(raw)
|
|
if v == nil {
|
|
t.Fatal("expected match in fenced block")
|
|
}
|
|
if _, ok := v["fills"]; !ok {
|
|
t.Errorf("missing fills key: %+v", v)
|
|
}
|
|
}
|
|
|
|
func TestExtractJSON_FromBareBraces(t *testing.T) {
|
|
raw := "Here you go: {\"fills\": [{\"candidate_id\": \"W-2\"}]}"
|
|
v := ExtractJSON(raw)
|
|
if v == nil {
|
|
t.Fatal("expected match in bare braces")
|
|
}
|
|
}
|
|
|
|
func TestExtractJSON_ReturnsNilOnNoObject(t *testing.T) {
|
|
if v := ExtractJSON("just prose, no json"); v != nil {
|
|
t.Errorf("expected nil, got %+v", v)
|
|
}
|
|
}
|
|
|
|
func TestExtractJSON_PicksFirstBalancedObject(t *testing.T) {
|
|
v := ExtractJSON(`{"a":1} then {"b":2}`)
|
|
if v == nil {
|
|
t.Fatal("expected match")
|
|
}
|
|
if v["a"].(float64) != 1 {
|
|
t.Errorf("expected first object, got %+v", v)
|
|
}
|
|
}
|
|
|
|
func TestExtractJSON_NestedBalancedObjects(t *testing.T) {
|
|
v := ExtractJSON(`prefix {"outer": {"inner": [1,2,3]}, "x": "y"} suffix`)
|
|
if v == nil {
|
|
t.Fatal("expected match on balanced nested object")
|
|
}
|
|
if outer, ok := v["outer"].(map[string]any); !ok || outer["inner"] == nil {
|
|
t.Errorf("nested structure lost: %+v", v)
|
|
}
|
|
}
|
|
|
|
func TestExtractJSON_TopLevelArrayReturnsFirstInnerObject(t *testing.T) {
|
|
// Both Rust and Go runtimes accept the first balanced {...} as a
|
|
// successful match — for `[{"a":1},{"b":2}]` that's the first
|
|
// inner object. Documenting this so the contract stays consistent
|
|
// across runtimes.
|
|
v := ExtractJSON(`[{"a":1},{"b":2}]`)
|
|
if v == nil {
|
|
t.Fatal("expected first inner object to be returned")
|
|
}
|
|
if v["a"].(float64) != 1 {
|
|
t.Errorf("expected first object {a:1}, got %+v", v)
|
|
}
|
|
}
|
|
|
|
// ─── Iterate orchestrator tests with scripted ChatCaller ────────────
|
|
|
|
func scriptedChat(responses ...string) (ChatCaller, *int) {
|
|
idx := 0
|
|
return func(_ context.Context, _, _ string, _, _ string, _ *float64, _ int, _ string) (string, error) {
|
|
if idx >= len(responses) {
|
|
return "", errors.New("scripted chat exhausted")
|
|
}
|
|
r := responses[idx]
|
|
idx++
|
|
return r, nil
|
|
}, &idx
|
|
}
|
|
|
|
func TestIterate_AcceptsFirstValidArtifact(t *testing.T) {
|
|
chat, calls := scriptedChat(`{"endorsed_names":["W-1"]}`)
|
|
validate := func(_ string, _ map[string]any) (Report, error) {
|
|
return Report{ElapsedMs: 1}, nil
|
|
}
|
|
resp, fail, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "produce X", Provider: "ollama", Model: "qwen3.5:latest"},
|
|
IterateConfig{}, chat, validate)
|
|
if err != nil || fail != nil {
|
|
t.Fatalf("expected success, got err=%v fail=%+v", err, fail)
|
|
}
|
|
if resp.Iterations != 1 {
|
|
t.Errorf("iterations = %d, want 1", resp.Iterations)
|
|
}
|
|
if len(resp.History) != 1 || resp.History[0].Status.Kind != "accepted" {
|
|
t.Errorf("history: %+v", resp.History)
|
|
}
|
|
if *calls != 1 {
|
|
t.Errorf("expected 1 chat call, got %d", *calls)
|
|
}
|
|
}
|
|
|
|
func TestIterate_RetriesOnNoJsonThenSucceeds(t *testing.T) {
|
|
chat, _ := scriptedChat(
|
|
"sorry I cannot do that",
|
|
`{"endorsed_names":["W-1"]}`,
|
|
)
|
|
validate := func(_ string, _ map[string]any) (Report, error) {
|
|
return Report{}, nil
|
|
}
|
|
resp, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "produce X", Provider: "ollama", Model: "x"},
|
|
IterateConfig{}, chat, validate)
|
|
if err != nil || resp == nil {
|
|
t.Fatalf("expected success, err=%v", err)
|
|
}
|
|
if resp.Iterations != 2 {
|
|
t.Errorf("iterations = %d, want 2", resp.Iterations)
|
|
}
|
|
if resp.History[0].Status.Kind != "no_json" {
|
|
t.Errorf("first history status: %+v", resp.History[0].Status)
|
|
}
|
|
}
|
|
|
|
func TestIterate_RetriesOnValidationFailureThenSucceeds(t *testing.T) {
|
|
chat, _ := scriptedChat(
|
|
`{"bad":"shape"}`,
|
|
`{"good":"shape"}`,
|
|
)
|
|
calls := 0
|
|
validate := func(_ string, body map[string]any) (Report, error) {
|
|
calls++
|
|
if _, ok := body["good"]; ok {
|
|
return Report{}, nil
|
|
}
|
|
return Report{}, &ValidationError{Kind: ErrSchema, Field: "x", Reason: "missing good"}
|
|
}
|
|
resp, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "produce X", Provider: "ollama", Model: "x"},
|
|
IterateConfig{}, chat, validate)
|
|
if err != nil || resp == nil {
|
|
t.Fatalf("expected success, err=%v", err)
|
|
}
|
|
if calls != 2 {
|
|
t.Errorf("validate calls = %d, want 2", calls)
|
|
}
|
|
if resp.History[0].Status.Kind != "validation_failed" {
|
|
t.Errorf("first history status: %+v", resp.History[0].Status)
|
|
}
|
|
if resp.History[0].Status.Error == "" {
|
|
t.Errorf("validation_failed entry must carry error string")
|
|
}
|
|
}
|
|
|
|
func TestIterate_MaxIterationsExhaustedReturnsFailure(t *testing.T) {
|
|
chat, _ := scriptedChat(`{}`, `{}`, `{}`)
|
|
validate := func(_ string, _ map[string]any) (Report, error) {
|
|
return Report{}, &ValidationError{Kind: ErrCompleteness, Reason: "always wrong"}
|
|
}
|
|
resp, fail, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "X", Provider: "ollama", Model: "x", MaxIterations: 3},
|
|
IterateConfig{}, chat, validate)
|
|
if err != nil {
|
|
t.Fatalf("infrastructure error unexpected: %v", err)
|
|
}
|
|
if resp != nil {
|
|
t.Fatalf("expected failure, got %+v", resp)
|
|
}
|
|
if fail.Iterations != 3 {
|
|
t.Errorf("iterations = %d, want 3", fail.Iterations)
|
|
}
|
|
if len(fail.History) != 3 {
|
|
t.Errorf("history length = %d, want 3", len(fail.History))
|
|
}
|
|
}
|
|
|
|
// TestIterate_TracerEmitsSpanPerAttempt locks the per-attempt
|
|
// observability contract: when a Tracer is wired AND a TraceID is
|
|
// present, every retry produces exactly one span and the SpanID
|
|
// lands on the IterateAttempt history row.
|
|
func TestIterate_TracerEmitsSpanPerAttempt(t *testing.T) {
|
|
chat, _ := scriptedChat(
|
|
"no json",
|
|
`{"k":"v"}`,
|
|
)
|
|
validate := func(_ string, _ map[string]any) (Report, error) { return Report{}, nil }
|
|
|
|
var spans []AttemptSpan
|
|
tracer := func(s AttemptSpan) string {
|
|
spans = append(spans, s)
|
|
return fmt.Sprintf("span-%d", s.Iteration)
|
|
}
|
|
|
|
resp, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "X", Provider: "ollama", Model: "qwen", TraceID: "trace-abc"},
|
|
IterateConfig{Tracer: tracer}, chat, validate)
|
|
if err != nil {
|
|
t.Fatalf("Iterate: %v", err)
|
|
}
|
|
if resp.TraceID != "trace-abc" {
|
|
t.Errorf("TraceID echo: got %q, want trace-abc", resp.TraceID)
|
|
}
|
|
if len(spans) != 2 {
|
|
t.Fatalf("expected 2 emitted spans, got %d", len(spans))
|
|
}
|
|
if spans[0].Verdict.Kind != "no_json" || spans[1].Verdict.Kind != "accepted" {
|
|
t.Errorf("verdict propagation: %+v", spans)
|
|
}
|
|
if resp.History[0].SpanID != "span-0" || resp.History[1].SpanID != "span-1" {
|
|
t.Errorf("SpanID stamping: %+v", resp.History)
|
|
}
|
|
if spans[0].TraceID != "trace-abc" {
|
|
t.Errorf("TraceID didn't reach the tracer: %q", spans[0].TraceID)
|
|
}
|
|
}
|
|
|
|
// TestIterate_NoTraceIDSkipsTracer guards the "Langfuse not configured"
|
|
// path: tracer is non-nil but trace_id is empty → we MUST NOT emit
|
|
// orphan spans.
|
|
func TestIterate_NoTraceIDSkipsTracer(t *testing.T) {
|
|
chat, _ := scriptedChat(`{"k":"v"}`)
|
|
validate := func(_ string, _ map[string]any) (Report, error) { return Report{}, nil }
|
|
called := 0
|
|
tracer := func(_ AttemptSpan) string {
|
|
called++
|
|
return "should-not-be-used"
|
|
}
|
|
resp, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "X", Provider: "ollama", Model: "qwen"}, // no TraceID
|
|
IterateConfig{Tracer: tracer}, chat, validate)
|
|
if err != nil {
|
|
t.Fatalf("Iterate: %v", err)
|
|
}
|
|
if called != 0 {
|
|
t.Errorf("tracer should not fire without a trace_id; called=%d", called)
|
|
}
|
|
if resp.History[0].SpanID != "" {
|
|
t.Errorf("SpanID should be empty when no trace_id: %q", resp.History[0].SpanID)
|
|
}
|
|
}
|
|
|
|
// TestIterate_ChatCallerReceivesTraceID confirms the trace_id is
|
|
// forwarded into the ChatCaller signature so HTTP transports can set
|
|
// the X-Lakehouse-Trace-Id header. Without this, chatd's Langfuse
|
|
// emit would create an orphan trace per retry instead of nesting
|
|
// under the iterate parent.
|
|
func TestIterate_ChatCallerReceivesTraceID(t *testing.T) {
|
|
var observedTrace string
|
|
chat := func(_ context.Context, _, _ string, _, _ string, _ *float64, _ int, traceID string) (string, error) {
|
|
observedTrace = traceID
|
|
return `{"k":"v"}`, nil
|
|
}
|
|
validate := func(_ string, _ map[string]any) (Report, error) { return Report{}, nil }
|
|
_, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "X", Provider: "ollama", Model: "x", TraceID: "trace-xyz"},
|
|
IterateConfig{}, chat, validate)
|
|
if err != nil {
|
|
t.Fatalf("Iterate: %v", err)
|
|
}
|
|
if observedTrace != "trace-xyz" {
|
|
t.Errorf("ChatCaller should receive trace_id; got %q", observedTrace)
|
|
}
|
|
}
|
|
|
|
func TestIterate_PropagatesChatInfraError(t *testing.T) {
|
|
chat := func(_ context.Context, _, _ string, _, _ string, _ *float64, _ int, _ string) (string, error) {
|
|
return "", errors.New("connection refused")
|
|
}
|
|
validate := func(_ string, _ map[string]any) (Report, error) { return Report{}, nil }
|
|
_, _, err := Iterate(context.Background(),
|
|
IterateRequest{Kind: "playbook", Prompt: "X", Provider: "ollama", Model: "x"},
|
|
IterateConfig{}, chat, validate)
|
|
if err == nil {
|
|
t.Fatal("expected infrastructure error to surface")
|
|
}
|
|
}
|