root a730fc2016 scrum fixes: 4 real findings landed, 4 false positives dismissed
Cross-lineage scrum review on the 12 commits of this session
(afbb506..06e7152) via Rust gateway :3100 with Opus + Kimi +
Qwen3-coder. Results:

  Real findings landed:
    1. Opus BLOCK — duplicates within a single vectord BatchAdd call
       trip coder/hnsw's "node not added" length-invariant panic.
       Fixed with last-write-wins dedup inside BatchAdd, before the
       pre-pass (sketch after this list). Regression test
       TestBatchAdd_IntraBatchDedup added.
    2. Opus + Kimi convergent WARN — strings.Contains(err.Error(),
       "status 404") was brittle string-matching to detect cold-
       start playbook state. Fixed: ErrCorpusNotFound sentinel
       returned by searchCorpus on HTTP 404; fetchPlaybookHits
       uses errors.Is.
    3. Opus WARN — corpusingest.Run returned nil on total batch
       failure, masking broken pipelines as "empty corpora." Fixed:
       Stats.FailedBatches counter, ErrPartialFailure sentinel
       returned when nonzero. New regression test
       TestRun_NonzeroFailedBatchesReturnsError.
    4. Opus WARN — dead var _ = io.EOF in staffing_500k/main.go
       was justified by a fictional comment. Removed.
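
  Sketch of the finding-1 fix shape (hedged: Item and the function
  name are illustrative stand-ins, not vectord's real types — only
  the last-write-wins keying is the point):

    // Item stands in for a vectord batch entry.
    type Item struct {
        ID     string
        Vector []float32
    }

    // dedupLastWriteWins keeps the final occurrence of each ID,
    // preserving first-seen order, so every item handed to the
    // hnsw pre-pass is actually added and the "node not added"
    // length invariant can't trip on intra-batch duplicates.
    func dedupLastWriteWins(items []Item) []Item {
        last := make(map[string]int, len(items))
        order := make([]string, 0, len(items))
        for i, it := range items {
            if _, seen := last[it.ID]; !seen {
                order = append(order, it.ID)
            }
            last[it.ID] = i // a later duplicate wins
        }
        out := make([]Item, 0, len(order))
        for _, id := range order {
            out = append(out, items[last[id]])
        }
        return out
    }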

  Drivers (staffing_500k, staffing_candidates, staffing_workers)
  updated to handle ErrPartialFailure gracefully — print warn, keep
  running queries — rather than fatal'ing on transient hiccups
  while still surfacing the failure clearly in the output.
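
  In sketch form (corpusingest.Run, ErrPartialFailure, and
  Stats.FailedBatches are the names landed above; the log lines are
  illustrative, not the drivers' literal output):

    stats, err := corpusingest.Run(ctx, cfg, src)
    switch {
    case errors.Is(err, corpusingest.ErrPartialFailure):
        // Warn loudly but keep going: queries still run against
        // whatever portion of the corpus did ingest.
        log.Printf("WARN: partial ingest: failed_batches=%d added=%d",
            stats.FailedBatches, stats.Added)
    case err != nil:
        log.Fatalf("ingest: %v", err) // genuinely unrecoverable
    }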

  Documented (no code change):
    - Opus WARN: matrixd /matrix/downgrade reads
      LH_FORCE_FULL_ENRICHMENT from process env when body omits
      it. Comment now explains the opinionated default and points
      callers wanting deterministic behavior to pass the field
      explicitly.
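
  In caller terms the documented default reduces to roughly this
  sketch (the ForceFullEnrichment *bool field and the parsing are
  assumptions for illustration, not matrixd's actual handler code):

    // Body field wins when present; otherwise fall back to the
    // process env. Pass the field explicitly for determinism.
    force := false
    if req.ForceFullEnrichment != nil { // hypothetical field name
        force = *req.ForceFullEnrichment
    } else if v, ok := os.LookupEnv("LH_FORCE_FULL_ENRICHMENT"); ok {
        force = v == "1" || strings.EqualFold(v, "true")
    }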

  False positives dismissed (caught and verified, NOT acted on):
    A. Kimi BLOCK on errors.Is + wrapped error in cmd/matrixd:223.
       Verified false: Search wraps with %w (fmt.Errorf("%w: %v",
       ErrEmbed, err)), so errors.Is matches the chain correctly
       (demonstrated in the sketch after this list).
    B. Kimi INFO "BatchAdd has no unit tests." Verified false:
       batch_bench_test.go has BenchmarkBatchAdd; the new dedup
       test TestBatchAdd_IntraBatchDedup adds another.
    C. Opus BLOCK on missing finite/zero-norm pre-validation in
       cmd/vectord:280-291. Verified false: line 272 already calls
       vectord.ValidateVector before BatchAdd, so finite + zero-
       norm IS checked. Pre-validation is exhaustive.
    D. Opus WARN on relevance.go tokenRe (Opus self-corrected
       mid-finding upon realizing the leading character counts
       toward token length).
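
  Dismissal A in miniature — the same mechanics finding 2 now relies
  on. %w keeps the sentinel matchable in the error chain while %v
  flattens the cause into the message (self-contained, runnable):

    package main

    import (
        "errors"
        "fmt"
    )

    var ErrEmbed = errors.New("embed failed")

    func search() error {
        // Mirrors the cmd/matrixd wrap shape.
        return fmt.Errorf("%w: %v", ErrEmbed, errors.New("status 502"))
    }

    func main() {
        fmt.Println(errors.Is(search(), ErrEmbed)) // true
    }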

  Qwen3-coder returned NO FINDINGS — known issue with very long
  diffs through the OpenRouter free tier; lineage rotation worked
  as designed (Opus + Kimi between them caught everything Qwen
  would have).

15-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance, downgrade, playbook).
Unit tests all green (corpusingest +1, vectord +1).

Per feedback_cross_lineage_review.md: convergent finding #2 (404
detection) is the highest-signal one — both Opus and Kimi
flagged it independently. The other Opus findings stand on
single-reviewer signal, but each was verified against the actual
code.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:42:39 -05:00


package corpusingest

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"strings"
	"sync"
	"testing"
	"time"
)

// fakeGateway records the embed + add calls corpusingest fires and
// returns canned responses. The whole point of the unit test is to
// validate the pipeline shape (request payloads, batching, stats)
// without needing live embedd/vectord.
type fakeGateway struct {
	mu             sync.Mutex
	embedCalls     int
	embedTexts     [][]string // texts per call
	addCalls       int
	addItems       [][]addItem // items per call
	createCalled   bool
	deleteCalled   bool
	indexConflict  bool // simulate "index already exists" → 409
	embedDimension int
}

type addItem struct {
	ID       string          `json:"id"`
	Vector   []float32       `json:"vector"`
	Metadata json.RawMessage `json:"metadata,omitempty"`
}

func newFakeGateway(dim int) *fakeGateway {
	return &fakeGateway{embedDimension: dim}
}

func (f *fakeGateway) handler() http.Handler {
	mux := http.NewServeMux()
	mux.HandleFunc("/v1/vectors/index", func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPost {
			http.Error(w, "wrong method", http.StatusMethodNotAllowed)
			return
		}
		f.mu.Lock()
		f.createCalled = true
		conflict := f.indexConflict
		f.mu.Unlock()
		if conflict {
			http.Error(w, "exists", http.StatusConflict)
			return
		}
		w.WriteHeader(http.StatusCreated)
	})
	mux.HandleFunc("/v1/embed", func(w http.ResponseWriter, r *http.Request) {
		var req struct {
			Texts []string `json:"texts"`
		}
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		// Synthesize deterministic vectors: element j of vector i
		// is float32(i+j+1).
		vecs := make([][]float32, len(req.Texts))
		for i := range vecs {
			v := make([]float32, f.embedDimension)
			for j := range v {
				v[j] = float32(i + j + 1)
			}
			vecs[i] = v
		}
		f.mu.Lock()
		f.embedCalls++
		// Copy so the recorded texts don't alias the request-scoped slice.
		texts := append([]string(nil), req.Texts...)
		f.embedTexts = append(f.embedTexts, texts)
		f.mu.Unlock()
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(map[string]any{
			"vectors":   vecs,
			"dimension": f.embedDimension,
			"model":     "fake-embed",
		})
	})
	mux.HandleFunc("/v1/vectors/index/", func(w http.ResponseWriter, r *http.Request) {
		// /v1/vectors/index/{name}/add
		if !strings.HasSuffix(r.URL.Path, "/add") {
			if r.Method == http.MethodDelete {
				f.mu.Lock()
				f.deleteCalled = true
				f.mu.Unlock()
				w.WriteHeader(http.StatusNoContent)
				return
			}
			http.Error(w, "unhandled "+r.URL.Path, http.StatusNotFound)
			return
		}
		var req struct {
			Items []addItem `json:"items"`
		}
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		f.mu.Lock()
		f.addCalls++
		f.addItems = append(f.addItems, append([]addItem(nil), req.Items...))
		f.mu.Unlock()
		_, _ = io.WriteString(w, `{"added":`+fmt.Sprint(len(req.Items))+`}`)
	})
	return mux
}

// staticSource yields a fixed slice of rows.
type staticSource struct {
	rows []Row
	i    int
}

func (s *staticSource) Next() (Row, error) {
	if s.i >= len(s.rows) {
		return Row{}, io.EOF
	}
	r := s.rows[s.i]
	s.i++
	return r, nil
}

func TestRun_PipelineShapeAndStats(t *testing.T) {
	const dim = 4
	fg := newFakeGateway(dim)
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	rows := make([]Row, 50)
	for i := range rows {
		rows[i] = Row{
			ID:       fmt.Sprintf("r-%03d", i),
			Text:     fmt.Sprintf("row %d text", i),
			Metadata: map[string]any{"i": i, "kind": "test"},
		}
	}
	stats, err := Run(context.Background(), Config{
		GatewayURL:   srv.URL,
		IndexName:    "test_corpus",
		Dimension:    dim,
		Distance:     "cosine",
		EmbedBatch:   16,
		EmbedWorkers: 4,
		HTTPClient:   srv.Client(),
		LogProgress:  0,
	}, &staticSource{rows: rows})
	if err != nil {
		t.Fatalf("Run: %v", err)
	}
	if stats.Scanned != 50 {
		t.Errorf("Scanned: want 50, got %d", stats.Scanned)
	}
	if stats.Embedded != 50 {
		t.Errorf("Embedded: want 50, got %d", stats.Embedded)
	}
	if stats.Added != 50 {
		t.Errorf("Added: want 50, got %d", stats.Added)
	}
	if !fg.createCalled {
		t.Error("expected create-index to be called")
	}
	// 50 rows / 16 batch → ceil(50/16) = 4 batches → 4 embed calls + 4 add calls.
	if fg.embedCalls != 4 {
		t.Errorf("embedCalls: want 4 (50 rows / 16 batch), got %d", fg.embedCalls)
	}
	if fg.addCalls != 4 {
		t.Errorf("addCalls: want 4, got %d", fg.addCalls)
	}
	// Sum of texts across embed calls must be 50, and IDs across add
	// calls must be every r-NNN exactly once.
	totalTexts := 0
	for _, ts := range fg.embedTexts {
		totalTexts += len(ts)
	}
	if totalTexts != 50 {
		t.Errorf("total embedded texts: want 50, got %d", totalTexts)
	}
	seen := make(map[string]bool)
	for _, items := range fg.addItems {
		for _, it := range items {
			if seen[it.ID] {
				t.Errorf("duplicate id in add stream: %s", it.ID)
			}
			seen[it.ID] = true
			if len(it.Vector) != dim {
				t.Errorf("vector dim: want %d, got %d", dim, len(it.Vector))
			}
		}
	}
	if len(seen) != 50 {
		t.Errorf("unique ids added: want 50, got %d", len(seen))
	}
}

func TestRun_DropExistingFiresDelete(t *testing.T) {
	fg := newFakeGateway(4)
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	_, err := Run(context.Background(), Config{
		GatewayURL:   srv.URL,
		IndexName:    "drops_first",
		Dimension:    4,
		DropExisting: true,
		HTTPClient:   srv.Client(),
	}, &staticSource{rows: []Row{{ID: "x", Text: "y", Metadata: nil}}})
	if err != nil {
		t.Fatalf("Run: %v", err)
	}
	if !fg.deleteCalled {
		t.Error("expected delete-index to fire when DropExisting=true")
	}
}

func TestRun_IndexAlreadyExistsIsReused(t *testing.T) {
	fg := newFakeGateway(4)
	fg.indexConflict = true // first POST /v1/vectors/index → 409
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	stats, err := Run(context.Background(), Config{
		GatewayURL:   srv.URL,
		IndexName:    "exists_already",
		Dimension:    4,
		HTTPClient:   srv.Client(),
		EmbedWorkers: 1,
	}, &staticSource{rows: []Row{{ID: "x", Text: "y", Metadata: nil}}})
	if err != nil {
		t.Fatalf("Run with existing index should succeed: %v", err)
	}
	if stats.Added != 1 {
		t.Errorf("Added: want 1, got %d", stats.Added)
	}
}

func TestRun_LimitStopsEarly(t *testing.T) {
	fg := newFakeGateway(4)
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	rows := make([]Row, 100)
	for i := range rows {
		rows[i] = Row{ID: fmt.Sprintf("r-%d", i), Text: "t", Metadata: nil}
	}
	stats, err := Run(context.Background(), Config{
		GatewayURL:   srv.URL,
		IndexName:    "limited",
		Dimension:    4,
		Limit:        25,
		EmbedBatch:   8,
		EmbedWorkers: 2,
		HTTPClient:   srv.Client(),
	}, &staticSource{rows: rows})
	if err != nil {
		t.Fatalf("Run: %v", err)
	}
	if stats.Scanned != 25 {
		t.Errorf("Scanned: want 25 (limit), got %d", stats.Scanned)
	}
}

func TestRun_EmptyTextSkipped(t *testing.T) {
	fg := newFakeGateway(4)
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	rows := []Row{
		{ID: "a", Text: "real text", Metadata: nil},
		{ID: "b", Text: "", Metadata: nil}, // skipped
		{ID: "c", Text: "more text", Metadata: nil},
	}
	stats, err := Run(context.Background(), Config{
		GatewayURL: srv.URL, IndexName: "skip", Dimension: 4,
		HTTPClient: srv.Client(),
	}, &staticSource{rows: rows})
	if err != nil {
		t.Fatalf("Run: %v", err)
	}
	if stats.Scanned != 3 {
		t.Errorf("Scanned: want 3 (b is skipped but counted as scanned), got %d", stats.Scanned)
	}
	if stats.Added != 2 {
		t.Errorf("Added: want 2 (b excluded from embed), got %d", stats.Added)
	}
}

// TestRun_ProgressLoggerExits guards the bug caught 2026-04-29 in
// the candidates e2e: when LogProgress > 0, the progress goroutine's
// only exit was ctx.Done(). With context.Background() in the
// production driver, Run hung forever after the pipeline finished.
// This test bounds Run's wall-clock time at 2s — if the hang
// regresses, the timeout branch below fails the test.
func TestRun_ProgressLoggerExits(t *testing.T) {
	fg := newFakeGateway(4)
	srv := httptest.NewServer(fg.handler())
	defer srv.Close()
	rows := []Row{
		{ID: "a", Text: "x", Metadata: nil},
		{ID: "b", Text: "y", Metadata: nil},
	}
	done := make(chan error, 1)
	go func() {
		_, err := Run(context.Background(), Config{
			GatewayURL:  srv.URL,
			IndexName:   "progress_test",
			Dimension:   4,
			HTTPClient:  srv.Client(),
			LogProgress: 50 * time.Millisecond,
		}, &staticSource{rows: rows})
		done <- err
	}()
	select {
	case err := <-done:
		if err != nil {
			t.Fatalf("Run: %v", err)
		}
	case <-time.After(2 * time.Second):
		t.Fatal("Run did not return within 2s — progress goroutine likely hanging")
	}
}

// TestRun_NonzeroFailedBatchesReturnsError guards the 2026-04-29
// scrum WARN: original behavior returned nil even when 100% of
// batches failed, making "embedded=0/scanned=N" look like an empty
// corpus rather than a broken pipeline.
func TestRun_NonzeroFailedBatchesReturnsError(t *testing.T) {
	// Fake gateway that fails every embed call.
	mux := http.NewServeMux()
	mux.HandleFunc("/v1/vectors/index", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusCreated)
	})
	mux.HandleFunc("/v1/embed", func(w http.ResponseWriter, r *http.Request) {
		http.Error(w, "embed failure injected", http.StatusBadGateway)
	})
	mux.HandleFunc("/v1/vectors/index/", func(w http.ResponseWriter, r *http.Request) {
		// Shouldn't be reached: embed fails first.
		http.Error(w, "should not be called", http.StatusInternalServerError)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()
	rows := make([]Row, 5)
	for i := range rows {
		rows[i] = Row{ID: fmt.Sprintf("r-%d", i), Text: "x"}
	}
	stats, err := Run(context.Background(), Config{
		GatewayURL: srv.URL, IndexName: "fail_only", Dimension: 4,
		EmbedBatch: 1, EmbedWorkers: 1, HTTPClient: srv.Client(),
	}, &staticSource{rows: rows})
	if !errors.Is(err, ErrPartialFailure) {
		t.Errorf("want ErrPartialFailure, got %v", err)
	}
	if stats.FailedBatches == 0 {
		t.Error("FailedBatches should be > 0 when embeds fail")
	}
	if stats.Added != 0 {
		t.Errorf("Added: want 0 (all failed), got %d", stats.Added)
	}
}

func TestRun_RequiresIndexName(t *testing.T) {
	_, err := Run(context.Background(), Config{Dimension: 4},
		&staticSource{rows: nil})
	if err == nil || !strings.Contains(err.Error(), "IndexName") {
		t.Errorf("want IndexName-required error, got %v", err)
	}
}

func TestRun_RequiresDimension(t *testing.T) {
	_, err := Run(context.Background(), Config{IndexName: "x"},
		&staticSource{rows: nil})
	if err == nil || !strings.Contains(err.Error(), "Dimension") {
		t.Errorf("want Dimension-required error, got %v", err)
	}
}

// TestRun_ContextCancel verifies the pipeline drains cleanly when
// ctx is cancelled mid-run. Source returns rows fast enough that
// without ctx the run would complete; cancelling early should stop
// well before all 1000 rows are processed.
func TestRun_ContextCancel(t *testing.T) {
	// Slow embed handler: each call sleeps 50ms.
	mux := http.NewServeMux()
	mux.HandleFunc("/v1/vectors/index", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusCreated)
	})
	mux.HandleFunc("/v1/embed", func(w http.ResponseWriter, r *http.Request) {
		var req struct {
			Texts []string `json:"texts"`
		}
		_ = json.NewDecoder(r.Body).Decode(&req)
		// Simulate slow-but-valid backend so we test ctx cancel, not
		// degraded-payload handling (that's covered in production by
		// the len-mismatch guard in Run's worker).
		time.Sleep(50 * time.Millisecond)
		vecs := make([][]float32, len(req.Texts))
		for i := range vecs {
			vecs[i] = []float32{1, 2, 3, 4}
		}
		_ = json.NewEncoder(w).Encode(map[string]any{
			"vectors":   vecs,
			"dimension": 4,
			"model":     "x",
		})
	})
	mux.HandleFunc("/v1/vectors/index/", func(w http.ResponseWriter, r *http.Request) {
		_, _ = io.WriteString(w, `{}`)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()
	rows := make([]Row, 1000)
	for i := range rows {
		rows[i] = Row{ID: fmt.Sprintf("r-%d", i), Text: "t"}
	}
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()
	stats, err := Run(ctx, Config{
		GatewayURL: srv.URL, IndexName: "cancel_me", Dimension: 4,
		EmbedBatch: 1, EmbedWorkers: 1, HTTPClient: srv.Client(),
	}, &staticSource{rows: rows})
	// Either an error or partial stats; the point is "didn't process all 1000."
	if stats.Scanned >= 1000 {
		t.Errorf("ctx cancel did not stop early: scanned=%d err=%v", stats.Scanned, err)
	}
}