root a730fc2016 scrum fixes: 4 real findings landed, 4 false positives dismissed
Cross-lineage scrum review on the 12 commits of this session
(afbb506..06e7152) via Rust gateway :3100 with Opus + Kimi +
Qwen3-coder. Results:

  Real findings landed:
    1. Opus BLOCK — vectord BatchAdd intra-batch duplicates panic
       coder/hnsw's "node not added" length-invariant. Fixed with
       last-write-wins dedup inside BatchAdd before the pre-pass.
       Regression test TestBatchAdd_IntraBatchDedup added.
    2. Opus + Kimi convergent WARN — strings.Contains(err.Error(),
       "status 404") was brittle string-matching to detect cold-
       start playbook state. Fixed: ErrCorpusNotFound sentinel
       returned by searchCorpus on HTTP 404; fetchPlaybookHits
       uses errors.Is.
    3. Opus WARN — corpusingest.Run returned nil on total batch
       failure, masking broken pipelines as "empty corpora." Fixed:
       Stats.FailedBatches counter, ErrPartialFailure sentinel
       returned when nonzero. New regression test
       TestRun_NonzeroFailedBatchesReturnsError.
    4. Opus WARN — dead var _ = io.EOF in staffing_500k/main.go
       was justified by a fictional comment. Removed.

  Drivers (staffing_500k, staffing_candidates, staffing_workers)
  updated to handle ErrPartialFailure gracefully — print warn, keep
  running queries — rather than fatal'ing on transient hiccups
  while still surfacing the failure clearly in the output.

  Documented (no code change):
    - Opus WARN: matrixd /matrix/downgrade reads
      LH_FORCE_FULL_ENRICHMENT from process env when body omits
      it. Comment now explains the opinionated default and points
      callers wanting deterministic behavior to pass the field
      explicitly.

  False positives dismissed (caught and verified, NOT acted on):
    A. Kimi BLOCK on errors.Is + wrapped error in cmd/matrixd:223.
       Verified false: Search wraps with %w (fmt.Errorf("%w: %v",
       ErrEmbed, err)), so errors.Is matches the chain correctly.
    B. Kimi INFO "BatchAdd has no unit tests." Verified false:
       batch_bench_test.go has BenchmarkBatchAdd; the new dedup
       test TestBatchAdd_IntraBatchDedup adds another.
    C. Opus BLOCK on missing finite/zero-norm pre-validation in
       cmd/vectord:280-291. Verified false: line 272 already calls
       vectord.ValidateVector before BatchAdd, so finite + zero-
       norm IS checked. Pre-validation is exhaustive.
    D. Opus WARN on relevance.go tokenRe (Opus self-corrected
       mid-finding when realizing leading char counts toward token
       length).

  Qwen3-coder returned NO FINDINGS — known issue with very long
  diffs through the OpenRouter free tier; lineage rotation worked
  as designed (Opus + Kimi between them caught everything Qwen
  would have).

15-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance, downgrade, playbook).
Unit tests all green (corpusingest +1, vectord +1).

Per feedback_cross_lineage_review.md: convergent finding #2 (404
detection) is the highest-signal one — both Opus and Kimi
flagged it independently. The other Opus findings stand on
single-reviewer signal but each one verified against the actual
code.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:42:39 -05:00

455 lines
14 KiB
Go

// Package vectord owns the vector-search surface — HNSW indexes
// keyed by string IDs with optional opaque JSON metadata. The
// underlying library is github.com/coder/hnsw (pure Go, no cgo).
//
// G1 scope: in-memory only. Persistence to storaged + rehydrate
// across restart is the next piece — keeping it out of this layer
// makes the index API easier to test and keeps the storaged
// dependency optional for downstream tooling.
package vectord
import (
"encoding/json"
"errors"
"fmt"
"io"
"math"
"sync"
"github.com/coder/hnsw"
)
// Distance names accepted by IndexParams.Distance.
const (
DistanceCosine = "cosine"
DistanceEuclidean = "euclidean"
)
// Default HNSW parameters — match coder/hnsw's NewGraph defaults
// which are tuned for OpenAI-shaped embeddings (1536-d, but the
// hyperparameters generalize).
const (
DefaultM = 16
DefaultEfSearch = 20
)
// IndexParams describes one vector index. Once an Index is built,
// these are fixed — changing M / dimension / distance requires a
// rebuild.
type IndexParams struct {
Name string `json:"name"`
Dimension int `json:"dimension"`
M int `json:"m"`
EfSearch int `json:"ef_search"`
Distance string `json:"distance"`
}
// Result is one search hit. Distance semantics depend on the
// configured distance function — for cosine it's `1 - cos(a,b)`
// where smaller = closer; for euclidean it's the L2 norm of
// `a - b`. Either way, smaller = closer and the result list is
// sorted ascending.
type Result struct {
ID string `json:"id"`
Distance float32 `json:"distance"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// Index wraps a coder/hnsw graph plus a side map of opaque JSON
// metadata per ID. Concurrency: read-heavy via Search (read-lock);
// Add and Delete take the write lock.
type Index struct {
params IndexParams
g *hnsw.Graph[string]
meta map[string]json.RawMessage
mu sync.RWMutex
}
// Errors surfaced to HTTP handlers. Sentinel-based so the wire
// layer can map to status codes via errors.Is.
var (
ErrDimensionMismatch = errors.New("vectord: vector dimension mismatch")
ErrUnknownDistance = errors.New("vectord: unknown distance function")
ErrInvalidParams = errors.New("vectord: invalid index params")
)
// NewIndex builds a fresh index from validated params.
func NewIndex(p IndexParams) (*Index, error) {
if p.Name == "" {
return nil, fmt.Errorf("%w: empty name", ErrInvalidParams)
}
if p.Dimension <= 0 {
return nil, fmt.Errorf("%w: dimension must be > 0 (got %d)", ErrInvalidParams, p.Dimension)
}
if p.M <= 0 {
p.M = DefaultM
}
if p.EfSearch <= 0 {
p.EfSearch = DefaultEfSearch
}
if p.Distance == "" {
p.Distance = DistanceCosine
}
dist, err := distanceFn(p.Distance)
if err != nil {
return nil, err
}
g := hnsw.NewGraph[string]()
g.M = p.M
g.EfSearch = p.EfSearch
g.Distance = dist
// Ml stays at the library default (0.25); exposing it as a knob
// is a G2 concern when we have real tuning data.
return &Index{
params: p,
g: g,
meta: make(map[string]json.RawMessage),
}, nil
}
// distanceFn maps the string name to the underlying function.
// Easier to unit-test than calling out to coder/hnsw's registry.
func distanceFn(name string) (hnsw.DistanceFunc, error) {
switch name {
case DistanceCosine, "":
return hnsw.CosineDistance, nil
case DistanceEuclidean:
return hnsw.EuclideanDistance, nil
}
return nil, fmt.Errorf("%w: %q (want cosine or euclidean)", ErrUnknownDistance, name)
}
// Params returns a copy of the immutable index params.
func (i *Index) Params() IndexParams { return i.params }
// Len returns the number of vectors currently in the index.
func (i *Index) Len() int {
i.mu.RLock()
defer i.mu.RUnlock()
return i.g.Len()
}
// Add inserts a vector with optional metadata, with replace
// semantics for the vector: if id already exists, the prior
// vector is removed first. Dim must match the index dim or
// ErrDimensionMismatch is returned.
//
// Metadata semantics (post-scrum K-B1): nil meta is "leave
// existing alone" (upsert-style); to clear metadata, pass an
// empty `{}` or Delete+Add. This avoids silent metadata loss
// when the JSON `metadata` field is omitted on re-add.
//
// Validates that all vector components are finite (post-scrum
// O-W3). NaN/Inf in any component poisons HNSW: distance
// comparisons return false for both `<` and `>`, breaking the
// search heap invariants. Zero-norm vectors are also rejected
// under cosine distance — cos(0,x) = NaN.
//
// Note: coder/hnsw's Graph.Add panics on re-adding an existing
// key (internal "node not added" length-invariant check). We
// pre-Delete to make Add idempotent on re-insert.
func (i *Index) Add(id string, vec []float32, meta json.RawMessage) error {
if id == "" {
return errors.New("vectord: empty id")
}
if len(vec) != i.params.Dimension {
return fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, i.params.Dimension, len(vec))
}
if err := validateVector(vec, i.params.Distance); err != nil {
return err
}
i.mu.Lock()
defer i.mu.Unlock()
// coder/hnsw has two sharp edges on re-add:
// 1. Add of an existing key panics with "node not added"
// (length-invariant fires because internal delete+re-add
// doesn't change Len). Pre-Delete fixes this for n>1.
// 2. Delete of the LAST node leaves layers[0] non-empty but
// entryless; the next Add SIGSEGVs in Dims() because
// entry().Value is nil. We rebuild the graph in that case.
_, exists := i.g.Lookup(id)
if exists {
if i.g.Len() == 1 {
i.resetGraphLocked()
} else {
i.g.Delete(id)
}
}
i.g.Add(hnsw.MakeNode(id, vec))
if meta != nil {
// Per scrum K-B1 (Kimi): only OVERWRITE on explicit non-nil.
// nil = "leave existing meta alone" (upsert). To clear, the
// caller should send an empty `{}` body or Delete the id.
i.meta[id] = meta
}
return nil
}
// resetGraphLocked recreates the underlying coder/hnsw Graph with
// the same params. Caller MUST hold i.mu (write-lock). Used to
// dodge the library's "delete the last node, then segfault on
// next Add" bug — see Add for details. Metadata map is preserved
// because the only entry it could affect is the one being
// re-added, which Add overwrites.
func (i *Index) resetGraphLocked() {
g := hnsw.NewGraph[string]()
g.M = i.params.M
g.EfSearch = i.params.EfSearch
g.Distance = i.g.Distance
i.g = g
}
// ValidateVector is the exported form of validateVector — the HTTP
// handler pre-validates batches before committing, so it needs the
// same predicate Add uses internally. Per scrum O-I3 (G1P).
func ValidateVector(vec []float32, distance string) error {
return validateVector(vec, distance)
}
// validateVector rejects vectors that would poison the HNSW
// graph or produce NaN distances. Per scrum O-W3 (Opus, G1).
func validateVector(vec []float32, distance string) error {
var sumSq float64
for j, v := range vec {
f := float64(v)
if math.IsNaN(f) || math.IsInf(f, 0) {
return fmt.Errorf("vectord: vec[%d] is non-finite (got %v)", j, v)
}
sumSq += f * f
}
if distance == DistanceCosine && sumSq == 0 {
return errors.New("vectord: zero-norm vector under cosine distance")
}
return nil
}
// BatchItem is one entry in a BatchAdd call. Same per-field
// contract as Add: ID + Vector required, Metadata follows
// upsert-style semantics (nil = leave existing alone).
type BatchItem struct {
ID string
Vector []float32
Metadata json.RawMessage
}
// BatchAdd inserts a slice of items under a single write-lock, with
// one variadic call into coder/hnsw's Graph.Add. Net win vs. a loop
// of single Add calls: N→1 lock acquisitions per HTTP batch and one
// variadic library call instead of N.
//
// Contract: items MUST be pre-validated by the caller (id non-empty,
// vector dimension matches, vector finite + non-zero-norm under
// cosine). Pre-validation lives in the HTTP handler so per-item
// error messages stay precise; reproducing it here would force
// position-encoded errors on every consumer.
//
// Intra-batch duplicate IDs: dedup'd internally with last-write-wins
// semantics (matches map-style behavior — second occurrence of an
// ID replaces the first). Without dedup, coder/hnsw's "node not
// added" length-invariant panics on the second occurrence. Caught
// by 2026-04-29 cross-lineage scrum (Opus BLOCK).
func (i *Index) BatchAdd(items []BatchItem) error {
if len(items) == 0 {
return nil
}
// Intra-batch dedup, last-write-wins. Walk forward, record the
// LAST index for each ID, then keep only items whose index is
// the recorded last. Preserves order of last occurrences in the
// original positions.
if hasDup := containsDuplicateID(items); hasDup {
items = dedupBatchLastWins(items)
}
i.mu.Lock()
defer i.mu.Unlock()
// Pre-pass: drop any existing IDs so coder/hnsw's variadic Add
// never sees a re-add. Same library-quirk handling as single
// Add — Len()==1 needs a full graph reset because Delete of the
// last node leaves layers[0] entryless.
for _, it := range items {
if _, exists := i.g.Lookup(it.ID); exists {
if i.g.Len() == 1 {
i.resetGraphLocked()
} else {
i.g.Delete(it.ID)
}
}
}
nodes := make([]hnsw.Node[string], len(items))
for j, it := range items {
nodes[j] = hnsw.MakeNode(it.ID, it.Vector)
}
i.g.Add(nodes...)
for _, it := range items {
if it.Metadata != nil {
i.meta[it.ID] = it.Metadata
}
}
return nil
}
// containsDuplicateID is a fast pre-check — if no dups, skip the
// dedup allocation. Most batches won't have dups so this is a hot
// path.
func containsDuplicateID(items []BatchItem) bool {
seen := make(map[string]struct{}, len(items))
for _, it := range items {
if _, ok := seen[it.ID]; ok {
return true
}
seen[it.ID] = struct{}{}
}
return false
}
// dedupBatchLastWins keeps only the last occurrence of each ID,
// preserving the relative order of those last occurrences. This
// matches map-style "set X to A then to B" semantics: B wins.
func dedupBatchLastWins(items []BatchItem) []BatchItem {
lastIdx := make(map[string]int, len(items))
for j, it := range items {
lastIdx[it.ID] = j
}
out := make([]BatchItem, 0, len(lastIdx))
for j, it := range items {
if lastIdx[it.ID] == j {
out = append(out, it)
}
}
return out
}
// Delete removes id from the index. Returns true if present.
func (i *Index) Delete(id string) bool {
i.mu.Lock()
defer i.mu.Unlock()
delete(i.meta, id)
return i.g.Delete(id)
}
// Search returns the k nearest neighbors of query, sorted
// ascending by distance.
//
// Note: coder/hnsw's Search returns `[]Node[K]` without distances —
// they're computed internally in the search candidate heap but
// dropped from the public API. We recompute distance from the
// returned vectors. O(k·dim) per search, negligible at typical
// k=10 / dim<2048.
func (i *Index) Search(query []float32, k int) ([]Result, error) {
if len(query) != i.params.Dimension {
return nil, fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, i.params.Dimension, len(query))
}
if k <= 0 {
return nil, errors.New("vectord: k must be > 0")
}
i.mu.RLock()
defer i.mu.RUnlock()
// Per scrum O-I2 (Opus): use the stored distance function
// directly rather than re-resolving the string name on every
// search. The graph's Distance is set once at NewIndex.
dist := i.g.Distance
hits := i.g.Search(query, k)
out := make([]Result, len(hits))
for j, n := range hits {
out[j] = Result{
ID: n.Key,
Distance: dist(query, n.Value),
Metadata: i.meta[n.Key],
}
}
return out, nil
}
// IndexEnvelope is the JSON shape persisted alongside the binary
// HNSW graph bytes. params + metadata + format version travel
// together; the graph itself is opaque binary that round-trips
// through hnsw.Graph.Export / Import.
type IndexEnvelope struct {
Version int `json:"version"`
Params IndexParams `json:"params"`
Metadata map[string]json.RawMessage `json:"metadata"`
}
// envelopeVersion bumps when the on-disk JSON shape changes
// incompatibly. Reading a future version returns ErrVersionMismatch
// rather than producing a half-decoded index.
const envelopeVersion = 1
// ErrVersionMismatch is returned by DecodeIndex when the envelope
// claims a version this build doesn't understand.
var ErrVersionMismatch = errors.New("vectord: unknown envelope version")
// Encode writes the index's JSON envelope (params + metadata) and
// the binary HNSW graph bytes through two writers. Two-stream
// shape lets the persistor write each to a distinct storaged key
// without reframing.
//
// envelopeW receives params+metadata as JSON; graphW receives the
// raw output of hnsw.Graph.Export.
func (i *Index) Encode(envelopeW, graphW io.Writer) error {
i.mu.RLock()
defer i.mu.RUnlock()
env := IndexEnvelope{
Version: envelopeVersion,
Params: i.params,
Metadata: i.meta,
}
if err := json.NewEncoder(envelopeW).Encode(env); err != nil {
return fmt.Errorf("encode envelope: %w", err)
}
if err := i.g.Export(graphW); err != nil {
return fmt.Errorf("export graph: %w", err)
}
return nil
}
// DecodeIndex reconstructs an Index from a previously-Encoded pair
// of streams. The returned Index is independent — closing either
// reader after this call doesn't affect the result.
func DecodeIndex(envelopeR, graphR io.Reader) (*Index, error) {
var env IndexEnvelope
if err := json.NewDecoder(envelopeR).Decode(&env); err != nil {
return nil, fmt.Errorf("decode envelope: %w", err)
}
if env.Version != envelopeVersion {
return nil, fmt.Errorf("%w: have %d, got %d",
ErrVersionMismatch, envelopeVersion, env.Version)
}
idx, err := NewIndex(env.Params)
if err != nil {
return nil, err
}
if err := idx.g.Import(graphR); err != nil {
return nil, fmt.Errorf("import graph: %w", err)
}
if env.Metadata != nil {
idx.meta = env.Metadata
}
return idx, nil
}
// Lookup returns the stored vector + metadata for an id.
//
// Per scrum O-W1 (Opus): the vector is COPIED before return.
// coder/hnsw's Lookup hands back the underlying graph slice;
// caller mutation would corrupt the index without locking.
func (i *Index) Lookup(id string) (vec []float32, meta json.RawMessage, ok bool) {
i.mu.RLock()
defer i.mu.RUnlock()
v, found := i.g.Lookup(id)
if !found {
return nil, nil, false
}
out := make([]float32, len(v))
copy(out, v)
return out, i.meta[id], true
}