// Package vectord owns the vector-search surface — HNSW indexes
// keyed by string IDs with optional opaque JSON metadata. The
// underlying library is github.com/coder/hnsw (pure Go, no cgo).
//
// G1 scope: in-memory only. Persistence to storaged + rehydrate
// across restart is the next piece — keeping it out of this layer
// makes the index API easier to test and keeps the storaged
// dependency optional for downstream tooling.
package vectord

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"math"
	"sync"

	"github.com/coder/hnsw"
)

// Distance names accepted by IndexParams.Distance.
const (
	DistanceCosine    = "cosine"
	DistanceEuclidean = "euclidean"
)

// Default HNSW parameters — match coder/hnsw's NewGraph defaults,
// which are tuned for OpenAI-shaped embeddings (1536-d, but the
// hyperparameters generalize).
const (
	DefaultM        = 16
	DefaultEfSearch = 20
)

// smallIndexRebuildThreshold guards against coder/hnsw v0.6.1's
// degenerate-state nil-deref (graph.go:95 layerNode.search) which
// fires when the graph transitions through low-len states with a
// stale entry pointer. Below this threshold, Add and BatchAdd
// rebuild the entire graph from scratch — fresh graph + one
// variadic Add never exercises the buggy incremental path.
//
// Why 32: HNSW's value is sub-linear search at large N; at N<32 a
// rebuild's O(n) cost (snapshot ids + bulk Add) is negligible
// (~µs at 768-d). The boundary is intentionally above the small
// playbook-corpus regime (where multitier_100k surfaced the bug)
// but well below realistic working-set indexes.
//
// The recover() guard in BatchAdd remains as belt-and-suspenders
// for any incremental-path edge cases past the threshold.
const smallIndexRebuildThreshold = 32

// IndexParams describes one vector index. Once an Index is built,
// these are fixed — changing M / dimension / distance requires a
// rebuild.
type IndexParams struct {
	Name      string `json:"name"`
	Dimension int    `json:"dimension"`
	M         int    `json:"m"`
	EfSearch  int    `json:"ef_search"`
	Distance  string `json:"distance"`
}

// Result is one search hit. Distance semantics depend on the
// configured distance function — for cosine it's `1 - cos(a,b)`
// where smaller = closer; for euclidean it's the L2 norm of
// `a - b`. Either way, smaller = closer and the result list is
// sorted ascending.
type Result struct {
	ID       string          `json:"id"`
	Distance float32         `json:"distance"`
	Metadata json.RawMessage `json:"metadata,omitempty"`
}

// Index wraps a coder/hnsw graph plus side maps of opaque JSON
// metadata and raw vectors per ID. Concurrency: read-heavy via
// Search (read-lock); Add and Delete take the write lock.
//
// Why we keep vectors in a side map (i.vectors) in addition to the
// graph: coder/hnsw v0.6.1 has a known bug where the graph
// transitions through degenerate states after Delete cycles, and
// later operations (Add / Lookup) can panic with nil-deref. The
// side map is independent of graph state, so the rebuild path can
// always reconstruct a clean graph even if the current one is
// corrupted. Memory cost is ~2x for vectors (also held in graph),
// which is acceptable for the safety it buys. Verified necessary
// 2026-05-01 multitier_100k where the bug fired at len=40.
type Index struct {
	params IndexParams
	g      *hnsw.Graph[string]
	meta   map[string]json.RawMessage
	// vectors is the panic-safe source of truth — every successful
	// Add stores the vector here, every Delete removes it, and
	// rebuildGraphLocked reads from this map (not i.g.Lookup) so
	// it tolerates a corrupted graph. Map keys are also the
	// canonical ID set (replaces the prior i.ids map).
	vectors map[string][]float32
	mu      sync.RWMutex
}

// Errors surfaced to HTTP handlers. Sentinel-based so the wire
// layer can map to status codes via errors.Is.
var ( ErrDimensionMismatch = errors.New("vectord: vector dimension mismatch") ErrUnknownDistance = errors.New("vectord: unknown distance function") ErrInvalidParams = errors.New("vectord: invalid index params") ) // NewIndex builds a fresh index from validated params. func NewIndex(p IndexParams) (*Index, error) { if p.Name == "" { return nil, fmt.Errorf("%w: empty name", ErrInvalidParams) } if p.Dimension <= 0 { return nil, fmt.Errorf("%w: dimension must be > 0 (got %d)", ErrInvalidParams, p.Dimension) } if p.M <= 0 { p.M = DefaultM } if p.EfSearch <= 0 { p.EfSearch = DefaultEfSearch } if p.Distance == "" { p.Distance = DistanceCosine } dist, err := distanceFn(p.Distance) if err != nil { return nil, err } g := hnsw.NewGraph[string]() g.M = p.M g.EfSearch = p.EfSearch g.Distance = dist // Ml stays at the library default (0.25); exposing it as a knob // is a G2 concern when we have real tuning data. return &Index{ params: p, g: g, meta: make(map[string]json.RawMessage), vectors: make(map[string][]float32), }, nil } // distanceFn maps the string name to the underlying function. // Easier to unit-test than calling out to coder/hnsw's registry. func distanceFn(name string) (hnsw.DistanceFunc, error) { switch name { case DistanceCosine, "": return hnsw.CosineDistance, nil case DistanceEuclidean: return hnsw.EuclideanDistance, nil } return nil, fmt.Errorf("%w: %q (want cosine or euclidean)", ErrUnknownDistance, name) } // Params returns a copy of the immutable index params. func (i *Index) Params() IndexParams { return i.params } // Len returns the number of vectors currently in the index. // // Reads from i.vectors (the panic-safe source of truth) rather // than i.g.Len() — the latter can drift past Len during a corrupted // graph state. i.vectors only changes on successful Add/Delete. func (i *Index) Len() int { i.mu.RLock() defer i.mu.RUnlock() return len(i.vectors) } // IDs returns a snapshot of every ID currently stored in the index. 
// Allocated under the read lock so callers receive a stable copy and // can iterate without holding the lock. Used by the merge endpoint // (OPEN #1: periodic fresh→main index merge — drains the fresh // corpus into the main one when it crosses the operational ceiling). // // Source of truth: the i.vectors keyset. The meta map stays sparse // (only items with explicit metadata appear there, per the K-B1 // nil-vs-{} distinction); using meta as the ID set would silently // miss items added with nil metadata. func (i *Index) IDs() []string { i.mu.RLock() defer i.mu.RUnlock() out := make([]string, 0, len(i.vectors)) for id := range i.vectors { out = append(out, id) } return out } // Add inserts a vector with optional metadata, with replace // semantics for the vector: if id already exists, the prior // vector is removed first. Dim must match the index dim or // ErrDimensionMismatch is returned. // // Metadata semantics (post-scrum K-B1): nil meta is "leave // existing alone" (upsert-style); to clear metadata, pass an // empty `{}` or Delete+Add. This avoids silent metadata loss // when the JSON `metadata` field is omitted on re-add. // // Validates that all vector components are finite (post-scrum // O-W3). NaN/Inf in any component poisons HNSW: distance // comparisons return false for both `<` and `>`, breaking the // search heap invariants. Zero-norm vectors are also rejected // under cosine distance — cos(0,x) = NaN. // // Note: coder/hnsw's Graph.Add panics on re-adding an existing // key (internal "node not added" length-invariant check). We // pre-Delete to make Add idempotent on re-insert. 
func (i *Index) Add(id string, vec []float32, meta json.RawMessage) error { if id == "" { return errors.New("vectord: empty id") } if len(vec) != i.params.Dimension { return fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, i.params.Dimension, len(vec)) } if err := validateVector(vec, i.params.Distance); err != nil { return err } i.mu.Lock() defer i.mu.Unlock() // Re-add: drop existing graph entry AND side-store entry before // the new Add. Without removing from i.vectors, the rebuild path // below would see both old and new entries and double-add. // safeGraphDelete tolerates a corrupted graph; i.vectors is // authoritative regardless. if _, exists := i.vectors[id]; exists { _ = safeGraphDelete(i.g, id) delete(i.vectors, id) } newNode := hnsw.MakeNode(id, vec) postLen := len(i.vectors) + 1 addOK := false if postLen <= smallIndexRebuildThreshold { i.rebuildGraphLocked([]hnsw.Node[string]{newNode}) addOK = true } else { // Warm path: try incremental Add. If the graph is in a // degenerate state from a prior Delete cycle, this panics; // we recover and rebuild from the panic-safe i.vectors map. addOK = safeGraphAdd(i.g, newNode) if !addOK { i.rebuildGraphLocked([]hnsw.Node[string]{newNode}) addOK = true } } if !addOK { return errors.New("vectord: hnsw add failed even after rebuild — should never happen") } // Commit to the side stores after the graph mutation succeeded. out := make([]float32, len(vec)) copy(out, vec) i.vectors[id] = out if meta != nil { // Per scrum K-B1 (Kimi): only OVERWRITE on explicit non-nil. // nil = "leave existing meta alone" (upsert). To clear, the // caller should send an empty `{}` body or Delete the id. i.meta[id] = meta } return nil } // safeGraphAdd wraps coder/hnsw's variadic Graph.Add with a // recover() so v0.6.1's degenerate-state nil-deref returns false // instead of crashing the caller. Caller is expected to fall back // to rebuildGraphLocked on false. 
func safeGraphAdd(g *hnsw.Graph[string], nodes ...hnsw.Node[string]) (ok bool) { defer func() { if r := recover(); r != nil { ok = false } }() g.Add(nodes...) return true } // safeGraphDelete wraps Graph.Delete with recover for the same // reason — Delete can also touch corrupted layer state. func safeGraphDelete(g *hnsw.Graph[string], id string) (ok bool) { defer func() { if r := recover(); r != nil { ok = false } }() return g.Delete(id) } // rebuildGraphLocked replaces i.g with a fresh graph containing // the current items (snapshotted from the panic-safe i.vectors // map) plus the supplied extras, in one bulk Add into a freshly- // created graph. Caller MUST hold the write lock. // // Independence from i.g state is the load-bearing property — even // if i.g is corrupted from a prior coder/hnsw v0.6.1 panic, this // rebuild produces a clean graph because i.vectors is maintained // only on successful Add/Delete. // // Caller MUST ensure that any extra IDs already present in // i.vectors have been removed first (otherwise the bulk Add will // see duplicate IDs and panic). func (i *Index) rebuildGraphLocked(extras []hnsw.Node[string]) { g := hnsw.NewGraph[string]() g.M = i.params.M g.EfSearch = i.params.EfSearch g.Distance = i.g.Distance nodes := make([]hnsw.Node[string], 0, len(i.vectors)+len(extras)) for id, vec := range i.vectors { nodes = append(nodes, hnsw.MakeNode(id, vec)) } nodes = append(nodes, extras...) if len(nodes) > 0 { g.Add(nodes...) } i.g = g } // ValidateVector is the exported form of validateVector — the HTTP // handler pre-validates batches before committing, so it needs the // same predicate Add uses internally. Per scrum O-I3 (G1P). func ValidateVector(vec []float32, distance string) error { return validateVector(vec, distance) } // validateVector rejects vectors that would poison the HNSW // graph or produce NaN distances. Per scrum O-W3 (Opus, G1). 
func validateVector(vec []float32, distance string) error { var sumSq float64 for j, v := range vec { f := float64(v) if math.IsNaN(f) || math.IsInf(f, 0) { return fmt.Errorf("vectord: vec[%d] is non-finite (got %v)", j, v) } sumSq += f * f } if distance == DistanceCosine && sumSq == 0 { return errors.New("vectord: zero-norm vector under cosine distance") } return nil } // BatchItem is one entry in a BatchAdd call. Same per-field // contract as Add: ID + Vector required, Metadata follows // upsert-style semantics (nil = leave existing alone). type BatchItem struct { ID string Vector []float32 Metadata json.RawMessage } // BatchAdd inserts a slice of items under a single write-lock, with // one variadic call into coder/hnsw's Graph.Add. Net win vs. a loop // of single Add calls: N→1 lock acquisitions per HTTP batch and one // variadic library call instead of N. // // Contract: items MUST be pre-validated by the caller (id non-empty, // vector dimension matches, vector finite + non-zero-norm under // cosine). Pre-validation lives in the HTTP handler so per-item // error messages stay precise; reproducing it here would force // position-encoded errors on every consumer. // // Intra-batch duplicate IDs: dedup'd internally with last-write-wins // semantics (matches map-style behavior — second occurrence of an // ID replaces the first). Without dedup, coder/hnsw's "node not // added" length-invariant panics on the second occurrence. Caught // by 2026-04-29 cross-lineage scrum (Opus BLOCK). func (i *Index) BatchAdd(items []BatchItem) error { if len(items) == 0 { return nil } // Intra-batch dedup, last-write-wins. Walk forward, record the // LAST index for each ID, then keep only items whose index is // the recorded last. Preserves order of last occurrences in the // original positions. 
if hasDup := containsDuplicateID(items); hasDup { items = dedupBatchLastWins(items) } i.mu.Lock() defer i.mu.Unlock() // Pre-pass: drop any existing IDs from BOTH the graph and the // side-store map so the rebuild snapshot doesn't double-add and // the warm path's variadic Add never sees a re-add. Graph Delete // is wrapped in safeGraphDelete because corrupted graphs can also // panic on Delete; the side store remains authoritative. for _, it := range items { if _, exists := i.vectors[it.ID]; exists { _ = safeGraphDelete(i.g, it.ID) delete(i.vectors, it.ID) } } nodes := make([]hnsw.Node[string], len(items)) for j, it := range items { nodes[j] = hnsw.MakeNode(it.ID, it.Vector) } // Below threshold: rebuild from scratch unconditionally — fresh // graph + one bulk Add never exercises v0.6.1's degenerate-state // path. At/above threshold: try warm incremental Add, fall back // to rebuild on panic. The rebuild always succeeds because // i.vectors is independent of graph state. postLen := len(i.vectors) + len(nodes) if postLen <= smallIndexRebuildThreshold { i.rebuildGraphLocked(nodes) } else { if !safeGraphAdd(i.g, nodes...) { i.rebuildGraphLocked(nodes) } } // Commit to side stores after the graph is in good shape. for _, it := range items { out := make([]float32, len(it.Vector)) copy(out, it.Vector) i.vectors[it.ID] = out if it.Metadata != nil { i.meta[it.ID] = it.Metadata } } return nil } // containsDuplicateID is a fast pre-check — if no dups, skip the // dedup allocation. Most batches won't have dups so this is a hot // path. func containsDuplicateID(items []BatchItem) bool { seen := make(map[string]struct{}, len(items)) for _, it := range items { if _, ok := seen[it.ID]; ok { return true } seen[it.ID] = struct{}{} } return false } // dedupBatchLastWins keeps only the last occurrence of each ID, // preserving the relative order of those last occurrences. This // matches map-style "set X to A then to B" semantics: B wins. 
func dedupBatchLastWins(items []BatchItem) []BatchItem { lastIdx := make(map[string]int, len(items)) for j, it := range items { lastIdx[it.ID] = j } out := make([]BatchItem, 0, len(lastIdx)) for j, it := range items { if lastIdx[it.ID] == j { out = append(out, it) } } return out } // Delete removes id from the index. Returns true if present. // // The side store i.vectors is the authority on presence; the graph // Delete is best-effort (can panic on corrupted state, recovered // via safeGraphDelete). The side store always reflects the // post-Delete truth so the next rebuild produces a clean graph. func (i *Index) Delete(id string) bool { i.mu.Lock() defer i.mu.Unlock() _, present := i.vectors[id] if !present { return false } delete(i.meta, id) delete(i.vectors, id) _ = safeGraphDelete(i.g, id) return true } // Search returns the k nearest neighbors of query, sorted // ascending by distance. // // Note: coder/hnsw's Search returns `[]Node[K]` without distances — // they're computed internally in the search candidate heap but // dropped from the public API. We recompute distance from the // returned vectors. O(k·dim) per search, negligible at typical // k=10 / dim<2048. func (i *Index) Search(query []float32, k int) ([]Result, error) { if len(query) != i.params.Dimension { return nil, fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, i.params.Dimension, len(query)) } if k <= 0 { return nil, errors.New("vectord: k must be > 0") } i.mu.RLock() defer i.mu.RUnlock() // Per scrum O-I2 (Opus): use the stored distance function // directly rather than re-resolving the string name on every // search. The graph's Distance is set once at NewIndex. dist := i.g.Distance hits := i.g.Search(query, k) out := make([]Result, len(hits)) for j, n := range hits { out[j] = Result{ ID: n.Key, Distance: dist(query, n.Value), Metadata: i.meta[n.Key], } } return out, nil } // IndexEnvelope is the JSON shape persisted alongside the binary // HNSW graph bytes. 
params + metadata + format version travel // together; the graph itself is opaque binary that round-trips // through hnsw.Graph.Export / Import. // // Version history: // - v1: original; metadata-only ID inference on decode (loses // items added with nil metadata across persistence). // - v2: explicit IDs slice closes the v1 gap. Older v1 envelopes // still load (backward-compat fallback path in DecodeIndex). type IndexEnvelope struct { Version int `json:"version"` Params IndexParams `json:"params"` Metadata map[string]json.RawMessage `json:"metadata"` IDs []string `json:"ids,omitempty"` // v2+ — canonical ID set } // envelopeVersion is the version this build EMITS. Decoders accept // any envelope ≤ this version (with version-specific fallbacks for // older shapes). Bumping this constant means the format changed // incompatibly going forward; readers without the version-specific // branch will return ErrVersionMismatch. const envelopeVersion = 2 // ErrVersionMismatch is returned by DecodeIndex when the envelope // claims a version this build doesn't understand. var ErrVersionMismatch = errors.New("vectord: unknown envelope version") // Encode writes the index's JSON envelope (params + metadata) and // the binary HNSW graph bytes through two writers. Two-stream // shape lets the persistor write each to a distinct storaged key // without reframing. // // envelopeW receives params+metadata as JSON; graphW receives the // raw output of hnsw.Graph.Export. func (i *Index) Encode(envelopeW, graphW io.Writer) error { i.mu.RLock() defer i.mu.RUnlock() // v2: serialize the canonical ID set explicitly so DecodeIndex // can restore i.vectors without depending on meta-key inference. 
idList := make([]string, 0, len(i.vectors)) for id := range i.vectors { idList = append(idList, id) } env := IndexEnvelope{ Version: envelopeVersion, Params: i.params, Metadata: i.meta, IDs: idList, } if err := json.NewEncoder(envelopeW).Encode(env); err != nil { return fmt.Errorf("encode envelope: %w", err) } if err := i.g.Export(graphW); err != nil { return fmt.Errorf("export graph: %w", err) } return nil } // DecodeIndex reconstructs an Index from a previously-Encoded pair // of streams. The returned Index is independent — closing either // reader after this call doesn't affect the result. func DecodeIndex(envelopeR, graphR io.Reader) (*Index, error) { var env IndexEnvelope if err := json.NewDecoder(envelopeR).Decode(&env); err != nil { return nil, fmt.Errorf("decode envelope: %w", err) } // Accept v1 (legacy: ids inferred from meta keys) and v2 (explicit // ids slice). Future versions are unknown — return mismatch // rather than half-decoding. if env.Version < 1 || env.Version > envelopeVersion { return nil, fmt.Errorf("%w: have %d, got %d", ErrVersionMismatch, envelopeVersion, env.Version) } idx, err := NewIndex(env.Params) if err != nil { return nil, err } if err := idx.g.Import(graphR); err != nil { return nil, fmt.Errorf("import graph: %w", err) } if env.Metadata != nil { idx.meta = env.Metadata } // Reconstruct i.vectors from the imported graph. Source of IDs: // v2 envelope's explicit IDs slice (canonical), or v1 fallback // via the meta keys. We then call i.g.Lookup on each ID to // recover the vector — Lookup on a freshly Imported graph is // safe (no degenerate state from prior Delete cycles). var idSource []string if env.Version >= 2 && env.IDs != nil { idSource = env.IDs } else { // v1 backward-compat path. Old envelopes don't carry ids // explicitly; the metadata keyset is the best signal we have. 
idSource = make([]string, 0, len(idx.meta)) for id := range idx.meta { idSource = append(idSource, id) } } for _, id := range idSource { if vec, ok := idx.g.Lookup(id); ok { out := make([]float32, len(vec)) copy(out, vec) idx.vectors[id] = out } } return idx, nil } // Lookup returns the stored vector + metadata for an id. // // Per scrum O-W1 (Opus): the vector is COPIED before return. // coder/hnsw's Lookup hands back the underlying graph slice; // caller mutation would corrupt the index without locking. func (i *Index) Lookup(id string) (vec []float32, meta json.RawMessage, ok bool) { i.mu.RLock() defer i.mu.RUnlock() v, found := i.g.Lookup(id) if !found { return nil, nil, false } out := make([]float32, len(v)) copy(out, v) return out, i.meta[id], true }