golangLAKEHOUSE/internal/vectord/index.go

// Package vectord owns the vector-search surface — HNSW indexes
// keyed by string IDs with optional opaque JSON metadata. The
// underlying library is github.com/coder/hnsw (pure Go, no cgo).
//
// G1 scope: in-memory only. Persistence to storaged + rehydrate
// across restart is the next piece — keeping it out of this layer
// makes the index API easier to test and keeps the storaged
// dependency optional for downstream tooling.
package vectord

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"math"
	"sync"

	"github.com/coder/hnsw"
)

// Distance names accepted by IndexParams.Distance. distanceFn maps
// each name to the matching coder/hnsw distance function; any other
// non-empty name is rejected with ErrUnknownDistance.
const (
	DistanceCosine    = "cosine"    // resolves to hnsw.CosineDistance; also the default when Distance is empty
	DistanceEuclidean = "euclidean" // resolves to hnsw.EuclideanDistance
)

// Default HNSW parameters — match coder/hnsw's NewGraph defaults
// which are tuned for OpenAI-shaped embeddings (1536-d, but the
// hyperparameters generalize). NewIndex substitutes these whenever
// the corresponding IndexParams field is <= 0.
const (
	DefaultM        = 16 // fallback for IndexParams.M
	DefaultEfSearch = 20 // fallback for IndexParams.EfSearch
)

// IndexParams describes one vector index. Once an Index is built,
// these are fixed — changing M / dimension / distance requires a
// rebuild.
//
// Field contract as enforced by NewIndex:
//   - Name must be non-empty.
//   - Dimension must be > 0; Add and Search reject vectors of any
//     other length.
//   - M and EfSearch are copied onto the coder/hnsw graph; values
//     <= 0 are replaced by DefaultM / DefaultEfSearch.
//   - Distance is one of the Distance* names; empty means cosine.
type IndexParams struct {
	Name      string `json:"name"`
	Dimension int    `json:"dimension"`
	M         int    `json:"m"`
	EfSearch  int    `json:"ef_search"`
	Distance  string `json:"distance"`
}

// Result is one search hit. Distance semantics depend on the
// configured distance function — for cosine it's `1 - cos(a,b)`
// where smaller = closer; for euclidean it's the L2 norm of
// `a - b`. Either way, smaller = closer and the result list is
// sorted ascending.
//
// ID is the key the vector was added under. Distance is recomputed
// by Search from the query and the stored vector. Metadata is the
// raw JSON recorded for the ID, and is omitted from the encoded
// form when none was recorded.
type Result struct {
	ID       string          `json:"id"`
	Distance float32         `json:"distance"`
	Metadata json.RawMessage `json:"metadata,omitempty"`
}

// Index wraps a coder/hnsw graph plus a side map of opaque JSON
// metadata per ID. Concurrency: read-heavy via Search (read-lock);
// Add and Delete take the write lock.
//
// Fields:
//   - params: the normalized IndexParams the index was built with;
//     not reassigned by any method in this file.
//   - g: the underlying graph, keyed by string ID. It is replaced
//     wholesale by resetGraphLocked, so do not cache the pointer.
//   - meta: raw JSON metadata per ID; sparse — an ID added with nil
//     metadata has no entry here.
type Index struct {
	params IndexParams
	g      *hnsw.Graph[string]
	meta   map[string]json.RawMessage
	// ids is the canonical ID set (a value-less map used as a set).
	// Maintained alongside i.g and i.meta in Add/Delete/resetGraph
	// so IDs() can enumerate without depending on the meta map's
	// sparse-on-nil-meta semantics. Underpins OPEN #1's merge
	// endpoint — necessary because two-tier callers
	// (multi_coord_stress et al.) sometimes Add with nil meta.
	ids map[string]struct{}
	mu  sync.RWMutex // guards g, meta and ids
}

// Errors surfaced to HTTP handlers. Sentinel-based so the wire
// layer can map to status codes via errors.Is. Each is returned
// wrapped (fmt.Errorf with %w) together with the offending value:
//   - ErrDimensionMismatch: Add or Search received a vector whose
//     length differs from the index dimension.
//   - ErrUnknownDistance: the distance name is neither cosine nor
//     euclidean.
//   - ErrInvalidParams: NewIndex was given an empty name or a
//     non-positive dimension.
var (
	ErrDimensionMismatch = errors.New("vectord: vector dimension mismatch")
	ErrUnknownDistance   = errors.New("vectord: unknown distance function")
	ErrInvalidParams     = errors.New("vectord: invalid index params")
)

// NewIndex validates p, fills in defaults, and returns an empty
// Index backed by a freshly configured coder/hnsw graph.
//
// Validation: an empty Name or a non-positive Dimension yields an
// error wrapping ErrInvalidParams; an unrecognized Distance yields
// one wrapping ErrUnknownDistance.
//
// Defaults: M and EfSearch fall back to DefaultM / DefaultEfSearch
// when <= 0, and an empty Distance becomes DistanceCosine. The
// normalized params are what Params() reports afterwards.
func NewIndex(p IndexParams) (*Index, error) {
	switch {
	case p.Name == "":
		return nil, fmt.Errorf("%w: empty name", ErrInvalidParams)
	case p.Dimension <= 0:
		return nil, fmt.Errorf("%w: dimension must be > 0 (got %d)", ErrInvalidParams, p.Dimension)
	}

	// Normalize the optional knobs on our local copy of p before
	// anything downstream reads them.
	if p.M <= 0 {
		p.M = DefaultM
	}
	if p.EfSearch <= 0 {
		p.EfSearch = DefaultEfSearch
	}
	if p.Distance == "" {
		p.Distance = DistanceCosine
	}

	metric, err := distanceFn(p.Distance)
	if err != nil {
		return nil, err
	}

	// Ml is deliberately left at the library default (0.25); making
	// it tunable is a G2 concern once real tuning data exists.
	graph := hnsw.NewGraph[string]()
	graph.M, graph.EfSearch, graph.Distance = p.M, p.EfSearch, metric

	idx := &Index{
		params: p,
		g:      graph,
		meta:   make(map[string]json.RawMessage),
		ids:    make(map[string]struct{}),
	}
	return idx, nil
}

// distanceFn resolves a distance name to the coder/hnsw function
// that implements it. The empty string is treated as cosine. Any
// other unrecognized name produces an error wrapping
// ErrUnknownDistance. Kept local (rather than going through
// coder/hnsw's registry) so it is easy to unit-test.
func distanceFn(name string) (hnsw.DistanceFunc, error) {
	if name == "" || name == DistanceCosine {
		return hnsw.CosineDistance, nil
	}
	if name == DistanceEuclidean {
		return hnsw.EuclideanDistance, nil
	}
	return nil, fmt.Errorf("%w: %q (want cosine or euclidean)", ErrUnknownDistance, name)
}

// Params reports the index's configuration as normalized by
// NewIndex. IndexParams is returned by value, so the caller gets
// its own copy.
func (i *Index) Params() IndexParams {
	return i.params
}

// Len reports how many vectors the underlying graph currently
// holds. Takes the read lock for the duration of the call.
func (i *Index) Len() int {
	i.mu.RLock()
	count := i.g.Len()
	i.mu.RUnlock()
	return count
}

// IDs returns every ID currently tracked by the index as a freshly
// allocated slice. The copy is made under the read lock, so callers
// may iterate it without holding any lock. Order is unspecified
// (it follows Go map iteration). Used by the merge endpoint
// (OPEN #1: periodic fresh→main index merge).
//
// The i.ids set is the source of truth here, not the meta map: meta
// only has entries for IDs that were given explicit metadata (the
// K-B1 nil-vs-{} distinction), so enumerating meta would silently
// skip items added with nil metadata.
func (i *Index) IDs() []string {
	i.mu.RLock()
	defer i.mu.RUnlock()

	snapshot := make([]string, len(i.ids))
	next := 0
	for id := range i.ids {
		snapshot[next] = id
		next++
	}
	return snapshot
}

// Add upserts the vector stored under id. If id is already in the
// graph its previous vector is dropped before the new one goes in,
// so repeated Adds of the same id are safe.
//
// Errors: an empty id is rejected; a vector whose length differs
// from the index dimension yields ErrDimensionMismatch; a vector
// that fails validateVector (non-finite component, or zero norm
// under cosine distance — post-scrum O-W3) is rejected with that
// function's error. All checks run before the write lock is taken.
//
// Metadata (post-scrum K-B1): a nil meta leaves whatever metadata
// is already recorded for id untouched; any non-nil value —
// including an empty `{}` — overwrites it. To drop metadata
// entirely, Delete the id and Add it again. This avoids silent
// metadata loss when the JSON `metadata` field is omitted on
// re-add.
func (i *Index) Add(id string, vec []float32, meta json.RawMessage) error {
	if id == "" {
		return errors.New("vectord: empty id")
	}
	if want, got := i.params.Dimension, len(vec); got != want {
		return fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, want, got)
	}
	if err := validateVector(vec, i.params.Distance); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()

	// Two coder/hnsw sharp edges have to be dodged on re-add:
	//   1. Graph.Add of a key that is already present panics with
	//      "node not added" (its length-invariant check fires), so
	//      the old node is removed first.
	//   2. Graph.Delete of the LAST node leaves layers[0] non-empty
	//      but entryless, and the following Add SIGSEGVs in Dims().
	//      In that case the whole graph is rebuilt instead.
	if _, present := i.g.Lookup(id); present {
		switch i.g.Len() {
		case 1:
			i.resetGraphLocked()
		default:
			i.g.Delete(id)
		}
	}

	i.g.Add(hnsw.MakeNode(id, vec))
	i.ids[id] = struct{}{}

	// Per scrum K-B1 (Kimi): nil means "keep existing metadata";
	// only an explicit non-nil value replaces it.
	if meta == nil {
		return nil
	}
	i.meta[id] = meta
	return nil
}

// resetGraphLocked swaps i.g for a brand-new, empty coder/hnsw
// graph carrying the same M, EfSearch and distance function as the
// one it replaces. The caller MUST already hold i.mu for writing.
//
// It exists to sidestep the library's "delete the last node, then
// segfault on the next Add" bug. The meta and ids maps are left
// untouched; callers are responsible for keeping them consistent.
func (i *Index) resetGraphLocked() {
	previous := i.g

	i.g = hnsw.NewGraph[string]()
	i.g.M = i.params.M
	i.g.EfSearch = i.params.EfSearch
	i.g.Distance = previous.Distance
}

// ValidateVector exposes the same vector check that Add runs
// internally, so the HTTP handler can pre-validate a whole batch
// before committing any of it (scrum O-I3, G1P). It returns nil
// when vec is acceptable for an index using the given distance
// name, and the validation error otherwise.
func ValidateVector(vec []float32, distance string) error {
	if err := validateVector(vec, distance); err != nil {
		return err
	}
	return nil
}

// validateVector rejects vectors that would poison the HNSW graph
// or produce NaN distances. Per scrum O-W3 (Opus, G1).
//
// It returns an error when:
//   - any component is NaN or ±Inf (the message names the first
//     offending position), or
//   - the distance is cosine and every component is zero, since
//     the cosine of a zero-norm vector is undefined.
//
// An empty distance name counts as cosine, matching distanceFn and
// NewIndex, which both default "" to cosine. Without that, a caller
// validating against un-normalized params would let a zero vector
// through to a cosine index.
func validateVector(vec []float32, distance string) error {
	// Accumulate in float64: the square of any finite float32 is
	// representable there, so the sum is zero only when every
	// component is zero.
	var sumSq float64
	for j, v := range vec {
		f := float64(v)
		if math.IsNaN(f) || math.IsInf(f, 0) {
			return fmt.Errorf("vectord: vec[%d] is non-finite (got %v)", j, v)
		}
		sumSq += f * f
	}

	cosine := distance == DistanceCosine || distance == ""
	if cosine && sumSq == 0 {
		return errors.New("vectord: zero-norm vector under cosine distance")
	}
	return nil
}

// BatchItem is one entry in a BatchAdd call. Same per-field
// contract as Add: ID + Vector required, Metadata follows
// upsert-style semantics (nil = leave existing alone).
//
// BatchAdd does not validate these fields itself; its contract
// requires the caller to have checked them beforehand.
type BatchItem struct {
	ID       string
	Vector   []float32
	Metadata json.RawMessage
}

// BatchAdd upserts every item while holding the write lock once,
// and hands the whole batch to coder/hnsw's Graph.Add in a single
// variadic call. Compared with looping over Add this means one lock
// acquisition and one library call per HTTP batch instead of N.
// An empty batch is a no-op. The current implementation always
// returns nil.
//
// Contract: the caller MUST have validated the items already (ID
// non-empty, vector length equal to the index dimension, vector
// finite and non-zero-norm under cosine). That validation lives in
// the HTTP handler so per-item error messages stay precise; nothing
// is re-checked here.
//
// Duplicate IDs inside one batch are collapsed with last-write-wins
// semantics before insertion — the final occurrence of an ID is the
// one stored. Without this, coder/hnsw's "node not added"
// length-invariant panics on the second occurrence (caught by the
// 2026-04-29 cross-lineage scrum, Opus BLOCK).
//
// Metadata follows Add's rule: nil leaves existing metadata alone,
// non-nil overwrites.
func (i *Index) BatchAdd(items []BatchItem) error {
	if len(items) == 0 {
		return nil
	}

	// Only pay for the dedup copy when the batch really repeats an ID.
	if containsDuplicateID(items) {
		items = dedupBatchLastWins(items)
	}

	i.mu.Lock()
	defer i.mu.Unlock()

	// Evict IDs that are already stored so the variadic Add below
	// never sees a re-add. Same library quirk as in Add: removing
	// the graph's only node must go through a full reset, because
	// Graph.Delete of the last node leaves layers[0] entryless.
	for n := range items {
		key := items[n].ID
		if _, present := i.g.Lookup(key); !present {
			continue
		}
		if i.g.Len() == 1 {
			i.resetGraphLocked()
			continue
		}
		i.g.Delete(key)
	}

	batch := make([]hnsw.Node[string], 0, len(items))
	for n := range items {
		batch = append(batch, hnsw.MakeNode(items[n].ID, items[n].Vector))
	}
	i.g.Add(batch...)

	// Bookkeeping happens only after the graph accepted the batch.
	for n := range items {
		entry := &items[n]
		i.ids[entry.ID] = struct{}{}
		if entry.Metadata == nil {
			continue
		}
		i.meta[entry.ID] = entry.Metadata
	}
	return nil
}

// containsDuplicateID reports whether any ID occurs more than once
// in items. BatchAdd uses it as a cheap pre-check so the common,
// duplicate-free batch skips the dedup copy. It returns as soon as
// the first repeat is found.
func containsDuplicateID(items []BatchItem) bool {
	visited := make(map[string]struct{}, len(items))
	for n := range items {
		visited[items[n].ID] = struct{}{}
		// After n+1 insertions of distinct IDs the set holds n+1
		// keys; anything smaller means this ID was already there.
		if len(visited) != n+1 {
			return true
		}
	}
	return false
}

// dedupBatchLastWins returns a new slice holding, for every ID in
// items, only its final occurrence. Survivors keep the relative
// order they had in the input. This mirrors map-style "set X to A,
// then to B" semantics: B wins. The input slice is not modified.
func dedupBatchLastWins(items []BatchItem) []BatchItem {
	// First pass: remember the position of each ID's last occurrence.
	finalPos := make(map[string]int, len(items))
	for pos := range items {
		finalPos[items[pos].ID] = pos
	}

	// Second pass: keep an item only if it sits at that position.
	kept := make([]BatchItem, 0, len(finalPos))
	for pos := range items {
		if finalPos[items[pos].ID] != pos {
			continue
		}
		kept = append(kept, items[pos])
	}
	return kept
}

// Delete removes id from the index along with its metadata and its
// entry in the ID set. It reports whether the graph held a vector
// for id. The metadata and ID-set entries are cleared even when the
// graph did not hold the id.
//
// Removing the only remaining node swaps in a fresh graph instead
// of calling Graph.Delete. coder/hnsw's Delete of the last node
// leaves layers[0] non-empty but entryless, and the next Add then
// SIGSEGVs in Dims(). Add and BatchAdd already avoid this on their
// own pre-delete paths; without the same guard here, emptying an
// index through Delete would make the next insert crash.
func (i *Index) Delete(id string) bool {
	i.mu.Lock()
	defer i.mu.Unlock()

	delete(i.meta, id)
	delete(i.ids, id)

	if _, present := i.g.Lookup(id); !present {
		return false
	}
	if i.g.Len() == 1 {
		// id is the graph's sole node: rebuild rather than delete.
		i.resetGraphLocked()
		return true
	}
	return i.g.Delete(id)
}

// Search looks up the k nearest neighbors of query and returns one
// Result per hit, each carrying the hit's ID, its distance to the
// query, and any metadata recorded for it. Hits are reported in
// the order coder/hnsw's Search yields them (contract: ascending
// by distance); they are not re-sorted here.
//
// Errors: a query whose length differs from the index dimension
// yields ErrDimensionMismatch; k <= 0 is rejected. Both checks run
// before the read lock is taken.
//
// coder/hnsw's Search returns `[]Node[K]` without distances — it
// computes them internally but drops them from the public API — so
// each distance is recomputed here from the returned vector. That
// costs O(k·dim) per search, negligible at typical k=10 / dim<2048.
func (i *Index) Search(query []float32, k int) ([]Result, error) {
	if want, got := i.params.Dimension, len(query); got != want {
		return nil, fmt.Errorf("%w: index dim=%d, got=%d", ErrDimensionMismatch, want, got)
	}
	if k < 1 {
		return nil, errors.New("vectord: k must be > 0")
	}

	i.mu.RLock()
	defer i.mu.RUnlock()

	// Per scrum O-I2 (Opus): reuse the function stored on the graph
	// instead of re-resolving the distance name on every search.
	metric := i.g.Distance
	neighbors := i.g.Search(query, k)

	results := make([]Result, 0, len(neighbors))
	for _, nb := range neighbors {
		results = append(results, Result{
			ID:       nb.Key,
			Distance: metric(query, nb.Value),
			Metadata: i.meta[nb.Key],
		})
	}
	return results, nil
}

// IndexEnvelope is the JSON shape persisted alongside the binary
// HNSW graph bytes. params + metadata + format version travel
// together; the graph itself is opaque binary that round-trips
// through hnsw.Graph.Export / Import.
//
// Fields:
//   - Version: format version; Encode writes envelopeVersion and
//     DecodeIndex rejects anything outside 1..envelopeVersion.
//   - Params: the index params, re-validated through NewIndex on
//     decode.
//   - Metadata: the sparse per-ID metadata map.
//   - IDs: the full ID set. Omitted from the JSON when empty; on
//     decode an absent list falls back to the Metadata keys.
//
// Version history:
//   - v1: original; metadata-only ID inference on decode (loses
//     items added with nil metadata across persistence).
//   - v2: explicit IDs slice closes the v1 gap. Older v1 envelopes
//     still load (backward-compat fallback path in DecodeIndex).
type IndexEnvelope struct {
	Version  int                        `json:"version"`
	Params   IndexParams                `json:"params"`
	Metadata map[string]json.RawMessage `json:"metadata"`
	IDs      []string                   `json:"ids,omitempty"` // v2+ — canonical ID set
}

// envelopeVersion is the version this build EMITS. Decoders accept
// any envelope with a version from 1 up to this value (with
// version-specific fallbacks for older shapes). Bumping this
// constant means the format changed incompatibly going forward;
// builds that still carry the older constant will reject the newer
// envelopes with ErrVersionMismatch.
const envelopeVersion = 2

// ErrVersionMismatch is returned (wrapped) by DecodeIndex when the
// envelope claims a version this build doesn't understand — that
// is, below 1 or above envelopeVersion. Match it with errors.Is.
var ErrVersionMismatch = errors.New("vectord: unknown envelope version")

// Encode serializes the index into two streams: a JSON envelope
// (format version, params, metadata, ID list) written to envelopeW,
// and the raw output of hnsw.Graph.Export written to graphW.
// Keeping them separate lets the persistor store each under its own
// storaged key without reframing.
//
// The envelope is written first. If that fails, graphW is not
// touched. Errors from either step are wrapped with the step name.
// The read lock is held for the whole call.
func (i *Index) Encode(envelopeW, graphW io.Writer) error {
	i.mu.RLock()
	defer i.mu.RUnlock()

	// v2: the ID set is written out explicitly so DecodeIndex can
	// rebuild i.ids without inferring it from the metadata keys.
	env := IndexEnvelope{
		Version:  envelopeVersion,
		Params:   i.params,
		Metadata: i.meta,
		IDs:      make([]string, 0, len(i.ids)),
	}
	for id := range i.ids {
		env.IDs = append(env.IDs, id)
	}

	enc := json.NewEncoder(envelopeW)
	if err := enc.Encode(env); err != nil {
		return fmt.Errorf("encode envelope: %w", err)
	}
	if err := i.g.Export(graphW); err != nil {
		return fmt.Errorf("export graph: %w", err)
	}
	return nil
}

// DecodeIndex rebuilds an Index from the envelope and graph streams
// produced by Encode. The returned Index is independent — closing
// either reader after this call doesn't affect the result.
//
// Steps, in order: decode the JSON envelope; check its version;
// build an empty Index from the envelope params via NewIndex (so
// they are re-validated); import the graph bytes; restore metadata
// and the ID set.
//
// Errors: envelope decode and graph import failures are wrapped
// with the step name; a version below 1 or above envelopeVersion
// yields ErrVersionMismatch rather than a half-decoded index;
// NewIndex errors are returned as-is.
func DecodeIndex(envelopeR, graphR io.Reader) (*Index, error) {
	env := IndexEnvelope{}
	if err := json.NewDecoder(envelopeR).Decode(&env); err != nil {
		return nil, fmt.Errorf("decode envelope: %w", err)
	}

	// v1 (ids inferred from metadata keys) and v2 (explicit ids)
	// are both understood; anything else is refused.
	if v := env.Version; v < 1 || v > envelopeVersion {
		return nil, fmt.Errorf("%w: have %d, got %d",
			ErrVersionMismatch, envelopeVersion, v)
	}

	idx, err := NewIndex(env.Params)
	if err != nil {
		return nil, err
	}
	if err := idx.g.Import(graphR); err != nil {
		return nil, fmt.Errorf("import graph: %w", err)
	}

	if env.Metadata != nil {
		idx.meta = env.Metadata
	}

	// v1 backward-compat path (also taken when a v2 envelope has no
	// ids list): the metadata keyset is the best signal available,
	// accepting that items stored with nil metadata stay invisible
	// to IDs()/merge until they are re-Add'd.
	if env.Version < 2 || env.IDs == nil {
		for id := range idx.meta {
			idx.ids[id] = struct{}{}
		}
		return idx, nil
	}

	// v2: the explicit list is canonical. Closes the scrum
	// post_role_gate_v1 convergent finding (Opus + Kimi).
	for _, id := range env.IDs {
		idx.ids[id] = struct{}{}
	}
	return idx, nil
}

// Lookup fetches the vector and metadata stored under id. ok is
// false (and both other results nil) when the graph has no such
// id. meta is nil when the id was stored without metadata.
//
// Per scrum O-W1 (Opus): the returned vector is a COPY. coder/hnsw's
// Lookup hands back the graph's own slice, and letting a caller
// mutate that would corrupt the index outside the lock.
func (i *Index) Lookup(id string) (vec []float32, meta json.RawMessage, ok bool) {
	i.mu.RLock()
	defer i.mu.RUnlock()

	stored, present := i.g.Lookup(id)
	if !present {
		return nil, nil, false
	}

	// Detach from the graph's backing array before handing it out.
	clone := append(make([]float32, 0, len(stored)), stored...)
	return clone, i.meta[id], true
}