// persistor.go — drives Index ↔ storaged round-trips for vectord. // Single-file framed format per index (post-G1P-scrum C1 fix — // the prior two-file format had a torn-write class where a // successful envelope PUT followed by a failed graph PUT would // pass the "both present" List filter and load mismatched state): // // _vectors/.lhv1 — single binary blob, framed: // [4 bytes magic "LHV1"] // [4 bytes envelope_len uint32 big-endian] // [envelope bytes — JSON envelope from Index.Encode] // [graph bytes — rest of file, raw hnsw.Graph.Export] // // One Put → no torn-write window. One Get → no need to filter // half-saved orphans on List. package vectord import ( "bytes" "context" "encoding/binary" "errors" "fmt" "io" "sort" "strings" "git.agentview.dev/profit/golangLAKEHOUSE/internal/storeclient" ) // VectorPrefix is the storaged key prefix that holds vector index // state. Objects under this prefix are NOT user vectors — those // live in the index in memory; this is the persistence sidecar. const VectorPrefix = "_vectors/" // fileSuffix is the extension for the framed combined file. // Versioned in the magic+frame so future formats can land // alongside without rewriting all existing files. const fileSuffix = ".lhv1" // magic is the 4-byte file header. Lakehouse-go Vectord v1. var magic = [4]byte{'L', 'H', 'V', '1'} // Store is the subset of the storeclient API that persistor needs. // Defined as an interface so unit tests can inject a fake without // spinning up real storaged. type Store interface { Put(ctx context.Context, key string, body []byte) error Get(ctx context.Context, key string) ([]byte, error) Delete(ctx context.Context, key string) error List(ctx context.Context, prefix string) ([]string, error) } // ErrKeyMissing is returned by Load when the persisted file is // absent. The single-file format means partial persistence isn't // a concern (post-G1P-scrum C1) — torn-write would orphan a file // with bad framing, surfacing as a parse error not a silent miss. var ErrKeyMissing = errors.New("persistor: index file missing") // ErrBadFormat is returned by Load when the persisted bytes don't // match the expected magic or framing. Possible causes: bit-rot, // version skew (a future format we don't speak), partial PUT // before storaged grew transactional writes, or operator-edited. var ErrBadFormat = errors.New("persistor: bad file format") // Persistor wires Index Encode/Decode to a Store. type Persistor struct { store Store } func NewPersistor(s Store) *Persistor { return &Persistor{store: s} } // Save encodes idx into a single framed blob and writes it. // Atomic at the storaged layer: one Put → no torn-write hazard. func (p *Persistor) Save(ctx context.Context, idx *Index) error { var envBuf, graphBuf bytes.Buffer if err := idx.Encode(&envBuf, &graphBuf); err != nil { return fmt.Errorf("encode %q: %w", idx.params.Name, err) } body := frame(envBuf.Bytes(), graphBuf.Bytes()) if err := p.store.Put(ctx, fileKey(idx.params.Name), body); err != nil { return fmt.Errorf("put: %w", err) } return nil } // Load reconstructs an Index from the persisted single-file blob. // ErrKeyMissing if the file is absent; ErrBadFormat on framing // mismatch (corruption / version skew / operator edit). func (p *Persistor) Load(ctx context.Context, name string) (*Index, error) { body, err := p.store.Get(ctx, fileKey(name)) if err != nil { // Per scrum O-B1 (Opus): use errors.Is on the typed // sentinel, not substring matching. The prior substring // approach silently misclassified any 5xx body that // happened to contain "key not found" as missing. if errors.Is(err, storeclient.ErrKeyNotFound) { return nil, fmt.Errorf("%w: %q", ErrKeyMissing, name) } return nil, fmt.Errorf("get %q: %w", name, err) } envBytes, graphBytes, err := unframe(body) if err != nil { return nil, fmt.Errorf("unframe %q: %w", name, err) } idx, err := DecodeIndex(bytes.NewReader(envBytes), bytes.NewReader(graphBytes)) if err != nil { return nil, fmt.Errorf("decode %q: %w", name, err) } return idx, nil } // Delete removes the persisted file for an index. Idempotent on // missing — storaged DELETE is itself idempotent. func (p *Persistor) Delete(ctx context.Context, name string) error { if err := p.store.Delete(ctx, fileKey(name)); err != nil { return fmt.Errorf("delete %q: %w", name, err) } return nil } // List returns persisted index names, sorted. Single-file format // means no half-saved-orphan filtering needed — bad framing on Load // surfaces as ErrBadFormat per index, which the rehydrate caller // can log and skip. func (p *Persistor) List(ctx context.Context) ([]string, error) { keys, err := p.store.List(ctx, VectorPrefix) if err != nil { return nil, fmt.Errorf("list: %w", err) } out := make([]string, 0, len(keys)) for _, k := range keys { if !strings.HasPrefix(k, VectorPrefix) || !strings.HasSuffix(k, fileSuffix) { continue } name := strings.TrimSuffix(strings.TrimPrefix(k, VectorPrefix), fileSuffix) if name != "" { out = append(out, name) } } sort.Strings(out) return out, nil } func fileKey(name string) string { return VectorPrefix + name + fileSuffix } // frame produces the on-disk byte layout: // // [4 bytes magic "LHV1"] // [4 bytes envelope_len uint32 big-endian] // [envelope bytes] // [graph bytes — rest of file] func frame(envBytes, graphBytes []byte) []byte { out := make([]byte, 0, 8+len(envBytes)+len(graphBytes)) out = append(out, magic[:]...) var lenBuf [4]byte binary.BigEndian.PutUint32(lenBuf[:], uint32(len(envBytes))) out = append(out, lenBuf[:]...) out = append(out, envBytes...) out = append(out, graphBytes...) return out } // unframe reverses frame. Returns ErrBadFormat on any mismatch. func unframe(body []byte) (envBytes, graphBytes []byte, err error) { if len(body) < 8 { return nil, nil, fmt.Errorf("%w: body too short (%d bytes)", ErrBadFormat, len(body)) } if !bytes.Equal(body[:4], magic[:]) { return nil, nil, fmt.Errorf("%w: bad magic %q", ErrBadFormat, body[:4]) } envLen := binary.BigEndian.Uint32(body[4:8]) if int(envLen)+8 > len(body) { return nil, nil, fmt.Errorf("%w: envelope_len=%d exceeds body=%d", ErrBadFormat, envLen, len(body)) } envBytes = body[8 : 8+envLen] graphBytes = body[8+envLen:] return envBytes, graphBytes, nil } // (io import retained for future Save/Load that streams; current // path is buffer-based.) var _ = io.Discard