package main import ( "testing" "git.agentview.dev/profit/golangLAKEHOUSE/internal/vectord" ) func TestValidateKey(t *testing.T) { cases := []struct { name string key string wantErr bool }{ {"happy path", "data/workers_500k.parquet", false}, {"deeply nested", "a/b/c/d/e/f/g.txt", false}, {"empty", "", true}, {"too long", string(make([]byte, 1025)), true}, {"NUL byte", "foo\x00bar", true}, {"leading slash", "/foo/bar", true}, {"dotdot component", "data/../etc/passwd", true}, {"dotdot at end", "data/..", true}, {"dotdot at start", "../etc", true}, {"single dot is fine", "data/./x.txt", false}, {"dotdot embedded is fine", "data/.. /x.txt", false}, // not a path component {"newline", "foo\nbar", true}, {"carriage return", "foo\rbar", true}, {"tab", "foo\tbar", true}, {"unicode is fine", "résumé/data.txt", false}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { err := validateKey(tc.key) if tc.wantErr && err == nil { t.Errorf("expected error for %q, got nil", tc.key) } if !tc.wantErr && err != nil { t.Errorf("expected ok for %q, got %v", tc.key, err) } }) } } func TestMaxPutBytesFor(t *testing.T) { // Vectord LHV1 persistence gets the larger cap so single-file // indexes above ~150K vectors at d=768 (the 500K staffing test // gap) can survive a Save without 413. Default cap stays at // 256 MiB for everything else (parquets, manifests, etc.). cases := []struct { name string key string want int64 }{ {"vectord LHV1 file", "_vectors/workers_500k.lhv1", maxPutBytesVectors}, {"vectord nested", "_vectors/some/nested/index.lhv1", maxPutBytesVectors}, {"parquet under datasets/", "datasets/workers/abc.parquet", maxPutBytesDefault}, {"catalog manifest", "_catalog/manifests/foo.bin", maxPutBytesDefault}, {"plain key", "x.txt", maxPutBytesDefault}, {"empty key", "", maxPutBytesDefault}, {"key with vectors-ish substring (not prefix)", "datasets/_vectors/x", maxPutBytesDefault}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { got := maxPutBytesFor(tc.key) if got != tc.want { t.Errorf("maxPutBytesFor(%q) = %d, want %d", tc.key, got, tc.want) } }) } } // TestVectorPrefixSyncWithVectord locks the prefix value to vectord's // VectorPrefix constant. If a future refactor renames either side, // this test surfaces the drift before vectord saves silently bypass // the larger cap. func TestVectorPrefixSyncWithVectord(t *testing.T) { if vectorsPrefix != vectord.VectorPrefix { t.Errorf("storaged vectorsPrefix=%q out of sync with vectord.VectorPrefix=%q", vectorsPrefix, vectord.VectorPrefix) } } // TestVectorCapAccommodates500KStaffingTest reads the documented gap // (memory project_golang_lakehouse.md): "storaged 256 MiB PUT cap // blocks single-file LHV1 persistence above ~150K vectors at d=768." // 500K vectors at d=768 with HNSW graph overhead is approximately // 4.5 GiB resident, ~700 MiB on disk after compression. // 4 GiB cap covers more than the documented production workload. func TestVectorCapAccommodates500KStaffingTest(t *testing.T) { const fiveHundredKVectorsAtD768Disk = 700 << 20 // ~700 MiB conservative estimate if maxPutBytesVectors < fiveHundredKVectorsAtD768Disk { t.Errorf("maxPutBytesVectors=%d (%.0f MiB) below documented production "+ "workload of ~700 MiB for 500K vectors at d=768", maxPutBytesVectors, float64(maxPutBytesVectors)/(1<<20)) } }