golangLAKEHOUSE/cmd/storaged/main_test.go

package main

import (
	"testing"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/vectord"
)

func TestValidateKey(t *testing.T) {
	cases := []struct {
		name    string
		key     string
		wantErr bool
	}{
		{"happy path", "data/workers_500k.parquet", false},
		{"deeply nested", "a/b/c/d/e/f/g.txt", false},
		{"empty", "", true},
		{"too long", string(make([]byte, 1025)), true},
		{"NUL byte", "foo\x00bar", true},
		{"leading slash", "/foo/bar", true},
		{"dotdot component", "data/../etc/passwd", true},
		{"dotdot at end", "data/..", true},
		{"dotdot at start", "../etc", true},
		{"single dot is fine", "data/./x.txt", false},
		{"dotdot embedded is fine", "data/.. /x.txt", false}, // not a path component
		{"newline", "foo\nbar", true},
		{"carriage return", "foo\rbar", true},
		{"tab", "foo\tbar", true},
		{"unicode is fine", "résumé/data.txt", false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := validateKey(tc.key)
			if tc.wantErr && err == nil {
				t.Errorf("expected error for %q, got nil", tc.key)
			}
			if !tc.wantErr && err != nil {
				t.Errorf("expected ok for %q, got %v", tc.key, err)
			}
		})
	}
}

func TestMaxPutBytesFor(t *testing.T) {
	// Vectord LHV1 persistence gets the larger cap so single-file
	// indexes above ~150K vectors at d=768 (the 500K staffing test
	// gap) can survive a Save without 413. Default cap stays at
	// 256 MiB for everything else (parquets, manifests, etc.).
	cases := []struct {
		name string
		key  string
		want int64
	}{
		{"vectord LHV1 file", "_vectors/workers_500k.lhv1", maxPutBytesVectors},
		{"vectord nested", "_vectors/some/nested/index.lhv1", maxPutBytesVectors},
		{"parquet under datasets/", "datasets/workers/abc.parquet", maxPutBytesDefault},
		{"catalog manifest", "_catalog/manifests/foo.bin", maxPutBytesDefault},
		{"plain key", "x.txt", maxPutBytesDefault},
		{"empty key", "", maxPutBytesDefault},
		{"key with vectors-ish substring (not prefix)", "datasets/_vectors/x", maxPutBytesDefault},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := maxPutBytesFor(tc.key)
			if got != tc.want {
				t.Errorf("maxPutBytesFor(%q) = %d, want %d", tc.key, got, tc.want)
			}
		})
	}
}

// TestVectorPrefixSyncWithVectord locks the prefix value to vectord's
// VectorPrefix constant. If a future refactor renames either side,
// this test surfaces the drift before vectord saves silently bypass
// the larger cap.
func TestVectorPrefixSyncWithVectord(t *testing.T) {
	if vectorsPrefix != vectord.VectorPrefix {
		t.Errorf("storaged vectorsPrefix=%q out of sync with vectord.VectorPrefix=%q",
			vectorsPrefix, vectord.VectorPrefix)
	}
}

// TestVectorCapAccommodates500KStaffingTest reads the documented gap
// (memory project_golang_lakehouse.md): "storaged 256 MiB PUT cap
// blocks single-file LHV1 persistence above ~150K vectors at d=768."
// 500K vectors at d=768 with HNSW graph overhead is approximately
// 4.5 GiB resident, ~700 MiB on disk after compression.
// 4 GiB cap covers more than the documented production workload.
func TestVectorCapAccommodates500KStaffingTest(t *testing.T) {
	const fiveHundredKVectorsAtD768Disk = 700 << 20 // ~700 MiB conservative estimate
	if maxPutBytesVectors < fiveHundredKVectorsAtD768Disk {
		t.Errorf("maxPutBytesVectors=%d (%.0f MiB) below documented production "+
			"workload of ~700 MiB for 500K vectors at d=768",
			maxPutBytesVectors, float64(maxPutBytesVectors)/(1<<20))
	}
}