From 9ee7fc55507cdf11a6ab6df99f3e132527055615 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 29 Apr 2026 01:42:27 -0500
Subject: [PATCH] =?UTF-8?q?G2:=20embedd=20=E2=80=94=20text=20=E2=86=92=20v?=
 =?UTF-8?q?ector=20via=20Ollama=20=C2=B7=202=20scrum=20fixes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bridges the missing piece of the staffing co-pilot: turning text
inputs into vectord-shaped vectors. Standalone cmd/embedd on :3216,
fronted by the gateway at /v1/embed. Pluggable embed.Provider
interface (G2 ships Ollama; OpenAI/Voyage swap in via the same
interface in G3+).

Wire format:

  POST /v1/embed {"texts":[...], "model":"..."}   // model optional
  → 200 {"model","dimension","vectors":[[...]]}

Default model: nomic-embed-text (768-d). Ollama returns float64;
the provider converts to float32 at the boundary so vectors flow
through vectord/HNSW without re-conversion.

Acceptance smoke 5/5 PASS — including the architectural payoff:
end-to-end embed → vectord add → search by re-embedded text returns
recall=1 at distance 5.96e-8 (float32 precision noise on identical
unit vectors). The staffing co-pilot pipeline (text → vector →
similarity search) is now functional end-to-end. All 9 smokes
(D1-D6 + G1 + G1P + G2) PASS deterministically.

Cross-lineage scrum on shipped code:
- Opus 4.7 (opencode): 0 BLOCK + 4 WARN + 3 INFO
- Kimi K2-0905 (openrouter): 0 BLOCK + 2 WARN + 1 INFO
- Qwen3-coder (openrouter): "No BLOCKs" (3 tokens)

Fixed (2 — 1 convergent + 1 single-reviewer):

C1 (Opus + Kimi convergent WARN): the per-text 60s timeout times an
N-text batch allowed up to N×60s with no batch-level cap, so a run
of slow Ollama calls could stall the whole handler for the full
N×60s. Fix: context.WithTimeout(r.Context(), 60s) wraps the entire
batch.

O-W3 (Opus WARN): empty strings in texts went to Ollama unchecked,
producing version-dependent garbage. Fix: reject "" with 400 at the
handler boundary so callers get a deterministic answer instead of
an upstream-conditional 502.

Deferred (4): drainAndClose 64KiB cap (matches G0 pattern), no
concurrency limit on /embed (single-tenant G2), missing Accept
header (exotic-proxy concern), MaxBytesError string-match
redundancy (paranoia layer kept consistent across codebase).

Zero false positives this round — Qwen returned 3 tokens
("No BLOCKs") and the other two reviewers' findings were all real.

Setup confirmed: Ollama 0.21.0 on :11434 with nomic-embed-text
loaded. The per-text /api/embeddings endpoint is used; the newer
batched /api/embed endpoint can swap in via the Provider interface.
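Example exchange (illustrative; assumes the smoke-test topology of
the gateway on :3110 and Ollama on :11434 with nomic-embed-text
pulled):

  curl -sS -X POST http://127.0.0.1:3110/v1/embed \
    -H 'Content-Type: application/json' \
    -d '{"texts":["forklift operator with OSHA-30"]}'
  → 200 {"model":"nomic-embed-text","dimension":768,"vectors":[[...]]}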

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 cmd/embedd/main.go            | 140 ++++++++++++++++++++++++++++++
 cmd/gateway/main.go           |   5 ++
 internal/embed/embed.go       |  43 +++++++++
 internal/embed/ollama.go      | 127 ++++++++++++++++++++++++++
 internal/embed/ollama_test.go | 115 ++++++++++++++++++++++++
 internal/shared/config.go     |  24 ++++-
 lakehouse.toml                |   8 ++
 scripts/g2_smoke.sh           | 165 ++++++++++++++++++++++++++++++++++
 8 files changed, 624 insertions(+), 3 deletions(-)
 create mode 100644 cmd/embedd/main.go
 create mode 100644 internal/embed/embed.go
 create mode 100644 internal/embed/ollama.go
 create mode 100644 internal/embed/ollama_test.go
 create mode 100755 scripts/g2_smoke.sh

diff --git a/cmd/embedd/main.go b/cmd/embedd/main.go
new file mode 100644
index 0000000..85c3ed8
--- /dev/null
+++ b/cmd/embedd/main.go
@@ -0,0 +1,140 @@
+// embedd is the embedding service. Turns text into vectors via a
+// pluggable Provider (G2: Ollama at :11434). Vectors flow through
+// the rest of the stack as float32 — see internal/embed for the
+// boundary conversion. Default model is config-resolved; callers
+// can override per request.
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"flag"
+	"log/slog"
+	"net/http"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"git.agentview.dev/profit/golangLAKEHOUSE/internal/embed"
+	"git.agentview.dev/profit/golangLAKEHOUSE/internal/shared"
+)
+
+const (
+	maxRequestBytes = 4 << 20          // 4 MiB cap on /embed body — texts plural
+	batchDeadline   = 60 * time.Second // upper bound on a single /embed batch
+)
+
+func main() {
+	configPath := flag.String("config", "lakehouse.toml", "path to TOML config")
+	flag.Parse()
+
+	cfg, err := shared.LoadConfig(*configPath)
+	if err != nil {
+		slog.Error("config", "err", err)
+		os.Exit(1)
+	}
+	if cfg.Embedd.ProviderURL == "" {
+		slog.Error("config", "err", "embedd.provider_url is required")
+		os.Exit(1)
+	}
+
+	h := &handlers{
+		provider: embed.NewOllama(cfg.Embedd.ProviderURL, cfg.Embedd.DefaultModel),
+	}
+
+	if err := shared.Run("embedd", cfg.Embedd.Bind, h.register); err != nil {
+		slog.Error("server", "err", err)
+		os.Exit(1)
+	}
+}
+
+type handlers struct {
+	provider embed.Provider
+}
+
+func (h *handlers) register(r chi.Router) {
+	r.Post("/embed", h.handleEmbed)
+}
+
+// embedRequest is the POST /embed body. Texts is the list to
+// embed; Model is optional (empty → use server default).
+type embedRequest struct {
+	Texts []string `json:"texts"`
+	Model string   `json:"model,omitempty"`
+}
+
+func (h *handlers) handleEmbed(w http.ResponseWriter, r *http.Request) {
+	// Wrap first so the deferred Close applies to the capped body.
+	r.Body = http.MaxBytesReader(w, r.Body, maxRequestBytes)
+	defer r.Body.Close()
+	var req embedRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		var maxErr *http.MaxBytesError
+		if errors.As(err, &maxErr) || strings.Contains(err.Error(), "http: request body too large") {
+			http.Error(w, "body too large", http.StatusRequestEntityTooLarge)
+			return
+		}
+		http.Error(w, "decode body: "+err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	// Per scrum O-W3 (Opus): reject empty strings up front. Ollama's
+	// behavior on an empty prompt is version-dependent (some return
+	// errors, some return zero vectors); rejecting at the boundary
+	// gives callers a deterministic 400 instead of a 502.
+	for j, t := range req.Texts {
+		if t == "" {
+			http.Error(w, "texts["+itoa(j)+"]: empty string", http.StatusBadRequest)
+			return
+		}
+	}
+
+	// Per scrum C1 (Opus + Kimi convergent): a per-text 60s timeout
+	// without a batch-level cap means a 100-text batch of slow calls
+	// can pin the handler for up to ~6000s. Set a hard batch ceiling
+	// derived from the request ctx so a wedged Ollama surfaces as a
+	// deadline error (mapped to 502 by the upstream-error path
+	// below) rather than holding the connection forever.
+	ctx, cancel := context.WithTimeout(r.Context(), batchDeadline)
+	defer cancel()
+
+	res, err := h.provider.Embed(ctx, req.Texts, req.Model)
+	if errors.Is(err, embed.ErrEmptyTexts) {
+		http.Error(w, err.Error(), http.StatusBadRequest)
+		return
+	}
+	if errors.Is(err, embed.ErrModelMismatch) {
+		http.Error(w, err.Error(), http.StatusBadGateway)
+		return
+	}
+	if err != nil {
+		// Upstream-shape errors (Ollama down, model missing,
+		// 5xx body) bubble up as 502 — distinguishes "your input
+		// was wrong" (400) from "the embedding backend was wrong" (502).
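+		// (A batch-deadline hit from above lands here too: the ctx
+		// timeout comes back as a wrapped context error rather than
+		// a typed sentinel, so a wedged backend also reads as 502.)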
+		slog.Warn("embed", "err", err)
+		http.Error(w, "embed: "+err.Error(), http.StatusBadGateway)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(w).Encode(res); err != nil {
+		slog.Warn("embed encode", "err", err)
+	}
+}
+
+// itoa is a tiny helper for error messages without pulling strconv
+// in just for one call site.
+func itoa(i int) string {
+	if i == 0 {
+		return "0"
+	}
+	var buf [20]byte
+	pos := len(buf)
+	for i > 0 {
+		pos--
+		buf[pos] = byte('0' + i%10)
+		i /= 10
+	}
+	return string(buf[pos:])
+}
diff --git a/cmd/gateway/main.go b/cmd/gateway/main.go
index 4c93840..c3aea8c 100644
--- a/cmd/gateway/main.go
+++ b/cmd/gateway/main.go
@@ -43,6 +43,7 @@ func main() {
 		"ingestd_url": cfg.Gateway.IngestdURL,
 		"queryd_url":  cfg.Gateway.QuerydURL,
 		"vectord_url": cfg.Gateway.VectordURL,
+		"embedd_url":  cfg.Gateway.EmbeddURL,
 	}
 	for k, v := range upstreams {
 		if v == "" {
@@ -61,12 +62,14 @@ func main() {
 	ingestdURL := mustParseUpstream("ingestd_url", cfg.Gateway.IngestdURL)
 	querydURL := mustParseUpstream("queryd_url", cfg.Gateway.QuerydURL)
 	vectordURL := mustParseUpstream("vectord_url", cfg.Gateway.VectordURL)
+	embeddURL := mustParseUpstream("embedd_url", cfg.Gateway.EmbeddURL)
 
 	storagedProxy := gateway.NewProxyHandler(storagedURL)
 	catalogdProxy := gateway.NewProxyHandler(catalogdURL)
 	ingestdProxy := gateway.NewProxyHandler(ingestdURL)
 	querydProxy := gateway.NewProxyHandler(querydURL)
 	vectordProxy := gateway.NewProxyHandler(vectordURL)
+	embeddProxy := gateway.NewProxyHandler(embeddURL)
 
 	if err := shared.Run("gateway", cfg.Gateway.Bind, func(r chi.Router) {
 		// Storage / catalog have multi-segment paths under their
@@ -82,6 +85,8 @@ func main() {
 		r.Handle("/v1/sql", querydProxy)
 		// Vector search routes — /v1/vectors/index, /v1/vectors/index/{name}/...
 		r.Handle("/v1/vectors/*", vectordProxy)
+		// Embedding service — /v1/embed
+		r.Handle("/v1/embed", embeddProxy)
 	}); err != nil {
 		slog.Error("server", "err", err)
 		os.Exit(1)
diff --git a/internal/embed/embed.go b/internal/embed/embed.go
new file mode 100644
index 0000000..ebbbd1d
--- /dev/null
+++ b/internal/embed/embed.go
@@ -0,0 +1,43 @@
+// Package embed turns text into vectors. G2 ships an Ollama-backed
+// Provider (local model on :11434, no network call to a hosted
+// service); the Provider interface keeps the door open for OpenAI /
+// Voyage / etc. without per-call switching at the HTTP boundary.
+//
+// Vectors are float32 throughout the system (matches vectord +
+// coder/hnsw types). Ollama returns float64; we convert at this
+// package's boundary so callers don't have to think about it.
+package embed
+
+import (
+	"context"
+	"errors"
+)
+
+// Provider is the embed boundary. Embed takes a list of input
+// texts and returns the same number of vectors, in input order,
+// using the named model. Empty model = use the Provider's default.
+type Provider interface {
+	Embed(ctx context.Context, texts []string, model string) (Result, error)
+}
+
+// Result is the per-call response shape. Model echoes which model
+// was actually used (after default-resolution); Dimension is the
+// vector dimension reported by the model. The Provider guarantees
+// every Vector in Vectors has Dimension components.
+type Result struct {
+	Model     string      `json:"model"`
+	Dimension int         `json:"dimension"`
+	Vectors   [][]float32 `json:"vectors"`
+}
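+
+// A minimal usage sketch (illustrative only; assumes a reachable
+// local Ollama with the default model pulled):
+//
+//	p := NewOllama("http://localhost:11434", "nomic-embed-text")
+//	res, err := p.Embed(context.Background(), []string{"forklift operator"}, "")
+//	// err == nil → res.Dimension == 768 and res.Vectors[0] is a
+//	// []float32 ready for vectord/HNSW.
+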
+// ErrEmptyTexts is returned when the caller passed no inputs.
+// Empty-batch is an error because returning an empty result
+// silently wastes the round trip and hides caller bugs.
+var ErrEmptyTexts = errors.New("embed: empty texts")
+
+// ErrModelMismatch is returned when one call to the upstream
+// returns a different vector dimension than the previous call in
+// the same batch — only possible if the model actually switched
+// mid-batch (load swap on the server). Surfaced as a 502 by
+// callers because the upstream's behavior was inconsistent.
+var ErrModelMismatch = errors.New("embed: dimension changed mid-batch")
diff --git a/internal/embed/ollama.go b/internal/embed/ollama.go
new file mode 100644
index 0000000..26adb4f
--- /dev/null
+++ b/internal/embed/ollama.go
@@ -0,0 +1,127 @@
+// ollama.go — Provider backed by an Ollama HTTP server. Uses the
+// per-text /api/embeddings endpoint, which older and current
+// Ollama releases both serve. Newer releases also expose a batched
+// /api/embed endpoint; it can swap in behind the same Provider
+// interface.
+package embed
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// OllamaProvider hits an Ollama server at the configured base URL.
+type OllamaProvider struct {
+	baseURL      string
+	defaultModel string
+	hc           *http.Client
+}
+
+// NewOllama builds a provider against baseURL (e.g.
+// "http://localhost:11434"). defaultModel is what gets used when
+// callers pass an empty model name.
+func NewOllama(baseURL, defaultModel string) *OllamaProvider {
+	return &OllamaProvider{
+		baseURL:      strings.TrimRight(baseURL, "/"),
+		defaultModel: defaultModel,
+		hc: &http.Client{
+			// Embeddings are CPU-bound on the server side; 60s gives
+			// plenty of headroom for a single-text call. Caller can
+			// add an outer ctx deadline for a batch-level cap.
+			Timeout: 60 * time.Second,
+		},
+	}
+}
+
+// ollamaRequest is Ollama's /api/embeddings body shape.
+type ollamaRequest struct {
+	Model  string `json:"model"`
+	Prompt string `json:"prompt"`
+}
+
+// ollamaResponse mirrors the success body. Embedding is float64
+// from Ollama; we convert to float32 at the boundary.
+type ollamaResponse struct {
+	Embedding []float64 `json:"embedding"`
+}
+
+// Embed loops over texts, issuing one HTTP call per text. Errors
+// short-circuit — if call N fails, we return the error and the
+// caller sees no partial Result.
+func (p *OllamaProvider) Embed(ctx context.Context, texts []string, model string) (Result, error) {
+	if len(texts) == 0 {
+		return Result{}, ErrEmptyTexts
+	}
+	if model == "" {
+		model = p.defaultModel
+	}
+	if model == "" {
+		return Result{}, fmt.Errorf("embed: no model (empty request, no default)")
+	}
+
+	out := Result{Model: model, Vectors: make([][]float32, 0, len(texts))}
+	for i, text := range texts {
+		vec, err := p.embedOne(ctx, model, text)
+		if err != nil {
+			return Result{}, fmt.Errorf("embed text[%d]: %w", i, err)
+		}
+		// Per-text vectors must agree on dimension. Lock on first.
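+		// (Example: if text[0] comes back 768-d, a later 512-d reply
+		// means the server swapped models mid-batch and is rejected
+		// as ErrModelMismatch; the dimensions here are illustrative.)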
+		if out.Dimension == 0 {
+			out.Dimension = len(vec)
+		} else if len(vec) != out.Dimension {
+			return Result{}, fmt.Errorf("%w: text[%d] returned %d, prior were %d",
+				ErrModelMismatch, i, len(vec), out.Dimension)
+		}
+		out.Vectors = append(out.Vectors, vec)
+	}
+	return out, nil
+}
+
+func (p *OllamaProvider) embedOne(ctx context.Context, model, text string) ([]float32, error) {
+	body, err := json.Marshal(ollamaRequest{Model: model, Prompt: text})
+	if err != nil {
+		return nil, fmt.Errorf("marshal: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
+		p.baseURL+"/api/embeddings", bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("req: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.ContentLength = int64(len(body))
+
+	resp, err := p.hc.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("do: %w", err)
+	}
+	defer drainAndClose(resp.Body)
+
+	if resp.StatusCode != http.StatusOK {
+		preview, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
+		return nil, fmt.Errorf("upstream status %d: %s", resp.StatusCode, string(preview))
+	}
+	var or ollamaResponse
+	if err := json.NewDecoder(resp.Body).Decode(&or); err != nil {
+		return nil, fmt.Errorf("decode: %w", err)
+	}
+	if len(or.Embedding) == 0 {
+		return nil, fmt.Errorf("upstream returned empty embedding")
+	}
+	// Float64 → float32. Loss of precision is acceptable for HNSW
+	// search; float32 matches the rest of the system.
+	out := make([]float32, len(or.Embedding))
+	for i, v := range or.Embedding {
+		out[i] = float32(v)
+	}
+	return out, nil
+}
+
+func drainAndClose(body io.ReadCloser) {
+	_, _ = io.Copy(io.Discard, io.LimitReader(body, 64<<10))
+	_ = body.Close()
+}
diff --git a/internal/embed/ollama_test.go b/internal/embed/ollama_test.go
new file mode 100644
index 0000000..a417fa9
--- /dev/null
+++ b/internal/embed/ollama_test.go
@@ -0,0 +1,115 @@
+package embed
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"testing"
+)
+
+func TestOllama_EmbedBatch_PreservesOrder(t *testing.T) {
+	var mu sync.Mutex
+	var seenPrompts []string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		var req ollamaRequest
+		_ = json.NewDecoder(r.Body).Decode(&req)
+		mu.Lock()
+		seenPrompts = append(seenPrompts, req.Prompt)
+		mu.Unlock()
+		// Return a vector that encodes which prompt this was, so
+		// we can assert order at the caller. 4-d vector for cheap.
+		var vec [4]float64
+		switch req.Prompt {
+		case "alpha":
+			vec = [4]float64{1, 0, 0, 0}
+		case "beta":
+			vec = [4]float64{0, 1, 0, 0}
+		case "gamma":
+			vec = [4]float64{0, 0, 1, 0}
+		}
+		_ = json.NewEncoder(w).Encode(map[string]any{"embedding": vec[:]})
+	}))
+	defer srv.Close()
+
+	p := NewOllama(srv.URL, "test-model")
+	res, err := p.Embed(context.Background(), []string{"alpha", "beta", "gamma"}, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if res.Model != "test-model" || res.Dimension != 4 || len(res.Vectors) != 3 {
+		t.Fatalf("Result: got %+v", res)
+	}
+	if res.Vectors[0][0] != 1 || res.Vectors[1][1] != 1 || res.Vectors[2][2] != 1 {
+		t.Errorf("vectors out of order: %v", res.Vectors)
+	}
+	// Sanity: all three prompts hit the server.
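+	// (Embed issues its calls serially, so seenPrompts also mirrors
+	// the input order; the count below is the minimal, flake-free
+	// assertion.)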
+	if len(seenPrompts) != 3 {
+		t.Errorf("expected 3 upstream calls, got %d", len(seenPrompts))
+	}
+}
+
+func TestOllama_EmptyTextsErrors(t *testing.T) {
+	p := NewOllama("http://nope:0", "x")
+	_, err := p.Embed(context.Background(), nil, "")
+	if !errors.Is(err, ErrEmptyTexts) {
+		t.Errorf("expected ErrEmptyTexts, got %v", err)
+	}
+}
+
+func TestOllama_NoModelNoDefault(t *testing.T) {
+	p := NewOllama("http://nope:0", "") // empty default
+	_, err := p.Embed(context.Background(), []string{"hi"}, "")
+	if err == nil || !strings.Contains(err.Error(), "no model") {
+		t.Errorf("expected no-model error, got %v", err)
+	}
+}
+
+func TestOllama_UpstreamErrorPropagates(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "model not loaded", http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	p := NewOllama(srv.URL, "x")
+	_, err := p.Embed(context.Background(), []string{"hi"}, "")
+	if err == nil || !strings.Contains(err.Error(), "upstream status 500") {
+		t.Errorf("expected wrapped 500 error, got %v", err)
+	}
+}
+
+func TestOllama_DimensionMismatchMidBatch(t *testing.T) {
+	calls := 0
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		calls++
+		// First call returns 4-d, second returns 8-d → server changed
+		// model under us. Provider should ErrModelMismatch.
+		var v []float64
+		if calls == 1 {
+			v = []float64{1, 0, 0, 0}
+		} else {
+			v = []float64{1, 0, 0, 0, 0, 0, 0, 0}
+		}
+		_ = json.NewEncoder(w).Encode(map[string]any{"embedding": v})
+	}))
+	defer srv.Close()
+	p := NewOllama(srv.URL, "x")
+	_, err := p.Embed(context.Background(), []string{"a", "b"}, "")
+	if !errors.Is(err, ErrModelMismatch) {
+		t.Errorf("expected ErrModelMismatch, got %v", err)
+	}
+}
+
+func TestOllama_EmptyEmbeddingErrors(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]any{"embedding": []float64{}})
+	}))
+	defer srv.Close()
+	p := NewOllama(srv.URL, "x")
+	_, err := p.Embed(context.Background(), []string{"hi"}, "")
+	if err == nil || !strings.Contains(err.Error(), "empty embedding") {
+		t.Errorf("expected empty-embedding error, got %v", err)
+	}
+}
diff --git a/internal/shared/config.go b/internal/shared/config.go
index 5e298a3..8947187 100644
--- a/internal/shared/config.go
+++ b/internal/shared/config.go
@@ -25,6 +25,7 @@ type Config struct {
 	Ingestd  IngestConfig  `toml:"ingestd"`
 	Queryd   QuerydConfig  `toml:"queryd"`
 	Vectord  VectordConfig `toml:"vectord"`
+	Embedd   EmbeddConfig  `toml:"embedd"`
 	S3       S3Config      `toml:"s3"`
 	Log      LogConfig     `toml:"log"`
 }
@@ -48,9 +49,9 @@ type IngestConfig struct {
 
 // GatewayConfig adds the upstream URLs the reverse proxy fronts.
 // Each route family (/v1/storage, /v1/catalog, /v1/ingest, /v1/sql,
-// /v1/vectors) has its own upstream so we can scale services
-// independently or move them to different boxes without touching
-// gateway code.
+// /v1/vectors, /v1/embed) has its own upstream so we can scale
+// services independently or move them to different boxes without
+// touching gateway code.
 type GatewayConfig struct {
 	Bind         string `toml:"bind"`
 	StoragedURL  string `toml:"storaged_url"`
@@ -58,6 +59,17 @@ type GatewayConfig struct {
 	IngestdURL   string `toml:"ingestd_url"`
 	QuerydURL    string `toml:"queryd_url"`
 	VectordURL   string `toml:"vectord_url"`
+	EmbeddURL    string `toml:"embedd_url"`
+}
+
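+// (An example [embedd] TOML block for this struct ships in
+// lakehouse.toml, further down this patch.)
+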
+// EmbeddConfig drives the embed service. ProviderURL points at the
+// embedding backend (Ollama in G2, possibly OpenAI/Voyage in G3+).
+// DefaultModel is what gets used when callers don't specify a
+// model in their request body.
+type EmbeddConfig struct {
+	Bind         string `toml:"bind"`
+	ProviderURL  string `toml:"provider_url"`
+	DefaultModel string `toml:"default_model"`
 }
 
 // VectordConfig adds vectord-specific knobs. StoragedURL is
@@ -123,6 +135,7 @@ func DefaultConfig() Config {
 			IngestdURL:  "http://127.0.0.1:3213",
 			QuerydURL:   "http://127.0.0.1:3214",
 			VectordURL:  "http://127.0.0.1:3215",
+			EmbeddURL:   "http://127.0.0.1:3216",
 		},
 		Storaged: ServiceConfig{Bind: "127.0.0.1:3211"},
 		Catalogd: CatalogConfig{Bind: "127.0.0.1:3212", StoragedURL: "http://127.0.0.1:3211"},
@@ -136,6 +149,11 @@ func DefaultConfig() Config {
 			Bind:        "127.0.0.1:3215",
 			StoragedURL: "http://127.0.0.1:3211",
 		},
+		Embedd: EmbeddConfig{
+			Bind:         "127.0.0.1:3216",
+			ProviderURL:  "http://localhost:11434", // local Ollama
+			DefaultModel: "nomic-embed-text",
+		},
 		Queryd: QuerydConfig{
 			Bind:        "127.0.0.1:3214",
 			CatalogdURL: "http://127.0.0.1:3212",
diff --git a/lakehouse.toml b/lakehouse.toml
index a6ce460..685cfa0 100644
--- a/lakehouse.toml
+++ b/lakehouse.toml
@@ -11,6 +11,7 @@ catalogd_url = "http://127.0.0.1:3212"
 ingestd_url = "http://127.0.0.1:3213"
 queryd_url = "http://127.0.0.1:3214"
 vectord_url = "http://127.0.0.1:3215"
+embedd_url = "http://127.0.0.1:3216"
 
 [storaged]
 bind = "127.0.0.1:3211"
@@ -33,6 +34,13 @@ bind = "127.0.0.1:3215"
 # Optional — set to empty string to disable persistence (dev/test).
 storaged_url = "http://127.0.0.1:3211"
 
+[embedd]
+bind = "127.0.0.1:3216"
+# G2: Ollama local. G3+ may swap in OpenAI/Voyage by changing
+# this URL + the wire format inside the provider.
+provider_url = "http://localhost:11434"
+default_model = "nomic-embed-text"
+
 [queryd]
 bind = "127.0.0.1:3214"
 catalogd_url = "http://127.0.0.1:3212"
diff --git a/scripts/g2_smoke.sh b/scripts/g2_smoke.sh
new file mode 100755
index 0000000..cc28ab6
--- /dev/null
+++ b/scripts/g2_smoke.sh
@@ -0,0 +1,165 @@
+#!/usr/bin/env bash
+# G2 smoke — embedd service. All assertions go through gateway :3110.
+#
+# Validates:
+#   - POST /v1/embed with 2 texts → 200, dim=768 (nomic-embed-text),
+#     vectors[0] != vectors[1] (different texts → different vectors)
+#   - Same text twice → byte-identical vectors (deterministic)
+#   - Empty texts → 400
+#   - Bad model → 502 from upstream Ollama
+#   - End-to-end with vectord: embed text → store → search by text →
+#     same text round-trips at distance ≈ 0 (proves embed→vectord
+#     pipeline works)
+#
+# Requires Ollama running at :11434 with nomic-embed-text loaded.
+#
+# Usage: ./scripts/g2_smoke.sh
+
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+export PATH="$PATH:/usr/local/go/bin"
+
+echo "[g2-smoke] building embedd + vectord + gateway..."
+go build -o bin/ ./cmd/embedd ./cmd/vectord ./cmd/gateway
+
+pkill -f "bin/(embedd|vectord|gateway)" 2>/dev/null || true
+sleep 0.3
+
+PIDS=()
+TMP="$(mktemp -d)"
+cleanup() {
+  echo "[g2-smoke] cleanup"
+  for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done
+  rm -rf "$TMP"
+}
+trap cleanup EXIT INT TERM
+
+poll_health() {
+  local port="$1" deadline=$(($(date +%s) + 5))
+  while [ "$(date +%s)" -lt "$deadline" ]; do
+    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then return 0; fi
+    sleep 0.05
+  done
+  return 1
+}
+
+# Verify Ollama is up before the test even starts — otherwise every
+# embed call would 502 and the smoke would be misleading.
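+# (If Ollama is up but the model is missing, embed calls will 502
+# instead; a one-time `ollama pull nomic-embed-text` is the usual
+# fix.)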
+if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then
+  echo "[g2-smoke] Ollama not reachable on :11434 — skipping"
+  exit 0
+fi
+
+echo "[g2-smoke] launching embedd → vectord (no persist) → gateway..."
+./bin/embedd > /tmp/embedd.log 2>&1 &
+PIDS+=($!)
+poll_health 3216 || { echo "embedd failed"; tail /tmp/embedd.log; exit 1; }
+
+# vectord with persistence disabled (matches g1_smoke pattern —
+# this smoke doesn't touch storaged).
+./bin/vectord -config scripts/g1_smoke.toml > /tmp/vectord.log 2>&1 &
+PIDS+=($!)
+poll_health 3215 || { echo "vectord failed"; tail /tmp/vectord.log; exit 1; }
+
+./bin/gateway > /tmp/gateway.log 2>&1 &
+PIDS+=($!)
+poll_health 3110 || { echo "gateway failed"; tail /tmp/gateway.log; exit 1; }
+
+FAILED=0
+
+echo "[g2-smoke] /v1/embed — two distinct texts:"
+RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' \
+  -d '{"texts":["forklift operator with OSHA-30","CNC machinist precision parts"]}')"
+DIM="$(echo "$RESP" | jq -r '.dimension')"
+N="$(echo "$RESP" | jq -r '.vectors | length')"
+MODEL="$(echo "$RESP" | jq -r '.model')"
+SAME="$(echo "$RESP" | jq -r '.vectors[0][0] == .vectors[1][0]')"
+if [ "$DIM" = "768" ] && [ "$N" = "2" ] && [ "$MODEL" = "nomic-embed-text" ] && [ "$SAME" = "false" ]; then
+  echo "  ✓ dim=768, model=nomic-embed-text, 2 distinct vectors"
+else
+  echo "  ✗ resp: dim=$DIM n=$N model=$MODEL same=$SAME"; FAILED=1
+fi
+
+echo "[g2-smoke] determinism — same text twice → byte-identical vector:"
+RESP1="$(curl -sS -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' \
+  -d '{"texts":["determinism check"]}' | jq -c '.vectors[0]')"
+RESP2="$(curl -sS -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' \
+  -d '{"texts":["determinism check"]}' | jq -c '.vectors[0]')"
+if [ "$RESP1" = "$RESP2" ]; then
+  echo "  ✓ identical text → identical vector"
+else
+  echo "  ✗ deterministic mismatch"; FAILED=1
+fi
+
+echo "[g2-smoke] empty texts → 400:"
+HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' -d '{"texts":[]}')"
+if [ "$HTTP" = "400" ]; then echo "  ✓ empty → 400"; else echo "  ✗ empty → $HTTP"; FAILED=1; fi
+
+echo "[g2-smoke] bad model → 502:"
+HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' -d '{"texts":["x"],"model":"definitely-not-loaded"}')"
+if [ "$HTTP" = "502" ]; then echo "  ✓ unknown model → 502"; else echo "  ✗ unknown → $HTTP"; FAILED=1; fi
+
+echo "[g2-smoke] end-to-end: embed → vectord add → search by embed → recall:"
+NAME="g2_demo"
+# Create index. Default M/EfSearch; cosine distance.
+curl -sS -o /dev/null -X POST http://127.0.0.1:3110/v1/vectors/index \
+  -H 'Content-Type: application/json' \
+  -d "{\"name\":\"$NAME\",\"dimension\":768,\"distance\":\"cosine\"}"
+
+# Embed a few staffing-ish texts and add them.
+TEXTS='["forklift operator with OSHA-30","CNC machinist precision parts","warehouse picker night shift","dental hygienist 3 years experience"]'
+EMBEDS="$(curl -sS -X POST http://127.0.0.1:3110/v1/embed \
+  -H 'Content-Type: application/json' \
+  -d "{\"texts\":$TEXTS}")"
+# Build the add payload — id-i + vector from embeds[i].
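+# (Each item is {"id":"w-<i>","vector":[...],"metadata":{"text":"..."}},
+# the shape the vectord add endpoint accepts.)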
+python3 - "$EMBEDS" <<'EOF' > "$TMP/add.json" +import json, sys +embeds = json.loads(sys.argv[1]) +items = [ + {"id": f"w-{i}", "vector": v, "metadata": {"text": t}} + for i, (v, t) in enumerate(zip(embeds["vectors"], [ + "forklift operator with OSHA-30", + "CNC machinist precision parts", + "warehouse picker night shift", + "dental hygienist 3 years experience", + ])) +] +print(json.dumps({"items": items})) +EOF +curl -sS -o /dev/null -X POST "http://127.0.0.1:3110/v1/vectors/index/$NAME/add" \ + -H 'Content-Type: application/json' -d @"$TMP/add.json" + +# Search by embedding the FIRST text again — should retrieve w-0 at dist≈0 +QUERY_VEC="$(curl -sS -X POST http://127.0.0.1:3110/v1/embed \ + -H 'Content-Type: application/json' \ + -d '{"texts":["forklift operator with OSHA-30"]}' | jq -c '.vectors[0]')" +SEARCH="$(curl -sS -X POST "http://127.0.0.1:3110/v1/vectors/index/$NAME/search" \ + -H 'Content-Type: application/json' \ + -d "{\"vector\":$QUERY_VEC,\"k\":3}")" +TOP_ID="$(echo "$SEARCH" | jq -r '.results[0].id')" +TOP_DIST="$(echo "$SEARCH" | jq -r '.results[0].distance')" +DIST_OK="$(python3 -c "import sys; sys.exit(0 if abs($TOP_DIST) < 1e-4 else 1)" && echo y || echo n)" +if [ "$TOP_ID" = "w-0" ] && [ "$DIST_OK" = "y" ]; then + echo " ✓ embed → store → search round-trip: w-0 at dist=$TOP_DIST" +else + echo " ✗ recall: top=$TOP_ID dist=$TOP_DIST" + echo " full: $SEARCH" + FAILED=1 +fi + +# Clean up the index. +curl -sS -o /dev/null -X DELETE "http://127.0.0.1:3110/v1/vectors/index/$NAME" || true + +if [ "$FAILED" -eq 0 ]; then + echo "[g2-smoke] G2 acceptance gate: PASSED" + exit 0 +else + echo "[g2-smoke] G2 acceptance gate: FAILED" + exit 1 +fi