Bridges the missing piece for the staffing co-pilot: text inputs to
vectord-shaped vectors. Standalone cmd/embedd on :3216 fronted by
gateway at /v1/embed. Pluggable embed.Provider interface (G2 ships
Ollama; OpenAI/Voyage swap in via the same interface in G3+).
Wire format:
POST /v1/embed {"texts":[...], "model":"..."} // model optional
→ 200 {"model","dimension","vectors":[[...]]}
Default model: nomic-embed-text (768-d). Ollama returns float64;
provider converts to float32 at the boundary so vectors flow through
vectord/HNSW without re-conversion.
Acceptance smoke 5/5 PASS — including the architectural payoff:
end-to-end embed → vectord add → search by re-embedded text returns
recall=1 at distance 5.96e-8 (float32 precision noise on identical
unit vectors). The staffing co-pilot pipeline (text → vector →
similarity search) is now functional end-to-end.
All 9 smokes (D1-D6 + G1 + G1P + G2) PASS deterministically.
Cross-lineage scrum on shipped code:
- Opus 4.7 (opencode): 0 BLOCK + 4 WARN + 3 INFO
- Kimi K2-0905 (openrouter): 0 BLOCK + 2 WARN + 1 INFO
- Qwen3-coder (openrouter): "No BLOCKs" (3 tokens)
Fixed (2 — 1 convergent + 1 single-reviewer):
C1 (Opus + Kimi convergent WARN): per-text 60s timeout × N-text
batch was up to N×60s with no batch-level cap. One stuck Ollama
call would stall the whole handler indefinitely. Fix:
context.WithTimeout(r.Context(), 60s) wraps the entire batch.
O-W3 (Opus WARN): empty strings in texts went to Ollama unchecked,
producing version-dependent garbage. Fix: reject "" with 400 at
the handler boundary so callers get a deterministic answer
instead of an upstream-conditional 502.
Deferred (4): drainAndClose 64KiB cap (matches G0 pattern), no
concurrency limit on /embed (single-tenant G2), missing Accept
header (exotic-proxy concern), MaxBytesError string-match
redundancy (paranoia layer kept consistent across codebase).
Zero false positives this round — Qwen returned 3 tokens "No BLOCKs"
and the other two reviewers' findings were all real.
Setup confirmed: Ollama 0.21.0 on :11434 with nomic-embed-text loaded.
Per-text /api/embeddings used (forward-compat with 0.21+); newer
0.4+ /api/embed batch endpoint can swap in via the Provider interface.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
128 lines
3.8 KiB
Go
// ollama.go — Provider backed by an Ollama HTTP server. Compatible
// with Ollama 0.21+ via the per-text /api/embeddings endpoint.
// Newer Ollama (0.4+) exposes /api/embed for batched calls, but
// the per-text loop is forward-compatible with both.
package embed
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// OllamaProvider hits an Ollama server at the configured base URL.
// It issues one /api/embeddings call per input text (see Embed) and
// converts the float64 reply to float32 at the boundary. The zero
// value is not usable; construct with NewOllama.
type OllamaProvider struct {
	baseURL      string       // server root with trailing slash stripped (normalized by NewOllama)
	defaultModel string       // model used when callers pass an empty model name
	hc           *http.Client // shared client; NewOllama sets a 60s per-request timeout
}
|
|
|
|
// NewOllama builds a provider against baseURL (e.g.
|
|
// "http://localhost:11434"). defaultModel is what gets used when
|
|
// callers pass an empty model name.
|
|
func NewOllama(baseURL, defaultModel string) *OllamaProvider {
|
|
return &OllamaProvider{
|
|
baseURL: strings.TrimRight(baseURL, "/"),
|
|
defaultModel: defaultModel,
|
|
hc: &http.Client{
|
|
// Embeddings are CPU-bound on the server side; 60s gives
|
|
// plenty of headroom for a single-text call. Caller can
|
|
// add an outer ctx deadline for batch-level cap.
|
|
Timeout: 60 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// ollamaRequest is Ollama's /api/embeddings body shape. Each request
// carries exactly one prompt; batching is done by looping in Embed.
type ollamaRequest struct {
	Model  string `json:"model"`  // effective model name (already defaulted by Embed)
	Prompt string `json:"prompt"` // the single text to embed
}
|
|
|
|
// ollamaResponse mirrors the success body. Embedding is float64
// from Ollama; we convert to float32 at the boundary. Any other
// fields in the upstream JSON are ignored by the decoder.
type ollamaResponse struct {
	Embedding []float64 `json:"embedding"`
}
|
|
|
|
// Embed loops over texts, issuing one HTTP call per text. Errors
|
|
// short-circuit — if call N fails, we return the error and the
|
|
// caller sees no partial Result.
|
|
func (p *OllamaProvider) Embed(ctx context.Context, texts []string, model string) (Result, error) {
|
|
if len(texts) == 0 {
|
|
return Result{}, ErrEmptyTexts
|
|
}
|
|
if model == "" {
|
|
model = p.defaultModel
|
|
}
|
|
if model == "" {
|
|
return Result{}, fmt.Errorf("embed: no model (empty request, no default)")
|
|
}
|
|
|
|
out := Result{Model: model, Vectors: make([][]float32, 0, len(texts))}
|
|
for i, text := range texts {
|
|
vec, err := p.embedOne(ctx, model, text)
|
|
if err != nil {
|
|
return Result{}, fmt.Errorf("embed text[%d]: %w", i, err)
|
|
}
|
|
// Per-text vectors must agree on dimension. Lock on first.
|
|
if out.Dimension == 0 {
|
|
out.Dimension = len(vec)
|
|
} else if len(vec) != out.Dimension {
|
|
return Result{}, fmt.Errorf("%w: text[%d] returned %d, prior were %d",
|
|
ErrModelMismatch, i, len(vec), out.Dimension)
|
|
}
|
|
out.Vectors = append(out.Vectors, vec)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (p *OllamaProvider) embedOne(ctx context.Context, model, text string) ([]float32, error) {
|
|
body, err := json.Marshal(ollamaRequest{Model: model, Prompt: text})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshal: %w", err)
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
|
|
p.baseURL+"/api/embeddings", bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("req: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.ContentLength = int64(len(body))
|
|
|
|
resp, err := p.hc.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("do: %w", err)
|
|
}
|
|
defer drainAndClose(resp.Body)
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
preview, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
|
return nil, fmt.Errorf("upstream status %d: %s", resp.StatusCode, string(preview))
|
|
}
|
|
var or ollamaResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&or); err != nil {
|
|
return nil, fmt.Errorf("decode: %w", err)
|
|
}
|
|
if len(or.Embedding) == 0 {
|
|
return nil, fmt.Errorf("upstream returned empty embedding")
|
|
}
|
|
// Float64 → Float32. Loss of precision is acceptable for HNSW
|
|
// search; float32 matches the rest of the system.
|
|
out := make([]float32, len(or.Embedding))
|
|
for i, v := range or.Embedding {
|
|
out[i] = float32(v)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// drainAndClose reads up to 64 KiB of leftover response body so the
// HTTP transport can reuse the underlying connection, then closes it.
// Both steps are best-effort; errors are deliberately ignored.
func drainAndClose(body io.ReadCloser) {
	// CopyN stops at the cap or on EOF, whichever comes first —
	// same byte budget as Copy over a LimitReader.
	_, _ = io.CopyN(io.Discard, body, 64<<10)
	_ = body.Close()
}
|