root 166470f532 corpusingest: extract reusable text→vector ingest pipeline
Generalizes the staffing_500k driver's embed-and-push loop into
internal/corpusingest. Per docs/SPEC.md §3.4 component 1 (corpus
builders): adding a new staffing/code/playbook corpus is now one
Source impl + one main.go calling Run, not 200 lines of pipeline
copy-paste.

API:
  type Source interface { Next() (Row, error) }
  func Run(ctx, Config, Source) (Stats, error)
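
Config/Stats shape, as reconstructed from the driver's Run call and
the fields it prints (a sketch of the surface this commit relies on,
not necessarily the full exported API):

  type Config struct {
      GatewayURL   string
      IndexName    string // required
      Dimension    int    // required
      Distance     string // e.g. "cosine"
      EmbedBatch   int
      EmbedWorkers int
      AddBatch     int
      Limit        int // 0 = all
      DropExisting bool
      HTTPClient   *http.Client
      LogProgress  time.Duration
  }

  type Stats struct {
      Scanned, Embedded, Added int
      Wall                     time.Duration
  }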

Library owns:
  - Index lifecycle (create, optional drop-existing, idempotent
    reuse on 409)
  - Parallel embed dispatcher (configurable workers + batch size)
  - Vectord push batching
  - Progress logging + Stats reporting
  - Partial-failure semantics (log + continue per-batch errors;
    operator decides on re-run via the Stats.Embedded vs Scanned
    delta; see the sketch after this list)
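
Operator-side re-run check, sketched (uses only the Stats fields the
driver prints; note the delta also covers rows skipped for empty
Text, per the test notes below):

  stats, err := corpusingest.Run(ctx, cfg, src)
  if err != nil {
      log.Fatalf("ingest: %v", err)
  }
  if delta := stats.Scanned - stats.Embedded; delta > 0 {
      log.Printf("%d scanned rows not embedded; decide on re-run", delta)
  }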

Per-corpus driver owns: source parsing + column→Row mapping +
post-ingest validation queries.

Refactor scripts/staffing_500k/main.go to use it. Driver is now
~190 lines (was 339), with the embed/add plumbing replaced by one
Run call. -drop flag added so callers can opt out of the destructive
DELETE-first behavior (default still true to keep the 500K test
clean-recall semantics).

Unit tests (internal/corpusingest/ingest_test.go, 8/8 PASS):
  - Pipeline shape: 50 rows / 16 batch → 4 embed + 4 add calls,
    every ID added exactly once, vectors at correct dimension (a
    stub Source for this kind of assertion is sketched after this
    list)
  - DropExisting fires DELETE
  - 409 on create → reuse existing index
  - Limit stops early
  - Empty Text rows skipped (counted as scanned, not added)
  - Required IndexName + Dimension validation
  - Context cancel stops mid-pipeline
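
A stub Source keeps those shape assertions cheap to write; a minimal
sketch (type and field names here are illustrative, not the test
file's actual names):

  type stubSource struct{ n, max int }

  func (s *stubSource) Next() (corpusingest.Row, error) {
      if s.n >= s.max {
          return corpusingest.Row{}, io.EOF // end-of-source
      }
      s.n++
      return corpusingest.Row{
          ID:   fmt.Sprintf("row-%d", s.n),
          Text: fmt.Sprintf("text %d", s.n),
      }, nil
  }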

Real bug caught and fixed by the test suite: if embedd ever returns
fewer vectors than texts in the request (degraded backend), the
addBatch loop would panic with index-out-of-range. Worker now
length-checks the response and logs+skips on mismatch.
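
The shape of the fix, roughly (names are illustrative; the real
worker loop lives in internal/corpusingest):

  if len(embedResp.Vectors) != len(batchTexts) {
      log.Printf("embed returned %d vectors for %d texts; skipping batch",
          len(embedResp.Vectors), len(batchTexts))
      continue // skip instead of indexing out of range
  }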

12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix). vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 18:47:18 -05:00

// Staffing co-pilot scale test driver — workers_500k corpus.
//
// Pipeline: workers_500k.csv → /v1/embed → /v1/vectors/index/workers_500k/add.
// The pipeline itself lives in internal/corpusingest; this driver
// provides the CSV → Row mapping and the post-ingest semantic queries
// that are the human-readable check ("does forklift OSHA-30 actually
// retrieve forklift workers?").
//
// Designed to be re-run safely; index gets DELETEd at the start
// when -drop is set so leftover state doesn't bias recall.
package main

import (
	"bytes"
	"context"
	"encoding/csv"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
)

const (
	indexName = "workers_500k"
	dim       = 768

	// Column indexes in workers_500k.csv. Stable contract; if the CSV
	// schema changes these need updating.
	colWorkerID = 0
	colName     = 1
	colRole     = 2
	colCity     = 5
	colState    = 6
	colSkills   = 8
	colCerts    = 9
	colResume   = 17
)

// workersCSV implements corpusingest.Source. CSV reader state +
// row → Row mapping live here; the embed/add pipeline is generic.
type workersCSV struct {
	cr *csv.Reader
}
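
// Errors from csv.Reader pass through Next untouched; io.EOF in
// particular, which the corpusingest pipeline takes as end-of-source.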
func (s *workersCSV) Next() (corpusingest.Row, error) {
	for {
		row, err := s.cr.Read()
		if err != nil {
			return corpusingest.Row{}, err
		}
		if len(row) <= colResume {
			continue // skip malformed rows; matches prior behavior
		}
		id := strings.TrimSpace(row[colWorkerID])
		return corpusingest.Row{
			ID:   "w-" + id,
			Text: buildWorkerText(row),
			Metadata: map[string]any{
				"name":  row[colName],
				"role":  row[colRole],
				"city":  row[colCity],
				"state": row[colState],
			},
		}, nil
	}
}

// buildWorkerText concatenates staffing-relevant columns into the
// embed-text. Order: role first (most semantically dense), then
// location, skills, certs, prose resume. Embedding models weight
// earlier tokens slightly more, so the front matter matters.
func buildWorkerText(row []string) string {
	var b strings.Builder
	b.WriteString(row[colRole])
	b.WriteString(" in ")
	b.WriteString(row[colCity])
	b.WriteString(", ")
	b.WriteString(row[colState])
	b.WriteString(". Skills: ")
	b.WriteString(row[colSkills])
	b.WriteString(". Certifications: ")
	b.WriteString(row[colCerts])
	b.WriteString(". ")
	b.WriteString(row[colResume])
	return b.String()
}

func main() {
	var (
		gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
		csvPath = flag.String("csv", "/tmp/rs/workers_500k.csv", "path to workers CSV")
		limit   = flag.Int("limit", 0, "limit rows (0 = all)")
		queries = flag.String("queries", "default", "default | <semicolon-separated query strings>")
		skipPop = flag.Bool("skip-populate", false, "skip embed+add, only run queries")
		drop    = flag.Bool("drop", true, "DELETE index before populate (default true for clean recall)")
	)
	flag.Parse()

	hc := &http.Client{Timeout: 5 * time.Minute}
	ctx := context.Background()

	if !*skipPop {
		f, err := os.Open(*csvPath)
		if err != nil {
			log.Fatalf("open csv: %v", err)
		}
		defer f.Close()
		cr := csv.NewReader(f)
		cr.FieldsPerRecord = -1
		if _, err := cr.Read(); err != nil { // skip header
			log.Fatalf("read header: %v", err)
		}

		stats, err := corpusingest.Run(ctx, corpusingest.Config{
			GatewayURL:   *gateway,
			IndexName:    indexName,
			Dimension:    dim,
			Distance:     "cosine",
			EmbedBatch:   16,   // matches Ollama-on-A4000 sweet spot
			EmbedWorkers: 8,    // matches Ollama-on-A4000 sweet spot
			AddBatch:     1000, // empirically fine; vectord BatchAdd lock-amortized at f1c1883
			Limit:        *limit,
			DropExisting: *drop,
			HTTPClient:   hc,
			LogProgress:  10 * time.Second,
		}, &workersCSV{cr: cr})
		if err != nil {
			log.Fatalf("ingest: %v", err)
		}
		fmt.Printf("[sc] populate done: scanned=%d embedded=%d added=%d wall=%v\n",
			stats.Scanned, stats.Embedded, stats.Added, stats.Wall.Round(time.Millisecond))
	}

	// Validate semantic queries against the populated index.
	qs := defaultQueries()
	if *queries != "default" {
		qs = strings.Split(*queries, ";")
	}
	for _, q := range qs {
		runQuery(hc, *gateway, q)
	}
}

func defaultQueries() []string {
	return []string{
		"CNC operator with first article and gauge R&R experience",
		"forklift driver OSHA-30 certified warehouse",
		"warehouse picker night shift bilingual",
		"dental hygienist three years experience",
		"electrician with industrial wiring background",
	}
}

// runQuery embeds a query, searches the index, prints top hits.
// Stays in this driver (not corpusingest); query validation is a
// per-corpus concern, not part of the ingest pipeline.
func runQuery(hc *http.Client, gateway, q string) {
	t0 := time.Now()
	body, _ := json.Marshal(map[string]any{"texts": []string{q}})
	req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/embed", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	resp, err := hc.Do(req)
	if err != nil {
		fmt.Printf("[sc] query %q: embed err: %v\n", q, err)
		return
	}
	defer resp.Body.Close()
	var er struct {
		Vectors [][]float32 `json:"vectors"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&er); err != nil || len(er.Vectors) == 0 {
		fmt.Printf("[sc] query %q: embed decode err: %v\n", q, err)
		return
	}
	embedDur := time.Since(t0)

	t1 := time.Now()
	body, _ = json.Marshal(map[string]any{"vector": er.Vectors[0], "k": 5})
	req, _ = http.NewRequest(http.MethodPost,
		gateway+"/v1/vectors/index/"+indexName+"/search", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	resp, err = hc.Do(req)
	if err != nil {
		fmt.Printf("[sc] query %q: search err: %v\n", q, err)
		return
	}
	defer resp.Body.Close()
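	// Note: the defer after the embed call bound that response's Body
	// when the defer statement ran, so reassigning resp above does not
	// leak it; both bodies are closed when runQuery returns.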
	searchDur := time.Since(t1)
	var sr struct {
		Results []struct {
			ID       string          `json:"id"`
			Distance float32         `json:"distance"`
			Metadata json.RawMessage `json:"metadata"`
		} `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
		fmt.Printf("[sc] query %q: decode err: %v\n", q, err)
		return
	}

	fmt.Printf("\n[sc] %q (embed=%v search=%v)\n", q, embedDur.Round(time.Millisecond), searchDur.Round(time.Millisecond))
	for i, r := range sr.Results {
		fmt.Printf(" %d. %s d=%.4f %s\n", i+1, r.ID, r.Distance, string(r.Metadata))
	}
}

// Keep the io import referenced: a future "EOF means done" check in
// this driver's Source impl can then use io.EOF without adding a
// fresh import line.
var _ = io.EOF