Generalizes the staffing_500k driver's embed-and-push loop into
internal/corpusingest. Per docs/SPEC.md §3.4 component 1 (corpus
builders): adding a new staffing/code/playbook corpus is now one
Source impl + one main.go calling Run, not 200 lines of pipeline
copy-paste.
API:
type Source interface { Next() (Row, error) }
func Run(ctx, Config, Source) (Stats, error)
Library owns:
- Index lifecycle (create, optional drop-existing, idempotent
reuse on 409)
- Parallel embed dispatcher (configurable workers + batch size)
- Vectord push batching
- Progress logging + Stats reporting
- Partial-failure semantics (log + continue per-batch errors;
operator decides on re-run via Stats.Embedded vs Scanned delta)
Per-corpus driver owns: source parsing + column→Row mapping +
post-ingest validation queries.
Refactor scripts/staffing_500k/main.go to use it. Driver is now
~190 lines (was 339), with the embed/add plumbing replaced by one
Run call. -drop flag added so callers can opt out of the destructive
DELETE-first behavior (default still true to keep the 500K test
clean-recall semantics).
Unit tests (internal/corpusingest/ingest_test.go, 8/8 PASS):
- Pipeline shape: 50 rows / 16 batch → 4 embed + 4 add calls,
every ID added exactly once, vectors at correct dimension
- DropExisting fires DELETE
- 409 on create → reuse existing index
- Limit stops early
- Empty Text rows skipped (counted as scanned, not added)
- Required IndexName + Dimension validation
- Context cancel stops mid-pipeline
Real bug caught and fixed by the test suite: if embedd ever returns
fewer vectors than texts in the request (degraded backend), the
addBatch loop would panic with index-out-of-range. Worker now
length-checks the response and logs+skips on mismatch.
12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix). vet clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
218 lines
6.4 KiB
Go
// Staffing co-pilot scale test driver — workers_500k corpus.
//
// Pipeline: workers_500k.csv → /v1/embed → /v1/vectors/index/workers_500k/add.
// The pipeline itself lives in internal/corpusingest; this driver
// provides the CSV → Row mapping and the post-ingest semantic queries
// that are the human-readable check ("does forklift OSHA-30 actually
// retrieve forklift workers?").
//
// Designed to be re-run safely; index gets DELETEd at the start
// when -drop is set so leftover state doesn't bias recall.
package main
|
|
|
|
import (
	"bytes"
	"context"
	"encoding/csv"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
)
|
|
|
|
const (
	// Target vectord index and the embedding dimensionality it is
	// created with (must match the embed backend's output width).
	indexName = "workers_500k"
	dim       = 768

	// Column indexes in workers_500k.csv. Stable contract; if the CSV
	// schema changes these need updating.
	colWorkerID = 0
	colName     = 1
	colRole     = 2
	colCity     = 5
	colState    = 6
	colSkills   = 8
	colCerts    = 9
	colResume   = 17
)
|
|
|
|
// workersCSV implements corpusingest.Source. CSV reader state +
|
|
// row → Row mapping live here; the embed/add pipeline is generic.
|
|
type workersCSV struct {
|
|
cr *csv.Reader
|
|
}
|
|
|
|
func (s *workersCSV) Next() (corpusingest.Row, error) {
|
|
for {
|
|
row, err := s.cr.Read()
|
|
if err != nil {
|
|
return corpusingest.Row{}, err
|
|
}
|
|
if len(row) <= colResume {
|
|
continue // skip malformed rows; matches prior behavior
|
|
}
|
|
id := strings.TrimSpace(row[colWorkerID])
|
|
return corpusingest.Row{
|
|
ID: "w-" + id,
|
|
Text: buildWorkerText(row),
|
|
Metadata: map[string]any{
|
|
"name": row[colName],
|
|
"role": row[colRole],
|
|
"city": row[colCity],
|
|
"state": row[colState],
|
|
},
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
// buildWorkerText concatenates staffing-relevant columns into the
|
|
// embed-text. Order: role first (most semantically dense), then
|
|
// location, skills, certs, prose resume. Embedding models weight
|
|
// earlier tokens slightly more, so the front matter matters.
|
|
func buildWorkerText(row []string) string {
|
|
var b strings.Builder
|
|
b.WriteString(row[colRole])
|
|
b.WriteString(" in ")
|
|
b.WriteString(row[colCity])
|
|
b.WriteString(", ")
|
|
b.WriteString(row[colState])
|
|
b.WriteString(". Skills: ")
|
|
b.WriteString(row[colSkills])
|
|
b.WriteString(". Certifications: ")
|
|
b.WriteString(row[colCerts])
|
|
b.WriteString(". ")
|
|
b.WriteString(row[colResume])
|
|
return b.String()
|
|
}
|
|
|
|
func main() {
|
|
var (
|
|
gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
|
|
csvPath = flag.String("csv", "/tmp/rs/workers_500k.csv", "path to workers CSV")
|
|
limit = flag.Int("limit", 0, "limit rows (0 = all)")
|
|
queries = flag.String("queries", "default", "default | <semicolon-separated query strings>")
|
|
skipPop = flag.Bool("skip-populate", false, "skip embed+add, only run queries")
|
|
drop = flag.Bool("drop", true, "DELETE index before populate (default true for clean recall)")
|
|
)
|
|
flag.Parse()
|
|
|
|
hc := &http.Client{Timeout: 5 * time.Minute}
|
|
ctx := context.Background()
|
|
|
|
if !*skipPop {
|
|
f, err := os.Open(*csvPath)
|
|
if err != nil {
|
|
log.Fatalf("open csv: %v", err)
|
|
}
|
|
defer f.Close()
|
|
cr := csv.NewReader(f)
|
|
cr.FieldsPerRecord = -1
|
|
if _, err := cr.Read(); err != nil { // skip header
|
|
log.Fatalf("read header: %v", err)
|
|
}
|
|
|
|
stats, err := corpusingest.Run(ctx, corpusingest.Config{
|
|
GatewayURL: *gateway,
|
|
IndexName: indexName,
|
|
Dimension: dim,
|
|
Distance: "cosine",
|
|
EmbedBatch: 16, // matches Ollama-on-A4000 sweet spot
|
|
EmbedWorkers: 8, // matches Ollama-on-A4000 sweet spot
|
|
AddBatch: 1000, // empirically fine; vectord BatchAdd lock-amortized at f1c1883
|
|
Limit: *limit,
|
|
DropExisting: *drop,
|
|
HTTPClient: hc,
|
|
LogProgress: 10 * time.Second,
|
|
}, &workersCSV{cr: cr})
|
|
if err != nil {
|
|
log.Fatalf("ingest: %v", err)
|
|
}
|
|
fmt.Printf("[sc] populate done: scanned=%d embedded=%d added=%d wall=%v\n",
|
|
stats.Scanned, stats.Embedded, stats.Added, stats.Wall.Round(time.Millisecond))
|
|
}
|
|
|
|
// Validate semantic queries against the populated index.
|
|
qs := defaultQueries()
|
|
if *queries != "default" {
|
|
qs = strings.Split(*queries, ";")
|
|
}
|
|
for _, q := range qs {
|
|
runQuery(hc, *gateway, q)
|
|
}
|
|
}
|
|
|
|
// defaultQueries returns the built-in validation set: one query per
// staffing vertical the 500K corpus is expected to cover, so a human
// can eyeball whether retrieval is semantically sane.
func defaultQueries() []string {
	qs := make([]string, 0, 5)
	qs = append(qs,
		"CNC operator with first article and gauge R&R experience",
		"forklift driver OSHA-30 certified warehouse",
		"warehouse picker night shift bilingual",
		"dental hygienist three years experience",
		"electrician with industrial wiring background",
	)
	return qs
}
|
|
|
|
// runQuery embeds a query, searches the index, prints top hits.
|
|
// Stays in this driver (not corpusingest) — query validation is
|
|
// per-corpus concern, not part of the ingest pipeline.
|
|
func runQuery(hc *http.Client, gateway, q string) {
|
|
t0 := time.Now()
|
|
body, _ := json.Marshal(map[string]any{"texts": []string{q}})
|
|
req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/embed", bytes.NewReader(body))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
fmt.Printf("[sc] query %q: embed err: %v\n", q, err)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
var er struct {
|
|
Vectors [][]float32 `json:"vectors"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&er); err != nil || len(er.Vectors) == 0 {
|
|
fmt.Printf("[sc] query %q: embed decode err: %v\n", q, err)
|
|
return
|
|
}
|
|
embedDur := time.Since(t0)
|
|
|
|
t1 := time.Now()
|
|
body, _ = json.Marshal(map[string]any{"vector": er.Vectors[0], "k": 5})
|
|
req, _ = http.NewRequest(http.MethodPost,
|
|
gateway+"/v1/vectors/index/"+indexName+"/search", bytes.NewReader(body))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err = hc.Do(req)
|
|
if err != nil {
|
|
fmt.Printf("[sc] query %q: search err: %v\n", q, err)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
searchDur := time.Since(t1)
|
|
var sr struct {
|
|
Results []struct {
|
|
ID string `json:"id"`
|
|
Distance float32 `json:"distance"`
|
|
Metadata json.RawMessage `json:"metadata"`
|
|
} `json:"results"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
|
|
fmt.Printf("[sc] query %q: decode err: %v\n", q, err)
|
|
return
|
|
}
|
|
fmt.Printf("\n[sc] %q (embed=%v search=%v)\n", q, embedDur.Round(time.Millisecond), searchDur.Round(time.Millisecond))
|
|
for i, r := range sr.Results {
|
|
fmt.Printf(" %d. %s d=%.4f %s\n", i+1, r.ID, r.Distance, string(r.Metadata))
|
|
}
|
|
}
|
|
|
|
// io is imported by this file solely for this reference; keeping
// io.EOF mentioned here means a hypothetical future "EOF means done"
// check in this driver's Source impl doesn't need a fresh import line.
var _ = io.EOF
|