// Staffing co-pilot scale test driver — workers_500k corpus.
//
// Pipeline: workers_500k.csv → /v1/embed → /v1/vectors/index/workers_500k/add.
// The pipeline itself lives in internal/corpusingest; this driver
// provides the CSV → Row mapping and the post-ingest semantic queries
// that are the human-readable check ("does forklift OSHA-30 actually
// retrieve forklift workers?").
//
// Designed to be re-run safely; index gets DELETEd at the start
// when -drop is set so leftover state doesn't bias recall.
package main

import (
	"bytes"
	"context"
	"encoding/csv"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
)

const (
	// indexName is the vectord index this driver populates and queries.
	indexName = "workers_500k"
	// dim is the embedding dimension the gateway's embed model produces.
	dim = 768

	// Column indexes in workers_500k.csv. Stable contract; if the CSV
	// schema changes these need updating.
	colWorkerID = 0
	colName     = 1
	colRole     = 2
	colCity     = 5
	colState    = 6
	colSkills   = 8
	colCerts    = 9
	colResume   = 17
)

// workersCSV implements corpusingest.Source. CSV reader state +
// row → Row mapping live here; the embed/add pipeline is generic.
type workersCSV struct {
	cr *csv.Reader // positioned past the header row by main()
}

// Next reads CSV records until it finds one wide enough to contain the
// resume column, maps it to a corpusingest.Row, and returns it. Rows
// shorter than colResume+1 fields are skipped silently. Errors from the
// underlying reader (including io.EOF at end of input) are returned
// unchanged so the pipeline can detect end-of-corpus.
func (s *workersCSV) Next() (corpusingest.Row, error) {
	for {
		row, err := s.cr.Read()
		if err != nil {
			return corpusingest.Row{}, err
		}
		if len(row) <= colResume {
			continue // skip malformed rows; matches prior behavior
		}
		id := strings.TrimSpace(row[colWorkerID])
		return corpusingest.Row{
			// "w-" prefix namespaces worker IDs within the index.
			ID:   "w-" + id,
			Text: buildWorkerText(row),
			// Metadata is returned verbatim in search results so query
			// output is human-checkable without a second lookup.
			Metadata: map[string]any{
				"name":  row[colName],
				"role":  row[colRole],
				"city":  row[colCity],
				"state": row[colState],
			},
		}, nil
	}
}

// buildWorkerText concatenates staffing-relevant columns into the
// embed-text. Order: role first (most semantically dense), then
// location, skills, certs, prose resume. Embedding models weight
// earlier tokens slightly more, so the front matter matters.
func buildWorkerText(row []string) string { var b strings.Builder b.WriteString(row[colRole]) b.WriteString(" in ") b.WriteString(row[colCity]) b.WriteString(", ") b.WriteString(row[colState]) b.WriteString(". Skills: ") b.WriteString(row[colSkills]) b.WriteString(". Certifications: ") b.WriteString(row[colCerts]) b.WriteString(". ") b.WriteString(row[colResume]) return b.String() } func main() { var ( gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL") csvPath = flag.String("csv", "/tmp/rs/workers_500k.csv", "path to workers CSV") limit = flag.Int("limit", 0, "limit rows (0 = all)") queries = flag.String("queries", "default", "default | ") skipPop = flag.Bool("skip-populate", false, "skip embed+add, only run queries") drop = flag.Bool("drop", true, "DELETE index before populate (default true for clean recall)") ) flag.Parse() hc := &http.Client{Timeout: 5 * time.Minute} ctx := context.Background() if !*skipPop { f, err := os.Open(*csvPath) if err != nil { log.Fatalf("open csv: %v", err) } defer f.Close() cr := csv.NewReader(f) cr.FieldsPerRecord = -1 if _, err := cr.Read(); err != nil { // skip header log.Fatalf("read header: %v", err) } stats, err := corpusingest.Run(ctx, corpusingest.Config{ GatewayURL: *gateway, IndexName: indexName, Dimension: dim, Distance: "cosine", EmbedBatch: 16, // matches Ollama-on-A4000 sweet spot EmbedWorkers: 8, // matches Ollama-on-A4000 sweet spot AddBatch: 1000, // empirically fine; vectord BatchAdd lock-amortized at f1c1883 Limit: *limit, DropExisting: *drop, HTTPClient: hc, LogProgress: 10 * time.Second, }, &workersCSV{cr: cr}) if err != nil { // ErrPartialFailure means SOME batches failed but we still // have a corpus to query. Report and continue rather than // nuking the run for transient Ollama hiccups. 
if errors.Is(err, corpusingest.ErrPartialFailure) { fmt.Printf("[sc] WARN partial failure: %v\n", err) } else { log.Fatalf("ingest: %v", err) } } fmt.Printf("[sc] populate done: scanned=%d embedded=%d added=%d failed=%d wall=%v\n", stats.Scanned, stats.Embedded, stats.Added, stats.FailedBatches, stats.Wall.Round(time.Millisecond)) } // Validate semantic queries against the populated index. qs := defaultQueries() if *queries != "default" { qs = strings.Split(*queries, ";") } for _, q := range qs { runQuery(hc, *gateway, q) } } func defaultQueries() []string { return []string{ "CNC operator with first article and gauge R&R experience", "forklift driver OSHA-30 certified warehouse", "warehouse picker night shift bilingual", "dental hygienist three years experience", "electrician with industrial wiring background", } } // runQuery embeds a query, searches the index, prints top hits. // Stays in this driver (not corpusingest) — query validation is // per-corpus concern, not part of the ingest pipeline. 
func runQuery(hc *http.Client, gateway, q string) { t0 := time.Now() body, _ := json.Marshal(map[string]any{"texts": []string{q}}) req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/embed", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { fmt.Printf("[sc] query %q: embed err: %v\n", q, err) return } defer resp.Body.Close() var er struct { Vectors [][]float32 `json:"vectors"` } if err := json.NewDecoder(resp.Body).Decode(&er); err != nil || len(er.Vectors) == 0 { fmt.Printf("[sc] query %q: embed decode err: %v\n", q, err) return } embedDur := time.Since(t0) t1 := time.Now() body, _ = json.Marshal(map[string]any{"vector": er.Vectors[0], "k": 5}) req, _ = http.NewRequest(http.MethodPost, gateway+"/v1/vectors/index/"+indexName+"/search", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") resp, err = hc.Do(req) if err != nil { fmt.Printf("[sc] query %q: search err: %v\n", q, err) return } defer resp.Body.Close() searchDur := time.Since(t1) var sr struct { Results []struct { ID string `json:"id"` Distance float32 `json:"distance"` Metadata json.RawMessage `json:"metadata"` } `json:"results"` } if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil { fmt.Printf("[sc] query %q: decode err: %v\n", q, err) return } fmt.Printf("\n[sc] %q (embed=%v search=%v)\n", q, embedDur.Round(time.Millisecond), searchDur.Round(time.Millisecond)) for i, r := range sr.Results { fmt.Printf(" %d. %s d=%.4f %s\n", i+1, r.ID, r.Distance, string(r.Metadata)) } }