root 166470f532 corpusingest: extract reusable text→vector ingest pipeline
Generalizes the staffing_500k driver's embed-and-push loop into
internal/corpusingest. Per docs/SPEC.md §3.4 component 1 (corpus
builders): adding a new staffing/code/playbook corpus is now one
Source impl + one main.go calling Run, not 200 lines of pipeline
copy-paste.

API:
  type Source interface { Next() (Row, error) }
  func Run(ctx, Config, Source) (Stats, error)
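
Config/Stats shape, as reconstructed from the driver's Run call and
the fields it prints (a sketch of the surface this commit relies on,
not necessarily the full exported API):

  type Config struct {
      GatewayURL   string
      IndexName    string // required
      Dimension    int    // required
      Distance     string // e.g. "cosine"
      EmbedBatch   int
      EmbedWorkers int
      AddBatch     int
      Limit        int // 0 = all
      DropExisting bool
      HTTPClient   *http.Client
      LogProgress  time.Duration
  }

  type Stats struct {
      Scanned, Embedded, Added int
      Wall                     time.Duration
  }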

Library owns:
  - Index lifecycle (create, optional drop-existing, idempotent
    reuse on 409)
  - Parallel embed dispatcher (configurable workers + batch size)
  - Vectord push batching
  - Progress logging + Stats reporting
  - Partial-failure semantics (log + continue per-batch errors;
    operator decides on re-run via the Stats.Embedded vs Scanned
    delta; see the sketch after this list)
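
Operator-side re-run check, sketched (uses only the Stats fields the
driver prints; note the delta also covers rows skipped for empty
Text, per the test notes below):

  stats, err := corpusingest.Run(ctx, cfg, src)
  if err != nil {
      log.Fatalf("ingest: %v", err)
  }
  if delta := stats.Scanned - stats.Embedded; delta > 0 {
      log.Printf("%d scanned rows not embedded; decide on re-run", delta)
  }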

Per-corpus driver owns: source parsing + column→Row mapping +
post-ingest validation queries.

Refactor scripts/staffing_500k/main.go to use it. Driver is now
~190 lines (was 339), with the embed/add plumbing replaced by one
Run call. -drop flag added so callers can opt out of the destructive
DELETE-first behavior (default still true to keep the 500K test
clean-recall semantics).

Unit tests (internal/corpusingest/ingest_test.go, 8/8 PASS):
  - Pipeline shape: 50 rows / 16 batch → 4 embed + 4 add calls,
    every ID added exactly once, vectors at correct dimension (a
    stub Source for this kind of assertion is sketched after this
    list)
  - DropExisting fires DELETE
  - 409 on create → reuse existing index
  - Limit stops early
  - Empty Text rows skipped (counted as scanned, not added)
  - Required IndexName + Dimension validation
  - Context cancel stops mid-pipeline
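
A stub Source keeps those shape assertions cheap to write; a minimal
sketch (type and field names here are illustrative, not the test
file's actual names):

  type stubSource struct{ n, max int }

  func (s *stubSource) Next() (corpusingest.Row, error) {
      if s.n >= s.max {
          return corpusingest.Row{}, io.EOF // end-of-source
      }
      s.n++
      return corpusingest.Row{
          ID:   fmt.Sprintf("row-%d", s.n),
          Text: fmt.Sprintf("text %d", s.n),
      }, nil
  }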

Real bug caught and fixed by the test suite: if embedd ever returns
fewer vectors than texts in the request (degraded backend), the
addBatch loop would panic with index-out-of-range. Worker now
length-checks the response and logs+skips on mismatch.
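
The shape of the fix, roughly (names are illustrative; the real
worker loop lives in internal/corpusingest):

  if len(embedResp.Vectors) != len(batchTexts) {
      log.Printf("embed returned %d vectors for %d texts; skipping batch",
          len(embedResp.Vectors), len(batchTexts))
      continue // skip instead of indexing out of range
  }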

12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix). vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 18:47:18 -05:00

// Staffing co-pilot scale test driver — workers_500k corpus.
//
// Pipeline: workers_500k.csv → /v1/embed → /v1/vectors/index/workers_500k/add.
// The pipeline itself lives in internal/corpusingest; this driver
// provides the CSV → Row mapping and the post-ingest semantic queries
// that are the human-readable check ("does forklift OSHA-30 actually
// retrieve forklift workers?").
//
// Designed to be re-run safely; index gets DELETEd at the start
// when -drop is set so leftover state doesn't bias recall.
package main

import (
	"bytes"
	"context"
	"encoding/csv"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
)

const (
	indexName = "workers_500k"
	dim       = 768

	// Column indexes in workers_500k.csv. Stable contract; if the CSV
	// schema changes these need updating.
	colWorkerID = 0
	colName     = 1
	colRole     = 2
	colCity     = 5
	colState    = 6
	colSkills   = 8
	colCerts    = 9
	colResume   = 17
)

// workersCSV implements corpusingest.Source. CSV reader state +
// row → Row mapping live here; the embed/add pipeline is generic.
type workersCSV struct {
	cr *csv.Reader
}
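
// Errors from csv.Reader pass through Next untouched; io.EOF in
// particular, which the corpusingest pipeline takes as end-of-source.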
func (s *workersCSV) Next() (corpusingest.Row, error) {
	for {
		row, err := s.cr.Read()
		if err != nil {
			return corpusingest.Row{}, err
		}
		if len(row) <= colResume {
			continue // skip malformed rows; matches prior behavior
		}
		id := strings.TrimSpace(row[colWorkerID])
		return corpusingest.Row{
			ID:   "w-" + id,
			Text: buildWorkerText(row),
			Metadata: map[string]any{
				"name":  row[colName],
				"role":  row[colRole],
				"city":  row[colCity],
				"state": row[colState],
			},
		}, nil
	}
}

// buildWorkerText concatenates staffing-relevant columns into the
// embed-text. Order: role first (most semantically dense), then
// location, skills, certs, prose resume. Embedding models weight
// earlier tokens slightly more, so the front matter matters.
func buildWorkerText(row []string) string {
	var b strings.Builder
	b.WriteString(row[colRole])
	b.WriteString(" in ")
	b.WriteString(row[colCity])
	b.WriteString(", ")
	b.WriteString(row[colState])
	b.WriteString(". Skills: ")
	b.WriteString(row[colSkills])
	b.WriteString(". Certifications: ")
	b.WriteString(row[colCerts])
	b.WriteString(". ")
	b.WriteString(row[colResume])
	return b.String()
}

func main() {
	var (
		gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
		csvPath = flag.String("csv", "/tmp/rs/workers_500k.csv", "path to workers CSV")
		limit   = flag.Int("limit", 0, "limit rows (0 = all)")
		queries = flag.String("queries", "default", "default | <semicolon-separated query strings>")
		skipPop = flag.Bool("skip-populate", false, "skip embed+add, only run queries")
		drop    = flag.Bool("drop", true, "DELETE index before populate (default true for clean recall)")
	)
	flag.Parse()

	hc := &http.Client{Timeout: 5 * time.Minute}
	ctx := context.Background()

	if !*skipPop {
		f, err := os.Open(*csvPath)
		if err != nil {
			log.Fatalf("open csv: %v", err)
		}
		defer f.Close()
		cr := csv.NewReader(f)
		cr.FieldsPerRecord = -1
		if _, err := cr.Read(); err != nil { // skip header
			log.Fatalf("read header: %v", err)
		}

		stats, err := corpusingest.Run(ctx, corpusingest.Config{
			GatewayURL:   *gateway,
			IndexName:    indexName,
			Dimension:    dim,
			Distance:     "cosine",
			EmbedBatch:   16,   // matches Ollama-on-A4000 sweet spot
			EmbedWorkers: 8,    // matches Ollama-on-A4000 sweet spot
			AddBatch:     1000, // empirically fine; vectord BatchAdd lock-amortized at f1c1883
			Limit:        *limit,
			DropExisting: *drop,
			HTTPClient:   hc,
			LogProgress:  10 * time.Second,
		}, &workersCSV{cr: cr})
		if err != nil {
			log.Fatalf("ingest: %v", err)
		}
		fmt.Printf("[sc] populate done: scanned=%d embedded=%d added=%d wall=%v\n",
			stats.Scanned, stats.Embedded, stats.Added, stats.Wall.Round(time.Millisecond))
	}

	// Validate semantic queries against the populated index.
	qs := defaultQueries()
	if *queries != "default" {
		qs = strings.Split(*queries, ";")
	}
	for _, q := range qs {
		runQuery(hc, *gateway, q)
	}
}

func defaultQueries() []string {
	return []string{
		"CNC operator with first article and gauge R&R experience",
		"forklift driver OSHA-30 certified warehouse",
		"warehouse picker night shift bilingual",
		"dental hygienist three years experience",
		"electrician with industrial wiring background",
	}
}

// runQuery embeds a query, searches the index, prints top hits.
// Stays in this driver (not corpusingest); query validation is a
// per-corpus concern, not part of the ingest pipeline.
func runQuery(hc *http.Client, gateway, q string) {
	t0 := time.Now()
	body, _ := json.Marshal(map[string]any{"texts": []string{q}})
	req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/embed", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	resp, err := hc.Do(req)
	if err != nil {
		fmt.Printf("[sc] query %q: embed err: %v\n", q, err)
		return
	}
	defer resp.Body.Close()
	var er struct {
		Vectors [][]float32 `json:"vectors"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&er); err != nil || len(er.Vectors) == 0 {
		fmt.Printf("[sc] query %q: embed decode err: %v\n", q, err)
		return
	}
	embedDur := time.Since(t0)

	t1 := time.Now()
	body, _ = json.Marshal(map[string]any{"vector": er.Vectors[0], "k": 5})
	req, _ = http.NewRequest(http.MethodPost,
		gateway+"/v1/vectors/index/"+indexName+"/search", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	resp, err = hc.Do(req)
	if err != nil {
		fmt.Printf("[sc] query %q: search err: %v\n", q, err)
		return
	}
	defer resp.Body.Close()
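	// Note: the defer after the embed call bound that response's Body
	// when the defer statement ran, so reassigning resp above does not
	// leak it; both bodies are closed when runQuery returns.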
	searchDur := time.Since(t1)
	var sr struct {
		Results []struct {
			ID       string          `json:"id"`
			Distance float32         `json:"distance"`
			Metadata json.RawMessage `json:"metadata"`
		} `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
		fmt.Printf("[sc] query %q: decode err: %v\n", q, err)
		return
	}

	fmt.Printf("\n[sc] %q (embed=%v search=%v)\n", q, embedDur.Round(time.Millisecond), searchDur.Round(time.Millisecond))
	for i, r := range sr.Results {
		fmt.Printf(" %d. %s d=%.4f %s\n", i+1, r.ID, r.Distance, string(r.Metadata))
	}
}

// Keep the io import referenced: a future "EOF means done" check in
// this driver's Source impl can then use io.EOF without adding a
// fresh import line.
var _ = io.EOF