// Staffing candidates corpus driver — second corpus on the Go side // after workers_500k. Validates the corpusingest substrate against // real production-shape parquet data and gives the matrix indexer a // second corpus to compose against. // // Source: /home/profit/lakehouse/data/datasets/candidates.parquet // (1000 candidates, 11 columns including skills + status + years). // // IDs are prefixed "c-" so merged matrix results across corpora // stay unambiguous (workers use "w-"). // // Post-ingest: runs a real staffing query through /v1/matrix/search // against just the candidates corpus — first deep-field reality test // using the new pipeline. package main import ( "bytes" "context" "encoding/json" "errors" "flag" "fmt" "io" "log" "net/http" "os" "strings" "time" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/memory" "github.com/apache/arrow-go/v18/parquet/file" "github.com/apache/arrow-go/v18/parquet/pqarrow" "git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest" ) const ( indexName = "candidates" dim = 768 ) // candidatesSource implements corpusingest.Source over an in-memory // arrow.Table loaded from candidates.parquet. 1000 rows fits // comfortably in RAM; a chunked-record-batch reader is the next // abstraction when a multi-million-row parquet shows up. type candidatesSource struct { cols struct { id, firstName, lastName, email, phone, city, state, skills, status *array.String years, rate *array.Int64 } n int cur int } func newCandidatesSource(path string) (*candidatesSource, func(), error) { f, err := os.Open(path) if err != nil { return nil, nil, fmt.Errorf("open parquet: %w", err) } pf, err := file.NewParquetReader(f) if err != nil { f.Close() return nil, nil, fmt.Errorf("parquet reader: %w", err) } fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator) if err != nil { pf.Close() f.Close() return nil, nil, fmt.Errorf("arrow reader: %w", err) } table, err := fr.ReadTable(context.Background()) if err != nil { pf.Close() f.Close() return nil, nil, fmt.Errorf("read table: %w", err) } src := &candidatesSource{n: int(table.NumRows())} schema := table.Schema() stringColByName := func(name string) (*array.String, error) { idx := schema.FieldIndices(name) if len(idx) == 0 { return nil, fmt.Errorf("column %q not found", name) } ch := table.Column(idx[0]).Data() if ch.Len() == 0 { return nil, fmt.Errorf("column %q empty", name) } // Single-chunk assumption — ReadTable on a single-row-group // 1000-row parquet returns one chunk. If parquets get larger, // switch to RecordReader and iterate chunks. if n := len(ch.Chunks()); n != 1 { return nil, fmt.Errorf("column %q has %d chunks; only 1 supported here", name, n) } s, ok := ch.Chunk(0).(*array.String) if !ok { return nil, fmt.Errorf("column %q is %T, want *array.String", name, ch.Chunk(0)) } return s, nil } int64ColByName := func(name string) (*array.Int64, error) { idx := schema.FieldIndices(name) if len(idx) == 0 { return nil, fmt.Errorf("column %q not found", name) } ch := table.Column(idx[0]).Data() i, ok := ch.Chunk(0).(*array.Int64) if !ok { return nil, fmt.Errorf("column %q is %T, want *array.Int64", name, ch.Chunk(0)) } return i, nil } cleanup := func() { table.Release() pf.Close() f.Close() } for _, t := range []struct { name string dst **array.String }{ {"candidate_id", &src.cols.id}, {"first_name", &src.cols.firstName}, {"last_name", &src.cols.lastName}, {"email", &src.cols.email}, {"phone", &src.cols.phone}, {"city", &src.cols.city}, {"state", &src.cols.state}, {"skills", &src.cols.skills}, {"status", &src.cols.status}, } { col, err := stringColByName(t.name) if err != nil { cleanup() return nil, nil, err } *t.dst = col } for _, t := range []struct { name string dst **array.Int64 }{ {"years_experience", &src.cols.years}, {"hourly_rate_usd", &src.cols.rate}, } { col, err := int64ColByName(t.name) if err != nil { cleanup() return nil, nil, err } *t.dst = col } return src, cleanup, nil } func (s *candidatesSource) Next() (corpusingest.Row, error) { if s.cur >= s.n { return corpusingest.Row{}, io.EOF } i := s.cur s.cur++ candidateID := s.cols.id.Value(i) firstName := s.cols.firstName.Value(i) lastName := s.cols.lastName.Value(i) city := s.cols.city.Value(i) state := s.cols.state.Value(i) skills := s.cols.skills.Value(i) status := s.cols.status.Value(i) years := s.cols.years.Value(i) rate := s.cols.rate.Value(i) // Embed text: name + role-shape from skills + location + experience // + status. Order matters — embedding models weight earlier tokens // slightly more, so role-relevant signal (skills) goes first. var b strings.Builder b.WriteString("Candidate skills: ") b.WriteString(skills) b.WriteString(". Based in ") b.WriteString(city) b.WriteString(", ") b.WriteString(state) b.WriteString(". ") fmt.Fprintf(&b, "%d years experience. Status: %s. ", years, status) b.WriteString(firstName) b.WriteString(" ") b.WriteString(lastName) b.WriteString(".") return corpusingest.Row{ ID: "c-" + candidateID, Text: b.String(), Metadata: map[string]any{ "candidate_id": candidateID, "first_name": firstName, "last_name": lastName, "email": s.cols.email.Value(i), "phone": s.cols.phone.Value(i), "city": city, "state": state, "skills": skills, "status": status, "years_experience": years, "hourly_rate_usd": rate, }, }, nil } func main() { var ( gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL") parquetPath = flag.String("parquet", "/home/profit/lakehouse/data/datasets/candidates.parquet", "candidates parquet") limit = flag.Int("limit", 0, "limit rows (0 = all 1000)") query = flag.String("query", "Python AWS Docker engineer in Chicago available now", "post-ingest reality-test query") drop = flag.Bool("drop", true, "DELETE candidates index before populate") skipPop = flag.Bool("skip-populate", false, "skip ingest, only run query") ) flag.Parse() hc := &http.Client{Timeout: 5 * time.Minute} ctx := context.Background() if !*skipPop { src, cleanup, err := newCandidatesSource(*parquetPath) if err != nil { log.Fatalf("open candidates source: %v", err) } defer cleanup() stats, err := corpusingest.Run(ctx, corpusingest.Config{ GatewayURL: *gateway, IndexName: indexName, Dimension: dim, Distance: "cosine", EmbedBatch: 16, EmbedWorkers: 8, AddBatch: 500, // 1000 candidates → 2 add calls; small batches keep memory bounded Limit: *limit, DropExisting: *drop, HTTPClient: hc, LogProgress: 5 * time.Second, }, src) if err != nil { if errors.Is(err, corpusingest.ErrPartialFailure) { fmt.Printf("[candidates] WARN partial failure: %v\n", err) } else { log.Fatalf("ingest: %v", err) } } fmt.Printf("[candidates] populate: scanned=%d embedded=%d added=%d failed=%d wall=%v\n", stats.Scanned, stats.Embedded, stats.Added, stats.FailedBatches, stats.Wall.Round(time.Millisecond)) } // Reality test — run a real staffing query through /v1/matrix/search // against just the candidates corpus. Multi-corpus retrieval against // workers + candidates is the next step. fmt.Printf("\n[candidates] reality test query: %q\n", *query) runMatrixQuery(hc, *gateway, *query) } func runMatrixQuery(hc *http.Client, gateway, query string) { body, _ := json.Marshal(map[string]any{ "query_text": query, "corpora": []string{indexName}, "k": 5, "per_corpus_k": 10, }) req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/matrix/search", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") t0 := time.Now() resp, err := hc.Do(req) if err != nil { log.Fatalf("matrix search: %v", err) } defer resp.Body.Close() dur := time.Since(t0) if resp.StatusCode != 200 { preview, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) log.Fatalf("matrix search %d: %s", resp.StatusCode, preview) } var sr struct { Results []struct { ID string `json:"id"` Distance float32 `json:"distance"` Corpus string `json:"corpus"` Metadata json.RawMessage `json:"metadata"` } `json:"results"` } if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil { log.Fatalf("decode: %v", err) } fmt.Printf("[candidates] matrix returned %d hits in %v:\n", len(sr.Results), dur.Round(time.Millisecond)) for i, r := range sr.Results { fmt.Printf(" %d. %s d=%.4f corpus=%s\n %s\n", i+1, r.ID, r.Distance, r.Corpus, string(r.Metadata)) } }