golangLAKEHOUSE/scripts/staffing_candidates/main.go

// Staffing candidates corpus driver — second corpus on the Go side
// after workers_500k. Validates the corpusingest substrate against
// real production-shape parquet data and gives the matrix indexer a
// second corpus to compose against.
//
// Source: /home/profit/lakehouse/data/datasets/candidates.parquet
// (1000 candidates, 11 columns including skills + status + years).
//
// IDs are prefixed "c-" so merged matrix results across corpora
// stay unambiguous (workers use "w-").
//
// Post-ingest: runs a real staffing query through /v1/matrix/search
// against just the candidates corpus — first deep-field reality test
// using the new pipeline.

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
)

const (
	indexName = "candidates"
	dim       = 768
)

// candidatesSource implements corpusingest.Source over an in-memory
// arrow.Table loaded from candidates.parquet. 1000 rows fits
// comfortably in RAM; a chunked-record-batch reader is the next
// abstraction when a multi-million-row parquet shows up.
type candidatesSource struct {
	cols struct {
		id, firstName, lastName, email, phone, city, state, skills, status *array.String
		years, rate                                                        *array.Int64
	}
	n   int
	cur int
}

func newCandidatesSource(path string) (*candidatesSource, func(), error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, nil, fmt.Errorf("open parquet: %w", err)
	}
	pf, err := file.NewParquetReader(f)
	if err != nil {
		f.Close()
		return nil, nil, fmt.Errorf("parquet reader: %w", err)
	}
	fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
	if err != nil {
		pf.Close()
		f.Close()
		return nil, nil, fmt.Errorf("arrow reader: %w", err)
	}
	table, err := fr.ReadTable(context.Background())
	if err != nil {
		pf.Close()
		f.Close()
		return nil, nil, fmt.Errorf("read table: %w", err)
	}

	src := &candidatesSource{n: int(table.NumRows())}
	schema := table.Schema()

	stringColByName := func(name string) (*array.String, error) {
		idx := schema.FieldIndices(name)
		if len(idx) == 0 {
			return nil, fmt.Errorf("column %q not found", name)
		}
		ch := table.Column(idx[0]).Data()
		if ch.Len() == 0 {
			return nil, fmt.Errorf("column %q empty", name)
		}
		// Single-chunk assumption — ReadTable on a single-row-group
		// 1000-row parquet returns one chunk. If parquets get larger,
		// switch to RecordReader and iterate chunks.
		if n := len(ch.Chunks()); n != 1 {
			return nil, fmt.Errorf("column %q has %d chunks; only 1 supported here", name, n)
		}
		s, ok := ch.Chunk(0).(*array.String)
		if !ok {
			return nil, fmt.Errorf("column %q is %T, want *array.String", name, ch.Chunk(0))
		}
		return s, nil
	}
	int64ColByName := func(name string) (*array.Int64, error) {
		idx := schema.FieldIndices(name)
		if len(idx) == 0 {
			return nil, fmt.Errorf("column %q not found", name)
		}
		ch := table.Column(idx[0]).Data()
		i, ok := ch.Chunk(0).(*array.Int64)
		if !ok {
			return nil, fmt.Errorf("column %q is %T, want *array.Int64", name, ch.Chunk(0))
		}
		return i, nil
	}

	cleanup := func() {
		table.Release()
		pf.Close()
		f.Close()
	}
	for _, t := range []struct {
		name string
		dst  **array.String
	}{
		{"candidate_id", &src.cols.id},
		{"first_name", &src.cols.firstName},
		{"last_name", &src.cols.lastName},
		{"email", &src.cols.email},
		{"phone", &src.cols.phone},
		{"city", &src.cols.city},
		{"state", &src.cols.state},
		{"skills", &src.cols.skills},
		{"status", &src.cols.status},
	} {
		col, err := stringColByName(t.name)
		if err != nil {
			cleanup()
			return nil, nil, err
		}
		*t.dst = col
	}
	for _, t := range []struct {
		name string
		dst  **array.Int64
	}{
		{"years_experience", &src.cols.years},
		{"hourly_rate_usd", &src.cols.rate},
	} {
		col, err := int64ColByName(t.name)
		if err != nil {
			cleanup()
			return nil, nil, err
		}
		*t.dst = col
	}
	return src, cleanup, nil
}

func (s *candidatesSource) Next() (corpusingest.Row, error) {
	if s.cur >= s.n {
		return corpusingest.Row{}, io.EOF
	}
	i := s.cur
	s.cur++

	candidateID := s.cols.id.Value(i)
	firstName := s.cols.firstName.Value(i)
	lastName := s.cols.lastName.Value(i)
	city := s.cols.city.Value(i)
	state := s.cols.state.Value(i)
	skills := s.cols.skills.Value(i)
	status := s.cols.status.Value(i)
	years := s.cols.years.Value(i)
	rate := s.cols.rate.Value(i)

	// Embed text: name + role-shape from skills + location + experience
	// + status. Order matters — embedding models weight earlier tokens
	// slightly more, so role-relevant signal (skills) goes first.
	var b strings.Builder
	b.WriteString("Candidate skills: ")
	b.WriteString(skills)
	b.WriteString(". Based in ")
	b.WriteString(city)
	b.WriteString(", ")
	b.WriteString(state)
	b.WriteString(". ")
	fmt.Fprintf(&b, "%d years experience. Status: %s. ", years, status)
	b.WriteString(firstName)
	b.WriteString(" ")
	b.WriteString(lastName)
	b.WriteString(".")

	return corpusingest.Row{
		ID:   "c-" + candidateID,
		Text: b.String(),
		Metadata: map[string]any{
			"candidate_id":     candidateID,
			"first_name":       firstName,
			"last_name":        lastName,
			"email":            s.cols.email.Value(i),
			"phone":            s.cols.phone.Value(i),
			"city":             city,
			"state":            state,
			"skills":           skills,
			"status":           status,
			"years_experience": years,
			"hourly_rate_usd":  rate,
		},
	}, nil
}

func main() {
	var (
		gateway     = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
		parquetPath = flag.String("parquet", "/home/profit/lakehouse/data/datasets/candidates.parquet", "candidates parquet")
		limit       = flag.Int("limit", 0, "limit rows (0 = all 1000)")
		query       = flag.String("query", "Python AWS Docker engineer in Chicago available now", "post-ingest reality-test query")
		drop        = flag.Bool("drop", true, "DELETE candidates index before populate")
		skipPop     = flag.Bool("skip-populate", false, "skip ingest, only run query")
	)
	flag.Parse()

	hc := &http.Client{Timeout: 5 * time.Minute}
	ctx := context.Background()

	if !*skipPop {
		src, cleanup, err := newCandidatesSource(*parquetPath)
		if err != nil {
			log.Fatalf("open candidates source: %v", err)
		}
		defer cleanup()

		stats, err := corpusingest.Run(ctx, corpusingest.Config{
			GatewayURL:   *gateway,
			IndexName:    indexName,
			Dimension:    dim,
			Distance:     "cosine",
			EmbedBatch:   16,
			EmbedWorkers: 8,
			AddBatch:     500, // 1000 candidates → 2 add calls; small batches keep memory bounded
			Limit:        *limit,
			DropExisting: *drop,
			HTTPClient:   hc,
			LogProgress:  5 * time.Second,
		}, src)
		if err != nil {
			log.Fatalf("ingest: %v", err)
		}
		fmt.Printf("[candidates] populate: scanned=%d embedded=%d added=%d wall=%v\n",
			stats.Scanned, stats.Embedded, stats.Added, stats.Wall.Round(time.Millisecond))
	}

	// Reality test — run a real staffing query through /v1/matrix/search
	// against just the candidates corpus. Multi-corpus retrieval against
	// workers + candidates is the next step.
	fmt.Printf("\n[candidates] reality test query: %q\n", *query)
	runMatrixQuery(hc, *gateway, *query)
}

func runMatrixQuery(hc *http.Client, gateway, query string) {
	body, _ := json.Marshal(map[string]any{
		"query_text":   query,
		"corpora":      []string{indexName},
		"k":            5,
		"per_corpus_k": 10,
	})
	req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/matrix/search", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	t0 := time.Now()
	resp, err := hc.Do(req)
	if err != nil {
		log.Fatalf("matrix search: %v", err)
	}
	defer resp.Body.Close()
	dur := time.Since(t0)
	if resp.StatusCode != 200 {
		preview, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		log.Fatalf("matrix search %d: %s", resp.StatusCode, preview)
	}
	var sr struct {
		Results []struct {
			ID       string          `json:"id"`
			Distance float32         `json:"distance"`
			Corpus   string          `json:"corpus"`
			Metadata json.RawMessage `json:"metadata"`
		} `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
		log.Fatalf("decode: %v", err)
	}
	fmt.Printf("[candidates] matrix returned %d hits in %v:\n", len(sr.Results), dur.Round(time.Millisecond))
	for i, r := range sr.Results {
		fmt.Printf("  %d. %s d=%.4f corpus=%s\n     %s\n",
			i+1, r.ID, r.Distance, r.Corpus, string(r.Metadata))
	}
}