Cross-lineage scrum review on the 12 commits of this session
(afbb506..06e7152) via Rust gateway :3100 with Opus + Kimi +
Qwen3-coder. Results:
Real findings landed:
1. Opus BLOCK — vectord BatchAdd intra-batch duplicates panic
coder/hnsw's "node not added" length-invariant. Fixed with
last-write-wins dedup inside BatchAdd before the pre-pass.
Regression test TestBatchAdd_IntraBatchDedup added.
2. Opus + Kimi convergent WARN — strings.Contains(err.Error(),
"status 404") was brittle string-matching to detect cold-
start playbook state. Fixed: ErrCorpusNotFound sentinel
returned by searchCorpus on HTTP 404; fetchPlaybookHits
uses errors.Is.
3. Opus WARN — corpusingest.Run returned nil on total batch
failure, masking broken pipelines as "empty corpora." Fixed:
Stats.FailedBatches counter, ErrPartialFailure sentinel
returned when nonzero. New regression test
TestRun_NonzeroFailedBatchesReturnsError.
4. Opus WARN — dead var _ = io.EOF in staffing_500k/main.go
was justified by a fictional comment. Removed.
Drivers (staffing_500k, staffing_candidates, staffing_workers)
updated to handle ErrPartialFailure gracefully — print warn, keep
running queries — rather than fatal'ing on transient hiccups
while still surfacing the failure clearly in the output.
Documented (no code change):
- Opus WARN: matrixd /matrix/downgrade reads
LH_FORCE_FULL_ENRICHMENT from process env when body omits
it. Comment now explains the opinionated default and points
callers wanting deterministic behavior to pass the field
explicitly.
False positives dismissed (caught and verified, NOT acted on):
A. Kimi BLOCK on errors.Is + wrapped error in cmd/matrixd:223.
Verified false: Search wraps with %w (fmt.Errorf("%w: %v",
ErrEmbed, err)), so errors.Is matches the chain correctly.
B. Kimi INFO "BatchAdd has no unit tests." Verified false:
batch_bench_test.go has BenchmarkBatchAdd; the new dedup
test TestBatchAdd_IntraBatchDedup adds another.
C. Opus BLOCK on missing finite/zero-norm pre-validation in
cmd/vectord:280-291. Verified false: line 272 already calls
vectord.ValidateVector before BatchAdd, so finite + zero-
norm IS checked. Pre-validation is exhaustive.
D. Opus WARN on relevance.go tokenRe (Opus self-corrected
mid-finding when realizing leading char counts toward token
length).
Qwen3-coder returned NO FINDINGS — known issue with very long
diffs through the OpenRouter free tier; lineage rotation worked
as designed (Opus + Kimi between them caught everything Qwen
would have).
15-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance, downgrade, playbook).
Unit tests all green (corpusingest +1, vectord +1).
Per feedback_cross_lineage_review.md: convergent finding #2 (404
detection) is the highest-signal one — both Opus and Kimi
flagged it independently. The other Opus findings stand on
single-reviewer signal but each one verified against the actual
code.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
304 lines
8.8 KiB
Go
304 lines
8.8 KiB
Go
// Staffing candidates corpus driver — second corpus on the Go side
|
|
// after workers_500k. Validates the corpusingest substrate against
|
|
// real production-shape parquet data and gives the matrix indexer a
|
|
// second corpus to compose against.
|
|
//
|
|
// Source: /home/profit/lakehouse/data/datasets/candidates.parquet
|
|
// (1000 candidates, 11 columns including skills + status + years).
|
|
//
|
|
// IDs are prefixed "c-" so merged matrix results across corpora
|
|
// stay unambiguous (workers use "w-").
|
|
//
|
|
// Post-ingest: runs a real staffing query through /v1/matrix/search
|
|
// against just the candidates corpus — first deep-field reality test
|
|
// using the new pipeline.
|
|
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/apache/arrow-go/v18/arrow/array"
|
|
"github.com/apache/arrow-go/v18/arrow/memory"
|
|
"github.com/apache/arrow-go/v18/parquet/file"
|
|
"github.com/apache/arrow-go/v18/parquet/pqarrow"
|
|
|
|
"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
|
|
)
|
|
|
|
// Corpus identity shared by the populate phase and the reality-test query.
const (
	// indexName is the vector index this driver owns on the gateway;
	// runMatrixQuery searches against the same name.
	indexName = "candidates"
	// dim is the embedding dimensionality passed to corpusingest.Run;
	// must match the gateway's embedding model output.
	dim = 768
)
|
|
|
|
// candidatesSource implements corpusingest.Source over an in-memory
// arrow.Table loaded from candidates.parquet. 1000 rows fits
// comfortably in RAM; a chunked-record-batch reader is the next
// abstraction when a multi-million-row parquet shows up.
type candidatesSource struct {
	// cols holds one typed Arrow array per parquet column. Each is the
	// single backing chunk of its column (newCandidatesSource rejects
	// multi-chunk columns), so plain Value(i) indexing is valid for all
	// rows 0..n-1.
	cols struct {
		id, firstName, lastName, email, phone, city, state, skills, status *array.String
		years, rate                                                        *array.Int64
	}
	n   int // total row count in the loaded table
	cur int // index of the next row Next will emit
}
|
|
|
|
func newCandidatesSource(path string) (*candidatesSource, func(), error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("open parquet: %w", err)
|
|
}
|
|
pf, err := file.NewParquetReader(f)
|
|
if err != nil {
|
|
f.Close()
|
|
return nil, nil, fmt.Errorf("parquet reader: %w", err)
|
|
}
|
|
fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
|
|
if err != nil {
|
|
pf.Close()
|
|
f.Close()
|
|
return nil, nil, fmt.Errorf("arrow reader: %w", err)
|
|
}
|
|
table, err := fr.ReadTable(context.Background())
|
|
if err != nil {
|
|
pf.Close()
|
|
f.Close()
|
|
return nil, nil, fmt.Errorf("read table: %w", err)
|
|
}
|
|
|
|
src := &candidatesSource{n: int(table.NumRows())}
|
|
schema := table.Schema()
|
|
|
|
stringColByName := func(name string) (*array.String, error) {
|
|
idx := schema.FieldIndices(name)
|
|
if len(idx) == 0 {
|
|
return nil, fmt.Errorf("column %q not found", name)
|
|
}
|
|
ch := table.Column(idx[0]).Data()
|
|
if ch.Len() == 0 {
|
|
return nil, fmt.Errorf("column %q empty", name)
|
|
}
|
|
// Single-chunk assumption — ReadTable on a single-row-group
|
|
// 1000-row parquet returns one chunk. If parquets get larger,
|
|
// switch to RecordReader and iterate chunks.
|
|
if n := len(ch.Chunks()); n != 1 {
|
|
return nil, fmt.Errorf("column %q has %d chunks; only 1 supported here", name, n)
|
|
}
|
|
s, ok := ch.Chunk(0).(*array.String)
|
|
if !ok {
|
|
return nil, fmt.Errorf("column %q is %T, want *array.String", name, ch.Chunk(0))
|
|
}
|
|
return s, nil
|
|
}
|
|
int64ColByName := func(name string) (*array.Int64, error) {
|
|
idx := schema.FieldIndices(name)
|
|
if len(idx) == 0 {
|
|
return nil, fmt.Errorf("column %q not found", name)
|
|
}
|
|
ch := table.Column(idx[0]).Data()
|
|
i, ok := ch.Chunk(0).(*array.Int64)
|
|
if !ok {
|
|
return nil, fmt.Errorf("column %q is %T, want *array.Int64", name, ch.Chunk(0))
|
|
}
|
|
return i, nil
|
|
}
|
|
|
|
cleanup := func() {
|
|
table.Release()
|
|
pf.Close()
|
|
f.Close()
|
|
}
|
|
for _, t := range []struct {
|
|
name string
|
|
dst **array.String
|
|
}{
|
|
{"candidate_id", &src.cols.id},
|
|
{"first_name", &src.cols.firstName},
|
|
{"last_name", &src.cols.lastName},
|
|
{"email", &src.cols.email},
|
|
{"phone", &src.cols.phone},
|
|
{"city", &src.cols.city},
|
|
{"state", &src.cols.state},
|
|
{"skills", &src.cols.skills},
|
|
{"status", &src.cols.status},
|
|
} {
|
|
col, err := stringColByName(t.name)
|
|
if err != nil {
|
|
cleanup()
|
|
return nil, nil, err
|
|
}
|
|
*t.dst = col
|
|
}
|
|
for _, t := range []struct {
|
|
name string
|
|
dst **array.Int64
|
|
}{
|
|
{"years_experience", &src.cols.years},
|
|
{"hourly_rate_usd", &src.cols.rate},
|
|
} {
|
|
col, err := int64ColByName(t.name)
|
|
if err != nil {
|
|
cleanup()
|
|
return nil, nil, err
|
|
}
|
|
*t.dst = col
|
|
}
|
|
return src, cleanup, nil
|
|
}
|
|
|
|
func (s *candidatesSource) Next() (corpusingest.Row, error) {
|
|
if s.cur >= s.n {
|
|
return corpusingest.Row{}, io.EOF
|
|
}
|
|
i := s.cur
|
|
s.cur++
|
|
|
|
candidateID := s.cols.id.Value(i)
|
|
firstName := s.cols.firstName.Value(i)
|
|
lastName := s.cols.lastName.Value(i)
|
|
city := s.cols.city.Value(i)
|
|
state := s.cols.state.Value(i)
|
|
skills := s.cols.skills.Value(i)
|
|
status := s.cols.status.Value(i)
|
|
years := s.cols.years.Value(i)
|
|
rate := s.cols.rate.Value(i)
|
|
|
|
// Embed text: name + role-shape from skills + location + experience
|
|
// + status. Order matters — embedding models weight earlier tokens
|
|
// slightly more, so role-relevant signal (skills) goes first.
|
|
var b strings.Builder
|
|
b.WriteString("Candidate skills: ")
|
|
b.WriteString(skills)
|
|
b.WriteString(". Based in ")
|
|
b.WriteString(city)
|
|
b.WriteString(", ")
|
|
b.WriteString(state)
|
|
b.WriteString(". ")
|
|
fmt.Fprintf(&b, "%d years experience. Status: %s. ", years, status)
|
|
b.WriteString(firstName)
|
|
b.WriteString(" ")
|
|
b.WriteString(lastName)
|
|
b.WriteString(".")
|
|
|
|
return corpusingest.Row{
|
|
ID: "c-" + candidateID,
|
|
Text: b.String(),
|
|
Metadata: map[string]any{
|
|
"candidate_id": candidateID,
|
|
"first_name": firstName,
|
|
"last_name": lastName,
|
|
"email": s.cols.email.Value(i),
|
|
"phone": s.cols.phone.Value(i),
|
|
"city": city,
|
|
"state": state,
|
|
"skills": skills,
|
|
"status": status,
|
|
"years_experience": years,
|
|
"hourly_rate_usd": rate,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
func main() {
|
|
var (
|
|
gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
|
|
parquetPath = flag.String("parquet", "/home/profit/lakehouse/data/datasets/candidates.parquet", "candidates parquet")
|
|
limit = flag.Int("limit", 0, "limit rows (0 = all 1000)")
|
|
query = flag.String("query", "Python AWS Docker engineer in Chicago available now", "post-ingest reality-test query")
|
|
drop = flag.Bool("drop", true, "DELETE candidates index before populate")
|
|
skipPop = flag.Bool("skip-populate", false, "skip ingest, only run query")
|
|
)
|
|
flag.Parse()
|
|
|
|
hc := &http.Client{Timeout: 5 * time.Minute}
|
|
ctx := context.Background()
|
|
|
|
if !*skipPop {
|
|
src, cleanup, err := newCandidatesSource(*parquetPath)
|
|
if err != nil {
|
|
log.Fatalf("open candidates source: %v", err)
|
|
}
|
|
defer cleanup()
|
|
|
|
stats, err := corpusingest.Run(ctx, corpusingest.Config{
|
|
GatewayURL: *gateway,
|
|
IndexName: indexName,
|
|
Dimension: dim,
|
|
Distance: "cosine",
|
|
EmbedBatch: 16,
|
|
EmbedWorkers: 8,
|
|
AddBatch: 500, // 1000 candidates → 2 add calls; small batches keep memory bounded
|
|
Limit: *limit,
|
|
DropExisting: *drop,
|
|
HTTPClient: hc,
|
|
LogProgress: 5 * time.Second,
|
|
}, src)
|
|
if err != nil {
|
|
if errors.Is(err, corpusingest.ErrPartialFailure) {
|
|
fmt.Printf("[candidates] WARN partial failure: %v\n", err)
|
|
} else {
|
|
log.Fatalf("ingest: %v", err)
|
|
}
|
|
}
|
|
fmt.Printf("[candidates] populate: scanned=%d embedded=%d added=%d failed=%d wall=%v\n",
|
|
stats.Scanned, stats.Embedded, stats.Added, stats.FailedBatches,
|
|
stats.Wall.Round(time.Millisecond))
|
|
}
|
|
|
|
// Reality test — run a real staffing query through /v1/matrix/search
|
|
// against just the candidates corpus. Multi-corpus retrieval against
|
|
// workers + candidates is the next step.
|
|
fmt.Printf("\n[candidates] reality test query: %q\n", *query)
|
|
runMatrixQuery(hc, *gateway, *query)
|
|
}
|
|
|
|
func runMatrixQuery(hc *http.Client, gateway, query string) {
|
|
body, _ := json.Marshal(map[string]any{
|
|
"query_text": query,
|
|
"corpora": []string{indexName},
|
|
"k": 5,
|
|
"per_corpus_k": 10,
|
|
})
|
|
req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/matrix/search", bytes.NewReader(body))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
t0 := time.Now()
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
log.Fatalf("matrix search: %v", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
dur := time.Since(t0)
|
|
if resp.StatusCode != 200 {
|
|
preview, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
|
log.Fatalf("matrix search %d: %s", resp.StatusCode, preview)
|
|
}
|
|
var sr struct {
|
|
Results []struct {
|
|
ID string `json:"id"`
|
|
Distance float32 `json:"distance"`
|
|
Corpus string `json:"corpus"`
|
|
Metadata json.RawMessage `json:"metadata"`
|
|
} `json:"results"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
|
|
log.Fatalf("decode: %v", err)
|
|
}
|
|
fmt.Printf("[candidates] matrix returned %d hits in %v:\n", len(sr.Results), dur.Round(time.Millisecond))
|
|
for i, r := range sr.Results {
|
|
fmt.Printf(" %d. %s d=%.4f corpus=%s\n %s\n",
|
|
i+1, r.ID, r.Distance, r.Corpus, string(r.Metadata))
|
|
}
|
|
}
|