Cross-lineage scrum review on the 12 commits of this session
(afbb506..06e7152) via Rust gateway :3100 with Opus + Kimi +
Qwen3-coder. Results:
Real findings landed:
1. Opus BLOCK — vectord BatchAdd intra-batch duplicates panic
coder/hnsw's "node not added" length-invariant. Fixed with
last-write-wins dedup inside BatchAdd before the pre-pass.
Regression test TestBatchAdd_IntraBatchDedup added.
2. Opus + Kimi convergent WARN — strings.Contains(err.Error(),
"status 404") was brittle string-matching to detect cold-
start playbook state. Fixed: ErrCorpusNotFound sentinel
returned by searchCorpus on HTTP 404; fetchPlaybookHits
uses errors.Is.
3. Opus WARN — corpusingest.Run returned nil on total batch
failure, masking broken pipelines as "empty corpora." Fixed:
Stats.FailedBatches counter, ErrPartialFailure sentinel
returned when nonzero. New regression test
TestRun_NonzeroFailedBatchesReturnsError.
4. Opus WARN — dead var _ = io.EOF in staffing_500k/main.go
was justified by a fictional comment. Removed.
Drivers (staffing_500k, staffing_candidates, staffing_workers)
updated to handle ErrPartialFailure gracefully — print a warning and
keep running queries, rather than exiting fatally on transient
hiccups — while still surfacing the failure clearly in the output.
Documented (no code change):
- Opus WARN: matrixd /matrix/downgrade reads
LH_FORCE_FULL_ENRICHMENT from process env when body omits
it. Comment now explains the opinionated default and points
callers wanting deterministic behavior to pass the field
explicitly.
False positives dismissed (caught and verified, NOT acted on):
A. Kimi BLOCK on errors.Is + wrapped error in cmd/matrixd:223.
Verified false: Search wraps with %w (fmt.Errorf("%w: %v",
ErrEmbed, err)), so errors.Is matches the chain correctly.
B. Kimi INFO "BatchAdd has no unit tests." Verified false:
batch_bench_test.go has BenchmarkBatchAdd; the new dedup
test TestBatchAdd_IntraBatchDedup adds another.
C. Opus BLOCK on missing finite/zero-norm pre-validation in
cmd/vectord:280-291. Verified false: line 272 already calls
vectord.ValidateVector before BatchAdd, so finite + zero-
norm IS checked. Pre-validation is exhaustive.
D. Opus WARN on relevance.go tokenRe (Opus self-corrected
mid-finding when realizing leading char counts toward token
length).
Qwen3-coder returned NO FINDINGS — known issue with very long
diffs through the OpenRouter free tier; lineage rotation worked
as designed (Opus + Kimi between them caught everything Qwen
would have).
15-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance, downgrade, playbook).
Unit tests all green (corpusingest +1, vectord +1).
Per feedback_cross_lineage_review.md: convergent finding #2 (404
detection) is the highest-signal one — both Opus and Kimi
flagged it independently. The other Opus findings stand on
single-reviewer signal but each one verified against the actual
code.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
222 lines
6.5 KiB
Go
222 lines
6.5 KiB
Go
// Staffing co-pilot scale test driver — workers_500k corpus.
|
|
//
|
|
// Pipeline: workers_500k.csv → /v1/embed → /v1/vectors/index/workers_500k/add.
|
|
// The pipeline itself lives in internal/corpusingest; this driver
|
|
// provides the CSV → Row mapping and the post-ingest semantic queries
|
|
// that are the human-readable check ("does forklift OSHA-30 actually
|
|
// retrieve forklift workers?").
|
|
//
|
|
// Designed to be re-run safely; index gets DELETEd at the start
|
|
// when -drop is set so leftover state doesn't bias recall.
|
|
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/csv"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.agentview.dev/profit/golangLAKEHOUSE/internal/corpusingest"
|
|
)
|
|
|
|
const (
	// indexName is the vectord index this driver populates and queries.
	indexName = "workers_500k"
	// dim is the embedding dimension; must match the gateway embed
	// model's output width or index creation/adds will be rejected.
	dim = 768

	// Column indexes in workers_500k.csv. Stable contract; if the CSV
	// schema changes these need updating.
	colWorkerID = 0
	colName = 1
	colRole = 2
	colCity = 5
	colState = 6
	colSkills = 8
	colCerts = 9
	colResume = 17
)
|
|
|
|
// workersCSV implements corpusingest.Source. CSV reader state +
// row → Row mapping live here; the embed/add pipeline is generic.
type workersCSV struct {
	// cr yields raw CSV records. main sets FieldsPerRecord = -1 on it,
	// so short rows reach Next and are skipped there rather than
	// erroring inside the csv package.
	cr *csv.Reader
}
|
|
|
|
func (s *workersCSV) Next() (corpusingest.Row, error) {
|
|
for {
|
|
row, err := s.cr.Read()
|
|
if err != nil {
|
|
return corpusingest.Row{}, err
|
|
}
|
|
if len(row) <= colResume {
|
|
continue // skip malformed rows; matches prior behavior
|
|
}
|
|
id := strings.TrimSpace(row[colWorkerID])
|
|
return corpusingest.Row{
|
|
ID: "w-" + id,
|
|
Text: buildWorkerText(row),
|
|
Metadata: map[string]any{
|
|
"name": row[colName],
|
|
"role": row[colRole],
|
|
"city": row[colCity],
|
|
"state": row[colState],
|
|
},
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
// buildWorkerText concatenates staffing-relevant columns into the
|
|
// embed-text. Order: role first (most semantically dense), then
|
|
// location, skills, certs, prose resume. Embedding models weight
|
|
// earlier tokens slightly more, so the front matter matters.
|
|
func buildWorkerText(row []string) string {
|
|
var b strings.Builder
|
|
b.WriteString(row[colRole])
|
|
b.WriteString(" in ")
|
|
b.WriteString(row[colCity])
|
|
b.WriteString(", ")
|
|
b.WriteString(row[colState])
|
|
b.WriteString(". Skills: ")
|
|
b.WriteString(row[colSkills])
|
|
b.WriteString(". Certifications: ")
|
|
b.WriteString(row[colCerts])
|
|
b.WriteString(". ")
|
|
b.WriteString(row[colResume])
|
|
return b.String()
|
|
}
|
|
|
|
func main() {
|
|
var (
|
|
gateway = flag.String("gateway", "http://127.0.0.1:3110", "gateway base URL")
|
|
csvPath = flag.String("csv", "/tmp/rs/workers_500k.csv", "path to workers CSV")
|
|
limit = flag.Int("limit", 0, "limit rows (0 = all)")
|
|
queries = flag.String("queries", "default", "default | <semicolon-separated query strings>")
|
|
skipPop = flag.Bool("skip-populate", false, "skip embed+add, only run queries")
|
|
drop = flag.Bool("drop", true, "DELETE index before populate (default true for clean recall)")
|
|
)
|
|
flag.Parse()
|
|
|
|
hc := &http.Client{Timeout: 5 * time.Minute}
|
|
ctx := context.Background()
|
|
|
|
if !*skipPop {
|
|
f, err := os.Open(*csvPath)
|
|
if err != nil {
|
|
log.Fatalf("open csv: %v", err)
|
|
}
|
|
defer f.Close()
|
|
cr := csv.NewReader(f)
|
|
cr.FieldsPerRecord = -1
|
|
if _, err := cr.Read(); err != nil { // skip header
|
|
log.Fatalf("read header: %v", err)
|
|
}
|
|
|
|
stats, err := corpusingest.Run(ctx, corpusingest.Config{
|
|
GatewayURL: *gateway,
|
|
IndexName: indexName,
|
|
Dimension: dim,
|
|
Distance: "cosine",
|
|
EmbedBatch: 16, // matches Ollama-on-A4000 sweet spot
|
|
EmbedWorkers: 8, // matches Ollama-on-A4000 sweet spot
|
|
AddBatch: 1000, // empirically fine; vectord BatchAdd lock-amortized at f1c1883
|
|
Limit: *limit,
|
|
DropExisting: *drop,
|
|
HTTPClient: hc,
|
|
LogProgress: 10 * time.Second,
|
|
}, &workersCSV{cr: cr})
|
|
if err != nil {
|
|
// ErrPartialFailure means SOME batches failed but we still
|
|
// have a corpus to query. Report and continue rather than
|
|
// nuking the run for transient Ollama hiccups.
|
|
if errors.Is(err, corpusingest.ErrPartialFailure) {
|
|
fmt.Printf("[sc] WARN partial failure: %v\n", err)
|
|
} else {
|
|
log.Fatalf("ingest: %v", err)
|
|
}
|
|
}
|
|
fmt.Printf("[sc] populate done: scanned=%d embedded=%d added=%d failed=%d wall=%v\n",
|
|
stats.Scanned, stats.Embedded, stats.Added, stats.FailedBatches,
|
|
stats.Wall.Round(time.Millisecond))
|
|
}
|
|
|
|
// Validate semantic queries against the populated index.
|
|
qs := defaultQueries()
|
|
if *queries != "default" {
|
|
qs = strings.Split(*queries, ";")
|
|
}
|
|
for _, q := range qs {
|
|
runQuery(hc, *gateway, q)
|
|
}
|
|
}
|
|
|
|
// defaultQueries returns the built-in semantic probe set used when the
// -queries flag is left at "default". Each entry is a realistic staffing
// search whose top hits a human can eyeball for relevance.
func defaultQueries() []string {
	qs := make([]string, 0, 5)
	qs = append(qs,
		"CNC operator with first article and gauge R&R experience",
		"forklift driver OSHA-30 certified warehouse",
		"warehouse picker night shift bilingual",
		"dental hygienist three years experience",
		"electrician with industrial wiring background",
	)
	return qs
}
|
|
|
|
// runQuery embeds a query, searches the index, prints top hits.
|
|
// Stays in this driver (not corpusingest) — query validation is
|
|
// per-corpus concern, not part of the ingest pipeline.
|
|
func runQuery(hc *http.Client, gateway, q string) {
|
|
t0 := time.Now()
|
|
body, _ := json.Marshal(map[string]any{"texts": []string{q}})
|
|
req, _ := http.NewRequest(http.MethodPost, gateway+"/v1/embed", bytes.NewReader(body))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
fmt.Printf("[sc] query %q: embed err: %v\n", q, err)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
var er struct {
|
|
Vectors [][]float32 `json:"vectors"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&er); err != nil || len(er.Vectors) == 0 {
|
|
fmt.Printf("[sc] query %q: embed decode err: %v\n", q, err)
|
|
return
|
|
}
|
|
embedDur := time.Since(t0)
|
|
|
|
t1 := time.Now()
|
|
body, _ = json.Marshal(map[string]any{"vector": er.Vectors[0], "k": 5})
|
|
req, _ = http.NewRequest(http.MethodPost,
|
|
gateway+"/v1/vectors/index/"+indexName+"/search", bytes.NewReader(body))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err = hc.Do(req)
|
|
if err != nil {
|
|
fmt.Printf("[sc] query %q: search err: %v\n", q, err)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
searchDur := time.Since(t1)
|
|
var sr struct {
|
|
Results []struct {
|
|
ID string `json:"id"`
|
|
Distance float32 `json:"distance"`
|
|
Metadata json.RawMessage `json:"metadata"`
|
|
} `json:"results"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&sr); err != nil {
|
|
fmt.Printf("[sc] query %q: decode err: %v\n", q, err)
|
|
return
|
|
}
|
|
fmt.Printf("\n[sc] %q (embed=%v search=%v)\n", q, embedDur.Round(time.Millisecond), searchDur.Round(time.Millisecond))
|
|
for i, r := range sr.Results {
|
|
fmt.Printf(" %d. %s d=%.4f %s\n", i+1, r.ID, r.Distance, string(r.Metadata))
|
|
}
|
|
}
|
|
|