diff --git a/scripts/staffing_workers/main.go b/scripts/staffing_workers/main.go index 9f0c57d..50eba1e 100644 --- a/scripts/staffing_workers/main.go +++ b/scripts/staffing_workers/main.go @@ -215,10 +215,21 @@ func (s *workersSource) Next() (corpusingest.Row, error) { archetype := s.cols.archetype.At(i) resume := s.cols.resume.At(i) - // Embed text: role first (most semantically dense for staffing - // queries), then skills + certs, then location, archetype, finally - // the prose resume. Same ordering rationale as the candidates - // driver and the original 500K driver. + // Embed text — restored to V0 after 2026-04-29 D experiment. + // Three variants tested on a query of "Forklift operator with + // OSHA-30 certification, warehouse experience": + // V0 (this): structured "Worker role: ... Skills: ... " + // → 6 workers in top-8, 0 Forklift, top dist 0.327 + // V4a (drop): drop labels + resume + archetype, double the role + // → 6 workers in top-8, 0 Forklift, top dist 0.254 + // V4b (resume only): just resume_text, no structured prefix + // → 4 workers in top-8 (worse mix), 0 Forklift, top dist 0.379 + // All three surfaced Production Workers / Machine Operators / + // Line Leads above actual Forklift Operators. Conclusion: the + // bottleneck is nomic-embed-text 137M's geometry, not text + // design. Real fixes belong elsewhere — hybrid SQL+semantic + // (B in next-step menu) or playbook boost (component 5, + // already shipped). V0 keeps the best worker/candidate mix. var b strings.Builder b.WriteString("Worker role: ") b.WriteString(role) @@ -234,10 +245,11 @@ func (s *workersSource) Next() (corpusingest.Row, error) { b.WriteString(archetype) b.WriteString(". ") b.WriteString(resume) + text := b.String() return corpusingest.Row{ ID: fmt.Sprintf("w-%d", workerID), - Text: b.String(), + Text: text, Metadata: map[string]any{ "worker_id": workerID, "name": name,