From 7f4208952105967b46364f5d9c706eb0418785a6 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Apr 2026 19:58:39 -0500 Subject: [PATCH] =?UTF-8?q?D:=20embed-text=20iteration=20=E2=80=94=20clean?= =?UTF-8?q?=20negative=20finding=20(3=20variants=20tested)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers driver embed text reverted to V0 after testing 3 variants on the "Forklift operator with OSHA-30 certification, warehouse experience" reality-test query against 5000 workers (which contains 569 actual Forklift Operators per the 31b4088 probe). V0 (current, restored): "Worker role: <role>. Skills: ... Certifications: ... " → 6 workers in top-8, 0 Forklift Ops, top distance 0.327, top role "Production Worker" V4a (role-doubled): "<role>. <role> with <skills/certs>. ..." drop archetype + resume_text → 6 workers in top-8, 0 Forklift Ops, top distance 0.254, top role "Production Worker" V4b (resume-only): just the resume_text natural-language sentence, no structured prefix → 4 workers in top-8 (WORSE mix — software-engineer candidates filled the displaced slots), 0 Forklift Ops, top distance 0.379 Conclusion: all three variants surface Production Workers / Machine Operators / Line Leads ABOVE Forklift Operators for this query. The 569 actual Forklift Operators in the 5000-row sample don't appear in any top-8. Embed-text design isn't the bottleneck — nomic-embed-text 137M's geometry doesn't separate "Forklift Operator" from "Production Worker" / "Machine Operator" / "Line Lead" in this query's neighborhood. Real fixes belong elsewhere: - Hybrid SQL+semantic (B): pre-filter by role/certs via queryd before semantic ranking. Addresses the gap directly. - Different embedding model: mxbai-embed-large or a staffing-fine-tuned model. Costs an Ollama model swap + re-embedding. - Playbook boost (component 5, already shipped): record successful Forklift placements; future queries surface those workers via similarity. Compounds with use. 
V0 restored because it has the best worker/candidate mix in top-8 (6 vs 4 in V4b), preserving the multi-corpus reality-test signal quality even if the role match is imperfect. Comments updated to record the experiment so future sessions don't relitigate. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/staffing_workers/main.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/scripts/staffing_workers/main.go b/scripts/staffing_workers/main.go index 9f0c57d..50eba1e 100644 --- a/scripts/staffing_workers/main.go +++ b/scripts/staffing_workers/main.go @@ -215,10 +215,21 @@ func (s *workersSource) Next() (corpusingest.Row, error) { archetype := s.cols.archetype.At(i) resume := s.cols.resume.At(i) - // Embed text: role first (most semantically dense for staffing - // queries), then skills + certs, then location, archetype, finally - // the prose resume. Same ordering rationale as the candidates - // driver and the original 500K driver. + // Embed text — restored to V0 after 2026-04-29 D experiment. + // Three variants tested on a query of "Forklift operator with + // OSHA-30 certification, warehouse experience": + // V0 (this): structured "Worker role: ... Skills: ... " + // → 6 workers in top-8, 0 Forklift, top dist 0.327 + // V4a (drop): drop labels + resume + archetype, double the role + // → 6 workers in top-8, 0 Forklift, top dist 0.254 + // V4b (resume only): just resume_text, no structured prefix + // → 4 workers in top-8 (worse mix), 0 Forklift, top 0.379 + // All three surfaced Production Workers / Machine Operators / + // Line Leads above actual Forklift Operators. Conclusion: the + // bottleneck is nomic-embed-text 137M's geometry, not text + // design. Real fixes belong elsewhere — hybrid SQL+semantic + // (B in next-step menu) or playbook boost (component 5, + // already shipped). V0 keeps the best worker/candidate mix. 
var b strings.Builder b.WriteString("Worker role: ") b.WriteString(role) @@ -234,10 +245,11 @@ func (s *workersSource) Next() (corpusingest.Row, error) { b.WriteString(archetype) b.WriteString(". ") b.WriteString(resume) + text := b.String() return corpusingest.Row{ ID: fmt.Sprintf("w-%d", workerID), - Text: b.String(), + Text: text, Metadata: map[string]any{ "worker_id": workerID, "name": name,