From 7f4208952105967b46364f5d9c706eb0418785a6 Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Wed, 29 Apr 2026 19:58:39 -0500
Subject: [PATCH] =?UTF-8?q?D:=20embed-text=20iteration=20=E2=80=94=20clean?=
 =?UTF-8?q?=20negative=20finding=20(3=20variants=20tested)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workers driver embed text reverted to V0 after testing 3 variants
on the "Forklift operator with OSHA-30 certification, warehouse
experience" reality-test query against a 5000-worker sample (which
contains 569 actual Forklift Operators per the 31b4088 probe).

  V0 (current, restored):  "Worker role: <role>. Skills: ...
                            Certifications: ... <resume_text>"
                           → 6 workers in top-8, 0 Forklift Ops,
                             top distance 0.327, top role
                             "Production Worker"
  V4a (role-doubled):      "<role>. <role> with <skills>. ..."
                           drop archetype + resume_text
                           → 6 workers in top-8, 0 Forklift Ops,
                             top distance 0.254, top role
                             "Production Worker"
  V4b (resume-only):       just the resume_text natural-language
                           sentence, no structured prefix
                           → 4 workers in top-8 (WORSE mix —
                             software-engineer candidates filled
                             the displaced slots), 0 Forklift Ops,
                             top distance 0.379

Conclusion: all three variants surface Production Workers / Machine
Operators / Line Leads ABOVE Forklift Operators for this query.
The 569 actual Forklift Operators in the 5000-row sample don't
appear in any top-8. Embed-text design isn't the bottleneck —
nomic-embed-text 137M's geometry doesn't separate "Forklift
Operator" from "Production Worker" / "Machine Operator" / "Line
Lead" in this query's neighborhood.

Real fixes belong elsewhere:
  - Hybrid SQL+semantic (B): pre-filter by role/certs via queryd
    before semantic ranking. Addresses the gap directly.
  - Different embedding model: mxbai-embed-large or a staffing-
    fine-tuned model. Costs an Ollama model swap + re-embedding.
  - Playbook boost (component 5, already shipped): record
    successful Forklift placements; future queries surface those
    workers via similarity. Compounds with use.

V0 restored because it is the pre-experiment baseline and ties V4a
for the best worker/candidate mix in top-8 (6, vs 4 in V4b),
preserving the multi-corpus reality-test signal quality even if the
role match is imperfect. Comments updated to record the experiment
so future sessions don't relitigate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/staffing_workers/main.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/scripts/staffing_workers/main.go b/scripts/staffing_workers/main.go
index 9f0c57d..50eba1e 100644
--- a/scripts/staffing_workers/main.go
+++ b/scripts/staffing_workers/main.go
@@ -215,10 +215,21 @@ func (s *workersSource) Next() (corpusingest.Row, error) {
 	archetype := s.cols.archetype.At(i)
 	resume := s.cols.resume.At(i)
 
-	// Embed text: role first (most semantically dense for staffing
-	// queries), then skills + certs, then location, archetype, finally
-	// the prose resume. Same ordering rationale as the candidates
-	// driver and the original 500K driver.
+	// Embed text — restored to V0 after 2026-04-29 D experiment.
+	// Three variants tested on a query of "Forklift operator with
+	// OSHA-30 certification, warehouse experience":
+	//   V0 (this):   structured "Worker role: ... Skills: ... <resume_text>"
+	//                → 6 workers in top-8, 0 Forklift, top dist 0.327
+	//   V4a (role-doubled): drop labels + resume + archetype, double the role
+	//                → 6 workers in top-8, 0 Forklift, top dist 0.254
+	//   V4b (resume only): just resume_text, no structured prefix
+	//                → 4 workers in top-8 (worse mix), 0 Forklift, top dist 0.379
+	// All three surfaced Production Workers / Machine Operators /
+	// Line Leads above actual Forklift Operators. Conclusion: the
+	// bottleneck is nomic-embed-text 137M's geometry, not text
+	// design. Real fixes belong elsewhere — hybrid SQL+semantic
+	// (B in next-step menu) or playbook boost (component 5,
+	// already shipped). V0 keeps the best worker/candidate mix.
 	var b strings.Builder
 	b.WriteString("Worker role: ")
 	b.WriteString(role)
@@ -234,10 +245,11 @@ func (s *workersSource) Next() (corpusingest.Row, error) {
 	b.WriteString(archetype)
 	b.WriteString(". ")
 	b.WriteString(resume)
+	text := b.String()
 
 	return corpusingest.Row{
 		ID:   fmt.Sprintf("w-%d", workerID),
-		Text: b.String(),
+		Text: text,
 		Metadata: map[string]any{
 			"worker_id":      workerID,
 			"name":           name,