#!/usr/bin/env bash
# build_workers_v9.sh — Decision B (corpus rebuild side).
#
# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather
# than the raw workers_500k table. Closes the PII enforcement gap
# (verified 2026-04-27 that v8 was built directly from raw — LLM saw
# names/emails/phones/resume_text for every staffing query).
#
# Run as a background job — embedding 500K chunks took ~4 min for v8
# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this.
#
# Usage:
#   ./scripts/staffing/build_workers_v9.sh
#   LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh
#
# Environment:
#   LH_GATEWAY  Base URL of the gateway (default: http://localhost:3100).
#
# Exit status:
#   0         the gateway answered the POST with a 2xx status
#   1         the gateway answered with a non-2xx status
#   (other)   curl's own exit code when the request could not be completed
#
# After it completes:
#   - Verify via: curl /vectors/indexes/workers_500k_v9 | jq
#   - Flip config/modes.toml `staffing_inference` matrix_corpus to v9
#   - Restart gateway to pick up the modes.toml change

set -euo pipefail

readonly GATEWAY="${LH_GATEWAY:-http://localhost:3100}"

# The /vectors/index endpoint accepts {name, sql, embed_model, ...}.
# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json)
# so the embedded text never contains raw PII by construction.
#
# Concatenated text is what gets embedded — keep it short enough that
# 500K rows × N chunks fits in disk + memory budgets but still carries
# the match signal (role, location, skills, scores).
#
# The heredoc delimiter is quoted so nothing inside the JSON is expanded
# by the shell.
BODY=$(cat <<'JSON'
{
  "name": "workers_500k_v9",
  "sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe",
  "embed_model": "nomic-embed-text",
  "chunk_size": 500,
  "overlap": 50,
  "source_dataset": "workers_safe",
  "bucket": "primary"
}
JSON
)
readonly BODY

echo "POSTing /vectors/index → workers_500k_v9 (background job)..."

# curl exits 0 on HTTP 4xx/5xx unless told otherwise, so the status code is
# appended on its own final line via --write-out and checked explicitly.
# Without this check a rejected request would still be reported as
# "Job started".
curl_rc=0
response=$(
  curl -sS -X POST "${GATEWAY}/vectors/index" \
    -H 'content-type: application/json' \
    -d "$BODY" \
    -w '\n%{http_code}'
) || curl_rc=$?

# Split the captured output: everything after the last newline is the
# status code, everything before it is the response body.
http_code=${response##*$'\n'}
response_body=${response%$'\n'*}

# Always show whatever the gateway returned; on failure it usually carries
# the error detail.
printf '%s' "$response_body"
echo

if ((curl_rc != 0)); then
  echo "ERROR: request to ${GATEWAY}/vectors/index failed (curl exit ${curl_rc})." >&2
  exit "$curl_rc"
fi

if [[ "$http_code" != 2?? ]]; then
  echo "ERROR: ${GATEWAY}/vectors/index returned HTTP ${http_code}; job was not started." >&2
  exit 1
fi

echo "Job started. Monitor progress:"
echo " curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq"
echo " watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"