diff --git a/data/_catalog/views/candidates_safe.json b/data/_catalog/views/candidates_safe.json
new file mode 100644
index 0000000..98686cb
--- /dev/null
+++ b/data/_catalog/views/candidates_safe.json
@@ -0,0 +1,24 @@
+{
+  "name": "candidates_safe",
+  "base_dataset": "candidates",
+  "columns": [
+    "candidate_id",
+    "first_name",
+    "city",
+    "state",
+    "skills",
+    "years_experience",
+    "status"
+  ],
+  "row_filter": "status != 'blocked'",
+  "column_redactions": {
+    "candidate_id": {
+      "kind": "mask",
+      "keep_prefix": 3,
+      "keep_suffix": 2
+    }
+  },
+  "created_at": "2026-04-27T15:42:00Z",
+  "created_by": "j",
+  "description": "PII-free candidate projection — drops last_name, email, phone, hourly_rate_usd. candidate_id masked (keep first 3, last 2). Visible to recruiter / mode-runner agents."
+}
diff --git a/data/_catalog/views/jobs_safe.json b/data/_catalog/views/jobs_safe.json
new file mode 100644
index 0000000..1e0f3c1
--- /dev/null
+++ b/data/_catalog/views/jobs_safe.json
@@ -0,0 +1,26 @@
+{
+  "name": "jobs_safe",
+  "base_dataset": "job_orders",
+  "columns": [
+    "job_order_id",
+    "client_id",
+    "title",
+    "vertical",
+    "status",
+    "city",
+    "state",
+    "zip",
+    "bill_rate",
+    "pay_rate"
+  ],
+  "column_redactions": {
+    "client_id": {
+      "kind": "mask",
+      "keep_prefix": 3,
+      "keep_suffix": 2
+    }
+  },
+  "created_at": "2026-04-27T15:42:00Z",
+  "created_by": "j",
+  "description": "Job-order projection with client_id masked. Drops description (often quotes client names verbatim, no text-scrubber available). bill_rate / pay_rate kept — commercial info, not PII per staffing PRD."
+}
diff --git a/data/_catalog/views/workers_safe.json b/data/_catalog/views/workers_safe.json
new file mode 100644
index 0000000..224ecf6
--- /dev/null
+++ b/data/_catalog/views/workers_safe.json
@@ -0,0 +1,22 @@
+{
+  "name": "workers_safe",
+  "base_dataset": "workers_500k",
+  "columns": [
+    "worker_id",
+    "role",
+    "city",
+    "state",
+    "skills",
+    "certifications",
+    "archetype",
+    "reliability",
+    "responsiveness",
+    "engagement",
+    "compliance",
+    "availability"
+  ],
+  "column_redactions": {},
+  "created_at": "2026-04-27T15:42:00Z",
+  "created_by": "j",
+  "description": "PII-free worker projection — drops name, email, phone, zip, communications, resume_text. resume_text + communications carry verbatim PII (full names) and there's no in-view text scrubber, so they're dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. Source for workers_500k_v9 vector corpus rebuild."
+}
diff --git a/scripts/staffing/build_workers_v9.sh b/scripts/staffing/build_workers_v9.sh
new file mode 100755
index 0000000..83f36be
--- /dev/null
+++ b/scripts/staffing/build_workers_v9.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# build_workers_v9.sh — Decision B (corpus rebuild side).
+#
+# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather
+# than the raw workers_500k table. Closes the PII enforcement gap
+# (verified 2026-04-27 that v8 was built directly from raw — LLM saw
+# names/emails/phones/resume_text for every staffing query).
+#
+# Run as a background job — embedding 500K chunks took ~4 min for v8
+# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this.
+#
+# Usage:
+#   ./scripts/staffing/build_workers_v9.sh
+#   LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh
+#
+# After it completes:
+#   - Verify via: curl /vectors/indexes/workers_500k_v9 | jq
+#   - Flip config/modes.toml `staffing_inference` matrix_corpus to v9
+#   - Restart gateway to pick up the modes.toml change
+
+set -euo pipefail
+
+GATEWAY="${LH_GATEWAY:-http://localhost:3100}"
+
+# The /vectors/index endpoint accepts {name, sql, embed_model, ...}.
+# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json)
+# so the embedded text never contained raw PII by construction.
+#
+# Concatenated text is what gets embedded — keep it short enough that
+# 500K rows × N chunks fits in disk + memory budgets but still carries
+# the match signal (role, location, skills, scores).
+
+BODY=$(cat <<'JSON'
+{
+  "name": "workers_500k_v9",
+  "sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe",
+  "embed_model": "nomic-embed-text",
+  "chunk_size": 500,
+  "overlap": 50,
+  "source_dataset": "workers_safe",
+  "bucket": "primary"
+}
+JSON
+)
+
+echo "POSTing /vectors/index → workers_500k_v9 (background job)..."
+# --fail makes curl exit non-zero on HTTP 4xx/5xx so `set -e` aborts
+# here instead of printing "Job started." after a rejected request.
+curl -sS --fail -X POST "${GATEWAY}/vectors/index" \
+  -H 'content-type: application/json' \
+  -d "$BODY"
+echo
+echo "Job started. Monitor progress:"
+echo " curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq"
+echo " watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"