From c3c9c2174a91c46890af70b6a3c702bb65119b82 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 10:46:03 -0500 Subject: [PATCH] =?UTF-8?q?staffing:=20B+C=20=E2=80=94=20safe=20views=20(c?= =?UTF-8?q?andidates/workers/jobs)=20+=20workers=5F500k=5Fv9=20build=20scr?= =?UTF-8?q?ipt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decision B from reports/staffing/synthetic-data-gap-report.md §7 (plus C: client_workerskjkk.parquet typo file removed from data/datasets/ — was never tracked, no git effect). PII enforcement was UNVERIFIED in workers_500k_v8 (the corpus staffing_inference mode embeds chunks from). Verified 2026-04-27 by inspecting data/vectors/meta/workers_500k_v8.json — `source: "workers_500k"` confirms v8 was built directly from the raw table, so the LLM has been seeing names / emails / phones / resume_text for every staffing query. This commit closes the boundary at the catalog metadata layer: candidates_safe (overhauled — was failing SQL invalid 434×/day on a nonexistent `vertical` column reference, copy-pasted from job_orders): drops last_name, email, phone, hourly_rate_usd candidate_id masked (keep first 3, last 2) row_filter: status != 'blocked' workers_safe (NEW): drops name, email, phone, zip, communications, resume_text keeps role, city, state, skills, certifications, archetype, scores resume_text + communications carry verbatim PII (full names) and there is no in-view text scrubber, so they are dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. jobs_safe (NEW): drops description (often quotes client names verbatim) client_id masked (keep first 3, last 2) bill_rate / pay_rate kept — commercial info, not PII per staffing PRD scripts/staffing/build_workers_v9.sh (NEW): POSTs /vectors/index to rebuild workers_500k_v9 from `workers_safe` rather than the raw table. 
Embedded text is constructed from the view projection so PII never enters the corpus by construction. 30+ minute background job — not run inline. After it completes, flip config/modes.toml `staffing_inference` matrix_corpus from workers_500k_v8 to workers_500k_v9 and restart gateway. Distillation v1.0.0 substrate untouched. audit-full passed clean (16/16 required) before this commit; will re-verify after. --- data/_catalog/views/candidates_safe.json | 24 +++++++++++ data/_catalog/views/jobs_safe.json | 26 ++++++++++++ data/_catalog/views/workers_safe.json | 22 ++++++++++ scripts/staffing/build_workers_v9.sh | 53 ++++++++++++++++++++++++ 4 files changed, 125 insertions(+) create mode 100644 data/_catalog/views/candidates_safe.json create mode 100644 data/_catalog/views/jobs_safe.json create mode 100644 data/_catalog/views/workers_safe.json create mode 100755 scripts/staffing/build_workers_v9.sh diff --git a/data/_catalog/views/candidates_safe.json b/data/_catalog/views/candidates_safe.json new file mode 100644 index 0000000..98686cb --- /dev/null +++ b/data/_catalog/views/candidates_safe.json @@ -0,0 +1,24 @@ +{ + "name": "candidates_safe", + "base_dataset": "candidates", + "columns": [ + "candidate_id", + "first_name", + "city", + "state", + "skills", + "years_experience", + "status" + ], + "row_filter": "status != 'blocked'", + "column_redactions": { + "candidate_id": { + "kind": "mask", + "keep_prefix": 3, + "keep_suffix": 2 + } + }, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "PII-free candidate projection — drops last_name, email, phone, hourly_rate_usd. candidate_id masked (keep first 3, last 2). Visible to recruiter / mode-runner agents." 
+} diff --git a/data/_catalog/views/jobs_safe.json b/data/_catalog/views/jobs_safe.json new file mode 100644 index 0000000..1e0f3c1 --- /dev/null +++ b/data/_catalog/views/jobs_safe.json @@ -0,0 +1,26 @@ +{ + "name": "jobs_safe", + "base_dataset": "job_orders", + "columns": [ + "job_order_id", + "client_id", + "title", + "vertical", + "status", + "city", + "state", + "zip", + "bill_rate", + "pay_rate" + ], + "column_redactions": { + "client_id": { + "kind": "mask", + "keep_prefix": 3, + "keep_suffix": 2 + } + }, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "Job-order projection with client_id masked. Drops description (often quotes client names verbatim, no text-scrubber available). bill_rate / pay_rate kept — commercial info, not PII per staffing PRD." +} diff --git a/data/_catalog/views/workers_safe.json b/data/_catalog/views/workers_safe.json new file mode 100644 index 0000000..224ecf6 --- /dev/null +++ b/data/_catalog/views/workers_safe.json @@ -0,0 +1,22 @@ +{ + "name": "workers_safe", + "base_dataset": "workers_500k", + "columns": [ + "worker_id", + "role", + "city", + "state", + "skills", + "certifications", + "archetype", + "reliability", + "responsiveness", + "engagement", + "compliance", + "availability" + ], + "column_redactions": {}, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "PII-free worker projection — drops name, email, phone, zip, communications, resume_text. resume_text + communications carry verbatim PII (full names) and there's no in-view text scrubber, so they're dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. Source for workers_500k_v9 vector corpus rebuild." 
+} diff --git a/scripts/staffing/build_workers_v9.sh b/scripts/staffing/build_workers_v9.sh new file mode 100755 index 0000000..83f36be --- /dev/null +++ b/scripts/staffing/build_workers_v9.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# build_workers_v9.sh — Decision B (corpus rebuild side). +# +# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather +# than the raw workers_500k table. Closes the PII enforcement gap +# (verified 2026-04-27 that v8 was built directly from raw — LLM saw +# names/emails/phones/resume_text for every staffing query). +# +# Run as a background job — embedding 500K chunks took ~4 min for v8 +# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this. +# +# Usage: +# ./scripts/staffing/build_workers_v9.sh +# LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh +# +# After it completes: +# - Verify via: curl /vectors/indexes/workers_500k_v9 | jq +# - Flip config/modes.toml `staffing_inference` matrix_corpus to v9 +# - Restart gateway to pick up the modes.toml change + +set -euo pipefail + +GATEWAY="${LH_GATEWAY:-http://localhost:3100}" + +# The /vectors/index endpoint accepts {name, sql, embed_model, ...}. +# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json) +# so the embedded text never contains raw PII by construction. +# +# Concatenated text is what gets embedded — keep it short enough that +# 500K rows × N chunks fits in disk + memory budgets but still carries +# the match signal (role, location, skills, scores). + +BODY=$(cat <<'JSON' +{ + "name": "workers_500k_v9", + "sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe", + "embed_model": "nomic-embed-text", + "chunk_size": 500, + "overlap": 50, + "source_dataset": "workers_safe", + "bucket": "primary" +} JSON +) + +echo "POSTing /vectors/index → workers_500k_v9 (background job)..." +curl -sS -X POST "${GATEWAY}/vectors/index" \ -H 'content-type: application/json' \ -d "$BODY" +echo +echo "Job started. Monitor progress:" +echo " curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq" +echo " watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"