From c3c9c2174a91c46890af70b6a3c702bb65119b82 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 10:46:03 -0500 Subject: [PATCH] =?UTF-8?q?staffing:=20B+C=20=E2=80=94=20safe=20views=20(c?= =?UTF-8?q?andidates/workers/jobs)=20+=20workers=5F500k=5Fv9=20build=20scr?= =?UTF-8?q?ipt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decision B from reports/staffing/synthetic-data-gap-report.md §7 (plus C: client_workerskjkk.parquet typo file removed from data/datasets/ — was never tracked, no git effect). PII enforcement was UNVERIFIED in workers_500k_v8 (the corpus staffing_inference mode embeds chunks from). Verified 2026-04-27 by inspecting data/vectors/meta/workers_500k_v8.json — `source: "workers_500k"` confirms v8 was built directly from the raw table, so the LLM has been seeing names / emails / phones / resume_text for every staffing query. This commit closes the boundary at the catalog metadata layer: candidates_safe (overhauled — was failing SQL invalid 434×/day on a nonexistent `vertical` column reference, copy-pasted from job_orders): drops last_name, email, phone, hourly_rate_usd candidate_id masked (keep first 3, last 2) row_filter: status != 'blocked' workers_safe (NEW): drops name, email, phone, zip, communications, resume_text keeps role, city, state, skills, certifications, archetype, scores resume_text + communications carry verbatim PII (full names) and there is no in-view text scrubber, so they are dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. jobs_safe (NEW): drops description (often quotes client names verbatim) client_id masked (keep first 3, last 2) bill_rate / pay_rate kept — commercial info, not PII per staffing PRD scripts/staffing/build_workers_v9.sh (NEW): POSTs /vectors/index to rebuild workers_500k_v9 from `workers_safe` rather than the raw table. 
Embedded text is constructed from the view projection so PII never enters the corpus by construction. 30+ minute background job — not run inline. After it completes, flip config/modes.toml `staffing_inference` matrix_corpus from workers_500k_v8 to workers_500k_v9 and restart gateway. Distillation v1.0.0 substrate untouched. audit-full passed clean (16/16 required) before this commit; will re-verify after. --- data/_catalog/views/candidates_safe.json | 24 +++++++++++ data/_catalog/views/jobs_safe.json | 26 ++++++++++++ data/_catalog/views/workers_safe.json | 22 ++++++++++ scripts/staffing/build_workers_v9.sh | 53 ++++++++++++++++++++++++ 4 files changed, 125 insertions(+) create mode 100644 data/_catalog/views/candidates_safe.json create mode 100644 data/_catalog/views/jobs_safe.json create mode 100644 data/_catalog/views/workers_safe.json create mode 100755 scripts/staffing/build_workers_v9.sh diff --git a/data/_catalog/views/candidates_safe.json b/data/_catalog/views/candidates_safe.json new file mode 100644 index 0000000..98686cb --- /dev/null +++ b/data/_catalog/views/candidates_safe.json @@ -0,0 +1,24 @@ +{ + "name": "candidates_safe", + "base_dataset": "candidates", + "columns": [ + "candidate_id", + "first_name", + "city", + "state", + "skills", + "years_experience", + "status" + ], + "row_filter": "status != 'blocked'", + "column_redactions": { + "candidate_id": { + "kind": "mask", + "keep_prefix": 3, + "keep_suffix": 2 + } + }, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "PII-free candidate projection — drops last_name, email, phone, hourly_rate_usd. candidate_id masked (keep first 3, last 2). Visible to recruiter / mode-runner agents." 
+} diff --git a/data/_catalog/views/jobs_safe.json b/data/_catalog/views/jobs_safe.json new file mode 100644 index 0000000..1e0f3c1 --- /dev/null +++ b/data/_catalog/views/jobs_safe.json @@ -0,0 +1,26 @@ +{ + "name": "jobs_safe", + "base_dataset": "job_orders", + "columns": [ + "job_order_id", + "client_id", + "title", + "vertical", + "status", + "city", + "state", + "zip", + "bill_rate", + "pay_rate" + ], + "column_redactions": { + "client_id": { + "kind": "mask", + "keep_prefix": 3, + "keep_suffix": 2 + } + }, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "Job-order projection with client_id masked. Drops description (often quotes client names verbatim, no text-scrubber available). bill_rate / pay_rate kept — commercial info, not PII per staffing PRD." +} diff --git a/data/_catalog/views/workers_safe.json b/data/_catalog/views/workers_safe.json new file mode 100644 index 0000000..224ecf6 --- /dev/null +++ b/data/_catalog/views/workers_safe.json @@ -0,0 +1,22 @@ +{ + "name": "workers_safe", + "base_dataset": "workers_500k", + "columns": [ + "worker_id", + "role", + "city", + "state", + "skills", + "certifications", + "archetype", + "reliability", + "responsiveness", + "engagement", + "compliance", + "availability" + ], + "column_redactions": {}, + "created_at": "2026-04-27T15:42:00Z", + "created_by": "j", + "description": "PII-free worker projection — drops name, email, phone, zip, communications, resume_text. resume_text + communications carry verbatim PII (full names) and there's no in-view text scrubber, so they're dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. Source for workers_500k_v9 vector corpus rebuild." 
+} diff --git a/scripts/staffing/build_workers_v9.sh b/scripts/staffing/build_workers_v9.sh new file mode 100755 index 0000000..83f36be --- /dev/null +++ b/scripts/staffing/build_workers_v9.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# build_workers_v9.sh — Decision B (corpus rebuild side). +# +# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather +# than the raw workers_500k table. Closes the PII enforcement gap +# (verified 2026-04-27 that v8 was built directly from raw — LLM saw +# names/emails/phones/resume_text for every staffing query). +# +# Run as a background job — embedding 500K chunks took ~4 min for v8 +# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this. +# +# Usage: +# ./scripts/staffing/build_workers_v9.sh +# LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh +# +# After it completes: +# - Verify via: curl /vectors/indexes/workers_500k_v9 | jq +# - Flip config/modes.toml `staffing_inference` matrix_corpus to v9 +# - Restart gateway to pick up the modes.toml change + +set -euo pipefail + +GATEWAY="${LH_GATEWAY:-http://localhost:3100}" + +# The /vectors/index endpoint accepts {name, sql, embed_model, ...}. +# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json) +# so the embedded text never contains raw PII by construction. +# +# Concatenated text is what gets embedded — keep it short enough that +# 500K rows × N chunks fits in disk + memory budgets but still carries +# the match signal (role, location, skills, scores). + +BODY=$(cat <<'JSON' +{ + "name": "workers_500k_v9", + "sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe", + "embed_model": "nomic-embed-text", + "chunk_size": 500, + "overlap": 50, + "source_dataset": "workers_safe", + "bucket": "primary" +} JSON +) + +echo "POSTing /vectors/index → workers_500k_v9 (background job)..." +curl -sS -X POST "${GATEWAY}/vectors/index" \ -H 'content-type: application/json' \ -d "$BODY" +echo +echo "Job started. Monitor progress:" +echo " curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq" +echo " watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"