Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11
24
data/_catalog/views/candidates_safe.json
Normal file
24
data/_catalog/views/candidates_safe.json
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"name": "candidates_safe",
|
||||||
|
"base_dataset": "candidates",
|
||||||
|
"columns": [
|
||||||
|
"candidate_id",
|
||||||
|
"first_name",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"skills",
|
||||||
|
"years_experience",
|
||||||
|
"status"
|
||||||
|
],
|
||||||
|
"row_filter": "status != 'blocked'",
|
||||||
|
"column_redactions": {
|
||||||
|
"candidate_id": {
|
||||||
|
"kind": "mask",
|
||||||
|
"keep_prefix": 3,
|
||||||
|
"keep_suffix": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "PII-free candidate projection — drops last_name, email, phone, hourly_rate_usd. candidate_id masked (keep first 3, last 2). Visible to recruiter / mode-runner agents."
|
||||||
|
}
|
||||||
26
data/_catalog/views/jobs_safe.json
Normal file
26
data/_catalog/views/jobs_safe.json
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"name": "jobs_safe",
|
||||||
|
"base_dataset": "job_orders",
|
||||||
|
"columns": [
|
||||||
|
"job_order_id",
|
||||||
|
"client_id",
|
||||||
|
"title",
|
||||||
|
"vertical",
|
||||||
|
"status",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"zip",
|
||||||
|
"bill_rate",
|
||||||
|
"pay_rate"
|
||||||
|
],
|
||||||
|
"column_redactions": {
|
||||||
|
"client_id": {
|
||||||
|
"kind": "mask",
|
||||||
|
"keep_prefix": 3,
|
||||||
|
"keep_suffix": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "Job-order projection with client_id masked. Drops description (often quotes client names verbatim, no text-scrubber available). bill_rate / pay_rate kept — commercial info, not PII per staffing PRD."
|
||||||
|
}
|
||||||
22
data/_catalog/views/workers_safe.json
Normal file
22
data/_catalog/views/workers_safe.json
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "workers_safe",
|
||||||
|
"base_dataset": "workers_500k",
|
||||||
|
"columns": [
|
||||||
|
"worker_id",
|
||||||
|
"role",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"skills",
|
||||||
|
"certifications",
|
||||||
|
"archetype",
|
||||||
|
"reliability",
|
||||||
|
"responsiveness",
|
||||||
|
"engagement",
|
||||||
|
"compliance",
|
||||||
|
"availability"
|
||||||
|
],
|
||||||
|
"column_redactions": {},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "PII-free worker projection — drops name, email, phone, zip, communications, resume_text. resume_text + communications carry verbatim PII (full names) and there's no in-view text scrubber, so they're dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. Source for workers_500k_v9 vector corpus rebuild."
|
||||||
|
}
|
||||||
53
scripts/staffing/build_workers_v9.sh
Executable file
53
scripts/staffing/build_workers_v9.sh
Executable file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
# build_workers_v9.sh — Decision B (corpus rebuild side).
#
# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather
# than the raw workers_500k table. Closes the PII enforcement gap
# (verified 2026-04-27 that v8 was built directly from raw — LLM saw
# names/emails/phones/resume_text for every staffing query).
#
# Run as a background job — embedding 500K chunks took ~4 min for v8
# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this.
#
# Usage:
#   ./scripts/staffing/build_workers_v9.sh
#   LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh
#
# Environment:
#   LH_GATEWAY  Base URL of the gateway (default: http://localhost:3100).
#
# Exit status:
#   0         the gateway answered the POST with an HTTP 2xx status
#   non-zero  curl itself failed (its exit code is propagated by set -e),
#             or the gateway answered with a non-2xx status (exit 1)
#
# After it completes:
#   - Verify via: curl "$LH_GATEWAY/vectors/indexes/workers_500k_v9" | jq
#   - Flip config/modes.toml `staffing_inference` matrix_corpus to v9
#   - Restart gateway to pick up the modes.toml change

set -euo pipefail

GATEWAY="${LH_GATEWAY:-http://localhost:3100}"
# Drop a single trailing slash so the URLs below never contain "//".
GATEWAY="${GATEWAY%/}"

# The /vectors/index endpoint accepts {name, sql, embed_model, ...}.
# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json)
# so the embedded text never contains raw PII by construction.
#
# Concatenated text is what gets embedded — keep it short enough that
# 500K rows × N chunks fits in disk + memory budgets but still carries
# the match signal (role, location, skills, scores).

# Quoted heredoc delimiter: the JSON is taken literally, no shell expansion.
BODY=$(cat <<'JSON'
{
  "name": "workers_500k_v9",
  "sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe",
  "embed_model": "nomic-embed-text",
  "chunk_size": 500,
  "overlap": 50,
  "source_dataset": "workers_safe",
  "bucket": "primary"
}
JSON
)

echo "POSTing /vectors/index → workers_500k_v9 (background job)..."

# curl exits 0 on HTTP 4xx/5xx unless told otherwise, so append the status
# code on its own final line and inspect it. A transport failure (connection
# refused, DNS, ...) makes curl exit non-zero, which aborts via set -e.
response=$(
  curl -sS -X POST "${GATEWAY}/vectors/index" \
    -H 'content-type: application/json' \
    -d "$BODY" \
    -w '\n%{http_code}'
)
http_code="${response##*$'\n'}"
payload="${response%$'\n'*}"

if [[ "$http_code" != 2?? ]]; then
  # Surface the gateway's error body so the failure is diagnosable.
  printf '%s\n' "$payload" >&2
  printf 'error: POST %s/vectors/index returned HTTP %s; job NOT started\n' \
    "$GATEWAY" "$http_code" >&2
  exit 1
fi

printf '%s\n' "$payload"
echo "Job started. Monitor progress:"
echo "  curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq"
echo "  watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"
|
||||||
Loading…
x
Reference in New Issue
Block a user