Compare commits
3 Commits
ca7375ea2b
...
c3c9c2174a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c3c9c2174a | ||
|
|
940737daa7 | ||
|
|
d56f08e740 |
24
data/_catalog/views/candidates_safe.json
Normal file
24
data/_catalog/views/candidates_safe.json
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"name": "candidates_safe",
|
||||||
|
"base_dataset": "candidates",
|
||||||
|
"columns": [
|
||||||
|
"candidate_id",
|
||||||
|
"first_name",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"skills",
|
||||||
|
"years_experience",
|
||||||
|
"status"
|
||||||
|
],
|
||||||
|
"row_filter": "status != 'blocked'",
|
||||||
|
"column_redactions": {
|
||||||
|
"candidate_id": {
|
||||||
|
"kind": "mask",
|
||||||
|
"keep_prefix": 3,
|
||||||
|
"keep_suffix": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "PII-free candidate projection — drops last_name, email, phone, hourly_rate_usd. candidate_id masked (keep first 3, last 2). Visible to recruiter / mode-runner agents."
|
||||||
|
}
|
||||||
26
data/_catalog/views/jobs_safe.json
Normal file
26
data/_catalog/views/jobs_safe.json
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"name": "jobs_safe",
|
||||||
|
"base_dataset": "job_orders",
|
||||||
|
"columns": [
|
||||||
|
"job_order_id",
|
||||||
|
"client_id",
|
||||||
|
"title",
|
||||||
|
"vertical",
|
||||||
|
"status",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"zip",
|
||||||
|
"bill_rate",
|
||||||
|
"pay_rate"
|
||||||
|
],
|
||||||
|
"column_redactions": {
|
||||||
|
"client_id": {
|
||||||
|
"kind": "mask",
|
||||||
|
"keep_prefix": 3,
|
||||||
|
"keep_suffix": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "Job-order projection with client_id masked. Drops description (often quotes client names verbatim, no text-scrubber available). bill_rate / pay_rate kept — commercial info, not PII per staffing PRD."
|
||||||
|
}
|
||||||
22
data/_catalog/views/workers_safe.json
Normal file
22
data/_catalog/views/workers_safe.json
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "workers_safe",
|
||||||
|
"base_dataset": "workers_500k",
|
||||||
|
"columns": [
|
||||||
|
"worker_id",
|
||||||
|
"role",
|
||||||
|
"city",
|
||||||
|
"state",
|
||||||
|
"skills",
|
||||||
|
"certifications",
|
||||||
|
"archetype",
|
||||||
|
"reliability",
|
||||||
|
"responsiveness",
|
||||||
|
"engagement",
|
||||||
|
"compliance",
|
||||||
|
"availability"
|
||||||
|
],
|
||||||
|
"column_redactions": {},
|
||||||
|
"created_at": "2026-04-27T15:42:00Z",
|
||||||
|
"created_by": "j",
|
||||||
|
"description": "PII-free worker projection — drops name, email, phone, zip, communications, resume_text. resume_text + communications carry verbatim PII (full names) and there's no in-view text scrubber, so they're dropped wholesale. Skills + certifications + scores carry the matching signal for staffing inference. Source for workers_500k_v9 vector corpus rebuild."
|
||||||
|
}
|
||||||
BIN
data/datasets/fill_events.parquet
Normal file
BIN
data/datasets/fill_events.parquet
Normal file
Binary file not shown.
157
scripts/staffing/build_fill_events.py
Normal file
157
scripts/staffing/build_fill_events.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
build_fill_events.py — Decision A from the synthetic-data gap report.
|
||||||
|
|
||||||
|
Walks tests/multi-agent/scenarios/*.json (43 client-day scenarios) and
|
||||||
|
data/_playbook_lessons/*.json (64 retrospective outcomes) and emits a
|
||||||
|
single normalized fill_events.parquet at data/datasets/fill_events.parquet.
|
||||||
|
|
||||||
|
Pure deterministic normalization — no LLM, no new data. Each scenario
|
||||||
|
event becomes one row. Lesson outcomes augment scenario events with
|
||||||
|
success/fail counts where (client, date, city, state) matches.
|
||||||
|
|
||||||
|
Reproducibility: identical inputs → bit-identical output. event_id is
|
||||||
|
SHA1(client|date|role|at|city) truncated to 16 hex chars; rows are
|
||||||
|
sorted by event_id before write so re-runs produce the same parquet.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
REPO = Path(__file__).resolve().parents[2]
|
||||||
|
SCENARIO_DIR = REPO / "tests" / "multi-agent" / "scenarios"
|
||||||
|
LESSONS_DIR = REPO / "data" / "_playbook_lessons"
|
||||||
|
OUT_PATH = REPO / "data" / "datasets" / "fill_events.parquet"
|
||||||
|
|
||||||
|
|
||||||
|
def event_id(client: str, date: str, role: str, at: str, city: str) -> str:
|
||||||
|
h = hashlib.sha1(f"{client}|{date}|{role}|{at}|{city}".encode()).hexdigest()
|
||||||
|
return h[:16]
|
||||||
|
|
||||||
|
|
||||||
|
def load_lessons() -> dict:
|
||||||
|
"""Returns map of (client, date) → outcome dict."""
|
||||||
|
out: dict = {}
|
||||||
|
for path in sorted(LESSONS_DIR.glob("*.json")):
|
||||||
|
try:
|
||||||
|
d = json.loads(path.read_text())
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
client = d.get("client")
|
||||||
|
date = d.get("date")
|
||||||
|
if not client or not date:
|
||||||
|
continue
|
||||||
|
out[(client, date)] = {
|
||||||
|
"outcome_events_total": d.get("events_total"),
|
||||||
|
"outcome_events_ok": d.get("events_ok"),
|
||||||
|
"outcome_checkpoint_count": d.get("checkpoint_count"),
|
||||||
|
"outcome_model": d.get("model"),
|
||||||
|
"outcome_cloud": d.get("cloud"),
|
||||||
|
"outcome_lesson_path": str(path.relative_to(REPO)),
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def load_scenarios(lessons: dict) -> list[dict]:
|
||||||
|
rows: list[dict] = []
|
||||||
|
for path in sorted(SCENARIO_DIR.glob("scen_*.json")):
|
||||||
|
try:
|
||||||
|
d = json.loads(path.read_text())
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
client = d.get("client")
|
||||||
|
date = d.get("date")
|
||||||
|
contract = d.get("contract") or {}
|
||||||
|
events = d.get("events") or []
|
||||||
|
if not client or not date or not events:
|
||||||
|
continue
|
||||||
|
outcome = lessons.get((client, date), {})
|
||||||
|
for event in events:
|
||||||
|
role = event.get("role") or ""
|
||||||
|
at = event.get("at") or ""
|
||||||
|
city = event.get("city") or ""
|
||||||
|
state = event.get("state") or ""
|
||||||
|
rows.append({
|
||||||
|
"event_id": event_id(client, date, role, at, city),
|
||||||
|
"source_file": str(path.relative_to(REPO)),
|
||||||
|
"source_kind": "scenario",
|
||||||
|
"client": client,
|
||||||
|
"date": date,
|
||||||
|
"city": city,
|
||||||
|
"state": state,
|
||||||
|
"role": role,
|
||||||
|
"count": int(event.get("count") or 0),
|
||||||
|
"kind": event.get("kind") or "",
|
||||||
|
"at": at,
|
||||||
|
"shift_start": event.get("shift_start") or "",
|
||||||
|
"contract_deadline": contract.get("deadline"),
|
||||||
|
"contract_budget_per_hour_max": contract.get("budget_per_hour_max"),
|
||||||
|
"contract_local_bonus_per_hour": contract.get("local_bonus_per_hour"),
|
||||||
|
"contract_local_bonus_radius_mi": contract.get("local_bonus_radius_mi"),
|
||||||
|
"contract_fill_requirement": contract.get("fill_requirement"),
|
||||||
|
"outcome_events_total": outcome.get("outcome_events_total"),
|
||||||
|
"outcome_events_ok": outcome.get("outcome_events_ok"),
|
||||||
|
"outcome_checkpoint_count": outcome.get("outcome_checkpoint_count"),
|
||||||
|
"outcome_model": outcome.get("outcome_model"),
|
||||||
|
"outcome_cloud": outcome.get("outcome_cloud"),
|
||||||
|
"outcome_lesson_path": outcome.get("outcome_lesson_path"),
|
||||||
|
})
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
lessons = load_lessons()
|
||||||
|
rows = load_scenarios(lessons)
|
||||||
|
if not rows:
|
||||||
|
print("no rows produced — scenario dir empty?", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
rows.sort(key=lambda r: r["event_id"])
|
||||||
|
|
||||||
|
schema = pa.schema([
|
||||||
|
("event_id", pa.string()),
|
||||||
|
("source_file", pa.string()),
|
||||||
|
("source_kind", pa.string()),
|
||||||
|
("client", pa.string()),
|
||||||
|
("date", pa.string()),
|
||||||
|
("city", pa.string()),
|
||||||
|
("state", pa.string()),
|
||||||
|
("role", pa.string()),
|
||||||
|
("count", pa.int32()),
|
||||||
|
("kind", pa.string()),
|
||||||
|
("at", pa.string()),
|
||||||
|
("shift_start", pa.string()),
|
||||||
|
("contract_deadline", pa.string()),
|
||||||
|
("contract_budget_per_hour_max", pa.int32()),
|
||||||
|
("contract_local_bonus_per_hour", pa.int32()),
|
||||||
|
("contract_local_bonus_radius_mi", pa.int32()),
|
||||||
|
("contract_fill_requirement", pa.string()),
|
||||||
|
("outcome_events_total", pa.int32()),
|
||||||
|
("outcome_events_ok", pa.int32()),
|
||||||
|
("outcome_checkpoint_count", pa.int32()),
|
||||||
|
("outcome_model", pa.string()),
|
||||||
|
("outcome_cloud", pa.bool_()),
|
||||||
|
("outcome_lesson_path", pa.string()),
|
||||||
|
])
|
||||||
|
table = pa.Table.from_pylist(rows, schema=schema)
|
||||||
|
|
||||||
|
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
pq.write_table(table, OUT_PATH, compression="snappy")
|
||||||
|
|
||||||
|
matched = sum(1 for r in rows if r["outcome_events_total"] is not None)
|
||||||
|
print(f"fill_events.parquet written: {OUT_PATH.relative_to(REPO)}")
|
||||||
|
print(f" rows: {len(rows)}")
|
||||||
|
print(f" scenarios: {len({r['source_file'] for r in rows})}")
|
||||||
|
print(f" with outcome: {matched}")
|
||||||
|
print(f" unique (client,date): {len({(r['client'], r['date']) for r in rows})}")
|
||||||
|
print(f" generated_at: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
53
scripts/staffing/build_workers_v9.sh
Executable file
53
scripts/staffing/build_workers_v9.sh
Executable file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# build_workers_v9.sh — Decision B (corpus rebuild side).
|
||||||
|
#
|
||||||
|
# Rebuilds workers_500k_v9 vector corpus from workers_safe view rather
|
||||||
|
# than the raw workers_500k table. Closes the PII enforcement gap
|
||||||
|
# (verified 2026-04-27 that v8 was built directly from raw — LLM saw
|
||||||
|
# names/emails/phones/resume_text for every staffing query).
|
||||||
|
#
|
||||||
|
# Run as a background job — embedding 500K chunks took ~4 min for v8
|
||||||
|
# of 50K rows; v9 of 500K rows will be 30+ min. Do not block on this.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ./scripts/staffing/build_workers_v9.sh
|
||||||
|
# LH_GATEWAY=http://localhost:3100 ./scripts/staffing/build_workers_v9.sh
|
||||||
|
#
|
||||||
|
# After it completes:
|
||||||
|
# - Verify via: curl /vectors/indexes/workers_500k_v9 | jq
|
||||||
|
# - Flip config/modes.toml `staffing_inference` matrix_corpus to v9
|
||||||
|
# - Restart gateway to pick up the modes.toml change
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
GATEWAY="${LH_GATEWAY:-http://localhost:3100}"
|
||||||
|
|
||||||
|
# The /vectors/index endpoint accepts {name, sql, embed_model, ...}.
|
||||||
|
# SQL pulls from workers_safe (see data/_catalog/views/workers_safe.json)
|
||||||
|
# so the embedded text never contained raw PII by construction.
|
||||||
|
#
|
||||||
|
# Concatenated text is what gets embedded — keep it short enough that
|
||||||
|
# 500K rows × N chunks fits in disk + memory budgets but still carries
|
||||||
|
# the match signal (role, location, skills, scores).
|
||||||
|
|
||||||
|
BODY=$(cat <<'JSON'
|
||||||
|
{
|
||||||
|
"name": "workers_500k_v9",
|
||||||
|
"sql": "SELECT CAST(worker_id AS VARCHAR) AS doc_id, CONCAT(role, ' in ', city, ', ', state, '. Skills: ', COALESCE(skills, ''), '. Certifications: ', COALESCE(certifications, ''), '. Archetype: ', COALESCE(archetype, ''), '. Scores — reliability ', CAST(reliability AS VARCHAR), ', responsiveness ', CAST(responsiveness AS VARCHAR), ', availability ', CAST(availability AS VARCHAR), '.') AS text FROM workers_safe",
|
||||||
|
"embed_model": "nomic-embed-text",
|
||||||
|
"chunk_size": 500,
|
||||||
|
"overlap": 50,
|
||||||
|
"source_dataset": "workers_safe",
|
||||||
|
"bucket": "primary"
|
||||||
|
}
|
||||||
|
JSON
|
||||||
|
)
|
||||||
|
|
||||||
|
echo "POSTing /vectors/index → workers_500k_v9 (background job)..."
|
||||||
|
curl -sS -X POST "${GATEWAY}/vectors/index" \
|
||||||
|
-H 'content-type: application/json' \
|
||||||
|
-d "$BODY"
|
||||||
|
echo
|
||||||
|
echo "Job started. Monitor progress:"
|
||||||
|
echo " curl ${GATEWAY}/vectors/indexes/workers_500k_v9 | jq"
|
||||||
|
echo " watch -n 5 'curl -s ${GATEWAY}/vectors/jobs | jq'"
|
||||||
65
scripts/staffing/fixup_phone_type.py
Normal file
65
scripts/staffing/fixup_phone_type.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
fixup_phone_type.py — Decision D from the synthetic-data gap report.
|
||||||
|
|
||||||
|
Converts workers_500k.parquet `phone` column from int64 → string. Phones
|
||||||
|
in this dataset are 11-digit US numbers (1 + area + 7), e.g. 13122277740.
|
||||||
|
Stored as int64, the column compares fine numerically but breaks join
|
||||||
|
keys with string-typed phone columns elsewhere (formatted "+1...", or
|
||||||
|
loaded from a CSV).
|
||||||
|
|
||||||
|
Backs up the original to workers_500k.parquet.bak-<date> before write.
|
||||||
|
Idempotent: detects when the fix has already been applied and exits 0.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/staffing/fixup_phone_type.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import datetime as dt
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
import pyarrow.compute as pc
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
REPO = Path(__file__).resolve().parents[2]
|
||||||
|
TARGET = REPO / "data" / "datasets" / "workers_500k.parquet"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
if not TARGET.exists():
|
||||||
|
print(f"missing: {TARGET}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
table = pq.read_table(TARGET)
|
||||||
|
phone_field = table.schema.field("phone")
|
||||||
|
if phone_field.type == pa.string():
|
||||||
|
print(f"phone is already string — no-op")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
today = dt.date.today().isoformat()
|
||||||
|
backup = TARGET.with_suffix(f".parquet.bak-{today}")
|
||||||
|
if not backup.exists():
|
||||||
|
shutil.copy2(TARGET, backup)
|
||||||
|
print(f"backup: {backup.relative_to(REPO)}")
|
||||||
|
|
||||||
|
phone_str = pc.cast(table["phone"], pa.string())
|
||||||
|
new_table = table.set_column(
|
||||||
|
table.schema.get_field_index("phone"),
|
||||||
|
pa.field("phone", pa.string()),
|
||||||
|
phone_str,
|
||||||
|
)
|
||||||
|
|
||||||
|
pq.write_table(new_table, TARGET, compression="snappy")
|
||||||
|
rounds_trip = pq.read_table(TARGET, columns=["phone"])
|
||||||
|
sample = rounds_trip["phone"].slice(0, 3).to_pylist()
|
||||||
|
print(f"wrote: {TARGET.relative_to(REPO)}")
|
||||||
|
print(f"phone type: {rounds_trip.schema.field('phone').type}")
|
||||||
|
print(f"sample: {sample}")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Loading…
x
Reference in New Issue
Block a user