MCP server at mcp-server/index.ts — 9 tools exposing the full lakehouse to any MCP-compatible model: search_workers (hybrid SQL+vector), query_sql, match_contract, get_worker, rag_question, log_success, get_playbooks, swap_profile, vram_status.

The "successful playbooks" pattern: log_success writes outcomes back to the lakehouse as a queryable dataset. Small models call get_playbooks to learn what approaches worked for similar tasks — no retraining needed, just data.

generate_workers.py scales to 100K+ with realistic distributions:
- 20 roles weighted by staffing industry frequency
- 44 real Midwest/South cities across 12 states
- Per-role skill pools (warehouse/production/machine/maintenance)
- 13 certification types with realistic probability
- 8 behavioral archetypes with score distributions
- SMS communication templates (20 patterns)

100K worker dataset ingested: 70MB CSV → Parquet in 1.1s. Verified: 11K forklift ops, 27K in IL, archetype distribution matches weights.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
309 lines
14 KiB
Python
309 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate realistic staffing worker profiles at scale.
|
|
|
|
Usage:
|
|
python3 generate_workers.py 100000 > /tmp/workers_100k.csv
|
|
curl -X POST "http://localhost:3100/ingest/file?name=workers_100k" \
|
|
-F "file=@/tmp/workers_100k.csv"
|
|
|
|
Design: combinatorial generation with industry-realistic distributions.
|
|
No LLM dependency — runs in seconds. Each worker has:
|
|
- Name, email, phone (not guaranteed unique — worker_id is the unique key)
|
|
- Role drawn from weighted staffing-industry distribution
|
|
- City/state from real Midwest/South geography (staffing agency footprint)
|
|
- Skills per role (realistic combos, not random)
|
|
- Certifications (expiry dates are generated internally but not emitted in the CSV)
|
|
- Behavioral archetype + numeric scores
|
|
- SMS communication history (templated from real patterns)
|
|
- Resume summary text (for embedding)
|
|
|
|
The generated data is designed to stress-test:
|
|
- SQL filters (WHERE role=X AND state=Y AND reliability>Z)
|
|
- Vector search (resume_text embeddings)
|
|
- Hybrid SQL+vector (structured + semantic together)
|
|
- Profile-scoped search (bound_datasets filtering)
|
|
- Concurrent query load at scale
|
|
"""
|
|
|
|
import csv, random, sys, hashlib
|
|
from datetime import datetime, timedelta
|
|
|
|
# ─── Configuration ───
|
|
|
|
# Fixed seed so that a given row count always yields the same dataset.
SEED = 2026

# Row count: first command-line argument when present, otherwise 100000.
N = 100000 if len(sys.argv) <= 1 else int(sys.argv[1])

random.seed(SEED)
|
|
|
|
# ─── Realistic data pools ───
|
|
|
|
# Given-name pool (144 entries). Stored as one whitespace-separated literal
# and split into a list, so every entry must be a single token.
FIRST_NAMES = """
    James Mary Robert Patricia John Jennifer Michael Linda
    David Elizabeth William Barbara Richard Susan Joseph Jessica
    Thomas Sarah Christopher Karen Charles Lisa Daniel Nancy
    Matthew Betty Anthony Margaret Mark Sandra Donald Ashley
    Steven Kimberly Paul Emily Andrew Donna Joshua Michelle
    Kenneth Carol Kevin Amanda Brian Dorothy George Melissa
    Timothy Deborah Ronald Stephanie Edward Rebecca Jason Sharon
    Jeffrey Laura Ryan Cynthia Jacob Kathleen Gary Amy
    Nicholas Angela Eric Shirley Jonathan Anna Stephen Brenda
    Larry Pamela Justin Emma Scott Nicole Brandon Helen
    Benjamin Samantha Samuel Katherine Raymond Christine Gregory Debra
    Frank Rachel Alexander Carolyn Patrick Janet Jack Catherine
    Dennis Maria Jerry Heather Tyler Diane Aaron Ruth
    Jose Julie Adam Olivia Nathan Joyce Henry Virginia
    Douglas Victoria Zachary Kelly Peter Lauren Kyle Christina
    Jamal Terrence Marcus DeShawn Malik Andre Carlos Miguel
    Luis Sofia Rosa Carmen Alejandro Roberto Priya Aisha
    Wei Yuki Omar Fatima Raj Mei Olga Ivan
""".split()
|
|
|
|
# Surname pool (103 entries). Stored as one whitespace-separated literal and
# split into a list, so every entry must be a single token.
LAST_NAMES = """
    Smith Johnson Williams Brown Jones Garcia Miller Davis
    Rodriguez Martinez Hernandez Lopez Gonzalez Wilson Anderson
    Thomas Taylor Moore Jackson Martin Lee Perez Thompson
    White Harris Sanchez Clark Ramirez Lewis Robinson Walker
    Young Allen King Wright Scott Torres Nguyen Hill
    Flores Green Adams Nelson Baker Hall Rivera Campbell
    Mitchell Carter Roberts Gomez Phillips Evans Turner Diaz
    Parker Cruz Edwards Collins Reyes Stewart Morris Morales
    Murphy Cook Rogers Gutierrez Ortiz Morgan Cooper Peterson
    Bailey Reed Kelly Howard Ramos Kim Cox Ward
    Richardson Watson Brooks Chavez Wood James Bennett Gray
    Mendoza Ruiz Hughes Price Alvarez Castillo Sanders Patel
    Myers Long Ross Foster Jimenez Powell Jenkins Perry
""".split()
|
|
|
|
# Real Midwest/South cities — staffing agency footprint.
# Kept grouped by state as (city, 3-digit ZIP prefix) pairs and flattened
# below into (city, state, zip_prefix) tuples. State order and the order of
# cities within each state define the order of CITIES.
_CITIES_BY_STATE = {
    "IL": [
        ("Chicago", "606"), ("Springfield", "627"), ("Rockford", "611"),
        ("Peoria", "616"), ("Champaign", "618"), ("Decatur", "625"),
        ("Bloomington", "617"), ("Joliet", "604"), ("Mattoon", "619"),
        ("Danville", "618"), ("Quincy", "623"), ("Galesburg", "614"),
    ],
    "IN": [
        ("Indianapolis", "462"), ("Fort Wayne", "468"), ("Evansville", "477"),
        ("South Bend", "466"), ("Terre Haute", "478"), ("Bloomington", "474"),
    ],
    "OH": [
        ("Columbus", "432"), ("Cleveland", "441"), ("Cincinnati", "452"),
        ("Dayton", "454"), ("Toledo", "436"), ("Akron", "443"),
    ],
    "MO": [
        ("St. Louis", "631"), ("Kansas City", "641"), ("Springfield", "658"),
        ("Columbia", "652"), ("Jefferson City", "651"),
    ],
    "TN": [("Nashville", "372"), ("Memphis", "381"), ("Knoxville", "379")],
    "KY": [("Louisville", "402"), ("Lexington", "405")],
    "WI": [("Milwaukee", "532"), ("Madison", "537"), ("Green Bay", "543")],
    "MI": [("Detroit", "481"), ("Grand Rapids", "495"), ("Lansing", "489")],
    "IA": [("Des Moines", "503"), ("Cedar Rapids", "524")],
    "MN": [("Minneapolis", "554"), ("St. Paul", "551")],
}

CITIES = [
    (city_name, state_code, zip_prefix)
    for state_code, rows in _CITIES_BY_STATE.items()
    for city_name, zip_prefix in rows
]
|
|
|
|
# Roles with industry-realistic weights.
# Each entry is (role name, relative weight); the 20 weights sum to 109 and
# are only meaningful relative to each other.
ROLES = [
    ("Forklift Operator", 12),
    ("Material Handler", 11),
    ("Machine Operator", 10),
    ("Assembler", 9),
    ("Production Worker", 9),
    ("Warehouse Associate", 8),
    ("Quality Tech", 7),
    ("Shipping Clerk", 6),
    ("Loader", 6),
    ("Inventory Clerk", 5),
    ("Line Lead", 4),
    ("Maintenance Tech", 4),
    ("Welder", 3),
    ("CNC Operator", 3),
    ("Sanitation Worker", 3),
    ("Packaging Operator", 3),
    ("Electrician", 2),
    ("Tool & Die Maker", 2),
    ("Safety Coordinator", 1),
    ("Logistics Coordinator", 1),
]

# Parallel name/weight lists, index-aligned with ROLES.
ROLE_NAMES, ROLE_WEIGHTS = map(list, zip(*ROLES))
|
|
|
|
# Skills per role family (realistic combos).
# Maps a pool name to the list of skills that can be drawn from it. "SPC"
# appears in both the production and quality pools.
SKILL_POOLS = dict(
    warehouse=[
        "forklift", "pallet jack", "RF scanner", "inventory", "shipping",
        "receiving", "pick-to-light", "packaging", "cold storage",
        "loading dock",
    ],
    production=[
        "assembly", "line work", "quality inspection", "lean manufacturing",
        "6S", "SPC", "conveyor ops", "batch processing", "labeling",
    ],
    machine=[
        "CNC", "lathe", "mill", "grinder", "press brake", "EDM",
        "blueprint reading", "GD&T", "micrometer", "calipers",
    ],
    maintenance=[
        "preventive maintenance", "troubleshooting", "PLC", "electrical",
        "hydraulics", "pneumatics", "welding", "lockout/tagout", "CMMS",
    ],
    quality=[
        "inspection", "CMM", "SPC", "gauge R&R", "ISO 9001",
        "root cause analysis", "first article", "nonconformance",
        "calibration",
    ],
    general=[
        "Excel", "SAP", "first aid", "hazmat", "confined space",
        "overhead crane", "team lead", "training", "bilingual",
    ],
)
|
|
|
|
# Role -> ordered list of skill-pool names that role draws from.
# Written as a compact (role, "pool pool ...") table; each row is split into
# its own fresh list so no two roles share a list object.
_ROLE_POOL_TABLE = (
    ("Forklift Operator", "warehouse general"),
    ("Material Handler", "warehouse general"),
    ("Machine Operator", "machine production"),
    ("Assembler", "production quality"),
    ("Production Worker", "production general"),
    ("Warehouse Associate", "warehouse general"),
    ("Quality Tech", "quality production"),
    ("Shipping Clerk", "warehouse general"),
    ("Loader", "warehouse general"),
    ("Inventory Clerk", "warehouse general"),
    ("Line Lead", "production general"),
    ("Maintenance Tech", "maintenance general"),
    ("Welder", "maintenance machine"),
    ("CNC Operator", "machine quality"),
    ("Sanitation Worker", "general"),
    ("Packaging Operator", "production warehouse"),
    ("Electrician", "maintenance"),
    ("Tool & Die Maker", "machine maintenance"),
    ("Safety Coordinator", "quality general"),
    ("Logistics Coordinator", "warehouse general"),
)

ROLE_SKILL_MAP = {
    role_name: pool_names.split() for role_name, pool_names in _ROLE_POOL_TABLE
}
|
|
|
|
# Certification catalogue: (name, probability) pairs, 13 in total.
# Probabilities are per-certification and independent of one another, so
# they are not expected to sum to 1.
CERTS = [
    ("OSHA-10", 0.35),
    ("OSHA-30", 0.15),
    ("Forklift", 0.30),
    ("Hazmat", 0.12),
    ("First Aid/CPR", 0.25),
    ("Reach Truck", 0.10),
    ("Order Picker", 0.08),
    ("ServSafe", 0.05),
    ("MSDS", 0.07),
    ("Confined Space", 0.06),
    ("Lockout/Tagout", 0.08),
    ("Fire Safety", 0.04),
    ("ISO 9001", 0.03),
]
|
|
|
|
# Behavioral archetypes as (name, relative weight); the weights sum to 100.
ARCHETYPES = [
    ("reliable", 25),
    ("communicator", 25),
    ("flexible", 20),
    ("leader", 15),
    ("specialist", 10),
    ("improving", 3),
    ("erratic", 1),
    ("silent", 1),
]

# Parallel name/weight lists, index-aligned with ARCHETYPES.
ARCHETYPE_NAMES, ARCHETYPE_WEIGHTS = map(list, zip(*ARCHETYPES))
|
|
|
|
# Worker-to-recruiter SMS templates (20). Placeholders are str.format fields:
# {min}, {time}, {shift}, {highway}, {task}, {client}, {day}. Templates
# without placeholders are emitted verbatim.
SMS_TEMPLATES = [
    # Lateness / scheduling
    "On my way, running about {min} minutes late.",
    "Got it, I'll be there at {time}.",
    "Can I switch to the {shift} shift this week?",
    "Thanks for the update!",
    "Is overtime available this Saturday?",
    "I need to call in tomorrow, family emergency.",
    "Stuck in traffic on I-{highway}, might be {min} late.",
    "Hey, just confirming my start time is {time}?",
    # On-the-job updates and site logistics
    "I finished the {task} ahead of schedule.",
    "Can you send me the address for the new site?",
    "Do I need steel toes for this assignment?",
    "What's the dress code at {client}?",
    "My certification expires next month, where do I renew?",
    "I'm available for any extra shifts this week.",
    "Is there parking on site or do I need to take the bus?",
    "The supervisor said I did great today!",
    # Availability, weather, onboarding, thanks
    "I have a doctor's appointment {day}, can I come in late?",
    "Weather looks bad tomorrow, is the site still open?",
    "I completed the safety orientation.",
    "Thanks for getting me this placement, really appreciate it.",
]
|
|
|
|
# Client company names (12) substituted into the {client} SMS placeholder.
CLIENTS = [
    "Midwest Logistics",
    "Precision Mfg",
    "AutoParts Direct",
    "CleanSpace",
    "Summit Packaging",
    "Great Lakes Steel",
    "Heartland Foods",
    "Prairie Wind Energy",
    "River City Plastics",
    "Cardinal Health",
    "TechFlow Assembly",
    "Union Pacific",
]
|
|
|
|
# ─── Generator ───
|
|
|
|
def gen_skills(role, role_skill_map=None, skill_pools=None):
    """Pick a sorted, de-duplicated skill list for ``role``.

    For every pool mapped to the role, between 2 and 5 skills are sampled
    without replacement (fewer when the pool itself is smaller) and merged
    into one set, so a skill that appears in two pools is listed once.

    Args:
        role: Role name. Roles missing from the map fall back to the
            ``"general"`` pool.
        role_skill_map: Optional ``{role: [pool_name, ...]}`` override;
            defaults to the module-level ``ROLE_SKILL_MAP``.
        skill_pools: Optional ``{pool_name: [skill, ...]}`` override;
            defaults to the module-level ``SKILL_POOLS``.

    Returns:
        Alphabetically sorted list of skill names; empty when none of the
        role's pools contain any skills.
    """
    if role_skill_map is None:
        role_skill_map = ROLE_SKILL_MAP
    if skill_pools is None:
        skill_pools = SKILL_POOLS

    chosen = set()
    for pool in role_skill_map.get(role, ["general"]):
        available = skill_pools.get(pool, [])
        upper = min(5, len(available))
        if upper == 0:
            # Unknown or empty pool: nothing to sample. Without this guard
            # random.randint(2, 0) raises ValueError.
            continue
        # A one-skill pool would otherwise request the empty range (2, 1).
        # For pools with two or more skills this is still randint(2, upper),
        # so the random stream is unchanged for the built-in tables.
        count = random.randint(min(2, upper), upper)
        chosen.update(random.sample(available, count))
    return sorted(chosen)
|
|
|
|
def gen_certs(*, with_expiry=False, cert_table=None):
    """Draw the certifications held by one worker.

    Every ``(name, probability)`` entry is an independent trial: the
    certification is held when ``random.random()`` falls below its
    probability.

    Args:
        with_expiry: When true, each held certification is rendered as
            ``"<name> (exp YYYY-MM-DD)"``, with the date 30 to 730 days after
            2026-01-01. The default returns the bare name, which is what the
            function has always returned.
        cert_table: Optional iterable of ``(name, probability)`` pairs;
            defaults to the module-level ``CERTS``.

    Returns:
        List of certification strings in table order (possibly empty).
    """
    table = CERTS if cert_table is None else cert_table
    held = []
    for name, prob in table:
        if random.random() >= prob:
            continue
        # The validity offset is drawn even when it is not rendered: the
        # original code consumed this random number too, and skipping it
        # would shift every later random draw for a given seed.
        days_valid = random.randint(30, 730)
        if with_expiry:
            expires = datetime(2026, 1, 1) + timedelta(days=days_valid)
            held.append(f"{name} (exp {expires:%Y-%m-%d})")
        else:
            held.append(name)
    return held
|
|
|
|
def gen_scores(archetype):
    """Draw the five behavioral scores for a worker of the given archetype.

    Each archetype has a (mean, std) pair; an unrecognized archetype uses
    (0.70, 0.10). Four scores are Gaussian draws around shifted versions of
    that pair, while ``availability`` always uses mean 0.75 / std 0.15
    regardless of archetype. Every draw is clipped to the range 0..1.

    Args:
        archetype: Archetype name, e.g. ``"reliable"`` or ``"erratic"``.

    Returns:
        Dict with keys ``reliability``, ``responsiveness``, ``engagement``,
        ``compliance`` and ``availability`` (in that order).
    """
    profiles = {
        "reliable": (0.85, 0.05),
        "communicator": (0.70, 0.10),
        "flexible": (0.75, 0.08),
        "leader": (0.80, 0.07),
        "specialist": (0.78, 0.06),
        "improving": (0.60, 0.15),
        "erratic": (0.40, 0.20),
        "silent": (0.65, 0.12),
    }
    mu, sigma = profiles.get(archetype, (0.70, 0.10))

    def clipped_draw(center, spread):
        # One Gaussian sample, limited to the 0..1 score range.
        sample = random.gauss(center, spread)
        return max(0, min(1, sample))

    # (metric, mean, std). The row order is the order in which random
    # numbers are consumed, so it must not be rearranged.
    plan = (
        ("reliability", mu, sigma),
        ("responsiveness", mu - 0.05, sigma + 0.05),
        ("engagement", mu - 0.03, sigma),
        ("compliance", mu + 0.02, sigma - 0.02),
        ("availability", 0.75, 0.15),
    )
    return {
        metric: clipped_draw(center, spread)
        for metric, center, spread in plan
    }
|
|
|
|
def gen_comms(archetype, n=5):
    """Build a worker's SMS history as a single ``" | "``-joined string.

    Args:
        archetype: ``"silent"`` workers get 0-1 messages and
            ``"communicator"`` workers 5-10, both chosen at random; every
            other archetype gets exactly ``n``.
        n: Message count for archetypes without a special rule.

    Returns:
        The rendered messages joined with ``" | "``; an empty string when
        no messages were generated.
    """
    if archetype == "silent":
        count = random.randint(0, 1)
    elif archetype == "communicator":
        count = random.randint(5, 10)
    else:
        count = n

    rendered = []
    for _ in range(count):
        template = random.choice(SMS_TEMPLATES)
        # Every placeholder value is drawn for every message, whether or not
        # the chosen template uses it. The draw order below matches keyword
        # evaluation order in the original format() call and fixes the
        # random stream, so keep it as is.
        minutes = random.randint(5, 25)
        hour = random.randint(5, 8)
        quarter = random.choice(['00', '15', '30', '45'])
        shift = random.choice(["morning", "evening", "night"])
        highway = random.randint(55, 94)
        task = random.choice(["pallet count", "safety check", "machine setup", "inventory audit"])
        client = random.choice(CLIENTS)
        day = random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"])
        rendered.append(
            template.format(
                min=minutes,
                time=f"{hour}:{quarter} AM",
                shift=shift,
                highway=highway,
                task=task,
                client=client,
                day=day,
            )
        )
    return " | ".join(rendered)
|
|
|
|
def gen_email(first, last, wid):
    """Compose a ``first.last[tag]@domain`` address for a worker.

    The domain is picked at random from five consumer mail providers. The
    tag is chosen at random from: nothing, a random number 1-99, or the
    worker id. Both name parts are lower-cased.

    Args:
        first: Given name.
        last: Surname.
        wid: Worker id, used as one of the possible tags.

    Returns:
        The e-mail address as a string.
    """
    providers = ["gmail.com", "yahoo.com", "outlook.com", "protonmail.com", "mail.com"]
    domain = random.choice(providers)
    # The numeric tag is drawn before the tag choice, even if it ends up
    # unused, which keeps the sequence of random calls fixed.
    numeric_tag = str(random.randint(1, 99))
    tag = random.choice(["", numeric_tag, str(wid)])
    local_part = first.lower() + "." + last.lower() + tag
    return local_part + "@" + domain
|
|
|
|
def gen_phone():
    """Return a random ``+1`` phone number string.

    The result is ``+1`` followed by a 3-digit area code picked from a fixed
    list and a 7-digit number between 2000000 and 9999999.
    """
    area_codes = (
        312, 773, 217, 309, 618, 219, 317, 812, 614, 513,
        216, 314, 615, 502, 414, 608, 313, 515, 612,
    )
    area = random.choice(area_codes)
    subscriber = random.randint(2000000, 9999999)
    return "+1" + str(area) + str(subscriber)
|
|
|
|
# ─── Main ───
|
|
|
|
# Stream the dataset to stdout as CSV, header row first. Progress messages go
# to stderr so a redirected stdout contains only CSV.
writer = csv.writer(sys.stdout)
writer.writerow(
    (
        "worker_id name role email phone city state zip "
        "skills certifications archetype "
        "reliability responsiveness engagement compliance availability "
        "communications resume_text"
    ).split()
)

# Score columns, in the order they appear in the header above.
_score_columns = ("reliability", "responsiveness", "engagement", "compliance", "availability")

for i in range(1, N + 1):
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    # At scale, names repeat — that's realistic. Worker ID is the unique key.
    # Above 15K workers a middle initial (A-Z, cycling with the id) is added
    # for variety.
    name = f"{first} {chr(65 + (i % 26))}. {last}" if i > 15000 else f"{first} {last}"

    # The order of the random draws below determines the seeded output.
    (role,) = random.choices(ROLE_NAMES, weights=ROLE_WEIGHTS, k=1)
    city, state, zip_pre = random.choice(CITIES)
    zipcode = f"{zip_pre}{random.randint(10,99)}"
    skills = gen_skills(role)
    certs = gen_certs()
    (archetype,) = random.choices(ARCHETYPE_NAMES, weights=ARCHETYPE_WEIGHTS, k=1)
    scores = gen_scores(archetype)
    comms = gen_comms(archetype)

    # One-paragraph summary used as the text to embed.
    skills_text = '|'.join(skills)
    certs_text = '|'.join(certs) if certs else 'none'
    resume = (
        f"{name} — {role} in {city}, {state}. "
        f"Skills: {skills_text}. "
        f"Certs: {certs_text}. "
        f"Archetype: {archetype}. "
        f"Reliability: {scores['reliability']:.2f}, "
        f"Availability: {scores['availability']:.2f}"
    )

    email = gen_email(first, last, i)
    phone = gen_phone()
    score_cells = [f"{scores[column]:.4f}" for column in _score_columns]

    writer.writerow([
        i, name, role, email, phone,
        city, state, zipcode,
        ", ".join(skills), ", ".join(certs), archetype,
        *score_cells,
        comms, resume,
    ])

    if i % 25000 == 0:
        print(f" generated {i:,}/{N:,}...", file=sys.stderr)

print(f"Done: {N:,} workers", file=sys.stderr)
|