/**
 * Sends one HTTP request to the Lakehouse API rooted at `BASE`.
 *
 * When a body is supplied it is JSON-serialized and sent with a
 * `Content-Type: application/json` header; otherwise no header or body is set.
 *
 * @param method - HTTP verb passed straight to `fetch`.
 * @param path - Path appended verbatim to `BASE` (must start with `/`).
 * @param body - Optional payload. Any value other than `undefined` is sent,
 *   including falsy JSON values such as `0`, `false` and `""`.
 * @returns The parsed JSON response. If the response text is not valid JSON,
 *   `{ raw, status }` is returned instead, carrying the raw text and the HTTP
 *   status code.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers read ad-hoc fields off the response
async function api(method: string, path: string, body?: unknown): Promise<any> {
  // Test for "was a body passed", not truthiness: 0, false and "" are valid JSON payloads.
  const hasBody = body !== undefined;
  const resp = await fetch(`${BASE}${path}`, {
    method,
    headers: hasBody ? { "Content-Type": "application/json" } : {},
    body: hasBody ? JSON.stringify(body) : undefined,
  });
  const text = await resp.text();
  try {
    return JSON.parse(text);
  } catch {
    // Non-JSON responses (plain-text errors, HTML error pages) are surfaced as-is.
    return { raw: text, status: resp.status };
  }
}
/**
 * Tool `match_contract` — shortlist workers for a staffing contract.
 *
 * Builds a SQL filter from the contract fields, asks `/vectors/hybrid` for
 * twice the requested headcount (headroom for the certification post-filter),
 * then keeps only candidates whose `chunk_text` mentions every required
 * certification (case-insensitive substring match). The reply is a JSON text
 * block echoing the contract plus at most `headcount` matches.
 *
 * `role`, `state` and `city` come from the model/user, so they are embedded as
 * escaped SQL string literals instead of being spliced in raw.
 */
server.tool(
  "match_contract",
  "Find qualified workers for a staffing contract. SQL-verified matches ranked by semantic fit.",
  {
    role: z.string(), state: z.string(), city: z.string().optional(),
    min_reliability: z.number().default(0.7),
    required_certs: z.array(z.string()).default([]),
    headcount: z.number().default(5),
  },
  async ({ role, state, city, min_reliability, required_certs, headcount }) => {
    // Doubling single quotes keeps a value such as "O'Fallon" (or a hostile
    // "x' OR '1'='1") inside its string literal.
    // NOTE(review): assumes the backend uses standard SQL quote-doubling — confirm.
    const sqlLiteral = (value: string): string => `'${value.replace(/'/g, "''")}'`;

    const conditions = [
      `role = ${sqlLiteral(role)}`,
      `state = ${sqlLiteral(state)}`,
      `reliability >= ${min_reliability}`,
    ];
    if (city) conditions.push(`city = ${sqlLiteral(city)}`);

    // A negative or fractional headcount would yield a nonsensical top_k and
    // make `slice(0, headcount)` drop rows from the end instead of capping.
    const wanted = Math.max(0, Math.trunc(headcount));

    const r = await api("POST", "/vectors/hybrid", {
      question: `Find the best ${role} workers with relevant skills and certifications`,
      index_name: "ethereal_workers_v1", sql_filter: conditions.join(" AND "),
      filter_dataset: "ethereal_workers", id_column: "worker_id",
      top_k: wanted * 2, generate: false,
    });

    // On failure the API may return an error object without a `sources` array.
    let matches: any[] = Array.isArray(r.sources) ? r.sources : [];
    if (required_certs.length > 0) {
      const needed = [...new Set(required_certs.map((c: string) => c.toLowerCase()))];
      matches = matches.filter((m: any) => {
        const haystack = (m.chunk_text || "").toLowerCase();
        return needed.every((c) => haystack.includes(c));
      });
    }

    return { content: [{ type: "text" as const, text: JSON.stringify({
      contract: { role, state, city, min_reliability, required_certs },
      matches: matches.slice(0, wanted), total_sql: r.sql_matches, method: r.method,
      // Omitted by JSON.stringify when the backend reported no error.
      error: r.error,
    }, null, 2) }] };
  },
);
/**
 * Tool `get_playbooks` — list rows of the `successful_playbooks` table, newest
 * first, optionally restricted to rows whose `operation` or `approach`
 * contains `keyword`.
 *
 * `keyword` is caller-supplied text, so single quotes are doubled before it is
 * embedded in the LIKE patterns; `limit` is coerced to a non-negative integer
 * so it always forms a valid LIMIT clause.
 */
server.tool(
  "get_playbooks",
  "Retrieve past successful operations. Small models use this to learn what approaches worked.",
  { keyword: z.string().optional(), limit: z.number().default(10) },
  async ({ keyword, limit }) => {
    // Fractions or negatives would produce invalid SQL; fall back to the schema default for non-finite input.
    const rowCap = Number.isFinite(limit) ? Math.max(0, Math.trunc(limit)) : 10;

    let where = "";
    if (keyword) {
      // Doubling quotes stops the keyword from terminating the string literal.
      // `%` and `_` inside the keyword still act as LIKE wildcards, as before.
      // NOTE(review): assumes the backend uses standard SQL quote-doubling — confirm.
      const needle = keyword.replace(/'/g, "''");
      where = ` WHERE operation LIKE '%${needle}%' OR approach LIKE '%${needle}%'`;
    }
    const sql = `SELECT * FROM successful_playbooks${where} ORDER BY timestamp DESC LIMIT ${rowCap}`;

    const r = await api("POST", "/query/sql", { sql });
    if (r.error) {
      // Keep the friendly hint (a missing table is the common case) but include
      // the backend error so real query failures are not disguised.
      const detail = typeof r.error === "string" ? r.error : JSON.stringify(r.error);
      return { content: [{ type: "text" as const, text: `No playbooks yet — log some successful operations first!\n(query error: ${detail})` }] };
    }
    // JSON.stringify(undefined) is undefined, which is not a valid text payload.
    return { content: [{ type: "text" as const, text: JSON.stringify(r.rows ?? [], null, 2) }] };
  },
);
/**
 * Resource `lakehouse://datasets` — plain-text catalog listing with one
 * `name: row_count rows` line per dataset returned by `/catalog/datasets`.
 *
 * @throws Error when the catalog endpoint does not return an array, which is
 *   what `api()` yields for error payloads and non-JSON responses.
 */
server.resource("lakehouse://datasets", "lakehouse://datasets", async (uri) => {
  const payload: unknown = await api("GET", "/catalog/datasets");
  if (!Array.isArray(payload)) {
    // Fail with a descriptive message instead of "payload.map is not a function".
    throw new Error(`Unexpected /catalog/datasets response: ${JSON.stringify(payload)}`);
  }
  const text = payload
    // `??` rather than `||`: an empty dataset must print "0 rows", not "? rows".
    .map((d: any) => `${d.name}: ${d.row_count ?? "?"} rows`)
    .join("\n");
  return { contents: [{ uri: uri.href, mimeType: "text/plain", text }] };
});
Each worker has: + - Unique name, email, phone + - Role drawn from weighted staffing-industry distribution + - City/state from real Midwest/South geography (staffing agency footprint) + - Skills per role (realistic combos, not random) + - Certifications with expiry dates + - Behavioral archetype + numeric scores + - SMS communication history (templated from real patterns) + - Resume summary text (for embedding) + +The generated data is designed to stress-test: + - SQL filters (WHERE role=X AND state=Y AND reliability>Z) + - Vector search (resume_text embeddings) + - Hybrid SQL+vector (structured + semantic together) + - Profile-scoped search (bound_datasets filtering) + - Concurrent query load at scale +""" + +import csv, random, sys, hashlib +from datetime import datetime, timedelta + +# ─── Configuration ─── + +N = int(sys.argv[1]) if len(sys.argv) > 1 else 100000 +SEED = 2026 +random.seed(SEED) + +# ─── Realistic data pools ─── + +FIRST_NAMES = [ + "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda", + "David", "Elizabeth", "William", "Barbara", "Richard", "Susan", "Joseph", "Jessica", + "Thomas", "Sarah", "Christopher", "Karen", "Charles", "Lisa", "Daniel", "Nancy", + "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra", "Donald", "Ashley", + "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle", + "Kenneth", "Carol", "Kevin", "Amanda", "Brian", "Dorothy", "George", "Melissa", + "Timothy", "Deborah", "Ronald", "Stephanie", "Edward", "Rebecca", "Jason", "Sharon", + "Jeffrey", "Laura", "Ryan", "Cynthia", "Jacob", "Kathleen", "Gary", "Amy", + "Nicholas", "Angela", "Eric", "Shirley", "Jonathan", "Anna", "Stephen", "Brenda", + "Larry", "Pamela", "Justin", "Emma", "Scott", "Nicole", "Brandon", "Helen", + "Benjamin", "Samantha", "Samuel", "Katherine", "Raymond", "Christine", "Gregory", "Debra", + "Frank", "Rachel", "Alexander", "Carolyn", "Patrick", "Janet", "Jack", "Catherine", + "Dennis", "Maria", "Jerry", 
"Heather", "Tyler", "Diane", "Aaron", "Ruth", + "Jose", "Julie", "Adam", "Olivia", "Nathan", "Joyce", "Henry", "Virginia", + "Douglas", "Victoria", "Zachary", "Kelly", "Peter", "Lauren", "Kyle", "Christina", + "Jamal", "Terrence", "Marcus", "DeShawn", "Malik", "Andre", "Carlos", "Miguel", + "Luis", "Sofia", "Rosa", "Carmen", "Alejandro", "Roberto", "Priya", "Aisha", + "Wei", "Yuki", "Omar", "Fatima", "Raj", "Mei", "Olga", "Ivan", +] + +LAST_NAMES = [ + "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", + "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", + "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson", + "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker", + "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", + "Flores", "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", + "Mitchell", "Carter", "Roberts", "Gomez", "Phillips", "Evans", "Turner", "Diaz", + "Parker", "Cruz", "Edwards", "Collins", "Reyes", "Stewart", "Morris", "Morales", + "Murphy", "Cook", "Rogers", "Gutierrez", "Ortiz", "Morgan", "Cooper", "Peterson", + "Bailey", "Reed", "Kelly", "Howard", "Ramos", "Kim", "Cox", "Ward", + "Richardson", "Watson", "Brooks", "Chavez", "Wood", "James", "Bennett", "Gray", + "Mendoza", "Ruiz", "Hughes", "Price", "Alvarez", "Castillo", "Sanders", "Patel", + "Myers", "Long", "Ross", "Foster", "Jimenez", "Powell", "Jenkins", "Perry", +] + +# Real Midwest/South cities — staffing agency footprint +CITIES = [ + ("Chicago", "IL", "606"), ("Springfield", "IL", "627"), ("Rockford", "IL", "611"), + ("Peoria", "IL", "616"), ("Champaign", "IL", "618"), ("Decatur", "IL", "625"), + ("Bloomington", "IL", "617"), ("Joliet", "IL", "604"), ("Mattoon", "IL", "619"), + ("Danville", "IL", "618"), ("Quincy", "IL", "623"), ("Galesburg", "IL", "614"), + ("Indianapolis", "IN", "462"), ("Fort Wayne", "IN", "468"), ("Evansville", "IN", "477"), + 
("South Bend", "IN", "466"), ("Terre Haute", "IN", "478"), ("Bloomington", "IN", "474"), + ("Columbus", "OH", "432"), ("Cleveland", "OH", "441"), ("Cincinnati", "OH", "452"), + ("Dayton", "OH", "454"), ("Toledo", "OH", "436"), ("Akron", "OH", "443"), + ("St. Louis", "MO", "631"), ("Kansas City", "MO", "641"), ("Springfield", "MO", "658"), + ("Columbia", "MO", "652"), ("Jefferson City", "MO", "651"), + ("Nashville", "TN", "372"), ("Memphis", "TN", "381"), ("Knoxville", "TN", "379"), + ("Louisville", "KY", "402"), ("Lexington", "KY", "405"), + ("Milwaukee", "WI", "532"), ("Madison", "WI", "537"), ("Green Bay", "WI", "543"), + ("Detroit", "MI", "481"), ("Grand Rapids", "MI", "495"), ("Lansing", "MI", "489"), + ("Des Moines", "IA", "503"), ("Cedar Rapids", "IA", "524"), + ("Minneapolis", "MN", "554"), ("St. Paul", "MN", "551"), +] + +# Roles with industry-realistic weights +ROLES = [ + ("Forklift Operator", 12), ("Material Handler", 11), ("Machine Operator", 10), + ("Assembler", 9), ("Production Worker", 9), ("Warehouse Associate", 8), + ("Quality Tech", 7), ("Shipping Clerk", 6), ("Loader", 6), + ("Inventory Clerk", 5), ("Line Lead", 4), ("Maintenance Tech", 4), + ("Welder", 3), ("CNC Operator", 3), ("Sanitation Worker", 3), + ("Packaging Operator", 3), ("Electrician", 2), ("Tool & Die Maker", 2), + ("Safety Coordinator", 1), ("Logistics Coordinator", 1), +] +ROLE_NAMES = [r for r, _ in ROLES] +ROLE_WEIGHTS = [w for _, w in ROLES] + +# Skills per role family (realistic combos) +SKILL_POOLS = { + "warehouse": ["forklift", "pallet jack", "RF scanner", "inventory", "shipping", + "receiving", "pick-to-light", "packaging", "cold storage", "loading dock"], + "production": ["assembly", "line work", "quality inspection", "lean manufacturing", + "6S", "SPC", "conveyor ops", "batch processing", "labeling"], + "machine": ["CNC", "lathe", "mill", "grinder", "press brake", "EDM", + "blueprint reading", "GD&T", "micrometer", "calipers"], + "maintenance": ["preventive maintenance", 
"troubleshooting", "PLC", "electrical", + "hydraulics", "pneumatics", "welding", "lockout/tagout", "CMMS"], + "quality": ["inspection", "CMM", "SPC", "gauge R&R", "ISO 9001", "root cause analysis", + "first article", "nonconformance", "calibration"], + "general": ["Excel", "SAP", "first aid", "hazmat", "confined space", + "overhead crane", "team lead", "training", "bilingual"], +} + +ROLE_SKILL_MAP = { + "Forklift Operator": ["warehouse", "general"], + "Material Handler": ["warehouse", "general"], + "Machine Operator": ["machine", "production"], + "Assembler": ["production", "quality"], + "Production Worker": ["production", "general"], + "Warehouse Associate": ["warehouse", "general"], + "Quality Tech": ["quality", "production"], + "Shipping Clerk": ["warehouse", "general"], + "Loader": ["warehouse", "general"], + "Inventory Clerk": ["warehouse", "general"], + "Line Lead": ["production", "general"], + "Maintenance Tech": ["maintenance", "general"], + "Welder": ["maintenance", "machine"], + "CNC Operator": ["machine", "quality"], + "Sanitation Worker": ["general"], + "Packaging Operator": ["production", "warehouse"], + "Electrician": ["maintenance"], + "Tool & Die Maker": ["machine", "maintenance"], + "Safety Coordinator": ["quality", "general"], + "Logistics Coordinator": ["warehouse", "general"], +} + +CERTS = [ + ("OSHA-10", 0.35), ("OSHA-30", 0.15), ("Forklift", 0.30), ("Hazmat", 0.12), + ("First Aid/CPR", 0.25), ("Reach Truck", 0.10), ("Order Picker", 0.08), + ("ServSafe", 0.05), ("MSDS", 0.07), ("Confined Space", 0.06), + ("Lockout/Tagout", 0.08), ("Fire Safety", 0.04), ("ISO 9001", 0.03), +] + +ARCHETYPES = [ + ("reliable", 25), ("communicator", 25), ("flexible", 20), + ("leader", 15), ("specialist", 10), ("improving", 3), + ("erratic", 1), ("silent", 1), +] +ARCHETYPE_NAMES = [a for a, _ in ARCHETYPES] +ARCHETYPE_WEIGHTS = [w for _, w in ARCHETYPES] + +SMS_TEMPLATES = [ + "On my way, running about {min} minutes late.", + "Got it, I'll be there at {time}.", + 
"Can I switch to the {shift} shift this week?", + "Thanks for the update!", + "Is overtime available this Saturday?", + "I need to call in tomorrow, family emergency.", + "Stuck in traffic on I-{highway}, might be {min} late.", + "Hey, just confirming my start time is {time}?", + "I finished the {task} ahead of schedule.", + "Can you send me the address for the new site?", + "Do I need steel toes for this assignment?", + "What's the dress code at {client}?", + "My certification expires next month, where do I renew?", + "I'm available for any extra shifts this week.", + "Is there parking on site or do I need to take the bus?", + "The supervisor said I did great today!", + "I have a doctor's appointment {day}, can I come in late?", + "Weather looks bad tomorrow, is the site still open?", + "I completed the safety orientation.", + "Thanks for getting me this placement, really appreciate it.", +] + +CLIENTS = [ + "Midwest Logistics", "Precision Mfg", "AutoParts Direct", "CleanSpace", + "Summit Packaging", "Great Lakes Steel", "Heartland Foods", "Prairie Wind Energy", + "River City Plastics", "Cardinal Health", "TechFlow Assembly", "Union Pacific", +] + +# ─── Generator ─── + +def gen_skills(role): + pools = ROLE_SKILL_MAP.get(role, ["general"]) + skills = set() + for pool in pools: + available = SKILL_POOLS.get(pool, []) + n = random.randint(2, min(5, len(available))) + skills.update(random.sample(available, n)) + return sorted(skills) + +def gen_certs(): + certs = [] + for name, prob in CERTS: + if random.random() < prob: + expires = datetime(2026, 1, 1) + timedelta(days=random.randint(30, 730)) + certs.append(f"{name}") + return certs + +def gen_scores(archetype): + base = { + "reliable": (0.85, 0.05), "communicator": (0.70, 0.10), + "flexible": (0.75, 0.08), "leader": (0.80, 0.07), + "specialist": (0.78, 0.06), "improving": (0.60, 0.15), + "erratic": (0.40, 0.20), "silent": (0.65, 0.12), + } + mean, std = base.get(archetype, (0.70, 0.10)) + return { + "reliability": 
def gen_email(first, last, wid):
    """Compose a lowercase ``first.last<suffix>@host`` e-mail address.

    The host is picked at random from five providers. The suffix is picked at
    random from three options: nothing, a random number between 1 and 99, or
    the worker id ``wid``.
    """
    host = random.choice(["gmail.com", "yahoo.com", "outlook.com", "protonmail.com", "mail.com"])
    # The numeric suffix is drawn even when it ends up unused.
    numeric_suffix = str(random.randint(1, 99))
    suffix = random.choice(["", numeric_suffix, str(wid)])
    local_part = ".".join((first.lower(), last.lower())) + suffix
    return local_part + "@" + host
+ if i > 15000: + mid = chr(65 + (i % 26)) # A-Z + name = f"{first} {mid}. {last}" + else: + name = f"{first} {last}" + + role = random.choices(ROLE_NAMES, weights=ROLE_WEIGHTS, k=1)[0] + city, state, zip_pre = random.choice(CITIES) + zipcode = f"{zip_pre}{random.randint(10,99)}" + skills = gen_skills(role) + certs = gen_certs() + archetype = random.choices(ARCHETYPE_NAMES, weights=ARCHETYPE_WEIGHTS, k=1)[0] + scores = gen_scores(archetype) + comms = gen_comms(archetype) + + resume = ( + f"{name} — {role} in {city}, {state}. " + f"Skills: {'|'.join(skills)}. " + f"Certs: {'|'.join(certs) if certs else 'none'}. " + f"Archetype: {archetype}. " + f"Reliability: {scores['reliability']:.2f}, " + f"Availability: {scores['availability']:.2f}" + ) + + writer.writerow([ + i, name, role, gen_email(first, last, i), gen_phone(), + city, state, zipcode, + ", ".join(skills), ", ".join(certs), archetype, + f"{scores['reliability']:.4f}", f"{scores['responsiveness']:.4f}", + f"{scores['engagement']:.4f}", f"{scores['compliance']:.4f}", + f"{scores['availability']:.4f}", + comms, resume, + ]) + + if i % 25000 == 0: + print(f" generated {i:,}/{N:,}...", file=sys.stderr) + +print(f"Done: {N:,} workers", file=sys.stderr)