diff --git a/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json b/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json new file mode 100644 index 0000000..89d98ae --- /dev/null +++ b/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json @@ -0,0 +1,15 @@ +{ + "id": "021ac283-883b-4b13-83ce-5395bacdc33a", + "name": "clients", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/clients.parquet", + "size_bytes": 21971, + "created_at": "2026-03-27T13:15:18.000750302Z" + } + ], + "created_at": "2026-03-27T13:15:18.000757845Z", + "updated_at": "2026-03-27T13:15:18.000757845Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json b/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json new file mode 100644 index 0000000..96de7c0 --- /dev/null +++ b/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json @@ -0,0 +1,15 @@ +{ + "id": "052cf81b-f5b6-4439-92d7-ecf09b24bd8b", + "name": "candidates", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/candidates.parquet", + "size_bytes": 10592165, + "created_at": "2026-03-27T13:15:17.989860994Z" + } + ], + "created_at": "2026-03-27T13:15:17.989869155Z", + "updated_at": "2026-03-27T13:15:17.989869155Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/0927b27a-80a9-4790-a34f-bda7ff176aac.json b/data/_catalog/manifests/0927b27a-80a9-4790-a34f-bda7ff176aac.json deleted file mode 100644 index f2e6c91..0000000 --- a/data/_catalog/manifests/0927b27a-80a9-4790-a34f-bda7ff176aac.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "0927b27a-80a9-4790-a34f-bda7ff176aac", - "name": "job_orders", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/job_orders.parquet", - "size_bytes": 225889, - "created_at": "2026-03-27T13:11:41.384341257Z" - } - ], - "created_at": "2026-03-27T13:11:41.384344032Z", - 
"updated_at": "2026-03-27T13:11:41.384344032Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/0bf1eb1f-b182-4025-9b44-b8553e678bcf.json b/data/_catalog/manifests/0bf1eb1f-b182-4025-9b44-b8553e678bcf.json deleted file mode 100644 index ed940e9..0000000 --- a/data/_catalog/manifests/0bf1eb1f-b182-4025-9b44-b8553e678bcf.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "0bf1eb1f-b182-4025-9b44-b8553e678bcf", - "name": "timesheets", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/timesheets.parquet", - "size_bytes": 2458229, - "created_at": "2026-03-27T13:11:42.084209718Z" - } - ], - "created_at": "2026-03-27T13:11:42.084217486Z", - "updated_at": "2026-03-27T13:11:42.084217486Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json b/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json new file mode 100644 index 0000000..5ec264f --- /dev/null +++ b/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json @@ -0,0 +1,15 @@ +{ + "id": "47756b77-9a2e-476c-8249-9b971f95fb2d", + "name": "call_log", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/call_log.parquet", + "size_bytes": 35951077, + "created_at": "2026-03-27T13:15:26.607093971Z" + } + ], + "created_at": "2026-03-27T13:15:26.607099665Z", + "updated_at": "2026-03-27T13:15:26.607099665Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/4be87c74-10b4-463c-b69d-f20c9cd18ed7.json b/data/_catalog/manifests/4be87c74-10b4-463c-b69d-f20c9cd18ed7.json deleted file mode 100644 index 66bae07..0000000 --- a/data/_catalog/manifests/4be87c74-10b4-463c-b69d-f20c9cd18ed7.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "4be87c74-10b4-463c-b69d-f20c9cd18ed7", - "name": "candidates", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/candidates.parquet", - "size_bytes": 2003395, - "created_at": 
"2026-03-27T13:11:41.341589905Z" - } - ], - "created_at": "2026-03-27T13:11:41.341599187Z", - "updated_at": "2026-03-27T13:11:41.341599187Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/75bb6855-488b-4300-89c2-970871bd99cc.json b/data/_catalog/manifests/75bb6855-488b-4300-89c2-970871bd99cc.json deleted file mode 100644 index 42e9bd8..0000000 --- a/data/_catalog/manifests/75bb6855-488b-4300-89c2-970871bd99cc.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "75bb6855-488b-4300-89c2-970871bd99cc", - "name": "email_log", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/email_log.parquet", - "size_bytes": 1873775, - "created_at": "2026-03-27T13:11:42.757205427Z" - } - ], - "created_at": "2026-03-27T13:11:42.757211105Z", - "updated_at": "2026-03-27T13:11:42.757211105Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/ad393eee-ba0c-4338-9a8b-236bba3816ac.json b/data/_catalog/manifests/ad393eee-ba0c-4338-9a8b-236bba3816ac.json deleted file mode 100644 index 965ad63..0000000 --- a/data/_catalog/manifests/ad393eee-ba0c-4338-9a8b-236bba3816ac.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "ad393eee-ba0c-4338-9a8b-236bba3816ac", - "name": "placements", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/placements.parquet", - "size_bytes": 217395, - "created_at": "2026-03-27T13:11:41.433628136Z" - } - ], - "created_at": "2026-03-27T13:11:41.433633927Z", - "updated_at": "2026-03-27T13:11:41.433633927Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/b334b1eb-d7a2-473f-a7fa-017b17de74bd.json b/data/_catalog/manifests/b334b1eb-d7a2-473f-a7fa-017b17de74bd.json deleted file mode 100644 index 502653a..0000000 --- a/data/_catalog/manifests/b334b1eb-d7a2-473f-a7fa-017b17de74bd.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "b334b1eb-d7a2-473f-a7fa-017b17de74bd", - "name": "clients", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": 
"data", - "key": "datasets/clients.parquet", - "size_bytes": 34228, - "created_at": "2026-03-27T13:11:41.350247882Z" - } - ], - "created_at": "2026-03-27T13:11:41.350250705Z", - "updated_at": "2026-03-27T13:11:41.350250705Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json b/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json new file mode 100644 index 0000000..1f3bb69 --- /dev/null +++ b/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json @@ -0,0 +1,15 @@ +{ + "id": "c0224239-a265-4b15-a1e2-ebbc96aee60c", + "name": "email_log", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/email_log.parquet", + "size_bytes": 16768671, + "created_at": "2026-03-27T13:15:28.446541739Z" + } + ], + "created_at": "2026-03-27T13:15:28.446547070Z", + "updated_at": "2026-03-27T13:15:28.446547070Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json b/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json new file mode 100644 index 0000000..b29bb14 --- /dev/null +++ b/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json @@ -0,0 +1,15 @@ +{ + "id": "c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2", + "name": "timesheets", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/timesheets.parquet", + "size_bytes": 17539932, + "created_at": "2026-03-27T13:15:23.111118100Z" + } + ], + "created_at": "2026-03-27T13:15:23.111124272Z", + "updated_at": "2026-03-27T13:15:23.111124272Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json b/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json new file mode 100644 index 0000000..6c99fb2 --- /dev/null +++ b/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json @@ -0,0 +1,15 @@ +{ + "id": "dcca449b-a2f6-4c1f-99b6-c69dcdbdd204", + "name": 
"placements", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/placements.parquet", + "size_bytes": 1213820, + "created_at": "2026-03-27T13:15:18.264258909Z" + } + ], + "created_at": "2026-03-27T13:15:18.264266375Z", + "updated_at": "2026-03-27T13:15:18.264266375Z" +} \ No newline at end of file diff --git a/data/_catalog/manifests/e015f0e2-51e4-4301-855d-76c54992c5b9.json b/data/_catalog/manifests/e015f0e2-51e4-4301-855d-76c54992c5b9.json deleted file mode 100644 index 792949e..0000000 --- a/data/_catalog/manifests/e015f0e2-51e4-4301-855d-76c54992c5b9.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "id": "e015f0e2-51e4-4301-855d-76c54992c5b9", - "name": "call_log", - "schema_fingerprint": "auto", - "objects": [ - { - "bucket": "data", - "key": "datasets/call_log.parquet", - "size_bytes": 3276693, - "created_at": "2026-03-27T13:11:42.483220340Z" - } - ], - "created_at": "2026-03-27T13:11:42.483225870Z", - "updated_at": "2026-03-27T13:11:42.483225870Z" -} \ No newline at end of file diff --git a/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json b/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json new file mode 100644 index 0000000..c4cd6dd --- /dev/null +++ b/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json @@ -0,0 +1,15 @@ +{ + "id": "e8cc1ad2-114e-4441-a526-b8e6de10cb59", + "name": "job_orders", + "schema_fingerprint": "auto", + "objects": [ + { + "bucket": "data", + "key": "datasets/job_orders.parquet", + "size_bytes": 905534, + "created_at": "2026-03-27T13:15:18.114659931Z" + } + ], + "created_at": "2026-03-27T13:15:18.114667579Z", + "updated_at": "2026-03-27T13:15:18.114667579Z" +} \ No newline at end of file diff --git a/data/datasets/call_log.parquet b/data/datasets/call_log.parquet index 62e8b43..0f62d0e 100644 Binary files a/data/datasets/call_log.parquet and b/data/datasets/call_log.parquet differ diff --git a/data/datasets/candidates.parquet 
b/data/datasets/candidates.parquet
index b454918..a7f05d5 100644
Binary files a/data/datasets/candidates.parquet and b/data/datasets/candidates.parquet differ
diff --git a/data/datasets/clients.parquet b/data/datasets/clients.parquet
index a8eff98..c61e229 100644
Binary files a/data/datasets/clients.parquet and b/data/datasets/clients.parquet differ
diff --git a/data/datasets/email_log.parquet b/data/datasets/email_log.parquet
index 18cee5e..34ee53c 100644
Binary files a/data/datasets/email_log.parquet and b/data/datasets/email_log.parquet differ
diff --git a/data/datasets/job_orders.parquet b/data/datasets/job_orders.parquet
index ddec4c5..3ad5c39 100644
Binary files a/data/datasets/job_orders.parquet and b/data/datasets/job_orders.parquet differ
diff --git a/data/datasets/placements.parquet b/data/datasets/placements.parquet
index 947c63a..49d3d55 100644
Binary files a/data/datasets/placements.parquet and b/data/datasets/placements.parquet differ
diff --git a/data/datasets/timesheets.parquet b/data/datasets/timesheets.parquet
index 3dd7c62..c1e6fb1 100644
Binary files a/data/datasets/timesheets.parquet and b/data/datasets/timesheets.parquet differ
diff --git a/data/vectors/candidate_resumes.parquet b/data/vectors/candidate_resumes.parquet
deleted file mode 100644
index 3acbdb3..0000000
Binary files a/data/vectors/candidate_resumes.parquet and /dev/null differ
diff --git a/data/vectors/resumes_10k.parquet b/data/vectors/resumes_10k.parquet
new file mode 100644
index 0000000..54e4d7d
Binary files /dev/null and b/data/vectors/resumes_10k.parquet differ
diff --git a/scripts/scale_test.py b/scripts/scale_test.py
new file mode 100644
index 0000000..dedecb0
--- /dev/null
+++ b/scripts/scale_test.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+"""
+Scale test: 2.5M rows across staffing tables + 100K vector embeddings.
+Designed for 128GB RAM machine.
+
+Generates seven synthetic staffing tables from a fixed random seed, uploads
+each one as a snappy-compressed Parquet object to the storage API, and
+registers it in the catalog. Every table is built fully in memory first.
+
+NOTE: this script only builds the tabular datasets; it does not generate or
+upload vector embeddings (presumably a separate step loads those).
+"""
+
+import random, json, urllib.request, time
+from datetime import datetime, timedelta
+import pyarrow as pa, pyarrow.parquet as pq
+
+API = "http://localhost:3100"
+HTTP_TIMEOUT_S = 300  # per-request cap so a stalled server cannot hang the run
+
+# Row counts per table. Later tables index into earlier ones, so the random
+# index bounds below are derived from these instead of repeated literals.
+N = 100_000  # candidates
+N_CLIENTS = 2_000
+N_JOB_ORDERS = 15_000
+N_PLACEMENTS = 50_000
+N_TIMESHEETS = 1_000_000
+N_CALLS = 800_000
+N_EMAILS = 500_000
+
+random.seed(2026)  # fixed seed: every run generates the same data
+
+def upload(name, table, timeout=HTTP_TIMEOUT_S):
+    """Upload ``table`` as a Parquet object and register it in the catalog.
+
+    The table is serialized (snappy) in memory, PUT to
+    ``{API}/storage/objects/datasets/{name}.parquet``, then announced with a
+    POST to ``{API}/catalog/datasets``.
+
+    Args:
+        name: Dataset name; also used as the object file stem.
+        table: ``pyarrow.Table`` to upload.
+        timeout: Per-request timeout in seconds.
+
+    Raises:
+        OSError: If either request fails or times out (``urllib.error.URLError``
+            / ``HTTPError``, or a socket timeout).
+    """
+    # Serialize in memory instead of through a fixed /tmp/<name>.parquet path,
+    # which was never removed and could collide between concurrent runs.
+    sink = pa.BufferOutputStream()
+    pq.write_table(table, sink, compression="snappy")
+    data = sink.getvalue().to_pybytes()
+
+    key = f"datasets/{name}.parquet"
+    put = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
+    # Context managers close each response so connections are not leaked.
+    with urllib.request.urlopen(put, timeout=timeout) as resp:
+        resp.read()
+
+    body = json.dumps({"name": name, "schema_fingerprint": "auto",
+                       "objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
+    post = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
+                                  headers={"Content-Type": "application/json"})
+    with urllib.request.urlopen(post, timeout=timeout) as resp:
+        resp.read()
+    print(f" {name}: {table.num_rows:,} rows ({len(data)/1024/1024:.1f} MB)")
+
+# Shared data
+first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
+    "William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
+    "Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
+    "Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
+    "Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah"]
+last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
+    "Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
+    "Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
+    "Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores"]
+cities_zips = [
+    ("Chicago","IL","60601"),("Chicago","IL","60610"),("Chicago","IL","60614"),("Chicago","IL","60622"),
+    ("New York","NY","10001"),("New York","NY","10016"),("New York","NY","10022"),("New York","NY","10036"),
+    ("Los Angeles","CA","90001"),("Los Angeles","CA","90024"),("Houston","TX","77001"),("Houston","TX","77019"),
+    ("Dallas","TX","75201"),("Dallas","TX","75219"),("Atlanta","GA","30301"),("Atlanta","GA","30309"),
+    ("Denver","CO","80201"),("Denver","CO","80206"),("Phoenix","AZ","85001"),("Phoenix","AZ","85006"),
+    ("Seattle","WA","98101"),("Seattle","WA","98104"),("Miami","FL","33101"),("Miami","FL","33132"),
+]
+verticals = ["IT","Healthcare","Industrial","Accounting","Admin"]
+skills_by_vert = {
+    "IT": ["Java","Python","C#",".NET","JavaScript","React","Angular","Node.js","SQL","AWS","Azure","Docker","Kubernetes","Linux","Git","REST APIs","MongoDB","PostgreSQL","Redis","Terraform","Jenkins","Agile","Spring Boot","Django","Go","Rust","TypeScript","GraphQL","Microservices","CI/CD"],
+    "Healthcare": ["RN","LPN","CNA","BLS","ACLS","EMR","Epic","Cerner","ICD-10","CPT","HIPAA","Phlebotomy","ICU","OR","ER","Med-Surg","Pediatrics","Oncology","Telemetry","IV Therapy"],
+    "Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","CNC","PLC","Blueprint Reading","Quality Control","Six Sigma","AutoCAD","SolidWorks","Mechanical Assembly","Electrical","Hydraulics","Warehouse","Lean Manufacturing"],
+    "Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger","Financial Reporting","CPA","Payroll","Budgeting","Excel Advanced","Power BI","Tableau","GAAP","SOX","Audit"],
+    "Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Receptionist","Executive Assistant","Calendar Management","Salesforce","CRM","Multi-line Phone","Typing 60+ WPM","Bilingual Spanish","Notary"],
+}
+email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","icloud.com","protonmail.com"]
+base_date = datetime(2026, 1, 1)
+
+def make_phone():
+    """Return a random phone number string shaped like ``(312) 555-4821``."""
+    return f"({random.randint(200,999)}) {random.randint(200,999)}-{random.randint(1000,9999)}"
+
+print("=" * 60)
+print("SCALE TEST: 2.5M rows + 100K vectors")
+print("=" * 60)
+t_start = time.time()
+
+# ============================================================
+# 100K CANDIDATES
+# ============================================================
+print("\nGenerating candidates (100K)...")
+t0 = time.time()
+
+c_ids, c_first, c_last, c_emails, c_phones = [], [], [], [], []
+c_city, c_state, c_zip = [], [], []
+c_vertical, c_skills, c_resume = [], [], []
+c_status, c_source, c_pay, c_years = [], [], [], []
+
+for i in range(N):
+    fn = random.choice(first_names)
+    ln = random.choice(last_names)
+    city, state, zc = random.choice(cities_zips)
+    vert = random.choice(verticals)
+    sk = random.sample(skills_by_vert[vert], min(random.randint(3, 10), len(skills_by_vert[vert])))
+    yrs = random.randint(0, 30)
+
+    resume = f"{fn} {ln} — {vert} professional with {yrs} years experience in {city}, {state} {zc}. Skills: {', '.join(sk)}. "
+    resume += random.choice([
+        f"Previously at {random.choice(['Acme','TechFlow','GlobalStaff','MedPro','BuildRight','CoreSys','Apex','Summit'])} Corp.",
+        f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} in {city} metro.",
+        f"Available {random.choice(['immediately','in 2 weeks','after current assignment'])}.",
+        f"Open to {random.choice(['remote','hybrid','on-site'])} work arrangements.",
+        f"Certified in {random.choice(sk)} with hands-on project experience.",
+    ])
+
+    c_ids.append(f"CAND-{i+1:06d}")
+    c_first.append(fn)
+    c_last.append(ln)
+    c_emails.append(f"{fn.lower()}.{ln.lower()}{random.randint(1,99)}@{random.choice(email_domains)}")
+    c_phones.append(make_phone())
+    c_city.append(city)
+    c_state.append(state)
+    c_zip.append(zc)
+    c_vertical.append(vert)
+    c_skills.append("|".join(sk))
+    c_resume.append(resume)
+    c_status.append(random.choice(["active","active","active","inactive","placed"]))
+    c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","Website"]))
+    c_pay.append(round(random.uniform(12, 95), 2))
+    c_years.append(yrs)
+
+candidates = pa.table({
+    "candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
+    "email": c_emails, "phone": c_phones,
+    "city": c_city, "state": c_state, "zip": c_zip,
+    "vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume,
+    "status": c_status, "source": c_source, "min_pay_rate": c_pay, "years_experience": c_years,
+})
+upload("candidates", candidates)
+print(f" Generated in {time.time()-t0:.1f}s")
+
+# ============================================================
+# 2K CLIENTS
+# ============================================================
+print("\nGenerating clients (2K)...")
+prefixes = ["Apex","Summit","Core","National","Metro","Pacific","Global","United","Pinnacle","Horizon","Pioneer","Titan","Quantum","Vertex","Elite"]
+suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners","Manufacturing","Healthcare"]
+cl_ids, cl_names, cl_verts, cl_city, cl_state, cl_zip = [], [], [], [], [], []
+for i in range(N_CLIENTS):
+    city, state, zc = random.choice(cities_zips)
+    cl_ids.append(f"CLI-{i+1:05d}")
+    cl_names.append(f"{random.choice(prefixes)} {random.choice(suffixes)}")
+    cl_verts.append(random.choice(verticals))
+    cl_city.append(city)
+    cl_state.append(state)
+    cl_zip.append(zc)
+clients = pa.table({"client_id": cl_ids, "company_name": cl_names, "vertical": cl_verts, "city": cl_city, "state": cl_state, "zip": cl_zip})
+upload("clients", clients)
+
+# ============================================================
+# 15K JOB ORDERS
+# ============================================================
+print("\nGenerating job_orders (15K)...")
+titles_map = {
+    "IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst","QA Engineer","Cloud Architect","React Developer","DBA","Security Analyst","Python Developer","Full Stack Developer"],
+    "Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist","Radiology Tech","Pharmacy Tech","Medical Coder"],
+    "Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech","Electrician","Warehouse Associate","Assembly Tech"],
+    "Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Financial Analyst","Bookkeeper","Controller","Tax Preparer"],
+    "Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager","Customer Service Rep","HR Coordinator"],
+}
+jo_ids, jo_clients, jo_titles, jo_verts, jo_bills, jo_pays, jo_status = [], [], [], [], [], [], []
+jo_city, jo_state, jo_zip, jo_desc = [], [], [], []
+for i in range(N_JOB_ORDERS):
+    vert = random.choice(verticals)
+    title = random.choice(titles_map[vert])
+    ci = random.randint(0, N_CLIENTS - 1)
+    city, state, zc = random.choice(cities_zips)
+    bill = round(random.uniform(25, 150), 2)
+    pay = round(bill * random.uniform(0.55, 0.75), 2)  # pay is 55-75% of the bill rate
+    req_sk = random.sample(skills_by_vert[vert], min(random.randint(3, 6), len(skills_by_vert[vert])))
+    desc = f"{title} for {cl_names[ci]} in {city}, {state}. Requires: {', '.join(req_sk)}. {random.randint(1,10)}+ years exp. ${bill}/hr."
+    jo_ids.append(f"JO-{i+1:06d}")
+    jo_clients.append(cl_ids[ci])
+    jo_titles.append(title)
+    jo_verts.append(vert)
+    jo_bills.append(bill)
+    jo_pays.append(pay)
+    jo_status.append(random.choice(["open","open","filled","filled","closed"]))
+    jo_city.append(city)
+    jo_state.append(state)
+    jo_zip.append(zc)
+    jo_desc.append(desc)
+job_orders = pa.table({"job_order_id": jo_ids, "client_id": jo_clients, "title": jo_titles, "vertical": jo_verts, "bill_rate": jo_bills, "pay_rate": jo_pays, "status": jo_status, "city": jo_city, "state": jo_state, "zip": jo_zip, "description": jo_desc})
+upload("job_orders", job_orders)
+
+# ============================================================
+# 50K PLACEMENTS
+# ============================================================
+print("\nGenerating placements (50K)...")
+recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(100)]
+p_ids, p_cands, p_jobs, p_clients, p_bills, p_pays, p_recs, p_status = [], [], [], [], [], [], [], []
+for i in range(N_PLACEMENTS):
+    ci = random.randint(0, N - 1)
+    ji = random.randint(0, N_JOB_ORDERS - 1)
+    p_ids.append(f"PL-{i+1:06d}")
+    p_cands.append(c_ids[ci])
+    p_jobs.append(jo_ids[ji])
+    p_clients.append(jo_clients[ji])  # client and rates are copied from the chosen job order
+    p_bills.append(jo_bills[ji])
+    p_pays.append(jo_pays[ji])
+    p_recs.append(random.choice(recruiters))
+    p_status.append(random.choice(["active","active","completed","completed","terminated"]))
+placements = pa.table({"placement_id": p_ids, "candidate_id": p_cands, "job_order_id": p_jobs, "client_id": p_clients, "bill_rate": p_bills, "pay_rate": p_pays, "recruiter": p_recs, "status": p_status})
+upload("placements", placements)
+
+# ============================================================
+# 1M TIMESHEETS
+# ============================================================
+print("\nGenerating timesheets (1M)...")
+ts_ids, ts_placements, ts_cands, ts_clients = [], [], [], []
+ts_hrs_reg, ts_hrs_ot, ts_bill_total, ts_pay_total, ts_weeks, ts_approved = [], [], [], [], [], []
+for i in range(N_TIMESHEETS):
+    pi = random.randint(0, N_PLACEMENTS - 1)
+    hrs = random.choice([40.0, 40.0, 40.0, 32.0, 24.0, 20.0])
+    ot = random.choice([0.0, 0.0, 0.0, 4.0, 8.0, 12.0])
+    b = p_bills[pi]
+    p = p_pays[pi]
+    ts_ids.append(f"TS-{i+1:07d}")
+    ts_placements.append(p_ids[pi])
+    ts_cands.append(p_cands[pi])
+    ts_clients.append(p_clients[pi])
+    ts_hrs_reg.append(hrs)
+    ts_hrs_ot.append(ot)
+    ts_bill_total.append(round(hrs * b + ot * b * 1.5, 2))  # overtime at 1.5x
+    ts_pay_total.append(round(hrs * p + ot * p * 1.5, 2))
+    ts_weeks.append((base_date - timedelta(weeks=random.randint(0, 156))).strftime("%Y-%m-%d"))  # up to 156 weeks back
+    ts_approved.append(random.random() < 0.85)
+timesheets = pa.table({"timesheet_id": ts_ids, "placement_id": ts_placements, "candidate_id": ts_cands, "client_id": ts_clients, "hours_regular": ts_hrs_reg, "hours_overtime": ts_hrs_ot, "bill_total": ts_bill_total, "pay_total": ts_pay_total, "week_ending": ts_weeks, "approved": ts_approved})
+upload("timesheets", timesheets)
+
+# ============================================================
+# 800K CALL LOG
+# ============================================================
+print("\nGenerating call_log (800K)...")
+call_ids, call_from, call_to, call_dur, call_ts, call_rec, call_cand, call_disp = [], [], [], [], [], [], [], []
+disps = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled"]
+for i in range(N_CALLS):
+    ci = random.randint(0, N - 1)
+    call_ids.append(f"CALL-{i+1:07d}")
+    call_from.append(make_phone())
+    call_to.append(c_phones[ci])
+    call_dur.append(random.randint(0, 1800))
+    call_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())  # within 365 days before base_date
+    call_rec.append(random.choice(recruiters))
+    call_cand.append(c_ids[ci])
+    call_disp.append(random.choice(disps))
+call_log = pa.table({"call_id": call_ids, "from_number": call_from, "to_number": call_to, "duration_seconds": call_dur, "timestamp": call_ts, "recruiter": call_rec, "candidate_id": call_cand, "disposition": call_disp})
+upload("call_log", call_log)
+
+# ============================================================
+# 500K EMAIL LOG
+# ============================================================
+print("\nGenerating email_log (500K)...")
+em_ids, em_from, em_to, em_subj, em_ts, em_rec, em_cand, em_opened = [], [], [], [], [], [], [], []
+subjects = ["New opportunity: {}", "Following up", "Interview scheduled", "Timesheet reminder", "Background check complete", "Assignment details", "Rate update", "Welcome aboard"]
+for i in range(N_EMAILS):
+    ci = random.randint(0, N - 1)
+    ji = random.randint(0, N_JOB_ORDERS - 1)
+    rec = random.choice(recruiters)
+    em_ids.append(f"EM-{i+1:07d}")
+    em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
+    em_to.append(c_emails[ci])
+    em_subj.append(random.choice(subjects).format(jo_titles[ji]))  # only the first template has a {} slot
+    em_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
+    em_rec.append(rec)
+    em_cand.append(c_ids[ci])
+    em_opened.append(random.random() < 0.55)
+email_log = pa.table({"email_id": em_ids, "from_addr": em_from, "to_addr": em_to, "subject": em_subj, "timestamp": em_ts, "recruiter": em_rec, "candidate_id": em_cand, "opened": em_opened})
+upload("email_log", email_log)
+
+total = sum(t.num_rows for t in (candidates, clients, job_orders, placements, timesheets, call_log, email_log))
+t_total = time.time() - t_start
+print(f"\n{'='*60}")
+print(f"LOADED: {total:,} rows in {t_total:.0f}s")
+print(f"{'='*60}")