Scale test: 2.47M rows + 10K vector index benchmarked

Benchmarks on 128GB RAM server:
- 100K candidate filter (skills+city+status): 257ms
- 1M timesheet aggregation (revenue by client): 942ms
- 800K call log cross-reference (cold leads): 642ms
- Triple JOIN recruiter performance: 487ms
- 500K email open rate aggregation: 259ms
- COUNT all 2.47M rows: 84ms
- 10K vector search (cosine similarity): ~450ms
- Embedding throughput: 49 chunks/sec via Ollama
- RAG correctly refuses to hallucinate when no match exists

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-27 08:31:37 -05:00
parent 26fc98c885
commit eae51977ab
24 changed files with 369 additions and 105 deletions

View File

@ -0,0 +1,15 @@
{
"id": "021ac283-883b-4b13-83ce-5395bacdc33a",
"name": "clients",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/clients.parquet",
"size_bytes": 21971,
"created_at": "2026-03-27T13:15:18.000750302Z"
}
],
"created_at": "2026-03-27T13:15:18.000757845Z",
"updated_at": "2026-03-27T13:15:18.000757845Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "052cf81b-f5b6-4439-92d7-ecf09b24bd8b",
"name": "candidates",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/candidates.parquet",
"size_bytes": 10592165,
"created_at": "2026-03-27T13:15:17.989860994Z"
}
],
"created_at": "2026-03-27T13:15:17.989869155Z",
"updated_at": "2026-03-27T13:15:17.989869155Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "0927b27a-80a9-4790-a34f-bda7ff176aac",
"name": "job_orders",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/job_orders.parquet",
"size_bytes": 225889,
"created_at": "2026-03-27T13:11:41.384341257Z"
}
],
"created_at": "2026-03-27T13:11:41.384344032Z",
"updated_at": "2026-03-27T13:11:41.384344032Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "0bf1eb1f-b182-4025-9b44-b8553e678bcf",
"name": "timesheets",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/timesheets.parquet",
"size_bytes": 2458229,
"created_at": "2026-03-27T13:11:42.084209718Z"
}
],
"created_at": "2026-03-27T13:11:42.084217486Z",
"updated_at": "2026-03-27T13:11:42.084217486Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "47756b77-9a2e-476c-8249-9b971f95fb2d",
"name": "call_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/call_log.parquet",
"size_bytes": 35951077,
"created_at": "2026-03-27T13:15:26.607093971Z"
}
],
"created_at": "2026-03-27T13:15:26.607099665Z",
"updated_at": "2026-03-27T13:15:26.607099665Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "4be87c74-10b4-463c-b69d-f20c9cd18ed7",
"name": "candidates",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/candidates.parquet",
"size_bytes": 2003395,
"created_at": "2026-03-27T13:11:41.341589905Z"
}
],
"created_at": "2026-03-27T13:11:41.341599187Z",
"updated_at": "2026-03-27T13:11:41.341599187Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "75bb6855-488b-4300-89c2-970871bd99cc",
"name": "email_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/email_log.parquet",
"size_bytes": 1873775,
"created_at": "2026-03-27T13:11:42.757205427Z"
}
],
"created_at": "2026-03-27T13:11:42.757211105Z",
"updated_at": "2026-03-27T13:11:42.757211105Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "ad393eee-ba0c-4338-9a8b-236bba3816ac",
"name": "placements",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/placements.parquet",
"size_bytes": 217395,
"created_at": "2026-03-27T13:11:41.433628136Z"
}
],
"created_at": "2026-03-27T13:11:41.433633927Z",
"updated_at": "2026-03-27T13:11:41.433633927Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "b334b1eb-d7a2-473f-a7fa-017b17de74bd",
"name": "clients",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/clients.parquet",
"size_bytes": 34228,
"created_at": "2026-03-27T13:11:41.350247882Z"
}
],
"created_at": "2026-03-27T13:11:41.350250705Z",
"updated_at": "2026-03-27T13:11:41.350250705Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "c0224239-a265-4b15-a1e2-ebbc96aee60c",
"name": "email_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/email_log.parquet",
"size_bytes": 16768671,
"created_at": "2026-03-27T13:15:28.446541739Z"
}
],
"created_at": "2026-03-27T13:15:28.446547070Z",
"updated_at": "2026-03-27T13:15:28.446547070Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2",
"name": "timesheets",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/timesheets.parquet",
"size_bytes": 17539932,
"created_at": "2026-03-27T13:15:23.111118100Z"
}
],
"created_at": "2026-03-27T13:15:23.111124272Z",
"updated_at": "2026-03-27T13:15:23.111124272Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "dcca449b-a2f6-4c1f-99b6-c69dcdbdd204",
"name": "placements",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/placements.parquet",
"size_bytes": 1213820,
"created_at": "2026-03-27T13:15:18.264258909Z"
}
],
"created_at": "2026-03-27T13:15:18.264266375Z",
"updated_at": "2026-03-27T13:15:18.264266375Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "e015f0e2-51e4-4301-855d-76c54992c5b9",
"name": "call_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/call_log.parquet",
"size_bytes": 3276693,
"created_at": "2026-03-27T13:11:42.483220340Z"
}
],
"created_at": "2026-03-27T13:11:42.483225870Z",
"updated_at": "2026-03-27T13:11:42.483225870Z"
}

View File

@ -0,0 +1,15 @@
{
"id": "e8cc1ad2-114e-4441-a526-b8e6de10cb59",
"name": "job_orders",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/job_orders.parquet",
"size_bytes": 905534,
"created_at": "2026-03-27T13:15:18.114659931Z"
}
],
"created_at": "2026-03-27T13:15:18.114667579Z",
"updated_at": "2026-03-27T13:15:18.114667579Z"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

264
scripts/scale_test.py Normal file
View File

@ -0,0 +1,264 @@
#!/usr/bin/env python3
"""
Scale test: 2.5M rows across staffing tables + 100K vector embeddings.
Designed for 128GB RAM machine.
"""
import random, json, urllib.request, time
from datetime import datetime, timedelta
import pyarrow as pa, pyarrow.parquet as pq
API = "http://localhost:3100"
random.seed(2026)
def upload(name, table):
path = f"/tmp/{name}.parquet"
pq.write_table(table, path, compression="snappy")
with open(path, "rb") as f:
data = f.read()
key = f"datasets/{name}.parquet"
req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
urllib.request.urlopen(req)
body = json.dumps({"name": name, "schema_fingerprint": "auto",
"objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
headers={"Content-Type": "application/json"})
urllib.request.urlopen(req)
print(f" {name}: {table.num_rows:,} rows ({len(data)/1024/1024:.1f} MB)")
# Shared data
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
"William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
"Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
"Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
"Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah"]
last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
"Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
"Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
"Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores"]
cities_zips = [
("Chicago","IL","60601"),("Chicago","IL","60610"),("Chicago","IL","60614"),("Chicago","IL","60622"),
("New York","NY","10001"),("New York","NY","10016"),("New York","NY","10022"),("New York","NY","10036"),
("Los Angeles","CA","90001"),("Los Angeles","CA","90024"),("Houston","TX","77001"),("Houston","TX","77019"),
("Dallas","TX","75201"),("Dallas","TX","75219"),("Atlanta","GA","30301"),("Atlanta","GA","30309"),
("Denver","CO","80201"),("Denver","CO","80206"),("Phoenix","AZ","85001"),("Phoenix","AZ","85006"),
("Seattle","WA","98101"),("Seattle","WA","98104"),("Miami","FL","33101"),("Miami","FL","33132"),
]
verticals = ["IT","Healthcare","Industrial","Accounting","Admin"]
skills_by_vert = {
"IT": ["Java","Python","C#",".NET","JavaScript","React","Angular","Node.js","SQL","AWS","Azure","Docker","Kubernetes","Linux","Git","REST APIs","MongoDB","PostgreSQL","Redis","Terraform","Jenkins","Agile","Spring Boot","Django","Go","Rust","TypeScript","GraphQL","Microservices","CI/CD"],
"Healthcare": ["RN","LPN","CNA","BLS","ACLS","EMR","Epic","Cerner","ICD-10","CPT","HIPAA","Phlebotomy","ICU","OR","ER","Med-Surg","Pediatrics","Oncology","Telemetry","IV Therapy"],
"Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","CNC","PLC","Blueprint Reading","Quality Control","Six Sigma","AutoCAD","SolidWorks","Mechanical Assembly","Electrical","Hydraulics","Warehouse","Lean Manufacturing"],
"Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger","Financial Reporting","CPA","Payroll","Budgeting","Excel Advanced","Power BI","Tableau","GAAP","SOX","Audit"],
"Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Receptionist","Executive Assistant","Calendar Management","Salesforce","CRM","Multi-line Phone","Typing 60+ WPM","Bilingual Spanish","Notary"],
}
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","icloud.com","protonmail.com"]
base_date = datetime(2026, 1, 1)
def make_phone():
return f"({random.randint(200,999)}) {random.randint(200,999)}-{random.randint(1000,9999)}"
print("=" * 60)
print("SCALE TEST: 2.5M rows + 100K vectors")
print("=" * 60)
t_start = time.time()
# ============================================================
# 100K CANDIDATES
# ============================================================
print("\nGenerating candidates (100K)...")
t0 = time.time()
N = 100_000
c_ids, c_first, c_last, c_emails, c_phones = [], [], [], [], []
c_city, c_state, c_zip = [], [], []
c_vertical, c_skills, c_resume = [], [], []
c_status, c_source, c_pay, c_years = [], [], [], []
for i in range(N):
fn = random.choice(first_names)
ln = random.choice(last_names)
city, state, zc = random.choice(cities_zips)
vert = random.choice(verticals)
sk = random.sample(skills_by_vert[vert], min(random.randint(3, 10), len(skills_by_vert[vert])))
yrs = random.randint(0, 30)
resume = f"{fn} {ln}{vert} professional with {yrs} years experience in {city}, {state} {zc}. Skills: {', '.join(sk)}. "
resume += random.choice([
f"Previously at {random.choice(['Acme','TechFlow','GlobalStaff','MedPro','BuildRight','CoreSys','Apex','Summit'])} Corp.",
f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} in {city} metro.",
f"Available {random.choice(['immediately','in 2 weeks','after current assignment'])}.",
f"Open to {random.choice(['remote','hybrid','on-site'])} work arrangements.",
f"Certified in {random.choice(sk)} with hands-on project experience.",
])
c_ids.append(f"CAND-{i+1:06d}")
c_first.append(fn)
c_last.append(ln)
c_emails.append(f"{fn.lower()}.{ln.lower()}{random.randint(1,99)}@{random.choice(email_domains)}")
c_phones.append(make_phone())
c_city.append(city)
c_state.append(state)
c_zip.append(zc)
c_vertical.append(vert)
c_skills.append("|".join(sk))
c_resume.append(resume)
c_status.append(random.choice(["active","active","active","inactive","placed"]))
c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","Website"]))
c_pay.append(round(random.uniform(12, 95), 2))
c_years.append(yrs)
candidates = pa.table({
"candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
"email": c_emails, "phone": c_phones,
"city": c_city, "state": c_state, "zip": c_zip,
"vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume,
"status": c_status, "source": c_source, "min_pay_rate": c_pay, "years_experience": c_years,
})
upload("candidates", candidates)
print(f" Generated in {time.time()-t0:.1f}s")
# ============================================================
# 2K CLIENTS
# ============================================================
print("\nGenerating clients (2K)...")
prefixes = ["Apex","Summit","Core","National","Metro","Pacific","Global","United","Pinnacle","Horizon","Pioneer","Titan","Quantum","Vertex","Elite"]
suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners","Manufacturing","Healthcare"]
cl_ids, cl_names, cl_verts, cl_city, cl_state, cl_zip = [], [], [], [], [], []
for i in range(2000):
city, state, zc = random.choice(cities_zips)
cl_ids.append(f"CLI-{i+1:05d}")
cl_names.append(f"{random.choice(prefixes)} {random.choice(suffixes)}")
cl_verts.append(random.choice(verticals))
cl_city.append(city)
cl_state.append(state)
cl_zip.append(zc)
clients = pa.table({"client_id": cl_ids, "company_name": cl_names, "vertical": cl_verts, "city": cl_city, "state": cl_state, "zip": cl_zip})
upload("clients", clients)
# ============================================================
# 15K JOB ORDERS
# ============================================================
print("\nGenerating job_orders (15K)...")
titles_map = {
"IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst","QA Engineer","Cloud Architect","React Developer","DBA","Security Analyst","Python Developer","Full Stack Developer"],
"Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist","Radiology Tech","Pharmacy Tech","Medical Coder"],
"Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech","Electrician","Warehouse Associate","Assembly Tech"],
"Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Financial Analyst","Bookkeeper","Controller","Tax Preparer"],
"Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager","Customer Service Rep","HR Coordinator"],
}
jo_ids, jo_clients, jo_titles, jo_verts, jo_bills, jo_pays, jo_status = [], [], [], [], [], [], []
jo_city, jo_state, jo_zip, jo_desc = [], [], [], []
for i in range(15000):
vert = random.choice(verticals)
title = random.choice(titles_map[vert])
ci = random.randint(0, 1999)
city, state, zc = random.choice(cities_zips)
bill = round(random.uniform(25, 150), 2)
pay = round(bill * random.uniform(0.55, 0.75), 2)
req_sk = random.sample(skills_by_vert[vert], min(random.randint(3, 6), len(skills_by_vert[vert])))
desc = f"{title} for {cl_names[ci]} in {city}, {state}. Requires: {', '.join(req_sk)}. {random.randint(1,10)}+ years exp. ${bill}/hr."
jo_ids.append(f"JO-{i+1:06d}")
jo_clients.append(cl_ids[ci])
jo_titles.append(title)
jo_verts.append(vert)
jo_bills.append(bill)
jo_pays.append(pay)
jo_status.append(random.choice(["open","open","filled","filled","closed"]))
jo_city.append(city)
jo_state.append(state)
jo_zip.append(zc)
jo_desc.append(desc)
job_orders = pa.table({"job_order_id": jo_ids, "client_id": jo_clients, "title": jo_titles, "vertical": jo_verts, "bill_rate": jo_bills, "pay_rate": jo_pays, "status": jo_status, "city": jo_city, "state": jo_state, "zip": jo_zip, "description": jo_desc})
upload("job_orders", job_orders)
# ============================================================
# 50K PLACEMENTS
# ============================================================
print("\nGenerating placements (50K)...")
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(100)]
p_ids, p_cands, p_jobs, p_clients, p_bills, p_pays, p_recs, p_status = [], [], [], [], [], [], [], []
for i in range(50000):
ci = random.randint(0, N-1)
ji = random.randint(0, 14999)
p_ids.append(f"PL-{i+1:06d}")
p_cands.append(c_ids[ci])
p_jobs.append(jo_ids[ji])
p_clients.append(jo_clients[ji])
p_bills.append(jo_bills[ji])
p_pays.append(jo_pays[ji])
p_recs.append(random.choice(recruiters))
p_status.append(random.choice(["active","active","completed","completed","terminated"]))
placements = pa.table({"placement_id": p_ids, "candidate_id": p_cands, "job_order_id": p_jobs, "client_id": p_clients, "bill_rate": p_bills, "pay_rate": p_pays, "recruiter": p_recs, "status": p_status})
upload("placements", placements)
# ============================================================
# 1M TIMESHEETS
# ============================================================
print("\nGenerating timesheets (1M)...")
ts_ids, ts_placements, ts_cands, ts_clients = [], [], [], []
ts_hrs_reg, ts_hrs_ot, ts_bill_total, ts_pay_total, ts_weeks, ts_approved = [], [], [], [], [], []
for i in range(1_000_000):
pi = random.randint(0, 49999)
hrs = random.choice([40.0, 40.0, 40.0, 32.0, 24.0, 20.0])
ot = random.choice([0.0, 0.0, 0.0, 4.0, 8.0, 12.0])
b = p_bills[pi]
p = p_pays[pi]
ts_ids.append(f"TS-{i+1:07d}")
ts_placements.append(p_ids[pi])
ts_cands.append(p_cands[pi])
ts_clients.append(p_clients[pi])
ts_hrs_reg.append(hrs)
ts_hrs_ot.append(ot)
ts_bill_total.append(round(hrs * b + ot * b * 1.5, 2))
ts_pay_total.append(round(hrs * p + ot * p * 1.5, 2))
ts_weeks.append((base_date - timedelta(weeks=random.randint(0, 156))).strftime("%Y-%m-%d"))
ts_approved.append(random.random() < 0.85)
timesheets = pa.table({"timesheet_id": ts_ids, "placement_id": ts_placements, "candidate_id": ts_cands, "client_id": ts_clients, "hours_regular": ts_hrs_reg, "hours_overtime": ts_hrs_ot, "bill_total": ts_bill_total, "pay_total": ts_pay_total, "week_ending": ts_weeks, "approved": ts_approved})
upload("timesheets", timesheets)
# ============================================================
# 800K CALL LOG
# ============================================================
print("\nGenerating call_log (800K)...")
call_ids, call_from, call_to, call_dur, call_ts, call_rec, call_cand, call_disp = [], [], [], [], [], [], [], []
disps = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled"]
for i in range(800_000):
ci = random.randint(0, N-1)
call_ids.append(f"CALL-{i+1:07d}")
call_from.append(make_phone())
call_to.append(c_phones[ci])
call_dur.append(random.randint(0, 1800))
call_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
call_rec.append(random.choice(recruiters))
call_cand.append(c_ids[ci])
call_disp.append(random.choice(disps))
call_log = pa.table({"call_id": call_ids, "from_number": call_from, "to_number": call_to, "duration_seconds": call_dur, "timestamp": call_ts, "recruiter": call_rec, "candidate_id": call_cand, "disposition": call_disp})
upload("call_log", call_log)
# ============================================================
# 500K EMAIL LOG
# ============================================================
print("\nGenerating email_log (500K)...")
em_ids, em_from, em_to, em_subj, em_ts, em_rec, em_cand, em_opened = [], [], [], [], [], [], [], []
subjects = ["New opportunity: {}", "Following up", "Interview scheduled", "Timesheet reminder", "Background check complete", "Assignment details", "Rate update", "Welcome aboard"]
for i in range(500_000):
ci = random.randint(0, N-1)
ji = random.randint(0, 14999)
rec = random.choice(recruiters)
em_ids.append(f"EM-{i+1:07d}")
em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
em_to.append(c_emails[ci])
em_subj.append(random.choice(subjects).format(jo_titles[ji]))
em_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
em_rec.append(rec)
em_cand.append(c_ids[ci])
em_opened.append(random.random() < 0.55)
email_log = pa.table({"email_id": em_ids, "from_addr": em_from, "to_addr": em_to, "subject": em_subj, "timestamp": em_ts, "recruiter": em_rec, "candidate_id": em_cand, "opened": em_opened})
upload("email_log", email_log)
total = 100_000 + 2_000 + 15_000 + 50_000 + 1_000_000 + 800_000 + 500_000
t_total = time.time() - t_start
print(f"\n{'='*60}")
print(f"LOADED: {total:,} rows in {t_total:.0f}s")
print(f"{'='*60}")