profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across a multi-corpus matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00

265 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Scale test: 2.5M rows across staffing tables + 100K vector embeddings.
Designed for 128GB RAM machine.
"""
import random, json, urllib.request, time
from datetime import datetime, timedelta
import pyarrow as pa, pyarrow.parquet as pq
# Base URL of the local lakehouse API (object storage + catalog endpoints).
API = "http://localhost:3100"
# Fixed seed so every run generates the exact same synthetic dataset.
random.seed(2026)
def upload(name, table):
    """Persist *table* as snappy parquet, upload it, and register it in the catalog.

    Args:
        name: Dataset name; used for the temp file name and the object key.
        table: pyarrow.Table to persist.

    Side effects: writes /tmp/{name}.parquet, PUTs the bytes to object storage,
    POSTs a dataset record to the catalog, and prints a one-line summary.
    """
    path = f"/tmp/{name}.parquet"
    pq.write_table(table, path, compression="snappy")
    with open(path, "rb") as f:
        data = f.read()
    key = f"datasets/{name}.parquet"
    # Upload the raw parquet bytes to object storage.
    req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
    # Close responses explicitly — bare urlopen() leaked the underlying socket.
    with urllib.request.urlopen(req):
        pass
    # Register the uploaded object as a catalog dataset.
    body = json.dumps({"name": name, "schema_fingerprint": "auto",
                       "objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
    req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req):
        pass
    print(f" {name}: {table.num_rows:,} rows ({len(data)/1024/1024:.1f} MB)")
# Shared data
# Name pools for synthetic people; drawn independently per row, so duplicate
# full names across rows are expected.
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
"William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
"Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
"Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
"Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah"]
last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
"Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
"Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
"Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores"]
# (city, state, zip) tuples; larger metros get several zip codes so samples
# cluster realistically.
cities_zips = [
("Chicago","IL","60601"),("Chicago","IL","60610"),("Chicago","IL","60614"),("Chicago","IL","60622"),
("New York","NY","10001"),("New York","NY","10016"),("New York","NY","10022"),("New York","NY","10036"),
("Los Angeles","CA","90001"),("Los Angeles","CA","90024"),("Houston","TX","77001"),("Houston","TX","77019"),
("Dallas","TX","75201"),("Dallas","TX","75219"),("Atlanta","GA","30301"),("Atlanta","GA","30309"),
("Denver","CO","80201"),("Denver","CO","80206"),("Phoenix","AZ","85001"),("Phoenix","AZ","85006"),
("Seattle","WA","98101"),("Seattle","WA","98104"),("Miami","FL","33101"),("Miami","FL","33132"),
]
# Staffing verticals; each keys a skill pool below.
verticals = ["IT","Healthcare","Industrial","Accounting","Admin"]
# Skill pools per vertical; candidates/job orders sample subsets from these.
skills_by_vert = {
"IT": ["Java","Python","C#",".NET","JavaScript","React","Angular","Node.js","SQL","AWS","Azure","Docker","Kubernetes","Linux","Git","REST APIs","MongoDB","PostgreSQL","Redis","Terraform","Jenkins","Agile","Spring Boot","Django","Go","Rust","TypeScript","GraphQL","Microservices","CI/CD"],
"Healthcare": ["RN","LPN","CNA","BLS","ACLS","EMR","Epic","Cerner","ICD-10","CPT","HIPAA","Phlebotomy","ICU","OR","ER","Med-Surg","Pediatrics","Oncology","Telemetry","IV Therapy"],
"Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","CNC","PLC","Blueprint Reading","Quality Control","Six Sigma","AutoCAD","SolidWorks","Mechanical Assembly","Electrical","Hydraulics","Warehouse","Lean Manufacturing"],
"Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger","Financial Reporting","CPA","Payroll","Budgeting","Excel Advanced","Power BI","Tableau","GAAP","SOX","Audit"],
"Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Receptionist","Executive Assistant","Calendar Management","Salesforce","CRM","Multi-line Phone","Typing 60+ WPM","Bilingual Spanish","Notary"],
}
# Consumer domains used for candidate email addresses.
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","icloud.com","protonmail.com"]
# Anchor date; timestamps below are generated by subtracting random offsets
# from this point.
base_date = datetime(2026, 1, 1)
def make_phone():
    """Return a random US-style phone number, e.g. "(312) 555-0147"."""
    # Draw the three segments in order (area code, exchange, line number) so
    # the RNG consumption matches one call per segment.
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange}-{line}"
# Run banner + wall-clock start for the end-of-run summary.
print("=" * 60)
print("SCALE TEST: 2.5M rows + 100K vectors")
print("=" * 60)
t_start = time.time()
# ============================================================
# 100K CANDIDATES
# ============================================================
print("\nGenerating candidates (100K)...")
t0 = time.time()
N = 100_000
# Parallel column accumulators; assembled into one pyarrow table at the end.
c_ids, c_first, c_last, c_emails, c_phones = [], [], [], [], []
c_city, c_state, c_zip = [], [], []
c_vertical, c_skills, c_resume = [], [], []
c_status, c_source, c_pay, c_years = [], [], [], []
for i in range(N):
    fn = random.choice(first_names)
    ln = random.choice(last_names)
    city, state, zc = random.choice(cities_zips)
    vert = random.choice(verticals)
    # 3-10 skills from the candidate's vertical, capped at the pool size.
    sk = random.sample(skills_by_vert[vert], min(random.randint(3, 10), len(skills_by_vert[vert])))
    yrs = random.randint(0, 30)
    # FIX: the original had "{ln}{vert}" with no separator, yielding resumes
    # like "James SmithIT professional ..." — add ", " between name and vertical.
    resume = f"{fn} {ln}, {vert} professional with {yrs} years experience in {city}, {state} {zc}. Skills: {', '.join(sk)}. "
    resume += random.choice([
        f"Previously at {random.choice(['Acme','TechFlow','GlobalStaff','MedPro','BuildRight','CoreSys','Apex','Summit'])} Corp.",
        f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} in {city} metro.",
        f"Available {random.choice(['immediately','in 2 weeks','after current assignment'])}.",
        f"Open to {random.choice(['remote','hybrid','on-site'])} work arrangements.",
        f"Certified in {random.choice(sk)} with hands-on project experience.",
    ])
    c_ids.append(f"CAND-{i+1:06d}")
    c_first.append(fn)
    c_last.append(ln)
    c_emails.append(f"{fn.lower()}.{ln.lower()}{random.randint(1,99)}@{random.choice(email_domains)}")
    c_phones.append(make_phone())
    c_city.append(city)
    c_state.append(state)
    c_zip.append(zc)
    c_vertical.append(vert)
    c_skills.append("|".join(sk))
    c_resume.append(resume)
    # Weighted choice: ~60% active, ~20% inactive, ~20% placed.
    c_status.append(random.choice(["active", "active", "active", "inactive", "placed"]))
    c_source.append(random.choice(["Indeed", "LinkedIn", "Referral", "Walk-in", "Monster", "Website"]))
    c_pay.append(round(random.uniform(12, 95), 2))
    c_years.append(yrs)
candidates = pa.table({
    "candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
    "email": c_emails, "phone": c_phones,
    "city": c_city, "state": c_state, "zip": c_zip,
    "vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume,
    "status": c_status, "source": c_source, "min_pay_rate": c_pay, "years_experience": c_years,
})
upload("candidates", candidates)
print(f" Generated in {time.time()-t0:.1f}s")
# ============================================================
# 2K CLIENTS
# ============================================================
print("\nGenerating clients (2K)...")
# Company names are random "<prefix> <suffix>" pairings; collisions are fine.
prefixes = ["Apex","Summit","Core","National","Metro","Pacific","Global","United","Pinnacle","Horizon","Pioneer","Titan","Quantum","Vertex","Elite"]
suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners","Manufacturing","Healthcare"]
# Column accumulators for the clients table.
cl_ids, cl_names, cl_verts, cl_city, cl_state, cl_zip = [], [], [], [], [], []
for i in range(2000):
    city, state, zc = random.choice(cities_zips)
    cl_ids.append(f"CLI-{i+1:05d}")
    cl_names.append(f"{random.choice(prefixes)} {random.choice(suffixes)}")
    cl_verts.append(random.choice(verticals))
    cl_city.append(city)
    cl_state.append(state)
    cl_zip.append(zc)
clients = pa.table({
    "client_id": cl_ids, "company_name": cl_names, "vertical": cl_verts,
    "city": cl_city, "state": cl_state, "zip": cl_zip,
})
upload("clients", clients)
# ============================================================
# 15K JOB ORDERS
# ============================================================
print("\nGenerating job_orders (15K)...")
# Job titles available per staffing vertical.
titles_map = {
    "IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst","QA Engineer","Cloud Architect","React Developer","DBA","Security Analyst","Python Developer","Full Stack Developer"],
    "Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist","Radiology Tech","Pharmacy Tech","Medical Coder"],
    "Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech","Electrician","Warehouse Associate","Assembly Tech"],
    "Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Financial Analyst","Bookkeeper","Controller","Tax Preparer"],
    "Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager","Customer Service Rep","HR Coordinator"],
}
jo_ids, jo_clients, jo_titles, jo_verts, jo_bills, jo_pays, jo_status = [], [], [], [], [], [], []
jo_city, jo_state, jo_zip, jo_desc = [], [], [], []
for i in range(15000):
    vert = random.choice(verticals)
    title = random.choice(titles_map[vert])
    client_idx = random.randint(0, 1999)      # index into the 2K clients above
    city, state, zc = random.choice(cities_zips)
    bill = round(random.uniform(25, 150), 2)
    # Pay rate is 55-75% of the bill rate (the margin is the spread).
    pay = round(bill * random.uniform(0.55, 0.75), 2)
    req_sk = random.sample(skills_by_vert[vert], min(random.randint(3, 6), len(skills_by_vert[vert])))
    desc = f"{title} for {cl_names[client_idx]} in {city}, {state}. Requires: {', '.join(req_sk)}. {random.randint(1,10)}+ years exp. ${bill}/hr."
    jo_ids.append(f"JO-{i+1:06d}")
    jo_clients.append(cl_ids[client_idx])
    jo_titles.append(title)
    jo_verts.append(vert)
    jo_bills.append(bill)
    jo_pays.append(pay)
    # Weighted: ~40% open, ~40% filled, ~20% closed.
    jo_status.append(random.choice(["open", "open", "filled", "filled", "closed"]))
    jo_city.append(city)
    jo_state.append(state)
    jo_zip.append(zc)
    jo_desc.append(desc)
job_orders = pa.table({
    "job_order_id": jo_ids, "client_id": jo_clients, "title": jo_titles,
    "vertical": jo_verts, "bill_rate": jo_bills, "pay_rate": jo_pays,
    "status": jo_status, "city": jo_city, "state": jo_state, "zip": jo_zip,
    "description": jo_desc,
})
upload("job_orders", job_orders)
# ============================================================
# 50K PLACEMENTS
# ============================================================
print("\nGenerating placements (50K)...")
# 100 synthetic recruiter names; reused by the call/email logs below.
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(100)]
p_ids, p_cands, p_jobs, p_clients, p_bills, p_pays, p_recs, p_status = [], [], [], [], [], [], [], []
for i in range(50000):
    cand_idx = random.randint(0, N - 1)       # random candidate
    job_idx = random.randint(0, 14999)        # random job order
    p_ids.append(f"PL-{i+1:06d}")
    p_cands.append(c_ids[cand_idx])
    p_jobs.append(jo_ids[job_idx])
    # Client and rates are denormalized from the chosen job order.
    p_clients.append(jo_clients[job_idx])
    p_bills.append(jo_bills[job_idx])
    p_pays.append(jo_pays[job_idx])
    p_recs.append(random.choice(recruiters))
    # Weighted: ~40% active, ~40% completed, ~20% terminated.
    p_status.append(random.choice(["active", "active", "completed", "completed", "terminated"]))
placements = pa.table({
    "placement_id": p_ids, "candidate_id": p_cands, "job_order_id": p_jobs,
    "client_id": p_clients, "bill_rate": p_bills, "pay_rate": p_pays,
    "recruiter": p_recs, "status": p_status,
})
upload("placements", placements)
# ============================================================
# 1M TIMESHEETS
# ============================================================
print("\nGenerating timesheets (1M)...")
ts_ids, ts_placements, ts_cands, ts_clients = [], [], [], []
ts_hrs_reg, ts_hrs_ot, ts_bill_total, ts_pay_total, ts_weeks, ts_approved = [], [], [], [], [], []
for i in range(1_000_000):
    placement_idx = random.randint(0, 49999)  # pick one of the 50K placements
    # Weighted hour choices: mostly full 40h weeks, mostly zero overtime.
    reg_hours = random.choice([40.0, 40.0, 40.0, 32.0, 24.0, 20.0])
    ot_hours = random.choice([0.0, 0.0, 0.0, 4.0, 8.0, 12.0])
    bill_rate = p_bills[placement_idx]
    pay_rate = p_pays[placement_idx]
    ts_ids.append(f"TS-{i+1:07d}")
    ts_placements.append(p_ids[placement_idx])
    ts_cands.append(p_cands[placement_idx])
    ts_clients.append(p_clients[placement_idx])
    ts_hrs_reg.append(reg_hours)
    ts_hrs_ot.append(ot_hours)
    # Overtime is billed and paid at 1.5x the base rates.
    ts_bill_total.append(round(reg_hours * bill_rate + ot_hours * bill_rate * 1.5, 2))
    ts_pay_total.append(round(reg_hours * pay_rate + ot_hours * pay_rate * 1.5, 2))
    # Week-ending dates spread across the 156 weeks before base_date.
    ts_weeks.append((base_date - timedelta(weeks=random.randint(0, 156))).strftime("%Y-%m-%d"))
    ts_approved.append(random.random() < 0.85)  # ~85% approval rate
timesheets = pa.table({
    "timesheet_id": ts_ids, "placement_id": ts_placements, "candidate_id": ts_cands,
    "client_id": ts_clients, "hours_regular": ts_hrs_reg, "hours_overtime": ts_hrs_ot,
    "bill_total": ts_bill_total, "pay_total": ts_pay_total, "week_ending": ts_weeks,
    "approved": ts_approved,
})
upload("timesheets", timesheets)
# ============================================================
# 800K CALL LOG
# ============================================================
print("\nGenerating call_log (800K)...")
call_ids, call_from, call_to, call_dur, call_ts, call_rec, call_cand, call_disp = [], [], [], [], [], [], [], []
disps = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled"]
for i in range(800_000):
    cand_idx = random.randint(0, N - 1)       # target candidate for this call
    call_ids.append(f"CALL-{i+1:07d}")
    call_from.append(make_phone())            # caller number is freshly random
    call_to.append(c_phones[cand_idx])        # callee is the candidate's number
    call_dur.append(random.randint(0, 1800))  # 0-30 minutes, in seconds
    # Timestamps spread over the 365 days before base_date.
    call_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
    call_rec.append(random.choice(recruiters))
    call_cand.append(c_ids[cand_idx])
    call_disp.append(random.choice(disps))
call_log = pa.table({
    "call_id": call_ids, "from_number": call_from, "to_number": call_to,
    "duration_seconds": call_dur, "timestamp": call_ts, "recruiter": call_rec,
    "candidate_id": call_cand, "disposition": call_disp,
})
upload("call_log", call_log)
# ============================================================
# 500K EMAIL LOG
# ============================================================
print("\nGenerating email_log (500K)...")
em_ids, em_from, em_to, em_subj, em_ts, em_rec, em_cand, em_opened = [], [], [], [], [], [], [], []
# Only the first subject has a {} placeholder; .format() on the others is a
# harmless no-op that returns the string unchanged.
subjects = ["New opportunity: {}", "Following up", "Interview scheduled", "Timesheet reminder", "Background check complete", "Assignment details", "Rate update", "Welcome aboard"]
for i in range(500_000):
    cand_idx = random.randint(0, N - 1)
    job_idx = random.randint(0, 14999)
    recruiter_name = random.choice(recruiters)
    em_ids.append(f"EM-{i+1:07d}")
    # Sender address derived from the recruiter's name, e.g. "jane.doe@...".
    em_from.append(f"{recruiter_name.replace(' ','.').lower()}@acmestaffing.com")
    em_to.append(c_emails[cand_idx])
    em_subj.append(random.choice(subjects).format(jo_titles[job_idx]))
    # Timestamps spread over the 365 days before base_date.
    em_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
    em_rec.append(recruiter_name)
    em_cand.append(c_ids[cand_idx])
    em_opened.append(random.random() < 0.55)  # ~55% open rate
email_log = pa.table({
    "email_id": em_ids, "from_addr": em_from, "to_addr": em_to,
    "subject": em_subj, "timestamp": em_ts, "recruiter": em_rec,
    "candidate_id": em_cand, "opened": em_opened,
})
upload("email_log", email_log)
# Final summary: nominal row counts per table (candidates + clients +
# job_orders + placements + timesheets + call_log + email_log = 2,467,000).
total = 100_000 + 2_000 + 15_000 + 50_000 + 1_000_000 + 800_000 + 500_000
t_total = time.time() - t_start
print(f"\n{'='*60}")
print(f"LOADED: {total:,} rows in {t_total:.0f}s")
print(f"{'='*60}")