From 0ff091c17356cca414e1830698125d3da41fbc57 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 19:07:47 -0500 Subject: [PATCH] =?UTF-8?q?Honesty=20fixes=20=E2=80=94=20no=20hard-coded?= =?UTF-8?q?=20counts,=20dynamic=20sample=20CSV?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - generateSampleRosterCSV(): 120-180 randomized rows per call, timestamp-prefixed IDs (no dedup on re-upload, no static 25 row lie) - /system/summary: truth via SQL COUNT(*), surfaces manifest_drift (caught candidates: manifest 100K, actual 1K) - search.html: loadSystemSummary() hydrates live counts; removed hard-coded 500K strings - MCP tool description: "candidates (100K)" → "candidates (1K)", added "workers_500k (500K)" --- mcp-server/console.html | 2 +- mcp-server/index.ts | 182 +++++++++++++++++++++++++++++++++++++++- mcp-server/search.html | 35 +++++++- 3 files changed, 212 insertions(+), 7 deletions(-) diff --git a/mcp-server/console.html b/mcp-server/console.html index e89bcc9..eada43c 100644 --- a/mcp-server/console.html +++ b/mcp-server/console.html @@ -134,7 +134,7 @@ details .body{padding-top:10px;font-size:12px;color:#8b949e}
Chapter 4

Watch the system rank candidates in real time

-
This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the 500K bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.
+
This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.
Running demo query…
diff --git a/mcp-server/index.ts b/mcp-server/index.ts index 2e55018..a9ac500 100644 --- a/mcp-server/index.ts +++ b/mcp-server/index.ts @@ -86,7 +86,7 @@ server.tool( server.tool( "query_sql", - "Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (100K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).", + "Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (1K), workers_500k (500K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).", { sql: z.string().describe("SQL query") }, async ({ sql }) => { const r = await api("POST", "/query/sql", { sql }); @@ -749,12 +749,30 @@ async function main() { }); } - // Sample files (downloadable + fetchable from the onboard wizard) + // Sample CSV — generated fresh on every request so content-hash + // dedup on the ingest side always sees a new payload (two uploads + // in a row would otherwise be a no-op). Each generation has + // unique worker_ids (timestamp-prefixed), randomized names + roles + // + geos from realistic pools, and a random size (~120-180 rows) + // so the demo looks different every time and numbers actually + // update visibly in the dashboard after onboarding. 
if (url.pathname.startsWith("/samples/")) { const name = url.pathname.slice("/samples/".length); if (!/^[a-zA-Z0-9_\-\.]+\.csv$/.test(name)) { return err("invalid sample filename", 400); } + if (name === "staffing_roster_sample.csv") { + const csv = generateSampleRosterCSV(); + return new Response(csv, { + headers: { + ...cors, + "Content-Type": "text/csv", + "Content-Disposition": `attachment; filename="${name}"`, + "Cache-Control": "no-store", + }, + }); + } + // Other sample filenames fall through to the static dir const path = `${import.meta.dir}/samples/${name}`; const file = Bun.file(path); if (!(await file.exists())) return err("sample not found", 404); @@ -764,6 +782,57 @@ async function main() { }); } + // System-wide scale summary — truthful numbers for the UI. + // Pulls row counts via SQL (COUNT(*) from parquet footers) for + // the key datasets rather than trusting catalog manifests, which + // can go stale when data changes without re-registering. The + // workers_500k manifest is correct (500K); candidates manifest + // lied (said 100K, actual 1K) — the audit caught it. + // Everything else uses manifest row_count since it's O(1). + if (url.pathname === "/system/summary") { + const [ds, indexes, workersCount, candsCount] = await Promise.all([ + api("GET", "/catalog/datasets").catch(() => [] as any), + api("GET", "/vectors/indexes").catch(() => [] as any), + api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM workers_500k" }) + .catch(() => null as any), + api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM candidates" }) + .catch(() => null as any), + ]); + const datasets = Array.isArray(ds) ? ds : []; + const idxs = Array.isArray(indexes) ? indexes : []; + const workers = Number(workersCount?.rows?.[0]?.c ?? 0); + const candidates = Number(candsCount?.rows?.[0]?.c ?? 0); + // Sum manifest row_counts EXCLUDING workers_500k + candidates, + // then add the truthful SQL counts. 
This gives a total that + // reflects live state for the two most-quoted tables. + const otherManifest = datasets + .filter((d: any) => d?.name !== "workers_500k" && d?.name !== "candidates") + .reduce((s: number, d: any) => s + (d?.row_count || 0), 0); + const totalRows = otherManifest + workers + candidates; + const totalChunks = idxs.reduce((s: number, i: any) => s + (i?.chunk_count || 0), 0); + // Manifest drift audit — surface any cases where manifest + // disagrees with SQL for the two spot-checked tables so the UI + // can note it if ever meaningful. + const drift: any[] = []; + const workersManifest = datasets.find((d: any) => d?.name === "workers_500k")?.row_count; + const candidatesManifest = datasets.find((d: any) => d?.name === "candidates")?.row_count; + if (workersManifest !== undefined && workersManifest !== workers) { + drift.push({ dataset: "workers_500k", manifest: workersManifest, actual: workers }); + } + if (candidatesManifest !== undefined && candidatesManifest !== candidates) { + drift.push({ dataset: "candidates", manifest: candidatesManifest, actual: candidates }); + } + return ok({ + datasets: datasets.length, + total_rows: totalRows, + total_chunks: totalChunks, + workers_500k_rows: workers, + candidates_rows: candidates, + indexes: idxs.length, + manifest_drift: drift, + }); + } + // Proof JSON API (same data, no HTML) if (url.pathname === "/proof.json") { const ds = await api("GET", "/catalog/datasets") as any[]; @@ -1806,6 +1875,115 @@ async function runAlertsOnce() { // Seed playbook_memory from a filled contract so the next hybrid query // ranks against it. Used by both runWeekSimulation (per-day) and the /log // endpoint (per manual logging). Fail-soft — seeding is best-effort. +// ─── Sample CSV generator ─────────────────────────────────────────────── +// Fresh randomized staffing roster per request. 
// Prevents the "upload same file twice and it's a no-op" problem from the
// static sample, and makes the dashboard numbers visibly update after
// onboarding.

// Name pools. Some last names intentionally contain apostrophes and
// diacritics; sampleEmailHandle() folds those to plain ASCII.
const SAMPLE_FIRST_NAMES = [
  "Sarah","Michael","Maria","David","Jennifer","Robert","Amanda","Carlos",
  "Kim","James","Priya","Thomas","Lisa","Brandon","Emily","Marcus","Anita",
  "Dmitri","Rachel","Samuel","Jordan","Natalia","Henry","Ava","Tyler",
  "Hannah","Luis","Aisha","Victor","Monica","Derek","Yuki","Fatima","Kwame",
  "Isabel","Rafael","Elena","Hiroshi","Nadia","Oscar","Sofia","Anders",
  "Leila","Jamal","Chioma","Pavel","Bianca","Tariq","Inez","Reuben","Mira",
];
const SAMPLE_LAST_NAMES = [
  "Johnson","Chen","Rodriguez","Park","Lopez","Williams","Taylor","Mendoza",
  "Nguyen","O'Brien","Patel","Anderson","Nakamura","Moore","Zhang","Brooks",
  "Volkov","Kim","Thompson","Martinez","Soto","Robinson","Clark","Hayes",
  "Reyes","Brown","Wright","Diaz","Powell","Green","Castillo","Iwu",
  "Kowalski","Lindström","Oyelaran","Saitō","Abebe","Mehta","Blanchard",
];
const SAMPLE_ROLES = [
  "Forklift Operator","Welder","Warehouse Associate","Machine Operator",
  "Loader","Maintenance Tech","Quality Tech","Electrician","Line Lead",
  "Material Handler","Production Worker","Assembler","Shipping Clerk",
];
const SAMPLE_CITY_STATE: Array<[string, string]> = [
  ["Chicago","IL"],["Springfield","IL"],["Rockford","IL"],["Peoria","IL"],
  ["Indianapolis","IN"],["Fort Wayne","IN"],["Evansville","IN"],["South Bend","IN"],
  ["Columbus","OH"],["Cleveland","OH"],["Cincinnati","OH"],["Toledo","OH"],
  ["St. Louis","MO"],["Kansas City","MO"],["Springfield","MO"],
  ["Nashville","TN"],["Memphis","TN"],["Knoxville","TN"],
  ["Louisville","KY"],["Lexington","KY"],
  ["Milwaukee","WI"],["Madison","WI"],["Green Bay","WI"],
  ["Detroit","MI"],["Grand Rapids","MI"],["Lansing","MI"],
];
// Role → skills that role may draw from. Keys mirror SAMPLE_ROLES.
const SAMPLE_SKILL_POOLS: Record<string, string[]> = {
  "Forklift Operator": ["pallet jack","hazmat","loading dock","overhead crane","cold storage","shipping","team lead"],
  "Welder": ["TIG","MIG","pipe welding","blueprint reading","grinder","confined space"],
  "Warehouse Associate": ["inventory","RF scanner","pick-to-light","Excel","packaging","team lead"],
  "Machine Operator": ["CNC","SPC","gauge R&R","lean manufacturing","conveyor ops","first article"],
  "Loader": ["loading dock","team lead","cold storage","first aid","bilingual"],
  "Maintenance Tech": ["electrical","PLC","hydraulics","CMMS","LOTO","troubleshooting"],
  "Quality Tech": ["ISO 9001","calibration","root cause analysis","SPC","Six Sigma"],
  "Electrician": ["conduit","motor controls","troubleshooting","PLC","NEC"],
  "Line Lead": ["team lead","training","SPC","scheduling"],
  "Material Handler": ["RF scanner","pallet jack","receiving","packaging"],
  "Production Worker": ["line work","first article","labeling","packaging","quality inspection"],
  "Assembler": ["assembly","gauge R&R","line lead","first article"],
  "Shipping Clerk": ["shipping","receiving","RF scanner","bilingual"],
};
const SAMPLE_CERT_POOL = ["OSHA-10","OSHA-30","Forklift","Hazmat","First Aid","LOTO","Confined Space","AWS D1.1","ServSafe","Six Sigma Green"];
const SAMPLE_ARCHETYPES = ["reliable","specialist","leader","communicator","flexible"];
// Hoisted out of the row loop so the random pick is driven by the array's
// own length instead of a hand-maintained literal count.
const SAMPLE_AREA_CODES = [
  "312","773","630","708","331","815","217","219","260","614","216",
  "513","419","314","816","615","901","502","414","608","313","616",
];
const SAMPLE_CSV_HEADER =
  "worker_id,name,role,city,state,email,phone,skills,certifications,availability,reliability,archetype";

/** Returns one uniformly random element of `arr` (undefined at runtime if `arr` is empty). */
function pick<T>(arr: readonly T[]): T {
  return arr[Math.floor(Math.random() * arr.length)];
}

/**
 * Returns up to `n` distinct elements of `arr` in random order.
 * Works on a copy, so `arr` itself is never mutated; yields fewer than
 * `n` items when `arr` is shorter than `n`.
 */
function pickN<T>(arr: readonly T[], n: number): T[] {
  const copy = arr.slice();
  const out: T[] = [];
  for (let i = 0; i < n && copy.length > 0; i++) {
    out.push(copy.splice(Math.floor(Math.random() * copy.length), 1)[0]);
  }
  return out;
}

/**
 * Quotes a CSV field when it contains a comma, double quote, CR or LF,
 * doubling any embedded double quotes. Other values pass through unchanged.
 */
function csvEscape(s: string): string {
  if (/[",\r\n]/.test(s)) {
    return `"${s.replace(/"/g, '""')}"`;
  }
  return s;
}

/**
 * Builds the local part of a sample email ("first.last", lowercase ASCII).
 * Decomposes accented letters (NFD) and strips the combining marks first,
 * so "Lindström" becomes "lindstrom" rather than losing the letter; any
 * remaining character outside a-z and "." (e.g. the apostrophe in
 * "O'Brien") is then removed.
 */
function sampleEmailHandle(first: string, last: string): string {
  return `${first}.${last}`
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    .replace(/[^a-z.]/g, "");
}

/**
 * Generates a fresh randomized staffing roster as CSV text.
 *
 * Each call produces a header row followed by 120-180 data rows, joined
 * with "\n" and terminated by a trailing "\n". worker_id values have the
 * form `W-<Date.now()>-<4-digit row index>`, so they are unique within a
 * call and differ between calls made at different millisecond timestamps.
 * Skills and certifications are "|"-separated lists inside one field.
 *
 * @returns The CSV document as a single string.
 */
function generateSampleRosterCSV(): string {
  const count = 120 + Math.floor(Math.random() * 61); // 120-180 inclusive
  const ts = Date.now();
  const lines: string[] = [SAMPLE_CSV_HEADER];
  for (let i = 0; i < count; i++) {
    const first = pick(SAMPLE_FIRST_NAMES);
    const last = pick(SAMPLE_LAST_NAMES);
    const role = pick(SAMPLE_ROLES);
    const [city, state] = pick(SAMPLE_CITY_STATE);
    // Numeric suffix (0-999) keeps repeated first/last pairs from always
    // sharing one address.
    const email = `${sampleEmailHandle(first, last)}${Math.floor(Math.random() * 1000)}@example.com`;
    // Line number is always four digits: 1000-9999.
    const phone = `(${pick(SAMPLE_AREA_CODES)}) 555-${String(1000 + Math.floor(Math.random() * 9000))}`;
    const skillPool = SAMPLE_SKILL_POOLS[role] ?? ["general"];
    const skills = pickN(skillPool, 2 + Math.floor(Math.random() * 3)).join("|"); // 2-4 skills
    const certs = pickN(SAMPLE_CERT_POOL, 1 + Math.floor(Math.random() * 3)).join("|"); // 1-3 certs
    const availability = (0.3 + Math.random() * 0.69).toFixed(2);
    const reliability = (0.55 + Math.random() * 0.44).toFixed(2);
    const archetype = pick(SAMPLE_ARCHETYPES);
    const fields = [
      `W-${ts}-${String(i).padStart(4, "0")}`,
      `${first} ${last}`,
      role,
      city,
      state,
      email,
      phone,
      skills,
      certs,
      availability,
      reliability,
      archetype,
    ];
    // Escape every field uniformly so a future pool entry containing a
    // comma or quote cannot silently corrupt the row.
    lines.push(fields.map(csvEscape).join(","));
  }
  return lines.join("\n") + "\n";
}

// ─── Rate/margin awareness ──────────────────────────────────────────────
// Derive implied pay and bill rates per worker / per contract without
// schema changes.
Numbers are industry heuristics — a real deployment diff --git a/mcp-server/search.html b/mcp-server/search.html index 47a5850..aeff7dd 100644 --- a/mcp-server/search.html +++ b/mcp-server/search.html @@ -127,7 +127,7 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
Live Contracts — Chicago Permits → Proposed Fills - +
Loading live contracts...
@@ -151,7 +151,7 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
Worker Search - +
Search all workers
@@ -160,14 +160,41 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
-
Staffing Co-Pilot · Hybrid SQL + Vector Search · 500K embedded profiles · Console · Architecture
+