diff --git a/mcp-server/console.html b/mcp-server/console.html
index e89bcc9..eada43c 100644
--- a/mcp-server/console.html
+++ b/mcp-server/console.html
@@ -134,7 +134,7 @@ details .body{padding-top:10px;font-size:12px;color:#8b949e}
Chapter 4
Watch the system rank candidates in real time
-This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the 500K bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.
+This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.
diff --git a/mcp-server/index.ts b/mcp-server/index.ts
index 2e55018..a9ac500 100644
--- a/mcp-server/index.ts
+++ b/mcp-server/index.ts
@@ -86,7 +86,7 @@ server.tool(
server.tool(
"query_sql",
- "Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (100K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).",
+ "Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (1K), workers_500k (500K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).",
{ sql: z.string().describe("SQL query") },
async ({ sql }) => {
const r = await api("POST", "/query/sql", { sql });
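// Illustrative only, not part of this diff: one way an MCP client could exercise
// the updated query_sql table list, counting the 500K bench directly. The client
// construction and transport are assumed to exist elsewhere; only the tool name,
// argument shape, and table name come from the description above.
import type { Client } from "@modelcontextprotocol/sdk/client/index.js";

async function countBench(client: Client) {
  const result = await client.callTool({
    name: "query_sql",
    arguments: { sql: "SELECT COUNT(*) AS c FROM workers_500k" },
  });
  return result; // result.content carries the tool's text output
}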
@@ -749,12 +749,30 @@ async function main() {
});
}
- // Sample files (downloadable + fetchable from the onboard wizard)
+ // Sample CSV — generated fresh on every request so content-hash
+ // dedup on the ingest side always sees a new payload (two uploads
+ // in a row would otherwise be a no-op). Each generation has
+ // unique worker_ids (timestamp-prefixed), randomized names + roles
+ // + geos from realistic pools, and a random size (~120-180 rows)
+ // so the demo looks different every time and numbers actually
+ // update visibly in the dashboard after onboarding.
if (url.pathname.startsWith("/samples/")) {
const name = url.pathname.slice("/samples/".length);
if (!/^[a-zA-Z0-9_\-\.]+\.csv$/.test(name)) {
return err("invalid sample filename", 400);
}
+ if (name === "staffing_roster_sample.csv") {
+ const csv = generateSampleRosterCSV();
+ return new Response(csv, {
+ headers: {
+ ...cors,
+ "Content-Type": "text/csv",
+ "Content-Disposition": `attachment; filename="${name}"`,
+ "Cache-Control": "no-store",
+ },
+ });
+ }
+ // Other sample filenames fall through to the static dir
const path = `${import.meta.dir}/samples/${name}`;
const file = Bun.file(path);
if (!(await file.exists())) return err("sample not found", 404);
@@ -764,6 +782,57 @@ async function main() {
});
}
+ // System-wide scale summary — truthful numbers for the UI.
+ // Pulls row counts via SQL (COUNT(*) from parquet footers) for
+ // the key datasets rather than trusting catalog manifests, which
+ // can go stale when data changes without re-registering. The
+ // workers_500k manifest is correct (500K); candidates manifest
+ // lied (said 100K, actual 1K) — the audit caught it.
+ // Everything else uses manifest row_count since it's O(1).
+ if (url.pathname === "/system/summary") {
+ const [ds, indexes, workersCount, candsCount] = await Promise.all([
+ api("GET", "/catalog/datasets").catch(() => [] as any),
+ api("GET", "/vectors/indexes").catch(() => [] as any),
+ api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM workers_500k" })
+ .catch(() => null as any),
+ api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM candidates" })
+ .catch(() => null as any),
+ ]);
+ const datasets = Array.isArray(ds) ? ds : [];
+ const idxs = Array.isArray(indexes) ? indexes : [];
+ const workers = Number(workersCount?.rows?.[0]?.c ?? 0);
+ const candidates = Number(candsCount?.rows?.[0]?.c ?? 0);
+ // Sum manifest row_counts EXCLUDING workers_500k + candidates,
+ // then add the truthful SQL counts. This gives a total that
+ // reflects live state for the two most-quoted tables.
+ const otherManifest = datasets
+ .filter((d: any) => d?.name !== "workers_500k" && d?.name !== "candidates")
+ .reduce((s: number, d: any) => s + (d?.row_count || 0), 0);
+ const totalRows = otherManifest + workers + candidates;
+ const totalChunks = idxs.reduce((s: number, i: any) => s + (i?.chunk_count || 0), 0);
+ // Manifest drift audit — surface any cases where manifest
+ // disagrees with SQL for the two spot-checked tables so the UI
+ // can note it if ever meaningful.
+ const drift: any[] = [];
+ const workersManifest = datasets.find((d: any) => d?.name === "workers_500k")?.row_count;
+ const candidatesManifest = datasets.find((d: any) => d?.name === "candidates")?.row_count;
+ if (workersManifest !== undefined && workersManifest !== workers) {
+ drift.push({ dataset: "workers_500k", manifest: workersManifest, actual: workers });
+ }
+ if (candidatesManifest !== undefined && candidatesManifest !== candidates) {
+ drift.push({ dataset: "candidates", manifest: candidatesManifest, actual: candidates });
+ }
+ return ok({
+ datasets: datasets.length,
+ total_rows: totalRows,
+ total_chunks: totalChunks,
+ workers_500k_rows: workers,
+ candidates_rows: candidates,
+ indexes: idxs.length,
+ manifest_drift: drift,
+ });
+ }
+
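// Illustrative usage, not part of this diff: how a dashboard might read
// /system/summary. The base URL is a placeholder, and this assumes ok() above
// serializes the summary object as plain JSON; the drift example mirrors the
// candidates case noted in the comment above (manifest said 100K, SQL says 1K).
async function loadScaleSummary(baseUrl: string) {
  const res = await fetch(`${baseUrl}/system/summary`);
  const summary = await res.json();
  // e.g. summary.manifest_drift: [{ dataset: "candidates", manifest: 100000, actual: 1000 }]
  if (Array.isArray(summary?.manifest_drift) && summary.manifest_drift.length > 0) {
    console.warn("manifest drift detected:", summary.manifest_drift);
  }
  return summary; // total_rows, workers_500k_rows, candidates_rows, datasets, indexes, total_chunks
}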
// Proof JSON API (same data, no HTML)
if (url.pathname === "/proof.json") {
const ds = await api("GET", "/catalog/datasets") as any[];
@@ -1806,6 +1875,115 @@ async function runAlertsOnce() {
// Seed playbook_memory from a filled contract so the next hybrid query
// ranks against it. Used by both runWeekSimulation (per-day) and the /log
// endpoint (per manual logging). Fail-soft — seeding is best-effort.
+// ─── Sample CSV generator ───────────────────────────────────────────────
+// Fresh randomized staffing roster per request. Prevents the "upload
+// same file twice and it's a no-op" problem from the static sample,
+// and makes the dashboard numbers visibly update after onboarding.
+
+const SAMPLE_FIRST_NAMES = [
+ "Sarah","Michael","Maria","David","Jennifer","Robert","Amanda","Carlos",
+ "Kim","James","Priya","Thomas","Lisa","Brandon","Emily","Marcus","Anita",
+ "Dmitri","Rachel","Samuel","Jordan","Natalia","Henry","Ava","Tyler",
+ "Hannah","Luis","Aisha","Victor","Monica","Derek","Yuki","Fatima","Kwame",
+ "Isabel","Rafael","Elena","Hiroshi","Nadia","Oscar","Sofia","Anders",
+ "Leila","Jamal","Chioma","Pavel","Bianca","Tariq","Inez","Reuben","Mira",
+];
+const SAMPLE_LAST_NAMES = [
+ "Johnson","Chen","Rodriguez","Park","Lopez","Williams","Taylor","Mendoza",
+ "Nguyen","O'Brien","Patel","Anderson","Nakamura","Moore","Zhang","Brooks",
+ "Volkov","Kim","Thompson","Martinez","Soto","Robinson","Clark","Hayes",
+ "Reyes","Brown","Wright","Diaz","Powell","Green","Castillo","Iwu",
+ "Kowalski","Lindström","Oyelaran","Saitō","Abebe","Mehta","Blanchard",
+];
+const SAMPLE_ROLES = [
+ "Forklift Operator","Welder","Warehouse Associate","Machine Operator",
+ "Loader","Maintenance Tech","Quality Tech","Electrician","Line Lead",
+ "Material Handler","Production Worker","Assembler","Shipping Clerk",
+];
+const SAMPLE_CITY_STATE: Array<[string, string]> = [
+ ["Chicago","IL"],["Springfield","IL"],["Rockford","IL"],["Peoria","IL"],
+ ["Indianapolis","IN"],["Fort Wayne","IN"],["Evansville","IN"],["South Bend","IN"],
+ ["Columbus","OH"],["Cleveland","OH"],["Cincinnati","OH"],["Toledo","OH"],
+ ["St. Louis","MO"],["Kansas City","MO"],["Springfield","MO"],
+ ["Nashville","TN"],["Memphis","TN"],["Knoxville","TN"],
+ ["Louisville","KY"],["Lexington","KY"],
+ ["Milwaukee","WI"],["Madison","WI"],["Green Bay","WI"],
+ ["Detroit","MI"],["Grand Rapids","MI"],["Lansing","MI"],
+];
+const SAMPLE_SKILL_POOLS: Record