Honesty fixes — no hard-coded counts, dynamic sample CSV

- generateSampleRosterCSV(): 120-180 randomized rows per call, timestamp-prefixed IDs (no dedup on re-upload; replaces the misleading static 25-row file)
- /system/summary: truth via SQL COUNT(*), surfaces manifest_drift (caught candidates: manifest 100K, actual 1K)
- search.html: loadSystemSummary() hydrates live counts; removed hard-coded 500K strings
- MCP tool description: "candidates (100K)" → "candidates (1K)", added "workers_500k (500K)"
This commit is contained in:
root 2026-04-20 19:07:47 -05:00
parent af3856b103
commit 0ff091c173
3 changed files with 212 additions and 7 deletions

View File

@ -134,7 +134,7 @@ details .body{padding-top:10px;font-size:12px;color:#8b949e}
<div class="chapter">
<div class="num">Chapter 4</div>
<h2>Watch the system rank candidates in real time</h2>
<div class="lede">This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the 500K bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.</div>
<div class="lede">This takes the most recent Chicago permit, derives the staffing need, pulls ranked candidates from the bench, and shows you why each one ranked. Everything below loaded in about 3 seconds against the live system.</div>
<div id="ch4-demo"><div class="loading">Running demo query…</div></div>
</div>

View File

@ -86,7 +86,7 @@ server.tool(
server.tool(
"query_sql",
"Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (100K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).",
"Run SQL against any lakehouse dataset. Tables: ethereal_workers (10K), candidates (1K), workers_500k (500K), timesheets (1M), call_log (800K), email_log (500K), placements (50K), job_orders (15K), clients (2K).",
{ sql: z.string().describe("SQL query") },
async ({ sql }) => {
const r = await api("POST", "/query/sql", { sql });
@ -749,12 +749,30 @@ async function main() {
});
}
// Sample files (downloadable + fetchable from the onboard wizard)
// Sample CSV — generated fresh on every request so content-hash
// dedup on the ingest side always sees a new payload (two uploads
// in a row would otherwise be a no-op). Each generation has
// unique worker_ids (timestamp-prefixed), randomized names + roles
// + geos from realistic pools, and a random size (~120-180 rows)
// so the demo looks different every time and numbers actually
// update visibly in the dashboard after onboarding.
if (url.pathname.startsWith("/samples/")) {
const name = url.pathname.slice("/samples/".length);
if (!/^[a-zA-Z0-9_\-\.]+\.csv$/.test(name)) {
return err("invalid sample filename", 400);
}
if (name === "staffing_roster_sample.csv") {
const csv = generateSampleRosterCSV();
return new Response(csv, {
headers: {
...cors,
"Content-Type": "text/csv",
"Content-Disposition": `attachment; filename="${name}"`,
"Cache-Control": "no-store",
},
});
}
// Other sample filenames fall through to the static dir
const path = `${import.meta.dir}/samples/${name}`;
const file = Bun.file(path);
if (!(await file.exists())) return err("sample not found", 404);
@ -764,6 +782,57 @@ async function main() {
});
}
// System-wide scale summary — truthful numbers for the UI.
// Pulls row counts via SQL (COUNT(*) from parquet footers) for
// the key datasets rather than trusting catalog manifests, which
// can go stale when data changes without re-registering. The
// workers_500k manifest is correct (500K); candidates manifest
// lied (said 100K, actual 1K) — the audit caught it.
// Everything else uses manifest row_count since it's O(1).
if (url.pathname === "/system/summary") {
// All four upstream calls run in parallel; each fails soft (empty
// list / null) so one bad dependency cannot 500 the whole summary.
const [ds, indexes, workersCount, candsCount] = await Promise.all([
api("GET", "/catalog/datasets").catch(() => [] as any),
api("GET", "/vectors/indexes").catch(() => [] as any),
api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM workers_500k" })
.catch(() => null as any),
api("POST", "/query/sql", { sql: "SELECT COUNT(*) AS c FROM candidates" })
.catch(() => null as any),
]);
// Defensive: upstream may return a non-array error envelope.
const datasets = Array.isArray(ds) ? ds : [];
const idxs = Array.isArray(indexes) ? indexes : [];
// NOTE(review): when a COUNT(*) call failed above, these coerce to 0 —
// that both undercounts total_rows and emits a spurious manifest_drift
// entry below. Consider distinguishing "query failed" from "empty".
const workers = Number(workersCount?.rows?.[0]?.c ?? 0);
const candidates = Number(candsCount?.rows?.[0]?.c ?? 0);
// Sum manifest row_counts EXCLUDING workers_500k + candidates,
// then add the truthful SQL counts. This gives a total that
// reflects live state for the two most-quoted tables.
const otherManifest = datasets
.filter((d: any) => d?.name !== "workers_500k" && d?.name !== "candidates")
.reduce((s: number, d: any) => s + (d?.row_count || 0), 0);
const totalRows = otherManifest + workers + candidates;
const totalChunks = idxs.reduce((s: number, i: any) => s + (i?.chunk_count || 0), 0);
// Manifest drift audit — surface any cases where manifest
// disagrees with SQL for the two spot-checked tables so the UI
// can note it if ever meaningful.
const drift: any[] = [];
const workersManifest = datasets.find((d: any) => d?.name === "workers_500k")?.row_count;
const candidatesManifest = datasets.find((d: any) => d?.name === "candidates")?.row_count;
if (workersManifest !== undefined && workersManifest !== workers) {
drift.push({ dataset: "workers_500k", manifest: workersManifest, actual: workers });
}
if (candidatesManifest !== undefined && candidatesManifest !== candidates) {
drift.push({ dataset: "candidates", manifest: candidatesManifest, actual: candidates });
}
// Response shape consumed by search.html's loadSystemSummary().
return ok({
datasets: datasets.length,
total_rows: totalRows,
total_chunks: totalChunks,
workers_500k_rows: workers,
candidates_rows: candidates,
indexes: idxs.length,
manifest_drift: drift,
});
}
// Proof JSON API (same data, no HTML)
if (url.pathname === "/proof.json") {
const ds = await api("GET", "/catalog/datasets") as any[];
@ -1806,6 +1875,115 @@ async function runAlertsOnce() {
// Seed playbook_memory from a filled contract so the next hybrid query
// ranks against it. Used by both runWeekSimulation (per-day) and the /log
// endpoint (per manual logging). Fail-soft — seeding is best-effort.
// ─── Sample CSV generator ───────────────────────────────────────────────
// Fresh randomized staffing roster per request. Prevents the "upload
// same file twice and it's a no-op" problem from the static sample,
// and makes the dashboard numbers visibly update after onboarding.
// Name pools for randomized roster rows. Deliberately diverse so
// generated samples look like a realistic mixed workforce.
const SAMPLE_FIRST_NAMES = [
"Sarah","Michael","Maria","David","Jennifer","Robert","Amanda","Carlos",
"Kim","James","Priya","Thomas","Lisa","Brandon","Emily","Marcus","Anita",
"Dmitri","Rachel","Samuel","Jordan","Natalia","Henry","Ava","Tyler",
"Hannah","Luis","Aisha","Victor","Monica","Derek","Yuki","Fatima","Kwame",
"Isabel","Rafael","Elena","Hiroshi","Nadia","Oscar","Sofia","Anders",
"Leila","Jamal","Chioma","Pavel","Bianca","Tariq","Inez","Reuben","Mira",
];
// Some entries carry apostrophes/diacritics — csvEscape handles the
// quoting, and the email-handle regex strips non [a-z.] characters.
const SAMPLE_LAST_NAMES = [
"Johnson","Chen","Rodriguez","Park","Lopez","Williams","Taylor","Mendoza",
"Nguyen","O'Brien","Patel","Anderson","Nakamura","Moore","Zhang","Brooks",
"Volkov","Kim","Thompson","Martinez","Soto","Robinson","Clark","Hayes",
"Reyes","Brown","Wright","Diaz","Powell","Green","Castillo","Iwu",
"Kowalski","Lindström","Oyelaran","Saitō","Abebe","Mehta","Blanchard",
];
// Light-industrial roles; each must have a matching key in
// SAMPLE_SKILL_POOLS (missing keys fall back to ["general"]).
const SAMPLE_ROLES = [
"Forklift Operator","Welder","Warehouse Associate","Machine Operator",
"Loader","Maintenance Tech","Quality Tech","Electrician","Line Lead",
"Material Handler","Production Worker","Assembler","Shipping Clerk",
];
// Midwest city/state pairs — matches the demo's Chicago-permit focus.
const SAMPLE_CITY_STATE: Array<[string, string]> = [
["Chicago","IL"],["Springfield","IL"],["Rockford","IL"],["Peoria","IL"],
["Indianapolis","IN"],["Fort Wayne","IN"],["Evansville","IN"],["South Bend","IN"],
["Columbus","OH"],["Cleveland","OH"],["Cincinnati","OH"],["Toledo","OH"],
["St. Louis","MO"],["Kansas City","MO"],["Springfield","MO"],
["Nashville","TN"],["Memphis","TN"],["Knoxville","TN"],
["Louisville","KY"],["Lexington","KY"],
["Milwaukee","WI"],["Madison","WI"],["Green Bay","WI"],
["Detroit","MI"],["Grand Rapids","MI"],["Lansing","MI"],
];
// Role → plausible skills; pickN samples 2-4 of these per row and
// joins them with "|" (the roster's multi-value separator).
const SAMPLE_SKILL_POOLS: Record<string, string[]> = {
"Forklift Operator": ["pallet jack","hazmat","loading dock","overhead crane","cold storage","shipping","team lead"],
"Welder": ["TIG","MIG","pipe welding","blueprint reading","grinder","confined space"],
"Warehouse Associate": ["inventory","RF scanner","pick-to-light","Excel","packaging","team lead"],
"Machine Operator": ["CNC","SPC","gauge R&R","lean manufacturing","conveyor ops","first article"],
"Loader": ["loading dock","team lead","cold storage","first aid","bilingual"],
"Maintenance Tech": ["electrical","PLC","hydraulics","CMMS","LOTO","troubleshooting"],
"Quality Tech": ["ISO 9001","calibration","root cause analysis","SPC","Six Sigma"],
"Electrician": ["conduit","motor controls","troubleshooting","PLC","NEC"],
"Line Lead": ["team lead","training","SPC","scheduling"],
"Material Handler": ["RF scanner","pallet jack","receiving","packaging"],
"Production Worker": ["line work","first article","labeling","packaging","quality inspection"],
"Assembler": ["assembly","gauge R&R","line lead","first article"],
"Shipping Clerk": ["shipping","receiving","RF scanner","bilingual"],
};
// Certifications sampled 1-3 per row, "|"-joined like skills.
const SAMPLE_CERT_POOL = ["OSHA-10","OSHA-30","Forklift","Hazmat","First Aid","LOTO","Confined Space","AWS D1.1","ServSafe","Six Sigma Green"];
// One archetype tag per row; consumed downstream as a free-text column.
const SAMPLE_ARCHETYPES = ["reliable","specialist","leader","communicator","flexible"];
// Uniformly choose one element from the pool. Note: an empty pool
// yields `undefined` at runtime despite the `T` return type.
function pick<T>(arr: T[]): T {
  const idx = Math.floor(Math.random() * arr.length);
  return arr[idx];
}
// Sample up to `n` distinct elements without replacement, in random
// order. The source array is never mutated; when the pool is smaller
// than `n`, every element is returned.
function pickN<T>(arr: T[], n: number): T[] {
  const pool = arr.slice();
  const chosen: T[] = [];
  while (chosen.length < n && pool.length > 0) {
    const at = Math.floor(Math.random() * pool.length);
    chosen.push(pool.splice(at, 1)[0]);
  }
  return chosen;
}
// Quote a CSV field per RFC 4180: wrap in double quotes (doubling any
// embedded quotes) when the value contains a comma, a quote, or a line
// break. Fix: the original tested only "," / '"' / "\n" and missed a
// bare carriage return ("\r"), which would split a row mid-record in
// strict CSV readers. Plain values pass through unchanged.
function csvEscape(s: string): string {
  if (/[",\n\r]/.test(s)) {
    return `"${s.replace(/"/g, '""')}"`;
  }
  return s;
}
// Build one freshly randomized roster CSV. Row count is uniform in
// [120, 180]; worker_ids carry the current epoch-ms timestamp so two
// downloads never share an ID (content-hash dedup on ingest therefore
// always sees a new payload). All text columns are drawn from the
// sample pools above and escaped via csvEscape.
function generateSampleRosterCSV(): string {
  const rowCount = 120 + Math.floor(Math.random() * 61); // 120-180 inclusive
  const stamp = Date.now();
  const areaCodes = ["312","773","630","708","331","815","217","219","260","614","216","513","419","314","816","615","901","502","414","608","313","616"];
  const out: string[] = [
    "worker_id,name,role,city,state,email,phone,skills,certifications,availability,reliability,archetype",
  ];
  for (let row = 0; row < rowCount; row++) {
    const firstName = pick(SAMPLE_FIRST_NAMES);
    const lastName = pick(SAMPLE_LAST_NAMES);
    const role = pick(SAMPLE_ROLES);
    const [city, state] = pick(SAMPLE_CITY_STATE);
    // Email handle keeps only lowercase letters and dots; a random
    // numeric suffix keeps repeated name pairs from colliding.
    const handle = `${firstName}.${lastName}`.toLowerCase().replace(/[^a-z\.]/g, "");
    const email = `${handle}${Math.floor(Math.random() * 1000)}@example.com`;
    const phone = `(${pick(areaCodes)}) 555-${String(1000 + Math.floor(Math.random() * 9000))}`;
    const skillPool = SAMPLE_SKILL_POOLS[role] || ["general"];
    const fields = [
      `W-${stamp}-${String(row).padStart(4, "0")}`,
      csvEscape(`${firstName} ${lastName}`),
      csvEscape(role),
      csvEscape(city),
      state,
      email,
      phone,
      csvEscape(pickN(skillPool, 2 + Math.floor(Math.random() * 3)).join("|")),
      csvEscape(pickN(SAMPLE_CERT_POOL, 1 + Math.floor(Math.random() * 3)).join("|")),
      (0.3 + Math.random() * 0.69).toFixed(2),  // availability: 0.30-0.99
      (0.55 + Math.random() * 0.44).toFixed(2), // reliability: 0.55-0.99
      pick(SAMPLE_ARCHETYPES),
    ];
    out.push(fields.join(","));
  }
  return out.join("\n") + "\n";
}
// ─── Rate/margin awareness ──────────────────────────────────────────────
// Derive implied pay and bill rates per worker / per contract without
// schema changes. Numbers are industry heuristics — a real deployment

View File

@ -127,7 +127,7 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
<div class="section" id="live-contracts-section">
<div class="section-header">
<span class="section-title">Live Contracts — Chicago Permits → Proposed Fills</span>
<span class="section-meta">Real public permit data + our 500K worker bench + past playbook patterns</span>
<span class="section-meta" id="live-contracts-meta">Real public permit data + worker bench + past playbook patterns</span>
</div>
<div id="live-contracts"><div class="ld">Loading live contracts...</div></div>
</div>
@ -151,7 +151,7 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
<div class="section">
<div class="section-header">
<span class="section-title">Worker Search</span>
<span class="section-meta">Natural language · 500K profiles</span>
<span class="section-meta" id="worker-search-meta">Natural language search</span>
</div>
<details class="sa" open><summary>Search all workers</summary><div class="inner">
<input type="text" id="sq" placeholder="Try: reliable forklift operator available in Nashville" onkeydown="if(event.key==='Enter')doSearch()">
@ -160,14 +160,41 @@ body{font-family:'Inter',-apple-system,system-ui,'Segoe UI',sans-serif;backgroun
<button class="sbtn" onclick="doSearch()">Find Workers</button><div id="sresults"></div></div></details>
</div>
<div class="ft">Staffing Co-Pilot · Hybrid SQL + Vector Search · 500K embedded profiles · <a href="console">Console</a> · <a href="proof">Architecture</a></div>
<div class="ft" id="footer">Staffing Co-Pilot · Hybrid SQL + Vector Search · loading scale… · <a href="console">Console</a> · <a href="proof">Architecture</a></div>
</div>
<script>
var P=location.pathname.indexOf('/lakehouse')>=0?'/lakehouse':'';
var A=location.origin+P;
var AC=['#1a2744','#1a3a2a','#2a1a3a','#3a2a1a','#1a3a3a','#2a2a1a'];
var lastQuery='';
window.addEventListener('load',function(){loadDay();loadStaffingForecast();loadLiveContracts();loadMarket();loadLearning()});
window.addEventListener('load',function(){loadSystemSummary();loadDay();loadStaffingForecast();loadLiveContracts();loadMarket();loadLearning()});
// Hydrate the page's scale figures from /system/summary so no count
// is hard-coded in the markup. Fail-soft: any fetch/parse error leaves
// the static placeholder text in place.
function loadSystemSummary(){
api('/system/summary',{}).then(function(s){
if(!s) return;
// Defaults to 0 before toLocaleString so missing fields render "0"
// rather than throwing.
var totalRows=(s.total_rows||0).toLocaleString();
var workers=(s.workers_500k_rows||0).toLocaleString();
var chunks=(s.total_chunks||0).toLocaleString();
var ds=s.datasets||0;
var meta1=document.getElementById('live-contracts-meta');
if(meta1) meta1.textContent='Real public permit data + '+workers+' worker bench + past playbook patterns';
var meta2=document.getElementById('worker-search-meta');
if(meta2) meta2.textContent='Natural language · '+workers+' profiles across '+ds+' datasets';
var foot=document.getElementById('footer');
if(foot){
// Rebuild the footer with DOM nodes (not innerHTML) so the
// server-supplied numbers are inserted as text, never as markup.
foot.textContent='';
foot.appendChild(document.createTextNode('Staffing Co-Pilot · Hybrid SQL + Vector Search · '+totalRows+' rows across '+ds+' datasets · '+chunks+' vector chunks · '));
var a1=document.createElement('a');a1.href='console';a1.textContent='Console';foot.appendChild(a1);
foot.appendChild(document.createTextNode(' · '));
var a2=document.createElement('a');a2.href='proof';a2.textContent='Architecture';foot.appendChild(a2);
}
// Also update the collapsible search box label if not yet populated
// (the regex tolerates a previously-injected number between the words).
var sum=document.querySelector('.sa summary');
if(sum&&/Search all\s*\d*\s*workers/.test(sum.textContent)){
sum.textContent='Search all '+workers+' workers';
}
}).catch(function(){/* non-fatal */});
}
function loadStaffingForecast(){
api('/intelligence/staffing_forecast',{}).then(function(r){