lakehouse/mcp-server/onboard.html
root 23eb04a145 Onboarding wizard — ingest any staffing CSV in 3 steps
New /onboard page. Client-facing wizard for getting real data into
the system without engineering help.

Flow:
1. Drop a CSV (or click 'Use the sample as my data' — ships a 25-row
   realistic staffing roster under /samples/staffing_roster_sample.csv)
2. Browser parses client-side. Columns auto-typed (text/int/decimal/
   date). PII flagged by name hint AND content regex (emails, phones).
   First rows previewed. Read-only — nothing written yet.
3. Name the dataset (lowercase+underscores). Commit.
4. Post-commit: dataset is live. Shows 4 next steps the operator can
   take (SQL query, vector index, dashboard search, playbook training).

Backend:
- /onboard serves onboard.html
- /samples/*.csv serves CSV files from mcp-server/samples/ with
  filename validation (only [a-zA-Z0-9_.-]+\.csv, prevents path traversal)
- /onboard/ingest forwards multipart/form-data to gateway /ingest/file
  preserving the boundary. The generic /api/* passthrough breaks
  multipart because it reads as text and forwards as JSON; this route
  uses arrayBuffer + original Content-Type.

Verified end-to-end: upload sample roster (25 rows, 12 columns) →
parse in browser → show columns + PII flags + preview → commit →
gateway writes Parquet, registers in catalog → immediately queryable:
  SELECT * FROM onboard_demo2 LIMIT 3
  → Sarah Johnson, Forklift Operator, Chicago, IL, 0.92
Round-trip <1 second.

Nav updated on all pages to link Onboard. Shipped with a sample CSV
so the full flow is demonstrable without real client data.

When a real client shows up, same path — they upload their CSV.
No engineering ticket, no code change, no schema pre-definition.

Security: sample filename regex prevents path traversal. CSV parse
is client-side pure JS (no DOM injection). Commit uses existing
/ingest/file validation (schema fingerprint, PII server-side,
content-hash dedup).
2026-04-20 18:13:56 -05:00

413 lines
19 KiB
HTML

<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Lakehouse — Connect Your Data</title>
<style>
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:'Inter',-apple-system,system-ui,sans-serif;background:#090c10;color:#b0b8c4;font-size:14px;line-height:1.55;-webkit-font-smoothing:antialiased}
a{color:#58a6ff;text-decoration:none}
a:hover{color:#79c0ff}
.bar{background:#0d1117;padding:0 24px;height:56px;border-bottom:1px solid #171d27;display:flex;justify-content:space-between;align-items:center;position:sticky;top:0;z-index:10}
.bar h1{font-size:14px;font-weight:600;color:#e6edf3;letter-spacing:-0.2px}
.bar nav{display:flex;gap:2px}
.bar nav a{font-size:12px;color:#545d68;padding:6px 14px;border-radius:6px;transition:all 0.15s}
.bar nav a:hover{color:#e6edf3;background:#161b22}
.bar nav a.active{color:#e6edf3;background:#1c2333}
.bar .rt{font-size:11px;color:#545d68}
.wrap{max-width:1040px;margin:0 auto;padding:28px 20px 60px}
.step{margin-bottom:36px;padding-left:44px;position:relative}
.step:before{content:attr(data-n);position:absolute;left:0;top:-2px;width:30px;height:30px;border-radius:15px;background:#0d1117;border:1px solid #21262d;color:#58a6ff;font-weight:700;font-size:13px;display:flex;align-items:center;justify-content:center}
.step.active:before{background:#1f6feb;color:#fff;border-color:#1f6feb}
.step.done:before{content:"✓";background:#0d2818;color:#3fb950;border-color:#2ea043}
.step h2{color:#e6edf3;font-size:18px;font-weight:700;margin-bottom:6px;letter-spacing:-0.2px}
.step .lede{color:#8b949e;font-size:13px;margin-bottom:14px;line-height:1.6}
.card{background:#0d1117;border:1px solid #171d27;border-radius:10px;padding:18px;margin:8px 0}
.btn{padding:10px 20px;background:#1f6feb;border:none;border-radius:8px;color:#fff;font-size:13px;font-weight:600;cursor:pointer;text-decoration:none;display:inline-block}
.btn:hover{background:#388bfd}
.btn:disabled{opacity:0.4;cursor:not-allowed}
.btn.ghost{background:transparent;border:1px solid #21262d;color:#c9d1d9}
.btn.ghost:hover{background:#161b22;border-color:#30363d}
.btn.green{background:#2ea043}
.btn.green:hover{background:#3fb950}
/* Drop zone. position:relative anchors the visually-hidden file input below. */
.drop{position:relative;border:2px dashed #30363d;border-radius:10px;padding:28px;text-align:center;transition:all 0.15s;cursor:pointer}
/* :focus-within gives keyboard users the same highlight mouse users get on hover. */
.drop:hover,.drop:focus-within{border-color:#58a6ff;background:#0d1117}
.drop.active{border-color:#2ea043;background:#0d2818}
/* Hidden visually but still focusable: display:none would remove the input from the tab order. */
.drop input[type=file]{position:absolute;width:1px;height:1px;margin:-1px;padding:0;border:0;overflow:hidden;clip-path:inset(50%);white-space:nowrap}
.drop .title{color:#e6edf3;font-weight:600;margin-bottom:4px}
.drop .sub{color:#545d68;font-size:12px}
.actions{display:flex;gap:10px;flex-wrap:wrap;margin-top:10px}
input[type=text]{padding:10px 14px;background:#161b22;border:1px solid #21262d;border-radius:8px;color:#e6edf3;font-size:13px;outline:none;width:100%}
input[type=text]:focus{border-color:#388bfd}
table.preview{width:100%;border-collapse:collapse;font-size:11px;margin-top:8px;font-family:ui-monospace,Menlo,monospace}
table.preview th{text-align:left;padding:8px 10px;color:#8b949e;background:#0d1117;border-bottom:1px solid #21262d;font-weight:600;font-size:10px;text-transform:uppercase;letter-spacing:0.8px}
table.preview td{padding:6px 10px;border-bottom:1px solid #171d27;color:#c9d1d9;white-space:nowrap;max-width:180px;overflow:hidden;text-overflow:ellipsis}
table.preview tr:hover td{background:#0d1117}
.col-list{display:flex;flex-direction:column;gap:6px;margin-top:6px}
.col{display:flex;align-items:center;gap:8px;padding:8px 12px;background:#161b22;border-radius:6px;font-size:12px}
.col .name{color:#e6edf3;font-weight:500;min-width:140px}
.col .type{color:#58a6ff;font-size:11px;font-family:ui-monospace,Menlo,monospace;min-width:60px}
.col .pii{padding:2px 8px;border-radius:9px;font-size:9px;font-weight:600;letter-spacing:0.3px}
.pii.email{background:#2d1b00;color:#d29922;border:1px solid #854d0e}
.pii.phone{background:#2d1b00;color:#d29922;border:1px solid #854d0e}
.pii.ssn{background:#3a1a1a;color:#f85149;border:1px solid #7f1d1d}
.pii.addr{background:#2d1b00;color:#d29922;border:1px solid #854d0e}
.col .sample{color:#8b949e;font-size:11px;font-family:ui-monospace,Menlo,monospace;flex:1;min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.note{color:#8b949e;font-size:12px;line-height:1.7;padding:10px 14px;border-left:2px solid #21262d;margin:10px 0}
.note strong{color:#c9d1d9}
.result{padding:16px;border-radius:10px;margin:10px 0}
.result.ok{background:#0d2818;border:1px solid #2ea04360;color:#86efac}
.result.err{background:#3a1a1a;border:1px solid #f8514960;color:#fca5a5}
.result .big{font-size:16px;font-weight:700;margin-bottom:4px}
.result .detail{font-size:12px}
.spin{display:inline-block;width:14px;height:14px;border:2px solid #30363d;border-top-color:#58a6ff;border-radius:50%;animation:spin 0.7s linear infinite;vertical-align:middle;margin-right:6px}
@keyframes spin{to{transform:rotate(360deg)}}
.footer{border-top:1px solid #171d27;padding:20px;text-align:center;color:#3d444d;font-size:11px}
@media(max-width:720px){
.wrap{padding:20px 12px 40px}
.step{padding-left:36px}
.step:before{width:26px;height:26px;font-size:12px}
.bar nav{display:none}
}
</style></head>
<body>
<div class="bar">
<h1>Lakehouse — Connect Your Data</h1>
<!-- Primary site navigation; aria-current marks the page the user is on. -->
<nav aria-label="Primary">
<a href=".">Dashboard</a>
<a href="console">Walkthrough</a>
<a href="proof">Architecture</a>
<a href="spec">Spec</a>
<a href="onboard" class="active" aria-current="page">Onboard</a>
</nav>
<div class="rt">30 minutes from CSV to live search</div>
</div>
<div class="wrap">
<div style="margin-bottom:24px">
<h1 style="color:#e6edf3;font-size:26px;font-weight:700;letter-spacing:-0.5px;margin-bottom:8px">Ingest any staffing CSV in three steps</h1>
<p style="color:#8b949e;font-size:14px;line-height:1.6;max-width:720px">
Upload your ATS export, your worker roster, or any CSV with a <code>name</code> column.
The wizard auto-detects columns, flags PII, previews the first rows, then ingests
as a queryable Parquet dataset. Everything that follows — hybrid search,
playbook ranking, pattern discovery — works against your data automatically.
</p>
</div>
<!-- Step 1 -->
<div class="step active" data-n="1" id="step-1">
<h2>Pick a file</h2>
<div class="lede">Drag a CSV in, pick from disk, or use the sample roster to see the flow without any real data.</div>
<label class="drop" id="drop">
<input type="file" id="file" accept=".csv,text/csv">
<div class="title">Drop a CSV here or click to choose</div>
<div class="sub">Max 50MB · UTF-8 · comma-separated with a header row</div>
</label>
<div class="actions">
<a class="btn ghost" href="samples/staffing_roster_sample.csv" download>Download sample roster (25 workers)</a>
<button class="btn ghost" id="use-sample" type="button">Use the sample as my data</button>
</div>
</div>
<!-- Step 2 -->
<div class="step" data-n="2" id="step-2" style="display:none">
<h2>Review what we detected</h2>
<div class="lede">
Columns auto-typed. PII columns flagged. First rows previewed. Nothing is written to the system yet — this is a read-only dry-run.
</div>
<div class="card">
<div style="color:#545d68;font-size:11px;text-transform:uppercase;letter-spacing:1.2px;font-weight:600;margin-bottom:10px">Columns detected</div>
<div class="col-list" id="col-list"></div>
</div>
<div class="card">
<div style="color:#545d68;font-size:11px;text-transform:uppercase;letter-spacing:1.2px;font-weight:600;margin-bottom:10px">First rows</div>
<div style="overflow-x:auto"><table class="preview" id="preview-table"></table></div>
<div style="color:#545d68;font-size:11px;margin-top:8px" id="row-count"></div>
</div>
<div class="note">
<strong>What happens next.</strong> On commit, the file is sent to <code>/ingest/file</code> — the same
endpoint every other ingest path uses. The Rust gateway writes it to object storage as
Parquet, computes a schema fingerprint, registers it in the catalog, and auto-detects PII
columns server-side. Re-uploading the same file is a no-op (deduplicated by content hash).
</div>
</div>
<!-- Step 3: name the dataset and commit. The label is tied to the input via for/id,
     and #commit-result is a status region so the async outcome is announced. -->
<div class="step" data-n="3" id="step-3" style="display:none">
<h2>Name it and commit</h2>
<div class="lede">Give the dataset a queryable name. This becomes the table you can <code>SELECT * FROM</code> immediately after commit.</div>
<div class="card">
<label for="dataset-name" style="display:block;color:#8b949e;font-size:12px;margin-bottom:6px">Dataset name</label>
<input type="text" id="dataset-name" placeholder="e.g. acme_staffing_roster" value="client_workers" autocomplete="off" spellcheck="false" aria-describedby="dataset-name-hint">
<div id="dataset-name-hint" style="color:#545d68;font-size:11px;margin-top:6px">
Use lowercase + underscores. Once committed: queryable via <code>/query/sql</code>,
searchable via <code>/search</code>, indexable via <code>/vectors/index</code>.
</div>
<div class="actions" style="margin-top:14px">
<button class="btn green" id="commit-btn" type="button">Commit dataset</button>
<button class="btn ghost" id="back-btn" type="button">Back</button>
</div>
</div>
<div id="commit-result" role="status"></div>
</div>
<!-- Step 4: Post-commit next steps (hidden until success) -->
<div class="step" data-n="4" id="step-4" style="display:none">
<h2>Your dataset is live</h2>
<div class="lede">From here, the rest of the system applies to your data with zero additional setup.</div>
<div class="card">
<div style="color:#e6edf3;font-size:14px;font-weight:600;margin-bottom:12px">What you can do right now</div>
<div style="display:flex;flex-direction:column;gap:10px">
<div style="padding:10px 14px;background:#161b22;border-radius:6px;font-size:12px"><strong style="color:#e6edf3">Query via SQL.</strong> <code>POST /query/sql</code> with <code>SELECT * FROM <span id="ds-name-1">your_dataset</span> LIMIT 10</code>.</div>
<div style="padding:10px 14px;background:#161b22;border-radius:6px;font-size:12px"><strong style="color:#e6edf3">Build a vector index.</strong> <code>POST /vectors/index</code> with <code>{"dataset":"<span id="ds-name-2">your_dataset</span>","text_column":"skills"}</code>. Embeddings stream in; queryable progressively.</div>
<div style="padding:10px 14px;background:#161b22;border-radius:6px;font-size:12px"><strong style="color:#e6edf3">Search via the dashboard.</strong> Open the <a href=".">dashboard</a> and the "Search all workers" box. Results will come from your data.</div>
<div style="padding:10px 14px;background:#161b22;border-radius:6px;font-size:12px"><strong style="color:#e6edf3">Track with playbook memory.</strong> Every Call/SMS/No-show click on a worker card trains the system on your data.</div>
</div>
</div>
</div>
</div>
<div class="footer">Lakehouse · onboarding wizard · <a href="spec">full architecture spec</a></div>
<script>
// Path prefix: '/lakehouse' when the page URL contains that segment, otherwise ''.
// NOTE(review): presumably this covers deployments mounted under /lakehouse — confirm.
var P=location.pathname.indexOf('/lakehouse')>=0?'/lakehouse':'';
// Base URL (origin + prefix) that the fetch() calls in this script are built on.
var A=location.origin+P;
// Parsed state written by readFile(): { name, rowCount, header, rows, columns }; null when no file is loaded.
var parsedFile=null;
// The File object that commit() uploads; set by readFile(), cleared by "Pick a different file".
var rawBlob=null;
// Create an element with an optional class string and optional text content.
// Text is assigned through textContent (never innerHTML), so values are not parsed as markup.
// A text of null/undefined leaves the element empty; any other value is stringified.
function el(tag,cls,text){
  var node=document.createElement(tag);
  if(cls){
    node.className=cls;
  }
  if(text!=null){
    node.textContent=String(text);
  }
  return node;
}
// Move the wizard to step n (1-4).
// Steps before n are shown and marked "done", step n is shown and marked "active",
// steps after n are hidden. Step elements missing from the DOM are ignored.
function setStep(n){
  [1,2,3,4].forEach(function(idx){
    var stepEl=document.getElementById('step-'+idx);
    if(!stepEl) return;
    stepEl.classList.remove('active','done');
    if(idx<n){
      stepEl.classList.add('done');
    } else if(idx===n){
      stepEl.classList.add('active');
    }
    stepEl.style.display = idx>n ? 'none' : 'block';
  });
}
// Basic CSV parser. Returns an array of rows, each row an array of string fields.
// - Quoted fields may contain commas and line breaks; "" inside quotes is a literal quote.
// - Rows end at LF, CRLF, or a lone CR. A CR-only file therefore splits into rows
//   instead of collapsing into a single line.
// - A trailing line terminator does not produce an extra empty row.
// Blank lines come back as [''] rows; callers are expected to filter them.
function parseCSV(text){
  var lines=[], row=[], cur='', inQ=false, i=0, len=text.length, c;
  while(i<len){
    c=text[i];
    if(inQ){
      if(c==='"'){
        // Doubled quote inside a quoted field is an escaped quote
        if(text[i+1]==='"'){ cur+='"'; i+=2; continue; }
        inQ=false; i++; continue;
      }
      cur+=c; i++; continue;
    }
    if(c==='"'){ inQ=true; i++; continue; }
    if(c===','){ row.push(cur); cur=''; i++; continue; }
    if(c==='\n'||c==='\r'){
      // Treat CRLF as one terminator by stepping over the LF that follows a CR
      if(c==='\r' && text[i+1]==='\n') i++;
      row.push(cur); lines.push(row); row=[]; cur=''; i++; continue;
    }
    cur+=c; i++;
  }
  // Flush the final row when the text does not end with a line terminator
  if(cur.length>0||row.length>0){ row.push(cur); lines.push(row); }
  return lines;
}
// Type + PII inference from a sample of values for a column.
//   name   - column header; matched case-insensitively against PII name hints
//   values - sample cell values; null, undefined and '' are ignored
// Returns { name, type, pii, sample }:
//   type   - 'int' | 'decimal' | 'date' | 'text' ('text' when there are no non-empty values)
//   pii    - 'email' | 'phone' | 'ssn' | 'addr' | null
//   sample - first three non-empty values joined with ' · ', or '(empty)'
// A name hint wins over content. Content is scanned only when the name gives no signal,
// and must match on more than half of the non-empty values.
function inferColumn(name, values){
  var nonEmpty=values.filter(function(v){return v!=null && v!==''});
  var n=nonEmpty.length;
  var lname=(name||'').toLowerCase();
  var isoDateRe=/^\d{4}-\d{2}-\d{2}/;
  // PII signals by name
  var pii=null;
  if(/email|e-mail|mail_address/.test(lname)) pii='email';
  else if(/phone|mobile|cell|tel/.test(lname)) pii='phone';
  else if(/ssn|social|tax_id/.test(lname)) pii='ssn';
  else if(/address|street|zip|postal/.test(lname)) pii='addr';
  // Content-based PII scan on sample
  if(!pii){
    var emailRe=/^[^\s@]+@[^\s@]+\.[^\s@]+$/;
    var phoneRe=/^[\s\-\(\)\+]*\d[\s\-\(\)\+\d]{6,}$/;
    var emailHits=nonEmpty.filter(function(v){return emailRe.test(v)}).length;
    var phoneHits=nonEmpty.filter(function(v){
      // ISO dates (2024-01-15) are digits and dashes too, so they satisfy phoneRe; exclude them
      if(isoDateRe.test(v) || !phoneRe.test(v)) return false;
      // Require a plausible digit count so "1------" style strings are not counted
      var digits=String(v).replace(/\D/g,'').length;
      return digits>=7 && digits<=15;
    }).length;
    if(emailHits>n*0.5) pii='email';
    else if(phoneHits>n*0.5) pii='phone';
  }
  // Type inference: every non-empty value must agree for a non-text type
  var type='text';
  if(n>0){
    var intHits=nonEmpty.filter(function(v){return /^-?\d+$/.test(v)}).length;
    var numHits=nonEmpty.filter(function(v){return /^-?\d+(\.\d+)?$/.test(v)}).length;
    if(intHits===n) type='int';
    else if(numHits===n) type='decimal';
    else if(nonEmpty.every(function(v){return isoDateRe.test(v)})) type='date';
  }
  return { name: name, type: type, pii: pii, sample: nonEmpty.slice(0,3).join(' · ') || '(empty)' };
}
// Read a user-supplied CSV File, parse it in the browser and advance to step 2.
//   file - File (or Blob with a name) from the picker, a drop, or the sample fetch
// On success sets the globals rawBlob and parsedFile together, renders the preview and
// shows step 2. On any failure it alerts and leaves the previously loaded file untouched,
// so rawBlob and parsedFile never describe different files.
function readFile(file){
  if(!file) return;
  // Enforce the limit advertised in the drop zone before reading the file into memory
  var MAX_BYTES=50*1024*1024;
  if(file.size>MAX_BYTES){
    alert('File is larger than 50MB.');
    return;
  }
  var reader=new FileReader();
  reader.onerror=function(){
    alert('Could not read the file.');
  };
  reader.onload=function(e){
    var text=String(e.target.result||'');
    var lines=parseCSV(text);
    var header=(lines[0]||[]).map(function(h){return String(h||'').trim()});
    // Drop rows that are entirely empty (blank lines parse as [''])
    var rows=lines.slice(1).filter(function(r){return r.length>0 && r.some(function(c){return c!=null&&c!==''})});
    // Check after filtering so a header followed only by blank lines is rejected too
    if(rows.length===0){
      alert('CSV is empty or has no data rows.');
      return;
    }
    // Build columns with inference
    var columns=header.map(function(h,idx){
      var colValues=rows.map(function(r){return r[idx]});
      return inferColumn(h, colValues.slice(0,50)); // sample first 50 for perf
    });
    // Commit both globals only once the parse has succeeded
    rawBlob=file;
    parsedFile={name: file.name, rowCount: rows.length, header: header, rows: rows, columns: columns};
    renderPreview();
    setStep(2);
  };
  reader.readAsText(file);
}
// Paint step 2 from the global parsedFile: the detected-columns list, a preview table of
// the first five data rows, and the row/column summary line. On the first call it also
// appends the "Continue to commit" / "Pick a different file" buttons to step 2.
// All text goes through el()/textContent, so CSV content is never parsed as HTML.
function renderPreview(){
  var data=parsedFile;

  // Column list: name, inferred type, optional PII chip, sample values
  var listEl=document.getElementById('col-list');
  listEl.textContent='';
  data.columns.forEach(function(col){
    var line=el('div','col');
    line.appendChild(el('span','name',col.name));
    line.appendChild(el('span','type',col.type));
    if(col.pii){
      line.appendChild(el('span','pii '+col.pii,col.pii.toUpperCase()));
    }
    line.appendChild(el('span','sample',col.sample));
    listEl.appendChild(line);
  });

  // Preview table: header row plus up to five data rows
  var tableEl=document.getElementById('preview-table');
  tableEl.textContent='';
  var headRow=document.createElement('tr');
  data.header.forEach(function(title){
    headRow.appendChild(el('th',null,title));
  });
  var head=document.createElement('thead');
  head.appendChild(headRow);
  tableEl.appendChild(head);
  var body=document.createElement('tbody');
  data.rows.slice(0,5).forEach(function(cells){
    var tr=document.createElement('tr');
    // One cell per header column; short rows are padded with empty cells
    for(var k=0;k<data.header.length;k++){
      tr.appendChild(el('td',null,cells[k]||''));
    }
    body.appendChild(tr);
  });
  tableEl.appendChild(body);

  document.getElementById('row-count').textContent=
    data.rowCount.toLocaleString()+' data rows · '+data.header.length+' columns · source: '+data.name;

  // The navigation buttons are created once and reused on later renders
  if(document.getElementById('step-2-next')) return;
  var bar=el('div','actions');
  bar.id='step-2-next';
  var goOn=el('button','btn','Continue to commit');
  goOn.type='button';
  goOn.onclick=function(){ setStep(3); };
  var again=el('button','btn ghost','Pick a different file');
  again.type='button';
  again.onclick=function(){
    parsedFile=null;
    rawBlob=null;
    document.getElementById('file').value='';
    setStep(1);
  };
  bar.appendChild(goOn);
  bar.appendChild(again);
  document.getElementById('step-2').appendChild(bar);
}
// Upload the loaded file as a named dataset.
// Validates the dataset name (lowercase letter first, then lowercase letters, digits,
// underscores), then POSTs the file as multipart/form-data to
// <base>/onboard/ingest?name=<dataset>. A 2xx response renders a success card, fills the
// dataset name into step 4 and shows it; any other status renders the status code and
// the first 400 characters of the response body; a rejected request renders an error card.
function commit(){
  var dsName=(document.getElementById('dataset-name').value||'').trim();
  if(!dsName){ alert('Dataset name required.'); return; }
  if(!/^[a-z][a-z0-9_]*$/.test(dsName)){ alert('Dataset name: lowercase letters, numbers, underscores only; start with a letter.'); return; }
  // Both globals are needed below; guard parsedFile too so a reset cannot cause a TypeError
  if(!rawBlob || !parsedFile){ alert('No file loaded.'); return; }

  var btn=document.getElementById('commit-btn');
  var out=document.getElementById('commit-result');
  // Snapshot what the result card needs: parsedFile can be replaced or cleared while the
  // request is in flight, and reading it later would throw or report the wrong file.
  var fileName=parsedFile.name;
  var rowCount=parsedFile.rowCount;
  var colCount=parsedFile.header.length;

  // Put the button back into its idle state
  function restoreButton(){
    btn.disabled=false;btn.textContent='Commit dataset';
  }
  // Replace the result area with an error card
  function showError(title, detail){
    out.textContent='';
    var err=el('div','result err');
    err.appendChild(el('div','big',title));
    err.appendChild(el('div','detail',detail));
    out.appendChild(err);
  }

  // Disable the button while the request is running to prevent double submission
  btn.disabled=true;btn.textContent='';
  btn.appendChild(el('span','spin'));
  btn.appendChild(document.createTextNode('Ingesting…'));

  var form=new FormData();
  form.append('file', rawBlob, fileName);
  fetch(A+'/onboard/ingest?name='+encodeURIComponent(dsName), {
    method: 'POST', body: form,
  }).then(function(r){
    return r.text().then(function(body){ return {status:r.status, body:body}; });
  }).then(function(res){
    restoreButton();
    if(res.status>=200 && res.status<300){
      out.textContent='';
      var ok=el('div','result ok');
      ok.appendChild(el('div','big','Dataset "'+dsName+'" is live.'));
      ok.appendChild(el('div','detail',rowCount.toLocaleString()+' rows · '+colCount+' columns · registered in catalog'));
      out.appendChild(ok);
      document.getElementById('ds-name-1').textContent=dsName;
      document.getElementById('ds-name-2').textContent=dsName;
      setStep(4);
    } else {
      showError('Ingest failed ('+res.status+')', res.body.slice(0,400));
    }
  }).catch(function(e){
    restoreButton();
    showError('Network error', (e && e.message) || String(e));
  });
}
// Wire up the wizard once the DOM is ready: file picker, drag-and-drop on the drop zone,
// the "use sample" button, and the commit / back buttons.
document.addEventListener('DOMContentLoaded',function(){
  var fileInput=document.getElementById('file');
  var drop=document.getElementById('drop');
  fileInput.addEventListener('change',function(e){
    if(e.target.files.length) readFile(e.target.files[0]);
  });
  // preventDefault on dragover is what allows the drop event to fire on this element
  drop.addEventListener('dragover',function(e){e.preventDefault();drop.classList.add('active')});
  drop.addEventListener('dragleave',function(){drop.classList.remove('active')});
  drop.addEventListener('drop',function(e){
    e.preventDefault();drop.classList.remove('active');
    if(e.dataTransfer.files.length) readFile(e.dataTransfer.files[0]);
  });
  document.getElementById('use-sample').addEventListener('click',function(){
    // Fetch the sample CSV and feed it through the same pipeline
    fetch(A+'/samples/staffing_roster_sample.csv').then(function(r){
      // fetch() resolves on HTTP errors too; without this check an error page would be parsed as CSV
      if(!r.ok) throw new Error('HTTP '+r.status);
      return r.blob();
    }).then(function(blob){
      var file=new File([blob],'staffing_roster_sample.csv',{type:'text/csv'});
      readFile(file);
    }).catch(function(e){ alert('Sample unavailable: '+e.message); });
  });
  document.getElementById('commit-btn').addEventListener('click',commit);
  document.getElementById('back-btn').addEventListener('click',function(){ setStep(2); });
});
</script>
</body></html>