diff --git a/mcp-server/index.ts b/mcp-server/index.ts
index f1c09aa..b17d681 100644
--- a/mcp-server/index.ts
+++ b/mcp-server/index.ts
@@ -650,6 +650,54 @@ async function main() {
});
}
+ // Onboard — client-facing ingest wizard. Upload any CSV, preview
+ // columns + PII + sample rows, commit via /ingest/file. Works
+ // with a shipped sample roster so anyone can trial the flow
+ // without real client data.
+ if (url.pathname === "/onboard") {
+ return new Response(Bun.file(import.meta.dir + "/onboard.html"), {
+ headers: { ...cors, "Content-Type": "text/html" },
+ });
+ }
+
+ // Onboard ingest — forwards multipart/form-data to the Rust
+ // gateway's /ingest/file. The generic /api/* passthrough can't
+ // handle multipart: it reads the body as text and re-sends it as
+ // JSON, which destroys the multipart boundary. This route forwards
+ // the raw body bytes and the original Content-Type (boundary intact).
+ if (url.pathname === "/onboard/ingest" && req.method === "POST") {
+ const name = url.searchParams.get("name");
+ if (!name || !/^[a-z][a-z0-9_]*$/.test(name)) {
+ return err("dataset name required (lowercase+underscores)", 400);
+ }
+ const contentType = req.headers.get("content-type") || "";
+ const upstream = await fetch(`${BASE}/ingest/file?name=${encodeURIComponent(name)}`, {
+ method: "POST",
+ headers: { "Content-Type": contentType },
+ body: await req.arrayBuffer(),
+ });
+ const body = await upstream.text();
+ return new Response(body, {
+ status: upstream.status,
+ headers: { ...cors, "Content-Type": upstream.headers.get("content-type") || "application/json" },
+ });
+ }
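+
+ // Example commit call ($MCP stands for this server's origin; the
+ // multipart field name is illustrative, not fixed by this diff):
+ //   curl -F file=@roster.csv "$MCP/onboard/ingest?name=worker_roster"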
+
+ // Sample files (downloadable + fetchable from the onboard wizard)
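+ // e.g. GET /samples/sample_roster.csv (filename illustrative; any
+ // CSV placed in mcp-server/samples/ is served here).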
+ if (url.pathname.startsWith("/samples/")) {
+ const name = url.pathname.slice("/samples/".length);
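+ // Allow only plain CSV filenames; "/" is excluded by the pattern,
+ // so the resolved path below can't escape the samples directory.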
+ if (!/^[a-zA-Z0-9_\-\.]+\.csv$/.test(name)) {
+ return err("invalid sample filename", 400);
+ }
+ const path = `${import.meta.dir}/samples/${name}`;
+ const file = Bun.file(path);
+ if (!(await file.exists())) return err("sample not found", 404);
+ return new Response(file, {
+ headers: { ...cors, "Content-Type": "text/csv",
+ "Content-Disposition": `attachment; filename="${name}"` },
+ });
+ }
+
// Proof JSON API (same data, no HTML)
if (url.pathname === "/proof.json") {
const ds = await api("GET", "/catalog/datasets") as any[];
diff --git a/mcp-server/onboard.html b/mcp-server/onboard.html
new file mode 100644
index 0000000..60105bf
--- /dev/null
+++ b/mcp-server/onboard.html
@@ -0,0 +1,412 @@
+<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Lakehouse — Connect Your Data</title>
+</head>
+<body>
+
+<header>
+<h1>Lakehouse — Connect Your Data</h1>
+<p>30 minutes from CSV to live search</p>
+</header>
+
+<section>
+<h2>Ingest any staffing CSV in three steps</h2>
+<p>
+Upload your ATS export, your worker roster, or any CSV with a name column.
+The wizard auto-detects columns, flags PII, previews the first rows, then ingests
+as a queryable Parquet dataset. Everything that follows — hybrid search,
+playbook ranking, pattern discovery — works against your data automatically.
+</p>
+</section>
+
+<section>
+<h2>Pick a file</h2>
+<p>
+Drag a CSV in, pick from disk, or use the sample roster to see the flow without any real data.
+Columns auto-typed. PII columns flagged. First rows previewed. Nothing is written to the
+system yet — this is a read-only dry-run.
+</p>
+
+<h3>Columns detected</h3>
+<!-- column table rendered by the wizard script from the parsed CSV -->
+
+<h3>First rows</h3>
+<!-- preview table rendered by the wizard script from the parsed CSV -->
+
+<p>
+<strong>What happens next.</strong> On commit, the file is sent to <code>/ingest/file</code> — the same
+endpoint every other ingest path uses. The Rust gateway writes it to object storage as
+Parquet, computes a schema fingerprint, registers it in the catalog, and auto-detects PII
+columns server-side. Re-uploading the same file is a no-op (deduplicated by content hash).
+</p>
+</section>
+
+<section>
+<h2>Name it and commit</h2>
+<p>
+Give the dataset a queryable name. This becomes the table you can
+<code>SELECT * FROM</code> immediately after commit.
+</p>
+<p>
+Use lowercase + underscores. Once committed: queryable via <code>/query/sql</code>,
+searchable via <code>/search</code>, indexable via <code>/vectors/index</code>.
+</p>
+</section>
+
+<section>
+<h2>Your dataset is live</h2>
+<p>From here, the rest of the system applies to your data with zero additional setup.</p>
+
+<h3>What you can do right now</h3>
+<ul>
+<li><strong>Query via SQL.</strong> <code>POST /query/sql</code> with
+<code>SELECT * FROM your_dataset LIMIT 10</code>.</li>
+<li><strong>Build a vector index.</strong> <code>POST /vectors/index</code> with
+<code>{"dataset":"your_dataset","text_column":"skills"}</code>. Embeddings stream in;
+queryable progressively.</li>
+<li><strong>Search via the dashboard.</strong> Open the dashboard and the "Search all workers"
+box. Results will come from your data.</li>
+<li><strong>Track with playbook memory.</strong> Every Call/SMS/No-show click on a worker
+card trains the system on your data.</li>
+</ul>
+</section>
+
+</body>
+</html>