lakehouse/mcp-server/proof.html
root 468798c9ac /spec: technical specification — 11-chapter README-equivalent
J's ask: explain the full architecture so someone reading a README
can dispute it or recreate it. The repo isn't public yet; this page
IS the spec until it is.

Ch1 Repository layout — 13 crates + tests/multi-agent + docs + data,
    with owned responsibility and file path per crate.

Ch2 Data ingest pipeline (8 steps) — sources (file/inbox/DB/cron),
    parse+normalize with ADR-010 conservative typing, PII auto-tag,
    dedup, Parquet write, catalog register with fingerprint gate,
    mark embeddings stale, queryable immediately.

Ch3 Measurement & indexing — row count / fingerprint / owner /
    sensitivity / freshness / lineage per dataset. HNSW vs Lance
    tradeoff table with measured numbers (ADR-019). Autotune loop.
    Per-profile scoping (Phase 17).

Ch4 Contract inference from external signal — Chicago permit feed
    → role mapping → worker count heuristic → timeline → hybrid
    search with boost → pattern discovery → rendered card. All
    pre-computed before staffer opens UI.

Ch5 What a CRM can't do — 11-row comparison table of capabilities.

Ch6 How it gets better over time — three paths:
    - Phase 19 playbook boost (full math)
    - Pattern discovery meta-index
    - Autotune agent

Ch7 Scale story: 20 staffers, 300 contracts, midday +20/+1M surge
    - Async gateway + per-staffer profile isolation + client blacklists
    - 7-step surge handling flow (ingest, stale-mark, incremental refresh,
      degradation, hot-swap, autotune re-enter)
    - Known pain points: Ollama inference serial, RAM ceiling ~5M on
      HNSW (mitigated by Lance), VRAM 1-2 models sequential,
      playbook_memory unbounded.

Ch8 Error surfaces & recovery — 10-row table covering ingest schema
    conflicts, bucket failures, ghost names, dual-agent drift,
    empty searches, Ollama down, gateway restart, schema fingerprint
    divergence. Every failure has a named surface and recovery path.

Ch9 Per-staffer context — active profile, workspace, client blacklist,
    audit trail, daily summary. How 20 staffers don't see the same UI.

Ch10 Day in the life — 07:00 housekeeping → 07:30 refresh → 08:00
     staffer opens → 08:15 drill down → 08:30 Call click → 09:00
     second staffer shares memory → 12:30 surge → 14:00 no-show →
     15:00 new embeddings live → 17:00 retrospective → 22:00
     overnight trials.

Ch11 Known limits & non-goals — deferred (rate/margin, push, confidence
     calibration, neural re-ranker, pm compaction, call_log cross-ref)
     and explicitly out-of-scope (cloud, ACID, streaming, CRM replace,
     proprietary formats, hard multi-tenant).

Also: nav updated on /dashboard, /console, /proof to link /spec.
Every architectural claim in the spec cites either a code path, an
ADR number, or a phase reference so someone skeptical can target
the specific artifact.
2026-04-20 17:56:18 -05:00

455 lines
30 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Lakehouse — Architecture &amp; Reproduction</title>
<style>
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:'Inter',-apple-system,system-ui,sans-serif;background:#090c10;color:#b0b8c4;font-size:14px;line-height:1.55;-webkit-font-smoothing:antialiased}
a{color:#58a6ff;text-decoration:none}
a:hover{color:#79c0ff}
.bar{background:#0d1117;padding:0 24px;height:56px;border-bottom:1px solid #171d27;display:flex;justify-content:space-between;align-items:center;position:sticky;top:0;z-index:10}
.bar h1{font-size:14px;font-weight:600;color:#e6edf3;letter-spacing:-0.2px}
.bar nav{display:flex;gap:2px}
.bar nav a{font-size:12px;color:#545d68;padding:6px 14px;border-radius:6px;transition:all 0.15s}
.bar nav a:hover{color:#e6edf3;background:#161b22}
.bar nav a.active{color:#e6edf3;background:#1c2333}
.bar .rt{font-size:11px;color:#545d68}
.wrap{max-width:1040px;margin:0 auto;padding:28px 20px 60px}
.chapter{margin-bottom:48px}
.chapter .num{color:#545d68;font-size:11px;font-weight:600;letter-spacing:1.6px;text-transform:uppercase;margin-bottom:6px}
.chapter h2{color:#e6edf3;font-size:24px;font-weight:700;letter-spacing:-0.4px;margin-bottom:8px;line-height:1.2}
.chapter .lede{color:#8b949e;font-size:14px;margin-bottom:18px;max-width:680px;line-height:1.6}
.chapter h3{color:#e6edf3;font-size:16px;font-weight:600;margin:18px 0 6px}
.card{background:#0d1117;border:1px solid #171d27;border-radius:12px;padding:20px;margin-bottom:12px}
.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:10px}
.stat-lg{padding:18px 20px}
.stat-lg .n{font-size:26px;font-weight:800;color:#e6edf3;letter-spacing:-1px;line-height:1}
.stat-lg .l{font-size:10px;color:#545d68;text-transform:uppercase;letter-spacing:1.2px;margin-top:8px;font-weight:600}
.stat-lg .sub{font-size:12px;color:#8b949e;margin-top:4px}
.accent-l{border-left:3px solid #2ea043}
.accent-b{border-left:3px solid #1f6feb}
.accent-a{border-left:3px solid #bc8cff}
.accent-w{border-left:3px solid #d29922}
.accent-r{border-left:3px solid #f85149}
.row{display:flex;justify-content:space-between;align-items:center;gap:12px;padding:10px 14px;background:#0d1117;border:1px solid #171d27;border-radius:8px;margin-bottom:6px;font-size:13px}
.row:hover{border-color:#21262d}
.row .title{color:#e6edf3;font-weight:500}
.row .meta{color:#8b949e;font-size:11px;margin-top:2px;font-family:ui-monospace,Menlo,monospace}
.row .val{color:#58a6ff;font-weight:600;white-space:nowrap;font-family:ui-monospace,Menlo,monospace}
.row.pass{border-left:3px solid #2ea043}
.row.fail{border-left:3px solid #f85149}
table.plain{width:100%;border-collapse:collapse;font-size:12px;margin-top:8px}
table.plain th{text-align:left;padding:8px 12px;color:#545d68;font-weight:600;text-transform:uppercase;font-size:10px;letter-spacing:0.8px;border-bottom:1px solid #171d27}
table.plain td{padding:8px 12px;border-bottom:1px solid #171d27;color:#c9d1d9;font-family:ui-monospace,Menlo,monospace}
table.plain tr:hover td{background:#0d1117}
code{background:#161b22;color:#e6edf3;padding:2px 6px;border-radius:4px;font-family:ui-monospace,Menlo,monospace;font-size:12px}
pre{background:#161b22;border:1px solid #171d27;border-radius:8px;padding:14px 16px;overflow-x:auto;font-family:ui-monospace,Menlo,monospace;font-size:12px;color:#c9d1d9;line-height:1.5;margin:8px 0}
.narr{color:#8b949e;font-size:13px;line-height:1.7;margin:10px 0;padding:10px 14px;border-left:2px solid #21262d}
.narr strong{color:#c9d1d9;font-weight:600}
.ref{color:#545d68;font-size:11px;margin-top:6px;font-family:ui-monospace,Menlo,monospace}
.ref strong{color:#79c0ff;font-weight:600}
.math{background:#0d1117;border:1px solid #171d27;border-radius:8px;padding:14px 16px;font-family:ui-monospace,Menlo,monospace;font-size:13px;color:#e6edf3;margin:8px 0}
.math .c{color:#8b949e}
.footer{border-top:1px solid #171d27;padding:20px;text-align:center;color:#3d444d;font-size:11px}
.loading{color:#484f58;font-style:italic;padding:20px 0;text-align:center}
.err{color:#f85149;font-size:12px;padding:10px}
@media(max-width:720px){
.wrap{padding:20px 12px 40px}
.chapter h2{font-size:20px}
.bar nav{display:none}
}
</style></head>
<body>
<div class="bar">
<h1>Lakehouse — Architecture &amp; Reproduction</h1>
<nav>
<a href=".">Dashboard</a>
<a href="console">Walkthrough</a>
<a href="proof" class="active">Architecture</a>
<a href="spec">Spec</a>
</nav>
<div class="rt" id="hdr-time">Running live tests…</div>
</div>
<div class="wrap">
<div class="chapter">
<div class="num">Chapter 1</div>
<h2>Receipts, not promises</h2>
<div class="lede">Every test below ran live against the real gateway when you loaded this page. Sub-100ms SQL on multi-million-row Parquet, hybrid search with playbook boost applied. No fixtures. If a test fails, you'll see ✗.</div>
<div id="ch1-tests"><div class="loading">Running tests…</div></div>
</div>
<div class="chapter">
<div class="num">Chapter 2</div>
<h2>Architecture — 13 crates, one object store, one local AI runtime</h2>
<div class="lede">Request flows top to bottom. Every node is independently swappable. Every line is a real HTTP or gRPC hop that you can trace with <code>tcpdump</code>.</div>
<div class="card accent-b">
<pre> HTTP :3100 + gRPC :3101
┌───────▼───────┐
│ gateway │ Rust · Axum · routing, CORS, auth, tools
└───────┬───────┘
┌────────────┬───────────┼───────────┬────────────┐
│ │ │ │ │
┌────▼───┐ ┌────▼───┐ ┌────▼───┐ ┌────▼───┐ ┌────▼───┐
│catalog │ │ query │ │ vector │ │ ingest │ │aibridge│
│ d │ │ d │ │ d │ │ d │ │ │
└────┬───┘ └────┬───┘ └────┬───┘ └────┬───┘ └────┬───┘
│ │ │ │ │
└────────────┴───────────┼───────────┴────────────┘
┌─────────────────┐
│ object storage │ Parquet files (local / S3)
└─────────────────┘
┌───────┴────────┐
│ Python sidecar │ FastAPI → Ollama
│ (aibridge) │ local models only
└────────────────┘</pre>
</div>
<h3>Per-crate responsibility</h3>
<table class="plain">
<thead><tr><th>Crate</th><th>Role</th><th>Path</th></tr></thead>
<tbody>
<tr><td>shared</td><td>Types, errors, Arrow helpers, PII detection, secrets provider</td><td>crates/shared/</td></tr>
<tr><td>storaged</td><td>object_store I/O, BucketRegistry (multi-bucket), AppendLog, ErrorJournal</td><td>crates/storaged/</td></tr>
<tr><td>catalogd</td><td>Metadata authority — manifests, views, tombstones, profiles, schema fingerprints</td><td>crates/catalogd/</td></tr>
<tr><td>queryd</td><td>DataFusion SQL engine, MemTable cache, delta merge-on-read, compaction</td><td>crates/queryd/</td></tr>
<tr><td>ingestd</td><td>CSV/JSON/PDF(+OCR)/Postgres/MySQL ingest, cron schedules, auto-PII</td><td>crates/ingestd/</td></tr>
<tr><td>vectord</td><td>Embeddings as Parquet, HNSW, trial system, autotune agent, playbook_memory</td><td>crates/vectord/</td></tr>
<tr><td>vectord-lance</td><td>Firewall crate — Lance 4.0 + Arrow 57 isolated from main Arrow 55</td><td>crates/vectord-lance/</td></tr>
<tr><td>journald</td><td>Append-only mutation event log for time-travel &amp; audit</td><td>crates/journald/</td></tr>
<tr><td>aibridge</td><td>Rust↔Python sidecar, Ollama HTTP client, VRAM introspection</td><td>crates/aibridge/</td></tr>
<tr><td>gateway</td><td>Axum HTTP :3100 + gRPC :3101, middleware, tools registry</td><td>crates/gateway/</td></tr>
<tr><td>ui</td><td>Dioxus WASM internal developer UI</td><td>crates/ui/</td></tr>
<tr><td>mcp-server</td><td>Bun TypeScript recruiter-facing app (this server)</td><td>mcp-server/</td></tr>
</tbody>
</table>
<div class="ref"><strong>Source:</strong> git.agentview.dev/profit/lakehouse &nbsp;·&nbsp; <strong>ADRs:</strong> docs/DECISIONS.md (currently 20 records)</div>
</div>
<div class="chapter">
<div class="num">Chapter 3</div>
<h2>Dual-agent recursive consensus loop</h2>
<div class="lede">The system we use to execute staffing fills is a dual-agent recursive protocol. Two agents with distinct roles iterate against a shared log until one of three terminal states is reached. It is deterministic in structure, stochastic in content, and verifiable through the per-run log artifact.</div>
<h3>Agents and protocol</h3>
<div class="card accent-a">
<pre> task in
┌───────────────────────────────────────────────────────────┐
│ EXECUTOR (mistral:latest) │
│ ──────────────────────────────────────────────────────── │
│ input: task spec + shared log + seen-candidates ledger │
│ output: one JSON action per turn │
│ · {kind:"plan",steps:[…]} │
│ · {kind:"tool_call",tool,args,rationale} │
│ · {kind:"propose_done",fills:[N of N]} │
└───────────┬───────────────────────────────┬───────────────┘
│ tool_call │ propose_done
▼ │
┌──────────────────────────┐ │
│ TOOL DISPATCH │ │
│ hybrid_search / sql │ │
│ (against live gateway) │ │
└──────────┬───────────────┘ │
│ result (trimmed, exclusions) │
▼ ▼
┌───────────────────────────────────────────────────────────┐
│ REVIEWER (qwen2.5:latest) │
│ ──────────────────────────────────────────────────────── │
│ input: task spec + shared log (including tool result) │
│ output: {kind:"critique",verdict:"continue|drift| │
│ approve_done",notes} │
└───────────┬───────────────────────────────────────────────┘
┌─────┴─────┐
▼ ▼ ▼
continue drift approve_done + propose_done ⟹ SEAL
(next turn) (cap ≈ 3 →
hard abort)
</pre>
</div>
<div class="ref"><strong>Code:</strong> tests/multi-agent/agent.ts (protocol + prompts) &nbsp;·&nbsp; tests/multi-agent/orchestrator.ts (run loop) &nbsp;·&nbsp; tests/multi-agent/scenario.ts (5-event warehouse week)</div>
<h3>Why "dual" — role specialization</h3>
<div class="narr">
<strong>The executor is an optimist.</strong> Its job is to produce progress: pull candidates, verify SQL, propose consensus. It's instructed to be decisive.
<br><br>
<strong>The reviewer is a pessimist.</strong> Its job is to catch drift: proposals that don't match the task's geography, fill count, or role. It's authorized to stop the loop.
<br><br>
This adversarial separation is cheaper and more deterministic than asking a single model to self-critique. The reviewer has a hard rule: on the turn after a <code>propose_done</code>, it MUST emit either <code>approve_done</code> or <code>drift</code> — it cannot stall with <code>continue</code>.
</div>
<h3>Why "parallel" — orchestrator can fan out</h3>
<div class="narr">
<strong>Independent pairs run concurrently.</strong> <code>tests/multi-agent/run_e2e_rated.ts</code> runs two task-specific agent pairs via <code>Promise.all</code>. Ollama serializes inference at the model level, so "parallel" is concurrent orchestration — but the substrate (gateway, queryd, vectord) handles concurrent requests cleanly. Verified in the scenario harness: two contracts sealing simultaneously.
</div>
<h3>Why "recursive" — each seal feeds the next</h3>
<div class="narr">
<strong>Consensus does not end at the sealed playbook.</strong> Every sealed playbook is persisted to <code>playbook_memory</code> via <code>POST /vectors/playbook_memory/seed</code>. The next hybrid search for a semantically similar operation consults that memory via <code>compute_boost_for(query_embedding, top_k, base_weight)</code> and re-ranks the candidate pool. The system builds on itself turn over turn, playbook over playbook.
</div>
<h3>Termination guarantees</h3>
<div class="math">
<span class="c">// three paths out, every run has one of these:</span><br>
sealed = executor.propose_done ∧ reviewer.approve_done ∧ fills.count == target<br>
abort = consecutive_tool_errors ≥ MAX_TOOL_ERRORS (3) &nbsp;&nbsp;<span class="c">// executor can't form a valid call</span><br>
abort = consecutive_drifts ≥ MAX_CONSECUTIVE_DRIFTS (3) &nbsp;<span class="c">// reviewer keeps flagging</span><br>
abort = turn &gt; MAX_TURNS (12) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="c">// no consensus reached in window</span>
</div>
<div class="narr">Every abort dumps the full log to <code>tests/multi-agent/playbooks/&lt;id&gt;-FAILED.json</code> for forensic review. No consensus is ever implicit.</div>
</div>
<div class="chapter">
<div class="num">Chapter 4</div>
<h2>Playbook memory — the compounding feedback loop</h2>
<div class="lede">A CRM stores events. This system turns events into re-ranking signal. Every sealed playbook endorses specific (worker, city, state) tuples. Every failure penalizes them. Every similar future query inherits the signal through cosine similarity.</div>
<h3>Seed shape</h3>
<div class="math">
PlaybookEntry {<br>
&nbsp;&nbsp;playbook_id, <span class="c">// pb-seed-&lt;sha8&gt;</span><br>
&nbsp;&nbsp;operation, <span class="c">// "fill: Welder x2 in Toledo, OH"</span><br>
&nbsp;&nbsp;approach, context, <span class="c">// short canonical — long strings dilute embedding</span><br>
&nbsp;&nbsp;timestamp, <span class="c">// RFC3339</span><br>
&nbsp;&nbsp;endorsed_names[], <span class="c">// validated against workers_500k for city+state</span><br>
&nbsp;&nbsp;city, state, <span class="c">// parsed from operation</span><br>
&nbsp;&nbsp;embedding <span class="c">// 768-d nomic-embed-text of text shape</span><br>
}
</div>
<div class="ref"><strong>Code:</strong> crates/vectord/src/playbook_memory.rs (PlaybookEntry, FailureRecord, PlaybookMemoryState)</div>
<h3>Boost math (positive + decay + negative)</h3>
<div class="math">
<span class="c">// For each playbook pb among top-K most cosine-similar:</span><br>
<span class="c">// given query embedding qv, constant base_weight, n_workers = |pb.endorsed_names|</span><br>
<br>
similarity = cosine(qv, pb.embedding) &nbsp;&nbsp; <span class="c">// skip if ≤ 0.05</span><br>
age_days = (now - pb.timestamp) / 86_400 seconds<br>
decay = e<sup>-age_days / 30</sup> &nbsp;&nbsp;<span class="c">// half-life = 30 days</span><br>
<br>
<span class="c">// For each endorsed worker in pb:</span><br>
key = (pb.city, pb.state, name)<br>
fail_count = failures[key] &nbsp;&nbsp;<span class="c">// # times this worker was marked no-show for same geo</span><br>
penalty = 0.5<sup>min(fail_count, 20)</sup><br>
<br>
per_worker = similarity × base_weight × decay × penalty / n_workers<br>
boost[key] = min(boost[key] + per_worker, MAX_BOOST_PER_WORKER)<br>
<br>
<span class="c">// MAX_BOOST_PER_WORKER = 0.25 — cap stops one popular worker from always winning</span>
</div>
<div class="ref"><strong>Code:</strong> crates/vectord/src/playbook_memory.rs::compute_boost_for &nbsp;·&nbsp; constants: MAX_BOOST_PER_WORKER, DEFAULT_TOP_K_PLAYBOOKS, BOOST_HALF_LIFE_DAYS</div>
<h3>Application at query time</h3>
<div class="math">
<span class="c">// In /vectors/hybrid handler (crates/vectord/src/service.rs):</span><br>
1. SQL filter narrows workers_500k to geo/role/availability<br>
2. Vector index returns top_k × 5 candidates by cosine to question<br>
3. compute_boost_for(qv, k=200) returns boost map<br>
4. For each candidate: parse (name, city, state) from chunk, look up boost, add to score<br>
5. Re-sort sources by boosted score<br>
6. Truncate to requested top_k, return with playbook_boost and playbook_citations
</div>
<div class="narr">
<strong>Why k=200.</strong> Direct measurement showed cosine similarity clusters in the 0.55-0.67 band across all playbooks regardless of geo (nomic-embed-text has narrow discrimination on this kind of structured operation text). A k of 25 silently missed geo-matched playbooks. k=200 is the measured floor for reliably catching compounding. Brute-force over 200 × 768-d is sub-ms even on this hardware.
</div>
<div class="ref"><strong>Evidence:</strong> Chicago Electrician compounding test 2026-04-20 — Carmen Green, Anna Patel, Fatima Wilson went from rank &gt;5 / boost 0 / 0 citations (run 0, no seed) to rank 1/2/3 / boost +0.250 (capped) / 3 citations each (run 3, after 3 identical seeds). Each seed increments citations; total boost caps at 0.25/worker.</div>
<h3>Write-through to SQL</h3>
<div class="narr">
<strong>successful_playbooks_live</strong> is a DataFusion-queryable Parquet surface maintained by <code>POST /vectors/playbook_memory/persist_sql</code>. Every <code>/log</code> from the recruiter UI triggers seed → persist_sql. The in-memory store and the SQL surface stay synchronized (full snapshot on each persist, safe because memory is source of truth).
</div>
<div class="ref"><strong>Code:</strong> crates/vectord/src/playbook_memory.rs::persist_to_sql &nbsp;·&nbsp; catalog-registered under "successful_playbooks_live"</div>
<h3>Pattern discovery (Path 2 — meta-index)</h3>
<div class="narr">
<strong>Beyond "who was endorsed."</strong> <code>POST /vectors/playbook_memory/patterns</code> takes a query, finds top-K similar past playbooks, pulls each endorsed worker's full workers_500k profile, and aggregates shared traits: recurring certifications, skill frequencies, modal archetype, reliability distribution. Returns a <code>discovered_pattern</code> string showing operator-actionable signal the user didn't explicitly query for.
</div>
<div class="ref"><strong>Code:</strong> crates/vectord/src/playbook_memory.rs::discover_patterns &nbsp;·&nbsp; <strong>Surfaces:</strong> /vectors/playbook_memory/patterns endpoint, /intelligence/chat response, /intelligence/permit_contracts cards</div>
</div>
<div class="chapter">
<div class="num">Chapter 5</div>
<h2>Key architectural choices — what was picked and why</h2>
<div class="lede">Each choice is documented in <code>docs/DECISIONS.md</code> (Architecture Decision Records). If you dispute any of these, the ADR names the alternatives we rejected and the measurement that drove the call.</div>
<div class="card">
<div class="row accent-b">
<div style="flex:1"><div class="title">ADR-001 · Object storage as source of truth</div><div class="meta">No traditional database. All data is Parquet on S3-compatible object storage. Eliminates DB operational overhead; every engine can read Parquet.</div></div>
</div>
<div class="row accent-b">
<div style="flex:1"><div class="title">ADR-008 · Embeddings stored as Parquet, not a vector DB</div><div class="meta">Keeps all data in one portable format. No Pinecone/Weaviate/Qdrant lock-in. Trade-off: brute-force search up to ~100K; HNSW beyond.</div></div>
</div>
<div class="row accent-l">
<div style="flex:1"><div class="title">ADR-012 · Append-only event journal — never destroy evidence</div><div class="meta">Every mutation is appended. Compliance, audit, AI-decision forensics. Impossible to retrofit; easy to add now.</div></div>
</div>
<div class="row accent-a">
<div style="flex:1"><div class="title">ADR-015 · Tool registry before raw SQL for agents</div><div class="meta">Named, governed, audited actions for agents. Permission checks, rate limits, parameter validation. MCP-compatible.</div></div>
</div>
<div class="row accent-w">
<div style="flex:1"><div class="title">ADR-019 · Hybrid Parquet+HNSW ⊕ Lance vector backend</div><div class="meta">Parquet+HNSW primary (2.55× faster search at 100K). Lance secondary for index-build speed (14× faster), random fetch (112× faster), append (structural). Per-profile <code>vector_backend: Parquet | Lance</code>.</div></div>
</div>
<div class="row accent-r">
<div style="flex:1"><div class="title">ADR-020 · Idempotent register() with schema-fingerprint gate</div><div class="meta">Same (name, fingerprint) reuses manifest. Different fingerprint = 409 Conflict. Prevents silent duplicate manifests. Cleanup run collapsed 374 → 31 datasets.</div></div>
</div>
<div class="row accent-l">
<div style="flex:1"><div class="title">Phase 19 design note · Statistical + semantic, not neural</div><div class="meta">Meta-index is cosine similarity + endorsement aggregation. No model training. Rebuildable from <code>successful_playbooks</code> alone. Neural re-ranker deferred to Phase 20+ only if statistical floor plateaus.</div></div>
</div>
</div>
</div>
<div class="chapter">
<div class="num">Chapter 6</div>
<h2>Measured at scale, on this machine</h2>
<div class="lede">Hardware: i9 + 128GB RAM + Nvidia A4000 16GB VRAM. Numbers below are from <em>this</em> running instance. Refresh the page and they'll recompute.</div>
<div class="grid" id="ch6-scale"><div class="loading">Loading scale data…</div></div>
<div id="ch6-recall" style="margin-top:10px"></div>
</div>
<div class="chapter">
<div class="num">Chapter 7</div>
<h2>Verify or dispute — reproduce it yourself</h2>
<div class="lede">Every claim below is a curl away from falsification.</div>
<div class="card">
<div class="narr"><strong>Health.</strong> Should return <code>lakehouse ok</code>.</div>
<pre>curl http://localhost:3100/health</pre>
<div class="narr"><strong>Any SQL on multi-million-row Parquet.</strong> Sub-100ms typical.</div>
<pre>curl -s -X POST http://localhost:3100/query/sql \
-H 'Content-Type: application/json' \
-d '{"sql":"SELECT role, COUNT(*) FROM workers_500k WHERE state=\"IL\" GROUP BY role LIMIT 5"}'</pre>
<div class="narr"><strong>Hybrid search with playbook boost.</strong> The whole Phase 19 feedback loop in one request.</div>
<pre>curl -s -X POST http://localhost:3100/vectors/hybrid \
-H 'Content-Type: application/json' \
-d '{"index_name":"workers_500k_v1",
"sql_filter":"role = '\''Forklift Operator'\'' AND city = '\''Chicago'\'' AND CAST(availability AS DOUBLE) > 0.5",
"question":"reliable forklift operator",
"top_k":5,"use_playbook_memory":true,"playbook_memory_k":200}'</pre>
<div class="narr"><strong>Playbook memory stats.</strong> Count + endorsed names + sample.</div>
<pre>curl http://localhost:3100/vectors/playbook_memory/stats</pre>
<div class="narr"><strong>Pattern discovery.</strong> What do past similar fills have in common?</div>
<pre>curl -s -X POST http://localhost:3100/vectors/playbook_memory/patterns \
-H 'Content-Type: application/json' \
-d '{"query":"Forklift Operator in Chicago, IL","top_k_playbooks":25,"min_trait_frequency":0.3}'</pre>
<div class="narr"><strong>Run the dual-agent scenario yourself.</strong> All 5 events, real fills, real artifacts.</div>
<pre>cd /home/profit/lakehouse
bun run tests/multi-agent/scenario.ts
# Output: tests/multi-agent/playbooks/scenario-&lt;timestamp&gt;/report.md</pre>
</div>
</div>
<div class="chapter">
<div class="num">Chapter 8</div>
<h2>What we are <em>not</em> claiming</h2>
<div class="lede">Every impressive-sounding number comes with a footnote. Here are the honest limits.</div>
<div class="card">
<div class="row accent-a"><div style="flex:1"><div class="title">workers_500k is synthetic.</div><div class="meta">Real client ATS export replaces this table. Schema is deliberately identical to a production ATS.</div></div></div>
<div class="row accent-a"><div style="flex:1"><div class="title">candidates table has 1,000 rows.</div><div class="meta">Intentionally small for demo. call_log references higher candidate_ids that don't cross-reference — this is a dataset alignment issue, not a pipeline issue.</div></div></div>
<div class="row accent-b"><div style="flex:1"><div class="title">Chicago permit data is real.</div><div class="meta">Pulled live from data.cityofchicago.org/resource/ydr8-5enu.json (Socrata API). Not synthetic. Not cached.</div></div></div>
<div class="row accent-l"><div style="flex:1"><div class="title">Playbook memory is seeded from demo runs.</div><div class="meta">The pipeline that seeds it is identical to what a live recruiter would trigger via /log. Same code path.</div></div></div>
<div class="row accent-w"><div style="flex:1"><div class="title">Local 7B models (mistral, qwen2.5) are imperfect.</div><div class="meta">They occasionally malform tool calls or drop fields. Multi-agent scenarios seal roughly 40-80% in one run. Larger models or constrained decoding would improve this. Not a substrate problem.</div></div></div>
<div class="row accent-r"><div style="flex:1"><div class="title">No rate/margin awareness yet.</div><div class="meta">Worker pay expectations vs contract bill rates are not modeled. Flagged as a Phase 20 item; no architectural blocker.</div></div></div>
</div>
</div>
</div>
<div class="footer">Lakehouse · Architecture page regenerates tests on every load · <a href="console">walkthrough</a> · <a href=".">dashboard</a></div>
<script>
// Base path detection: when the page is served under a /lakehouse prefix,
// keep that prefix on every API call; otherwise talk to the origin root.
var P = location.pathname.includes('/lakehouse') ? '/lakehouse' : '';
var A = location.origin + P;
// Tiny DOM factory: create an element of `tag`, optionally assign a class
// name, and optionally set its text content (anything non-null/undefined
// is stringified). Returns the new, detached node.
function el(tag, cls, text){
  var node = document.createElement(tag);
  if (cls) {
    node.className = cls;
  }
  var hasText = text !== undefined && text !== null;
  if (hasText) {
    node.textContent = String(text);
  }
  return node;
}
// POST a JSON body to the gateway (A = origin + optional /lakehouse prefix)
// and resolve with the parsed JSON response.
// Fix: reject with a descriptive Error on a non-2xx status. Previously a
// failing endpoint (e.g. an HTML 500 page) surfaced as an opaque JSON-parse
// rejection in the .catch handler instead of the real HTTP failure.
function apiPost(path, body){
  return fetch(A + path, {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(body || {})
  }).then(function(r){
    if (!r.ok) throw new Error('HTTP ' + r.status + ' from ' + path);
    return r.json();
  });
}
// Kick off the live test run once the page has fully loaded.
window.addEventListener('load', loadLiveSections);
// Fetch the live proof payload and render Chapters 1 and 6.
// Expected response shape (every field optional, guarded below):
//   tests[]   — {name, pass, result, ms}
//   scale     — {total_rows, datasets, total_chunks, indexes}
//   gpu       — {name, vram_used_mib, vram_total_mib}
//   recall    — {hnsw, note}
//   lance_10m — {vectors, disk_gb, search_p50_ms, note}
//   generated — timestamp parsable by new Date()
function loadLiveSections(){
  apiPost('/proof.json', {}).then(function(r){
    renderTests(r.tests || []);
    renderScale(r.scale || {}, r.gpu || {}, r.recall);
    renderLanceHeadroom(r.lance_10m);
    document.getElementById('hdr-time').textContent =
      'Generated · ' + (r.generated ? new Date(r.generated).toLocaleTimeString() : 'live');
  }).catch(function(e){
    // Network failure, HTTP error, or JSON-parse failure all land here.
    var host1 = document.getElementById('ch1-tests');
    host1.textContent = '';
    host1.appendChild(el('div', 'err', 'Live tests failed: ' + (e.message || e) + '. Server may be offline.'));
  });
}
// Chapter 1: one pass/fail row per live test.
function renderTests(tests){
  var host = document.getElementById('ch1-tests');
  host.textContent = '';
  tests.forEach(function(t){
    var row = el('div', 'row ' + (t.pass ? 'pass' : 'fail'));
    var left = document.createElement('div');
    left.style.flex = '1';
    left.style.minWidth = '0';
    var resultStr = '';
    if (typeof t.result === 'object' && t.result) resultStr = JSON.stringify(t.result);
    else if (t.result !== undefined && t.result !== null) resultStr = String(t.result);
    left.appendChild(el('div', 'title', (t.pass ? '✓ ' : '✗ ') + (t.name || '(unnamed)')));
    left.appendChild(el('div', 'meta', resultStr));
    row.appendChild(left);
    row.appendChild(el('div', 'val', (t.ms || 0) + ' ms'));
    host.appendChild(row);
  });
}
// Chapter 6 grid: rows / chunks / VRAM stats, plus HNSW recall when reported.
function renderScale(scale, gpu, recall){
  var host = document.getElementById('ch6-scale');
  host.textContent = '';
  addStat(host, (scale.total_rows || 0).toLocaleString(), 'Rows under management',
    'across ' + (scale.datasets || 0) + ' datasets', 'accent-b');
  addStat(host, (scale.total_chunks || 0).toLocaleString(), 'Vector chunks indexed',
    'across ' + (scale.indexes || 0) + ' HNSW + Lance indexes', 'accent-a');
  var vramStr = (gpu.vram_used_mib !== undefined
    ? gpu.vram_used_mib + ' / ' + gpu.vram_total_mib + ' MiB' : '—');
  addStat(host, vramStr, 'GPU VRAM', (gpu.name || 'A4000'), 'accent-w');
  if (recall) {
    var hnswStr = (recall.hnsw !== undefined ? (recall.hnsw * 100).toFixed(0) + '%' : '—');
    addStat(host, hnswStr, 'HNSW recall', 'measured · ' + (recall.note || ''), 'accent-l');
  }
}
// Chapter 6 narrative: Lance 10M headroom line.
// Fix: guard every numeric field. The original called
// r.lance_10m.vectors.toLocaleString() unconditionally, which threw a
// TypeError on a partial payload and fell into the misleading
// "Server may be offline" catch branch.
function renderLanceHeadroom(lance){
  var host = document.getElementById('ch6-recall');
  host.textContent = '';
  if (!lance) return;
  var narr = el('div', 'narr');
  narr.appendChild(el('strong', null, 'Scale headroom: '));
  narr.appendChild(document.createTextNode(
    'Lance backend tested at ' + (lance.vectors || 0).toLocaleString() + ' vectors, '
    + (lance.disk_gb !== undefined ? lance.disk_gb : '—') + ' GB on disk, '
    + (lance.search_p50_ms !== undefined ? lance.search_p50_ms : '—') + 'ms p50. '
    + (lance.note || '')
  ));
  host.appendChild(narr);
}
// Append one large stat card (big number `n`, uppercase label `l`,
// optional sub-line `sub`, optional accent class `cls`) to `host`.
function addStat(host, n, l, sub, cls){
  var card = el('div', 'card stat-lg ' + (cls || ''));
  [['n', n], ['l', l], ['sub', sub || '']].forEach(function(pair){
    card.appendChild(el('div', pair[0], pair[1]));
  });
  host.appendChild(card);
}
</script>
</body></html>