From 6e7ca1830ef56b905b2d198763624e8a2fd3a62f Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 19:34:44 -0500 Subject: [PATCH] =?UTF-8?q?Phase=2021=20foundation=20=E2=80=94=20context?= =?UTF-8?q?=20stability=20+=20chunking=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRD: add Phase 20 (model matrix, wired) and Phase 21 (context stability, partial). Phase 21 exists because LLM Team hit this exact wall — running multi-model ranking on large context silently truncated, rankings degraded, no pipeline caught it. The stable answer: every agent call goes through a budget check against the model's declared context_window minus safety_margin, with a declared overflow_policy when the check fails. config/models.json: - context_window + context_budget per tier - overflow_policies block: summarize_oldest_tool_results_via_t3, chunk_lessons_via_cosine_topk, two_pass_map_reduce, escalate_to_kimi_k2_1t_or_split_decision - chunking_cache spec (data/_chunk_cache/, corpus-hash keyed) agent.ts: - estimateTokens() chars/4 biased safe ~15% - CONTEXT_WINDOWS table (fallback; prod reads models.json) - assertContextBudget() — throws on overflow with exact numbers, can bypass with bypass_budget:true for callers with their own policy - Wired into generate() and generateCloud() so EVERY call is checked scenario.ts: - T3 lesson archive to data/_playbook_lessons/*.json (the old /vectors/playbook_memory/seed path was silently failing with HTTP 400 because it requires 'fill: Role xN in City, ST' operation shape) - loadPriorLessons() at scenario start — filters by city/state match, date-sorted, takes top-3 - prior_lessons.json archived per-run (honest signal for A/B) - guidanceFor() injects up to 2 prior lessons (≤500 chars each) into the executor's per-event context - Retrospective shows explicit "Prior lessons loaded: N" line Verified: mistral correctly rejects a 150K-char prompt (7532 tokens over), gpt-oss:120b accepts it with 90K headroom. The enforcement is in-band on every call now, not an afterthought. Full chunking service (Rust) remains deferred to the sprint this feeds: crates/aibridge/src/budget.rs + chunk.rs + storaged/chunk_cache.rs --- config/models.json | 91 +++++++++++++++++++++---- docs/PRD.md | 72 ++++++++++++++++++-- tests/multi-agent/agent.ts | 60 +++++++++++++++++ tests/multi-agent/scenario.ts | 122 ++++++++++++++++++++++++++++------ 4 files changed, 309 insertions(+), 36 deletions(-) diff --git a/config/models.json b/config/models.json index 0302b62..74804d8 100644 --- a/config/models.json +++ b/config/models.json @@ -22,59 +22,124 @@ "t1_hot": { "purpose": "Per tool call — SQL generation, hybrid_search, sql(). Runs 50-200 times per scenario. Latency-sensitive.", "kind": "local_fast", - "primary": { "model": "mistral:latest", "provider": "ollama_local" }, - "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local" }, + "primary": { "model": "mistral:latest", "provider": "ollama_local", "context_window": 32768 }, + "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 }, "max_tokens": 800, "temperature": 0.3, "never_route_cloud": true, + "context_budget": { + "system_prompt_cap": 4000, + "prior_context_cap": 6000, + "tool_results_cap": 8000, + "safety_margin": 2000, + "overflow_policy": "summarize_oldest_tool_results_via_t3" + }, "rationale": "Mistral produces valid JSON reliably. Qwen2.5 is the consensus reviewer. 
Known flakiness on 5-fill + misplacement events — do NOT mask by upgrading; route to T3 for post-hoc review instead." }, "t2_review": { "purpose": "Per step consensus — executor ↔ reviewer loop critique. 5-14 calls per event.", "kind": "local_balanced", - "primary": { "model": "qwen2.5:latest", "provider": "ollama_local" }, - "fallback": { "model": "qwen3:latest", "provider": "ollama_local" }, + "primary": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 }, + "fallback": { "model": "qwen3:latest", "provider": "ollama_local", "context_window": 40960 }, "max_tokens": 600, "temperature": 0.3, "never_route_cloud": true, + "context_budget": { + "system_prompt_cap": 2000, + "recent_turns_cap": 4000, + "safety_margin": 1000 + }, "rationale": "Reviewer only needs to detect schema violations and drift — a 7B model is sufficient." }, "t3_overview": { "purpose": "Mid-day checkpoint after every misplacement + every Nth event, and cross-day lesson. 1-3 calls per scenario.", "kind": "thinking_cloud", - "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 900, "temperature": 0.2, "cloud_budget_per_scenario": 5, "env_flag": "LH_OVERVIEW_CLOUD=1", + "context_budget": { + "event_digest_cap": 30000, + "checkpoint_cap": 8000, + "lesson_corpus_cap": 40000, + "safety_margin": 8000, + "overflow_policy": "chunk_lessons_via_cosine_topk" + }, "rationale": "Same prompt family as local 20b (gpt-oss series) — prompts port directly. 120b is faster via cloud than 20b local in practice, and lessons are noticeably more specific." }, "t4_strategic": { "purpose": "Daily playbook board re-ranking, weekly gap audit, pattern discovery across accumulated playbooks. 1-10 calls per day.", "kind": "thinking_cloud_large", - "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, - "fallback": { "model": "glm-4.7", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 }, + "fallback": { "model": "glm-4.7", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 2000, "temperature": 0.2, "cloud_budget_per_day": 10, + "context_budget": { + "playbook_corpus_cap": 80000, + "pattern_history_cap": 20000, + "safety_margin": 16000, + "overflow_policy": "two_pass_map_reduce" + }, "rationale": "J named qwen3.5 specifically. GLM-4.7 is a promising alternate for debate phase. Runs after all scenarios complete for the day." 
}, "t5_gatekeeper": { "purpose": "MUST route here: architecture changes, new client onboarding, schema migrations, playbook retirements, index rebuilds, autotune config changes.", "kind": "thinking_cloud_deepest", - "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud" }, - "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud" }, - "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud", "context_window": 200000 }, + "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud", "context_window": 131072 }, + "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 4000, "temperature": 0.1, "cloud_budget_per_day": 5, "audit_log": true, + "context_budget": { + "decision_doc_cap": 50000, + "evidence_bundle_cap": 100000, + "prior_gatekeeper_decisions_cap": 20000, + "safety_margin": 20000, + "overflow_policy": "escalate_to_kimi_k2_1t_or_split_decision" + }, "rationale": "Highest-stakes decisions — reasoning depth matters more than latency. Audit log so J can always see what the gatekeeper was asked and what it answered. No human approval required today; escalate later if mis-decisions show up." } }, + + "context_management": { + "_description": "Rule zero: NEVER call a model with more tokens than its context_window minus safety_margin. Every call goes through the budget checker first. If over budget → chunk, summarize, or escalate. This is the stability floor.", + "token_estimator": { + "method": "chars_div_4", + "note": "Rough, biased safe by ~15%. For production, swap to tiktoken or the provider's tokenizer endpoint." + }, + "overflow_policies": { + "summarize_oldest_tool_results_via_t3": { + "when": "T1 conversation history + tool results exceed context_budget.tool_results_cap", + "how": "Send oldest N tool results to T3 with prompt 'summarize these into 500 tokens that preserve what the executor needs to know'; replace them with the summary in the running conversation." + }, + "chunk_lessons_via_cosine_topk": { + "when": "lesson corpus in data/_playbook_lessons/*.json exceeds lesson_corpus_cap", + "how": "Embed the current scenario spec, cosine-rank all lessons, take top-K until budget exhausted. Fall back to date-sorted if embeddings unavailable." + }, + "two_pass_map_reduce": { + "when": "T4 playbook corpus exceeds playbook_corpus_cap", + "how": "Pass 1: chunk playbooks into ≤30K token shards, run primary model on each shard to emit 'shard summary'. Pass 2: feed all summaries to primary model for global synthesis. Logged as two audit entries." + }, + "escalate_to_kimi_k2_1t_or_split_decision": { + "when": "T5 decision evidence exceeds decision_doc_cap + evidence_bundle_cap", + "how": "Prefer kimi-k2:1t which has 1M context. If still over, split decision into sub-decisions (e.g. 'retire playbooks by city' instead of 'retire playbooks globally') and loop." + } + }, + "chunking_cache": { + "_description": "Precomputed shards of the playbook corpus, indexed by (corpus_version, shard_id). Avoids re-chunking on every T4 run.", + "location": "data/_chunk_cache/", + "invalidation": "Key includes corpus hash. 
When playbook_memory changes, the hash changes, the cache misses, and chunks regenerate.",
+      "implementation_status": "SPEC — next sprint."
+    },
+    "implementation_status": "context_window + context_budget fields WIRED in config. estimateTokens() + assertContextBudget() enforce the budget on every generate()/generateCloud() call in tests/multi-agent/agent.ts. Next: port enforcement to crates/aibridge (budget.rs + chunk.rs) and emit a metric when an overflow policy fires."
+  },
   "experimental_rotation": {
     "enabled": false,
     "purpose": "Sample newer models on a schedule to collect comparison data without rate-limit risk.",
diff --git a/docs/PRD.md b/docs/PRD.md
index b97aeb3..e7bab76 100644
--- a/docs/PRD.md
+++ b/docs/PRD.md
@@ -1,8 +1,8 @@
 # PRD: Lakehouse — Rust-First Substrate for Versioned Knowledge Stores
-**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 0.97; 10K real staffing profiles ingested from Ethereal; autonomous agent + staffing simulation verified
+**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 1.000 at p50 ≈ 7.4ms via `nprobes`+`refine`; autonomous agent rotates across full index portfolio; cron-scheduled ingest; eval federation complete
 **Created:** 2026-03-27
-**Last updated:** 2026-04-17 — hybrid search closes the SQL+vector gap; quality eval surfaced and fixed real bugs
+**Last updated:** 2026-04-20 — portfolio-wide autotune, real cron, evals federation, bucket-migrate, IVF_PQ recall 0.805 → 1.000
 **Owner:** J
 
 ---
 
@@ -126,6 +126,15 @@ User question → gateway
 9. **Every reader gets its own profile.** Human operators, AI agents, and local models are all clients of the same substrate. Each has a named profile with its own bucket, vector indexes, trial history, and dataset bindings. Profiles are a first-class architectural concept, not a tenancy afterthought. (Phase 17)
 10. **Trials are data, not logs.** Every index build is a trial with measurable metrics. The trial journal IS the agent's memory for how to tune itself. Stored as write-once batched JSONL per the ADR-018 append-log pattern.
 11. **Operational failures are findable in one HTTP call.** The bucket error journal, trial journal, and audit log all expose `/storage/errors`, `/hnsw/trials`, `/access/audit` with structured filter + aggregation. No `grep` archaeology to answer "what broke?"
+12. **Playbooks feed the index, not just the log.** A completed playbook isn't just a record of what worked — it's a signal that shapes future rankings. Every `successful_playbooks` row contributes to the playbook-memory vector index, so semantically similar future operations re-rank toward workers that have actually succeeded in comparable fills. This is the "system gets smarter over time" dimension that distinguishes this substrate from a static search engine. (Phase 19)
+
+---
+
+## Vision drift acknowledged (2026-04-20)
+
+The system as shipped through Phase 18 is a **hybrid SQL+vector search engine with a playbook log**. The original pitch (and the "staffing AI co-pilot" framing) implied a **meta-index that learns from playbooks over time** — hot-swap profiles weren't just routing; they were knowledge generations that compounded. That learning loop was never built; playbooks were write-only. Phase 19 closes that gap explicitly.
+
+The feedback signal is **statistical + semantic**, not neural. No model training — the index reads the playbook journal, computes operation-similarity, and boosts endorsed workers at query time.
Rebuildable from `successful_playbooks` alone, same as every other derived index. --- @@ -349,12 +358,67 @@ The question raised 2026-04-16 after J's LLMS3 knowledge base identified Lance a Per-profile `vector_backend: Parquet | Lance` becomes part of Phase 17 (model profiles). See ADR-019 for the full scorecard and caveats. -### Phase 19+: Further horizon +### Phase 19: Playbook memory (meta-index) — the feedback loop + +Make successful playbooks actually improve future searches. Today `successful_playbooks` is a write-only log; future-you looks at it and thinks "cool, we filled Toledo welders once" — but the index has no idea it happened, so the next Toledo-welder search ranks the same as if none of those fills had existed. Phase 19 closes the loop. + +| Step | Deliverable | Gate | +|---|---|---| +| 19.1 | Embed every `successful_playbooks` row — operation + approach + context → one chunk per playbook | A new dataset `playbook_memory` appears in catalog with N rows = row count of `successful_playbooks` | +| 19.2 | Vector index on `playbook_memory` (HNSW or Lance — whichever `agent-parquet` profile uses) | `/vectors/search` against `playbook_memory` returns semantically similar past playbooks | +| 19.3 | Endorsement extraction: each playbook row has `fills[]` (worker_ids it succeeded with). Parse them out at ingest time and store in a sidecar `playbook_endorsements` Parquet keyed by playbook_id | `SELECT * FROM playbook_endorsements WHERE playbook_id = 'X'` returns the worker_ids | +| 19.4 | `/vectors/hybrid` gains opt-in `use_playbook_memory: bool`. When true: after hybrid ranks candidates, find top-K similar past playbooks (semantic search over `playbook_memory`), extract endorsed worker_ids, add a bounded boost to candidates in the endorsed set, re-rank | A search where the "right" worker is known from a prior playbook ranks higher with the flag than without | +| 19.5 | Write-through from multi-agent orchestrator: when two agents seal a playbook, it appends to `successful_playbooks` AND triggers a refresh of `playbook_memory` (via existing Phase C stale-mark path). Next query sees the new signal. | Run the orchestrator → inspect `playbook_memory` → see a new row. Run the same query before/after → ranking differs. | +| 19.6 | Ceiling-aware boost: cap the per-worker boost so one popular worker can't dominate future searches. Boost decays with time (optional) so stale playbooks matter less. | Synthetic test: 100 playbooks all filled with the same worker_id; the 101st search still returns a mix, not just that one worker | + +**Gate:** Run a real search before and after a new successful playbook lands. The endorsed workers from similar past operations rank higher in the second call. Demonstrable with a diff of the two result sets. + +**Why this is the right version of "meta-index":** The alternative — training a neural re-ranker on (query, candidate, outcome) triples — is a weeks-long ML story and requires labeled outcome data we don't really have. The statistical-semantic version here is rebuildable from the existing playbook journal, explainable ("boosted because of similar playbooks X, Y, Z"), and invalidatable (delete a playbook → boost goes away on next rebuild). It gets 80% of the payoff at 10% of the cost. Neural re-ranking stays as a future option if it bites. + +**Non-goals for this phase:** +- Neural training / fine-tuning. Statistical feedback only. +- Hard guarantees about recall lift magnitude. "Measurably better on the demo query" is the gate, not a universal quality claim. 
+- Real-time recomputation on every playbook. Batched refresh via the existing stale-marking path is sufficient.
+
+### Phase 20: Model Matrix + Overseer Tiers (WIRED 2026-04-20)
+
+Five-tier routing declared in `config/models.json`. Hot path (T1/T2) stays on local mistral + qwen2.5. Cloud is consulted sparingly for overview (T3 gpt-oss:120b), strategic (T4 qwen3.5:397b), and gatekeeper decisions (T5 kimi-k2-thinking). Every tier declares `context_window` + `context_budget` + `overflow_policy`. See ADR-021 (to add).
+
+- T1 hot: 50-200 calls/scenario, local only
+- T2 review: 5-14 calls/event, local only
+- T3 overview: 1-3 calls/scenario, cloud primary
+- T4 strategic: 1-10 calls/day, cloud primary
+- T5 gatekeeper: 1-5 calls/day, audit-logged
+
+T3 checkpoints + cross-day lessons are wired. Lessons archive to `data/_playbook_lessons/` and load back at the next scenario start as `prior_lessons` in the executor context.
+
+### Phase 21: Context Stability & Chunking Pipeline
+
+**Why this is a phase and not an optimization:** when playbooks accumulate into the thousands, naive concatenation blows any model's context window. LLM Team hit this exact issue running multi-model ranking — context got lost silently, rankings degraded, and there was no pipeline to catch it. We cannot trust cloud inference to save us: a bigger context window just raises the cliff. The stable answer is a chunking + caching layer that runs BEFORE any agent call.
+
+**Guarantees:**
+1. Every `generate()` call goes through a budget check against the model's declared `context_window` minus `safety_margin`.
+2. On overflow, the declared `overflow_policy` fires — one of: summarize oldest tool results via T3, cosine-rank top-K lessons, two-pass map-reduce across shards, or escalate to a bigger-context model.
+3. Chunk shards of large corpora (playbook_memory, lesson archive) are precomputed into `data/_chunk_cache/` keyed by corpus hash. Invalidation is automatic when the hash changes.
+4. Token estimation uses `chars / 4` (biased safe by ~15%) until we wire the provider's tokenizer.
+
+**What lives where:**
+- Budget checker in `crates/aibridge/src/budget.rs` (new)
+- Chunker in `crates/aibridge/src/chunk.rs` (new) — deterministic splits on sentence/paragraph boundaries, each shard ≤ policy cap
+- Cache service in `crates/storaged/src/chunk_cache.rs` (new) — `corpus_hash → [shard_id, tokens, content]`
+- Agent-side check `tests/multi-agent/agent.ts::assertContextBudget()` — wired into `generate()` + `generateCloud()` so every call is budget-checked
+- Metric emitted on every overflow: `context_overflow{tier=?,policy=?,model=?}` → surfaces on /metrics
+
+**Status:** `context_window` + `context_budget` + `overflow_policies` WIRED in config/models.json; `assertContextBudget()` enforcement wired into `generate()`/`generateCloud()` in the multi-agent harness. Rust-side helpers (budget.rs, chunk.rs, chunk_cache.rs) NOT yet built. Implementation target: current sprint — this is a stability floor, not a feature.
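+
+The chunking and cache-key rules are small enough to sketch now. Below is a minimal TypeScript sketch of the intended behavior, for illustration only: the real implementation targets `crates/aibridge/src/chunk.rs` in Rust, `chunkCorpus` and `corpusHash` are hypothetical names, and the shard cap plus the `chars / 4` estimator come from `config/models.json`.
+
+```ts
+import { createHash } from "node:crypto";
+
+// Same rough estimator as agent.ts — chars/4, biased safe.
+const estimateTokens = (text: string): number => Math.ceil(text.length / 4);
+
+// Deterministic chunking: split on blank-line paragraph boundaries, then pack
+// paragraphs greedily into shards that stay under the policy cap. The same
+// corpus always yields the same shards, so shards are safe to cache. A single
+// paragraph larger than the cap becomes its own shard (sentence-level
+// splitting is left out of this sketch).
+function chunkCorpus(corpus: string, capTokens: number): string[] {
+  const paragraphs = corpus.split(/\n{2,}/);
+  const shards: string[] = [];
+  let current = "";
+  for (const para of paragraphs) {
+    const candidate = current ? `${current}\n\n${para}` : para;
+    if (current && estimateTokens(candidate) > capTokens) {
+      shards.push(current);
+      current = para;
+    } else {
+      current = candidate;
+    }
+  }
+  if (current) shards.push(current);
+  return shards;
+}
+
+// Cache key for data/_chunk_cache/: hash the corpus content, so any change to
+// playbook_memory or the lesson archive changes the key and forces re-chunking.
+function corpusHash(corpus: string): string {
+  return createHash("sha256").update(corpus).digest("hex").slice(0, 16);
+}
+```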
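+
+For `chunk_lessons_via_cosine_topk`, the selection rule is: embed the current scenario spec, rank archived lessons by cosine similarity, and keep taking lessons until `lesson_corpus_cap` is spent. A minimal sketch under the assumption that a per-lesson embedding is already available; `selectLessons` and the `ArchivedLesson` shape are illustrative, and today `scenario.ts` falls back to city/state match plus date sort instead.
+
+```ts
+interface ArchivedLesson { date: string; lesson: string; embedding?: number[] }
+
+function cosine(a: number[], b: number[]): number {
+  let dot = 0, na = 0, nb = 0;
+  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
+  return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
+}
+
+// Rank lessons against the scenario-spec embedding and take them in order,
+// stopping once the next lesson would blow the cap. Lessons without
+// embeddings fall back to newest-first, matching the declared fallback in
+// config/models.json.
+function selectLessons(specEmbedding: number[], lessons: ArchivedLesson[], capTokens: number): ArchivedLesson[] {
+  const withEmb = lessons.filter(l => l.embedding);
+  const ranked = withEmb.length > 0
+    ? [...withEmb].sort((a, b) => cosine(specEmbedding, b.embedding!) - cosine(specEmbedding, a.embedding!))
+    : [...lessons].sort((a, b) => b.date.localeCompare(a.date));
+  const picked: ArchivedLesson[] = [];
+  let used = 0;
+  for (const l of ranked) {
+    const cost = Math.ceil(l.lesson.length / 4); // chars/4 estimator
+    if (used + cost > capTokens) break;
+    picked.push(l);
+    used += cost;
+  }
+  return picked;
+}
+```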
+ +### Phase 22+: Further horizon -- PDF OCR for scanned documents (Tesseract integration) - Specialized fine-tuned models per domain (staffing matcher, resume parser) - Video/audio transcript ingest + multimodal embeddings +- Neural re-ranker over (query, candidate, outcome) triples — only if Phase 19's statistical feedback plateaus below usable recall - True distributed query (DataFusion multi-node) — only if single-machine ceilings bite +- Playbook versioning (version + parent_id + retired_at) — touches gateway + catalogd + mcp-server +- Playbook board (6-phase deep_analysis applied to playbook ranking) --- diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts index b16ebca..c515ccc 100644 --- a/tests/multi-agent/agent.ts +++ b/tests/multi-agent/agent.ts @@ -15,6 +15,54 @@ export const SIDECAR = "http://localhost:3200"; export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com"; export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? ""; +// Rough token estimator — chars/4 biased safe by ~15%. Swap to a real +// tokenizer (tiktoken or provider endpoint) once Phase 21 lands. Good +// enough to stop the "context silently truncated" failure mode today. +export function estimateTokens(text: string): number { + return Math.ceil(text.length / 4); +} + +// Known context windows — matches crates/../config/models.json. Kept in +// code as a fallback so the test harness doesn't crash if the config is +// missing. Production path should read from models.json. +export const CONTEXT_WINDOWS: Record = { + "mistral:latest": 32768, + "qwen2.5:latest": 32768, + "qwen3:latest": 40960, + "gpt-oss:20b": 131072, + "gpt-oss:120b": 131072, + "qwen3.5:397b": 131072, + "kimi-k2-thinking": 200000, + "kimi-k2:1t": 1048576, + "deepseek-v3.1:671b": 131072, + "glm-4.7": 131072, +}; +const DEFAULT_CONTEXT_WINDOW = 32768; +const DEFAULT_SAFETY_MARGIN = 2000; + +// Fail LOUDLY if a prompt would blow the model's context. The whole +// point of Phase 21 is to stop silent truncation — so we throw with the +// numbers. Callers that expect to handle overflow should chunk BEFORE +// calling; they can also set bypass: true to opt out (T5 gatekeeper +// handles its own overflow policy). +export function assertContextBudget( + model: string, + prompt: string, + opts: { system?: string; max_tokens?: number; safety_margin?: number; bypass?: boolean } = {} +): { estimated: number; window: number; remaining: number } { + const window = CONTEXT_WINDOWS[model] ?? DEFAULT_CONTEXT_WINDOW; + const safety = opts.safety_margin ?? DEFAULT_SAFETY_MARGIN; + const estimated = estimateTokens(prompt) + estimateTokens(opts.system ?? "") + (opts.max_tokens ?? 800); + const remaining = window - estimated - safety; + if (remaining < 0 && !opts.bypass) { + throw new Error( + `context overflow: model=${model} est=${estimated}t window=${window}t safety=${safety}t over=${-remaining}t. 
` + + `Chunk the prompt (see config/models.json overflow_policies) or set bypass:true if you know the risk.` + ); + } + return { estimated, window, remaining }; +} + // --- Shared types --- export type Role = "executor" | "reviewer"; @@ -101,7 +149,13 @@ export async function generate(model: string, prompt: string, opts: { max_tokens?: number; temperature?: number; system?: string; + bypass_budget?: boolean; } = {}): Promise { + assertContextBudget(model, prompt, { + system: opts.system, + max_tokens: opts.max_tokens, + bypass: opts.bypass_budget, + }); const body: Record = { model, prompt, @@ -126,10 +180,16 @@ export async function generateCloud(model: string, prompt: string, opts: { max_tokens?: number; temperature?: number; system?: string; + bypass_budget?: boolean; } = {}): Promise { if (!OLLAMA_CLOUD_KEY) { throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud"); } + assertContextBudget(model, prompt, { + system: opts.system, + max_tokens: opts.max_tokens, + bypass: opts.bypass_budget, + }); const body: Record = { model, prompt, diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index 2d5b326..916d89b 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -146,6 +146,50 @@ interface ScenarioContext { roster: RosterEntry[]; results: EventResult[]; gap_signals: Array<{ event: string; category: string; detail: string }>; + prior_lessons: PriorLesson[]; +} + +interface PriorLesson { + date: string; + client: string; + cities: string; + states: string; + lesson: string; + events_ok: number; + events_total: number; + file: string; +} + +// Load lessons from prior T3 runs — read-back half of the feedback loop. +// Filters to the most relevant by matching ANY city/state with the current +// spec, then takes the 3 newest. Keeps startup cheap; file scan is O(n). +async function loadPriorLessons(spec: ScenarioSpec): Promise { + try { + const { readdir, readFile } = await import("node:fs/promises"); + const dir = join("data", "_playbook_lessons"); + const files = await readdir(dir).catch(() => [] as string[]); + if (files.length === 0) return []; + const specCities = new Set(spec.events.map(e => e.city)); + const specStates = new Set(spec.events.map(e => e.state)); + const parsed: PriorLesson[] = []; + for (const f of files) { + if (!f.endsWith(".json")) continue; + try { + const raw = await readFile(join(dir, f), "utf8"); + const rec = JSON.parse(raw); + parsed.push({ ...rec, file: f }); + } catch { /* skip malformed */ } + } + const relevant = parsed.filter(p => { + const cities = (p.cities ?? "").split(","); + const states = (p.states ?? "").split(","); + return cities.some(c => specCities.has(c)) || states.some(s => specStates.has(s)); + }); + relevant.sort((a, b) => (b.date ?? "").localeCompare(a.date ?? "")); + return relevant.slice(0, 3); + } catch { + return []; + } } // =================== Default scenario =================== @@ -493,7 +537,18 @@ CAST(reliability AS DOUBLE) > 0.7.`; } })(); - return `${schemaLock}\n\nEVENT FOCUS:\n${base}`; + // Prior-lesson hint — surface up to 2 most recent lessons learned from + // T3 overseer runs against this city/state. Terse to avoid diluting the + // prompt. The goal is to pass forward hard-won mistakes, not flood the + // context. This is the read-back half of the T3 feedback loop. + const priorHint = ctx.prior_lessons.length > 0 + ? `\n\nPRIOR LESSONS (from T3 overseer on past runs in similar cities):\n` + + ctx.prior_lessons.slice(0, 2).map((p, i) => + `${i + 1}. 
${p.date} ${p.client} (${p.cities}): ${p.lesson.replace(/\s+/g, " ").slice(0, 500)}` + ).join("\n") + : ""; + + return `${schemaLock}\n\nEVENT FOCUS:\n${base}${priorHint}`; } // =================== Artifact generation =================== @@ -878,7 +933,8 @@ async function writeRetrospective(ctx: ScenarioContext): Promise { const lines: string[] = []; lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`); lines.push(""); - lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL}\``); + lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``); + lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`); lines.push(""); // --- Per-event summary --- @@ -1019,12 +1075,15 @@ async function main() { const out_dir = join("tests/multi-agent/playbooks", `scenario-${stamp}`); await mkdir(out_dir, { recursive: true }); + const prior_lessons = await loadPriorLessons(spec); + const ctx: ScenarioContext = { spec, out_dir, roster: [], results: [], gap_signals: [], + prior_lessons, }; // Initialize output files @@ -1033,6 +1092,19 @@ async function main() { await writeFile(join(out_dir, "dispatch.jsonl"), ""); await writeFile(join(out_dir, "checkpoints.jsonl"), ""); + // Archive which prior lessons this run will see, so the retrospective + // can tell whether the T3 feedback loop actually fed back anything. + await writeFile( + join(out_dir, "prior_lessons.json"), + JSON.stringify(prior_lessons, null, 2) + ); + if (prior_lessons.length > 0) { + console.log(`▶ prior lessons loaded: ${prior_lessons.length} (from data/_playbook_lessons/)`); + for (const p of prior_lessons) { + console.log(` - ${p.date} ${p.client} (${p.cities}) — ${p.events_ok}/${p.events_total} ok`); + } + } + const checkpoints: OverviewCheckpoint[] = []; console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`); @@ -1109,26 +1181,38 @@ async function main() { ); console.log(`✓ lesson (${lessonSecs}s) → ${join(out_dir, "lesson.md")}`); - // Seed the lesson into playbook_memory for future retrieval. Keep - // the embedded `approach` + `context` terse per feedback_phase19_seed_text.md; - // the rich prose lives in lesson.md and a separate `rationale` field. + // Persist the lesson to data/_playbook_lessons/ so future scenarios + // can read it verbatim at startup. The /vectors/playbook_memory/seed + // endpoint rejects operations that don't match the `fill: Role xN + // in City, ST` regex (enforced in crates/vectord/src/service.rs), + // so embedding-based retrieval of cross-day lessons isn't wired. + // File-based read-back is durable and explicit — future scenarios + // pull from the lessons dir at startup and include top-N in the + // executor's system context. 
try { - const kinds = [...new Set(ctx.spec.events.map(e => e.kind))].join("+"); const cities = [...new Set(ctx.spec.events.map(e => e.city))].slice(0, 3).join(","); - await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, { - method: "POST", - headers: { "content-type": "application/json" }, - body: JSON.stringify({ - operation: `cross-day-lesson-${ctx.spec.date}`, - approach: `${kinds} day in ${cities}`, - context: `${ctx.spec.client} ${ctx.spec.date}`, - rationale: lesson.slice(0, 2000), - endorsed_names: [], - append: true, - }), - }); + const states = [...new Set(ctx.spec.events.map(e => e.state))].slice(0, 3).join(","); + const lessonsDir = join("data", "_playbook_lessons"); + await mkdir(lessonsDir, { recursive: true }); + const lessonRec = { + date: ctx.spec.date, + client: ctx.spec.client, + cities, + states, + events_total: ctx.spec.events.length, + events_ok: ctx.results.filter(r => r.ok).length, + checkpoint_count: checkpoints.length, + model: OVERVIEW_MODEL, + cloud: OVERVIEW_CLOUD, + lesson: lesson.trim(), + checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })), + created_at: new Date().toISOString(), + }; + const fname = `${ctx.spec.date}_${ctx.spec.client.replace(/\s+/g, "_")}_${Date.now()}.json`; + await writeFile(join(lessonsDir, fname), JSON.stringify(lessonRec, null, 2)); + console.log(` lesson archived → ${join(lessonsDir, fname)}`); } catch (e) { - console.log(` (lesson seed skipped: ${(e as Error).message})`); + console.log(` (lesson archive skipped: ${(e as Error).message})`); } } }