From 6e7ca1830ef56b905b2d198763624e8a2fd3a62f Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 19:34:44 -0500 Subject: [PATCH] =?UTF-8?q?Phase=2021=20foundation=20=E2=80=94=20context?= =?UTF-8?q?=20stability=20+=20chunking=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRD: add Phase 20 (model matrix, wired) and Phase 21 (context stability, partial). Phase 21 exists because LLM Team hit this exact wall — running multi-model ranking on large context silently truncated, rankings degraded, no pipeline caught it. The stable answer: every agent call goes through a budget check against the model's declared context_window minus safety_margin, with a declared overflow_policy when the check fails. config/models.json: - context_window + context_budget per tier - overflow_policies block: summarize_oldest_tool_results_via_t3, chunk_lessons_via_cosine_topk, two_pass_map_reduce, escalate_to_kimi_k2_1t_or_split_decision - chunking_cache spec (data/_chunk_cache/, corpus-hash keyed) agent.ts: - estimateTokens() chars/4 biased safe ~15% - CONTEXT_WINDOWS table (fallback; prod reads models.json) - assertContextBudget() — throws on overflow with exact numbers, can bypass with bypass_budget:true for callers with their own policy - Wired into generate() and generateCloud() so EVERY call is checked scenario.ts: - T3 lesson archive to data/_playbook_lessons/*.json (the old /vectors/playbook_memory/seed path was silently failing with HTTP 400 because it requires 'fill: Role xN in City, ST' operation shape) - loadPriorLessons() at scenario start — filters by city/state match, date-sorted, takes top-3 - prior_lessons.json archived per-run (honest signal for A/B) - guidanceFor() injects up to 2 prior lessons (≤500 chars each) into the executor's per-event context - Retrospective shows explicit "Prior lessons loaded: N" line Verified: mistral correctly rejects a 150K-char prompt (7532 tokens over), gpt-oss:120b accepts it with 90K headroom. The enforcement is in-band on every call now, not an afterthought. Full chunking service (Rust) remains deferred to the sprint this feeds: crates/aibridge/src/budget.rs + chunk.rs + storaged/chunk_cache.rs --- config/models.json | 91 +++++++++++++++++++++---- docs/PRD.md | 72 ++++++++++++++++++-- tests/multi-agent/agent.ts | 60 +++++++++++++++++ tests/multi-agent/scenario.ts | 122 ++++++++++++++++++++++++++++------ 4 files changed, 309 insertions(+), 36 deletions(-) diff --git a/config/models.json b/config/models.json index 0302b62..74804d8 100644 --- a/config/models.json +++ b/config/models.json @@ -22,59 +22,124 @@ "t1_hot": { "purpose": "Per tool call — SQL generation, hybrid_search, sql(). Runs 50-200 times per scenario. Latency-sensitive.", "kind": "local_fast", - "primary": { "model": "mistral:latest", "provider": "ollama_local" }, - "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local" }, + "primary": { "model": "mistral:latest", "provider": "ollama_local", "context_window": 32768 }, + "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 }, "max_tokens": 800, "temperature": 0.3, "never_route_cloud": true, + "context_budget": { + "system_prompt_cap": 4000, + "prior_context_cap": 6000, + "tool_results_cap": 8000, + "safety_margin": 2000, + "overflow_policy": "summarize_oldest_tool_results_via_t3" + }, "rationale": "Mistral produces valid JSON reliably. Qwen2.5 is the consensus reviewer. 
Known flakiness on 5-fill + misplacement events — do NOT mask by upgrading; route to T3 for post-hoc review instead." }, "t2_review": { "purpose": "Per step consensus — executor ↔ reviewer loop critique. 5-14 calls per event.", "kind": "local_balanced", - "primary": { "model": "qwen2.5:latest", "provider": "ollama_local" }, - "fallback": { "model": "qwen3:latest", "provider": "ollama_local" }, + "primary": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 }, + "fallback": { "model": "qwen3:latest", "provider": "ollama_local", "context_window": 40960 }, "max_tokens": 600, "temperature": 0.3, "never_route_cloud": true, + "context_budget": { + "system_prompt_cap": 2000, + "recent_turns_cap": 4000, + "safety_margin": 1000 + }, "rationale": "Reviewer only needs to detect schema violations and drift — a 7B model is sufficient." }, "t3_overview": { "purpose": "Mid-day checkpoint after every misplacement + every Nth event, and cross-day lesson. 1-3 calls per scenario.", "kind": "thinking_cloud", - "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 900, "temperature": 0.2, "cloud_budget_per_scenario": 5, "env_flag": "LH_OVERVIEW_CLOUD=1", + "context_budget": { + "event_digest_cap": 30000, + "checkpoint_cap": 8000, + "lesson_corpus_cap": 40000, + "safety_margin": 8000, + "overflow_policy": "chunk_lessons_via_cosine_topk" + }, "rationale": "Same prompt family as local 20b (gpt-oss series) — prompts port directly. 120b is faster via cloud than 20b local in practice, and lessons are noticeably more specific." }, "t4_strategic": { "purpose": "Daily playbook board re-ranking, weekly gap audit, pattern discovery across accumulated playbooks. 1-10 calls per day.", "kind": "thinking_cloud_large", - "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, - "fallback": { "model": "glm-4.7", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 }, + "fallback": { "model": "glm-4.7", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 2000, "temperature": 0.2, "cloud_budget_per_day": 10, + "context_budget": { + "playbook_corpus_cap": 80000, + "pattern_history_cap": 20000, + "safety_margin": 16000, + "overflow_policy": "two_pass_map_reduce" + }, "rationale": "J named qwen3.5 specifically. GLM-4.7 is a promising alternate for debate phase. Runs after all scenarios complete for the day." 
}, "t5_gatekeeper": { "purpose": "MUST route here: architecture changes, new client onboarding, schema migrations, playbook retirements, index rebuilds, autotune config changes.", "kind": "thinking_cloud_deepest", - "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud" }, - "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud" }, - "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, - "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, + "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud", "context_window": 200000 }, + "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud", "context_window": 131072 }, + "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 }, + "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 }, "max_tokens": 4000, "temperature": 0.1, "cloud_budget_per_day": 5, "audit_log": true, + "context_budget": { + "decision_doc_cap": 50000, + "evidence_bundle_cap": 100000, + "prior_gatekeeper_decisions_cap": 20000, + "safety_margin": 20000, + "overflow_policy": "escalate_to_kimi_k2_1t_or_split_decision" + }, "rationale": "Highest-stakes decisions — reasoning depth matters more than latency. Audit log so J can always see what the gatekeeper was asked and what it answered. No human approval required today; escalate later if mis-decisions show up." } }, + + "context_management": { + "_description": "Rule zero: NEVER call a model with more tokens than its context_window minus safety_margin. Every call goes through the budget checker first. If over budget → chunk, summarize, or escalate. This is the stability floor.", + "token_estimator": { + "method": "chars_div_4", + "note": "Rough, biased safe by ~15%. For production, swap to tiktoken or the provider's tokenizer endpoint." + }, + "overflow_policies": { + "summarize_oldest_tool_results_via_t3": { + "when": "T1 conversation history + tool results exceed context_budget.tool_results_cap", + "how": "Send oldest N tool results to T3 with prompt 'summarize these into 500 tokens that preserve what the executor needs to know'; replace them with the summary in the running conversation." + }, + "chunk_lessons_via_cosine_topk": { + "when": "lesson corpus in data/_playbook_lessons/*.json exceeds lesson_corpus_cap", + "how": "Embed the current scenario spec, cosine-rank all lessons, take top-K until budget exhausted. Fall back to date-sorted if embeddings unavailable." + }, + "two_pass_map_reduce": { + "when": "T4 playbook corpus exceeds playbook_corpus_cap", + "how": "Pass 1: chunk playbooks into ≤30K token shards, run primary model on each shard to emit 'shard summary'. Pass 2: feed all summaries to primary model for global synthesis. Logged as two audit entries." + }, + "escalate_to_kimi_k2_1t_or_split_decision": { + "when": "T5 decision evidence exceeds decision_doc_cap + evidence_bundle_cap", + "how": "Prefer kimi-k2:1t which has 1M context. If still over, split decision into sub-decisions (e.g. 'retire playbooks by city' instead of 'retire playbooks globally') and loop." + } + }, + "chunking_cache": { + "_description": "Precomputed shards of the playbook corpus, indexed by (corpus_version, shard_id). Avoids re-chunking on every T4 run.", + "location": "data/_chunk_cache/", + "invalidation": "Key includes corpus hash. 
When playbook_memory changes, the hash changes, the cache misses, and chunks regenerate.",
+      "implementation_status": "SPEC — next sprint."
+    },
+    "implementation_status": "context_window + context_budget fields WIRED in config. estimateTokens() + assertContextBudget() enforce the budget on every generate()/generateCloud() call in tests/multi-agent/agent.ts. Next: port enforcement to crates/aibridge (budget.rs + chunk.rs) and emit a metric when an overflow policy fires."
+  },
   "experimental_rotation": {
     "enabled": false,
     "purpose": "Sample newer models on a schedule to collect comparison data without rate-limit risk.",
diff --git a/docs/PRD.md b/docs/PRD.md
index b97aeb3..e7bab76 100644
--- a/docs/PRD.md
+++ b/docs/PRD.md
@@ -1,8 +1,8 @@
 # PRD: Lakehouse — Rust-First Substrate for Versioned Knowledge Stores
-**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 0.97; 10K real staffing profiles ingested from Ethereal; autonomous agent + staffing simulation verified
+**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 1.000 at p50 ≈ 7.4ms via `nprobes`+`refine`; autonomous agent rotates across full index portfolio; cron-scheduled ingest; eval federation complete
 **Created:** 2026-03-27
-**Last updated:** 2026-04-17 — hybrid search closes the SQL+vector gap; quality eval surfaced and fixed real bugs
+**Last updated:** 2026-04-20 — portfolio-wide autotune, real cron, evals federation, bucket-migrate, IVF_PQ recall 0.805 → 1.000
 **Owner:** J
 
 ---
 
@@ -126,6 +126,15 @@ User question → gateway
 9. **Every reader gets its own profile.** Human operators, AI agents, and local models are all clients of the same substrate. Each has a named profile with its own bucket, vector indexes, trial history, and dataset bindings. Profiles are a first-class architectural concept, not a tenancy afterthought. (Phase 17)
 10. **Trials are data, not logs.** Every index build is a trial with measurable metrics. The trial journal IS the agent's memory for how to tune itself. Stored as write-once batched JSONL per the ADR-018 append-log pattern.
 11. **Operational failures are findable in one HTTP call.** The bucket error journal, trial journal, and audit log all expose `/storage/errors`, `/hnsw/trials`, `/access/audit` with structured filter + aggregation. No `grep` archaeology to answer "what broke?"
+12. **Playbooks feed the index, not just the log.** A completed playbook isn't just a record of what worked — it's a signal that shapes future rankings. Every `successful_playbooks` row contributes to the playbook-memory vector index, so semantically similar future operations re-rank toward workers that have actually succeeded in comparable fills. This is the "system gets smarter over time" dimension that distinguishes this substrate from a static search engine. (Phase 19)
+
+---
+
+## Vision drift acknowledged (2026-04-20)
+
+The system as shipped through Phase 18 is a **hybrid SQL+vector search engine with a playbook log**. The original pitch (and the "staffing AI co-pilot" framing) implied a **meta-index that learns from playbooks over time** — hot-swap profiles weren't just routing; they were knowledge generations that compounded. That learning loop was never built; playbooks were write-only. Phase 19 closes that gap explicitly.
+
+The feedback signal is **statistical + semantic**, not neural. No model training — the index reads the playbook journal, computes operation-similarity, and boosts endorsed workers at query time.
Rebuildable from `successful_playbooks` alone, same as every other derived index. --- @@ -349,12 +358,67 @@ The question raised 2026-04-16 after J's LLMS3 knowledge base identified Lance a Per-profile `vector_backend: Parquet | Lance` becomes part of Phase 17 (model profiles). See ADR-019 for the full scorecard and caveats. -### Phase 19+: Further horizon +### Phase 19: Playbook memory (meta-index) — the feedback loop + +Make successful playbooks actually improve future searches. Today `successful_playbooks` is a write-only log; future-you looks at it and thinks "cool, we filled Toledo welders once" — but the index has no idea it happened, so the next Toledo-welder search ranks the same as if none of those fills had existed. Phase 19 closes the loop. + +| Step | Deliverable | Gate | +|---|---|---| +| 19.1 | Embed every `successful_playbooks` row — operation + approach + context → one chunk per playbook | A new dataset `playbook_memory` appears in catalog with N rows = row count of `successful_playbooks` | +| 19.2 | Vector index on `playbook_memory` (HNSW or Lance — whichever `agent-parquet` profile uses) | `/vectors/search` against `playbook_memory` returns semantically similar past playbooks | +| 19.3 | Endorsement extraction: each playbook row has `fills[]` (worker_ids it succeeded with). Parse them out at ingest time and store in a sidecar `playbook_endorsements` Parquet keyed by playbook_id | `SELECT * FROM playbook_endorsements WHERE playbook_id = 'X'` returns the worker_ids | +| 19.4 | `/vectors/hybrid` gains opt-in `use_playbook_memory: bool`. When true: after hybrid ranks candidates, find top-K similar past playbooks (semantic search over `playbook_memory`), extract endorsed worker_ids, add a bounded boost to candidates in the endorsed set, re-rank | A search where the "right" worker is known from a prior playbook ranks higher with the flag than without | +| 19.5 | Write-through from multi-agent orchestrator: when two agents seal a playbook, it appends to `successful_playbooks` AND triggers a refresh of `playbook_memory` (via existing Phase C stale-mark path). Next query sees the new signal. | Run the orchestrator → inspect `playbook_memory` → see a new row. Run the same query before/after → ranking differs. | +| 19.6 | Ceiling-aware boost: cap the per-worker boost so one popular worker can't dominate future searches. Boost decays with time (optional) so stale playbooks matter less. | Synthetic test: 100 playbooks all filled with the same worker_id; the 101st search still returns a mix, not just that one worker | + +**Gate:** Run a real search before and after a new successful playbook lands. The endorsed workers from similar past operations rank higher in the second call. Demonstrable with a diff of the two result sets. + +**Why this is the right version of "meta-index":** The alternative — training a neural re-ranker on (query, candidate, outcome) triples — is a weeks-long ML story and requires labeled outcome data we don't really have. The statistical-semantic version here is rebuildable from the existing playbook journal, explainable ("boosted because of similar playbooks X, Y, Z"), and invalidatable (delete a playbook → boost goes away on next rebuild). It gets 80% of the payoff at 10% of the cost. Neural re-ranking stays as a future option if it bites. + +**Non-goals for this phase:** +- Neural training / fine-tuning. Statistical feedback only. +- Hard guarantees about recall lift magnitude. "Measurably better on the demo query" is the gate, not a universal quality claim. 
+- Real-time recomputation on every playbook. Batched refresh via the existing stale-marking path is sufficient.
+
+### Phase 20: Model Matrix + Overseer Tiers (WIRED 2026-04-20)
+
+Five-tier routing declared in `config/models.json`. Hot path (T1/T2) stays on local mistral + qwen2.5. Cloud is consulted sparingly for overview (T3 gpt-oss:120b), strategic (T4 qwen3.5:397b), and gatekeeper decisions (T5 kimi-k2-thinking). Every tier declares `context_window` + `context_budget` + `overflow_policy`. See ADR-021 (to add).
+
+- T1 hot: 50-200 calls/scenario, local only
+- T2 review: 5-14 calls/event, local only
+- T3 overview: 1-3 calls/scenario, cloud primary
+- T4 strategic: 1-10 calls/day, cloud primary
+- T5 gatekeeper: 1-5 calls/day, audit-logged
+
+T3 checkpoints + cross-day lessons are wired. Lessons archive to `data/_playbook_lessons/` and load back at the next scenario start as `prior_lessons` in the executor context.
+
+### Phase 21: Context Stability & Chunking Pipeline
+
+**Why this is a phase and not an optimization:** when playbooks accumulate into the thousands, naive concatenation blows any model's context window. LLM Team hit this exact issue running multi-model ranking — context got lost silently, rankings degraded, and there was no pipeline to catch it. We cannot trust cloud inference to save us: a bigger context window just raises the cliff. The stable answer is a chunking + caching layer that runs BEFORE any agent call.
+
+**Guarantees:**
+1. Every `generate()` call goes through a budget check against the model's declared `context_window` minus `safety_margin`.
+2. On overflow, the declared `overflow_policy` fires — one of: summarize oldest tool results via T3, cosine-rank top-K lessons, two-pass map-reduce across shards, or escalate to a bigger-context model.
+3. Chunk shards of large corpora (playbook_memory, lesson archive) are precomputed into `data/_chunk_cache/` keyed by corpus hash. Invalidation is automatic when the hash changes.
+4. Token estimation uses `chars / 4` (biased safe by ~15%) until we wire the provider's tokenizer.
+
+**What lives where:**
+- Budget checker in `crates/aibridge/src/budget.rs` (new)
+- Chunker in `crates/aibridge/src/chunk.rs` (new) — deterministic splits on sentence/paragraph boundaries, each shard ≤ policy cap
+- Cache service in `crates/storaged/src/chunk_cache.rs` (new) — `corpus_hash → [shard_id, tokens, content]`
+- Agent-side check `tests/multi-agent/agent.ts::assertContextBudget()` — wired into `generate()` + `generateCloud()` so every call is budget-checked
+- Metric emitted on every overflow: `context_overflow{tier=?,policy=?,model=?}` → surfaces on /metrics
+
+**Status:** `context_window` + `context_budget` + `overflow_policies` WIRED in config/models.json; `assertContextBudget()` enforcement wired into `generate()`/`generateCloud()` in the multi-agent harness. Rust-side helpers (budget.rs, chunk.rs, chunk_cache.rs) NOT yet built. Implementation target: current sprint — this is a stability floor, not a feature.
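+
+The chunking and cache-key rules are small enough to sketch now. Below is a minimal TypeScript sketch of the intended behavior, for illustration only: the real implementation targets `crates/aibridge/src/chunk.rs` in Rust, `chunkCorpus` and `corpusHash` are hypothetical names, and the shard cap plus the `chars / 4` estimator come from `config/models.json`.
+
+```ts
+import { createHash } from "node:crypto";
+
+// Same rough estimator as agent.ts — chars/4, biased safe.
+const estimateTokens = (text: string): number => Math.ceil(text.length / 4);
+
+// Deterministic chunking: split on blank-line paragraph boundaries, then pack
+// paragraphs greedily into shards that stay under the policy cap. The same
+// corpus always yields the same shards, so shards are safe to cache. A single
+// paragraph larger than the cap becomes its own shard (sentence-level
+// splitting is left out of this sketch).
+function chunkCorpus(corpus: string, capTokens: number): string[] {
+  const paragraphs = corpus.split(/\n{2,}/);
+  const shards: string[] = [];
+  let current = "";
+  for (const para of paragraphs) {
+    const candidate = current ? `${current}\n\n${para}` : para;
+    if (current && estimateTokens(candidate) > capTokens) {
+      shards.push(current);
+      current = para;
+    } else {
+      current = candidate;
+    }
+  }
+  if (current) shards.push(current);
+  return shards;
+}
+
+// Cache key for data/_chunk_cache/: hash the corpus content, so any change to
+// playbook_memory or the lesson archive changes the key and forces re-chunking.
+function corpusHash(corpus: string): string {
+  return createHash("sha256").update(corpus).digest("hex").slice(0, 16);
+}
+```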
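+
+For `chunk_lessons_via_cosine_topk`, the selection rule is: embed the current scenario spec, rank archived lessons by cosine similarity, and keep taking lessons until `lesson_corpus_cap` is spent. A minimal sketch under the assumption that a per-lesson embedding is already available; `selectLessons` and the `ArchivedLesson` shape are illustrative, and today `scenario.ts` falls back to city/state match plus date sort instead.
+
+```ts
+interface ArchivedLesson { date: string; lesson: string; embedding?: number[] }
+
+function cosine(a: number[], b: number[]): number {
+  let dot = 0, na = 0, nb = 0;
+  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
+  return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
+}
+
+// Rank lessons against the scenario-spec embedding and take them in order,
+// stopping once the next lesson would blow the cap. Lessons without
+// embeddings fall back to newest-first, matching the declared fallback in
+// config/models.json.
+function selectLessons(specEmbedding: number[], lessons: ArchivedLesson[], capTokens: number): ArchivedLesson[] {
+  const withEmb = lessons.filter(l => l.embedding);
+  const ranked = withEmb.length > 0
+    ? [...withEmb].sort((a, b) => cosine(specEmbedding, b.embedding!) - cosine(specEmbedding, a.embedding!))
+    : [...lessons].sort((a, b) => b.date.localeCompare(a.date));
+  const picked: ArchivedLesson[] = [];
+  let used = 0;
+  for (const l of ranked) {
+    const cost = Math.ceil(l.lesson.length / 4); // chars/4 estimator
+    if (used + cost > capTokens) break;
+    picked.push(l);
+    used += cost;
+  }
+  return picked;
+}
+```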
+ +### Phase 22+: Further horizon -- PDF OCR for scanned documents (Tesseract integration) - Specialized fine-tuned models per domain (staffing matcher, resume parser) - Video/audio transcript ingest + multimodal embeddings +- Neural re-ranker over (query, candidate, outcome) triples — only if Phase 19's statistical feedback plateaus below usable recall - True distributed query (DataFusion multi-node) — only if single-machine ceilings bite +- Playbook versioning (version + parent_id + retired_at) — touches gateway + catalogd + mcp-server +- Playbook board (6-phase deep_analysis applied to playbook ranking) --- diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts index b16ebca..c515ccc 100644 --- a/tests/multi-agent/agent.ts +++ b/tests/multi-agent/agent.ts @@ -15,6 +15,54 @@ export const SIDECAR = "http://localhost:3200"; export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com"; export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? ""; +// Rough token estimator — chars/4 biased safe by ~15%. Swap to a real +// tokenizer (tiktoken or provider endpoint) once Phase 21 lands. Good +// enough to stop the "context silently truncated" failure mode today. +export function estimateTokens(text: string): number { + return Math.ceil(text.length / 4); +} + +// Known context windows — matches crates/../config/models.json. Kept in +// code as a fallback so the test harness doesn't crash if the config is +// missing. Production path should read from models.json. +export const CONTEXT_WINDOWS: Record = { + "mistral:latest": 32768, + "qwen2.5:latest": 32768, + "qwen3:latest": 40960, + "gpt-oss:20b": 131072, + "gpt-oss:120b": 131072, + "qwen3.5:397b": 131072, + "kimi-k2-thinking": 200000, + "kimi-k2:1t": 1048576, + "deepseek-v3.1:671b": 131072, + "glm-4.7": 131072, +}; +const DEFAULT_CONTEXT_WINDOW = 32768; +const DEFAULT_SAFETY_MARGIN = 2000; + +// Fail LOUDLY if a prompt would blow the model's context. The whole +// point of Phase 21 is to stop silent truncation — so we throw with the +// numbers. Callers that expect to handle overflow should chunk BEFORE +// calling; they can also set bypass: true to opt out (T5 gatekeeper +// handles its own overflow policy). +export function assertContextBudget( + model: string, + prompt: string, + opts: { system?: string; max_tokens?: number; safety_margin?: number; bypass?: boolean } = {} +): { estimated: number; window: number; remaining: number } { + const window = CONTEXT_WINDOWS[model] ?? DEFAULT_CONTEXT_WINDOW; + const safety = opts.safety_margin ?? DEFAULT_SAFETY_MARGIN; + const estimated = estimateTokens(prompt) + estimateTokens(opts.system ?? "") + (opts.max_tokens ?? 800); + const remaining = window - estimated - safety; + if (remaining < 0 && !opts.bypass) { + throw new Error( + `context overflow: model=${model} est=${estimated}t window=${window}t safety=${safety}t over=${-remaining}t. 
` + + `Chunk the prompt (see config/models.json overflow_policies) or set bypass:true if you know the risk.` + ); + } + return { estimated, window, remaining }; +} + // --- Shared types --- export type Role = "executor" | "reviewer"; @@ -101,7 +149,13 @@ export async function generate(model: string, prompt: string, opts: { max_tokens?: number; temperature?: number; system?: string; + bypass_budget?: boolean; } = {}): Promise { + assertContextBudget(model, prompt, { + system: opts.system, + max_tokens: opts.max_tokens, + bypass: opts.bypass_budget, + }); const body: Record = { model, prompt, @@ -126,10 +180,16 @@ export async function generateCloud(model: string, prompt: string, opts: { max_tokens?: number; temperature?: number; system?: string; + bypass_budget?: boolean; } = {}): Promise { if (!OLLAMA_CLOUD_KEY) { throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud"); } + assertContextBudget(model, prompt, { + system: opts.system, + max_tokens: opts.max_tokens, + bypass: opts.bypass_budget, + }); const body: Record = { model, prompt, diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index 2d5b326..916d89b 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -146,6 +146,50 @@ interface ScenarioContext { roster: RosterEntry[]; results: EventResult[]; gap_signals: Array<{ event: string; category: string; detail: string }>; + prior_lessons: PriorLesson[]; +} + +interface PriorLesson { + date: string; + client: string; + cities: string; + states: string; + lesson: string; + events_ok: number; + events_total: number; + file: string; +} + +// Load lessons from prior T3 runs — read-back half of the feedback loop. +// Filters to the most relevant by matching ANY city/state with the current +// spec, then takes the 3 newest. Keeps startup cheap; file scan is O(n). +async function loadPriorLessons(spec: ScenarioSpec): Promise { + try { + const { readdir, readFile } = await import("node:fs/promises"); + const dir = join("data", "_playbook_lessons"); + const files = await readdir(dir).catch(() => [] as string[]); + if (files.length === 0) return []; + const specCities = new Set(spec.events.map(e => e.city)); + const specStates = new Set(spec.events.map(e => e.state)); + const parsed: PriorLesson[] = []; + for (const f of files) { + if (!f.endsWith(".json")) continue; + try { + const raw = await readFile(join(dir, f), "utf8"); + const rec = JSON.parse(raw); + parsed.push({ ...rec, file: f }); + } catch { /* skip malformed */ } + } + const relevant = parsed.filter(p => { + const cities = (p.cities ?? "").split(","); + const states = (p.states ?? "").split(","); + return cities.some(c => specCities.has(c)) || states.some(s => specStates.has(s)); + }); + relevant.sort((a, b) => (b.date ?? "").localeCompare(a.date ?? "")); + return relevant.slice(0, 3); + } catch { + return []; + } } // =================== Default scenario =================== @@ -493,7 +537,18 @@ CAST(reliability AS DOUBLE) > 0.7.`; } })(); - return `${schemaLock}\n\nEVENT FOCUS:\n${base}`; + // Prior-lesson hint — surface up to 2 most recent lessons learned from + // T3 overseer runs against this city/state. Terse to avoid diluting the + // prompt. The goal is to pass forward hard-won mistakes, not flood the + // context. This is the read-back half of the T3 feedback loop. + const priorHint = ctx.prior_lessons.length > 0 + ? `\n\nPRIOR LESSONS (from T3 overseer on past runs in similar cities):\n` + + ctx.prior_lessons.slice(0, 2).map((p, i) => + `${i + 1}. 
${p.date} ${p.client} (${p.cities}): ${p.lesson.replace(/\s+/g, " ").slice(0, 500)}` + ).join("\n") + : ""; + + return `${schemaLock}\n\nEVENT FOCUS:\n${base}${priorHint}`; } // =================== Artifact generation =================== @@ -878,7 +933,8 @@ async function writeRetrospective(ctx: ScenarioContext): Promise { const lines: string[] = []; lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`); lines.push(""); - lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL}\``); + lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``); + lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`); lines.push(""); // --- Per-event summary --- @@ -1019,12 +1075,15 @@ async function main() { const out_dir = join("tests/multi-agent/playbooks", `scenario-${stamp}`); await mkdir(out_dir, { recursive: true }); + const prior_lessons = await loadPriorLessons(spec); + const ctx: ScenarioContext = { spec, out_dir, roster: [], results: [], gap_signals: [], + prior_lessons, }; // Initialize output files @@ -1033,6 +1092,19 @@ async function main() { await writeFile(join(out_dir, "dispatch.jsonl"), ""); await writeFile(join(out_dir, "checkpoints.jsonl"), ""); + // Archive which prior lessons this run will see, so the retrospective + // can tell whether the T3 feedback loop actually fed back anything. + await writeFile( + join(out_dir, "prior_lessons.json"), + JSON.stringify(prior_lessons, null, 2) + ); + if (prior_lessons.length > 0) { + console.log(`▶ prior lessons loaded: ${prior_lessons.length} (from data/_playbook_lessons/)`); + for (const p of prior_lessons) { + console.log(` - ${p.date} ${p.client} (${p.cities}) — ${p.events_ok}/${p.events_total} ok`); + } + } + const checkpoints: OverviewCheckpoint[] = []; console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`); @@ -1109,26 +1181,38 @@ async function main() { ); console.log(`✓ lesson (${lessonSecs}s) → ${join(out_dir, "lesson.md")}`); - // Seed the lesson into playbook_memory for future retrieval. Keep - // the embedded `approach` + `context` terse per feedback_phase19_seed_text.md; - // the rich prose lives in lesson.md and a separate `rationale` field. + // Persist the lesson to data/_playbook_lessons/ so future scenarios + // can read it verbatim at startup. The /vectors/playbook_memory/seed + // endpoint rejects operations that don't match the `fill: Role xN + // in City, ST` regex (enforced in crates/vectord/src/service.rs), + // so embedding-based retrieval of cross-day lessons isn't wired. + // File-based read-back is durable and explicit — future scenarios + // pull from the lessons dir at startup and include top-N in the + // executor's system context. 
try { - const kinds = [...new Set(ctx.spec.events.map(e => e.kind))].join("+"); const cities = [...new Set(ctx.spec.events.map(e => e.city))].slice(0, 3).join(","); - await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, { - method: "POST", - headers: { "content-type": "application/json" }, - body: JSON.stringify({ - operation: `cross-day-lesson-${ctx.spec.date}`, - approach: `${kinds} day in ${cities}`, - context: `${ctx.spec.client} ${ctx.spec.date}`, - rationale: lesson.slice(0, 2000), - endorsed_names: [], - append: true, - }), - }); + const states = [...new Set(ctx.spec.events.map(e => e.state))].slice(0, 3).join(","); + const lessonsDir = join("data", "_playbook_lessons"); + await mkdir(lessonsDir, { recursive: true }); + const lessonRec = { + date: ctx.spec.date, + client: ctx.spec.client, + cities, + states, + events_total: ctx.spec.events.length, + events_ok: ctx.results.filter(r => r.ok).length, + checkpoint_count: checkpoints.length, + model: OVERVIEW_MODEL, + cloud: OVERVIEW_CLOUD, + lesson: lesson.trim(), + checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })), + created_at: new Date().toISOString(), + }; + const fname = `${ctx.spec.date}_${ctx.spec.client.replace(/\s+/g, "_")}_${Date.now()}.json`; + await writeFile(join(lessonsDir, fname), JSON.stringify(lessonRec, null, 2)); + console.log(` lesson archived → ${join(lessonsDir, fname)}`); } catch (e) { - console.log(` (lesson seed skipped: ${(e as Error).message})`); + console.log(` (lesson archive skipped: ${(e as Error).message})`); } } }