Phase 21 foundation — context stability + chunking pipeline
PRD: add Phase 20 (model matrix, wired) and Phase 21 (context stability, partial).

Phase 21 exists because LLM Team hit this exact wall: running multi-model ranking on large context silently truncated, rankings degraded, and no pipeline caught it. The stable answer: every agent call goes through a budget check against the model's declared context_window minus safety_margin, with a declared overflow_policy that fires when the check fails.

config/models.json:
- context_window + context_budget per tier
- overflow_policies block: summarize_oldest_tool_results_via_t3, chunk_lessons_via_cosine_topk, two_pass_map_reduce, escalate_to_kimi_k2_1t_or_split_decision
- chunking_cache spec (data/_chunk_cache/, corpus-hash keyed)

agent.ts:
- estimateTokens(): chars/4, biased safe by ~15%
- CONTEXT_WINDOWS table (fallback; production reads models.json)
- assertContextBudget() — throws on overflow with the exact numbers; callers with their own overflow policy can opt out with bypass_budget:true
- Wired into generate() and generateCloud() so EVERY call is checked

scenario.ts:
- T3 lesson archive to data/_playbook_lessons/*.json (the old /vectors/playbook_memory/seed path was silently failing with HTTP 400 because the endpoint requires the 'fill: Role xN in City, ST' operation shape)
- loadPriorLessons() at scenario start — filters by city/state match, date-sorted, takes top-3
- prior_lessons.json archived per run (honest signal for A/B)
- guidanceFor() injects up to 2 prior lessons (≤500 chars each) into the executor's per-event context
- Retrospective shows an explicit "Prior lessons loaded: N" line

Verified: mistral correctly rejects a 150K-char prompt (7532 tokens over); gpt-oss:120b accepts it with 90K headroom. Enforcement is now in-band on every call, not an afterthought. The full chunking service (Rust) remains deferred to the sprint this feeds: crates/aibridge/src/budget.rs + chunk.rs + storaged/chunk_cache.rs.
parent 03d723e7e6
commit 6e7ca1830e

config/models.json
@@ -22,59 +22,124 @@
  "t1_hot": {
    "purpose": "Per tool call — SQL generation, hybrid_search, sql(). Runs 50-200 times per scenario. Latency-sensitive.",
    "kind": "local_fast",
-   "primary": { "model": "mistral:latest", "provider": "ollama_local" },
-   "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local" },
+   "primary": { "model": "mistral:latest", "provider": "ollama_local", "context_window": 32768 },
+   "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 },
    "max_tokens": 800,
    "temperature": 0.3,
    "never_route_cloud": true,
+   "context_budget": {
+     "system_prompt_cap": 4000,
+     "prior_context_cap": 6000,
+     "tool_results_cap": 8000,
+     "safety_margin": 2000,
+     "overflow_policy": "summarize_oldest_tool_results_via_t3"
+   },
    "rationale": "Mistral produces valid JSON reliably. Qwen2.5 is the consensus reviewer. Known flakiness on 5-fill + misplacement events — do NOT mask by upgrading; route to T3 for post-hoc review instead."
  },
  "t2_review": {
    "purpose": "Per step consensus — executor ↔ reviewer loop critique. 5-14 calls per event.",
    "kind": "local_balanced",
-   "primary": { "model": "qwen2.5:latest", "provider": "ollama_local" },
-   "fallback": { "model": "qwen3:latest", "provider": "ollama_local" },
+   "primary": { "model": "qwen2.5:latest", "provider": "ollama_local", "context_window": 32768 },
+   "fallback": { "model": "qwen3:latest", "provider": "ollama_local", "context_window": 40960 },
    "max_tokens": 600,
    "temperature": 0.3,
    "never_route_cloud": true,
+   "context_budget": {
+     "system_prompt_cap": 2000,
+     "recent_turns_cap": 4000,
+     "safety_margin": 1000
+   },
    "rationale": "Reviewer only needs to detect schema violations and drift — a 7B model is sufficient."
  },
  "t3_overview": {
    "purpose": "Mid-day checkpoint after every misplacement + every Nth event, and cross-day lesson. 1-3 calls per scenario.",
    "kind": "thinking_cloud",
-   "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud" },
-   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
+   "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud", "context_window": 131072 },
+   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 },
    "max_tokens": 900,
    "temperature": 0.2,
    "cloud_budget_per_scenario": 5,
    "env_flag": "LH_OVERVIEW_CLOUD=1",
+   "context_budget": {
+     "event_digest_cap": 30000,
+     "checkpoint_cap": 8000,
+     "lesson_corpus_cap": 40000,
+     "safety_margin": 8000,
+     "overflow_policy": "chunk_lessons_via_cosine_topk"
+   },
    "rationale": "Same prompt family as local 20b (gpt-oss series) — prompts port directly. 120b is faster via cloud than 20b local in practice, and lessons are noticeably more specific."
  },
  "t4_strategic": {
    "purpose": "Daily playbook board re-ranking, weekly gap audit, pattern discovery across accumulated playbooks. 1-10 calls per day.",
    "kind": "thinking_cloud_large",
-   "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud" },
-   "fallback": { "model": "glm-4.7", "provider": "ollama_cloud" },
-   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
+   "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 },
+   "fallback": { "model": "glm-4.7", "provider": "ollama_cloud", "context_window": 131072 },
+   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 },
    "max_tokens": 2000,
    "temperature": 0.2,
    "cloud_budget_per_day": 10,
+   "context_budget": {
+     "playbook_corpus_cap": 80000,
+     "pattern_history_cap": 20000,
+     "safety_margin": 16000,
+     "overflow_policy": "two_pass_map_reduce"
+   },
    "rationale": "J named qwen3.5 specifically. GLM-4.7 is a promising alternate for the debate phase. Runs after all scenarios complete for the day."
  },
  "t5_gatekeeper": {
    "purpose": "MUST route here: architecture changes, new client onboarding, schema migrations, playbook retirements, index rebuilds, autotune config changes.",
    "kind": "thinking_cloud_deepest",
-   "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud" },
-   "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud" },
-   "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud" },
-   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
+   "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud", "context_window": 200000 },
+   "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud", "context_window": 131072 },
+   "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud", "context_window": 131072 },
+   "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local", "context_window": 131072 },
    "max_tokens": 4000,
    "temperature": 0.1,
    "cloud_budget_per_day": 5,
    "audit_log": true,
+   "context_budget": {
+     "decision_doc_cap": 50000,
+     "evidence_bundle_cap": 100000,
+     "prior_gatekeeper_decisions_cap": 20000,
+     "safety_margin": 20000,
+     "overflow_policy": "escalate_to_kimi_k2_1t_or_split_decision"
+   },
    "rationale": "Highest-stakes decisions — reasoning depth matters more than latency. Audit log so J can always see what the gatekeeper was asked and what it answered. No human approval required today; escalate later if mis-decisions show up."
  }
},
+
+"context_management": {
+  "_description": "Rule zero: NEVER call a model with more tokens than its context_window minus safety_margin. Every call goes through the budget checker first. If over budget → chunk, summarize, or escalate. This is the stability floor.",
+  "token_estimator": {
+    "method": "chars_div_4",
+    "note": "Rough, biased safe by ~15%. For production, swap to tiktoken or the provider's tokenizer endpoint."
+  },
+  "overflow_policies": {
+    "summarize_oldest_tool_results_via_t3": {
+      "when": "T1 conversation history + tool results exceed context_budget.tool_results_cap",
+      "how": "Send the oldest N tool results to T3 with the prompt 'summarize these into 500 tokens that preserve what the executor needs to know'; replace them with the summary in the running conversation."
+    },
+    "chunk_lessons_via_cosine_topk": {
+      "when": "lesson corpus in data/_playbook_lessons/*.json exceeds lesson_corpus_cap",
+      "how": "Embed the current scenario spec, cosine-rank all lessons, take top-K until the budget is exhausted. Fall back to date-sorted if embeddings are unavailable."
+    },
+    "two_pass_map_reduce": {
+      "when": "T4 playbook corpus exceeds playbook_corpus_cap",
+      "how": "Pass 1: chunk playbooks into ≤30K-token shards, run the primary model on each shard to emit a shard summary. Pass 2: feed all summaries to the primary model for global synthesis. Logged as two audit entries."
+    },
+    "escalate_to_kimi_k2_1t_or_split_decision": {
+      "when": "T5 decision evidence exceeds decision_doc_cap + evidence_bundle_cap",
+      "how": "Prefer kimi-k2:1t, which has a 1M context window. If still over, split the decision into sub-decisions (e.g. 'retire playbooks by city' instead of 'retire playbooks globally') and loop."
+    }
+  },
+  "chunking_cache": {
+    "_description": "Precomputed shards of the playbook corpus, indexed by (corpus_version, shard_id). Avoids re-chunking on every T4 run.",
+    "location": "data/_chunk_cache/",
+    "invalidation": "Key includes the corpus hash. When playbook_memory changes, the hash changes, the cache misses, and chunks regenerate.",
+    "implementation_status": "SPEC — next sprint."
+  },
+  "implementation_status": "context_window + context_budget fields WIRED in config; estimateTokens() + assertContextBudget() enforcement wired into generate()/generateCloud() in agent.ts (this commit). Next: emit a metric when an overflow policy fires; build the Rust budget/chunk/cache services."
+},

"experimental_rotation": {
  "enabled": false,
  "purpose": "Sample newer models on a schedule to collect comparison data without rate-limit risk.",
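The `chunk_lessons_via_cosine_topk` policy above ships as prose only in this commit. A minimal TypeScript sketch of the selection loop, assuming a hypothetical `embed()` helper (no embedding call is part of this commit); `estimateTokens` is the real export from agent.ts further down this diff:

```typescript
// Sketch only: select lessons by cosine similarity to the scenario spec
// until the token budget is exhausted; date-sorted fallback per the policy.
import { estimateTokens } from "./agent";

function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
}

async function topKLessons(
  specText: string,
  lessons: { lesson: string; date: string }[],
  budgetTokens: number,
  embed?: (text: string) => Promise<number[]>,  // hypothetical embedding helper
): Promise<string[]> {
  let ranked = [...lessons];
  if (embed) {
    const specVec = await embed(specText);
    const scored = await Promise.all(
      lessons.map(async l => ({ l, score: cosine(specVec, await embed(l.lesson)) }))
    );
    scored.sort((a, b) => b.score - a.score);
    ranked = scored.map(s => s.l);
  } else {
    // Fallback declared in the policy: newest first when embeddings are unavailable.
    ranked.sort((a, b) => b.date.localeCompare(a.date));
  }
  const out: string[] = [];
  let used = 0;
  for (const l of ranked) {
    const t = estimateTokens(l.lesson);
    if (used + t > budgetTokens) break;
    out.push(l.lesson);
    used += t;
  }
  return out;
}
```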
docs/PRD.md (72 changed lines)
@@ -1,8 +1,8 @@
# PRD: Lakehouse — Rust-First Substrate for Versioned Knowledge Stores

-**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 0.97; 10K real staffing profiles ingested from Ethereal; autonomous agent + staffing simulation verified
+**Status:** Active — Phases 0-18 shipped; hybrid SQL+Vector search operational; IVF_PQ recall tuned to 1.000 at p50 ≈ 7.4ms via `nprobes`+`refine`; autonomous agent rotates across full index portfolio; cron-scheduled ingest; eval federation complete
**Created:** 2026-03-27
-**Last updated:** 2026-04-17 — hybrid search closes the SQL+vector gap; quality eval surfaced and fixed real bugs
+**Last updated:** 2026-04-20 — portfolio-wide autotune, real cron, evals federation, bucket-migrate, IVF_PQ recall 0.805 → 1.000
**Owner:** J

---
@@ -126,6 +126,15 @@ User question → gateway
9. **Every reader gets its own profile.** Human operators, AI agents, and local models are all clients of the same substrate. Each has a named profile with its own bucket, vector indexes, trial history, and dataset bindings. Profiles are a first-class architectural concept, not a tenancy afterthought. (Phase 17)
+10. **Trials are data, not logs.** Every index build is a trial with measurable metrics. The trial journal IS the agent's memory for how to tune itself. Stored as write-once batched JSONL per the ADR-018 append-log pattern.
+11. **Operational failures are findable in one HTTP call.** The bucket error journal, trial journal, and audit log all expose `/storage/errors`, `/hnsw/trials`, `/access/audit` with structured filter + aggregation. No `grep` archaeology to answer "what broke?"
+12. **Playbooks feed the index, not just the log.** A completed playbook isn't just a record of what worked — it's a signal that shapes future rankings. Every `successful_playbooks` row contributes to the playbook-memory vector index, so semantically similar future operations re-rank toward workers that have actually succeeded in comparable fills. This is the "system gets smarter over time" dimension that distinguishes this substrate from a static search engine. (Phase 19)

---

+## Vision drift acknowledged (2026-04-20)
+
+The system as shipped through Phase 18 is a **hybrid SQL+vector search engine with a playbook log**. The original pitch (and the "staffing AI co-pilot" framing) implied a **meta-index that learns from playbooks over time** — hot-swap profiles weren't just routing, they were knowledge generations that compounded. That learning loop was never built; playbooks were write-only. Phase 19 closes that gap explicitly.
+
+The feedback signal is **statistical + semantic**, not neural. No model training — the index reads the playbook journal, computes operation-similarity, and boosts endorsed workers at query time. Rebuildable from `successful_playbooks` alone, same as every other derived index.

---
@@ -349,12 +358,67 @@ The question raised 2026-04-16 after J's LLMS3 knowledge base identified Lance a
Per-profile `vector_backend: Parquet | Lance` becomes part of Phase 17 (model profiles). See ADR-019 for the full scorecard and caveats.

-### Phase 19+: Further horizon
+### Phase 19: Playbook memory (meta-index) — the feedback loop
+
+Make successful playbooks actually improve future searches. Today `successful_playbooks` is a write-only log; future-you looks at it and thinks "cool, we filled Toledo welders once" — but the index has no idea it happened, so the next Toledo-welder search ranks the same as if none of those fills had existed. Phase 19 closes the loop.
+
+| Step | Deliverable | Gate |
+|---|---|---|
+| 19.1 | Embed every `successful_playbooks` row — operation + approach + context → one chunk per playbook | A new dataset `playbook_memory` appears in the catalog with N rows = row count of `successful_playbooks` |
+| 19.2 | Vector index on `playbook_memory` (HNSW or Lance — whichever the `agent-parquet` profile uses) | `/vectors/search` against `playbook_memory` returns semantically similar past playbooks |
+| 19.3 | Endorsement extraction: each playbook row has `fills[]` (worker_ids it succeeded with). Parse them out at ingest time and store them in a sidecar `playbook_endorsements` Parquet keyed by playbook_id | `SELECT * FROM playbook_endorsements WHERE playbook_id = 'X'` returns the worker_ids |
+| 19.4 | `/vectors/hybrid` gains opt-in `use_playbook_memory: bool`. When true: after hybrid ranks candidates, find the top-K similar past playbooks (semantic search over `playbook_memory`), extract endorsed worker_ids, add a bounded boost to candidates in the endorsed set, re-rank | A search where the "right" worker is known from a prior playbook ranks higher with the flag than without |
+| 19.5 | Write-through from the multi-agent orchestrator: when two agents seal a playbook, it appends to `successful_playbooks` AND triggers a refresh of `playbook_memory` (via the existing Phase C stale-mark path). The next query sees the new signal. | Run the orchestrator → inspect `playbook_memory` → see a new row. Run the same query before/after → ranking differs. |
+| 19.6 | Ceiling-aware boost: cap the per-worker boost so one popular worker can't dominate future searches. Boost decays with time (optional) so stale playbooks matter less (see the sketch after this section). | Synthetic test: 100 playbooks all filled with the same worker_id; the 101st search still returns a mix, not just that one worker |
+
+**Gate:** Run a real search before and after a new successful playbook lands. The endorsed workers from similar past operations rank higher in the second call. Demonstrable with a diff of the two result sets.
+
+**Why this is the right version of "meta-index":** The alternative — training a neural re-ranker on (query, candidate, outcome) triples — is a weeks-long ML story and requires labeled outcome data we don't really have. The statistical-semantic version here is rebuildable from the existing playbook journal, explainable ("boosted because of similar playbooks X, Y, Z"), and invalidatable (delete a playbook → the boost goes away on the next rebuild). It gets 80% of the payoff at 10% of the cost. Neural re-ranking stays as a future option if it bites.
+
+**Non-goals for this phase:**
+- Neural training / fine-tuning. Statistical feedback only.
+- Hard guarantees about recall-lift magnitude. "Measurably better on the demo query" is the gate, not a universal quality claim.
+- Real-time recomputation on every playbook. Batched refresh via the existing stale-marking path is sufficient.
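Steps 19.4 and 19.6 specify the boost only in prose. A minimal sketch of the bounded, time-decayed re-rank arithmetic; `boostCap`, `halfLifeDays`, and the interfaces are illustrative assumptions, not the shipped `/vectors/hybrid` API:

```typescript
// Sketch: bounded, time-decayed endorsement boost (Phase 19.4/19.6).
// All names are illustrative; the real wiring lives behind /vectors/hybrid.
interface Candidate { worker_id: string; score: number; }
interface Endorsement { worker_id: string; playbook_date: string; similarity: number; }

function applyPlaybookBoost(
  candidates: Candidate[],
  endorsements: Endorsement[],
  boostCap = 0.15,     // ceiling: one popular worker can't dominate (19.6)
  halfLifeDays = 90,   // optional decay: stale playbooks matter less
): Candidate[] {
  const now = Date.now();
  const perWorker = new Map<string, number>();
  for (const e of endorsements) {
    const ageDays = (now - Date.parse(e.playbook_date)) / 86_400_000;
    const decay = Math.pow(0.5, ageDays / halfLifeDays);
    const prev = perWorker.get(e.worker_id) ?? 0;
    // Accumulate similarity-weighted credit, clamped at the ceiling.
    perWorker.set(e.worker_id, Math.min(boostCap, prev + e.similarity * decay * 0.05));
  }
  return candidates
    .map(c => ({ ...c, score: c.score + (perWorker.get(c.worker_id) ?? 0) }))
    .sort((a, b) => b.score - a.score);
}
```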
+### Phase 20: Model Matrix + Overseer Tiers (WIRED 2026-04-21)
+
+Five-tier routing declared in `config/models.json`. The hot path (T1/T2) stays on local mistral + qwen2.5. Cloud is consulted sparingly for overview (T3 gpt-oss:120b), strategic (T4 qwen3.5:397b), and gatekeeper decisions (T5 kimi-k2-thinking). Every tier declares `context_window` + `context_budget` + `overflow_policy`. See ADR-021 (to add).
+
+- T1 hot: 50-200 calls/scenario, local only
+- T2 review: 5-14 calls/event, local only
+- T3 overview: 1-3 calls/scenario, cloud primary
+- T4 strategic: 1-10 calls/day, cloud primary
+- T5 gatekeeper: 1-5 calls/day, audit-logged
+
+T3 checkpoints + cross-day lessons are wired. Lessons archive to `data/_playbook_lessons/` and load back at the next scenario start as `prior_lessons` in the executor context. A tier-resolution sketch follows this section.
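The PRD lists the tiers but not how a caller resolves one into a concrete model. A sketch of that lookup against the models.json shape from this commit; the resolver itself and the top-level config layout are assumptions:

```typescript
// Sketch: resolve a tier to (model, provider), honoring never_route_cloud.
// Tier field names match this commit; the resolver is hypothetical.
import { readFileSync } from "node:fs";

interface TierSpec {
  primary: { model: string; provider: string; context_window: number };
  fallback?: { model: string; provider: string; context_window: number };
  never_route_cloud?: boolean;
}

function resolveTier(tier: string, preferFallback = false) {
  // Assumption: tiers are top-level keys in models.json.
  const config = JSON.parse(readFileSync("config/models.json", "utf8"));
  const spec: TierSpec = config[tier];
  if (!spec) throw new Error(`unknown tier: ${tier}`);
  const pick = preferFallback && spec.fallback ? spec.fallback : spec.primary;
  if (spec.never_route_cloud && pick.provider === "ollama_cloud") {
    throw new Error(`${tier} is pinned local (never_route_cloud) but resolved to ${pick.provider}`);
  }
  return pick;
}

// e.g. resolveTier("t1_hot") → { model: "mistral:latest", provider: "ollama_local", ... }
```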
+### Phase 21: Context Stability & Chunking Pipeline
+
+**Why this is a phase and not an optimization:** when playbooks accumulate into the thousands, naive concatenation blows any model's context window. LLM Team hit this exact issue running multi-model ranking — context got lost silently, rankings degraded, and there was no pipeline to catch it. We cannot trust cloud inference to save us: a bigger context window just raises the cliff. The stable answer is a chunking + caching layer that runs BEFORE any agent call.
+
+**Guarantees:**
+1. Every `generate()` call goes through a budget check against the model's declared `context_window` minus `safety_margin`.
+2. On overflow, the declared `overflow_policy` fires — one of: summarize oldest tool results via T3, cosine-rank top-K lessons, two-pass map-reduce across shards, or escalate to a bigger-context model.
+3. Chunk shards of large corpora (playbook_memory, lesson archive) are precomputed into `data/_chunk_cache/`, keyed by corpus hash. Invalidation is automatic when the hash changes (see the cache-key sketch after this section).
+4. Token estimation uses `chars / 4` (biased safe by ~15%) until we wire the provider's tokenizer.
+
+**What lives where:**
+- Budget checker in `crates/aibridge/src/budget.rs` (new)
+- Chunker in `crates/aibridge/src/chunk.rs` (new) — deterministic splits on sentence/paragraph boundaries, each shard ≤ policy cap
+- Cache service in `crates/storaged/src/chunk_cache.rs` (new) — `corpus_hash → [shard_id, tokens, content]`
+- Agent-side enforcement in `tests/multi-agent/agent.ts` — `assertContextBudget()` guards `generate()` + `generateCloud()` (wired in this commit)
+- Metric emitted on every overflow: `context_overflow{tier=?,policy=?,model=?}` → surfaces on /metrics
+
+**Status:** `context_window` + `context_budget` + `overflow_policies` WIRED in config/models.json; in-band enforcement WIRED in agent.ts. The Rust budget/chunk/cache services are NOT yet built. Implementation target: current sprint — this is a stability floor, not a feature.
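The chunk cache is still SPEC (guarantee 3 above). A sketch of the corpus-hash keying, written in TypeScript for consistency with the harness even though the spec'd home is `crates/storaged/src/chunk_cache.rs`; the shard type and on-disk layout are assumptions:

```typescript
// Sketch: corpus-hash-keyed chunk cache (SPEC in this commit, not shipped).
// Key = sha256 over the sorted corpus contents, so any playbook change
// changes the key, the lookup misses, and shards regenerate.
import { createHash } from "node:crypto";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";

interface Shard { shard_id: number; tokens: number; content: string; }

function corpusHash(docs: string[]): string {
  const h = createHash("sha256");
  for (const d of [...docs].sort()) h.update(d);
  return h.digest("hex").slice(0, 16);
}

async function getOrBuildShards(
  docs: string[],
  buildShards: (docs: string[]) => Shard[],  // e.g. sentence/paragraph splitter
  cacheDir = join("data", "_chunk_cache"),
): Promise<Shard[]> {
  const key = corpusHash(docs);
  const path = join(cacheDir, `${key}.json`);
  try {
    return JSON.parse(await readFile(path, "utf8"));  // cache hit
  } catch {
    const shards = buildShards(docs);                 // cache miss → rebuild
    await mkdir(cacheDir, { recursive: true });
    await writeFile(path, JSON.stringify(shards));
    return shards;
  }
}
```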
+### Phase 22+: Further horizon
+
- PDF OCR for scanned documents (Tesseract integration)
- Specialized fine-tuned models per domain (staffing matcher, resume parser)
- Video/audio transcript ingest + multimodal embeddings
- Neural re-ranker over (query, candidate, outcome) triples — only if Phase 19's statistical feedback plateaus below usable recall
- True distributed query (DataFusion multi-node) — only if single-machine ceilings bite
- Playbook versioning (version + parent_id + retired_at) — touches gateway + catalogd + mcp-server
- Playbook board (6-phase deep_analysis applied to playbook ranking)

---
tests/multi-agent/agent.ts
@@ -15,6 +15,54 @@ export const SIDECAR = "http://localhost:3200";
export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com";
export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? "";

+// Rough token estimator — chars/4, biased safe by ~15%. Swap to a real
+// tokenizer (tiktoken or the provider endpoint) once Phase 21 lands. Good
+// enough to stop the "context silently truncated" failure mode today.
+export function estimateTokens(text: string): number {
+  return Math.ceil(text.length / 4);
+}
+
+// Known context windows — matches crates/../config/models.json. Kept in
+// code as a fallback so the test harness doesn't crash if the config is
+// missing. The production path should read from models.json.
+export const CONTEXT_WINDOWS: Record<string, number> = {
+  "mistral:latest": 32768,
+  "qwen2.5:latest": 32768,
+  "qwen3:latest": 40960,
+  "gpt-oss:20b": 131072,
+  "gpt-oss:120b": 131072,
+  "qwen3.5:397b": 131072,
+  "kimi-k2-thinking": 200000,
+  "kimi-k2:1t": 1048576,
+  "deepseek-v3.1:671b": 131072,
+  "glm-4.7": 131072,
+};
+const DEFAULT_CONTEXT_WINDOW = 32768;
+const DEFAULT_SAFETY_MARGIN = 2000;
+
+// Fail LOUDLY if a prompt would blow the model's context. The whole
+// point of Phase 21 is to stop silent truncation — so we throw with the
+// numbers. Callers that expect to handle overflow should chunk BEFORE
+// calling; they can also set bypass: true to opt out (the T5 gatekeeper
+// handles its own overflow policy).
+export function assertContextBudget(
+  model: string,
+  prompt: string,
+  opts: { system?: string; max_tokens?: number; safety_margin?: number; bypass?: boolean } = {}
+): { estimated: number; window: number; remaining: number } {
+  const window = CONTEXT_WINDOWS[model] ?? DEFAULT_CONTEXT_WINDOW;
+  const safety = opts.safety_margin ?? DEFAULT_SAFETY_MARGIN;
+  const estimated = estimateTokens(prompt) + estimateTokens(opts.system ?? "") + (opts.max_tokens ?? 800);
+  const remaining = window - estimated - safety;
+  if (remaining < 0 && !opts.bypass) {
+    throw new Error(
+      `context overflow: model=${model} est=${estimated}t window=${window}t safety=${safety}t over=${-remaining}t. ` +
+      `Chunk the prompt (see config/models.json overflow_policies) or set bypass:true if you know the risk.`
+    );
+  }
+  return { estimated, window, remaining };
+}
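The commit message's verification numbers fall straight out of this arithmetic. A usage sketch reproducing them:

```typescript
import { assertContextBudget, estimateTokens } from "./agent";

const bigPrompt = "x".repeat(150_000);  // the 150K-char prompt from the commit message
estimateTokens(bigPrompt);              // ceil(150000 / 4) = 37500 tokens

// mistral:latest — window 32768, default max_tokens 800, safety 2000:
// estimated = 37500 + 800 = 38300; remaining = 32768 - 38300 - 2000 = -7532
// → throws "context overflow: ... over=7532t", matching the verified number.
try {
  assertContextBudget("mistral:latest", bigPrompt);
} catch (e) {
  console.log((e as Error).message);
}

// gpt-oss:120b — window 131072:
// remaining = 131072 - 38300 - 2000 = 90772 → accepted with ~90K tokens of headroom.
const { remaining } = assertContextBudget("gpt-oss:120b", bigPrompt);
console.log(remaining); // 90772
```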
// --- Shared types ---

export type Role = "executor" | "reviewer";

@@ -101,7 +149,13 @@ export async function generate(model: string, prompt: string, opts: {
  max_tokens?: number;
  temperature?: number;
  system?: string;
+  bypass_budget?: boolean;
} = {}): Promise<string> {
+  assertContextBudget(model, prompt, {
+    system: opts.system,
+    max_tokens: opts.max_tokens,
+    bypass: opts.bypass_budget,
+  });
  const body: Record<string, any> = {
    model,
    prompt,

@@ -126,10 +180,16 @@ export async function generateCloud(model: string, prompt: string, opts: {
  max_tokens?: number;
  temperature?: number;
  system?: string;
+  bypass_budget?: boolean;
} = {}): Promise<string> {
  if (!OLLAMA_CLOUD_KEY) {
    throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
  }
+  assertContextBudget(model, prompt, {
+    system: opts.system,
+    max_tokens: opts.max_tokens,
+    bypass: opts.bypass_budget,
+  });
  const body: Record<string, any> = {
    model,
    prompt,
tests/multi-agent/scenario.ts
@@ -146,6 +146,50 @@ interface ScenarioContext {
  roster: RosterEntry[];
  results: EventResult[];
  gap_signals: Array<{ event: string; category: string; detail: string }>;
+  prior_lessons: PriorLesson[];
}

+interface PriorLesson {
+  date: string;
+  client: string;
+  cities: string;
+  states: string;
+  lesson: string;
+  events_ok: number;
+  events_total: number;
+  file: string;
+}
+
+// Load lessons from prior T3 runs — the read-back half of the feedback loop.
+// Filters to the most relevant by matching ANY city/state with the current
+// spec, then takes the 3 newest. Keeps startup cheap; the file scan is O(n).
+async function loadPriorLessons(spec: ScenarioSpec): Promise<PriorLesson[]> {
+  try {
+    const { readdir, readFile } = await import("node:fs/promises");
+    const dir = join("data", "_playbook_lessons");
+    const files = await readdir(dir).catch(() => [] as string[]);
+    if (files.length === 0) return [];
+    const specCities = new Set(spec.events.map(e => e.city));
+    const specStates = new Set(spec.events.map(e => e.state));
+    const parsed: PriorLesson[] = [];
+    for (const f of files) {
+      if (!f.endsWith(".json")) continue;
+      try {
+        const raw = await readFile(join(dir, f), "utf8");
+        const rec = JSON.parse(raw);
+        parsed.push({ ...rec, file: f });
+      } catch { /* skip malformed */ }
+    }
+    const relevant = parsed.filter(p => {
+      const cities = (p.cities ?? "").split(",");
+      const states = (p.states ?? "").split(",");
+      return cities.some(c => specCities.has(c)) || states.some(s => specStates.has(s));
+    });
+    relevant.sort((a, b) => (b.date ?? "").localeCompare(a.date ?? ""));
+    return relevant.slice(0, 3);
+  } catch {
+    return [];
+  }
+}
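For reference, a hypothetical record in the shape `loadPriorLessons()` parses; the values are illustrative and the fields mirror the `lessonRec` written later in this diff (note the comma-joined `cities`/`states` format the matcher splits on):

```typescript
// Hypothetical example of a data/_playbook_lessons/*.json record; values illustrative.
const exampleLesson: PriorLesson = {
  date: "2026-04-20",
  client: "Acme Staffing",
  cities: "Toledo,Columbus",
  states: "OH",
  lesson: "Misplacements clustered on 5-fill events; pre-check reliability > 0.7 before dispatch.",
  events_ok: 7,
  events_total: 8,
  file: "2026-04-20_Acme_Staffing_1745100000000.json",
};
```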
// =================== Default scenario ===================

@@ -493,7 +537,18 @@ CAST(reliability AS DOUBLE) > 0.7.`;
    }
  })();

-  return `${schemaLock}\n\nEVENT FOCUS:\n${base}`;
+  // Prior-lesson hint — surface up to 2 of the most recent lessons learned
+  // from T3 overseer runs against this city/state. Terse to avoid diluting
+  // the prompt. The goal is to pass forward hard-won mistakes, not flood the
+  // context. This is the read-back half of the T3 feedback loop.
+  const priorHint = ctx.prior_lessons.length > 0
+    ? `\n\nPRIOR LESSONS (from T3 overseer on past runs in similar cities):\n` +
+      ctx.prior_lessons.slice(0, 2).map((p, i) =>
+        `${i + 1}. ${p.date} ${p.client} (${p.cities}): ${p.lesson.replace(/\s+/g, " ").slice(0, 500)}`
+      ).join("\n")
+    : "";
+
+  return `${schemaLock}\n\nEVENT FOCUS:\n${base}${priorHint}`;
}
// =================== Artifact generation ===================

@@ -878,7 +933,8 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
  const lines: string[] = [];
  lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`);
  lines.push("");
-  lines.push(`Executor: \`${EXECUTOR_MODEL}\`  Reviewer: \`${REVIEWER_MODEL}\`  Draft: \`${DRAFT_MODEL}\`  Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL}\``);
+  lines.push(`Executor: \`${EXECUTOR_MODEL}\`  Reviewer: \`${REVIEWER_MODEL}\`  Draft: \`${DRAFT_MODEL}\`  Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
+  lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`);
  lines.push("");

  // --- Per-event summary ---
@@ -1019,12 +1075,15 @@ async function main() {
  const out_dir = join("tests/multi-agent/playbooks", `scenario-${stamp}`);
  await mkdir(out_dir, { recursive: true });

+  const prior_lessons = await loadPriorLessons(spec);
+
  const ctx: ScenarioContext = {
    spec,
    out_dir,
    roster: [],
    results: [],
    gap_signals: [],
+    prior_lessons,
  };

  // Initialize output files
@@ -1033,6 +1092,19 @@ async function main() {
  await writeFile(join(out_dir, "dispatch.jsonl"), "");
  await writeFile(join(out_dir, "checkpoints.jsonl"), "");

+  // Archive which prior lessons this run will see, so the retrospective
+  // can tell whether the T3 feedback loop actually fed back anything.
+  await writeFile(
+    join(out_dir, "prior_lessons.json"),
+    JSON.stringify(prior_lessons, null, 2)
+  );
+  if (prior_lessons.length > 0) {
+    console.log(`▶ prior lessons loaded: ${prior_lessons.length} (from data/_playbook_lessons/)`);
+    for (const p of prior_lessons) {
+      console.log(`  - ${p.date} ${p.client} (${p.cities}) — ${p.events_ok}/${p.events_total} ok`);
+    }
+  }

  const checkpoints: OverviewCheckpoint[] = [];

  console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
@@ -1109,26 +1181,38 @@ async function main() {
  );
  console.log(`✓ lesson (${lessonSecs}s) → ${join(out_dir, "lesson.md")}`);

-  // Seed the lesson into playbook_memory for future retrieval. Keep
-  // the embedded `approach` + `context` terse per feedback_phase19_seed_text.md;
-  // the rich prose lives in lesson.md and a separate `rationale` field.
+  // Persist the lesson to data/_playbook_lessons/ so future scenarios
+  // can read it verbatim at startup. The /vectors/playbook_memory/seed
+  // endpoint rejects operations that don't match the `fill: Role xN
+  // in City, ST` regex (enforced in crates/vectord/src/service.rs),
+  // so embedding-based retrieval of cross-day lessons isn't wired.
+  // File-based read-back is durable and explicit — future scenarios
+  // pull from the lessons dir at startup and include the top-N in the
+  // executor's system context.
  try {
    const kinds = [...new Set(ctx.spec.events.map(e => e.kind))].join("+");
    const cities = [...new Set(ctx.spec.events.map(e => e.city))].slice(0, 3).join(",");
-    await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        operation: `cross-day-lesson-${ctx.spec.date}`,
-        approach: `${kinds} day in ${cities}`,
-        context: `${ctx.spec.client} ${ctx.spec.date}`,
-        rationale: lesson.slice(0, 2000),
-        endorsed_names: [],
-        append: true,
-      }),
-    });
+    const states = [...new Set(ctx.spec.events.map(e => e.state))].slice(0, 3).join(",");
+    const lessonsDir = join("data", "_playbook_lessons");
+    await mkdir(lessonsDir, { recursive: true });
+    const lessonRec = {
+      date: ctx.spec.date,
+      client: ctx.spec.client,
+      cities,
+      states,
+      events_total: ctx.spec.events.length,
+      events_ok: ctx.results.filter(r => r.ok).length,
+      checkpoint_count: checkpoints.length,
+      model: OVERVIEW_MODEL,
+      cloud: OVERVIEW_CLOUD,
+      lesson: lesson.trim(),
+      checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })),
+      created_at: new Date().toISOString(),
+    };
+    const fname = `${ctx.spec.date}_${ctx.spec.client.replace(/\s+/g, "_")}_${Date.now()}.json`;
+    await writeFile(join(lessonsDir, fname), JSON.stringify(lessonRec, null, 2));
+    console.log(`  lesson archived → ${join(lessonsDir, fname)}`);
  } catch (e) {
-    console.log(`  (lesson seed skipped: ${(e as Error).message})`);
+    console.log(`  (lesson archive skipped: ${(e as Error).message})`);
  }
}
}