{ "_description": "Lakehouse model matrix — authoritative routing for all agent tiers. Local models do the heavy lifting; cloud models are consulted sparingly for overview, strategic, and gatekeeper decisions. Read by tests/multi-agent/scenario.ts and mcp-server/index.ts.", "version": 1, "updated": "2026-04-21", "providers": { "ollama_local": { "base_url": "http://localhost:11434", "key_env": null }, "ollama_cloud": { "base_url": "https://ollama.com", "key_env": "OLLAMA_CLOUD_KEY", "key_source": "/root/llm_team_config.json → providers.ollama_cloud.api_key", "rate_budget": { "calls_per_hour": 200, "calls_per_day": 2000, "notes": "Paid tier — generous. Policy: keep overview calls ≤ 3/scenario, strategic ≤ 10/day, gatekeeper ≤ 5/day." } } }, "tiers": { "t1_hot": { "purpose": "Per tool call — SQL generation, hybrid_search, sql(). Runs 50-200 times per scenario. Latency-sensitive.", "kind": "local_fast", "primary": { "model": "mistral:latest", "provider": "ollama_local" }, "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local" }, "max_tokens": 800, "temperature": 0.3, "never_route_cloud": true, "rationale": "Mistral produces valid JSON reliably. Qwen2.5 is the consensus reviewer. Known flakiness on 5-fill + misplacement events — do NOT mask by upgrading; route to T3 for post-hoc review instead." }, "t2_review": { "purpose": "Per step consensus — executor ↔ reviewer loop critique. 5-14 calls per event.", "kind": "local_balanced", "primary": { "model": "qwen2.5:latest", "provider": "ollama_local" }, "fallback": { "model": "qwen3:latest", "provider": "ollama_local" }, "max_tokens": 600, "temperature": 0.3, "never_route_cloud": true, "rationale": "Reviewer only needs to detect schema violations and drift — a 7B model is sufficient." }, "t3_overview": { "purpose": "Mid-day checkpoint after every misplacement + every Nth event, and cross-day lesson. 1-3 calls per scenario.", "kind": "thinking_cloud", "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud" }, "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, "max_tokens": 900, "temperature": 0.2, "cloud_budget_per_scenario": 5, "env_flag": "LH_OVERVIEW_CLOUD=1", "rationale": "Same prompt family as local 20b (gpt-oss series) — prompts port directly. 120b is faster via cloud than 20b local in practice, and lessons are noticeably more specific." }, "t4_strategic": { "purpose": "Daily playbook board re-ranking, weekly gap audit, pattern discovery across accumulated playbooks. 1-10 calls per day.", "kind": "thinking_cloud_large", "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, "fallback": { "model": "glm-4.7", "provider": "ollama_cloud" }, "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, "max_tokens": 2000, "temperature": 0.2, "cloud_budget_per_day": 10, "rationale": "J named qwen3.5 specifically. GLM-4.7 is a promising alternate for debate phase. Runs after all scenarios complete for the day." }, "t5_gatekeeper": { "purpose": "MUST route here: architecture changes, new client onboarding, schema migrations, playbook retirements, index rebuilds, autotune config changes.", "kind": "thinking_cloud_deepest", "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud" }, "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud" }, "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud" }, "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" }, "max_tokens": 4000, "temperature": 0.1, "cloud_budget_per_day": 5, "audit_log": true, "rationale": "Highest-stakes decisions — reasoning depth matters more than latency. Audit log so J can always see what the gatekeeper was asked and what it answered. No human approval required today; escalate later if mis-decisions show up." 
} }, "experimental_rotation": { "enabled": false, "purpose": "Sample newer models on a schedule to collect comparison data without rate-limit risk.", "candidates": [ { "model": "minimax-m2.7", "notes": "Newer minimax; unknown output stability" }, { "model": "glm-5", "notes": "GLM next-gen; larger context" }, { "model": "glm-5.1", "notes": "Incremental on GLM-5" }, { "model": "qwen3-next:80b", "notes": "Qwen's experimental successor; smaller than 3.5" }, { "model": "qwen3-coder-next", "notes": "Coder-optimized — good for SQL gen T1 experiments" }, { "model": "deepseek-v3.2", "notes": "Smaller deepseek; reasoning/coding" }, { "model": "nemotron-3-super", "notes": "NVIDIA 230B; general" }, { "model": "cogito-2.1:671b", "notes": "671B general" }, { "model": "mistral-large-3:675b", "notes": "Mistral's flagship; good T3 candidate" } ], "rotation": "weekly", "sample_rate": 0.1, "apply_to_tier": "t4_strategic", "notes": "When enabled, T4 routes 10% of calls to a rotating experimental model. Log comparison in /data/_model_eval/ — if the experimental consistently beats primary across 3 rotations, promote it to primary." }, "playbook_versioning": { "enabled": true, "purpose": "A playbook can work, then break when architecture changes. Versioning lets us pin which change retired which playbook.", "dataset": "playbook_memory", "schema_additions": { "version": "integer — auto-increment per operation", "parent_id": "string — previous version entry_id for same operation (null for v1)", "retired_at": "timestamp — set when success_rate drops or architecture changes", "retirement_reason": "string — e.g. 'schema_migration:workers_500k 2026-05-03'", "architecture_snapshot": "object — crate versions, index name, schema fingerprint at seed time" }, "retire_triggers": [ "success_rate < 0.3 over last 20 citations", "schema_fingerprint mismatch detected at retrieval time", "architecture change event emitted by ingestd/vectord", "T5 gatekeeper explicitly retires via /vectors/playbook_memory/retire" ], "read_back_policy": "retrieval returns only non-retired versions. History endpoint /vectors/playbook_memory/history/{operation} returns the full chain.", "ui_surface": "mcp-server to render a diff view: side-by-side of playbook versions with a timeline of what changed and when.", "implementation_status": "SPEC — not yet wired. Target: next sprint. Touches gateway + catalogd + mcp-server." }, "matrix_index_hybrid_search_note": "Phase 22 candidate: elevate the hybrid_search T1 tool to consult T3 when a pool returns <3 matches OR when the same (role, city) combo has failed N times in 24h. Consult result is a reformulated sql_filter the executor retries with. Keeps T1 fast on the happy path, escalates to T3 only on low-recall signals." }