Model matrix — 5 tiers, local hard workers + cloud overseers
config/models.json is the authoritative catalog. The hot path (T1/T2) stays local; cloud is consulted only for overview (T3), strategic (T4), and gatekeeper (T5) calls. J named qwen3.5 plus the newer models (minimax-m2.7, glm-5, qwen3-next) specifically; all are mapped with real reachable IDs verified against ollama.com/api/tags.

Tier shape:
- t1_hot: mistral + qwen2.5, local; 50-200 calls/scenario
- t2_review: qwen2.5 + qwen3, local; 5-14 calls/event
- t3_overview: gpt-oss:120b, cloud; 1-3 calls/scenario
- t4_strategic: qwen3.5:397b + glm-4.7; 1-10 calls/day
- t5_gatekeeper: kimi-k2-thinking; 1-5 calls/day, audit-logged

Rate budgets are declared in-config. The Ollama Cloud paid tier is generous, but we cap overview/strategic/gatekeeper so no single rogue scenario can blow the day's quota.

Experimental rotation list is wired but disabled by default. When enabled, T4 randomly routes 10% of calls to a rotating minimax/GLM/qwen-next/deepseek/nemotron/cogito/mistral-large candidate, logs comparisons, and auto-promotes after 3 rotations of wins.

Playbook versioning SPEC is embedded under the `playbook_versioning` key: every seed gets version + parent_id + retired_at + architecture_snapshot, so when a schema migration breaks a playbook we can pinpoint which change retired it. Implementation is flagged for next sprint (touches gateway + catalogd + mcp-server); not wired here.

- scenario.ts now loads config/models.json at init; env vars still override
- mcp-server exposes /models/matrix read-only so the UI can render it
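The config only declares these budgets; enforcement is up to the callers. A minimal sketch of what a per-day cloud-call guard could look like on the caller side, assuming the parsed tier objects from config/models.json (the helper below is hypothetical, not part of this commit):

// Hypothetical budget guard, not shipped in this commit. Assumes the tier
// objects from config/models.json are already parsed; counters reset daily.
type TierBudget = { cloud_budget_per_day?: number };

const callsToday = new Map<string, number>();

function allowCloudCall(tierName: string, tier: TierBudget): boolean {
  const cap = tier.cloud_budget_per_day ?? Infinity; // uncapped if undeclared
  const used = callsToday.get(tierName) ?? 0;
  if (used >= cap) return false; // caller then drops to local_fallback
  callsToday.set(tierName, used + 1);
  return true;
}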
parent e4ae5b646e
commit 03d723e7e6

config/models.json (new file, 119 additions)

@@ -0,0 +1,119 @@
{
  "_description": "Lakehouse model matrix — authoritative routing for all agent tiers. Local models do the heavy lifting; cloud models are consulted sparingly for overview, strategic, and gatekeeper decisions. Read by tests/multi-agent/scenario.ts and mcp-server/index.ts.",
  "version": 1,
  "updated": "2026-04-21",
  "providers": {
    "ollama_local": {
      "base_url": "http://localhost:11434",
      "key_env": null
    },
    "ollama_cloud": {
      "base_url": "https://ollama.com",
      "key_env": "OLLAMA_CLOUD_KEY",
      "key_source": "/root/llm_team_config.json → providers.ollama_cloud.api_key",
      "rate_budget": {
        "calls_per_hour": 200,
        "calls_per_day": 2000,
        "notes": "Paid tier — generous. Policy: keep overview calls ≤ 3/scenario, strategic ≤ 10/day, gatekeeper ≤ 5/day."
      }
    }
  },
  "tiers": {
    "t1_hot": {
      "purpose": "Per tool call — SQL generation, hybrid_search, sql(). Runs 50-200 times per scenario. Latency-sensitive.",
      "kind": "local_fast",
      "primary": { "model": "mistral:latest", "provider": "ollama_local" },
      "fallback": { "model": "qwen2.5:latest", "provider": "ollama_local" },
      "max_tokens": 800,
      "temperature": 0.3,
      "never_route_cloud": true,
      "rationale": "Mistral produces valid JSON reliably. Qwen2.5 is the consensus reviewer. Known flakiness on 5-fill + misplacement events — do NOT mask by upgrading; route to T3 for post-hoc review instead."
    },
    "t2_review": {
      "purpose": "Per step consensus — executor ↔ reviewer loop critique. 5-14 calls per event.",
      "kind": "local_balanced",
      "primary": { "model": "qwen2.5:latest", "provider": "ollama_local" },
      "fallback": { "model": "qwen3:latest", "provider": "ollama_local" },
      "max_tokens": 600,
      "temperature": 0.3,
      "never_route_cloud": true,
      "rationale": "Reviewer only needs to detect schema violations and drift — a 7B model is sufficient."
    },
    "t3_overview": {
      "purpose": "Mid-day checkpoint after every misplacement + every Nth event, and cross-day lesson. 1-3 calls per scenario.",
      "kind": "thinking_cloud",
      "primary": { "model": "gpt-oss:120b", "provider": "ollama_cloud" },
      "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
      "max_tokens": 900,
      "temperature": 0.2,
      "cloud_budget_per_scenario": 5,
      "env_flag": "LH_OVERVIEW_CLOUD=1",
      "rationale": "Same prompt family as local 20b (gpt-oss series) — prompts port directly. 120b is faster via cloud than 20b local in practice, and lessons are noticeably more specific."
    },
    "t4_strategic": {
      "purpose": "Daily playbook board re-ranking, weekly gap audit, pattern discovery across accumulated playbooks. 1-10 calls per day.",
      "kind": "thinking_cloud_large",
      "primary": { "model": "qwen3.5:397b", "provider": "ollama_cloud" },
      "fallback": { "model": "glm-4.7", "provider": "ollama_cloud" },
      "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
      "max_tokens": 2000,
      "temperature": 0.2,
      "cloud_budget_per_day": 10,
      "rationale": "J named qwen3.5 specifically. GLM-4.7 is a promising alternate for debate phase. Runs after all scenarios complete for the day."
    },
    "t5_gatekeeper": {
      "purpose": "MUST route here: architecture changes, new client onboarding, schema migrations, playbook retirements, index rebuilds, autotune config changes.",
      "kind": "thinking_cloud_deepest",
      "primary": { "model": "kimi-k2-thinking", "provider": "ollama_cloud" },
      "fallback": { "model": "deepseek-v3.1:671b", "provider": "ollama_cloud" },
      "secondary_fallback": { "model": "qwen3.5:397b", "provider": "ollama_cloud" },
      "local_fallback": { "model": "gpt-oss:20b", "provider": "ollama_local" },
      "max_tokens": 4000,
      "temperature": 0.1,
      "cloud_budget_per_day": 5,
      "audit_log": true,
      "rationale": "Highest-stakes decisions — reasoning depth matters more than latency. Audit log so J can always see what the gatekeeper was asked and what it answered. No human approval required today; escalate later if mis-decisions show up."
    }
  },
  "experimental_rotation": {
    "enabled": false,
    "purpose": "Sample newer models on a schedule to collect comparison data without rate-limit risk.",
    "candidates": [
      { "model": "minimax-m2.7", "notes": "Newer minimax; unknown output stability" },
      { "model": "glm-5", "notes": "GLM next-gen; larger context" },
      { "model": "glm-5.1", "notes": "Incremental on GLM-5" },
      { "model": "qwen3-next:80b", "notes": "Qwen's experimental successor; smaller than 3.5" },
      { "model": "qwen3-coder-next", "notes": "Coder-optimized — good for SQL gen T1 experiments" },
      { "model": "deepseek-v3.2", "notes": "Smaller deepseek; reasoning/coding" },
      { "model": "nemotron-3-super", "notes": "NVIDIA 230B; general" },
      { "model": "cogito-2.1:671b", "notes": "671B general" },
      { "model": "mistral-large-3:675b", "notes": "Mistral's flagship; good T3 candidate" }
    ],
    "rotation": "weekly",
    "sample_rate": 0.1,
    "apply_to_tier": "t4_strategic",
    "notes": "When enabled, T4 routes 10% of calls to a rotating experimental model. Log comparison in /data/_model_eval/ — if the experimental consistently beats primary across 3 rotations, promote it to primary."
  },
  "playbook_versioning": {
    "enabled": true,
    "purpose": "A playbook can work, then break when architecture changes. Versioning lets us pin which change retired which playbook.",
    "dataset": "playbook_memory",
    "schema_additions": {
      "version": "integer — auto-increment per operation",
      "parent_id": "string — previous version entry_id for same operation (null for v1)",
      "retired_at": "timestamp — set when success_rate drops or architecture changes",
      "retirement_reason": "string — e.g. 'schema_migration:workers_500k 2026-05-03'",
      "architecture_snapshot": "object — crate versions, index name, schema fingerprint at seed time"
    },
    "retire_triggers": [
      "success_rate < 0.3 over last 20 citations",
      "schema_fingerprint mismatch detected at retrieval time",
      "architecture change event emitted by ingestd/vectord",
      "T5 gatekeeper explicitly retires via /vectors/playbook_memory/retire"
    ],
    "read_back_policy": "retrieval returns only non-retired versions. History endpoint /vectors/playbook_memory/history/{operation} returns the full chain.",
    "ui_surface": "mcp-server to render a diff view: side-by-side of playbook versions with a timeline of what changed and when.",
    "implementation_status": "SPEC — not yet wired. Target: next sprint. Touches gateway + catalogd + mcp-server."
  },
  "matrix_index_hybrid_search_note": "Phase 22 candidate: elevate the hybrid_search T1 tool to consult T3 when a pool returns <3 matches OR when the same (role, city) combo has failed N times in 24h. Consult result is a reformulated sql_filter the executor retries with. Keeps T1 fast on the happy path, escalates to T3 only on low-recall signals."
}
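Nothing in this commit implements the rotation itself; a sketch of how a T4 router could honor `sample_rate` and the weekly rotation under those config semantics (the helpers and the eval-log flow are assumptions, not shipped code):

// Sketch only: rotation routing is not wired in this commit. Candidate
// selection and the promotion check are assumed behaviors from the config
// notes, not real APIs.
interface RotationCfg {
  enabled: boolean;
  sample_rate: number; // 0.1 means 10% of T4 calls get sampled
  candidates: { model: string }[];
}

function pickT4Model(cfg: RotationCfg, primary: string, week: number): string {
  if (!cfg.enabled || cfg.candidates.length === 0) return primary;
  if (Math.random() >= cfg.sample_rate) return primary;
  // Rotate weekly through the candidate list.
  return cfg.candidates[week % cfg.candidates.length].model;
}

// After each sampled call, the comparison { primary, experimental, scores }
// is logged under /data/_model_eval/; per the config notes, a candidate that
// beats primary across 3 rotations gets promoted to primary in models.json.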
mcp-server/index.ts

@@ -833,6 +833,22 @@ async function main() {
    });
  }

  // Model matrix — read config/models.json and expose it read-only.
  // Served as-is, nothing is rewritten, so the endpoint cannot drift from
  // the file; the source of truth is the file itself. UI can render tiers,
  // rate budgets, and the experimental rotation list from this endpoint.
  if (url.pathname === "/models/matrix") {
    try {
      const raw = await Bun.file("../config/models.json").text();
      return ok(JSON.parse(raw));
    } catch (e) {
      return new Response(JSON.stringify({ error: `models.json not found: ${(e as Error).message}` }), {
        status: 404,
        headers: { "content-type": "application/json" },
      });
    }
  }

  // Proof JSON API (same data, no HTML)
  if (url.pathname === "/proof.json") {
    const ds = await api("GET", "/catalog/datasets") as any[];
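For reference, a client-side smoke test of the new endpoint could look like the following; the port and the bare-JSON response body are assumptions about the local mcp-server setup, not part of this commit:

// Hypothetical smoke test: port 3000 and a plain JSON body are assumptions.
const res = await fetch("http://localhost:3000/models/matrix");
if (!res.ok) throw new Error(`matrix unavailable: ${res.status}`);
const matrix = await res.json();
for (const [name, tier] of Object.entries<any>(matrix.tiers)) {
  console.log(`${name}: ${tier.primary.model} via ${tier.primary.provider}`);
}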
tests/multi-agent/scenario.ts

@@ -53,8 +53,29 @@ const DRAFT_MODEL = "qwen2.5:latest"; // artifact generation; short outputs
// every N events, and (A) cross-day lesson after all events complete.
// gpt-oss:20b is a thinking model: it spends tokens in a hidden reasoning
// block before emitting `response`. Budget accordingly — never under 400.
// Model matrix — config/models.json is authoritative. Env vars override.
// Loaded at module init so we can log the tier shape at scenario start.
interface ModelTier {
  primary: { model: string; provider: string };
  local_fallback?: { model: string; provider: string };
  max_tokens?: number;
  temperature?: number;
  env_flag?: string;
}
let MODEL_MATRIX: { tiers: Record<string, ModelTier> } = { tiers: {} };
try {
  MODEL_MATRIX = JSON.parse(await Bun.file("config/models.json").text());
} catch {
  // Config optional — env vars alone work too. Silent: the per-tier
  // logging below will show "default" if matrix is empty.
}

const T3_TIER = MODEL_MATRIX.tiers?.t3_overview;
const OVERVIEW_CLOUD = process.env.LH_OVERVIEW_CLOUD === "1";
const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL
  ?? (OVERVIEW_CLOUD
    ? (T3_TIER?.primary.model ?? "gpt-oss:120b")
    : (T3_TIER?.local_fallback?.model ?? "gpt-oss:20b"));
const T3_CHECKPOINT_EVERY = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3);
const T3_DISABLED = process.env.LH_T3_DISABLE === "1";