From 4ac56564c00e95cc5c3821d9cf5acda44e0d1481 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 25 Apr 2026 17:49:02 -0500 Subject: [PATCH] scrum + applier + observer: switch to paid OpenRouter ladder, add Kimi K2.6 + Gemini 2.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ollama Cloud was throttled across all 6 cloud rungs in iters 1-9, which forced the loop into 0-review iterations even though the architecture was sound. Swapping to paid OpenRouter unblocks the test path. Ladder changes (top-of-ladder paid models, all under $0.85/M either side): - moonshotai/kimi-k2.6 ($0.74/$4.66, 256K) — capped at 25/hr - x-ai/grok-4.1-fast ($0.20/$0.50, 2M) — primary general - google/gemini-2.5-flash ($0.30/$2.50, 1M) — Google reasoning - deepseek/deepseek-v4-flash ($0.14/$0.28, 1M) — cheap workhorse - qwen/qwen3-235b-a22b-2507 ($0.07/$0.10, 262K) — cheapest big Existing rungs (Ollama Cloud + free OR + local qwen3.5) kept as fallback. Per-model rate limiter (MODEL_RATE_LIMITS in scrum_master_pipeline.ts): - Persists call timestamps to data/_kb/rate_limit_calls.jsonl so caps survive process restarts (autonomous loop spawns a fresh subprocess per iteration; without persistence each iter would reset) - O(1) writes, prune-on-read for the rolling 1h window - Capped models log "SKIP (rate-limited: cap N/hr reached)" and the ladder cycles to the next rung - J directive 2026-04-25: 25/hr on Kimi K2.6 to bound output cost Observer hand-review cloud tier swapped from ollama_cloud/qwen3-coder:480b to openrouter/x-ai/grok-4.1-fast — proven to emit precise semantic verdicts (named "AccessControl::can_access() doesn't exist" specifically in 2026-04-25 tests instead of the heuristic fallback). Applier patch emitter swapped from ollama_cloud/qwen3-coder:480b to openrouter/x-ai/grok-4.1-fast (default; LH_APPLIER_MODEL + LH_APPLIER_PROVIDER override). 
This was the third LLM call we missed — without it, observer accepts a review but applier never produces patches because its emitter was still hitting the throttled account. Co-Authored-By: Claude Opus 4.7 (1M context) --- mcp-server/observer.ts | 9 ++- tests/real-world/scrum_applier.ts | 10 ++- tests/real-world/scrum_master_pipeline.ts | 81 ++++++++++++++++++++--- 3 files changed, 86 insertions(+), 14 deletions(-) diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index 17e23d8..1150af1 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -325,12 +325,17 @@ Respond ONLY with a JSON object: - reject: review invents APIs, fabricates calls, contradicts source. Do NOT record. - cycle: review is mediocre — partially grounded but wrong shape, try a stronger model.`; + // Hand-review uses paid OpenRouter so it sidesteps the Ollama Cloud + // throttle that drove every prior iter into the heuristic fallback. + // Grok 4.1 fast: $0.20 in / $0.50 out per M tokens, 2M ctx. A typical + // hand-review (~6K input + 300 output) costs ~$0.0014. Selected via + // J directive 2026-04-25 ("best model under $0.72/M"). const resp = await fetch(`${LAKEHOUSE}/v1/chat`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - provider: "ollama_cloud", - model: "qwen3-coder:480b", + provider: "openrouter", + model: "x-ai/grok-4.1-fast", messages: [{ role: "user", content: prompt }], max_tokens: 300, temperature: 0.0, diff --git a/tests/real-world/scrum_applier.ts b/tests/real-world/scrum_applier.ts index 7708a92..f3bad91 100644 --- a/tests/real-world/scrum_applier.ts +++ b/tests/real-world/scrum_applier.ts @@ -48,7 +48,13 @@ const TARGET_FILES = (process.env.LH_APPLIER_FILES ?? "") // for targeted code changes and tends to stay within the mechanical-patch // constraint the prompt asks for. LLM Team's /api/run?mode=patch would be // the ideal choice but that mode isn't registered in llm_team_ui.py yet. 
-const MODEL = process.env.LH_APPLIER_MODEL ?? "qwen3-coder:480b"; +// Default patch emitter swapped to OpenRouter Grok 4.1 fast (2026-04-25) +// after observing the prior default (ollama_cloud::qwen3-coder:480b) sit +// at 429 throttle and never produce patches. Grok 4.1 fast: $0.20/$0.50 +// per M, 2M ctx, proven to emit precise structured patches in observer +// hand-review tests. Override with LH_APPLIER_MODEL + LH_APPLIER_PROVIDER. +const MODEL = process.env.LH_APPLIER_MODEL ?? "x-ai/grok-4.1-fast"; +const PROVIDER = (process.env.LH_APPLIER_PROVIDER ?? "openrouter") as "ollama_cloud" | "openrouter" | "ollama"; const BRANCH = process.env.LH_APPLIER_BRANCH ?? `scrum/auto-apply-${Date.now().toString(36)}`; // Deny-list — anything whose path starts with one of these is skipped @@ -206,7 +212,7 @@ ${source.slice(0, 14000)} Emit ONLY the JSON object.`; - const r = await chat({ provider: "ollama_cloud", model: MODEL, prompt, max_tokens: 2500 }); + const r = await chat({ provider: PROVIDER, model: MODEL, prompt, max_tokens: 2500 }); if (r.error || !r.content) return []; // Strip markdown fences if model wrapped the JSON. diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index aabddcd..362d438 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -95,20 +95,26 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES // Hot-path pipelines (scenario.ts / execution_loop) stay local per // Phase 20 t1_hot — this scrum is not hot path. const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [ + // Paid-OpenRouter top of ladder (2026-04-25 J directive). These give + // us reliable cloud access independent of the Ollama Cloud account + // throttle that wedged iter 1-9. Kimi K2.6 has a 25/hour hard cap + // enforced by checkRateLimit() — when capped, the ladder skips it. 
+ { provider: "openrouter", model: "moonshotai/kimi-k2.6", note: "OR paid · Kimi K2.6 · $0.74/$4.66 per M · 256K · 25/hr cap" }, + { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "OR paid · Grok 4.1 fast · $0.20/$0.50 per M · 2M ctx" }, + { provider: "openrouter", model: "google/gemini-2.5-flash", note: "OR paid · Gemini 2.5 flash · $0.30/$2.50 per M · 1M ctx" }, + { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "OR paid · DeepSeek V4 flash · $0.14/$0.28 per M · 1M ctx" }, + { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "OR paid · Qwen3 235B · $0.07/$0.10 per M · 262K ctx" }, + // Ollama Cloud — kept as middle rungs. May 429 under load (account + // throttle); ladder cycles through them quickly. { provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" }, { provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" }, { provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" }, - { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B — deep analysis (0.9s probe)" }, - { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B — reliable workhorse (iter1 baseline)" }, - { provider: "ollama_cloud", model: "qwen3.5:397b", note: "cloud 397B dense — deep final thinker (J 2026-04-24)" }, - // Free-tier rescue — different provider backbone, different quota. - // Added 2026-04-24 after iter 5 hit repeated Ollama Cloud 502s on - // kimi-k2:1t. These have lower parameter counts than the Ollama - // Cloud rungs but high availability: if upstream is down, we still - // land a review instead of giving up. - { provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — substantive rescue, 2.8s probe" }, + // Free-tier rescue — kept as later fallback. 
These hallucinate on + // grounding (10-21% verified 2026-04-25) and now must pass observer + // hand-review before scrum accepts them. + { provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — rescue (low grounding observed)" }, { provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" }, - { provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — best local model per J (2026-04-24), last-resort if all cloud down" }, + { provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — last-resort if all cloud down" }, // Dropped from the ladder after 2026-04-24 probe: // - kimi-k2.6 — not available on current tier (empty response) // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist) @@ -288,6 +294,49 @@ async function writePathwayTrace(trace: PathwayTracePayload): Promise { } } +// Per-model rate limiter. Persists timestamps to a JSONL file so +// caps survive process restarts (autonomous loop spawns a new +// scrum_master subprocess per iteration; without persistence each +// iter would reset to 0). File is append-only; pruning happens at +// read time to keep writes O(1). +// +// Config: model → { perHour }. Add an entry here to cap a model. +// J directive 2026-04-25: Kimi K2.6 capped at 25/hour because the +// $4.66/M output cost would compound fast otherwise. 
+const MODEL_RATE_LIMITS: Record<string, { perHour: number }> = { + "moonshotai/kimi-k2.6": { perHour: 25 }, +}; + +const RATE_LIMIT_LOG = "/home/profit/lakehouse/data/_kb/rate_limit_calls.jsonl"; + +async function readRateLimitTimestamps(model: string, windowMs: number): Promise<number[]> { + const f = Bun.file(RATE_LIMIT_LOG); + if (!(await f.exists())) return []; + const text = await f.text(); + const cutoff = Date.now() - windowMs; + const ts: number[] = []; + for (const line of text.split("\n")) { + if (!line.trim()) continue; + try { + const r = JSON.parse(line); + if (r.model === model && typeof r.ts === "number" && r.ts >= cutoff) { + ts.push(r.ts); + } + } catch { /* skip malformed */ } + } + return ts; +} + +async function checkRateLimit(model: string, perHour: number): Promise<boolean> { + const ts = await readRateLimitTimestamps(model, 60 * 60 * 1000); + return ts.length < perHour; +} + +async function recordRateLimitCall(model: string): Promise<void> { + const { appendFile } = await import("node:fs/promises"); + await appendFile(RATE_LIMIT_LOG, JSON.stringify({ model, ts: Date.now() }) + "\n"); +} + async function recordPathwayReplay(pathwayId: string, succeeded: boolean): Promise<void> { try { await fetch(`${GATEWAY}/vectors/pathway/record_replay`, { @@ -1077,12 +1126,24 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
+ const limit = MODEL_RATE_LIMITS[rung.model]; + if (limit && !(await checkRateLimit(rung.model, limit.perHour))) { + log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`); + pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` }); + continue; + } + const learning = history.length > 0 ? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══` : ""; log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`); const attemptStarted = Date.now(); + if (limit) await recordRateLimitCall(rung.model); const r = await chat({ provider: rung.provider, model: rung.model,