diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index 362d438..8af8fb2 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -37,6 +37,11 @@ const MAX_ATTEMPTS = 9; // crates//src/*.rs. const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000); const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500); +// Same-model retry budget after a quality reject (thin/unstructured +// answer, or observer reject/cycle). Each model gets this many retries +// after its first attempt; the next quality reject advances to the next +// fallback. Provider errors do not count here (they advance immediately). +const MAX_QUALITY_RETRIES = Number(process.env.LH_SCRUM_MAX_QUALITY_RETRIES ?? 2); // Appended jsonl so auditor's kb_query can surface scrum findings for // files touched by a PR under review. Part of cohesion plan Phase C. const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT @@ -94,27 +99,24 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES // Local fallbacks kept for cloud-down scenarios. // Hot-path pipelines (scenario.ts / execution_loop) stay local per // Phase 20 t1_hot — this scrum is not hot path. +// 2026-04-25 J architectural correction: stop cascading models on +// every failure. ONE model handles the work, with same-model retries +// using enriched context. PROVIDER errors (network/auth/5xx) cycle to the +// next model at once; quality issues cycle only after MAX_QUALITY_RETRIES +// same-model retries — they signal missing context, not a wrong model. +// +// Tree-split (treeSplitFile) is the ONE legitimate response to +// context-overflow, and even that is not a model switch: it re-runs the +// same model against smaller chunks. +// +// This ladder is now a SAFETY chain for provider failures, not the +// strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. 
were +// removed — they're available as routable tools later (mode router) +// but not as automatic fallbacks. const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [ - // Paid-OpenRouter top of ladder (2026-04-25 J directive). These give - // us reliable cloud access independent of the Ollama Cloud account - // throttle that wedged iter 1-9. Kimi K2.6 has a 25/hour hard cap - // enforced by checkRateLimit() — when capped, the ladder skips it. - { provider: "openrouter", model: "moonshotai/kimi-k2.6", note: "OR paid · Kimi K2.6 · $0.74/$4.66 per M · 256K · 25/hr cap" }, - { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "OR paid · Grok 4.1 fast · $0.20/$0.50 per M · 2M ctx" }, - { provider: "openrouter", model: "google/gemini-2.5-flash", note: "OR paid · Gemini 2.5 flash · $0.30/$2.50 per M · 1M ctx" }, - { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "OR paid · DeepSeek V4 flash · $0.14/$0.28 per M · 1M ctx" }, - { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "OR paid · Qwen3 235B · $0.07/$0.10 per M · 262K ctx" }, - // Ollama Cloud — kept as middle rungs. May 429 under load (account - // throttle); ladder cycles through them quickly. - { provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" }, - { provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" }, - { provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" }, - // Free-tier rescue — kept as later fallback. These hallucinate on - // grounding (10-21% verified 2026-04-25) and now must pass observer - // hand-review before scrum accepts them. 
- { provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — rescue (low grounding observed)" }, - { provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" }, - { provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — last-resort if all cloud down" }, + { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" }, + { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" }, + { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" }, // Dropped from the ladder after 2026-04-24 probe: // - kimi-k2.6 — not available on current tier (empty response) // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist) @@ -1122,18 +1124,32 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of // Collect attempts for the pathway trace sidecar. const pathwayAttempts: LadderAttemptRec[] = []; + // Single-model strategy with same-model retry. modelIdx advances + // at once on PROVIDER errors and rate-limit caps. Quality rejects + // (thin answer or observer reject/cycle) keep the same model and + // retry with enriched context (history feeds back into the + // `learning` preamble so the model sees what was wrong); after + // MAX_QUALITY_RETRIES such retries, advance to the next fallback. + let modelIdx = 0; + let qualityRetriesOnCurrentModel = 0; + for (let step = 0; step < MAX_ATTEMPTS; step++) { - const i = ladderOrder[step]; + if (modelIdx >= ladderOrder.length) { + log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`); + break; + } + const i = ladderOrder[modelIdx]; const n = step + 1; const rung = LADDER[i]; - // Per-model rate limit (e.g. Kimi K2.6 capped at 25/hour). 
When - // capped, log + skip the rung. Doesn't increment `n` so subsequent - // logs stay readable; just continues to the next rung in ladderOrder. + // Per-model rate limit. When capped, advance modelIdx (this model + // is unavailable for the rest of the hour) and reset retries. const limit = MODEL_RATE_LIMITS[rung.model]; if (limit && !(await checkRateLimit(rung.model, limit.perHour))) { log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` }); + modelIdx++; + qualityRetriesOnCurrentModel = 0; continue; } @@ -1141,7 +1157,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of ? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══` : ""; - log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`); + const retryTag = qualityRetriesOnCurrentModel > 0 + ? ` [retry ${qualityRetriesOnCurrentModel + 1}/${MAX_QUALITY_RETRIES + 1} same model + enrichment]` + : ""; + log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}${retryTag}`); const attemptStarted = Date.now(); if (limit) await recordRateLimitCall(rung.model); const r = await chat({ @@ -1153,15 +1172,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of const attemptMs = Date.now() - attemptStarted; if (r.error) { + // PROVIDER error (network, auth, 5xx) → cycle to next fallback + // model. Reset retry counter for the new model. 
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` }); - log(` ✗ error: ${r.error.slice(0, 80)}`); + log(` ✗ provider error: ${r.error.slice(0, 80)} — advancing to next fallback model`); + modelIdx++; + qualityRetriesOnCurrentModel = 0; continue; } if (!isAcceptable(r.content)) { + // Thin/unstructured response = quality issue. Retry SAME model + // with the failure logged to learning so it sees what to fix. history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` }); - log(` ✗ thin/unstructured (${r.content.length} chars)`); + qualityRetriesOnCurrentModel++; + if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) { + log(` ✗ thin (${r.content.length} chars) — quality retries exhausted on ${rung.model}, advancing fallback`); + modelIdx++; + qualityRetriesOnCurrentModel = 0; + } else { + log(` ✗ thin (${r.content.length} chars) — retrying same model with enrichment hint`); + } continue; } // Compute grounding stats as DATA — feed to observer for hand-review. @@ -1184,10 +1216,21 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of attempt: n, }); if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") { + // Observer rejected on quality grounds → retry SAME model with + // the rejection notes feeding into `learning`. This is the + // architectural correction (J 2026-04-25): quality issues mean + // the context needs more enrichment, not a different model. const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? 
"?"})`; history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason }); - log(` ✗ ${reason} — cycling ladder`); + qualityRetriesOnCurrentModel++; + if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) { + log(` ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`); + modelIdx++; + qualityRetriesOnCurrentModel = 0; + } else { + log(` ✗ ${reason} — retrying same model with enrichment hint`); + } continue; } history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });