2026-04-27 15:55:24 +00:00
1 changed files with 71 additions and 28 deletions
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@ -37,6 +37,11 @@ const MAX_ATTEMPTS = 9;
 // crates/<crate>/src/*.rs.
 const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
 const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
 // Same-model retry budget after a quality reject (thin/unstructured
 // answer, or observer reject/cycle). Each model gets one initial attempt
 // plus this many same-model retries; the next quality reject after that
 // advances to the next fallback model. Provider errors are NOT counted
 // here — they advance immediately.
 const MAX_QUALITY_RETRIES = Number(process.env.LH_SCRUM_MAX_QUALITY_RETRIES ?? 2);
 // Appended jsonl so auditor's kb_query can surface scrum findings for
 // files touched by a PR under review. Part of cohesion plan Phase C.
 const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
@ -94,27 +99,24 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
 // Local fallbacks kept for cloud-down scenarios.
 // Hot-path pipelines (scenario.ts / execution_loop) stay local per
 // Phase 20 t1_hot — this scrum is not hot path.
 // 2026-04-25 J architectural correction: stop cascading models on
 // every failure. ONE model handles the work, with same-model retries
 // using enriched context. Cycle to a different model immediately ONLY
 // on PROVIDER errors (network/auth/5xx) or a per-model rate-limit cap —
 // not on the first quality issue. Quality issues signal that the context
 // needs more enrichment, not a different model; only after
 // MAX_QUALITY_RETRIES same-model retries also fail does the loop move on.
 //
 // Tree-split (treeSplitFile) is the ONE legitimate response to
 // context overflow, and even that does not switch models — it just
 // re-runs the same model against smaller chunks.
 //
 // This ladder is now a SAFETY chain for provider failures, not the
 // strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were
 // removed — they're available as routable tools later (mode router)
 // but not as automatic fallbacks.
 const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
-  // Paid-OpenRouter top of ladder (2026-04-25 J directive). These give
+  { provider: "openrouter",   model: "x-ai/grok-4.1-fast",                   note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" },
-  // us reliable cloud access independent of the Ollama Cloud account
+  { provider: "openrouter",   model: "deepseek/deepseek-v4-flash",           note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" },
-  // throttle that wedged iter 1-9. Kimi K2.6 has a 25/hour hard cap
+  { provider: "openrouter",   model: "qwen/qwen3-235b-a22b-2507",            note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" },
  // enforced by checkRateLimit() — when capped, the ladder skips it.
  { provider: "openrouter",   model: "moonshotai/kimi-k2.6",                 note: "OR paid · Kimi K2.6 · $0.74/$4.66 per M · 256K · 25/hr cap" },
  { provider: "openrouter",   model: "x-ai/grok-4.1-fast",                   note: "OR paid · Grok 4.1 fast · $0.20/$0.50 per M · 2M ctx" },
  { provider: "openrouter",   model: "google/gemini-2.5-flash",              note: "OR paid · Gemini 2.5 flash · $0.30/$2.50 per M · 1M ctx" },
  { provider: "openrouter",   model: "deepseek/deepseek-v4-flash",           note: "OR paid · DeepSeek V4 flash · $0.14/$0.28 per M · 1M ctx" },
  { provider: "openrouter",   model: "qwen/qwen3-235b-a22b-2507",            note: "OR paid · Qwen3 235B · $0.07/$0.10 per M · 262K ctx" },
  // Ollama Cloud — kept as middle rungs. May 429 under load (account
  // throttle); ladder cycles through them quickly.
  { provider: "ollama_cloud", model: "kimi-k2:1t",                           note: "cloud 1T — biggest available, 1.4s probe" },
  { provider: "ollama_cloud", model: "qwen3-coder:480b",                     note: "cloud 480B — coding specialist, 0.9s probe" },
  { provider: "ollama_cloud", model: "deepseek-v3.1:671b",                   note: "cloud 671B — fast reasoning (1.0s probe)" },
  // Free-tier rescue — kept as later fallback. These hallucinate on
  // grounding (10-21% verified 2026-04-25) and now must pass observer
  // hand-review before scrum accepts them.
  { provider: "openrouter",   model: "openai/gpt-oss-120b:free",             note: "OpenRouter free 120B — rescue (low grounding observed)" },
  { provider: "openrouter",   model: "google/gemma-3-27b-it:free",           note: "OpenRouter free 27B — fastest rescue, 1.4s probe" },
  { provider: "ollama",       model: "qwen3.5:latest",                       note: "local qwen3.5 — last-resort if all cloud down" },
  // Dropped from the ladder after 2026-04-24 probe:
  //   - kimi-k2.6 — not available on current tier (empty response)
  //   - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
@ -1122,18 +1124,32 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
  // Collect attempts for the pathway trace sidecar.
  const pathwayAttempts: LadderAttemptRec[] = [];
  // Single-model strategy with same-model retry. modelIdx advances
  // immediately on PROVIDER errors and rate-limit skips. Quality rejects
  // (thin/unstructured output or an observer reject/cycle) keep the
  // same model and retry with enriched context (history feeds back
  // into the `learning` preamble so the model sees what was wrong).
  // Once MAX_QUALITY_RETRIES retries on the current model are used up,
  // the next quality reject advances to the next fallback model in the
  // safety chain.
  let modelIdx = 0;
  let qualityRetriesOnCurrentModel = 0;
  for (let step = 0; step < MAX_ATTEMPTS; step++) {
-    const i = ladderOrder[step];
+    if (modelIdx >= ladderOrder.length) {
      log(`  ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
      break;
    }
    const i = ladderOrder[modelIdx];
    const n = step + 1;
    const rung = LADDER[i];
-    // Per-model rate limit (e.g. Kimi K2.6 capped at 25/hour). When
+    // Per-model rate limit. When capped, advance modelIdx (this model
-    // capped, log + skip the rung. Doesn't increment `n` so subsequent
+    // is unavailable for the rest of the hour) and reset retries.
    // The skip still consumes one of the MAX_ATTEMPTS steps; the loop then
    // continues with the next model in ladderOrder.
    const limit = MODEL_RATE_LIMITS[rung.model];
    if (limit && !(await checkRateLimit(rung.model, limit.perHour))) {
      log(`  attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`);
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` });
      modelIdx++;
      qualityRetriesOnCurrentModel = 0;
      continue;
    }
@ -1141,7 +1157,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
      ? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
      : "";
-    log(`  attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`);
+    const retryTag = qualityRetriesOnCurrentModel > 0
      ? ` [retry ${qualityRetriesOnCurrentModel + 1}/${MAX_QUALITY_RETRIES + 1} same model + enrichment]`
      : "";
    log(`  attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}${retryTag}`);
    const attemptStarted = Date.now();
    if (limit) await recordRateLimitCall(rung.model);
    const r = await chat({
@ -1153,15 +1172,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
    const attemptMs = Date.now() - attemptStarted;
    if (r.error) {
      // PROVIDER error (network, auth, 5xx) → cycle to next fallback
      // model. Reset retry counter for the new model.
      history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
-      log(`    ✗ error: ${r.error.slice(0, 80)}`);
+      log(`    ✗ provider error: ${r.error.slice(0, 80)} — advancing to next fallback model`);
      modelIdx++;
      qualityRetriesOnCurrentModel = 0;
      continue;
    }
    if (!isAcceptable(r.content)) {
      // Thin/unstructured response = quality issue. Retry SAME model
      // with the failure logged to learning so it sees what to fix.
      history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
-      log(`    ✗ thin/unstructured (${r.content.length} chars)`);
+      qualityRetriesOnCurrentModel++;
      if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
        log(`    ✗ thin (${r.content.length} chars) — quality retries exhausted on ${rung.model}, advancing fallback`);
        modelIdx++;
        qualityRetriesOnCurrentModel = 0;
      } else {
        log(`    ✗ thin (${r.content.length} chars) — retrying same model with enrichment hint`);
      }
      continue;
    }
    // Compute grounding stats as DATA — feed to observer for hand-review.
@ -1184,10 +1216,21 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
      attempt: n,
    });
    if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
      // Observer rejected on quality grounds → retry SAME model with
      // the rejection notes feeding into `learning`. This is the
      // architectural correction (J 2026-04-25): quality issues mean
      // the context needs more enrichment, not a different model.
      const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
      history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
      pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
-      log(`    ✗ ${reason} — cycling ladder`);
+      qualityRetriesOnCurrentModel++;
      if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
        log(`    ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);
        modelIdx++;
        qualityRetriesOnCurrentModel = 0;
      } else {
        log(`    ✗ ${reason} — retrying same model with enrichment hint`);
      }
      continue;
    }
    history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });