Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11

Merged
profit merged 118 commits from scrum/auto-apply-19814 into main 2026-04-27 15:55:24 +00:00
Showing only changes of commit d187bcd8ac - Show all commits

View File

@ -37,6 +37,11 @@ const MAX_ATTEMPTS = 9;
// crates/<crate>/src/*.rs. // crates/<crate>/src/*.rs.
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000); const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500); const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
// Same-model retry budget for quality rejects (thin/unstructured output
// or an observer reject/cycle verdict). A model gets its first attempt
// plus up to this many retries; the attempt loop advances to the next
// fallback model once the reject count EXCEEDS this value. Provider
// errors and rate-limit caps do not consume this budget — they advance
// immediately.
//
// Parses the LH_SCRUM_MAX_QUALITY_RETRIES override defensively:
//   - unset or blank          → `fallback`
//   - non-numeric (NaN)       → `fallback`; a NaN budget would make every
//                               `count > budget` comparison false, so the
//                               loop would never advance on quality rejects
//   - negative                → 0 (no same-model retries)
//   - fractional              → floored (same effect on an integer counter)
function parseMaxQualityRetries(raw: string | undefined, fallback = 2): number {
  const text = raw?.trim();
  if (!text) return fallback;
  const parsed = Number(text);
  if (Number.isNaN(parsed)) return fallback;
  return Math.max(0, Math.floor(parsed));
}
const MAX_QUALITY_RETRIES = parseMaxQualityRetries(process.env.LH_SCRUM_MAX_QUALITY_RETRIES);
// Appended jsonl so auditor's kb_query can surface scrum findings for // Appended jsonl so auditor's kb_query can surface scrum findings for
// files touched by a PR under review. Part of cohesion plan Phase C. // files touched by a PR under review. Part of cohesion plan Phase C.
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
@ -94,27 +99,24 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
// Local fallbacks kept for cloud-down scenarios. // Local fallbacks kept for cloud-down scenarios.
// Hot-path pipelines (scenario.ts / execution_loop) stay local per // Hot-path pipelines (scenario.ts / execution_loop) stay local per
// Phase 20 t1_hot — this scrum is not hot path. // Phase 20 t1_hot — this scrum is not hot path.
// 2026-04-25 J architectural correction: stop cascading models on
// every failure. ONE model handles the work, with same-model retries
// using enriched context. Cycle to a different model immediately only
// on PROVIDER errors (network/auth/5xx) or a rate-limit cap. Quality
// issues signal that the context needs more enrichment, not a different
// model, so they retry the same model first and advance only once the
// same-model quality-retry budget is exhausted.
//
// Tree-split (treeSplitFile) is the ONE legitimate response to
// context overflow, and even that does not switch models — it just
// re-runs the same model against smaller chunks.
//
// This ladder is now a SAFETY chain for provider failures, not the
// strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were
// removed — they're available as routable tools later (mode router)
// but not as automatic fallbacks.
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [ const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
// Paid-OpenRouter top of ladder (2026-04-25 J directive). These give { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" },
// us reliable cloud access independent of the Ollama Cloud account { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" },
// throttle that wedged iter 1-9. Kimi K2.6 has a 25/hour hard cap { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" },
// enforced by checkRateLimit() — when capped, the ladder skips it.
{ provider: "openrouter", model: "moonshotai/kimi-k2.6", note: "OR paid · Kimi K2.6 · $0.74/$4.66 per M · 256K · 25/hr cap" },
{ provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "OR paid · Grok 4.1 fast · $0.20/$0.50 per M · 2M ctx" },
{ provider: "openrouter", model: "google/gemini-2.5-flash", note: "OR paid · Gemini 2.5 flash · $0.30/$2.50 per M · 1M ctx" },
{ provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "OR paid · DeepSeek V4 flash · $0.14/$0.28 per M · 1M ctx" },
{ provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "OR paid · Qwen3 235B · $0.07/$0.10 per M · 262K ctx" },
// Ollama Cloud — kept as middle rungs. May 429 under load (account
// throttle); ladder cycles through them quickly.
{ provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" },
{ provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" },
{ provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" },
// Free-tier rescue — kept as later fallback. These hallucinate on
// grounding (10-21% verified 2026-04-25) and now must pass observer
// hand-review before scrum accepts them.
{ provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — rescue (low grounding observed)" },
{ provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" },
{ provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — last-resort if all cloud down" },
// Dropped from the ladder after 2026-04-24 probe: // Dropped from the ladder after 2026-04-24 probe:
// - kimi-k2.6 — not available on current tier (empty response) // - kimi-k2.6 — not available on current tier (empty response)
// - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist) // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
@ -1122,18 +1124,32 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
// Collect attempts for the pathway trace sidecar. // Collect attempts for the pathway trace sidecar.
const pathwayAttempts: LadderAttemptRec[] = []; const pathwayAttempts: LadderAttemptRec[] = [];
// Single-model strategy with same-model retry. modelIdx advances
// immediately on PROVIDER errors and on a rate-limit cap. Quality
// rejects (thin output or an observer reject/cycle verdict) keep the
// same model and retry with enriched context (history feeds back
// into the `learning` preamble so the model sees what was wrong).
// Once more than MAX_QUALITY_RETRIES quality rejects accumulate on
// the current model, advance to the next fallback model in the
// safety chain.
let modelIdx = 0;
let qualityRetriesOnCurrentModel = 0;
for (let step = 0; step < MAX_ATTEMPTS; step++) { for (let step = 0; step < MAX_ATTEMPTS; step++) {
const i = ladderOrder[step]; if (modelIdx >= ladderOrder.length) {
log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
break;
}
const i = ladderOrder[modelIdx];
const n = step + 1; const n = step + 1;
const rung = LADDER[i]; const rung = LADDER[i];
// Per-model rate limit (e.g. Kimi K2.6 capped at 25/hour). When // Per-model rate limit. When capped, advance modelIdx (this model
// capped, log + skip the rung. Doesn't increment `n` so subsequent // is unavailable for the rest of the hour) and reset retries.
// logs stay readable; just continues to the next rung in ladderOrder.
const limit = MODEL_RATE_LIMITS[rung.model]; const limit = MODEL_RATE_LIMITS[rung.model];
if (limit && !(await checkRateLimit(rung.model, limit.perHour))) { if (limit && !(await checkRateLimit(rung.model, limit.perHour))) {
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`); log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`);
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` });
modelIdx++;
qualityRetriesOnCurrentModel = 0;
continue; continue;
} }
@ -1141,7 +1157,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status}${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══` ? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status}${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
: ""; : "";
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`); const retryTag = qualityRetriesOnCurrentModel > 0
? ` [retry ${qualityRetriesOnCurrentModel + 1}/${MAX_QUALITY_RETRIES + 1} same model + enrichment]`
: "";
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}${retryTag}`);
const attemptStarted = Date.now(); const attemptStarted = Date.now();
if (limit) await recordRateLimitCall(rung.model); if (limit) await recordRateLimitCall(rung.model);
const r = await chat({ const r = await chat({
@ -1153,15 +1172,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
const attemptMs = Date.now() - attemptStarted; const attemptMs = Date.now() - attemptStarted;
if (r.error) { if (r.error) {
// PROVIDER error (network, auth, 5xx) → cycle to next fallback
// model. Reset retry counter for the new model.
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) }); history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
log(` ✗ error: ${r.error.slice(0, 80)}`); log(` ✗ provider error: ${r.error.slice(0, 80)} — advancing to next fallback model`);
modelIdx++;
qualityRetriesOnCurrentModel = 0;
continue; continue;
} }
if (!isAcceptable(r.content)) { if (!isAcceptable(r.content)) {
// Thin/unstructured response = quality issue. Retry SAME model
// with the failure logged to learning so it sees what to fix.
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` }); history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
log(` ✗ thin/unstructured (${r.content.length} chars)`); qualityRetriesOnCurrentModel++;
if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
log(` ✗ thin (${r.content.length} chars) — quality retries exhausted on ${rung.model}, advancing fallback`);
modelIdx++;
qualityRetriesOnCurrentModel = 0;
} else {
log(` ✗ thin (${r.content.length} chars) — retrying same model with enrichment hint`);
}
continue; continue;
} }
// Compute grounding stats as DATA — feed to observer for hand-review. // Compute grounding stats as DATA — feed to observer for hand-review.
@ -1184,10 +1216,21 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
attempt: n, attempt: n,
}); });
if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") { if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
// Observer rejected on quality grounds → retry SAME model with
// the rejection notes feeding into `learning`. This is the
// architectural correction (J 2026-04-25): quality issues mean
// the context needs more enrichment, not a different model.
const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`; const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason }); history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason }); pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
log(`${reason} — cycling ladder`); qualityRetriesOnCurrentModel++;
if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
log(`${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);
modelIdx++;
qualityRetriesOnCurrentModel = 0;
} else {
log(`${reason} — retrying same model with enrichment hint`);
}
continue; continue;
} }
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length }); history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });