Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11
@ -37,6 +37,11 @@ const MAX_ATTEMPTS = 9;
|
|||||||
// crates/<crate>/src/*.rs.
|
// crates/<crate>/src/*.rs.
|
||||||
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
|
const FILE_TREE_SPLIT_THRESHOLD = Number(process.env.LH_SCRUM_TREE_SPLIT_THRESHOLD ?? 6000);
|
||||||
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
|
const FILE_SHARD_SIZE = Number(process.env.LH_SCRUM_SHARD_SIZE ?? 3500);
|
||||||
|
// Same-model retry budget after observer rejection. After this many
|
||||||
|
// same-model retries (i.e. budget + 1 rejected attempts), advance to the next provider-
|
||||||
|
// error fallback. Counts ONLY observer/quality rejects, not provider
|
||||||
|
// errors (which advance immediately).
|
||||||
|
const MAX_QUALITY_RETRIES = Number(process.env.LH_SCRUM_MAX_QUALITY_RETRIES ?? 2);
|
||||||
// Appended jsonl so auditor's kb_query can surface scrum findings for
|
// Appended jsonl so auditor's kb_query can surface scrum findings for
|
||||||
// files touched by a PR under review. Part of cohesion plan Phase C.
|
// files touched by a PR under review. Part of cohesion plan Phase C.
|
||||||
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
|
const SCRUM_REVIEWS_JSONL = process.env.LH_SCRUM_REVIEWS_OUT
|
||||||
@ -94,27 +99,24 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
|
|||||||
// Local fallbacks kept for cloud-down scenarios.
|
// Local fallbacks kept for cloud-down scenarios.
|
||||||
// Hot-path pipelines (scenario.ts / execution_loop) stay local per
|
// Hot-path pipelines (scenario.ts / execution_loop) stay local per
|
||||||
// Phase 20 t1_hot — this scrum is not hot path.
|
// Phase 20 t1_hot — this scrum is not hot path.
|
||||||
|
// 2026-04-25 J architectural correction: stop cascading models on
|
||||||
|
// every failure. ONE model handles the work, with same-model retries
|
||||||
|
// using enriched context. Cycle to a different model ONLY on PROVIDER
|
||||||
|
// errors (network/auth/5xx) — not on quality issues. Quality issues
|
||||||
|
// signal that the context needs more enrichment, not a different model.
|
||||||
|
//
|
||||||
|
// Tree-split (treeSplitFile) is the ONE legitimate change of approach
|
||||||
|
// for context-overflow, and even that just re-runs the same model
|
||||||
|
// against smaller chunks.
|
||||||
|
//
|
||||||
|
// This ladder is now a SAFETY chain for provider failures, not the
|
||||||
|
// strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were
|
||||||
|
// removed — they're available as routable tools later (mode router)
|
||||||
|
// but not as automatic fallbacks.
|
||||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
|
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
|
||||||
// Paid-OpenRouter top of ladder (2026-04-25 J directive). These give
|
{ provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" },
|
||||||
// us reliable cloud access independent of the Ollama Cloud account
|
{ provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" },
|
||||||
// throttle that wedged iter 1-9. Kimi K2.6 has a 25/hour hard cap
|
{ provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" },
|
||||||
// enforced by checkRateLimit() — when capped, the ladder skips it.
|
|
||||||
{ provider: "openrouter", model: "moonshotai/kimi-k2.6", note: "OR paid · Kimi K2.6 · $0.74/$4.66 per M · 256K · 25/hr cap" },
|
|
||||||
{ provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "OR paid · Grok 4.1 fast · $0.20/$0.50 per M · 2M ctx" },
|
|
||||||
{ provider: "openrouter", model: "google/gemini-2.5-flash", note: "OR paid · Gemini 2.5 flash · $0.30/$2.50 per M · 1M ctx" },
|
|
||||||
{ provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "OR paid · DeepSeek V4 flash · $0.14/$0.28 per M · 1M ctx" },
|
|
||||||
{ provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "OR paid · Qwen3 235B · $0.07/$0.10 per M · 262K ctx" },
|
|
||||||
// Ollama Cloud — kept as middle rungs. May 429 under load (account
|
|
||||||
// throttle); ladder cycles through them quickly.
|
|
||||||
{ provider: "ollama_cloud", model: "kimi-k2:1t", note: "cloud 1T — biggest available, 1.4s probe" },
|
|
||||||
{ provider: "ollama_cloud", model: "qwen3-coder:480b", note: "cloud 480B — coding specialist, 0.9s probe" },
|
|
||||||
{ provider: "ollama_cloud", model: "deepseek-v3.1:671b", note: "cloud 671B — fast reasoning (1.0s probe)" },
|
|
||||||
// Free-tier rescue — kept as later fallback. These hallucinate on
|
|
||||||
// grounding (10-21% verified 2026-04-25) and now must pass observer
|
|
||||||
// hand-review before scrum accepts them.
|
|
||||||
{ provider: "openrouter", model: "openai/gpt-oss-120b:free", note: "OpenRouter free 120B — rescue (low grounding observed)" },
|
|
||||||
{ provider: "openrouter", model: "google/gemma-3-27b-it:free", note: "OpenRouter free 27B — fastest rescue, 1.4s probe" },
|
|
||||||
{ provider: "ollama", model: "qwen3.5:latest", note: "local qwen3.5 — last-resort if all cloud down" },
|
|
||||||
// Dropped from the ladder after 2026-04-24 probe:
|
// Dropped from the ladder after 2026-04-24 probe:
|
||||||
// - kimi-k2.6 — not available on current tier (empty response)
|
// - kimi-k2.6 — not available on current tier (empty response)
|
||||||
// - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
|
// - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
|
||||||
@ -1122,18 +1124,32 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
// Collect attempts for the pathway trace sidecar.
|
// Collect attempts for the pathway trace sidecar.
|
||||||
const pathwayAttempts: LadderAttemptRec[] = [];
|
const pathwayAttempts: LadderAttemptRec[] = [];
|
||||||
|
|
||||||
|
// Single-model strategy with same-model retry. modelIdx advances
|
||||||
|
// on PROVIDER errors, rate-limit skips, or exhausted quality retries. Quality rejects keep the
|
||||||
|
// same model and retry with enriched context (history feeds back
|
||||||
|
// into the `learning` preamble so the model sees what was wrong).
|
||||||
|
// After MAX_QUALITY_RETRIES on the current model, advance to the
|
||||||
|
// next fallback model in the safety chain.
|
||||||
|
let modelIdx = 0;
|
||||||
|
let qualityRetriesOnCurrentModel = 0;
|
||||||
|
|
||||||
for (let step = 0; step < MAX_ATTEMPTS; step++) {
|
for (let step = 0; step < MAX_ATTEMPTS; step++) {
|
||||||
const i = ladderOrder[step];
|
if (modelIdx >= ladderOrder.length) {
|
||||||
|
log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
const i = ladderOrder[modelIdx];
|
||||||
const n = step + 1;
|
const n = step + 1;
|
||||||
const rung = LADDER[i];
|
const rung = LADDER[i];
|
||||||
|
|
||||||
// Per-model rate limit (e.g. Kimi K2.6 capped at 25/hour). When
|
// Per-model rate limit. When capped, advance modelIdx (this model
|
||||||
// capped, log + skip the rung. Doesn't increment `n` so subsequent
|
// is unavailable for the rest of the hour) and reset retries.
|
||||||
// logs stay readable; just continues to the next rung in ladderOrder.
|
|
||||||
const limit = MODEL_RATE_LIMITS[rung.model];
|
const limit = MODEL_RATE_LIMITS[rung.model];
|
||||||
if (limit && !(await checkRateLimit(rung.model, limit.perHour))) {
|
if (limit && !(await checkRateLimit(rung.model, limit.perHour))) {
|
||||||
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`);
|
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model} — SKIP (rate-limited: cap ${limit.perHour}/hr reached)`);
|
||||||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` });
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: 0, accepted: false, reject_reason: `rate-limited (cap ${limit.perHour}/hr)` });
|
||||||
|
modelIdx++;
|
||||||
|
qualityRetriesOnCurrentModel = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1141,7 +1157,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
|
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
|
||||||
: "";
|
: "";
|
||||||
|
|
||||||
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}`);
|
const retryTag = qualityRetriesOnCurrentModel > 0
|
||||||
|
? ` [retry ${qualityRetriesOnCurrentModel + 1}/${MAX_QUALITY_RETRIES + 1} same model + enrichment]`
|
||||||
|
: "";
|
||||||
|
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}${pathwayPreamble ? " [w/ pathway memory]" : ""}${retryTag}`);
|
||||||
const attemptStarted = Date.now();
|
const attemptStarted = Date.now();
|
||||||
if (limit) await recordRateLimitCall(rung.model);
|
if (limit) await recordRateLimitCall(rung.model);
|
||||||
const r = await chat({
|
const r = await chat({
|
||||||
@ -1153,15 +1172,28 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
const attemptMs = Date.now() - attemptStarted;
|
const attemptMs = Date.now() - attemptStarted;
|
||||||
|
|
||||||
if (r.error) {
|
if (r.error) {
|
||||||
|
// PROVIDER error (network, auth, 5xx) → cycle to next fallback
|
||||||
|
// model. Reset retry counter for the new model.
|
||||||
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
|
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
|
||||||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `error: ${r.error.slice(0, 100)}` });
|
||||||
log(` ✗ error: ${r.error.slice(0, 80)}`);
|
log(` ✗ provider error: ${r.error.slice(0, 80)} — advancing to next fallback model`);
|
||||||
|
modelIdx++;
|
||||||
|
qualityRetriesOnCurrentModel = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!isAcceptable(r.content)) {
|
if (!isAcceptable(r.content)) {
|
||||||
|
// Thin/unstructured response = quality issue. Retry SAME model
|
||||||
|
// with the failure logged to learning so it sees what to fix.
|
||||||
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
|
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
|
||||||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: `thin (${r.content.length} chars)` });
|
||||||
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
qualityRetriesOnCurrentModel++;
|
||||||
|
if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
|
||||||
|
log(` ✗ thin (${r.content.length} chars) — quality retries exhausted on ${rung.model}, advancing fallback`);
|
||||||
|
modelIdx++;
|
||||||
|
qualityRetriesOnCurrentModel = 0;
|
||||||
|
} else {
|
||||||
|
log(` ✗ thin (${r.content.length} chars) — retrying same model with enrichment hint`);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Compute grounding stats as DATA — feed to observer for hand-review.
|
// Compute grounding stats as DATA — feed to observer for hand-review.
|
||||||
@ -1184,10 +1216,21 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
|||||||
attempt: n,
|
attempt: n,
|
||||||
});
|
});
|
||||||
if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
|
if (obsVerdict.verdict === "reject" || obsVerdict.verdict === "cycle") {
|
||||||
|
// Observer rejected on quality grounds → retry SAME model with
|
||||||
|
// the rejection notes feeding into `learning`. This is the
|
||||||
|
// architectural correction (J 2026-04-25): quality issues mean
|
||||||
|
// the context needs more enrichment, not a different model.
|
||||||
const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
|
const reason = `observer ${obsVerdict.verdict}: ${obsVerdict.notes ?? "no notes"} (conf=${obsVerdict.confidence ?? "?"})`;
|
||||||
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
|
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: reason });
|
||||||
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
|
pathwayAttempts.push({ rung: i + 1, model: rung.model, latency_ms: attemptMs, accepted: false, reject_reason: reason });
|
||||||
log(` ✗ ${reason} — cycling ladder`);
|
qualityRetriesOnCurrentModel++;
|
||||||
|
if (qualityRetriesOnCurrentModel > MAX_QUALITY_RETRIES) {
|
||||||
|
log(` ✗ ${reason} — quality retries exhausted on ${rung.model}, advancing fallback`);
|
||||||
|
modelIdx++;
|
||||||
|
qualityRetriesOnCurrentModel = 0;
|
||||||
|
} else {
|
||||||
|
log(` ✗ ${reason} — retrying same model with enrichment hint`);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user