diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index cfbb0a4..b1d0fa2 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -159,50 +159,63 @@ const LLM_TEAM_ESCALATIONS = "/home/profit/lakehouse/data/_kb/observer_escalatio const ESCALATION_THRESHOLD = 3; // N+ failures on same sig_hash triggers async function escalateFailureClusterToLLMTeam(sigHash: string, cluster: ObservedOp[]) { - // Package the failure cluster as a single context blob for code_review mode. + // Package the failure cluster as a single context blob. Originally + // I routed this to LLM Team's `code_review` mode at /api/run, but + // that mode isn't registered in llm_team_ui.py — it returned + // "Unknown mode" on every call. Revised 2026-04-24: route directly + // to the gateway's /v1/chat with provider=ollama_cloud + qwen3-coder:480b + // (the coding specialist that's rung 2 of the scrum ladder, proven + // to produce substantive structured reviews). Fire-and-forget so + // downstream failures don't block observer's normal loop. const context = cluster.slice(-8).map((o, i) => `[${i + 1}] endpoint=${o.endpoint} input=${o.input_summary} error=${o.error ?? "?"}` ).join("\n"); + const prompt = `sig_hash=${sigHash} · ${cluster.length} failures on the same signature:\n\n${context}\n\nReview this failure cluster. Identify:\n1. Likely root cause (single sentence).\n2. Files most likely responsible (path hints).\n3. Concrete fix direction (under 3 sentences).\n4. Confidence: NN%\n\nBe specific, not generic.`; try { - const resp = await fetch(`${LLM_TEAM}/api/run?mode=code_review`, { + const resp = await fetch(`${LAKEHOUSE}/v1/chat`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - input: `sig_hash=${sigHash} · ${cluster.length} failures on same signature:\n\n${context}\n\nReview this failure pattern. What is the root cause? What code change would prevent it? 
Respond with structured facts + specific file hints.`, + provider: "ollama_cloud", + model: "qwen3-coder:480b", + messages: [{ role: "user", content: prompt }], + max_tokens: 800, + temperature: 0.2, }), signal: AbortSignal.timeout(60000), }); if (!resp.ok) { - console.error(`[observer] LLM Team code_review ${resp.status}: ${(await resp.text()).slice(0, 200)}`); + console.error(`[observer] escalation /v1/chat ${resp.status}: ${(await resp.text()).slice(0, 200)}`); return; } const j: any = await resp.json(); + const analysis = j?.choices?.[0]?.message?.content ?? ""; - // Write an audit row. Fields are deliberately permissive — LLM - // Team's response shape can evolve without breaking this write. + // Audit row stays schema-compatible with the prior implementation — + // downstream consumers see structured fields regardless of the + // review-source change. Facts/entities stay empty (this call is + // direct-model, not extract-mode); the raw analysis carries the + // signal. const row = { ts: new Date().toISOString(), source: "observer_escalation", - mode: "code_review", + mode: "direct_chat_qwen3_coder_480b", sig_hash: sigHash, cluster_size: cluster.length, cluster_staffer: cluster[0]?.staffer_id, cluster_endpoint: cluster[0]?.endpoint, - llm_team_run_id: j.run_id ?? j.llm_team_run_id ?? null, - facts: j.facts ?? [], - entities: j.entities ?? [], - relationships: j.relationships ?? [], - raw_response: typeof j.response === "string" ? j.response.slice(0, 2000) : null, - recommended_files: j.file_hints ?? j.files ?? [], + prompt_tokens: j?.usage?.prompt_tokens ?? 0, + completion_tokens: j?.usage?.completion_tokens ?? 
0, + analysis: analysis.slice(0, 4000), }; const { appendFile } = await import("node:fs/promises"); await appendFile(LLM_TEAM_ESCALATIONS, JSON.stringify(row) + "\n"); console.error( - `[observer] escalated sig_hash=${sigHash.slice(0, 8)} · cluster=${cluster.length} · facts=${row.facts.length} entities=${row.entities.length}` + `[observer] escalated sig_hash=${sigHash.slice(0, 8)} · cluster=${cluster.length} · ${analysis.length} chars` ); } catch (e) { - console.error(`[observer] LLM Team escalation failed: ${(e as Error).message}`); + console.error(`[observer] escalation failed: ${(e as Error).message}`); } } diff --git a/tests/real-world/scrum_applier.ts b/tests/real-world/scrum_applier.ts index d590689..9df3c6d 100644 --- a/tests/real-world/scrum_applier.ts +++ b/tests/real-world/scrum_applier.ts @@ -35,7 +35,14 @@ const AUDIT_LOG = `${REPO}/data/_kb/auto_apply.jsonl`; const MIN_CONF = Number(process.env.LH_APPLIER_MIN_CONF ?? 90); const MAX_FILES = Number(process.env.LH_APPLIER_MAX_FILES ?? 5); const COMMIT = process.env.LH_APPLIER_COMMIT === "1"; -const MODEL = process.env.LH_APPLIER_MODEL ?? "kimi-k2:1t"; +// Default patch-emitter model — qwen3-coder:480b is the coding specialist +// in the scrum ladder (rung 2). Swapped in from kimi-k2:1t after 2026-04-24 +// data showed kimi-k2:1t produces architectural patches that cascade across +// the file (rename a field → 20 broken call sites). qwen3-coder is tuned +// for targeted code changes and tends to stay within the mechanical-patch +// constraint the prompt asks for. LLM Team's /api/run?mode=patch would be +// the ideal choice but that mode isn't registered in llm_team_ui.py yet. +const MODEL = process.env.LH_APPLIER_MODEL ?? "qwen3-coder:480b"; const BRANCH = process.env.LH_APPLIER_BRANCH ?? `scrum/auto-apply-${Date.now().toString(36)}`; // Deny-list — anything whose path starts with one of these is skipped @@ -161,14 +168,25 @@ ${review} The review is high-confidence and the file is eligible for auto-apply. 
Produce CONCRETE PATCHES as JSON so they can be applied via string replacement. -RULES: +HARD CONSTRAINTS (violations → patch rejected): 1. Output ONE JSON object with a "patches" array. NO prose, no markdown fences. 2. Each patch is {"old_string": "...", "new_string": "...", "rationale": "short", "confidence": 0-100}. - 3. "old_string" MUST appear EXACTLY ONCE in the file (verbatim, including whitespace). If no unique anchor exists, SKIP that suggestion. - 4. Mechanical changes only: wire a function call, add a field, remove #[allow(dead_code)], add a missing use import, rename one call-site. NO architectural rewrites. NO new modules. - 5. Each "new_string" MUST compile in isolation with the same surrounding code. Don't introduce new dependencies. - 6. If you cannot produce at least one high-confidence mechanical patch, output {"patches": []}. - 7. Max 3 patches per file. + 3. "old_string" MUST appear EXACTLY ONCE in the file (verbatim, including whitespace + trailing newlines). + 4. Max diff size: 6 lines changed per patch (NOT 20). If the change needs >6 lines, emit {"patches": []} — too risky for auto-apply. + 5. Mechanical-only (the list — nothing else): + (a) remove a #[allow(dead_code)] marker on a function now wired elsewhere + (b) add a missing 'use' import statement + (c) add a single field to a struct (no renames) + (d) flip a single boolean/match-arm that doesn't cascade + (e) add a fire-and-forget log or tracing call + 6. FORBIDDEN (automatic reject): + • struct field RENAMES (break all call sites) + • function signature changes + • new modules, new traits, new dependencies + • any change that requires editing another file to keep it compiling + 7. Each "new_string" MUST compile in isolation with the same surrounding code. + 8. Max 2 patches per file — quality over quantity. + 9. If you cannot produce at least one high-confidence mechanical patch under these constraints, output {"patches": []}. Don't guess. 
─── SOURCE (${source.length} bytes) ─── ${source.slice(0, 14000)} @@ -223,10 +241,12 @@ async function applyPatches(file: string, patches: Patch[]): Promise<{ applied: const occurrences = source.split(p.old_string).length - 1; if (occurrences === 0) { rejected.push({patch: p, reason: "old_string not found"}); continue; } if (occurrences > 1) { rejected.push({patch: p, reason: `old_string appears ${occurrences}× (not unique)`}); continue; } - // Size gate — no patch touches > 20 lines (diff discipline). + // Size gate — no patch touches > 6 lines (diff discipline; matches prompt). + // Tightened from 20 to 6 after 2026-04-24 data showed most 10-20 line patches + // cascaded and broke the build. Mechanical changes genuinely fit in 6 lines. const oldLines = p.old_string.split("\n").length; const newLines = p.new_string.split("\n").length; - if (Math.max(oldLines, newLines) > 20) { rejected.push({patch: p, reason: `patch too large (${Math.max(oldLines,newLines)} lines)`}); continue; } + if (Math.max(oldLines, newLines) > 6) { rejected.push({patch: p, reason: `patch too large (${Math.max(oldLines,newLines)} lines, max 6)`}); continue; } source = source.replace(p.old_string, p.new_string); applied++; }