From d475fc7fffc9e4f404e0ec39c222156a4e5a7062 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 28 Apr 2026 06:13:30 -0500 Subject: [PATCH] infra: replace gpt-oss with Ollama Pro + OpenCode Zen across hot paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ollama Pro plan went live today (39-model fleet on the same OLLAMA_CLOUD_KEY) and OpenCode Zen was already wired in the gateway but not consumed. Routing every gpt-oss call site to faster / stronger replacements: | Site | gpt-oss → replacement | Why | |---|---|---| | ollama_cloud default | gpt-oss:120b → deepseek-v3.2 | newest DeepSeek revision; live-probed `pong` | | openrouter default | openai/gpt-oss-120b:free → x-ai/grok-4.1-fast | already the scrum LADDER's PRIMARY | | modes.toml staffing_inference | openai/gpt-oss-120b:free → kimi-k2.6 | coding-specialized, on Ollama Pro | | modes.toml doc_drift_check | gpt-oss:120b → gemini-3-flash-preview | speed leader for factual checks | | scrum_master_pipeline tree-split MAP+REDUCE | gpt-oss:120b → gemini-3-flash-preview | latency-dominated path (5-20× per file) | | bot/propose.ts CLOUD_MODEL | gpt-oss:120b → deepseek-v3.2 | same Ollama key, faster | | mcp-server/observer.ts overseer label fallback | gpt-oss:120b → claude-opus-4-7 | matches new overseer model | | crates/gateway/src/execution_loop overseer escalation | ollama_cloud/gpt-oss:120b → opencode/claude-opus-4-7 | frontier reasoning matters here — fires only after local self-correct fails twice; Zen pay-per-token cost is bounded | Verification: - `cargo check -p gateway --tests` — clean - Live probes through localhost:3100/v1/chat: - `opencode/claude-opus-4-7` → "pong" - `gemini-3-flash-preview` (ollama_cloud) → "pong" - `kimi-k2.6` (ollama_cloud) → "pong" - `deepseek-v3.2` (ollama_cloud) → "Pong! 🏓" Notes: - kimi-k2:1t still upstream-broken (HTTP 500 on Ollama Pro probe today, matches yesterday's memory). Replacement table never picks it. - The Rust changes need a `systemctl restart lakehouse.service` to take effect on the running gateway. TS callers reload on next run. - aibridge/src/context.rs still has gpt-oss:{20b,120b} in its window- size lookup table; harmless and kept for callers that pass it explicitly as an override. Co-Authored-By: Claude Opus 4.7 (1M context) --- bot/propose.ts | 8 +++-- config/modes.toml | 9 ++++-- config/providers.toml | 15 ++++++--- crates/gateway/src/execution_loop/mod.rs | 39 ++++++++++++++--------- mcp-server/observer.ts | 2 +- tests/real-world/scrum_master_pipeline.ts | 15 +++++++-- 6 files changed, 60 insertions(+), 28 deletions(-) diff --git a/bot/propose.ts b/bot/propose.ts index ab7b6ca..441529f 100644 --- a/bot/propose.ts +++ b/bot/propose.ts @@ -16,12 +16,14 @@ import type { Gap, Proposal } from "./types.ts"; // Phase 44 migration (2026-04-27): bot/propose.ts now flows through // the gateway's /v1/chat instead of hitting the sidecar's /generate // directly. /v1/usage tracks the call, Langfuse traces it, observer -// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on -// Ollama Cloud) — gateway just owns the routing. +// sees it. Gateway owns the routing. +// +// 2026-04-28: gpt-oss:120b → deepseek-v3.2 via Ollama Pro. Newer +// DeepSeek revision, faster, still on the same OLLAMA_CLOUD_KEY. const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const REPO_ROOT = "/home/profit/lakehouse"; const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`; -const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? 
"gpt-oss:120b"; +const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "deepseek-v3.2"; const MAX_TOKENS = 6000; export async function findGaps(): Promise { diff --git a/config/modes.toml b/config/modes.toml index 169b4d2..bf7f159 100644 --- a/config/modes.toml +++ b/config/modes.toml @@ -44,7 +44,10 @@ name = "staffing_inference" # pattern generalizes beyond code review. preferred_mode = "staffing_inference_lakehouse" fallback_modes = ["ladder", "consensus", "pipeline"] -default_model = "openai/gpt-oss-120b:free" +# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding- +# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so +# no extra provider hop. +default_model = "kimi-k2.6" matrix_corpus = "workers_500k_v8" [[task_class]] @@ -58,7 +61,9 @@ matrix_corpus = "kb_team_runs_v1" name = "doc_drift_check" preferred_mode = "drift" fallback_modes = ["validator"] -default_model = "gpt-oss:120b" +# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro. +# Speed leader on factual checking, same OLLAMA_CLOUD_KEY. +default_model = "gemini-3-flash-preview" matrix_corpus = "distilled_factual_v20260423095819" [[task_class]] diff --git a/config/providers.toml b/config/providers.toml index 248d672..81eea70 100644 --- a/config/providers.toml +++ b/config/providers.toml @@ -27,10 +27,15 @@ name = "ollama_cloud" base_url = "https://ollama.com" auth = "bearer" auth_env = "OLLAMA_CLOUD_KEY" -default_model = "gpt-oss:120b" -# Cloud-tier Ollama. Key resolved from OLLAMA_CLOUD_KEY env at gateway -# boot. Model-prefix routing: "cloud/" auto-routes here -# (see gateway::v1::resolve_provider). +default_model = "deepseek-v3.2" +# Cloud-tier Ollama (Pro plan as of 2026-04-28). Key resolved from +# OLLAMA_CLOUD_KEY at gateway boot; Pro tier upgraded the account so +# rate limits + model access widen without a key change. Model-prefix +# routing: "cloud/" auto-routes here. 39-model fleet now +# includes deepseek-v3.2, deepseek-v4-{flash,pro}, gemini-3-flash- +# preview, glm-{5,5.1}, kimi-k2.6, qwen3-coder-next. +# 2026-04-28: default upgraded gpt-oss:120b → deepseek-v3.2 (newest +# DeepSeek revision; kimi-k2:1t still upstream-broken with HTTP 500). [[provider]] name = "openrouter" @@ -38,7 +43,7 @@ base_url = "https://openrouter.ai/api/v1" auth = "bearer" auth_env = "OPENROUTER_API_KEY" auth_fallback_files = ["/home/profit/.env", "/root/llm_team_config.json"] -default_model = "openai/gpt-oss-120b:free" +default_model = "x-ai/grok-4.1-fast" # Multi-provider gateway. Covers Anthropic, Google, OpenAI, MiniMax, # Qwen, Gemma, etc. Key resolved via crates/gateway/src/v1/openrouter.rs # resolve_openrouter_key() — env first, then fallback files. diff --git a/crates/gateway/src/execution_loop/mod.rs b/crates/gateway/src/execution_loop/mod.rs index aaab58d..57cb86f 100644 --- a/crates/gateway/src/execution_loop/mod.rs +++ b/crates/gateway/src/execution_loop/mod.rs @@ -582,10 +582,10 @@ impl ExecutionLoop { /// Phase 20 step (8) — T3 overseer escalation. /// /// When the local executor/reviewer loop can't self-correct, call - /// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a) - /// the KB context — recent outcomes + prior corrections for this - /// sig_hash + task_class, across every profile that has run it — - /// and (b) the recent log tail. 
Its output is appended as a + /// the cloud overseer (`claude-opus-4-7` via OpenCode Zen) with + /// (a) the KB context — recent outcomes + prior corrections for + /// this sig_hash + task_class, across every profile that has run + /// it — and (b) the recent log tail. Its output is appended as a /// `system` role turn so the next executor generation sees it, /// AND written to `data/_kb/overseer_corrections.jsonl` so every /// future profile activation reads from the same learning pool. @@ -593,9 +593,16 @@ impl ExecutionLoop { /// This is the "pipe to the overviewer" piece from 2026-04-23 — /// the overseer is now a first-class KB consumer AND producer, not /// a one-shot correction oracle. + /// + /// 2026-04-28: routed through OpenCode (Zen tier) for Claude Opus + /// 4.7. Frontier reasoning matters here because the overseer fires + /// only after local self-correction has failed twice — by that + /// point we need the strongest reasoning available, not the + /// cheapest token. Frequency is low so the Zen pay-per-token cost + /// stays bounded. async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> { - let Some(cloud_key) = self.state.ollama_cloud_key.clone() else { - return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into()); + let Some(opencode_key) = self.state.opencode_key.clone() else { + return Err("OPENCODE_API_KEY not configured — skipping escalation".into()); }; let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await; @@ -604,16 +611,18 @@ impl ExecutionLoop { let started = std::time::Instant::now(); let start_time = chrono::Utc::now(); let chat_req = crate::v1::ChatRequest { - model: "gpt-oss:120b".to_string(), + model: "claude-opus-4-7".to_string(), messages: vec![crate::v1::Message::new_text("user", prompt.clone())], temperature: Some(0.1), max_tokens: None, stream: Some(false), - think: Some(true), // overseer KEEPS thinking (Phase 20 rule) - provider: Some("ollama_cloud".into()), + // Anthropic models on opencode reject `think` (handled in + // the adapter), but we keep the intent flag for parity. + think: Some(true), + provider: Some("opencode".into()), }; - let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await - .map_err(|e| format!("ollama_cloud: {e}"))?; + let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await + .map_err(|e| format!("opencode: {e}"))?; let latency_ms = started.elapsed().as_millis() as u64; let end_time = chrono::Utc::now(); let correction_text: String = resp.choices.into_iter().next() @@ -633,8 +642,8 @@ impl ExecutionLoop { if let Some(lf) = &self.state.langfuse { use crate::v1::langfuse_trace::ChatTrace; lf.emit_chat(ChatTrace { - provider: "ollama_cloud".into(), - model: "gpt-oss:120b".into(), + provider: "opencode".into(), + model: "claude-opus-4-7".into(), input: vec![crate::v1::Message::new_text("user", prompt.clone())], output: correction_text.clone(), prompt_tokens: resp.usage.prompt_tokens, @@ -650,7 +659,7 @@ impl ExecutionLoop { // Append to the transcript so the next executor turn sees it. 
self.append(LogEntry::new( - turn, "system", "gpt-oss:120b", "overseer_correction", + turn, "system", "claude-opus-4-7", "overseer_correction", serde_json::json!({ "reason": reason, "correction": correction_text, @@ -672,7 +681,7 @@ impl ExecutionLoop { "task_class": self.req.task_class, "operation": self.req.operation, "reason": reason, - "model": "gpt-oss:120b", + "model": "claude-opus-4-7", "correction": correction_text, "applied_at_turn": turn, "kb_context_used": kb, diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index edb6e45..24e8042 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -769,7 +769,7 @@ async function tailOverseerCorrections(): Promise { try { row = JSON.parse(line); } catch { continue; } const op: ObservedOp = { timestamp: row.created_at ?? new Date().toISOString(), - endpoint: `overseer:${row.model ?? "gpt-oss:120b"}`, + endpoint: `overseer:${row.model ?? "claude-opus-4-7"}`, input_summary: `${row.task_class ?? "?"}: ${row.reason ?? "escalation"}`, // Correction itself is neither success nor failure — it's a // mitigation attempt. We mark success=true so analyzeErrors diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index fb18d5e..bf3a474 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -1143,9 +1143,15 @@ Format each as a code-fenced block with the byte offset within the shard: EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE \`\`\` Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`; + // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama + // Pro. Tree-split MAP fires once per shard (potentially 5-20× + // per file), so latency dominates total scrum time. Gemini 3 + // flash returns shard digests substantially faster than the old + // 120B free model while staying strong enough for byte-anchored + // extraction. const r = await chat({ provider: "ollama_cloud", - model: "gpt-oss:120b", + model: "gemini-3-flash-preview", prompt, max_tokens: 900, }); @@ -1195,9 +1201,14 @@ COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between. The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`; + // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama + // Pro. The reducer runs once per file (vs once per shard for MAP) + // but on a much larger context (all shard digests stacked), so + // throughput per token still matters. Same model as MAP for + // consistency in tree-split outputs. const reduced = await chat({ provider: "ollama_cloud", - model: "gpt-oss:120b", + model: "gemini-3-flash-preview", prompt: reducePrompt, max_tokens: 2400, });
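-- 
Post-apply smoke probe (sketch, not part of the patch): after `systemctl restart lakehouse.service`, the four verification pings above can be replayed through the gateway. This assumes the gateway's /v1/chat accepts the same `{provider, model, messages, max_tokens, stream}` body that `crate::v1::ChatRequest` serializes in `escalate_to_overseer`, and returns an OpenAI-style `choices` array; the file name and exact JSON field names are illustrative, not confirmed gateway contract.

```ts
// smoke_probe.ts — hypothetical helper, not referenced anywhere in the repo.
// Replays the live probes from the commit message against the running gateway.
const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";

// provider/model pairs taken verbatim from the Verification section above
const PROBES = [
  { provider: "opencode", model: "claude-opus-4-7" },
  { provider: "ollama_cloud", model: "gemini-3-flash-preview" },
  { provider: "ollama_cloud", model: "kimi-k2.6" },
  { provider: "ollama_cloud", model: "deepseek-v3.2" },
];

async function probe(provider: string, model: string): Promise<string> {
  // Body mirrors crate::v1::ChatRequest as used in escalate_to_overseer; field
  // names are an assumption if the gateway renames them on the HTTP surface.
  const res = await fetch(`${GATEWAY_URL}/v1/chat`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      provider,
      model,
      messages: [{ role: "user", content: "Reply with the single word: pong" }],
      max_tokens: 16,
      stream: false,
    }),
  });
  if (!res.ok) return `HTTP ${res.status}`;
  const body = await res.json();
  // OpenAI-style shape, matching resp.choices / resp.usage in the Rust diff.
  return body?.choices?.[0]?.message?.content?.trim() ?? "(empty)";
}

for (const p of PROBES) {
  console.log(`${p.provider}/${p.model} → ${await probe(p.provider, p.model)}`);
}
```

Anything other than a "pong"-shaped reply (or an HTTP error on kimi-k2:1t, which stays excluded from the replacement table) would indicate the restart did not pick up the new defaults.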