9 changed files with 43 additions and 88 deletions
--- a/bot/propose.ts
+++ b/bot/propose.ts
@@ -16,14 +16,12 @@ import type { Gap, Proposal } from "./types.ts";
 // Phase 44 migration (2026-04-27): bot/propose.ts now flows through
 // the gateway's /v1/chat instead of hitting the sidecar's /generate
 // directly. /v1/usage tracks the call, Langfuse traces it, observer
-// sees it. Gateway owns the routing.
-//
-// 2026-04-28: gpt-oss:120b → deepseek-v3.2 via Ollama Pro. Newer
-// DeepSeek revision, faster, still on the same OLLAMA_CLOUD_KEY.
+// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on
+// Ollama Cloud) — gateway just owns the routing.
 const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const REPO_ROOT = "/home/profit/lakehouse";
 const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
-const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "deepseek-v3.2";
+const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
 const MAX_TOKENS = 6000;

 export async function findGaps(): Promise<Gap[]> {
--- a/config/modes.toml
+++ b/config/modes.toml
@@ -44,10 +44,7 @@ name = "staffing_inference"
 # pattern generalizes beyond code review.
 preferred_mode = "staffing_inference_lakehouse"
 fallback_modes = ["ladder", "consensus", "pipeline"]
-# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding-
-# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so
-# no extra provider hop.
-default_model = "kimi-k2.6"
+default_model = "openai/gpt-oss-120b:free"
 matrix_corpus = "workers_500k_v8"

 [[task_class]]
@@ -61,9 +58,7 @@ matrix_corpus = "kb_team_runs_v1"
 name = "doc_drift_check"
 preferred_mode = "drift"
 fallback_modes = ["validator"]
-# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro.
-# Speed leader on factual checking, same OLLAMA_CLOUD_KEY.
-default_model = "gemini-3-flash-preview"
+default_model = "gpt-oss:120b"
 matrix_corpus = "distilled_factual_v20260423095819"

 [[task_class]]
--- a/config/providers.toml
+++ b/config/providers.toml
@@ -27,15 +27,10 @@ name = "ollama_cloud"
 base_url = "https://ollama.com"
 auth = "bearer"
 auth_env = "OLLAMA_CLOUD_KEY"
-default_model = "deepseek-v3.2"
-# Cloud-tier Ollama (Pro plan as of 2026-04-28). Key resolved from
-# OLLAMA_CLOUD_KEY at gateway boot; Pro tier upgraded the account so
-# rate limits + model access widen without a key change. Model-prefix
-# routing: "cloud/<model>" auto-routes here. 39-model fleet now
-# includes deepseek-v3.2, deepseek-v4-{flash,pro}, gemini-3-flash-
-# preview, glm-{5,5.1}, kimi-k2.6, qwen3-coder-next.
-# 2026-04-28: default upgraded gpt-oss:120b → deepseek-v3.2 (newest
-# DeepSeek revision; kimi-k2:1t still upstream-broken with HTTP 500).
+default_model = "gpt-oss:120b"
+# Cloud-tier Ollama. Key resolved from OLLAMA_CLOUD_KEY env at gateway
+# boot. Model-prefix routing: "cloud/<model>" auto-routes here
+# (see gateway::v1::resolve_provider).

 [[provider]]
 name = "openrouter"
@@ -43,7 +38,7 @@ base_url = "https://openrouter.ai/api/v1"
 auth = "bearer"
 auth_env = "OPENROUTER_API_KEY"
 auth_fallback_files = ["/home/profit/.env", "/root/llm_team_config.json"]
-default_model = "x-ai/grok-4.1-fast"
+default_model = "openai/gpt-oss-120b:free"
 # Multi-provider gateway. Covers Anthropic, Google, OpenAI, MiniMax,
 # Qwen, Gemma, etc. Key resolved via crates/gateway/src/v1/openrouter.rs
 # resolve_openrouter_key() — env first, then fallback files.
--- a/crates/gateway/src/execution_loop/mod.rs
+++ b/crates/gateway/src/execution_loop/mod.rs
@@ -582,10 +582,10 @@ impl ExecutionLoop {
    /// Phase 20 step (8) — T3 overseer escalation.
    ///
    /// When the local executor/reviewer loop can't self-correct, call
-    /// the cloud overseer (`claude-opus-4-7` via OpenCode Zen) with
-    /// (a) the KB context — recent outcomes + prior corrections for
-    /// this sig_hash + task_class, across every profile that has run
-    /// it — and (b) the recent log tail. Its output is appended as a
+    /// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a)
+    /// the KB context — recent outcomes + prior corrections for this
+    /// sig_hash + task_class, across every profile that has run it —
+    /// and (b) the recent log tail. Its output is appended as a
    /// `system` role turn so the next executor generation sees it,
    /// AND written to `data/_kb/overseer_corrections.jsonl` so every
    /// future profile activation reads from the same learning pool.
@@ -593,16 +593,9 @@ impl ExecutionLoop {
    /// This is the "pipe to the overviewer" piece from 2026-04-23 —
    /// the overseer is now a first-class KB consumer AND producer, not
    /// a one-shot correction oracle.
-    ///
-    /// 2026-04-28: routed through OpenCode (Zen tier) for Claude Opus
-    /// 4.7. Frontier reasoning matters here because the overseer fires
-    /// only after local self-correction has failed twice — by that
-    /// point we need the strongest reasoning available, not the
-    /// cheapest token. Frequency is low so the Zen pay-per-token cost
-    /// stays bounded.
    async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> {
-        let Some(opencode_key) = self.state.opencode_key.clone() else {
-            return Err("OPENCODE_API_KEY not configured — skipping escalation".into());
+        let Some(cloud_key) = self.state.ollama_cloud_key.clone() else {
+            return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into());
        };

        let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await;
@@ -611,18 +604,16 @@ impl ExecutionLoop {
        let started = std::time::Instant::now();
        let start_time = chrono::Utc::now();
        let chat_req = crate::v1::ChatRequest {
-            model: "claude-opus-4-7".to_string(),
+            model: "gpt-oss:120b".to_string(),
            messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
            temperature: Some(0.1),
            max_tokens: None,
            stream: Some(false),
-            // Anthropic models on opencode reject `think` (handled in
-            // the adapter), but we keep the intent flag for parity.
-            think: Some(true),
-            provider: Some("opencode".into()),
+            think: Some(true),    // overseer KEEPS thinking (Phase 20 rule)
+            provider: Some("ollama_cloud".into()),
        };
-        let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await
-            .map_err(|e| format!("opencode: {e}"))?;
+        let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await
+            .map_err(|e| format!("ollama_cloud: {e}"))?;
        let latency_ms = started.elapsed().as_millis() as u64;
        let end_time = chrono::Utc::now();
        let correction_text: String = resp.choices.into_iter().next()
@@ -642,8 +633,8 @@ impl ExecutionLoop {
        if let Some(lf) = &self.state.langfuse {
            use crate::v1::langfuse_trace::ChatTrace;
            lf.emit_chat(ChatTrace {
-                provider: "opencode".into(),
-                model: "claude-opus-4-7".into(),
+                provider: "ollama_cloud".into(),
+                model: "gpt-oss:120b".into(),
                input: vec![crate::v1::Message::new_text("user", prompt.clone())],
                output: correction_text.clone(),
                prompt_tokens: resp.usage.prompt_tokens,
@@ -659,7 +650,7 @@ impl ExecutionLoop {

        // Append to the transcript so the next executor turn sees it.
        self.append(LogEntry::new(
-            turn, "system", "claude-opus-4-7", "overseer_correction",
+            turn, "system", "gpt-oss:120b", "overseer_correction",
            serde_json::json!({
                "reason": reason,
                "correction": correction_text,
@@ -681,7 +672,7 @@ impl ExecutionLoop {
            "task_class": self.req.task_class,
            "operation": self.req.operation,
            "reason": reason,
-            "model": "claude-opus-4-7",
+            "model": "gpt-oss:120b",
            "correction": correction_text,
            "applied_at_turn": turn,
            "kb_context_used": kb,
--- a/crates/vectord/src/rag.rs
+++ b/crates/vectord/src/rag.rs
@@ -163,11 +163,7 @@ pub async fn query(
    // production caller of the Phase 21 primitives — see audit finding
    // "Phase 21 Rust primitives are wired but not CALLED by any
    // production surface" from 2026-04-21.
-    // 2026-04-30 model bump: qwen2.5:latest → qwen3.5:latest to match
-    // the small-model-pipeline local-tier default. Same JSON-clean
-    // property, more capacity. think=Some(false) preserved — RAG hot
-    // path doesn't need reasoning traces; direct answers only.
-    let mut cont_opts = ContinuableOpts::new("qwen3.5:latest");
+    let mut cont_opts = ContinuableOpts::new("qwen2.5:latest");
    cont_opts.max_tokens = Some(512);
    cont_opts.temperature = Some(0.2);
    cont_opts.shape = ResponseShape::Text;
@@ -180,7 +176,7 @@ pub async fn query(
        // echoes whatever Ollama loaded). Use the configured tier model
        // for now; if RAG needs to report the actual resolved model,
        // the runner can add a post-call ps probe later.
-        model: "qwen3.5:latest".to_string(),
+        model: "qwen2.5:latest".to_string(),
        sources: results,
        tokens_generated: None,
    })
--- a/lakehouse.toml
+++ b/lakehouse.toml
@@ -48,13 +48,8 @@ url = "http://localhost:3200"

 [ai]
 embed_model = "nomic-embed-text"
-# Local-tier defaults bumped 2026-04-30: qwen3.5:latest is the
-# stronger local rung in the 5-loop substrate (per
-# project_small_model_pipeline_vision.md). Same JSON-clean property
-# as qwen2.5, more capacity. Ollama still serves both — bump back
-# in this file if a workload regressed.
-gen_model = "qwen3.5:latest"
-rerank_model = "qwen3.5:latest"
+gen_model = "qwen2.5"
+rerank_model = "qwen2.5"

 [auth]
 enabled = false
@@ -77,9 +72,7 @@ min_recall = 0.9                          # never promote below this
 max_trials_per_hour = 20                  # hard budget cap

 # Model roster — available for profile hot-swap
-# qwen3.5:latest: stronger local rung — JSON-clean, 8K+ context,
-#                 default for gen_model and rerank_model
 # qwen3: 8.2B, 40K context, thinking+tools, best for reasoning tasks
-# qwen2.5: 7B, 8K context, fast — kept loaded for the 2026-04 era
-#          comparison runs; new defaults use qwen3.5:latest
+# qwen2.5: 7B, 8K context, fast, good for SQL generation
+# mistral: 7B, 8K context, good for general generation
 # nomic-embed-text: 137M, embedding-only, used by all profiles
--- a/mcp-server/index.ts
+++ b/mcp-server/index.ts
@@ -313,9 +313,9 @@ ${(buckets as any[] || []).map((b: any) => `- ${b.name}: ${b.backend} (${b.reach
 - Ollama: :11434

 ## Available Models
-- qwen3.5:latest: stronger local rung, JSON-clean (default for gen + rerank)
 - qwen3: 8.2B, 40K context, thinking+tools (best for reasoning)
-- qwen2.5: 7B, 8K context (legacy — 2026-04 era comparison runs only)
+- qwen2.5: 7B, 8K context (best for fast SQL generation)
+- mistral: 7B, 8K context (general generation)
 - nomic-embed-text: 137M (embedding, automatic)
 `;
  return { contents: [{ uri: uri.href, mimeType: "text/plain", text }] };
--- a/mcp-server/observer.ts
+++ b/mcp-server/observer.ts
@@ -146,16 +146,15 @@ async function persistOp(op: ObservedOp) {
 // ─── LLM Team escalation (code_review mode) ───
 //
 // When recent failures on a single sig_hash cross a threshold the
-// local-model analysis is probably insufficient. J's 2026-04-24
+// local qwen2.5 analysis is probably insufficient. J's 2026-04-24
 // direction: "the observer would trigger to give more context" —
 // route failure clusters to LLM Team's specialized code_review mode
 // (via /api/run) so richer structured signal lands in the KB for
 // scrum + auditor + playbook memory to consume next pass.
 //
-// Non-destructive: runs in parallel to the existing local diagnose
-// call (qwen3.5:latest after the 2026-04-30 bump), never replaces
-// it. Writes to data/_kb/observer_escalations.jsonl as a dedicated
-// audit surface.
+// Non-destructive: runs in parallel to the existing qwen2.5 analysis,
+// never replaces it. Writes to data/_kb/observer_escalations.jsonl
+// as a dedicated audit surface.

 const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
 const LLM_TEAM_ESCALATIONS = "/home/profit/lakehouse/data/_kb/observer_escalations.jsonl";
@@ -543,7 +542,7 @@ async function analyzeErrors() {
  if (failures.length === 0) return;

  // NEW 2026-04-24: escalate recurring sig_hash clusters to LLM Team
-  // code_review mode. Runs in parallel to the local diagnose call
+  // code_review mode. Runs in parallel to the local qwen2.5 analysis
  // below — non-blocking, richer downstream signal for scrum/auditor.
  maybeEscalate(failures).catch(() => {});

@@ -553,14 +552,13 @@ async function analyzeErrors() {

  // Ask local model to diagnose. Phase 44 migration (2026-04-27):
  // /v1/chat instead of legacy /ai/generate so /v1/usage tracks the
-  // call + Langfuse traces it. 2026-04-30 model bump: qwen2.5 →
-  // qwen3.5:latest to match the small-model-pipeline local-tier default.
+  // call + Langfuse traces it. Same upstream model (qwen2.5 local).
  try {
    const resp = await fetch(`${LAKEHOUSE}/v1/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
-        model: "qwen3.5:latest",
+        model: "qwen2.5",
        provider: "ollama",
        messages: [{
          role: "user",
@@ -771,7 +769,7 @@ async function tailOverseerCorrections(): Promise<number> {
    try { row = JSON.parse(line); } catch { continue; }
    const op: ObservedOp = {
      timestamp: row.created_at ?? new Date().toISOString(),
-      endpoint: `overseer:${row.model ?? "claude-opus-4-7"}`,
+      endpoint: `overseer:${row.model ?? "gpt-oss:120b"}`,
      input_summary: `${row.task_class ?? "?"}: ${row.reason ?? "escalation"}`,
      // Correction itself is neither success nor failure — it's a
      // mitigation attempt. We mark success=true so analyzeErrors
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@@ -1143,15 +1143,9 @@ Format each as a code-fenced block with the byte offset within the shard:
 EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE
 \`\`\`
 Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`;
-    // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
-    // Pro. Tree-split MAP fires once per shard (potentially 5-20×
-    // per file), so latency dominates total scrum time. Gemini 3
-    // flash returns shard digests substantially faster than the old
-    // 120B free model while staying strong enough for byte-anchored
-    // extraction.
    const r = await chat({
      provider: "ollama_cloud",
-      model: "gemini-3-flash-preview",
+      model: "gpt-oss:120b",
      prompt,
      max_tokens: 900,
    });
@@ -1201,14 +1195,9 @@ COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT

 Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between. The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`;

-  // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
-  // Pro. The reducer runs once per file (vs once per shard for MAP)
-  // but on a much larger context (all shard digests stacked), so
-  // throughput per token still matters. Same model as MAP for
-  // consistency in tree-split outputs.
  const reduced = await chat({
    provider: "ollama_cloud",
-    model: "gemini-3-flash-preview",
+    model: "gpt-oss:120b",
    prompt: reducePrompt,
    max_tokens: 2400,
  });