From d475fc7fffc9e4f404e0ec39c222156a4e5a7062 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 28 Apr 2026 06:13:30 -0500 Subject: [PATCH] infra: replace gpt-oss with Ollama Pro + OpenCode Zen across hot paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ollama Pro plan went live today (39-model fleet on the same OLLAMA_CLOUD_KEY) and OpenCode Zen was already wired in the gateway but not consumed. Routing every gpt-oss call site to faster / stronger replacements: | Site | gpt-oss → replacement | Why | |---|---|---| | ollama_cloud default | gpt-oss:120b → deepseek-v3.2 | newest DeepSeek revision; live-probed `pong` | | openrouter default | openai/gpt-oss-120b:free → x-ai/grok-4.1-fast | already the scrum LADDER's PRIMARY | | modes.toml staffing_inference | openai/gpt-oss-120b:free → kimi-k2.6 | coding-specialized, on Ollama Pro | | modes.toml doc_drift_check | gpt-oss:120b → gemini-3-flash-preview | speed leader for factual checks | | scrum_master_pipeline tree-split MAP+REDUCE | gpt-oss:120b → gemini-3-flash-preview | latency-dominated path (5-20× per file) | | bot/propose.ts CLOUD_MODEL | gpt-oss:120b → deepseek-v3.2 | same Ollama key, faster | | mcp-server/observer.ts overseer label fallback | gpt-oss:120b → claude-opus-4-7 | matches new overseer model | | crates/gateway/src/execution_loop overseer escalation | ollama_cloud/gpt-oss:120b → opencode/claude-opus-4-7 | frontier reasoning matters here — fires only after local self-correct fails twice; Zen pay-per-token cost is bounded | Verification: - `cargo check -p gateway --tests` — clean - Live probes through localhost:3100/v1/chat: - `opencode/claude-opus-4-7` → "pong" - `gemini-3-flash-preview` (ollama_cloud) → "pong" - `kimi-k2.6` (ollama_cloud) → "pong" - `deepseek-v3.2` (ollama_cloud) → "Pong! 🏓" Notes: - kimi-k2:1t still upstream-broken (HTTP 500 on Ollama Pro probe today, matches yesterday's memory). Replacement table never picks it. - The Rust changes need a `systemctl restart lakehouse.service` to take effect on the running gateway. TS callers reload on next run. - aibridge/src/context.rs still has gpt-oss:{20b,120b} in its window- size lookup table; harmless and kept for callers that pass it explicitly as an override. Co-Authored-By: Claude Opus 4.7 (1M context) --- bot/propose.ts | 8 +++-- config/modes.toml | 9 ++++-- config/providers.toml | 15 ++++++--- crates/gateway/src/execution_loop/mod.rs | 39 ++++++++++++++--------- mcp-server/observer.ts | 2 +- tests/real-world/scrum_master_pipeline.ts | 15 +++++++-- 6 files changed, 60 insertions(+), 28 deletions(-) diff --git a/bot/propose.ts b/bot/propose.ts index ab7b6ca..441529f 100644 --- a/bot/propose.ts +++ b/bot/propose.ts @@ -16,12 +16,14 @@ import type { Gap, Proposal } from "./types.ts"; // Phase 44 migration (2026-04-27): bot/propose.ts now flows through // the gateway's /v1/chat instead of hitting the sidecar's /generate // directly. /v1/usage tracks the call, Langfuse traces it, observer -// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on -// Ollama Cloud) — gateway just owns the routing. +// sees it. Gateway owns the routing. +// +// 2026-04-28: gpt-oss:120b → deepseek-v3.2 via Ollama Pro. Newer +// DeepSeek revision, faster, still on the same OLLAMA_CLOUD_KEY. const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const REPO_ROOT = "/home/profit/lakehouse"; const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`; -const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? 
"gpt-oss:120b"; +const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "deepseek-v3.2"; const MAX_TOKENS = 6000; export async function findGaps(): Promise { diff --git a/config/modes.toml b/config/modes.toml index 169b4d2..bf7f159 100644 --- a/config/modes.toml +++ b/config/modes.toml @@ -44,7 +44,10 @@ name = "staffing_inference" # pattern generalizes beyond code review. preferred_mode = "staffing_inference_lakehouse" fallback_modes = ["ladder", "consensus", "pipeline"] -default_model = "openai/gpt-oss-120b:free" +# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding- +# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so +# no extra provider hop. +default_model = "kimi-k2.6" matrix_corpus = "workers_500k_v8" [[task_class]] @@ -58,7 +61,9 @@ matrix_corpus = "kb_team_runs_v1" name = "doc_drift_check" preferred_mode = "drift" fallback_modes = ["validator"] -default_model = "gpt-oss:120b" +# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro. +# Speed leader on factual checking, same OLLAMA_CLOUD_KEY. +default_model = "gemini-3-flash-preview" matrix_corpus = "distilled_factual_v20260423095819" [[task_class]] diff --git a/config/providers.toml b/config/providers.toml index 248d672..81eea70 100644 --- a/config/providers.toml +++ b/config/providers.toml @@ -27,10 +27,15 @@ name = "ollama_cloud" base_url = "https://ollama.com" auth = "bearer" auth_env = "OLLAMA_CLOUD_KEY" -default_model = "gpt-oss:120b" -# Cloud-tier Ollama. Key resolved from OLLAMA_CLOUD_KEY env at gateway -# boot. Model-prefix routing: "cloud/" auto-routes here -# (see gateway::v1::resolve_provider). +default_model = "deepseek-v3.2" +# Cloud-tier Ollama (Pro plan as of 2026-04-28). Key resolved from +# OLLAMA_CLOUD_KEY at gateway boot; Pro tier upgraded the account so +# rate limits + model access widen without a key change. Model-prefix +# routing: "cloud/" auto-routes here. 39-model fleet now +# includes deepseek-v3.2, deepseek-v4-{flash,pro}, gemini-3-flash- +# preview, glm-{5,5.1}, kimi-k2.6, qwen3-coder-next. +# 2026-04-28: default upgraded gpt-oss:120b → deepseek-v3.2 (newest +# DeepSeek revision; kimi-k2:1t still upstream-broken with HTTP 500). [[provider]] name = "openrouter" @@ -38,7 +43,7 @@ base_url = "https://openrouter.ai/api/v1" auth = "bearer" auth_env = "OPENROUTER_API_KEY" auth_fallback_files = ["/home/profit/.env", "/root/llm_team_config.json"] -default_model = "openai/gpt-oss-120b:free" +default_model = "x-ai/grok-4.1-fast" # Multi-provider gateway. Covers Anthropic, Google, OpenAI, MiniMax, # Qwen, Gemma, etc. Key resolved via crates/gateway/src/v1/openrouter.rs # resolve_openrouter_key() — env first, then fallback files. diff --git a/crates/gateway/src/execution_loop/mod.rs b/crates/gateway/src/execution_loop/mod.rs index aaab58d..57cb86f 100644 --- a/crates/gateway/src/execution_loop/mod.rs +++ b/crates/gateway/src/execution_loop/mod.rs @@ -582,10 +582,10 @@ impl ExecutionLoop { /// Phase 20 step (8) — T3 overseer escalation. /// /// When the local executor/reviewer loop can't self-correct, call - /// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a) - /// the KB context — recent outcomes + prior corrections for this - /// sig_hash + task_class, across every profile that has run it — - /// and (b) the recent log tail. 
Its output is appended as a + /// the cloud overseer (`claude-opus-4-7` via OpenCode Zen) with + /// (a) the KB context — recent outcomes + prior corrections for + /// this sig_hash + task_class, across every profile that has run + /// it — and (b) the recent log tail. Its output is appended as a /// `system` role turn so the next executor generation sees it, /// AND written to `data/_kb/overseer_corrections.jsonl` so every /// future profile activation reads from the same learning pool. @@ -593,9 +593,16 @@ impl ExecutionLoop { /// This is the "pipe to the overviewer" piece from 2026-04-23 — /// the overseer is now a first-class KB consumer AND producer, not /// a one-shot correction oracle. + /// + /// 2026-04-28: routed through OpenCode (Zen tier) for Claude Opus + /// 4.7. Frontier reasoning matters here because the overseer fires + /// only after local self-correction has failed twice — by that + /// point we need the strongest reasoning available, not the + /// cheapest token. Frequency is low so the Zen pay-per-token cost + /// stays bounded. async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> { - let Some(cloud_key) = self.state.ollama_cloud_key.clone() else { - return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into()); + let Some(opencode_key) = self.state.opencode_key.clone() else { + return Err("OPENCODE_API_KEY not configured — skipping escalation".into()); }; let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await; @@ -604,16 +611,18 @@ impl ExecutionLoop { let started = std::time::Instant::now(); let start_time = chrono::Utc::now(); let chat_req = crate::v1::ChatRequest { - model: "gpt-oss:120b".to_string(), + model: "claude-opus-4-7".to_string(), messages: vec![crate::v1::Message::new_text("user", prompt.clone())], temperature: Some(0.1), max_tokens: None, stream: Some(false), - think: Some(true), // overseer KEEPS thinking (Phase 20 rule) - provider: Some("ollama_cloud".into()), + // Anthropic models on opencode reject `think` (handled in + // the adapter), but we keep the intent flag for parity. + think: Some(true), + provider: Some("opencode".into()), }; - let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await - .map_err(|e| format!("ollama_cloud: {e}"))?; + let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await + .map_err(|e| format!("opencode: {e}"))?; let latency_ms = started.elapsed().as_millis() as u64; let end_time = chrono::Utc::now(); let correction_text: String = resp.choices.into_iter().next() @@ -633,8 +642,8 @@ impl ExecutionLoop { if let Some(lf) = &self.state.langfuse { use crate::v1::langfuse_trace::ChatTrace; lf.emit_chat(ChatTrace { - provider: "ollama_cloud".into(), - model: "gpt-oss:120b".into(), + provider: "opencode".into(), + model: "claude-opus-4-7".into(), input: vec![crate::v1::Message::new_text("user", prompt.clone())], output: correction_text.clone(), prompt_tokens: resp.usage.prompt_tokens, @@ -650,7 +659,7 @@ impl ExecutionLoop { // Append to the transcript so the next executor turn sees it. 
self.append(LogEntry::new( - turn, "system", "gpt-oss:120b", "overseer_correction", + turn, "system", "claude-opus-4-7", "overseer_correction", serde_json::json!({ "reason": reason, "correction": correction_text, @@ -672,7 +681,7 @@ impl ExecutionLoop { "task_class": self.req.task_class, "operation": self.req.operation, "reason": reason, - "model": "gpt-oss:120b", + "model": "claude-opus-4-7", "correction": correction_text, "applied_at_turn": turn, "kb_context_used": kb, diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts index edb6e45..24e8042 100644 --- a/mcp-server/observer.ts +++ b/mcp-server/observer.ts @@ -769,7 +769,7 @@ async function tailOverseerCorrections(): Promise { try { row = JSON.parse(line); } catch { continue; } const op: ObservedOp = { timestamp: row.created_at ?? new Date().toISOString(), - endpoint: `overseer:${row.model ?? "gpt-oss:120b"}`, + endpoint: `overseer:${row.model ?? "claude-opus-4-7"}`, input_summary: `${row.task_class ?? "?"}: ${row.reason ?? "escalation"}`, // Correction itself is neither success nor failure — it's a // mitigation attempt. We mark success=true so analyzeErrors diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index fb18d5e..bf3a474 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -1143,9 +1143,15 @@ Format each as a code-fenced block with the byte offset within the shard: EXACT LINE OF SOURCE — DO NOT PARAPHRASE, DO NOT TRUNCATE \`\`\` Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`; + // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama + // Pro. Tree-split MAP fires once per shard (potentially 5-20× + // per file), so latency dominates total scrum time. Gemini 3 + // flash returns shard digests substantially faster than the old + // 120B free model while staying strong enough for byte-anchored + // extraction. const r = await chat({ provider: "ollama_cloud", - model: "gpt-oss:120b", + model: "gemini-3-flash-preview", prompt, max_tokens: 900, }); @@ -1195,9 +1201,14 @@ COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between. The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`; + // 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama + // Pro. The reducer runs once per file (vs once per shard for MAP) + // but on a much larger context (all shard digests stacked), so + // throughput per token still matters. Same model as MAP for + // consistency in tree-split outputs. const reduced = await chat({ provider: "ollama_cloud", - model: "gpt-oss:120b", + model: "gemini-3-flash-preview", prompt: reducePrompt, max_tokens: 2400, });
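-- 
Post-apply smoke probe (sketch, not part of the patch): after `systemctl restart lakehouse.service`, the four verification pings above can be replayed through the gateway. This assumes the gateway's /v1/chat accepts the same `{provider, model, messages, max_tokens, stream}` body that `crate::v1::ChatRequest` serializes in `escalate_to_overseer`, and returns an OpenAI-style `choices` array; the file name and exact JSON field names are illustrative, not confirmed gateway contract.

```ts
// smoke_probe.ts — hypothetical helper, not referenced anywhere in the repo.
// Replays the live probes from the commit message against the running gateway.
const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";

// provider/model pairs taken verbatim from the Verification section above
const PROBES = [
  { provider: "opencode", model: "claude-opus-4-7" },
  { provider: "ollama_cloud", model: "gemini-3-flash-preview" },
  { provider: "ollama_cloud", model: "kimi-k2.6" },
  { provider: "ollama_cloud", model: "deepseek-v3.2" },
];

async function probe(provider: string, model: string): Promise<string> {
  // Body mirrors crate::v1::ChatRequest as used in escalate_to_overseer; field
  // names are an assumption if the gateway renames them on the HTTP surface.
  const res = await fetch(`${GATEWAY_URL}/v1/chat`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      provider,
      model,
      messages: [{ role: "user", content: "Reply with the single word: pong" }],
      max_tokens: 16,
      stream: false,
    }),
  });
  if (!res.ok) return `HTTP ${res.status}`;
  const body = await res.json();
  // OpenAI-style shape, matching resp.choices / resp.usage in the Rust diff.
  return body?.choices?.[0]?.message?.content?.trim() ?? "(empty)";
}

for (const p of PROBES) {
  console.log(`${p.provider}/${p.model} → ${await probe(p.provider, p.model)}`);
}
```

Anything other than a "pong"-shaped reply (or an HTTP error on kimi-k2:1t, which stays excluded from the replacement table) would indicate the restart did not pick up the new defaults.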