infra: replace gpt-oss with Ollama Pro + OpenCode Zen #13

Merged
profit merged 1 commits from infra/replace-gpt-oss-2026-04-28 into main 2026-05-03 03:39:54 +00:00
6 changed files with 60 additions and 28 deletions
Showing only changes of commit a00e9bb438 - Show all commits

View File

@ -16,12 +16,14 @@ import type { Gap, Proposal } from "./types.ts";
// Phase 44 migration (2026-04-27): bot/propose.ts now flows through
// the gateway's /v1/chat instead of hitting the sidecar's /generate
// directly. /v1/usage tracks the call, Langfuse traces it, observer
// sees it. Same upstream model (CLOUD_MODEL gpt-oss:120b on
// Ollama Cloud) — gateway just owns the routing.
// sees it. Gateway owns the routing.
//
// 2026-04-28: default model switched from gpt-oss:120b to deepseek-v3.2
// via Ollama Pro. deepseek-v3.2 is a recent DeepSeek release that is
// faster and still authenticates with the same OLLAMA_CLOUD_KEY.
const GATEWAY_URL = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const REPO_ROOT = "/home/profit/lakehouse";
const PRD_PATH = `${REPO_ROOT}/docs/PRD.md`;
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "gpt-oss:120b";
const CLOUD_MODEL = process.env.LH_BOT_MODEL ?? "deepseek-v3.2";
const MAX_TOKENS = 6000;
export async function findGaps(): Promise<Gap[]> {

View File

@ -44,7 +44,10 @@ name = "staffing_inference"
# pattern generalizes beyond code review.
preferred_mode = "staffing_inference_lakehouse"
fallback_modes = ["ladder", "consensus", "pipeline"]
default_model = "openai/gpt-oss-120b:free"
# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. kimi-k2.6 is
# coding-specialized and faster than gpt-oss, and it runs on the same
# OLLAMA_CLOUD_KEY, so no extra provider hop is needed.
default_model = "kimi-k2.6"
matrix_corpus = "workers_500k_v8"
[[task_class]]
@ -58,7 +61,9 @@ matrix_corpus = "kb_team_runs_v1"
name = "doc_drift_check"
preferred_mode = "drift"
fallback_modes = ["validator"]
default_model = "gpt-oss:120b"
# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro.
# Speed leader on factual checking, same OLLAMA_CLOUD_KEY.
default_model = "gemini-3-flash-preview"
matrix_corpus = "distilled_factual_v20260423095819"
[[task_class]]

View File

@ -27,10 +27,15 @@ name = "ollama_cloud"
base_url = "https://ollama.com"
auth = "bearer"
auth_env = "OLLAMA_CLOUD_KEY"
default_model = "gpt-oss:120b"
# Cloud-tier Ollama. Key resolved from OLLAMA_CLOUD_KEY env at gateway
# boot. Model-prefix routing: "cloud/<model>" auto-routes here
# (see gateway::v1::resolve_provider).
default_model = "deepseek-v3.2"
# Cloud-tier Ollama (Pro plan as of 2026-04-28). Key resolved from
# OLLAMA_CLOUD_KEY at gateway boot. The Pro upgrade applies to the
# account, so rate limits and model access widen without a key change.
# Model-prefix routing: "cloud/<model>" auto-routes here. The 39-model
# fleet now includes deepseek-v3.2, deepseek-v4-{flash,pro},
# gemini-3-flash-preview, glm-{5,5.1}, kimi-k2.6, and qwen3-coder-next.
# 2026-04-28: default changed from gpt-oss:120b to deepseek-v3.2 (a newer
# DeepSeek revision; kimi-k2:1t is still broken upstream with HTTP 500).
[[provider]]
name = "openrouter"
@ -38,7 +43,7 @@ base_url = "https://openrouter.ai/api/v1"
auth = "bearer"
auth_env = "OPENROUTER_API_KEY"
auth_fallback_files = ["/home/profit/.env", "/root/llm_team_config.json"]
default_model = "openai/gpt-oss-120b:free"
default_model = "x-ai/grok-4.1-fast"
# Multi-provider gateway. Covers Anthropic, Google, OpenAI, MiniMax,
# Qwen, Gemma, etc. Key resolved via crates/gateway/src/v1/openrouter.rs
# resolve_openrouter_key() — env first, then fallback files.

View File

@ -582,10 +582,10 @@ impl ExecutionLoop {
/// Phase 20 step (8) — T3 overseer escalation.
///
/// When the local executor/reviewer loop can't self-correct, call
/// the cloud overseer (`gpt-oss:120b` via Ollama Cloud) with (a)
/// the KB context — recent outcomes + prior corrections for this
/// sig_hash + task_class, across every profile that has run it —
/// and (b) the recent log tail. Its output is appended as a
/// the cloud overseer (`claude-opus-4-7` via OpenCode Zen) with
/// (a) the KB context — recent outcomes + prior corrections for
/// this sig_hash + task_class, across every profile that has run
/// it — and (b) the recent log tail. Its output is appended as a
/// `system` role turn so the next executor generation sees it,
/// AND written to `data/_kb/overseer_corrections.jsonl` so every
/// future profile activation reads from the same learning pool.
@ -593,9 +593,16 @@ impl ExecutionLoop {
/// This is the "pipe to the overviewer" piece from 2026-04-23 —
/// the overseer is now a first-class KB consumer AND producer, not
/// a one-shot correction oracle.
///
/// 2026-04-28: routed through OpenCode (Zen tier) for Claude Opus
/// 4.7. Frontier reasoning matters here because the overseer fires
/// only after local self-correction has failed twice — by that
/// point we need the strongest reasoning available, not the
/// cheapest token. Frequency is low so the Zen pay-per-token cost
/// stays bounded.
async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> {
let Some(cloud_key) = self.state.ollama_cloud_key.clone() else {
return Err("OLLAMA_CLOUD_KEY not configured — skipping escalation".into());
let Some(opencode_key) = self.state.opencode_key.clone() else {
return Err("OPENCODE_API_KEY not configured — skipping escalation".into());
};
let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await;
@ -604,16 +611,18 @@ impl ExecutionLoop {
let started = std::time::Instant::now();
let start_time = chrono::Utc::now();
let chat_req = crate::v1::ChatRequest {
model: "gpt-oss:120b".to_string(),
model: "claude-opus-4-7".to_string(),
messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
temperature: Some(0.1),
max_tokens: None,
stream: Some(false),
think: Some(true), // overseer KEEPS thinking (Phase 20 rule)
provider: Some("ollama_cloud".into()),
// Anthropic models on opencode reject `think` (handled in
// the adapter), but we keep the intent flag for parity.
think: Some(true),
provider: Some("opencode".into()),
};
let resp = crate::v1::ollama_cloud::chat(&cloud_key, &chat_req).await
.map_err(|e| format!("ollama_cloud: {e}"))?;
let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await
.map_err(|e| format!("opencode: {e}"))?;
let latency_ms = started.elapsed().as_millis() as u64;
let end_time = chrono::Utc::now();
let correction_text: String = resp.choices.into_iter().next()
@ -633,8 +642,8 @@ impl ExecutionLoop {
if let Some(lf) = &self.state.langfuse {
use crate::v1::langfuse_trace::ChatTrace;
lf.emit_chat(ChatTrace {
provider: "ollama_cloud".into(),
model: "gpt-oss:120b".into(),
provider: "opencode".into(),
model: "claude-opus-4-7".into(),
input: vec![crate::v1::Message::new_text("user", prompt.clone())],
output: correction_text.clone(),
prompt_tokens: resp.usage.prompt_tokens,
@ -650,7 +659,7 @@ impl ExecutionLoop {
// Append to the transcript so the next executor turn sees it.
self.append(LogEntry::new(
turn, "system", "gpt-oss:120b", "overseer_correction",
turn, "system", "claude-opus-4-7", "overseer_correction",
serde_json::json!({
"reason": reason,
"correction": correction_text,
@ -672,7 +681,7 @@ impl ExecutionLoop {
"task_class": self.req.task_class,
"operation": self.req.operation,
"reason": reason,
"model": "gpt-oss:120b",
"model": "claude-opus-4-7",
"correction": correction_text,
"applied_at_turn": turn,
"kb_context_used": kb,

View File

@ -769,7 +769,7 @@ async function tailOverseerCorrections(): Promise<number> {
try { row = JSON.parse(line); } catch { continue; }
const op: ObservedOp = {
timestamp: row.created_at ?? new Date().toISOString(),
endpoint: `overseer:${row.model ?? "gpt-oss:120b"}`,
endpoint: `overseer:${row.model ?? "claude-opus-4-7"}`,
input_summary: `${row.task_class ?? "?"}: ${row.reason ?? "escalation"}`,
// Correction itself is neither success nor failure — it's a
// mitigation attempt. We mark success=true so analyzeErrors

View File

@ -1143,9 +1143,15 @@ Format each as a code-fenced block with the byte offset within the shard:
EXACT LINE OF SOURCE DO NOT PARAPHRASE, DO NOT TRUNCATE
\`\`\`
Pick the most reviewer-relevant lines: route definitions (e.g. \`@app.route(...)\`), function signatures, security-sensitive calls (auth/SQL/exec/template/secrets), hardcoded credentials/defaults, exception handlers, sensitive imports. The reviewer will REFUSE to act on any claim not backed by a verbatim anchor — so anchors are how you prove findings are real.`;
// 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
// Pro. Tree-split MAP fires once per shard (potentially 5-20×
// per file), so latency dominates total scrum time.
// gemini-3-flash-preview returns shard digests substantially
// faster than the old free gpt-oss:120b model while staying
// strong enough for byte-anchored extraction.
const r = await chat({
provider: "ollama_cloud",
model: "gpt-oss:120b",
model: "gemini-3-flash-preview",
prompt,
max_tokens: 900,
});
@ -1195,9 +1201,14 @@ COPY EVERY anchor block from the piece notes IN ORDER, character-perfect. DO NOT
Output the anchor blocks under their original \`\`\`@offset...\`\`\` fences, each on its own with a blank line between. The reviewer rejects findings that don't quote a string from this anchors block, so completeness here directly determines review quality.`;
// 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama
// Pro. The reducer runs once per file (vs once per shard for MAP)
// but on a much larger context (all shard digests stacked), so
// throughput per token still matters. Same model as MAP for
// consistency in tree-split outputs.
const reduced = await chat({
provider: "ollama_cloud",
model: "gpt-oss:120b",
model: "gemini-3-flash-preview",
prompt: reducePrompt,
max_tokens: 2400,
});