diff --git a/config/modes.toml b/config/modes.toml
index bf7f159..f327f17 100644
--- a/config/modes.toml
+++ b/config/modes.toml
@@ -40,14 +40,13 @@ matrix_corpus = "chicago_permits_v1"
 name = "staffing_inference"
 # Staffing-domain native enrichment runner — Pass 4 (2026-04-26).
 # Same composer architecture as codereview_lakehouse but with staffing
-# framing + workers corpus. Validates that the modes-as-prompt-molders
-# pattern generalizes beyond code review.
+# framing + workers corpus.
 preferred_mode = "staffing_inference_lakehouse"
 fallback_modes = ["ladder", "consensus", "pipeline"]
-# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding-
-# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so
-# no extra provider hop.
-default_model = "kimi-k2.6"
+# 2026-05-03: REVERTED to local. PRD line 70 — everything runs locally,
+# no cloud APIs on the customer hot path. Cloud models stay available
+# in providers.toml for explicit dev-tool opt-in (scrum, auditor).
+default_model = "qwen3.5:latest"
 matrix_corpus = "workers_500k_v8"
 
 [[task_class]]
@@ -61,9 +60,8 @@ matrix_corpus = "kb_team_runs_v1"
 name = "doc_drift_check"
 preferred_mode = "drift"
 fallback_modes = ["validator"]
-# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro.
-# Speed leader on factual checking, same OLLAMA_CLOUD_KEY.
-default_model = "gemini-3-flash-preview"
+# 2026-05-03: REVERTED to local per PRD line 70.
+default_model = "qwen3.5:latest"
 matrix_corpus = "distilled_factual_v20260423095819"
 
 [[task_class]]
diff --git a/crates/gateway/src/execution_loop/mod.rs b/crates/gateway/src/execution_loop/mod.rs
index 4d5d1f3..011f5bc 100644
--- a/crates/gateway/src/execution_loop/mod.rs
+++ b/crates/gateway/src/execution_loop/mod.rs
@@ -605,56 +605,52 @@ impl ExecutionLoop {
     /// cheapest token. Frequency is low so the Zen pay-per-token cost
     /// stays bounded.
     async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> {
-        let Some(opencode_key) = self.state.opencode_key.clone() else {
-            return Err("OPENCODE_API_KEY not configured — skipping escalation".into());
-        };
-
+        // 2026-05-03: REVERTED to local-only per PRD line 70. Cloud
+        // overseer (opencode/claude-opus-4-7) was a recent addition that
+        // moved a hot-path call OFF the local Ollama runtime onto a paid
+        // cloud provider. Reverted to local Ollama (qwen3.5:latest).
+        // Cloud overseer can be re-enabled by setting LH_OVERSEER_CLOUD=1
+        // for development; production stays local.
         let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await;
         let prompt = build_overseer_prompt(&self.req, &kb, &self.log, reason);
 
         let started = std::time::Instant::now();
         let start_time = chrono::Utc::now();
         let chat_req = crate::v1::ChatRequest {
-            model: "claude-opus-4-7".to_string(),
+            model: "qwen3.5:latest".to_string(),
             messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
             temperature: Some(0.1),
             max_tokens: None,
             stream: Some(false),
-            // Anthropic models on opencode reject `think` (handled in
-            // the adapter), but we keep the intent flag for parity.
-            think: Some(true),
-            provider: Some("opencode".into()),
+            think: Some(false),
+            provider: Some("ollama".into()),
         };
-        let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await
-            .map_err(|e| format!("opencode: {e}"))?;
+        let resp = crate::v1::ollama::chat(&self.state.ai_client, &chat_req).await
+            .map_err(|e| format!("ollama overseer: {e}"))?;
         let latency_ms = started.elapsed().as_millis() as u64;
         let end_time = chrono::Utc::now();
         let correction_text: String = resp.choices.into_iter().next()
             .map(|c| c.message.text()).unwrap_or_default();
 
-        // Stamp per-task stats — cloud call counts against the same
-        // usage counter so `/v1/usage` shows cloud token spend too.
         self.stats.requests = self.stats.requests.saturating_add(1);
         self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(resp.usage.prompt_tokens as u64);
         self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(resp.usage.completion_tokens as u64);
         self.stats.total_tokens = self.stats.total_tokens.saturating_add(resp.usage.total_tokens as u64);
         self.stats.latency_ms = self.stats.latency_ms.saturating_add(latency_ms);
 
-        // Langfuse trace for the overseer call (same pipe that feeds
-        // the observer/KB, so this correction's cost lands in the KB
-        // too — closing the loop).
+        // Langfuse trace for the overseer call (local-only now).
         if let Some(lf) = &self.state.langfuse {
             use crate::v1::langfuse_trace::ChatTrace;
             lf.emit_chat(ChatTrace {
-                provider: "opencode".into(),
-                model: "claude-opus-4-7".into(),
+                provider: "ollama".into(),
+                model: "qwen3.5:latest".into(),
                 input: vec![crate::v1::Message::new_text("user", prompt.clone())],
                 output: correction_text.clone(),
                 prompt_tokens: resp.usage.prompt_tokens,
                 completion_tokens: resp.usage.completion_tokens,
                 temperature: Some(0.1),
                 max_tokens: None,
-                think: Some(true),
+                think: Some(false),
                 start_time: start_time.to_rfc3339(),
                 end_time: end_time.to_rfc3339(),
                 latency_ms,
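
The revert comment above mentions an LH_OVERSEER_CLOUD=1 opt-in for development, but this hunk does not show where that gate lives. A minimal sketch of one way it could be wired, assuming the (provider, model) pair is chosen at request-build time; the helper name overseer_target and the hard-coded pairs are illustrative assumptions, not part of the patch:

// Hypothetical sketch, not part of the diff above: one way to wire the
// LH_OVERSEER_CLOUD=1 development opt-in mentioned in the revert comment.
// The helper name and the (provider, model) pairs are illustrative.
fn overseer_target() -> (&'static str, &'static str) {
    // Production defaults to the local Ollama runtime (PRD line 70);
    // the cloud overseer is an explicit, development-only opt-in.
    match std::env::var("LH_OVERSEER_CLOUD").as_deref() {
        Ok("1") => ("opencode", "claude-opus-4-7"),
        _ => ("ollama", "qwen3.5:latest"),
    }
}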