REVERT cloud routing on hot path — back to local Ollama per PRD line 70
PRD line 70: "Everything runs locally — no cloud APIs, total data privacy."
Yesterday's PR #13 (feb638e) violated this by routing customer-facing
inference paths to opencode + ollama_cloud + openrouter. Reverting the
hot-path routes only; cloud providers stay configured in providers.toml for
explicit dev-tool opt-in.

Reverted:
- modes.toml staffing_inference: kimi-k2.6 → qwen3.5:latest (local Ollama)
- modes.toml doc_drift_check: gemini-3-flash-preview → qwen3.5:latest
- execution_loop overseer: opencode/claude-opus-4-7 → ollama/qwen3.5:latest
  Was a paid Anthropic call on every overseer escalation; now local + free.

Gateway compiles + restarts clean. Lance smoke 10/10. Live providers list
unchanged (kimi/ollama_cloud/opencode/openrouter all still CONFIGURED; they
just aren't ROUTED to from the staffing inference path anymore). This stops
the API meter on customer requests. Cloud providers remain opt-in via
explicit provider= caller hint, which the scrum tool + auditor pipeline +
bot/propose use deliberately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit d054c0b8b1
parent 0c74b82fc8
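For context on the explicit provider= caller hint mentioned in the commit message: the ChatRequest in the diff below carries an optional provider field, and the revert leaves the hot path on local Ollama unless a caller sets that hint. A minimal sketch of the intended shape, under stated assumptions: pick_route and RouteTarget are hypothetical names for illustration, not code from this commit.

// Illustrative only: the real gateway applies the hint inside its own
// request handler; only the provider/model strings come from the diff.

#[derive(Debug)]
struct RouteTarget {
    provider: String,
    model: String,
}

/// Resolve a caller-supplied provider hint. No hint (the customer hot path)
/// stays on local Ollama per PRD line 70; dev tools opt in explicitly.
fn pick_route(provider_hint: Option<&str>, requested_model: Option<&str>) -> RouteTarget {
    match provider_hint {
        Some(p) if matches!(p, "opencode" | "ollama_cloud" | "openrouter") => RouteTarget {
            provider: p.to_string(),
            // Cloud callers (scrum, auditor, bot/propose) name their own model.
            model: requested_model.unwrap_or("kimi-k2.6").to_string(),
        },
        _ => RouteTarget {
            provider: "ollama".to_string(),
            model: "qwen3.5:latest".to_string(),
        },
    }
}

fn main() {
    // Customer request: no hint, so it never leaves the local runtime.
    let hot = pick_route(None, None);
    assert_eq!(hot.provider, "ollama");

    // Dev tool opting into a cloud model deliberately.
    let dev = pick_route(Some("opencode"), Some("claude-opus-4-7"));
    println!("hot: {hot:?}\ndev: {dev:?}");
}

Customer-facing requests never pass a hint, so they resolve to ollama/qwen3.5:latest; the dev tools pass one on purpose, which is why the providers stay configured even though nothing on the hot path routes to them.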
@@ -40,14 +40,13 @@ matrix_corpus = "chicago_permits_v1"
 name = "staffing_inference"
 # Staffing-domain native enrichment runner — Pass 4 (2026-04-26).
 # Same composer architecture as codereview_lakehouse but with staffing
-# framing + workers corpus. Validates that the modes-as-prompt-molders
-# pattern generalizes beyond code review.
+# framing + workers corpus.
 preferred_mode = "staffing_inference_lakehouse"
 fallback_modes = ["ladder", "consensus", "pipeline"]
-# 2026-04-28: gpt-oss-120b:free → kimi-k2.6 via Ollama Pro. Coding-
-# specialized, faster than gpt-oss, on the same OLLAMA_CLOUD_KEY so
-# no extra provider hop.
-default_model = "kimi-k2.6"
+# 2026-05-03: REVERTED to local. PRD line 70 — everything runs locally,
+# no cloud APIs on the customer hot path. Cloud models stay available
+# in providers.toml for explicit dev-tool opt-in (scrum, auditor).
+default_model = "qwen3.5:latest"
 matrix_corpus = "workers_500k_v8"

 [[task_class]]
@@ -61,9 +60,8 @@ matrix_corpus = "kb_team_runs_v1"
 name = "doc_drift_check"
 preferred_mode = "drift"
 fallback_modes = ["validator"]
-# 2026-04-28: gpt-oss:120b → gemini-3-flash-preview via Ollama Pro.
-# Speed leader on factual checking, same OLLAMA_CLOUD_KEY.
-default_model = "gemini-3-flash-preview"
+# 2026-05-03: REVERTED to local per PRD line 70.
+default_model = "qwen3.5:latest"
 matrix_corpus = "distilled_factual_v20260423095819"

 [[task_class]]
@@ -605,56 +605,52 @@ impl ExecutionLoop {
     /// cheapest token. Frequency is low so the Zen pay-per-token cost
     /// stays bounded.
     async fn escalate_to_overseer(&mut self, turn: u32, reason: &str) -> Result<(), String> {
-        let Some(opencode_key) = self.state.opencode_key.clone() else {
-            return Err("OPENCODE_API_KEY not configured — skipping escalation".into());
-        };
-
+        // 2026-05-03: REVERTED to local-only per PRD line 70. Cloud
+        // overseer (opencode/claude-opus-4-7) was a recent addition that
+        // moved a hot-path call OFF the local Ollama runtime onto a paid
+        // cloud provider. Reverted to local Ollama (qwen3.5:latest).
+        // Cloud overseer can be re-enabled by setting LH_OVERSEER_CLOUD=1
+        // for development; production stays local.
         let kb = KbContext::load_for(&sig_hash(&self.req), &self.req.task_class).await;
         let prompt = build_overseer_prompt(&self.req, &kb, &self.log, reason);

         let started = std::time::Instant::now();
         let start_time = chrono::Utc::now();
         let chat_req = crate::v1::ChatRequest {
-            model: "claude-opus-4-7".to_string(),
+            model: "qwen3.5:latest".to_string(),
             messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
             temperature: Some(0.1),
             max_tokens: None,
             stream: Some(false),
-            // Anthropic models on opencode reject `think` (handled in
-            // the adapter), but we keep the intent flag for parity.
-            think: Some(true),
-            provider: Some("opencode".into()),
+            think: Some(false),
+            provider: Some("ollama".into()),
         };
-        let resp = crate::v1::opencode::chat(&opencode_key, &chat_req).await
-            .map_err(|e| format!("opencode: {e}"))?;
+        let resp = crate::v1::ollama::chat(&self.state.ai_client, &chat_req).await
+            .map_err(|e| format!("ollama overseer: {e}"))?;
         let latency_ms = started.elapsed().as_millis() as u64;
         let end_time = chrono::Utc::now();
         let correction_text: String = resp.choices.into_iter().next()
             .map(|c| c.message.text()).unwrap_or_default();

-        // Stamp per-task stats — cloud call counts against the same
-        // usage counter so `/v1/usage` shows cloud token spend too.
         self.stats.requests = self.stats.requests.saturating_add(1);
         self.stats.prompt_tokens = self.stats.prompt_tokens.saturating_add(resp.usage.prompt_tokens as u64);
         self.stats.completion_tokens = self.stats.completion_tokens.saturating_add(resp.usage.completion_tokens as u64);
         self.stats.total_tokens = self.stats.total_tokens.saturating_add(resp.usage.total_tokens as u64);
         self.stats.latency_ms = self.stats.latency_ms.saturating_add(latency_ms);

-        // Langfuse trace for the overseer call (same pipe that feeds
-        // the observer/KB, so this correction's cost lands in the KB
-        // too — closing the loop).
+        // Langfuse trace for the overseer call (local-only now).
         if let Some(lf) = &self.state.langfuse {
             use crate::v1::langfuse_trace::ChatTrace;
             lf.emit_chat(ChatTrace {
-                provider: "opencode".into(),
-                model: "claude-opus-4-7".into(),
+                provider: "ollama".into(),
+                model: "qwen3.5:latest".into(),
                 input: vec![crate::v1::Message::new_text("user", prompt.clone())],
                 output: correction_text.clone(),
                 prompt_tokens: resp.usage.prompt_tokens,
                 completion_tokens: resp.usage.completion_tokens,
                 temperature: Some(0.1),
                 max_tokens: None,
-                think: Some(true),
+                think: Some(false),
                 start_time: start_time.to_rfc3339(),
                 end_time: end_time.to_rfc3339(),
                 latency_ms,
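The overseer comment in the diff above says the cloud overseer can be re-enabled with LH_OVERSEER_CLOUD=1 for development, but the gate itself is not shown in this hunk (the reverted call hardcodes ollama/qwen3.5:latest). A minimal sketch of how such a switch could look, assuming only the env var name from the comment; overseer_route is a hypothetical helper, not part of this commit.

/// Hypothetical helper: choose the overseer (provider, model) pair,
/// defaulting to local Ollama per PRD line 70.
fn overseer_route() -> (&'static str, &'static str) {
    match std::env::var("LH_OVERSEER_CLOUD").as_deref() {
        // Explicit development opt-in re-enables the cloud overseer.
        Ok("1") => ("opencode", "claude-opus-4-7"),
        // Production and unset/other values stay on the local runtime.
        _ => ("ollama", "qwen3.5:latest"),
    }
}

fn main() {
    let (provider, model) = overseer_route();
    println!("overseer escalation via {provider}/{model}");
}

In the real code the chosen pair would feed the model and provider fields of the ChatRequest shown in the diff.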