From 8cbbd0ef70eb4ba839fdd930cd471236fc63303c Mon Sep 17 00:00:00 2001 From: profit Date: Wed, 22 Apr 2026 02:50:09 -0500 Subject: [PATCH] Phase 38 fix: default think=false on /v1/chat Live-test caught the Phase 21 thinking-model trap on first call. qwen3.5 with max_tokens=50 and default think behavior burned all 50 tokens on hidden reasoning; visible content was "". completion_tokens exactly matching max_tokens was the tell. Adapter now defaults think: Some(false) matching scenario.ts hot-path discipline. Callers that want reasoning (overseers, T3+) opt in via a non-OpenAI `think: true` extension field on the request. Verified end-to-end after restart: - "Lakehouse supports ACID and raw data." (5 words, 516ms) - "tokio\nasync-std\nsmol" (3 Rust crates, 391ms) - /v1/usage accumulates across calls (2 req / 95 total tokens) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/gateway/src/v1/mod.rs | 8 ++++++++ crates/gateway/src/v1/ollama.rs | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index ca18487..cba16a2 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -68,6 +68,14 @@ pub struct ChatRequest { /// Phase 39+ wires real streaming. #[serde(default)] pub stream: Option<bool>, + /// Non-OpenAI extension. Passes through to the provider's thinking + /// toggle. Default: **false** — hot-path discipline for thinking + /// models (qwen3.5, qwen3, gpt-oss) that otherwise burn the token + /// budget on hidden reasoning before visible output starts, + /// producing empty responses. Set true explicitly when calling an + /// overseer / reasoning-heavy path. 
+ #[serde(default)] + pub think: Option<bool>, } #[derive(Serialize)] diff --git a/crates/gateway/src/v1/ollama.rs b/crates/gateway/src/v1/ollama.rs index c768960..d7a5e4d 100644 --- a/crates/gateway/src/v1/ollama.rs +++ b/crates/gateway/src/v1/ollama.rs @@ -20,11 +20,14 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result