From 8cbbd0ef70eb4ba839fdd930cd471236fc63303c Mon Sep 17 00:00:00 2001 From: profit Date: Wed, 22 Apr 2026 02:50:09 -0500 Subject: [PATCH] Phase 38 fix: default think=false on /v1/chat Live-test caught the Phase 21 thinking-model trap on first call. qwen3.5 with max_tokens=50 and default think behavior burned all 50 tokens on hidden reasoning; visible content was "". completion_tokens exactly matching max_tokens was the tell. Adapter now defaults think: Some(false) matching scenario.ts hot-path discipline. Callers that want reasoning (overseers, T3+) opt in via a non-OpenAI `think: true` extension field on the request. Verified end-to-end after restart: - "Lakehouse supports ACID and raw data." (5 words, 516ms) - "tokio\nasync-std\nsmol" (3 Rust crates, 391ms) - /v1/usage accumulates across calls (2 req / 95 total tokens) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/gateway/src/v1/mod.rs | 8 ++++++++ crates/gateway/src/v1/ollama.rs | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index ca18487..cba16a2 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -68,6 +68,14 @@ pub struct ChatRequest { /// Phase 39+ wires real streaming. #[serde(default)] pub stream: Option<bool>, + /// Non-OpenAI extension. Passes through to the provider's thinking + /// toggle. Default: **false** — hot-path discipline for thinking + /// models (qwen3.5, qwen3, gpt-oss) that otherwise burn the token + /// budget on hidden reasoning before visible output starts, + /// producing empty responses. Set true explicitly when calling an + /// overseer / reasoning-heavy path. 
+ #[serde(default)] + pub think: Option<bool>, } #[derive(Serialize)] diff --git a/crates/gateway/src/v1/ollama.rs b/crates/gateway/src/v1/ollama.rs index c768960..d7a5e4d 100644 --- a/crates/gateway/src/v1/ollama.rs +++ b/crates/gateway/src/v1/ollama.rs @@ -20,11 +20,14 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result