2026-04-23 03:28:33 +00:00
1 changed files with 14 additions and 11 deletions
--- a/auditor/checks/inference.ts
+++ b/auditor/checks/inference.ts
@@ -112,19 +112,22 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
-        // Deterministic classification mode — temp=0 is greedy-sample,
-        // so identical input → identical output on the same model
-        // version. think=false disables the reasoning trace that was
-        // letting variable prose leak into the classification output
-        // and inflate the audit_lessons signature set (observed as
-        // sig_count creep across the 9-run empirical test).
+        // Deterministic classification — temp=0 is greedy-sample, so
+        // identical input yields identical output on the same model
+        // version. This kills the signature creep we observed in the
+        // 9-run empirical test (sig_count 16→27 from cloud phrasing
+        // variance at temp=0.2).
        //
-        // max_tokens tightened to 1500 — the structured JSON response
-        // fits comfortably in 1500 tokens for typical PRs (~7 claims);
-        // the old 3000 just gave the model room to wander.
-        max_tokens: 1500,
+        // IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
+        // model; setting think=false caused it to return empty content
+        // on large prompts (observed during Level 1 validation: 13421
+        // tokens used, empty content returned). The reasoning trace is
+        // variable prose, but at temp=0 the FINAL classification is
+        // still deterministic because greedy sampling converges to
+        // the same conclusion from the same starting state.
+        max_tokens: 3000,
        temperature: 0,
-        think: false,
+        think: true,
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });