diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts index d3a0bde..5cddbc7 100644 --- a/auditor/checks/inference.ts +++ b/auditor/checks/inference.ts @@ -112,9 +112,19 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise< { role: "system", content: systemMsg }, { role: "user", content: userMsg }, ], - max_tokens: 3000, - temperature: 0.2, - think: true, // T3 overseer should reason — JSON shape is still required + // Deterministic classification mode — temperature 0 is greedy (argmax) decoding, + // so identical input should yield identical output on the same model and serving + // stack (NOTE: some backends can still vary under batching). think=false disables + // the reasoning trace that was letting variable prose leak into the classification + // output and inflate the audit_lessons signature set (observed as + // sig_count creep across the 9-run empirical test). + // + // max_tokens tightened to 1500 — the structured JSON response + // fits comfortably in 1500 tokens for typical PRs (~7 claims); + // the old 3000 just gave the model room to wander. + max_tokens: 1500, + temperature: 0, + think: false, }), signal: AbortSignal.timeout(CALL_TIMEOUT_MS), }); diff --git a/tests/real-world/nine_consecutive_audits.ts b/tests/real-world/nine_consecutive_audits.ts index 999a9de..21255d0 100644 --- a/tests/real-world/nine_consecutive_audits.ts +++ b/tests/real-world/nine_consecutive_audits.ts @@ -28,7 +28,7 @@ const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`; const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`; const POLL_INTERVAL_MS = 5_000; const AUDIT_TIMEOUT_MS = 180_000; -const RUNS = 9; +const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9); const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8); async function sh(cmd: string): Promise<{ stdout: string; stderr: string; code: number }> {