root a934a76988
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
aibridge: delete deprecated estimate_tokens wrapper — fully migrated
cdc24d8 migrated all 5 call sites to shared::model_matrix::ModelMatrix.
Grep across the workspace confirms zero remaining callers (only doc
comments in the new module reference the old name). Wrapper was there
to smooth the transition; transition is done.

Leaves a 3-line breadcrumb comment pointing to the new location so
anyone opening this file sees the migration history. The deprecated
wrapper itself is 4 lines deleted.

Workspace warnings still at 0 (both lib + tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 06:38:01 -05:00

189 lines
7.3 KiB
Rust

//! Phase 21 — context-budget accounting for model calls.
//!
//! Ports `assertContextBudget` + `estimateTokens` + `CONTEXT_WINDOWS`
//! from `tests/multi-agent/agent.ts` so Rust-side callers (gateway
//! tool surfaces, future Rust agents) get the same loud-fail behavior
//! on window overflow instead of silent truncation.
//!
//! The token estimator is deliberately the same chars/4 heuristic as
//! the TS side. It's biased ~15% safe — pessimistic on English, correct
//! within a factor of 2 on code. Swap to a provider tokenizer only when
//! the estimator drives a decision (we're nowhere near that yet).
use std::collections::HashMap;
use std::sync::OnceLock;
// `estimate_tokens` moved to `shared::model_matrix::ModelMatrix::estimate_tokens`
// (cdc24d8). All callers migrated; the deprecated wrapper that stood in its
// place has been removed since it had zero external consumers.
/// Phase 21 — fallback context window, in tokens, returned by
/// `context_window_for` for any model that is absent from the per-model
/// table. That table mirrors the TS table in
/// `tests/multi-agent/agent.ts` and is anchored on each model's
/// documented max.
pub const DEFAULT_CONTEXT_WINDOW: usize = 32_768;
/// Safety margin, in tokens, that `assert_context_budget` subtracts from
/// the window when `BudgetOpts::safety_margin` is `None`.
pub const DEFAULT_SAFETY_MARGIN: usize = 2_000;
/// Token count that `assert_context_budget` adds to the estimate in place
/// of `BudgetOpts::max_tokens` when that field is `None`.
pub const DEFAULT_MAX_TOKENS: usize = 800;
/// Lookup table of context-window sizes keyed by exact model name.
///
/// The map is built on the first call and the same instance is handed
/// back on every later call.
fn known_windows() -> &'static HashMap<&'static str, usize> {
    static WINDOWS: OnceLock<HashMap<&'static str, usize>> = OnceLock::new();
    WINDOWS.get_or_init(|| {
        // (model name, context window) pairs; collected into the map once.
        const ENTRIES: &[(&str, usize)] = &[
            ("mistral:latest", 32_768),
            ("qwen2.5:latest", 32_768),
            ("qwen3:latest", 40_960),
            ("qwen3.5:latest", 262_144),
            ("qwen3-embedding", 32_768),
            ("nomic-embed-text-v2-moe", 2_048),
            ("gpt-oss:20b", 131_072),
            ("gpt-oss:120b", 131_072),
            ("qwen3.5:397b", 131_072),
            ("kimi-k2-thinking", 200_000),
            ("kimi-k2.6", 200_000),
            ("kimi-k2:1t", 1_048_576),
            ("deepseek-v3.1:671b", 131_072),
            ("glm-4.7", 131_072),
        ];
        ENTRIES.iter().copied().collect()
    })
}
/// Returns the context window for `model`.
///
/// The name must match a table entry exactly; any other name yields
/// `DEFAULT_CONTEXT_WINDOW`.
pub fn context_window_for(model: &str) -> usize {
    match known_windows().get(model) {
        Some(&tokens) => tokens,
        None => DEFAULT_CONTEXT_WINDOW,
    }
}
/// Result of a budget check — exposes the numbers so callers can log
/// how much headroom remains without re-running the estimator.
///
/// Derives `PartialEq`/`Eq` so results (including the
/// `(BudgetCheck, usize)` error payload of `assert_context_budget`) can
/// be compared directly, e.g. with `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BudgetCheck {
    /// Estimated tokens for the call: prompt + system prompt + the
    /// `max_tokens` allowance.
    pub estimated: usize,
    /// Context window the estimate was checked against.
    pub window: usize,
    /// `window - estimated - safety margin`; negative when the call
    /// would overflow the window.
    pub remaining: i64,
}
/// Inputs to `assert_context_budget`. `bypass` exists for call sites
/// that handle their own overflow (continuation's second pass already
/// counted the partial; T5 gatekeeper prompts have a separate policy).
///
/// Every field has a usable default, so callers normally set only what
/// they need and fill the rest with `..Default::default()`.
#[derive(Debug, Clone, Default)]
pub struct BudgetOpts<'a> {
/// Optional system prompt; when present its estimated tokens are added
/// to the total, when `None` it contributes 0.
pub system: Option<&'a str>,
/// Token count added to the estimate for the call; `None` means
/// `DEFAULT_MAX_TOKENS`.
pub max_tokens: Option<usize>,
/// Tokens subtracted from the window as headroom; `None` means
/// `DEFAULT_SAFETY_MARGIN`.
pub safety_margin: Option<usize>,
/// When `true`, `assert_context_budget` returns `Ok` even if the
/// computed `remaining` is negative.
pub bypass: bool,
}
/// Phase 21's loud-fail primitive. Returns a `BudgetCheck` on success
/// and the same struct plus over-by count on failure. The whole point
/// is to stop silent truncation — callers that expect overflow should
/// chunk BEFORE calling or set `bypass: true`.
///
/// The estimate is `estimate_tokens(prompt) + estimate_tokens(system) +
/// max_tokens`; the call fits when `estimate + safety_margin` does not
/// exceed the model's window (an exact fit, `remaining == 0`, passes).
///
/// All sums saturate instead of wrapping, so an absurdly large
/// `max_tokens` or `safety_margin` is reported as an overflow rather
/// than panicking (debug) or wrapping around to a small number and
/// silently passing (release). `remaining` is clamped to the `i64` range.
///
/// # Errors
///
/// Returns `Err((check, over_by))` when the budget is exceeded and
/// `opts.bypass` is `false`; `over_by` is the number of tokens by which
/// `estimate + safety_margin` exceeds the window. With `bypass: true` the
/// same `check` (negative `remaining`) is returned as `Ok`.
pub fn assert_context_budget(
    model: &str,
    prompt: &str,
    opts: BudgetOpts,
) -> Result<BudgetCheck, (BudgetCheck, usize)> {
    let window = context_window_for(model);
    let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN);
    let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS);
    let sys_tokens = opts
        .system
        .map(shared::model_matrix::ModelMatrix::estimate_tokens)
        .unwrap_or(0);

    // Saturate: a wrapped sum would look tiny and defeat the check.
    let estimated = shared::model_matrix::ModelMatrix::estimate_tokens(prompt)
        .saturating_add(sys_tokens)
        .saturating_add(max_tokens);
    let required = estimated.saturating_add(safety);

    // Lossless for any realistic token count; clamps instead of wrapping
    // when a value does not fit in i64.
    let clamp = |tokens: usize| tokens.min(i64::MAX as usize) as i64;

    // Compare in unsigned space first so neither branch can underflow.
    let (remaining, over_by) = if required > window {
        let over = required - window;
        (-clamp(over), over)
    } else {
        (clamp(window - required), 0)
    };

    let check = BudgetCheck { estimated, window, remaining };
    if over_by > 0 && !opts.bypass {
        return Err((check, over_by));
    }
    Ok(check)
}
/// Convenience — format an overflow error the same way the TS side
/// does. Exposed so downstream crates render consistent messages.
pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety: usize) -> String {
format!(
"context overflow: model={} est={}t window={}t safety={}t over={}t. \
Chunk the prompt (see config/models.json overflow_policies) or set \
bypass:true if you know the risk.",
model, check.estimated, check.window, safety, over_by,
)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Token-estimator behavior is canonically tested in
    // crates/shared/src/model_matrix.rs. The legacy pin for the old
    // `estimate_tokens` wrapper used to live here and went away with the
    // wrapper; the tests below cover window lookup and budget accounting
    // only, and avoid depending on exact estimator output.

    /// Known models resolve to their table entry; anything else falls back.
    #[test]
    fn context_window_known_and_fallback() {
        assert_eq!(context_window_for("qwen3.5:latest"), 262_144);
        assert_eq!(context_window_for("kimi-k2:1t"), 1_048_576);
        assert_eq!(context_window_for("some-unreleased-model"), DEFAULT_CONTEXT_WINDOW);
    }

    /// A small prompt leaves plenty of headroom.
    #[test]
    fn budget_passes_well_under_window() {
        let check = assert_context_budget(
            "qwen3:latest",
            &"x".repeat(4_000), // ~1000 tokens
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        )
        .expect("well under 40K window");
        assert!(check.remaining > 30_000);
    }

    /// An oversized prompt is rejected, and the reported overage matches
    /// the negative `remaining` in the accompanying check.
    #[test]
    fn budget_fails_when_prompt_overflows_window() {
        let huge = "x".repeat(200_000); // ~50K tokens, over qwen3's 40K
        let err = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts::default(),
        )
        .expect_err("should overflow qwen3's 40K window");
        assert!(err.1 > 0, "over_by must be positive");
        assert!(err.0.remaining < 0, "failed check must report negative remaining");
        assert_eq!(err.1 as i64, -err.0.remaining, "over_by must mirror remaining");
    }

    /// `bypass` suppresses the error but not the numbers.
    #[test]
    fn budget_bypass_returns_ok_even_over() {
        let huge = "x".repeat(200_000);
        let check = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts { bypass: true, ..Default::default() },
        )
        .expect("bypass must suppress the error");
        assert!(check.remaining < 0, "check still reports negative remaining");
    }

    /// An exact fit (`remaining == 0`) passes; one token more fails by
    /// exactly one. The safety margin is derived from a first measurement
    /// so the test does not depend on the estimator's exact output.
    #[test]
    fn budget_boundary_exact_fit_passes_one_over_fails() {
        let prompt = "p".repeat(4_000);
        let base = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { safety_margin: Some(0), ..Default::default() },
        )
        .expect("small prompt fits with no safety margin");
        assert!(base.remaining > 0, "need positive headroom to probe the boundary");
        let headroom = base.remaining as usize;

        let exact = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { safety_margin: Some(headroom), ..Default::default() },
        )
        .expect("exact fit must pass");
        assert_eq!(exact.remaining, 0);

        let over = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { safety_margin: Some(headroom + 1), ..Default::default() },
        )
        .expect_err("one token over must fail");
        assert_eq!(over.1, 1);
        assert_eq!(over.0.remaining, -1);
    }

    /// The system prompt contributes exactly its own estimate.
    #[test]
    fn budget_counts_system_prompt() {
        // 10K-char system prompt → ~2500 tokens. With a big max_tokens
        // this should push us closer to the window.
        let sys = "s".repeat(10_000);
        let prompt = "p".repeat(4_000);
        let with_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts {
                system: Some(&sys),
                max_tokens: Some(500),
                ..Default::default()
            },
        )
        .unwrap();
        let without_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        )
        .unwrap();
        assert!(
            with_sys.estimated > without_sys.estimated,
            "system prompt should raise estimate"
        );
        assert_eq!(
            with_sys.estimated - without_sys.estimated,
            shared::model_matrix::ModelMatrix::estimate_tokens(&sys)
        );
    }

    /// The rendered message carries the model name and the key numbers.
    #[test]
    fn overflow_message_includes_numbers() {
        let check = BudgetCheck { estimated: 42_000, window: 40_960, remaining: -1_040 };
        let msg = overflow_message("qwen3:latest", &check, 3_040, 2_000);
        assert!(msg.contains("qwen3:latest"));
        assert!(msg.contains("42000t"));
        assert!(msg.contains("40960t"));
        assert!(msg.contains("3040t"));
    }
}