//! Phase 21 — context-budget accounting for model calls. //! //! Ports `assertContextBudget` + `estimateTokens` + `CONTEXT_WINDOWS` //! from `tests/multi-agent/agent.ts` so Rust-side callers (gateway //! tool surfaces, future Rust agents) get the same loud-fail behavior //! on window overflow instead of silent truncation. //! //! The token estimator is deliberately the same chars/4 heuristic as //! the TS side. It's biased ~15% safe — pessimistic on English, correct //! within a factor of 2 on code. Swap to a provider tokenizer only when //! the estimator drives a decision (we're nowhere near that yet). use std::collections::HashMap; use std::sync::OnceLock; // `estimate_tokens` moved to `shared::model_matrix::ModelMatrix::estimate_tokens` // (cdc24d8). All callers migrated; the deprecated wrapper that stood in its // place has been removed since it had zero external consumers. /// Phase 21 — per-model context windows, mirroring the TS table in /// `tests/multi-agent/agent.ts`. Anchored on each model's documented /// max; unknown models fall back to `DEFAULT_CONTEXT_WINDOW`. pub const DEFAULT_CONTEXT_WINDOW: usize = 32_768; pub const DEFAULT_SAFETY_MARGIN: usize = 2_000; pub const DEFAULT_MAX_TOKENS: usize = 800; fn known_windows() -> &'static HashMap<&'static str, usize> { static TABLE: OnceLock> = OnceLock::new(); TABLE.get_or_init(|| { let mut m = HashMap::new(); m.insert("mistral:latest", 32_768); m.insert("qwen2.5:latest", 32_768); m.insert("qwen3:latest", 40_960); m.insert("qwen3.5:latest", 262_144); m.insert("qwen3-embedding", 32_768); m.insert("nomic-embed-text-v2-moe", 2_048); m.insert("gpt-oss:20b", 131_072); m.insert("gpt-oss:120b", 131_072); m.insert("qwen3.5:397b", 131_072); m.insert("kimi-k2-thinking", 200_000); m.insert("kimi-k2.6", 200_000); m.insert("kimi-k2:1t", 1_048_576); m.insert("deepseek-v3.1:671b", 131_072); m.insert("glm-4.7", 131_072); m }) } pub fn context_window_for(model: &str) -> usize { known_windows().get(model).copied().unwrap_or(DEFAULT_CONTEXT_WINDOW) } /// Result of a budget check — exposes the numbers so callers can log /// how much headroom remains without re-running the estimator. #[derive(Debug, Clone, Copy)] pub struct BudgetCheck { pub estimated: usize, pub window: usize, pub remaining: i64, } /// Inputs to `assert_context_budget`. `bypass` exists for call sites /// that handle their own overflow (continuation's second pass already /// counted the partial; T5 gatekeeper prompts have a separate policy). #[derive(Debug, Clone, Default)] pub struct BudgetOpts<'a> { pub system: Option<&'a str>, pub max_tokens: Option, pub safety_margin: Option, pub bypass: bool, } /// Phase 21's loud-fail primitive. Returns a `BudgetCheck` on success /// and the same struct plus over-by count on failure. The whole point /// is to stop silent truncation — callers that expect overflow should /// chunk BEFORE calling or set `bypass: true`. pub fn assert_context_budget( model: &str, prompt: &str, opts: BudgetOpts, ) -> Result { let window = context_window_for(model); let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN); let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS); let sys_tokens = opts.system.map(shared::model_matrix::ModelMatrix::estimate_tokens).unwrap_or(0); let estimated = shared::model_matrix::ModelMatrix::estimate_tokens(prompt) + sys_tokens + max_tokens; let remaining = window as i64 - estimated as i64 - safety as i64; let check = BudgetCheck { estimated, window, remaining }; if remaining < 0 && !opts.bypass { return Err((check, (-remaining) as usize)); } Ok(check) } /// Convenience — format an overflow error the same way the TS side /// does. Exposed so downstream crates render consistent messages. pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety: usize) -> String { format!( "context overflow: model={} est={}t window={}t safety={}t over={}t. \ Chunk the prompt (see config/models.json overflow_policies) or set \ bypass:true if you know the risk.", model, check.estimated, check.window, safety, over_by, ) } #[cfg(test)] mod tests { use super::*; // Deprecated-function behavior is now canonically tested in // crates/shared/src/model_matrix.rs. This test was the legacy // pin that preceded the migration; delete when the deprecated // wrapper itself goes (see the #[deprecated] attribute). #[test] fn context_window_known_and_fallback() { assert_eq!(context_window_for("qwen3.5:latest"), 262_144); assert_eq!(context_window_for("kimi-k2:1t"), 1_048_576); assert_eq!(context_window_for("some-unreleased-model"), DEFAULT_CONTEXT_WINDOW); } #[test] fn budget_passes_well_under_window() { let check = assert_context_budget( "qwen3:latest", &"x".repeat(4_000), // ~1000 tokens BudgetOpts { max_tokens: Some(500), ..Default::default() }, ).expect("well under 40K window"); assert!(check.remaining > 30_000); } #[test] fn budget_fails_when_prompt_overflows_window() { let huge = "x".repeat(200_000); // ~50K tokens, over qwen3's 40K let err = assert_context_budget( "qwen3:latest", &huge, BudgetOpts::default(), ).expect_err("should overflow qwen3's 40K window"); assert!(err.1 > 0, "over_by must be positive"); } #[test] fn budget_bypass_returns_ok_even_over() { let huge = "x".repeat(200_000); let check = assert_context_budget( "qwen3:latest", &huge, BudgetOpts { bypass: true, ..Default::default() }, ).expect("bypass must suppress the error"); assert!(check.remaining < 0, "check still reports negative remaining"); } #[test] fn budget_counts_system_prompt() { // 10K-char system prompt → ~2500 tokens. With a big max_tokens // this should push us closer to the window. let sys = "s".repeat(10_000); let prompt = "p".repeat(4_000); let with_sys = assert_context_budget( "qwen3:latest", &prompt, BudgetOpts { system: Some(&sys), max_tokens: Some(500), ..Default::default() }, ).unwrap(); let without_sys = assert_context_budget( "qwen3:latest", &prompt, BudgetOpts { max_tokens: Some(500), ..Default::default() }, ).unwrap(); assert!(with_sys.estimated > without_sys.estimated, "system prompt should raise estimate"); assert_eq!(with_sys.estimated - without_sys.estimated, shared::model_matrix::ModelMatrix::estimate_tokens(&sys)); } #[test] fn overflow_message_includes_numbers() { let check = BudgetCheck { estimated: 42_000, window: 40_960, remaining: -1_040 }; let msg = overflow_message("qwen3:latest", &check, 3_040, 2_000); assert!(msg.contains("qwen3:latest")); assert!(msg.contains("42000t")); assert!(msg.contains("40960t")); assert!(msg.contains("3040t")); } }