Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
cdc24d8 migrated all 5 call sites to shared::model_matrix::ModelMatrix. Grep across the workspace confirms zero remaining callers (only doc comments in the new module reference the old name). Wrapper was there to smooth the transition; transition is done. Leaves a 3-line breadcrumb comment pointing to the new location so anyone opening this file sees the migration history. The deprecated wrapper itself is 4 lines deleted. Workspace warnings still at 0 (both lib + tests). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
189 lines
7.3 KiB
Rust
189 lines
7.3 KiB
Rust
//! Phase 21 — context-budget accounting for model calls.
|
|
//!
|
|
//! Ports `assertContextBudget` + `estimateTokens` + `CONTEXT_WINDOWS`
|
|
//! from `tests/multi-agent/agent.ts` so Rust-side callers (gateway
|
|
//! tool surfaces, future Rust agents) get the same loud-fail behavior
|
|
//! on window overflow instead of silent truncation.
|
|
//!
|
|
//! The token estimator is deliberately the same chars/4 heuristic as
|
|
//! the TS side. It's biased ~15% safe — pessimistic on English, correct
|
|
//! within a factor of 2 on code. Swap to a provider tokenizer only when
|
|
//! the estimator drives a decision (we're nowhere near that yet).
|
|
|
|
use std::collections::HashMap;
|
|
use std::sync::OnceLock;
|
|
|
|
// `estimate_tokens` moved to `shared::model_matrix::ModelMatrix::estimate_tokens`
|
|
// (cdc24d8). All callers migrated; the deprecated wrapper that stood in its
|
|
// place has been removed since it had zero external consumers.
|
|
|
|
/// Phase 21 — fallback context window, in tokens, for any model that has
/// no entry in the per-model table (the table mirrors the TS one in
/// `tests/multi-agent/agent.ts` and is anchored on each model's
/// documented max).
pub const DEFAULT_CONTEXT_WINDOW: usize = 32 * 1024;

/// Headroom, in tokens, subtracted from the window when a budget check
/// is run without an explicit `safety_margin`.
pub const DEFAULT_SAFETY_MARGIN: usize = 2_000;

/// Token allowance added to the estimate when a budget check is run
/// without an explicit `max_tokens`.
pub const DEFAULT_MAX_TOKENS: usize = 800;
|
|
|
|
/// Per-model context windows (in tokens), keyed by the exact model name.
///
/// The table is built once on first use and shared for the life of the
/// process. It mirrors the TS table in `tests/multi-agent/agent.ts`.
fn known_windows() -> &'static HashMap<&'static str, usize> {
    // (model name, context window in tokens)
    const ENTRIES: &[(&str, usize)] = &[
        ("mistral:latest", 32_768),
        ("qwen2.5:latest", 32_768),
        ("qwen3:latest", 40_960),
        ("qwen3.5:latest", 262_144),
        ("qwen3-embedding", 32_768),
        ("nomic-embed-text-v2-moe", 2_048),
        ("gpt-oss:20b", 131_072),
        ("gpt-oss:120b", 131_072),
        ("qwen3.5:397b", 131_072),
        ("kimi-k2-thinking", 200_000),
        ("kimi-k2.6", 200_000),
        ("kimi-k2:1t", 1_048_576),
        ("deepseek-v3.1:671b", 131_072),
        ("glm-4.7", 131_072),
    ];

    static TABLE: OnceLock<HashMap<&'static str, usize>> = OnceLock::new();
    TABLE.get_or_init(|| ENTRIES.iter().copied().collect())
}
|
|
|
|
pub fn context_window_for(model: &str) -> usize {
|
|
known_windows().get(model).copied().unwrap_or(DEFAULT_CONTEXT_WINDOW)
|
|
}
|
|
|
|
/// Numbers produced by a budget check. They are handed back on both the
/// success and the failure path so callers can log remaining headroom
/// without running the estimator a second time.
#[derive(Debug, Clone, Copy)]
pub struct BudgetCheck {
    /// Estimated tokens for the call: prompt + system prompt + the
    /// `max_tokens` allowance.
    pub estimated: usize,
    /// Context window of the model the check ran against.
    pub window: usize,
    /// `window - estimated - safety margin`. Negative means the call is
    /// over budget by that many tokens.
    pub remaining: i64,
}
|
|
|
|
/// Optional knobs for `assert_context_budget`. `Default::default()`
/// means: no system prompt, default `max_tokens`, default safety margin,
/// and overflow is an error.
#[derive(Clone, Debug, Default)]
pub struct BudgetOpts<'a> {
    /// System prompt sent alongside the user prompt; its estimated
    /// tokens count toward the budget. `None` contributes nothing.
    pub system: Option<&'a str>,
    /// Token allowance added to the estimate. `None` falls back to
    /// `DEFAULT_MAX_TOKENS`.
    pub max_tokens: Option<usize>,
    /// Tokens held back from the window. `None` falls back to
    /// `DEFAULT_SAFETY_MARGIN`.
    pub safety_margin: Option<usize>,
    /// When `true`, an over-budget call is reported as `Ok` instead of
    /// `Err`. Meant for call sites that handle their own overflow
    /// (continuation's second pass already counted the partial; T5
    /// gatekeeper prompts have a separate policy).
    pub bypass: bool,
}
|
|
|
|
/// Phase 21's loud-fail primitive. Returns a `BudgetCheck` on success
|
|
/// and the same struct plus over-by count on failure. The whole point
|
|
/// is to stop silent truncation — callers that expect overflow should
|
|
/// chunk BEFORE calling or set `bypass: true`.
|
|
pub fn assert_context_budget(
|
|
model: &str,
|
|
prompt: &str,
|
|
opts: BudgetOpts,
|
|
) -> Result<BudgetCheck, (BudgetCheck, usize)> {
|
|
let window = context_window_for(model);
|
|
let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN);
|
|
let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS);
|
|
let sys_tokens = opts.system.map(shared::model_matrix::ModelMatrix::estimate_tokens).unwrap_or(0);
|
|
let estimated = shared::model_matrix::ModelMatrix::estimate_tokens(prompt) + sys_tokens + max_tokens;
|
|
let remaining = window as i64 - estimated as i64 - safety as i64;
|
|
let check = BudgetCheck { estimated, window, remaining };
|
|
if remaining < 0 && !opts.bypass {
|
|
return Err((check, (-remaining) as usize));
|
|
}
|
|
Ok(check)
|
|
}
|
|
|
|
/// Convenience — format an overflow error the same way the TS side
|
|
/// does. Exposed so downstream crates render consistent messages.
|
|
pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety: usize) -> String {
|
|
format!(
|
|
"context overflow: model={} est={}t window={}t safety={}t over={}t. \
|
|
Chunk the prompt (see config/models.json overflow_policies) or set \
|
|
bypass:true if you know the risk.",
|
|
model, check.estimated, check.window, safety, over_by,
|
|
)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Token-estimator behavior is tested canonically in
    // crates/shared/src/model_matrix.rs. The legacy pin that used to live
    // here went away together with the deprecated wrapper.

    /// Known models resolve to their table entry; anything else falls
    /// back to the default window.
    #[test]
    fn context_window_known_and_fallback() {
        assert_eq!(context_window_for("qwen3.5:latest"), 262_144);
        assert_eq!(context_window_for("kimi-k2:1t"), 1_048_576);
        assert_eq!(context_window_for("some-unreleased-model"), DEFAULT_CONTEXT_WINDOW);
    }

    /// A small prompt leaves plenty of headroom.
    #[test]
    fn budget_passes_well_under_window() {
        let check = assert_context_budget(
            "qwen3:latest",
            &"x".repeat(4_000), // ~1000 tokens
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        )
        .expect("well under 40K window");
        assert!(check.remaining > 30_000);
    }

    /// Overflow is an error, and the reported deficit matches the
    /// negative headroom carried in the check.
    #[test]
    fn budget_fails_when_prompt_overflows_window() {
        let huge = "x".repeat(200_000); // ~50K tokens, over qwen3's 40K
        let (check, over_by) = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts::default(),
        )
        .expect_err("should overflow qwen3's 40K window");
        assert!(over_by > 0, "over_by must be positive");
        assert_eq!(check.window, 40_960);
        assert!(check.remaining < 0, "failing check must report negative remaining");
        assert_eq!(over_by as i64, -check.remaining, "over_by must equal the deficit");
    }

    /// `bypass` turns the error into `Ok` without hiding the numbers.
    #[test]
    fn budget_bypass_returns_ok_even_over() {
        let huge = "x".repeat(200_000);
        let check = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts { bypass: true, ..Default::default() },
        )
        .expect("bypass must suppress the error");
        assert!(check.remaining < 0, "check still reports negative remaining");
    }

    /// The system prompt's tokens are added to the estimate.
    #[test]
    fn budget_counts_system_prompt() {
        // 10K-char system prompt → ~2500 tokens. With a big max_tokens
        // this should push us closer to the window.
        let sys = "s".repeat(10_000);
        let prompt = "p".repeat(4_000);
        let with_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts {
                system: Some(&sys),
                max_tokens: Some(500),
                ..Default::default()
            },
        )
        .unwrap();
        let without_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        )
        .unwrap();
        assert!(
            with_sys.estimated > without_sys.estimated,
            "system prompt should raise estimate"
        );
        assert_eq!(
            with_sys.estimated - without_sys.estimated,
            shared::model_matrix::ModelMatrix::estimate_tokens(&sys)
        );
    }

    /// Every number passed in shows up in the rendered message.
    #[test]
    fn overflow_message_includes_numbers() {
        // Consistent fixture: 40_960 - 42_000 - 2_000 = -3_040.
        let check = BudgetCheck { estimated: 42_000, window: 40_960, remaining: -3_040 };
        let msg = overflow_message("qwen3:latest", &check, 3_040, 2_000);
        assert!(msg.contains("model=qwen3:latest"));
        assert!(msg.contains("est=42000t"));
        assert!(msg.contains("window=40960t"));
        assert!(msg.contains("safety=2000t"));
        assert!(msg.contains("over=3040t"));
    }
}
|