diff --git a/crates/aibridge/src/context.rs b/crates/aibridge/src/context.rs index b394ac2..ca55afe 100644 --- a/crates/aibridge/src/context.rs +++ b/crates/aibridge/src/context.rs @@ -85,8 +85,8 @@ pub fn assert_context_budget( let window = context_window_for(model); let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN); let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS); - let sys_tokens = opts.system.map(estimate_tokens).unwrap_or(0); - let estimated = estimate_tokens(prompt) + sys_tokens + max_tokens; + let sys_tokens = opts.system.map(shared::model_matrix::ModelMatrix::estimate_tokens).unwrap_or(0); + let estimated = shared::model_matrix::ModelMatrix::estimate_tokens(prompt) + sys_tokens + max_tokens; let remaining = window as i64 - estimated as i64 - safety as i64; let check = BudgetCheck { estimated, window, remaining }; if remaining < 0 && !opts.bypass { @@ -110,14 +110,10 @@ pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety mod tests { use super::*; - #[test] - fn estimate_tokens_ceiling_divides_by_four() { - assert_eq!(estimate_tokens(""), 0); - assert_eq!(estimate_tokens("abc"), 1); // 3 → ceil(3/4) = 1 - assert_eq!(estimate_tokens("abcd"), 1); // 4 → ceil(4/4) = 1 - assert_eq!(estimate_tokens("abcde"), 2); // 5 → ceil(5/4) = 2 - assert_eq!(estimate_tokens(&"x".repeat(400)), 100); - } + // Deprecated-function behavior is now canonically tested in + // crates/shared/src/model_matrix.rs. This test was the legacy + // pin that preceded the migration; delete when the deprecated + // wrapper itself goes (see the #[deprecated] attribute). #[test] fn context_window_known_and_fallback() { @@ -180,7 +176,7 @@ mod tests { ).unwrap(); assert!(with_sys.estimated > without_sys.estimated, "system prompt should raise estimate"); - assert_eq!(with_sys.estimated - without_sys.estimated, estimate_tokens(&sys)); + assert_eq!(with_sys.estimated - without_sys.estimated, shared::model_matrix::ModelMatrix::estimate_tokens(&sys)); } #[test] diff --git a/crates/aibridge/src/tree_split.rs b/crates/aibridge/src/tree_split.rs index c29dff6..781a80a 100644 --- a/crates/aibridge/src/tree_split.rs +++ b/crates/aibridge/src/tree_split.rs @@ -19,8 +19,9 @@ //! we bubble the error up rather than silently truncating. That's the //! whole point of Phase 21. -use crate::context::{assert_context_budget, BudgetOpts, estimate_tokens, overflow_message, +use crate::context::{assert_context_budget, BudgetOpts, overflow_message, DEFAULT_MAX_TOKENS, DEFAULT_SAFETY_MARGIN}; +use shared::model_matrix::ModelMatrix; use crate::continuation::{generate_continuable, ContinuableOpts, ResponseShape, TextGenerator}; /// Callback signatures — caller supplies closures that stitch the @@ -80,12 +81,12 @@ pub struct TreeSplitResult { /// by `\n— shard N/M digest —\n` so we can find the first one and /// chop everything before its successor. fn truncate_scratchpad(scratchpad: &mut String, budget_tokens: usize) -> bool { - if estimate_tokens(scratchpad) <= budget_tokens { return false; } + if ModelMatrix::estimate_tokens(scratchpad) <= budget_tokens { return false; } // Find the second delimiter — everything before it gets dropped. const DELIM_PREFIX: &str = "\n— shard "; let mut cursor = 0; let mut truncated = false; - while estimate_tokens(&scratchpad[cursor..]) > budget_tokens { + while ModelMatrix::estimate_tokens(&scratchpad[cursor..]) > budget_tokens { // Skip past a leading delimiter (if we're sitting on one from // a previous iteration), then find the next. let search_from = cursor + if scratchpad[cursor..].starts_with(DELIM_PREFIX) { @@ -278,7 +279,7 @@ mod tests { // Scratchpad should still fit roughly within the budget // (post-truncation); the estimator uses chars/4 so the bound // is ~budget*4 chars. Give some slack for the delimiter. - let scratchpad_tokens = estimate_tokens(&result.scratchpad); + let scratchpad_tokens = ModelMatrix::estimate_tokens(&result.scratchpad); assert!(scratchpad_tokens <= opts.scratchpad_budget * 2, "scratchpad {} tokens vs budget {}", scratchpad_tokens, opts.scratchpad_budget); } diff --git a/crates/shared/src/lib.rs b/crates/shared/src/lib.rs index c7e072d..4c5f758 100644 --- a/crates/shared/src/lib.rs +++ b/crates/shared/src/lib.rs @@ -4,3 +4,4 @@ pub mod arrow_helpers; pub mod config; pub mod pii; pub mod secrets; +pub mod model_matrix; diff --git a/crates/shared/src/model_matrix.rs b/crates/shared/src/model_matrix.rs new file mode 100644 index 0000000..c09af4c --- /dev/null +++ b/crates/shared/src/model_matrix.rs @@ -0,0 +1,69 @@ +//! Per-model token accounting. Entry point for the ModelMatrix work +//! the aibridge `context::estimate_tokens` deprecation has been pointing +//! at. Starts minimal — just `estimate_tokens` — so call sites can +//! migrate off the deprecated helper. Extend with per-model context +//! windows, max_tokens defaults, provider hints, etc. as we move the +//! rest of `aibridge::context::known_windows` over. + +/// Namespace for per-model token + context accounting. Methods are +/// associated functions — no instance required — because the underlying +/// estimates are deterministic and stateless. +pub struct ModelMatrix; + +impl ModelMatrix { + /// Rough token count — char count divided by 4, rounded up. This + /// is the same heuristic OpenAI's cookbook uses for English text; + /// it's within ±15% of BPE tokenizers for code + prose and doesn't + /// require a tokenizer lookup. Good enough for budget math where + /// the goal is "don't blow the context window" rather than exact + /// billing. + /// + /// Moved from `aibridge::context::estimate_tokens` (still there with + /// a `#[deprecated]` pointer — callers should migrate here). Empty + /// string → 0; one char → 1 (ceiling of 1/4 = 1). + pub fn estimate_tokens(text: &str) -> usize { + (text.chars().count() + 3) / 4 + } +} + +#[cfg(test)] +mod tests { + use super::ModelMatrix; + + #[test] + fn empty_string_is_zero_tokens() { + assert_eq!(ModelMatrix::estimate_tokens(""), 0); + } + + #[test] + fn three_chars_is_one_token() { + // 3 → ceil(3/4) = 1. Matches the deprecated helper's behavior + // so the migration is a drop-in replacement. + assert_eq!(ModelMatrix::estimate_tokens("abc"), 1); + } + + #[test] + fn four_chars_is_one_token() { + assert_eq!(ModelMatrix::estimate_tokens("abcd"), 1); + } + + #[test] + fn five_chars_is_two_tokens() { + assert_eq!(ModelMatrix::estimate_tokens("abcde"), 2); + } + + #[test] + fn counts_chars_not_bytes() { + // Multi-byte UTF-8 chars count as 1 char each — important for + // prompts with emoji or non-ASCII text. "héllo" is 5 chars + // (5 unicode scalars) → ceil(5/4) = 2 tokens, same as "hello". + assert_eq!(ModelMatrix::estimate_tokens("héllo"), 2); + assert_eq!(ModelMatrix::estimate_tokens("📚📚📚📚"), 1); // 4 chars + } + + #[test] + fn large_text_scales_linearly() { + assert_eq!(ModelMatrix::estimate_tokens(&"x".repeat(400)), 100); + assert_eq!(ModelMatrix::estimate_tokens(&"x".repeat(401)), 101); + } +} diff --git a/crates/vectord/src/playbook_memory.rs b/crates/vectord/src/playbook_memory.rs index b8c4fd9..f407c55 100644 --- a/crates/vectord/src/playbook_memory.rs +++ b/crates/vectord/src/playbook_memory.rs @@ -1647,7 +1647,7 @@ mod validity_window_tests { let past = (chrono::Utc::now() - chrono::Duration::days(1)).to_rfc3339(); let future = (chrono::Utc::now() + chrono::Duration::days(1)).to_rfc3339(); let e_expired = mkentry("pb-expired", "Nashville", "TN", None, Some(past)); - let e_alive = { let mut e = mkentry("pb-alive", "Nashville", "TN", None, Some(future)); e }; + let e_alive = mkentry("pb-alive", "Nashville", "TN", None, Some(future)); pm.set_entries(vec![e_expired, e_alive]).await.unwrap(); let boosts = pm.compute_boost_for_filtered_with_role( &[1.0, 0.0, 0.0], 100, 0.5,