lakehouse/crates/aibridge/src/context.rs

//! Phase 21 — context-budget accounting for model calls.
//!
//! Ports `assertContextBudget` + `estimateTokens` + `CONTEXT_WINDOWS`
//! from `tests/multi-agent/agent.ts` so Rust-side callers (gateway
//! tool surfaces, future Rust agents) get the same loud-fail behavior
//! on window overflow instead of silent truncation.
//!
//! The token estimator is deliberately the same chars/4 heuristic as
//! the TS side. It's biased ~15% safe — pessimistic on English, correct
//! within a factor of 2 on code. Swap to a provider tokenizer only when
//! the estimator drives a decision (we're nowhere near that yet).

use std::collections::HashMap;
use std::sync::OnceLock;

// `estimate_tokens` moved to `shared::model_matrix::ModelMatrix::estimate_tokens`
// (cdc24d8). All callers migrated; the deprecated wrapper that stood in its
// place has been removed since it had zero external consumers.

/// Phase 21 — fallback context window, in tokens, used by
/// `context_window_for` when a model is missing from the per-model table.
/// That table mirrors the TS one in `tests/multi-agent/agent.ts` and is
/// anchored on each model's documented max.
pub const DEFAULT_CONTEXT_WINDOW: usize = 32_768;
/// Headroom, in tokens, that `assert_context_budget` subtracts from the
/// window when `BudgetOpts::safety_margin` is `None`.
pub const DEFAULT_SAFETY_MARGIN: usize = 2_000;
/// Completion allowance, in tokens, that `assert_context_budget` adds to the
/// estimate when `BudgetOpts::max_tokens` is `None`.
pub const DEFAULT_MAX_TOKENS: usize = 800;

/// Process-wide table of per-model context windows (in tokens), keyed by the
/// exact model tag. Built on the first call and cached in a `OnceLock`, so
/// every later call returns the same map.
fn known_windows() -> &'static HashMap<&'static str, usize> {
    // (model tag, context window) pairs; each key appears exactly once.
    const ENTRIES: &[(&str, usize)] = &[
        ("mistral:latest", 32_768),
        ("qwen2.5:latest", 32_768),
        ("qwen3:latest", 40_960),
        ("qwen3.5:latest", 262_144),
        ("qwen3-embedding", 32_768),
        ("nomic-embed-text-v2-moe", 2_048),
        ("gpt-oss:20b", 131_072),
        ("gpt-oss:120b", 131_072),
        ("qwen3.5:397b", 131_072),
        ("kimi-k2-thinking", 200_000),
        ("kimi-k2.6", 200_000),
        ("kimi-k2:1t", 1_048_576),
        ("deepseek-v3.1:671b", 131_072),
        ("glm-4.7", 131_072),
    ];
    static WINDOWS: OnceLock<HashMap<&'static str, usize>> = OnceLock::new();
    WINDOWS.get_or_init(|| ENTRIES.iter().copied().collect())
}

/// Context window, in tokens, for `model`.
///
/// The lookup is an exact match on the model string; any model not present
/// in the table resolves to `DEFAULT_CONTEXT_WINDOW`.
pub fn context_window_for(model: &str) -> usize {
    match known_windows().get(model) {
        Some(&tokens) => tokens,
        None => DEFAULT_CONTEXT_WINDOW,
    }
}

/// Result of a budget check — exposes the numbers so callers can log
/// how much headroom remains without re-running the estimator.
#[derive(Debug, Clone, Copy)]
pub struct BudgetCheck {
    /// Estimated tokens the call needs: prompt estimate + system-prompt
    /// estimate + the `max_tokens` completion allowance.
    pub estimated: usize,
    /// Context window of the model that was checked, in tokens.
    pub window: usize,
    /// `window` minus `estimated` minus the safety margin. Negative when
    /// the call is over budget, which is why this field is signed.
    pub remaining: i64,
}

/// Inputs to `assert_context_budget`. `bypass` exists for call sites
/// that handle their own overflow (continuation's second pass already
/// counted the partial; T5 gatekeeper prompts have a separate policy).
///
/// The derived `Default` leaves every option unset (`None`) and `bypass`
/// off, so `BudgetOpts::default()` means "use the module defaults".
#[derive(Debug, Clone, Default)]
pub struct BudgetOpts<'a> {
    /// Optional system prompt; its estimated tokens are added to the
    /// estimate. `None` contributes zero.
    pub system: Option<&'a str>,
    /// Completion allowance in tokens; `None` falls back to
    /// `DEFAULT_MAX_TOKENS`.
    pub max_tokens: Option<usize>,
    /// Headroom in tokens kept free below the window; `None` falls back
    /// to `DEFAULT_SAFETY_MARGIN`.
    pub safety_margin: Option<usize>,
    /// When `true`, an over-budget check is still returned as `Ok`
    /// (with a negative `remaining`) instead of `Err`.
    pub bypass: bool,
}

/// Phase 21's loud-fail primitive. Returns a `BudgetCheck` on success
/// and the same struct plus over-by count on failure. The whole point
/// is to stop silent truncation — callers that expect overflow should
/// chunk BEFORE calling or set `bypass: true`.
///
/// The estimate is the prompt's token estimate, plus the system prompt's
/// (if any), plus `max_tokens`. The budget is the model's window (see
/// `context_window_for`) minus the safety margin.
///
/// All arithmetic saturates: an absurdly large `max_tokens` or
/// `safety_margin` can only make the check stricter. It can never wrap
/// around into a small (or negative) number that slips past the check.
///
/// # Errors
///
/// Returns `Err((check, over_by))` when the estimate plus the safety margin
/// exceeds the window and `opts.bypass` is `false`. `over_by` is the number
/// of tokens by which the budget is exceeded. With `bypass: true` the same
/// situation yields `Ok(check)` with a negative `check.remaining`.
pub fn assert_context_budget(
    model: &str,
    prompt: &str,
    opts: BudgetOpts,
) -> Result<BudgetCheck, (BudgetCheck, usize)> {
    // Clamp a token count into `i64` range so the signed `remaining` field
    // cannot wrap negative; the `min` makes the final cast lossless.
    fn clamp_to_i64(tokens: usize) -> i64 {
        tokens.min(i64::MAX as usize) as i64
    }

    let window = context_window_for(model);
    let safety = opts.safety_margin.unwrap_or(DEFAULT_SAFETY_MARGIN);
    let max_tokens = opts.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS);
    let sys_tokens = opts
        .system
        .map(shared::model_matrix::ModelMatrix::estimate_tokens)
        .unwrap_or(0);
    let prompt_tokens = shared::model_matrix::ModelMatrix::estimate_tokens(prompt);

    // Saturate rather than overflow: plain `+` would panic in debug builds
    // and wrap in release builds, where a wrapped estimate looks tiny.
    let estimated = prompt_tokens
        .saturating_add(sys_tokens)
        .saturating_add(max_tokens);
    // Total tokens that must fit inside the window.
    let required = estimated.saturating_add(safety);

    // Compute the overshoot in unsigned space first, then derive the signed
    // headroom from it, so neither value depends on a wrapping cast.
    let (remaining, over_by) = if required > window {
        let over = required - window;
        (-clamp_to_i64(over), over)
    } else {
        (clamp_to_i64(window - required), 0)
    };

    let check = BudgetCheck { estimated, window, remaining };
    if over_by > 0 && !opts.bypass {
        return Err((check, over_by));
    }
    Ok(check)
}

/// Convenience — format an overflow error the same way the TS side
/// does. Exposed so downstream crates render consistent messages.
pub fn overflow_message(model: &str, check: &BudgetCheck, over_by: usize, safety: usize) -> String {
    format!(
        "context overflow: model={} est={}t window={}t safety={}t over={}t. \
         Chunk the prompt (see config/models.json overflow_policies) or set \
         bypass:true if you know the risk.",
        model, check.estimated, check.window, safety, over_by,
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    // `estimate_tokens` itself is canonically tested in
    // crates/shared/src/model_matrix.rs. The legacy pin that used to live
    // here was dropped along with the deprecated wrapper, so this module
    // covers only the window table, the budget check, and message rendering.

    /// Known models resolve to their table entry; unknown ones fall back.
    #[test]
    fn context_window_known_and_fallback() {
        assert_eq!(context_window_for("qwen3.5:latest"), 262_144);
        assert_eq!(context_window_for("kimi-k2:1t"), 1_048_576);
        assert_eq!(context_window_for("some-unreleased-model"), DEFAULT_CONTEXT_WINDOW);
    }

    /// A small prompt leaves most of the window free.
    #[test]
    fn budget_passes_well_under_window() {
        let check = assert_context_budget(
            "qwen3:latest",
            &"x".repeat(4_000), // ~1000 tokens
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        ).expect("well under 40K window");
        assert!(check.remaining > 30_000);
    }

    /// An oversized prompt is rejected, and the reported overshoot matches
    /// the (negative) headroom in the accompanying check.
    #[test]
    fn budget_fails_when_prompt_overflows_window() {
        let huge = "x".repeat(200_000); // ~50K tokens, over qwen3's 40K
        let err = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts::default(),
        ).expect_err("should overflow qwen3's 40K window");
        assert!(err.1 > 0, "over_by must be positive");
        assert!(err.0.remaining < 0, "failed check must report negative remaining");
        assert_eq!(err.1 as i64, -err.0.remaining, "over_by must mirror remaining");
    }

    /// `bypass` turns the overflow into `Ok` but keeps the numbers honest.
    #[test]
    fn budget_bypass_returns_ok_even_over() {
        let huge = "x".repeat(200_000);
        let check = assert_context_budget(
            "qwen3:latest",
            &huge,
            BudgetOpts { bypass: true, ..Default::default() },
        ).expect("bypass must suppress the error");
        assert!(check.remaining < 0, "check still reports negative remaining");
    }

    /// The system prompt contributes exactly its own token estimate.
    #[test]
    fn budget_counts_system_prompt() {
        // 10K-char system prompt → ~2500 tokens on the chars/4 heuristic.
        // Everything else is held constant between the two calls, so the
        // estimates must differ by exactly the system prompt's tokens.
        let sys = "s".repeat(10_000);
        let prompt = "p".repeat(4_000);
        let with_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts {
                system: Some(&sys),
                max_tokens: Some(500),
                ..Default::default()
            },
        ).unwrap();
        let without_sys = assert_context_budget(
            "qwen3:latest",
            &prompt,
            BudgetOpts { max_tokens: Some(500), ..Default::default() },
        ).unwrap();
        assert!(with_sys.estimated > without_sys.estimated,
            "system prompt should raise estimate");
        assert_eq!(with_sys.estimated - without_sys.estimated, shared::model_matrix::ModelMatrix::estimate_tokens(&sys));
    }

    /// The rendered message carries the model name and every figure.
    #[test]
    fn overflow_message_includes_numbers() {
        // Fixture is self-consistent: 40_960 - 42_000 - 2_000 = -3_040,
        // i.e. over by 3_040 tokens with a 2_000-token safety margin.
        let check = BudgetCheck { estimated: 42_000, window: 40_960, remaining: -3_040 };
        let msg = overflow_message("qwen3:latest", &check, 3_040, 2_000);
        assert!(msg.contains("qwen3:latest"));
        assert!(msg.contains("42000t"));
        assert!(msg.contains("40960t"));
        assert!(msg.contains("3040t"));
        // Match with the label: a bare "2000t" is also a substring of "42000t".
        assert!(msg.contains("safety=2000t"));
    }
}