Use ~3.3 chars/token base ratio (more conservative than the previous ~4) and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token for CJK, emoji, or symbol-heavy text. Prevents underestimation that could bypass the token gate on code-heavy or multilingual prompts. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx
63 lines
2.1 KiB
Python
63 lines
2.1 KiB
Python
"""
ra2.token_gate — Token estimation and hard cap enforcement.

Provides a fast, deterministic token estimator (no external tokenizer dependency)
and gate logic that prevents any prompt from exceeding MAX_TOKENS.
"""
|
|
|
|
import math
import os
|
|
|
|
# Tunables. The two environment-backed values are read once, at import time;
# all three can also be overridden by assigning to the module attribute.
MAX_TOKENS: int = int(os.getenv("RA2_MAX_TOKENS", "6000"))
LIVE_WINDOW: int = int(os.getenv("RA2_LIVE_WINDOW", "16"))
LIVE_WINDOW_MIN: int = 4  # Floor for the live window: never shrink below this
|
|
|
|
|
|
class TokenBudgetExceeded(Exception):
    """Raised when prompt exceeds MAX_TOKENS even after shrinking.

    Attributes:
        estimated: The token estimate that was over budget.
        limit: The budget the estimate was compared against.
    """

    def __init__(self, estimated: int, limit: int):
        """Store both figures and build the human-readable message.

        Args:
            estimated: The token estimate that was over budget.
            limit: The budget the estimate was compared against.
        """
        self.estimated = estimated
        self.limit = limit
        super().__init__(
            f"Token budget exceeded: {estimated} > {limit} after all shrink attempts"
        )

    def __reduce__(self):
        """Describe how to rebuild this exception for ``copy`` and ``pickle``.

        The default ``BaseException.__reduce__`` re-invokes the constructor
        with ``self.args``, which here holds only the formatted message and
        would therefore fail with ``TypeError`` (``limit`` missing). Passing
        the original two arguments keeps the exception copyable/picklable.
        """
        return (type(self), (self.estimated, self.limit), self.__dict__)
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Return a fast, deterministic, conservative token estimate for *text*.

    The character count is divided by a chars-per-token figure that starts
    at ~3.3 for pure-ASCII text (conservative vs the common ~4 estimate) and
    slides linearly toward ~2.5 as the share of non-ASCII characters
    (code point > 127, e.g. CJK or emoji) approaches 100%.

    Only non-ASCII density is penalised; ASCII punctuation and code symbols
    are covered solely by the conservative base ratio.

    No external dependency.

    Args:
        text: The text to measure.

    Returns:
        0 for empty text, otherwise a positive estimate rounded *up* so the
        fractional part of a token is never dropped.
    """
    if not text:
        return 0
    length = len(text)
    non_ascii = sum(1 for ch in text if ord(ch) > 127)
    # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises.
    chars_per_token = 3.3 - 0.8 * (non_ascii / length)
    # Round up: truncating would under-count, which is exactly what a
    # budget gate must avoid. For non-empty text the result is always >= 1.
    return math.ceil(length / chars_per_token)
|
|
|
|
|
|
def check_budget(estimated: int, limit: int | None = None) -> bool:
|
|
"""Return True if *estimated* is within budget, False otherwise."""
|
|
limit = limit if limit is not None else MAX_TOKENS
|
|
return estimated <= limit
|
|
|
|
|
|
def shrink_window(current_window: int, *, estimated: int = 0) -> int:
    """Halve the live window, respecting the minimum.

    Args:
        current_window: The window size currently in use.
        estimated: Token estimate to report in the exception if the window
            cannot shrink any further. Defaults to 0, which preserves the
            earlier placeholder behaviour for callers that do not pass it.

    Returns:
        The new window size: ``current_window // 2``, but never less than
        ``LIVE_WINDOW_MIN``.

    Raises:
        TokenBudgetExceeded: If ``current_window`` is already at or below
            ``LIVE_WINDOW_MIN``. The exception carries *estimated* and the
            current ``MAX_TOKENS`` as its limit.
    """
    if current_window <= LIVE_WINDOW_MIN:
        raise TokenBudgetExceeded(estimated=estimated, limit=MAX_TOKENS)
    return max(LIVE_WINDOW_MIN, current_window // 2)
|