fix(ra2/token_gate): improve token estimation for code and multilingual content
Use ~3.3 chars/token base ratio (more conservative than the previous ~4) and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token for CJK, emoji, or symbol-heavy text. Prevents underestimation that could bypass the token gate on code-heavy or multilingual prompts. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx
This commit is contained in:
parent
218358da18
commit
79f23f76eb
@@ -15,19 +15,30 @@ class TestEstimateTokens:
|
||||
assert estimate_tokens("") == 0
|
||||
|
||||
def test_short_string(self):
    # "ab" = 2 chars → int(2 / 3.3) = 0, clamped up to the 1-token floor.
    # (Comment previously cited the old 2//4 formula; updated for the
    # 3.3 chars/token base ratio.)
    assert estimate_tokens("ab") == 1
|
||||
|
||||
def test_known_length_ascii(self):
    # Pure-ASCII input takes the base ratio with no non-ASCII penalty:
    # 400 / 3.3 ≈ 121 tokens.
    text = "a" * 400
    assert estimate_tokens(text) == int(400 / 3.3)
|
||||
|
||||
def test_proportional(self):
    # A longer input must always yield a strictly larger estimate.
    small = estimate_tokens("hello world")
    large = estimate_tokens("hello world " * 100)
    assert large > small
|
||||
|
||||
def test_non_ascii_increases_estimate(self):
    # Equal character counts, but the CJK string should trigger the
    # non-ASCII density penalty and come out higher.
    plain = "a" * 100
    cjk = "\u4e00" * 100  # CJK characters
    assert estimate_tokens(cjk) > estimate_tokens(plain)
|
||||
|
||||
def test_code_heavy_reasonable(self):
    # Dividing by 3.3 must give a larger count than the old len // 4 rule.
    snippet = 'def foo(x: int) -> bool:\n return x > 0\n' * 10
    estimated = estimate_tokens(snippet)
    assert estimated > len(snippet) // 4
|
||||
|
||||
|
||||
class TestCheckBudget:
|
||||
def test_within_budget(self):
|
||||
|
||||
@@ -27,13 +27,19 @@ class TokenBudgetExceeded(Exception):
|
||||
def estimate_tokens(text: str) -> int:
    """Fast deterministic token estimate.

    Base ratio: ~3.3 chars/token (conservative vs the common ~4 estimate).
    Applies a penalty when non-ASCII density is high, since code symbols
    and multilingual characters tend to produce shorter tokens.
    No external dependency.

    Args:
        text: String to estimate a token count for.

    Returns:
        Estimated token count: 0 for empty input, otherwise at least 1.
    """
    if not text:
        return 0
    length = len(text)
    non_ascii = sum(1 for ch in text if ord(ch) > 127)
    # length > 0 here (empty input returned early); the `else 0` arm is
    # defensive only.
    ratio = non_ascii / length if length else 0
    # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises
    # (ratio == 1.0 gives 3.3 - 0.8 = 2.5).
    chars_per_token = 3.3 - (0.8 * ratio)
    return max(1, int(length / chars_per_token))
|
||||
|
||||
|
||||
def check_budget(estimated: int, limit: int | None = None) -> bool:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user