fix(ra2/token_gate): improve token estimation for code and multilingual content

Use ~3.3 chars/token base ratio (more conservative than the previous ~4)
and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token
for CJK, emoji, or symbol-heavy text. Prevents underestimation that
could bypass the token gate on code-heavy or multilingual prompts.

https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx
This commit is contained in:
Claude 2026-02-19 23:20:23 +00:00
parent 218358da18
commit 79f23f76eb
No known key found for this signature in database
2 changed files with 25 additions and 8 deletions

View File

@@ -15,19 +15,30 @@ class TestEstimateTokens:
assert estimate_tokens("") == 0
def test_short_string(self):
    # "ab" = 2 chars: int(2 / 3.3) = 0, clamped up to the minimum of 1.
    # (Comment previously cited the old 2//4 heuristic.)
    assert estimate_tokens("ab") == 1
def test_known_length_ascii(self):
    # Pure-ASCII text uses the base ~3.3 chars/token ratio.
    text = "a" * 400
    # 400 / 3.3 ≈ 121
    assert estimate_tokens(text) == int(400 / 3.3)
def test_proportional(self):
    """A much longer input must yield a strictly larger estimate."""
    base = estimate_tokens("hello world")
    repeated = estimate_tokens("hello world " * 100)
    assert repeated > base
def test_non_ascii_increases_estimate(self):
    # Same character count, but non-ASCII content is estimated at
    # fewer chars-per-token, so the token count must come out higher.
    plain = "a" * 100
    cjk = "\u4e00" * 100  # CJK characters
    assert estimate_tokens(cjk) > estimate_tokens(plain)
def test_code_heavy_reasonable(self):
    # The estimate must be strictly above the old len // 4 heuristic
    # on symbol-dense source code.
    snippet = 'def foo(x: int) -> bool:\n return x > 0\n' * 10
    estimate = estimate_tokens(snippet)
    assert estimate > len(snippet) // 4
class TestCheckBudget:
def test_within_budget(self):

View File

@@ -27,13 +27,19 @@ class TokenBudgetExceeded(Exception):
def estimate_tokens(text: str) -> int:
    """Fast deterministic token estimate.

    Base ratio: ~3.3 chars/token (conservative vs the common ~4 estimate).
    Applies a penalty when non-ASCII density is high, since code symbols
    and multilingual characters tend to produce shorter tokens.
    No external dependency.

    Args:
        text: Input string to estimate; may be empty.

    Returns:
        Estimated token count: 0 for empty input, otherwise at least 1.
    """
    if not text:
        return 0
    length = len(text)
    # Fraction of characters outside 7-bit ASCII (CJK, emoji, symbols).
    non_ascii = sum(1 for ch in text if ord(ch) > 127)
    # length > 0 is guaranteed by the early return above, so no guard needed.
    ratio = non_ascii / length
    # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises.
    chars_per_token = 3.3 - (0.8 * ratio)
    return max(1, int(length / chars_per_token))
def check_budget(estimated: int, limit: int | None = None) -> bool: