fix(ra2/token_gate): improve token estimation for code and multilingual content
Use ~3.3 chars/token base ratio (more conservative than the previous ~4) and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token for CJK, emoji, or symbol-heavy text. Prevents underestimation that could bypass the token gate on code-heavy or multilingual prompts. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx
This commit is contained in:
parent
218358da18
commit
79f23f76eb
@@ -15,19 +15,30 @@ class TestEstimateTokens:
|
||||
assert estimate_tokens("") == 0
|
||||
|
||||
def test_short_string(self):
    # "ab" = 2 chars → int(2 / 3.3) = 0, clamped up to the 1-token floor.
    # (Comment previously cited the old 2//4 formula; updated for the
    # 3.3 chars/token base ratio.)
    assert estimate_tokens("ab") == 1
|
||||
|
||||
def test_known_length_ascii(self):
    # Pure-ASCII input takes the base ratio with no non-ASCII penalty:
    # 400 / 3.3 ≈ 121 tokens.
    text = "a" * 400
    assert estimate_tokens(text) == int(400 / 3.3)
|
||||
|
||||
def test_proportional(self):
    # A longer input must always yield a strictly larger estimate.
    small = estimate_tokens("hello world")
    large = estimate_tokens("hello world " * 100)
    assert large > small
|
||||
|
||||
def test_non_ascii_increases_estimate(self):
    # Equal character counts, but the CJK string should trigger the
    # non-ASCII density penalty and come out higher.
    plain = "a" * 100
    cjk = "\u4e00" * 100  # CJK characters
    assert estimate_tokens(cjk) > estimate_tokens(plain)
|
||||
|
||||
def test_code_heavy_reasonable(self):
    # Dividing by 3.3 must give a larger count than the old len // 4 rule.
    snippet = 'def foo(x: int) -> bool:\n return x > 0\n' * 10
    estimated = estimate_tokens(snippet)
    assert estimated > len(snippet) // 4
|
||||
|
||||
|
||||
class TestCheckBudget:
|
||||
def test_within_budget(self):
|
||||
|
||||
@@ -27,13 +27,19 @@ class TokenBudgetExceeded(Exception):
|
||||
def estimate_tokens(text: str) -> int:
    """Fast deterministic token estimate.

    Base ratio: ~3.3 chars/token (conservative vs the common ~4 estimate).
    Applies a penalty when non-ASCII density is high, since code symbols
    and multilingual characters tend to produce shorter tokens.
    No external dependency.

    Args:
        text: String to estimate a token count for.

    Returns:
        Estimated token count: 0 for empty input, otherwise at least 1.
    """
    if not text:
        return 0
    length = len(text)
    non_ascii = sum(1 for ch in text if ord(ch) > 127)
    # length > 0 here (empty input returned early); the `else 0` arm is
    # defensive only.
    ratio = non_ascii / length if length else 0
    # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises
    # (ratio == 1.0 gives 3.3 - 0.8 = 2.5).
    chars_per_token = 3.3 - (0.8 * ratio)
    return max(1, int(length / chars_per_token))
|
||||
|
||||
|
||||
def check_budget(estimated: int, limit: int | None = None) -> bool:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user