From 79f23f76eb49432c0c899c3e2ecbe415674b3701 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 23:20:23 +0000 Subject: [PATCH] fix(ra2/token_gate): improve token estimation for code and multilingual content Use ~3.3 chars/token base ratio (more conservative than the previous ~4) and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token for CJK, emoji, or symbol-heavy text. Prevents underestimation that could bypass the token gate on code-heavy or multilingual prompts. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx --- ra2/tests/test_token_gate.py | 19 +++++++++++++++---- ra2/token_gate.py | 14 ++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/ra2/tests/test_token_gate.py b/ra2/tests/test_token_gate.py index 7f98a73ecbd..874c98a2d1e 100644 --- a/ra2/tests/test_token_gate.py +++ b/ra2/tests/test_token_gate.py @@ -15,19 +15,30 @@ class TestEstimateTokens: assert estimate_tokens("") == 0 def test_short_string(self): - # "ab" = 2 chars, 2//4 = 0 → clamped to 1 assert estimate_tokens("ab") == 1 - def test_known_length(self): + def test_known_length_ascii(self): text = "a" * 400 - # 400 / 4 = 100 - assert estimate_tokens(text) == 100 + # 400 / 3.3 ≈ 121 + assert estimate_tokens(text) == int(400 / 3.3) def test_proportional(self): short = estimate_tokens("hello world") long = estimate_tokens("hello world " * 100) assert long > short + def test_non_ascii_increases_estimate(self): + ascii_text = "a" * 100 + # Pure non-ASCII text triggers the full density penalty + non_ascii_text = "\u4e00" * 100 # CJK characters + assert estimate_tokens(non_ascii_text) > estimate_tokens(ascii_text) + + def test_code_heavy_reasonable(self): + code = 'def foo(x: int) -> bool:\n return x > 0\n' * 10 + tokens = estimate_tokens(code) + # Should be more conservative than len//4 + assert tokens > len(code) // 4 + class TestCheckBudget: def test_within_budget(self): diff --git a/ra2/token_gate.py b/ra2/token_gate.py index cc07e50d0b1..364bb29552a
100644 --- a/ra2/token_gate.py +++ b/ra2/token_gate.py @@ -27,13 +27,19 @@ class TokenBudgetExceeded(Exception): def estimate_tokens(text: str) -> int: """Fast deterministic token estimate. - Uses the ~4 chars per token heuristic which is a reasonable average - across GPT/Claude tokenizers for English text. No external dependency. + Base ratio: ~3.3 chars/token (conservative vs the common ~4 estimate; + this lower base also covers symbol-heavy ASCII code). Applies a + penalty when non-ASCII density is high, since CJK and emoji tend to + produce shorter tokens. No external dependency. """ if not text: return 0 - # Rough estimate: 1 token per 4 characters, minimum 1 - return max(1, len(text) // 4) + length = len(text) + non_ascii = sum(1 for ch in text if ord(ch) > 127) + ratio = non_ascii / length if length else 0 + # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises + chars_per_token = 3.3 - (0.8 * ratio) + return max(1, int(length / chars_per_token)) def check_budget(estimated: int, limit: int | None = None) -> bool: