From 79f23f76eb49432c0c899c3e2ecbe415674b3701 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Feb 2026 23:20:23 +0000 Subject: [PATCH] fix(ra2/token_gate): improve token estimation for code and multilingual content Use ~3.3 chars/token base ratio (more conservative than the previous ~4) and apply a non-ASCII density penalty that shifts toward ~2.5 chars/token for CJK, emoji, or symbol-heavy text. Prevents underestimation that could bypass the token gate on code-heavy or multilingual prompts. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx --- ra2/tests/test_token_gate.py | 19 +++++++++++++++---- ra2/token_gate.py | 14 ++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/ra2/tests/test_token_gate.py b/ra2/tests/test_token_gate.py index 7f98a73ecbd..874c98a2d1e 100644 --- a/ra2/tests/test_token_gate.py +++ b/ra2/tests/test_token_gate.py @@ -15,19 +15,30 @@ class TestEstimateTokens: assert estimate_tokens("") == 0 def test_short_string(self): - # "ab" = 2 chars, 2//4 = 0 → clamped to 1 assert estimate_tokens("ab") == 1 - def test_known_length(self): + def test_known_length_ascii(self): text = "a" * 400 - # 400 / 4 = 100 - assert estimate_tokens(text) == 100 + # 400 / 3.3 ≈ 121 + assert estimate_tokens(text) == int(400 / 3.3) def test_proportional(self): short = estimate_tokens("hello world") long = estimate_tokens("hello world " * 100) assert long > short + def test_non_ascii_increases_estimate(self): + ascii_text = "a" * 100 + # Pure non-ASCII text triggers the full density penalty + non_ascii_text = "\u4e00" * 100 # CJK characters + assert estimate_tokens(non_ascii_text) > estimate_tokens(ascii_text) + + def test_code_heavy_reasonable(self): + code = 'def foo(x: int) -> bool:\n return x > 0\n' * 10 + tokens = estimate_tokens(code) + # Should be more conservative than len//4 + assert tokens > len(code) // 4 + class TestCheckBudget: def test_within_budget(self): diff --git a/ra2/token_gate.py b/ra2/token_gate.py index cc07e50d0b1..364bb29552a
100644 --- a/ra2/token_gate.py +++ b/ra2/token_gate.py @@ -27,13 +27,19 @@ class TokenBudgetExceeded(Exception): def estimate_tokens(text: str) -> int: """Fast deterministic token estimate. - Uses the ~4 chars per token heuristic which is a reasonable average - across GPT/Claude tokenizers for English text. No external dependency. + Base ratio: ~3.3 chars/token (conservative vs the common ~4 estimate; + this lower base also covers symbol-heavy ASCII code). Applies a + penalty when non-ASCII density is high, since CJK and emoji tend to + produce shorter tokens. No external dependency. """ if not text: return 0 - # Rough estimate: 1 token per 4 characters, minimum 1 - return max(1, len(text) // 4) + length = len(text) + non_ascii = sum(1 for ch in text if ord(ch) > 127) + ratio = non_ascii / length if length else 0 + # Shift from 3.3 toward 2.5 chars/token as non-ASCII density rises + chars_per_token = 3.3 - (0.8 * ratio) + return max(1, int(length / chars_per_token)) def check_budget(estimated: int, limit: int | None = None) -> bool: