Add deterministic context control layer that intercepts prompt construction without modifying existing architecture: - context_engine.py: single choke point (build_context) that assembles structured prompts from ledger + sigil + live window, with token budget enforcement and automatic window shrinking - ledger.py: bounded per-stream JSON state (orientation, blockers, open questions, delta) with hard field/list limits - sigil.py: FIFO shorthand memory (max 15 entries) with deterministic rule-based generation from message patterns - token_gate.py: fast token estimation (~4 chars/token) and hard cap enforcement with configurable MAX_TOKENS/LIVE_WINDOW - redact.py: secret pattern detection (Discord, OpenAI, Anthropic, AWS, Slack, GitHub, Telegram, Bearer, generic key=value) replaced with [REDACTED_SECRET] before any output path All 64 tests passing. No modifications to existing agent spawning, model routing, tool system, or Discord relay architecture. https://claude.ai/code/session_01K7BWJY2gUoJi6dq91Yc7nx
57 lines
1.8 KiB
Python
"""
|
|
ra2.token_gate — Token estimation and hard cap enforcement.
|
|
|
|
Provides a fast, deterministic token estimator (no external tokenizer dependency)
|
|
and gate logic that prevents any prompt from exceeding MAX_TOKENS.
|
|
"""
|
|
|
|
import os
|
|
|
|
# Configurable via environment or direct override
|
|
MAX_TOKENS: int = int(os.environ.get("RA2_MAX_TOKENS", "6000"))
|
|
LIVE_WINDOW: int = int(os.environ.get("RA2_LIVE_WINDOW", "16"))
|
|
LIVE_WINDOW_MIN: int = 4 # Never shrink below this
|
|
|
|
|
|
class TokenBudgetExceeded(Exception):
    """Signals that a prompt is over MAX_TOKENS even after every shrink attempt.

    The offending estimate and the active limit are kept on the instance
    (``estimated`` / ``limit``) so callers can handle the overflow
    programmatically rather than parsing the message.
    """

    def __init__(self, estimated: int, limit: int):
        self.estimated = estimated
        self.limit = limit
        message = (
            f"Token budget exceeded: {estimated} > {limit} after all shrink attempts"
        )
        super().__init__(message)
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Return a fast, deterministic token estimate for *text*.

    Heuristic: roughly one token per 4 characters, a reasonable average
    across GPT/Claude tokenizers for English text, with no external
    tokenizer dependency. Empty input estimates to 0; any non-empty
    input estimates to at least 1.
    """
    if not text:
        return 0
    # Integer-divide by 4; the `or 1` enforces the floor of one token
    # for short non-empty strings (len < 4 would otherwise yield 0).
    return (len(text) // 4) or 1
|
|
|
|
|
|
def check_budget(estimated: int, limit: int | None = None) -> bool:
|
|
"""Return True if *estimated* is within budget, False otherwise."""
|
|
limit = limit if limit is not None else MAX_TOKENS
|
|
return estimated <= limit
|
|
|
|
|
|
def shrink_window(current_window: int) -> int:
    """Halve the live window, never going below LIVE_WINDOW_MIN.

    Returns the new window size. Raises TokenBudgetExceeded when the
    window is already at (or below) the minimum and cannot shrink further.
    """
    if current_window > LIVE_WINDOW_MIN:
        halved = current_window // 2
        # Clamp to the floor rather than returning an undersized window.
        return halved if halved > LIVE_WINDOW_MIN else LIVE_WINDOW_MIN
    # Nothing left to shrink; caller is expected to substitute the real
    # estimated value before surfacing this error.
    raise TokenBudgetExceeded(
        estimated=0,  # caller should fill real value
        limit=MAX_TOKENS,
    )
|