- Add reset_running_to_pending() to PersistentTaskQueue for recovery - Implement recover_on_startup() to reset interrupted tasks and clean zombies - Add ContainerCleaner for periodic removal of old sandbox containers - Add 4 tests covering recovery scenarios and container cleanup logic
91 lines
3.0 KiB
Python
91 lines
3.0 KiB
Python
"""서버 시작 시 복구 + 좀비 컨테이너 정리."""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
from agent.task_queue import PersistentTaskQueue
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def recover_on_startup(task_queue: PersistentTaskQueue) -> None:
    """Run the startup recovery sequence.

    First asks ``task_queue`` to move every task still marked as running back
    to pending, logging how many were reset (nothing is logged when the count
    is zero). Afterwards, sandbox containers left over from a previous run are
    cleaned up; the number of removed containers is not used here.

    Args:
        task_queue: Queue whose ``reset_running_to_pending()`` coroutine
            returns the number of tasks it reset.
    """
    interrupted = await task_queue.reset_running_to_pending()

    # Stay quiet on a clean start: only report when something was actually reset.
    if interrupted:
        logger.info("Recovery: reset %d running task(s) to pending", interrupted)

    await _cleanup_zombie_containers()
|
|
|
|
|
|
async def _cleanup_zombie_containers() -> int:
|
|
try:
|
|
import docker
|
|
client = docker.from_env()
|
|
containers = client.containers.list(
|
|
filters={"label": "galaxis-agent-sandbox"}, all=True,
|
|
)
|
|
cleaned = 0
|
|
for container in containers:
|
|
try:
|
|
container.stop(timeout=10)
|
|
container.remove(force=True)
|
|
cleaned += 1
|
|
logger.info("Recovery: removed zombie container %s", container.name)
|
|
except Exception:
|
|
logger.warning("Recovery: failed to remove container %s", container.name)
|
|
return cleaned
|
|
except Exception:
|
|
logger.debug("Recovery: Docker not available, skipping container cleanup")
|
|
return 0
|
|
|
|
|
|
class ContainerCleaner:
    """Background job that periodically removes aged-out sandbox containers.

    On every pass the cleaner lists all containers (running or not) labelled
    ``galaxis-agent-sandbox`` and stops + force-removes those whose ``Created``
    timestamp lies more than ``max_age_seconds`` in the past. Without a Docker
    client every pass is a no-op that returns 0.

    Args:
        docker_client: Docker SDK client (or compatible object exposing
            ``containers.list``). ``None`` disables cleanup.
        max_age_seconds: Containers older than this many seconds are removed.
        interval_seconds: Pause between two cleanup passes of the loop.
    """

    _SANDBOX_LABEL = "galaxis-agent-sandbox"

    def __init__(self, docker_client=None, max_age_seconds: int = 1200, interval_seconds: int = 1800):
        self._docker = docker_client
        self._max_age = max_age_seconds
        self._interval = interval_seconds
        self._running = False
        self._task: asyncio.Task | None = None

    async def start(self) -> None:
        """Start the periodic cleanup loop as a background task.

        Calling ``start()`` while a loop task is still alive is a no-op, so a
        double start cannot orphan a running task.
        """
        if self._task is not None and not self._task.done():
            return
        self._running = True
        self._task = asyncio.create_task(self._loop())

    async def stop(self) -> None:
        """Cancel the background loop and wait until it has finished."""
        self._running = False
        if self._task is None:
            return
        self._task.cancel()
        try:
            await self._task
        except asyncio.CancelledError:
            # Expected outcome of cancelling our own task.
            pass

    async def _loop(self) -> None:
        """Run a cleanup pass, then sleep ``interval_seconds``, until stopped."""
        while self._running:
            try:
                await self.cleanup_once()
            except Exception:
                # A failed pass must not kill the loop; retry after the interval.
                logger.exception("ContainerCleaner error")
            await asyncio.sleep(self._interval)

    async def cleanup_once(self) -> int:
        """Run a single cleanup pass.

        The Docker SDK calls are synchronous, so they are executed in the
        event loop's default executor to keep the loop responsive.

        Returns:
            The number of containers removed; 0 when no client is configured.

        Raises:
            Exception: Whatever the container listing raises. Failures on an
                individual container are logged and skipped instead.
        """
        if self._docker is None:
            return 0
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._remove_expired)

    def _remove_expired(self) -> int:
        """Synchronously stop and remove all sandbox containers past the max age."""
        now = datetime.now(timezone.utc)
        containers = self._docker.containers.list(
            filters={"label": self._SANDBOX_LABEL}, all=True,
        )
        removed = 0
        for container in containers:
            try:
                created = self._parse_created(container.attrs.get("Created") or "")
                if (now - created).total_seconds() > self._max_age:
                    container.stop(timeout=10)
                    container.remove(force=True)
                    removed += 1
            except Exception:
                # Covers unparsable timestamps as well as stop/remove errors;
                # keep going so one bad container does not block the others.
                logger.debug(
                    "Failed to check/remove container %s",
                    getattr(container, "name", "unknown"),
                    exc_info=True,
                )
        return removed

    @staticmethod
    def _parse_created(raw: str) -> datetime:
        """Parse a container ``Created`` timestamp into an aware ``datetime``.

        Docker typically reports nanosecond precision with a ``Z`` suffix
        (e.g. ``2024-01-01T12:00:00.123456789Z``). ``datetime.fromisoformat``
        only accepts that form from Python 3.11 on, so the fraction is cut to
        microseconds and ``Z`` is rewritten as ``+00:00`` before parsing.

        Raises:
            ValueError: If ``raw`` is empty or not an ISO-8601 timestamp.
        """
        text = raw.strip()
        if text[-1:] in ("Z", "z"):
            text = text[:-1] + "+00:00"

        head, dot, tail = text.partition(".")
        if dot:
            digit_count = 0
            while digit_count < len(tail) and tail[digit_count].isdigit():
                digit_count += 1
            fraction = tail[:digit_count][:6].ljust(6, "0")
            text = f"{head}.{fraction}{tail[digit_count:]}"

        parsed = datetime.fromisoformat(text)
        if parsed.tzinfo is None:
            # NOTE(review): a timestamp without an offset is assumed to be UTC
            # so it can be compared with the aware "now" - confirm if needed.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
|