galaxis-agent/agent/recovery.py

91 lines
3.0 KiB
Python
Raw Normal View History

"""서버 시작 시 복구 + 좀비 컨테이너 정리."""
from __future__ import annotations
import asyncio
import logging
from datetime import datetime, timezone
from agent.task_queue import PersistentTaskQueue
# Module-level logger, named after this module, used by every function below.
logger = logging.getLogger(__name__)
async def recover_on_startup(task_queue: PersistentTaskQueue) -> None:
    """Run the startup recovery sequence.

    First asks ``task_queue`` to move tasks that are still marked as running
    back to pending, logging how many were reset when that number is
    non-zero. Then removes leftover sandbox containers.

    Args:
        task_queue: Queue whose ``reset_running_to_pending()`` coroutine is
            awaited; its result is used as the number of tasks reset.
    """
    requeued = await task_queue.reset_running_to_pending()
    if requeued:
        logger.info("Recovery: reset %d running task(s) to pending", requeued)

    await _cleanup_zombie_containers()
async def _cleanup_zombie_containers() -> int:
    """Remove every container labelled ``galaxis-agent-sandbox``.

    The Docker SDK is synchronous and stopping a container may block for up
    to its stop timeout, so the work is delegated to the event loop's default
    executor instead of running on (and stalling) the event loop itself.

    Returns:
        The number of containers that were stopped and removed, or 0 when the
        Docker SDK / daemon cannot be used.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _remove_zombie_containers_blocking)


def _remove_zombie_containers_blocking() -> int:
    """Blocking worker: stop and remove all labelled sandbox containers.

    Best-effort by design: a failure on one container is logged and the
    remaining containers are still processed; a failure to import the SDK,
    connect, or list containers is logged and reported as 0 removals.
    """
    client = None
    cleaned = 0
    try:
        # Imported lazily so this module still loads when the Docker SDK is
        # not installed; the ImportError is handled like any other failure.
        import docker

        client = docker.from_env()
        containers = client.containers.list(
            filters={"label": "galaxis-agent-sandbox"}, all=True,
        )
        for container in containers:
            try:
                container.stop(timeout=10)
                container.remove(force=True)
            except Exception:
                # Keep going with the other containers, but record why this
                # one could not be removed.
                logger.warning(
                    "Recovery: failed to remove container %s",
                    container.name,
                    exc_info=True,
                )
            else:
                cleaned += 1
                logger.info("Recovery: removed zombie container %s", container.name)
        return cleaned
    except Exception:
        logger.debug(
            "Recovery: Docker not available, skipping container cleanup",
            exc_info=True,
        )
        return 0
    finally:
        # Release the client's HTTP connection pool; the original never
        # closed the client it created.
        if client is not None:
            try:
                client.close()
            except Exception:
                logger.debug("Recovery: failed to close Docker client", exc_info=True)
class ContainerCleaner:
    """Background job that periodically removes old sandbox containers.

    Containers carrying the ``galaxis-agent-sandbox`` label whose ``Created``
    timestamp is more than ``max_age_seconds`` in the past are stopped and
    removed. Without a Docker client the cleaner is a no-op.
    """

    def __init__(self, docker_client=None, max_age_seconds: int = 1200, interval_seconds: int = 1800):
        """Configure the cleaner.

        Args:
            docker_client: Object exposing ``containers.list(...)``; when
                falsy, ``cleanup_once`` does nothing and returns 0.
            max_age_seconds: Containers older than this many seconds are
                removed.
            interval_seconds: Pause between two cleanup passes of the
                background loop.
        """
        self._docker = docker_client
        self._max_age = max_age_seconds
        self._interval = interval_seconds
        self._running = False
        self._task: asyncio.Task | None = None

    async def start(self) -> None:
        """Start the background loop; a no-op if it is already running."""
        # Guard against a second start() orphaning the first loop task, which
        # stop() could then never cancel.
        if self._task is not None and not self._task.done():
            return
        self._running = True
        self._task = asyncio.create_task(self._loop())

    async def stop(self) -> None:
        """Cancel the background loop and wait for it to finish."""
        self._running = False
        task, self._task = self._task, None
        if task is None:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _loop(self) -> None:
        """Run ``cleanup_once`` repeatedly, sleeping ``interval_seconds`` between passes."""
        while self._running:
            try:
                await self.cleanup_once()
            except Exception:
                # One failed pass must not kill the periodic job.
                logger.exception("ContainerCleaner error")
            await asyncio.sleep(self._interval)

    async def cleanup_once(self) -> int:
        """Perform a single cleanup pass.

        The blocking Docker calls run in the event loop's default executor so
        that a slow ``stop`` does not stall other coroutines.

        Returns:
            The number of containers removed (0 when no client is configured).
        """
        if not self._docker:
            return 0
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._remove_expired)

    def _remove_expired(self) -> int:
        """Blocking worker: stop and remove every labelled container past max age."""
        now = datetime.now(timezone.utc)
        containers = self._docker.containers.list(
            filters={"label": "galaxis-agent-sandbox"}, all=True,
        )
        removed = 0
        for container in containers:
            name = getattr(container, "name", "unknown")
            try:
                created = self._parse_created(container.attrs.get("Created", ""))
            except Exception:
                logger.warning(
                    "Failed to parse creation time of container %s", name, exc_info=True,
                )
                continue
            if (now - created).total_seconds() <= self._max_age:
                continue
            try:
                container.stop(timeout=10)
                container.remove(force=True)
            except Exception:
                logger.warning(
                    "Failed to check/remove container %s", name, exc_info=True,
                )
                continue
            removed += 1
        return removed

    @staticmethod
    def _parse_created(created_str: str) -> datetime:
        """Parse a container ``Created`` timestamp into an aware ``datetime``.

        ``datetime.fromisoformat`` before Python 3.11 only accepts 3 or 6
        fractional-second digits and no ``Z`` suffix, so the fraction is
        normalised to exactly 6 digits (extra digits are truncated) and a
        trailing ``Z`` is rewritten as ``+00:00``.

        Raises:
            ValueError: If the text is not a parseable ISO-8601 timestamp.
        """
        text = created_str.strip()
        if text.endswith(("Z", "z")):
            text = text[:-1] + "+00:00"
        head, dot, rest = text.partition(".")
        if dot:
            digit_count = len(rest) - len(rest.lstrip("0123456789"))
            fraction = rest[:digit_count][:6].ljust(6, "0")
            text = f"{head}.{fraction}{rest[digit_count:]}"
        created = datetime.fromisoformat(text)
        if created.tzinfo is None:
            # NOTE(review): a timestamp without an offset is assumed to be
            # UTC so it can be compared with the aware "now" - confirm.
            created = created.replace(tzinfo=timezone.utc)
        return created