
"""
Base collector class for data collection jobs.
"""
import logging
import re
from abc import ABC, abstractmethod
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Optional, Generator
import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from app.models.stock import JobLog
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): not referenced in the visible part of this file. Presumably the
# number of minutes without a heartbeat after which a "running" job is treated
# as stale -- confirm against whatever watchdog consumes it.
HEARTBEAT_STALE_MINUTES = 30
class BaseCollector(ABC):
    """Base class for all data collectors.

    A collector wraps one data-collection job. It writes a ``JobLog`` row when
    the job starts, can refresh that row's heartbeat while the job runs, and
    stores the final status when the job ends. Subclasses implement
    :meth:`collect`; :meth:`run` drives the whole lifecycle.
    """

    # Timeout passed to ``requests.get`` (seconds).
    REQUEST_TIMEOUT = 10

    def __init__(self, db: Session):
        """Bind the collector to a database session.

        Args:
            db: Session used for every job-log write.

        Raises:
            ValueError: If ``db`` is ``None``.
        """
        if db is None:
            raise ValueError("Database session cannot be None")
        self.db = db
        # The job name is the concrete subclass name.
        self.job_name = self.__class__.__name__
        self.job_log: Optional[JobLog] = None

    def _get_latest_biz_day(self) -> str:
        """Get the latest business day from Naver Finance.

        Returns:
            The first eight digits found in the page's ``span#time`` element
            (presumably ``YYYYMMDD`` -- confirm against the live page).

        Raises:
            RuntimeError: If the element is missing or holds fewer than eight
                digits, so callers never receive a truncated date string.
        """
        url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, "lxml")
        time_elem = soup.select_one("div.ly_realtime > span#time")
        if time_elem is not None:
            digits = re.sub(r"[^0-9]", "", time_elem.text)
            if len(digits) >= 8:
                return digits[:8]
        raise RuntimeError("Failed to detect latest business day from Naver Finance")

    def start_job(self) -> JobLog:
        """Create a job log entry when starting.

        Returns:
            The newly committed ``JobLog`` (also stored on ``self.job_log``).

        Raises:
            Exception: Whatever the commit raised; the session is rolled back
                first and ``self.job_log`` is left untouched.
        """
        # One timestamp so started_at and last_heartbeat agree exactly.
        now = datetime.now(timezone.utc)
        job_log = JobLog(
            job_name=self.job_name,
            status="running",
            started_at=now,
            last_heartbeat=now,
        )
        self.db.add(job_log)
        try:
            self.db.commit()
        except Exception:
            self.db.rollback()
            raise
        # Only expose the log once it has actually been persisted.
        self.job_log = job_log
        return job_log

    def heartbeat(self) -> None:
        """Update last_heartbeat so watchdog knows the job is still alive.

        Best-effort: a failed commit is rolled back and logged, never raised,
        so a heartbeat problem cannot abort the collection itself.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.last_heartbeat = datetime.now(timezone.utc)
            self.db.commit()
        except Exception:
            self.db.rollback()
            logger.warning(
                "%s: heartbeat update failed", self.job_name, exc_info=True
            )

    def complete_job(self, records_count: int) -> None:
        """Mark job as completed.

        Args:
            records_count: Number of records the job collected.

        Raises:
            Exception: Whatever the commit raised, after rolling back.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.status = "success"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.records_count = records_count
            self.db.commit()
        except Exception:
            self.db.rollback()
            raise

    def fail_job(self, error_msg: str) -> None:
        """Mark job as failed.

        Args:
            error_msg: Text stored in the job log's ``error_msg`` field.

        Raises:
            Exception: Whatever the commit raised, after rolling back.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.status = "failed"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.error_msg = error_msg
            self.db.commit()
        except Exception:
            self.db.rollback()
            raise

    def complete_if_running(self) -> None:
        """Mark the job ``failed_orphaned`` if its log still reads 'running'.

        Called in finally blocks to handle unexpected termination paths.
        Best-effort: a failed commit is rolled back and logged, never raised.
        """
        if self.job_log is None or self.job_log.status != "running":
            return
        try:
            self.job_log.status = "failed_orphaned"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.error_msg = "Job exited without explicit success/fail"
            self.db.commit()
        except Exception:
            self.db.rollback()
            logger.warning(
                "%s: could not mark job as failed_orphaned",
                self.job_name,
                exc_info=True,
            )

    @abstractmethod
    def collect(self) -> int:
        """
        Perform the data collection.
        Returns the number of records collected.
        """

    def _record_failure(self, error_msg: str) -> None:
        """Store a failure in the job log without ever raising."""
        # If collect() left the session needing a rollback (for example after
        # a failed flush), the first commit fails. fail_job() rolls back on
        # its way out, so one retry runs on a clean session and keeps the real
        # error message instead of falling through to "failed_orphaned".
        for is_last_attempt in (False, True):
            try:
                self.fail_job(error_msg)
            except Exception:
                if is_last_attempt:
                    logger.exception(
                        "%s: could not record job failure", self.job_name
                    )
            else:
                return

    def run(self) -> JobLog:
        """Execute the collection job with logging.

        Returns:
            The job log for this run.

        Raises:
            Exception: Whatever ``collect()`` or ``complete_job()`` raised; it
                is re-raised after the failure has been recorded.
        """
        self.start_job()
        try:
            records = self.collect()
            self.complete_job(records)
        except Exception as exc:
            # Recording the failure is secondary; the original error wins.
            self._record_failure(str(exc))
            raise
        finally:
            # Covers exits that bypass both branches above (e.g. SystemExit).
            self.complete_if_running()
        return self.job_log