"""
Base collector class for data collection jobs.
"""
import logging
import re
from abc import ABC, abstractmethod
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Optional, Generator

import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

from app.models.stock import JobLog

logger = logging.getLogger(__name__)

# NOTE(review): not referenced anywhere in this module; presumably the
# watchdog's staleness threshold for JobLog.last_heartbeat -- confirm.
HEARTBEAT_STALE_MINUTES = 30


class BaseCollector(ABC):
    """Base class for all data collectors.

    Subclasses implement ``collect()``; ``run()`` wraps it with JobLog
    bookkeeping (start, success/failure, orphan detection) on the
    SQLAlchemy session passed to the constructor.
    """

    # Passed as ``timeout=`` to ``requests.get`` (seconds).
    REQUEST_TIMEOUT = 10

    def __init__(self, db: Session):
        """Store the session and derive the job name from the class name.

        Raises:
            ValueError: If ``db`` is None.
        """
        if db is None:
            raise ValueError("Database session cannot be None")
        self.db = db
        self.job_name = self.__class__.__name__
        # Populated by start_job(); None until a job has been started.
        self.job_log: Optional[JobLog] = None

    def _safe_rollback(self) -> None:
        """Roll back the session, logging (not raising) if rollback itself fails.

        Used from error paths so that a failing rollback never replaces the
        exception that is actually being handled.
        """
        try:
            self.db.rollback()
        except Exception:
            logger.warning(
                "%s: session rollback failed", self.job_name, exc_info=True
            )

    def _get_latest_biz_day(self) -> str:
        """Get the latest business day from Naver Finance.

        Returns:
            The first eight digits of the page's realtime timestamp element
            (presumably a YYYYMMDD date -- the format is not validated here).

        Raises:
            RuntimeError: If the timestamp element is missing or contains
                fewer than eight digits.
        """
        url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, "lxml")
        time_elem = soup.select_one("div.ly_realtime > span#time")
        if time_elem is not None:
            date_str = re.sub(r"[^0-9]", "", time_elem.text)
            # Guard against an empty/partial timestamp: slicing alone would
            # silently hand callers a truncated (or empty) "date".
            if len(date_str) >= 8:
                return date_str[:8]
        raise RuntimeError("Failed to detect latest business day from Naver Finance")

    def start_job(self) -> JobLog:
        """Create a job log entry when starting.

        Returns:
            The newly committed JobLog with status "running".

        Raises:
            Exception: Whatever the session raises on add/commit; the session
                is rolled back and ``self.job_log`` reset to None first.
        """
        now = datetime.now(timezone.utc)
        self.job_log = JobLog(
            job_name=self.job_name,
            status="running",
            started_at=now,
            last_heartbeat=now,
        )
        try:
            self.db.add(self.job_log)
            self.db.commit()
        except Exception:
            self._safe_rollback()
            # The row was never persisted; do not let later calls mutate it.
            self.job_log = None
            raise
        return self.job_log

    def heartbeat(self) -> None:
        """Update last_heartbeat so watchdog knows the job is still alive.

        Best-effort: failures are logged and rolled back, never raised.
        Note that this commits ``self.db``, so any other pending changes on
        the same session are committed as well.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.last_heartbeat = datetime.now(timezone.utc)
            self.db.commit()
        except Exception:
            logger.warning(
                "%s: heartbeat update failed", self.job_name, exc_info=True
            )
            self._safe_rollback()

    def complete_job(self, records_count: int) -> None:
        """Mark job as completed.

        Args:
            records_count: Number of records collected, stored on the log.

        Raises:
            Exception: Re-raises any error from updating/committing the log
                after rolling the session back.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.status = "success"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.records_count = records_count
            self.db.commit()
        except Exception:
            self._safe_rollback()
            raise

    def fail_job(self, error_msg: str) -> None:
        """Mark job as failed.

        Args:
            error_msg: Description of the failure, stored on the log.

        Raises:
            Exception: Re-raises any error from updating/committing the log
                after rolling the session back.
        """
        if self.job_log is None:
            return
        try:
            self.job_log.status = "failed"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.error_msg = error_msg
            self.db.commit()
        except Exception:
            self._safe_rollback()
            raise

    def complete_if_running(self) -> None:
        """If the job is still 'running' in the DB, mark it failed_orphaned.

        Called in finally blocks to handle unexpected termination paths, so
        it must never raise: every failure is logged and swallowed.
        """
        if self.job_log is None:
            return
        try:
            # Kept inside the try: after a rollback the attribute is expired
            # and reading it triggers a DB refresh, which can itself fail.
            if self.job_log.status != "running":
                return
            self.job_log.status = "failed_orphaned"
            self.job_log.finished_at = datetime.now(timezone.utc)
            self.job_log.error_msg = "Job exited without explicit success/fail"
            self.db.commit()
        except Exception:
            logger.warning(
                "%s: could not mark job as failed_orphaned",
                self.job_name,
                exc_info=True,
            )
            self._safe_rollback()

    @abstractmethod
    def collect(self) -> int:
        """
        Perform the data collection.
        Returns the number of records collected.
        """
        pass

    def run(self) -> JobLog:
        """Execute the collection job with logging.

        Returns:
            The JobLog for this run.

        Raises:
            Exception: Any exception from ``collect()`` or ``complete_job()``
                is re-raised after the job has been marked failed
                (best-effort).
        """
        self.start_job()
        try:
            records = self.collect()
            self.complete_job(records)
        except Exception as e:
            try:
                # Some exceptions stringify to ""; fall back to the type name
                # so the log row always says something.
                self.fail_job(str(e) or type(e).__name__)
            except Exception:
                # Log update failed, but original exception is more important
                logger.exception(
                    "%s: failed to record job failure", self.job_name
                )
            raise
        finally:
            self.complete_if_running()
        return self.job_log