From 72c72994b2d2b0743f94490a6f94d2ae5b5058aa Mon Sep 17 00:00:00 2001 From: zephyrdark Date: Sun, 8 Feb 2026 22:48:35 +0900 Subject: [PATCH] fix: collector error --- backend/app/services/collectors/base.py | 19 +++++++++++++++++++ .../services/collectors/sector_collector.py | 4 ++-- .../services/collectors/stock_collector.py | 18 ++---------------- .../collectors/valuation_collector.py | 6 +++--- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/backend/app/services/collectors/base.py b/backend/app/services/collectors/base.py index 6d54155..5b803a7 100644 --- a/backend/app/services/collectors/base.py +++ b/backend/app/services/collectors/base.py @@ -1,18 +1,26 @@ """ Base collector class for data collection jobs. """ +import logging +import re from abc import ABC, abstractmethod from datetime import datetime from typing import Optional +import requests +from bs4 import BeautifulSoup from sqlalchemy.orm import Session from app.models.stock import JobLog +logger = logging.getLogger(__name__) + class BaseCollector(ABC): """Base class for all data collectors.""" + REQUEST_TIMEOUT = 10 + def __init__(self, db: Session): if db is None: raise ValueError("Database session cannot be None") @@ -20,6 +28,17 @@ class BaseCollector(ABC): self.job_name = self.__class__.__name__ self.job_log: Optional[JobLog] = None + def _get_latest_biz_day(self) -> str: + """Get the latest business day from Naver Finance.""" + url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI" + response = requests.get(url, timeout=self.REQUEST_TIMEOUT) + soup = BeautifulSoup(response.content, "lxml") + time_elem = soup.select_one("div.ly_realtime > span#time") + if time_elem: + date_str = re.sub(r"[^0-9]", "", time_elem.text) + return date_str[:8] + raise RuntimeError("Failed to detect latest business day from Naver Finance") + def start_job(self) -> JobLog: """Create a job log entry when starting.""" self.job_log = JobLog( diff --git a/backend/app/services/collectors/sector_collector.py b/backend/app/services/collectors/sector_collector.py index dd17f22..c45607f 100644 --- a/backend/app/services/collectors/sector_collector.py +++ b/backend/app/services/collectors/sector_collector.py @@ -24,7 +24,7 @@ class SectorCollector(BaseCollector): def __init__(self, db: Session, biz_day: str = None): super().__init__(db) - self.biz_day = biz_day or datetime.now().strftime("%Y%m%d") + self.biz_day = biz_day or self._get_latest_biz_day() self._validate_biz_day() def _validate_biz_day(self) -> None: @@ -43,7 +43,7 @@ class SectorCollector(BaseCollector): try: response = requests.get(url, timeout=10) data = response.json() - if "list" in data: + if "list" in data and data["list"]: df = pd.json_normalize(data["list"]) all_data.append(df) except (requests.RequestException, ValueError, KeyError) as e: diff --git a/backend/app/services/collectors/stock_collector.py b/backend/app/services/collectors/stock_collector.py index b2c88db..028ae0e 100644 --- a/backend/app/services/collectors/stock_collector.py +++ b/backend/app/services/collectors/stock_collector.py @@ -24,8 +24,8 @@ class StockCollector(BaseCollector): GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd" DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd" HEADERS = { - "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", } REQUEST_TIMEOUT = 10 RATE_LIMIT_DELAY = 1 @@ -42,20 +42,6 @@ class StockCollector(BaseCollector): except ValueError: raise ValueError(f"Invalid biz_day format. Expected YYYYMMDD, got: {self.biz_day}") - def _get_latest_biz_day(self) -> str: - """Get the latest business day from Naver Finance.""" - from bs4 import BeautifulSoup - import re - - url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI" - response = requests.get(url, timeout=self.REQUEST_TIMEOUT) - soup = BeautifulSoup(response.content, "lxml") - time_elem = soup.select_one("div.ly_realtime > span#time") - if time_elem: - date_str = re.sub(r"[^0-9]", "", time_elem.text) - return date_str[:8] - return datetime.now().strftime("%Y%m%d") - def _fetch_stock_data(self, mkt_id: str) -> pd.DataFrame: """Fetch stock data for a specific market.""" gen_otp_data = { diff --git a/backend/app/services/collectors/valuation_collector.py b/backend/app/services/collectors/valuation_collector.py index e65283c..8a953cc 100644 --- a/backend/app/services/collectors/valuation_collector.py +++ b/backend/app/services/collectors/valuation_collector.py @@ -25,15 +25,15 @@ class ValuationCollector(BaseCollector): GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd" DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd" HEADERS = { - "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", } REQUEST_TIMEOUT = 10 RATE_LIMIT_DELAY = 1 def __init__(self, db: Session, biz_day: str = None): super().__init__(db) - self.biz_day = biz_day or datetime.now().strftime("%Y%m%d") + self.biz_day = biz_day or self._get_latest_biz_day() self._validate_biz_day() def _validate_biz_day(self) -> None: