fix: collector error
All checks were successful
Deploy to Production / deploy (push) Successful in 1m8s

This commit is contained in:
zephyrdark 2026-02-08 22:48:35 +09:00
parent 9f756331c4
commit 72c72994b2
4 changed files with 26 additions and 21 deletions

View File

@ -1,18 +1,26 @@
"""
Base collector class for data collection jobs.
"""
import logging
import re
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Optional
import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from app.models.stock import JobLog
logger = logging.getLogger(__name__)
class BaseCollector(ABC):
"""Base class for all data collectors."""
REQUEST_TIMEOUT = 10
def __init__(self, db: Session):
if db is None:
raise ValueError("Database session cannot be None")
@ -20,6 +28,17 @@ class BaseCollector(ABC):
self.job_name = self.__class__.__name__
self.job_log: Optional[JobLog] = None
def _get_latest_biz_day(self) -> str:
    """Get the latest business day from Naver Finance.

    Requests the KOSPI index page and reads the text of the
    ``div.ly_realtime > span#time`` element, keeping only its digits.

    Returns:
        The first eight digits of that text (presumably ``YYYYMMDD`` --
        confirm against the live page format).

    Raises:
        RuntimeError: If the time element is missing or its text holds
            fewer than eight digits.
    """
    url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "lxml")
    time_elem = soup.select_one("div.ly_realtime > span#time")
    if time_elem:
        date_str = re.sub(r"[^0-9]", "", time_elem.text)
        # Never hand back a truncated or empty string as a "date": treat
        # it as a detection failure, same as a missing element.
        if len(date_str) >= 8:
            return date_str[:8]
    raise RuntimeError("Failed to detect latest business day from Naver Finance")
def start_job(self) -> JobLog:
"""Create a job log entry when starting."""
self.job_log = JobLog(

View File

@ -24,7 +24,7 @@ class SectorCollector(BaseCollector):
def __init__(self, db: Session, biz_day: str = None):
super().__init__(db)
self.biz_day = biz_day or datetime.now().strftime("%Y%m%d")
self.biz_day = biz_day or self._get_latest_biz_day()
self._validate_biz_day()
def _validate_biz_day(self) -> None:
@ -43,7 +43,7 @@ class SectorCollector(BaseCollector):
try:
response = requests.get(url, timeout=10)
data = response.json()
if "list" in data:
if "list" in data and data["list"]:
df = pd.json_normalize(data["list"])
all_data.append(df)
except (requests.RequestException, ValueError, KeyError) as e:

View File

@ -24,8 +24,8 @@ class StockCollector(BaseCollector):
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
HEADERS = {
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
@ -42,20 +42,6 @@ class StockCollector(BaseCollector):
except ValueError:
raise ValueError(f"Invalid biz_day format. Expected YYYYMMDD, got: {self.biz_day}")
def _get_latest_biz_day(self) -> str:
    """Get the latest business day from Naver Finance.

    Requests the KOSPI index page and reads the text of the
    ``div.ly_realtime > span#time`` element, keeping only its digits.
    This variant is best-effort: when no usable value is found it falls
    back to the current date from ``datetime.now()``.

    Returns:
        The first eight digits of the element text (presumably
        ``YYYYMMDD`` -- confirm against the live page format), or the
        current date formatted as ``%Y%m%d``.
    """
    from bs4 import BeautifulSoup
    import re

    url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "lxml")
    time_elem = soup.select_one("div.ly_realtime > span#time")
    if time_elem:
        date_str = re.sub(r"[^0-9]", "", time_elem.text)
        # Fewer than eight digits cannot be a full date; use the fallback
        # below instead of returning a truncated or empty string.
        if len(date_str) >= 8:
            return date_str[:8]
    return datetime.now().strftime("%Y%m%d")
def _fetch_stock_data(self, mkt_id: str) -> pd.DataFrame:
"""Fetch stock data for a specific market."""
gen_otp_data = {

View File

@ -25,15 +25,15 @@ class ValuationCollector(BaseCollector):
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
HEADERS = {
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
def __init__(self, db: Session, biz_day: str = None):
super().__init__(db)
self.biz_day = biz_day or datetime.now().strftime("%Y%m%d")
self.biz_day = biz_day or self._get_latest_biz_day()
self._validate_biz_day()
def _validate_biz_day(self) -> None: