fix: collector error
All checks were successful
Deploy to Production / deploy (push) Successful in 1m8s
All checks were successful
Deploy to Production / deploy (push) Successful in 1m8s
This commit is contained in:
parent
9f756331c4
commit
72c72994b2
@ -1,18 +1,26 @@
|
|||||||
"""
|
"""
|
||||||
Base collector class for data collection jobs.
|
Base collector class for data collection jobs.
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from app.models.stock import JobLog
|
from app.models.stock import JobLog
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseCollector(ABC):
|
class BaseCollector(ABC):
|
||||||
"""Base class for all data collectors."""
|
"""Base class for all data collectors."""
|
||||||
|
|
||||||
|
REQUEST_TIMEOUT = 10
|
||||||
|
|
||||||
def __init__(self, db: Session):
|
def __init__(self, db: Session):
|
||||||
if db is None:
|
if db is None:
|
||||||
raise ValueError("Database session cannot be None")
|
raise ValueError("Database session cannot be None")
|
||||||
@ -20,6 +28,17 @@ class BaseCollector(ABC):
|
|||||||
self.job_name = self.__class__.__name__
|
self.job_name = self.__class__.__name__
|
||||||
self.job_log: Optional[JobLog] = None
|
self.job_log: Optional[JobLog] = None
|
||||||
|
|
||||||
|
def _get_latest_biz_day(self) -> str:
    """Get the latest business day from Naver Finance.

    Fetches the KOSPI index page, reads the text of the timestamp element
    (``div.ly_realtime > span#time``), strips every non-digit character and
    returns the first eight digits.

    Returns:
        The first eight digits of the timestamp text (presumably a
        YYYYMMDD date -- confirm against the live page format).

    Raises:
        requests.RequestException: If the HTTP request itself fails or
            times out (propagated from ``requests.get``).
        RuntimeError: If the timestamp element is missing, or its text
            contains fewer than eight digits.
    """
    url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "lxml")
    time_elem = soup.select_one("div.ly_realtime > span#time")
    if time_elem:
        date_str = re.sub(r"[^0-9]", "", time_elem.text)
        # A truncated or empty digit string must not be returned: it would
        # only fail later, far from the actual cause. Treat it like a
        # missing element and fall through to the error below.
        if len(date_str) >= 8:
            return date_str[:8]
    raise RuntimeError("Failed to detect latest business day from Naver Finance")
|
||||||
|
|
||||||
def start_job(self) -> JobLog:
|
def start_job(self) -> JobLog:
|
||||||
"""Create a job log entry when starting."""
|
"""Create a job log entry when starting."""
|
||||||
self.job_log = JobLog(
|
self.job_log = JobLog(
|
||||||
|
|||||||
@ -24,7 +24,7 @@ class SectorCollector(BaseCollector):
|
|||||||
|
|
||||||
def __init__(self, db: Session, biz_day: str = None):
|
def __init__(self, db: Session, biz_day: str = None):
|
||||||
super().__init__(db)
|
super().__init__(db)
|
||||||
self.biz_day = biz_day or datetime.now().strftime("%Y%m%d")
|
self.biz_day = biz_day or self._get_latest_biz_day()
|
||||||
self._validate_biz_day()
|
self._validate_biz_day()
|
||||||
|
|
||||||
def _validate_biz_day(self) -> None:
|
def _validate_biz_day(self) -> None:
|
||||||
@ -43,7 +43,7 @@ class SectorCollector(BaseCollector):
|
|||||||
try:
|
try:
|
||||||
response = requests.get(url, timeout=10)
|
response = requests.get(url, timeout=10)
|
||||||
data = response.json()
|
data = response.json()
|
||||||
if "list" in data:
|
if "list" in data and data["list"]:
|
||||||
df = pd.json_normalize(data["list"])
|
df = pd.json_normalize(data["list"])
|
||||||
all_data.append(df)
|
all_data.append(df)
|
||||||
except (requests.RequestException, ValueError, KeyError) as e:
|
except (requests.RequestException, ValueError, KeyError) as e:
|
||||||
|
|||||||
@ -24,8 +24,8 @@ class StockCollector(BaseCollector):
|
|||||||
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
|
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
|
||||||
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
|
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader",
|
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
REQUEST_TIMEOUT = 10
|
REQUEST_TIMEOUT = 10
|
||||||
RATE_LIMIT_DELAY = 1
|
RATE_LIMIT_DELAY = 1
|
||||||
@ -42,20 +42,6 @@ class StockCollector(BaseCollector):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError(f"Invalid biz_day format. Expected YYYYMMDD, got: {self.biz_day}")
|
raise ValueError(f"Invalid biz_day format. Expected YYYYMMDD, got: {self.biz_day}")
|
||||||
|
|
||||||
def _get_latest_biz_day(self) -> str:
|
|
||||||
"""Get the latest business day from Naver Finance."""
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import re
|
|
||||||
|
|
||||||
url = "https://finance.naver.com/sise/sise_index.naver?code=KOSPI"
|
|
||||||
response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
|
|
||||||
soup = BeautifulSoup(response.content, "lxml")
|
|
||||||
time_elem = soup.select_one("div.ly_realtime > span#time")
|
|
||||||
if time_elem:
|
|
||||||
date_str = re.sub(r"[^0-9]", "", time_elem.text)
|
|
||||||
return date_str[:8]
|
|
||||||
return datetime.now().strftime("%Y%m%d")
|
|
||||||
|
|
||||||
def _fetch_stock_data(self, mkt_id: str) -> pd.DataFrame:
|
def _fetch_stock_data(self, mkt_id: str) -> pd.DataFrame:
|
||||||
"""Fetch stock data for a specific market."""
|
"""Fetch stock data for a specific market."""
|
||||||
gen_otp_data = {
|
gen_otp_data = {
|
||||||
|
|||||||
@ -25,15 +25,15 @@ class ValuationCollector(BaseCollector):
|
|||||||
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
|
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
|
||||||
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
|
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader",
|
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
REQUEST_TIMEOUT = 10
|
REQUEST_TIMEOUT = 10
|
||||||
RATE_LIMIT_DELAY = 1
|
RATE_LIMIT_DELAY = 1
|
||||||
|
|
||||||
def __init__(self, db: Session, biz_day: str = None):
|
def __init__(self, db: Session, biz_day: str = None):
|
||||||
super().__init__(db)
|
super().__init__(db)
|
||||||
self.biz_day = biz_day or datetime.now().strftime("%Y%m%d")
|
self.biz_day = biz_day or self._get_latest_biz_day()
|
||||||
self._validate_biz_day()
|
self._validate_biz_day()
|
||||||
|
|
||||||
def _validate_biz_day(self) -> None:
|
def _validate_biz_day(self) -> None:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user