fix: add validation and rate limiting to ValuationCollector

- Add time import and RATE_LIMIT_DELAY constant for rate limiting between HTTP requests
- Add 1-second delay after OTP request to respect API rate limits
- Strip whitespace from the OTP response and raise RuntimeError if it is empty before using it
- Validate the CSV structure: raise ValueError if any required column (종목코드, PER, PBR, 배당수익률) is missing
- Add data quality check to skip records where all metrics are None
- Improve error handling and data integrity: log a warning and return 0 when the CSV response is empty

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
zephyrdark 2026-02-02 23:56:05 +09:00
parent 3e723b6146
commit 8cc2d3fa41

View File

@ -2,6 +2,7 @@
Valuation data collector from KRX. Valuation data collector from KRX.
""" """
import logging import logging
import time
from io import BytesIO from io import BytesIO
from datetime import datetime from datetime import datetime
@ -28,6 +29,7 @@ class ValuationCollector(BaseCollector):
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
} }
REQUEST_TIMEOUT = 10 REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
def __init__(self, db: Session, biz_day: str = None): def __init__(self, db: Session, biz_day: str = None):
super().__init__(db) super().__init__(db)
@ -71,9 +73,15 @@ class ValuationCollector(BaseCollector):
) )
otp.raise_for_status() otp.raise_for_status()
otp_code = otp.text.strip()
if not otp_code:
raise RuntimeError("Received empty OTP from KRX API")
time.sleep(self.RATE_LIMIT_DELAY)
response = requests.post( response = requests.post(
self.DOWN_URL, self.DOWN_URL,
data={"code": otp.text}, data={"code": otp_code},
headers=self.HEADERS, headers=self.HEADERS,
timeout=self.REQUEST_TIMEOUT timeout=self.REQUEST_TIMEOUT
) )
@ -82,7 +90,18 @@ class ValuationCollector(BaseCollector):
raise RuntimeError(f"Failed to fetch valuation data: {e}") raise RuntimeError(f"Failed to fetch valuation data: {e}")
df = pd.read_csv(BytesIO(response.content), encoding="EUC-KR") df = pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
if df.empty:
logger.warning(f"Empty CSV response for {self.biz_day}")
return 0
df.columns = df.columns.str.replace(" ", "") df.columns = df.columns.str.replace(" ", "")
required_cols = ["종목코드", "PER", "PBR", "배당수익률"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Required columns missing from CSV: {missing_cols}")
base_date = datetime.strptime(self.biz_day, "%Y%m%d").date() base_date = datetime.strptime(self.biz_day, "%Y%m%d").date()
logger.info(f"Processing {len(df)} valuation records for {self.biz_day}") logger.info(f"Processing {len(df)} valuation records for {self.biz_day}")
@ -93,14 +112,22 @@ class ValuationCollector(BaseCollector):
if not ticker or pd.isna(ticker): if not ticker or pd.isna(ticker):
continue continue
per = self._safe_float(row.get("PER"))
pbr = self._safe_float(row.get("PBR"))
dividend_yield = self._safe_float(row.get("배당수익률"))
# Skip records where all metrics are None
if all(v is None for v in [per, pbr, dividend_yield]):
continue
records.append({ records.append({
"ticker": ticker, "ticker": ticker,
"base_date": base_date, "base_date": base_date,
"per": self._safe_float(row.get("PER")), "per": per,
"pbr": self._safe_float(row.get("PBR")), "pbr": pbr,
"psr": None, # Not available from this endpoint "psr": None, # Not available from this endpoint
"pcr": None, # Not available from this endpoint "pcr": None, # Not available from this endpoint
"dividend_yield": self._safe_float(row.get("배당수익률")), "dividend_yield": dividend_yield,
}) })
if records: if records: