fix: add validation and rate limiting to ValuationCollector

- Add time import and RATE_LIMIT_DELAY constant for rate limiting between HTTP requests
- Add 1-second delay after OTP request to respect API rate limits
- Validate that the OTP response is not empty before using it (raise RuntimeError otherwise)
- Add CSV column structure validation with required columns check
- Add data quality check to skip records where all metrics (PER, PBR, dividend yield) are None
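- Improve error handling and data integrity: fail fast with RuntimeError on an empty OTP and ValueError on missing CSV columns instead of processing bad data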

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
zephyrdark 2026-02-02 23:56:05 +09:00
parent 3e723b6146
commit 8cc2d3fa41

View File

@ -2,6 +2,7 @@
Valuation data collector from KRX.
"""
import logging
import time
from io import BytesIO
from datetime import datetime
@ -28,6 +29,7 @@ class ValuationCollector(BaseCollector):
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
def __init__(self, db: Session, biz_day: str = None):
super().__init__(db)
@ -71,9 +73,15 @@ class ValuationCollector(BaseCollector):
)
otp.raise_for_status()
otp_code = otp.text.strip()
if not otp_code:
raise RuntimeError("Received empty OTP from KRX API")
time.sleep(self.RATE_LIMIT_DELAY)
response = requests.post(
self.DOWN_URL,
data={"code": otp.text},
data={"code": otp_code},
headers=self.HEADERS,
timeout=self.REQUEST_TIMEOUT
)
@ -82,7 +90,18 @@ class ValuationCollector(BaseCollector):
raise RuntimeError(f"Failed to fetch valuation data: {e}")
df = pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
if df.empty:
logger.warning(f"Empty CSV response for {self.biz_day}")
return 0
df.columns = df.columns.str.replace(" ", "")
required_cols = ["종목코드", "PER", "PBR", "배당수익률"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Required columns missing from CSV: {missing_cols}")
base_date = datetime.strptime(self.biz_day, "%Y%m%d").date()
logger.info(f"Processing {len(df)} valuation records for {self.biz_day}")
@ -93,14 +112,22 @@ class ValuationCollector(BaseCollector):
if not ticker or pd.isna(ticker):
continue
per = self._safe_float(row.get("PER"))
pbr = self._safe_float(row.get("PBR"))
dividend_yield = self._safe_float(row.get("배당수익률"))
# Skip records where all metrics are None
if all(v is None for v in [per, pbr, dividend_yield]):
continue
records.append({
"ticker": ticker,
"base_date": base_date,
"per": self._safe_float(row.get("PER")),
"pbr": self._safe_float(row.get("PBR")),
"per": per,
"pbr": pbr,
"psr": None, # Not available from this endpoint
"pcr": None, # Not available from this endpoint
"dividend_yield": self._safe_float(row.get("배당수익률")),
"dividend_yield": dividend_yield,
})
if records: