fix: add validation and rate limiting to ValuationCollector

- Add `time` import and `RATE_LIMIT_DELAY` constant for rate limiting between HTTP requests
- Add 1-second delay after OTP request to respect API rate limits
- Validate that the OTP response is not empty before using it
- Add CSV column structure validation with a required-columns check
- Add data quality check to skip records where all metrics are None
- Improve error handling and data integrity

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 23:56:05 +09:00 · 2026-02-02 23:56:05 +09:00 · 8cc2d3fa41
commit 8cc2d3fa41
parent 3e723b6146
1 changed file with 31 additions and 4 deletions
--- a/backend/app/services/collectors/valuation_collector.py
+++ b/backend/app/services/collectors/valuation_collector.py
@@ -2,6 +2,7 @@
 Valuation data collector from KRX.
 """
 import logging
+import time
 from io import BytesIO
 from datetime import datetime

@@ -28,6 +29,7 @@ class ValuationCollector(BaseCollector):
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    }
    REQUEST_TIMEOUT = 10
+    RATE_LIMIT_DELAY = 1

    def __init__(self, db: Session, biz_day: str = None):
        super().__init__(db)
@@ -71,9 +73,15 @@ class ValuationCollector(BaseCollector):
            )
            otp.raise_for_status()

+            otp_code = otp.text.strip()
+            if not otp_code:
+                raise RuntimeError("Received empty OTP from KRX API")
+
+            time.sleep(self.RATE_LIMIT_DELAY)
+
            response = requests.post(
                self.DOWN_URL,
-                data={"code": otp.text},
+                data={"code": otp_code},
                headers=self.HEADERS,
                timeout=self.REQUEST_TIMEOUT
            )
@@ -82,7 +90,18 @@ class ValuationCollector(BaseCollector):
            raise RuntimeError(f"Failed to fetch valuation data: {e}")

        df = pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
+
+        if df.empty:
+            logger.warning(f"Empty CSV response for {self.biz_day}")
+            return 0
+
        df.columns = df.columns.str.replace(" ", "")
+
+        required_cols = ["종목코드", "PER", "PBR", "배당수익률"]
+        missing_cols = [col for col in required_cols if col not in df.columns]
+        if missing_cols:
+            raise ValueError(f"Required columns missing from CSV: {missing_cols}")
+
        base_date = datetime.strptime(self.biz_day, "%Y%m%d").date()

        logger.info(f"Processing {len(df)} valuation records for {self.biz_day}")
@@ -93,14 +112,22 @@ class ValuationCollector(BaseCollector):
            if not ticker or pd.isna(ticker):
                continue

+            per = self._safe_float(row.get("PER"))
+            pbr = self._safe_float(row.get("PBR"))
+            dividend_yield = self._safe_float(row.get("배당수익률"))
+
+            # Skip records where all metrics are None
+            if all(v is None for v in [per, pbr, dividend_yield]):
+                continue
+
            records.append({
                "ticker": ticker,
                "base_date": base_date,
-                "per": self._safe_float(row.get("PER")),
-                "pbr": self._safe_float(row.get("PBR")),
+                "per": per,
+                "pbr": pbr,
                "psr": None,  # Not available from this endpoint
                "pcr": None,  # Not available from this endpoint
-                "dividend_yield": self._safe_float(row.get("배당수익률")),
+                "dividend_yield": dividend_yield,
            })

        if records: