2026-05-13 22:05:09 +09:00

185 lines
6.9 KiB
Python

"""
ETF master data collector from KRX.
Uses KRX Open API when KRX_OPENAPI_KEY is set, falls back to pykrx scraping.
"""
import logging
import time
from json import JSONDecodeError
import pandas as pd
from sqlalchemy.orm import Session
from sqlalchemy.dialects.postgresql import insert
from app.services.collectors.base import BaseCollector
from app.services.krx_client import get_krx_client
from app.models.stock import ETF, AssetClass
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class KRXDataError(Exception):
    """Signal that KRX returned invalid or empty data.

    Typical triggers named by the raising code are an empty response
    (KRX may require a login) or a fetch that keeps failing after retries.
    """
class ETFCollector(BaseCollector):
    """Collect ETF master data from KRX and upsert it into the ETF table.

    The KRX Open API is tried first when ``get_krx_client()`` returns a
    client; if there is no client, or the Open API call fails, the data is
    scraped through pykrx instead.
    """

    # Name fragments that mark a commodity ETF as a gold product.
    # NOTE(review): the first keyword previously arrived as an empty string
    # literal, which matches every name; "금" (gold) is the presumed intended
    # value - confirm. Names containing "금속" (metal) also contain "금";
    # decide whether those should be excluded.
    _GOLD_KEYWORDS = ("금", "골드", "gold")

    def __init__(self, db: Session):
        """Store the database session by delegating to the base collector."""
        super().__init__(db)

    def _classify_asset_class(self, asset_class_str: str, name: str) -> str:
        """Map a KRX asset-class label (and the ETF name) to an AssetClass value.

        Args:
            asset_class_str: Asset-class label from KRX; may be empty or NaN.
            name: ETF name, consulted only for commodity ETFs to detect gold.

        Returns:
            The ``.value`` of the matching ``AssetClass`` member; MIXED when
            the label is missing or not recognized.
        """
        if not asset_class_str or pd.isna(asset_class_str):
            return AssetClass.MIXED.value

        # Coerce so a non-string label cannot break the substring checks.
        label = str(asset_class_str)
        if "주식" in label:  # stock / equity
            return AssetClass.EQUITY.value
        if "채권" in label:  # bond
            return AssetClass.BOND.value
        if "부동산" in label:  # real estate, bucketed with equity here
            return AssetClass.EQUITY.value
        if "원자재" in label:  # commodity: only gold gets its own class
            name_lower = str(name).lower() if name else ""
            if any(keyword in name_lower for keyword in self._GOLD_KEYWORDS):
                return AssetClass.GOLD.value
        return AssetClass.MIXED.value

    def _get_openapi_ticker(self, row) -> str:
        """Extract the ticker from a KRX row, zero-padded to 6 digits.

        ``ISU_SRT_CD`` is preferred and ``ISU_CD`` is the fallback; a key
        whose value is missing, NaN or blank is skipped so the fallback is
        still consulted.

        Args:
            row: Mapping-like row (e.g. a pandas Series) supporting ``.get``.

        Returns:
            The ticker string, or ``""`` when neither column has a value.
            All-digit codes are left-padded with zeros to 6 characters;
            other codes are returned stripped but otherwise unchanged.
        """
        for key in ("ISU_SRT_CD", "ISU_CD"):
            raw = row.get(key)
            if raw is None or pd.isna(raw):
                continue
            if isinstance(raw, float):
                # Numeric columns lose leading zeros and gain ".0"; undo both.
                return str(int(raw)).zfill(6)
            ticker = str(raw).strip()
            if ticker:
                return ticker.zfill(6) if ticker.isdigit() else ticker
        return ""

    def _get_name(self, row) -> str:
        """Return the ETF name from ``ISU_ABBRV``, falling back to ``ISU_NM``.

        Returns ``""`` when neither column holds a usable value.
        """
        for key in ("ISU_ABBRV", "ISU_NM"):
            raw = row.get(key)
            if raw is None or pd.isna(raw):
                continue
            name = str(raw)
            if name:
                return name
        return ""

    def _fetch_etf_data_openapi(self) -> pd.DataFrame:
        """Fetch ETF data via the KRX Open API for the latest business day.

        Raises:
            KRXDataError: If no client is available or the response is empty.
        """
        client = get_krx_client()
        if not client:
            raise KRXDataError("KRX Open API client is not configured")
        # _get_latest_biz_day is not defined in this class; presumably
        # inherited from BaseCollector.
        biz_day = self._get_latest_biz_day()
        df = client.get_etf_daily(biz_day)
        if df is None or df.empty:
            raise KRXDataError("KRX Open API returned empty ETF data")
        return df

    def _fetch_etf_data_pykrx(self) -> pd.DataFrame:
        """Fetch ETF data via pykrx scraping, retrying once on failure.

        Only decode/connection/value/key errors are retried. An empty
        response is reported immediately without a retry.

        Raises:
            KRXDataError: If the response is empty, or if both attempts fail
                (chained to the last underlying exception).
        """
        # Imported here rather than at module top, as in the original code.
        from pykrx.website.krx.etx.core import ETF_전종목기본종목

        last_exc = None
        for attempt in range(2):
            try:
                df = ETF_전종목기본종목().fetch()
            # NOTE(review): this is the built-in ConnectionError; if pykrx
            # raises its HTTP library's own connection error type, that
            # would not be caught or retried here - confirm.
            except (JSONDecodeError, ConnectionError, ValueError, KeyError) as e:
                last_exc = e
                if attempt == 0:
                    logger.warning(
                        "ETF fetch failed (attempt 1/2), retrying in 3s: %s", e
                    )
                    time.sleep(3)
                continue
            if df is None or df.empty:
                raise KRXDataError(
                    "KRX returned empty ETF data (login may be required)"
                )
            return df

        error_msg = f"ETF fetch failed after 2 attempts: {last_exc}"
        if isinstance(last_exc, JSONDecodeError):
            error_msg += " (KRX may require login — set KRX_ID/KRX_PW env vars)"
        logger.error(error_msg)
        raise KRXDataError(error_msg) from last_exc

    def _fetch_etf_data(self) -> pd.DataFrame:
        """Fetch ETF data, preferring the Open API and falling back to pykrx.

        The returned frame does not say which source produced it;
        ``collect`` does not call this method and picks the parser itself.
        """
        client = get_krx_client()
        if client:
            try:
                logger.info("Fetching ETF data via KRX Open API")
                return self._fetch_etf_data_openapi()
            except Exception as e:
                logger.warning(
                    "KRX Open API failed, falling back to pykrx: %s", e
                )
        logger.info("Fetching ETF data via pykrx (scraping)")
        return self._fetch_etf_data_pykrx()

    def _parse_openapi_records(self, df: pd.DataFrame) -> list[dict]:
        """Turn an Open API DataFrame into ETF record dicts.

        Only ticker and name are read from the frame; ``asset_class``,
        ``market`` and ``expense_ratio`` are filled with placeholders
        (MIXED, "" and None). Rows without a ticker or name are skipped.
        """
        records = []
        for _, row in df.iterrows():
            ticker = self._get_openapi_ticker(row)
            name = self._get_name(row)
            if not ticker or not name:
                continue
            records.append({
                "ticker": ticker,
                "name": name,
                "asset_class": AssetClass.MIXED.value,
                "market": "",
                "expense_ratio": None,
            })
        return records

    def _parse_pykrx_records(self, df: pd.DataFrame) -> list[dict]:
        """Turn a pykrx DataFrame into fully populated ETF record dicts.

        Rows without a ticker or name are skipped. An expense ratio that is
        missing or cannot be converted to ``float`` becomes ``None``.
        """
        records = []
        for _, row in df.iterrows():
            ticker = self._get_openapi_ticker(row)
            name = self._get_name(row)
            if not ticker or not name:
                continue

            asset_class_str = row.get("IDX_ASST_CLSS_NM", "")
            market = row.get("IDX_MKT_CLSS_NM", "")

            expense_ratio = None
            expense_ratio_raw = row.get("ETF_TOT_FEE")
            # Explicit None/NaN test so a genuine fee of 0 is not discarded.
            if expense_ratio_raw is not None and not pd.isna(expense_ratio_raw):
                try:
                    expense_ratio = float(expense_ratio_raw)
                except (ValueError, TypeError):
                    # Unparseable fee: keep the row, leave the ratio unset.
                    expense_ratio = None

            records.append({
                "ticker": ticker,
                "name": name,
                "asset_class": self._classify_asset_class(asset_class_str, name),
                "market": market if market and not pd.isna(market) else "",
                "expense_ratio": expense_ratio,
            })
        return records

    def _load_records(self) -> tuple[list[dict], bool]:
        """Fetch and parse ETF records from the best available source.

        Returns:
            ``(records, has_details)`` where ``has_details`` is True only for
            pykrx data, i.e. when asset class, market and expense ratio are
            real values rather than Open API placeholders.
        """
        client = get_krx_client()
        if client:
            try:
                logger.info("Fetching ETF data via KRX Open API")
                df = self._fetch_etf_data_openapi()
                return self._parse_openapi_records(df), False
            except Exception as e:
                logger.warning(
                    "KRX Open API failed, falling back to pykrx: %s", e
                )
        else:
            logger.info("Fetching ETF data via pykrx (scraping)")
        df = self._fetch_etf_data_pykrx()
        return self._parse_pykrx_records(df), True

    def collect(self) -> int:
        """Collect ETF master data and upsert it keyed on ``ticker``.

        Returns:
            Number of distinct ETF records written.

        Raises:
            KRXDataError: If the pykrx source fails or returns no data.
            Exception: Database errors are re-raised after a rollback.
        """
        records, has_details = self._load_records()

        # PostgreSQL rejects an ON CONFLICT DO UPDATE statement that would
        # touch the same row twice, so keep one record per ticker (last wins).
        records = list({record["ticker"]: record for record in records}.values())

        if records:
            stmt = insert(ETF).values(records)
            update_values = {"name": stmt.excluded.name}
            if has_details:
                # Only overwrite detail columns when the source really has
                # them; Open API rows carry placeholders that would erase
                # previously stored values.
                update_values.update(
                    asset_class=stmt.excluded.asset_class,
                    market=stmt.excluded.market,
                    expense_ratio=stmt.excluded.expense_ratio,
                )
            stmt = stmt.on_conflict_do_update(
                index_elements=["ticker"],
                set_=update_values,
            )
            try:
                self.db.execute(stmt)
                self.db.commit()
            except Exception:
                # Leave the session usable for the caller, then propagate.
                self.db.rollback()
                raise

        logger.info("Collected %d ETF records", len(records))
        return len(records)