185 lines
6.9 KiB
Python
185 lines
6.9 KiB
Python
"""
ETF master data collector from KRX.

Uses the KRX Open API when KRX_OPENAPI_KEY is set, and falls back to pykrx
scraping when it is not set or the Open API call fails.
"""
|
|
import logging
|
|
import time
|
|
from json import JSONDecodeError
|
|
|
|
import pandas as pd
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy.dialects.postgresql import insert
|
|
|
|
from app.services.collectors.base import BaseCollector
|
|
from app.services.krx_client import get_krx_client
|
|
from app.models.stock import ETF, AssetClass
|
|
|
|
# Module-level logger named after this module; the collector below reports
# fetch fallbacks, retries and the number of collected records through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class KRXDataError(Exception):
    """Signal that KRX handed back data that cannot be used.

    Raised when a KRX response is invalid or empty, for example because a
    login is required or the server is down.
    """
|
|
|
|
|
|
class ETFCollector(BaseCollector):
    """Collect ETF master data from KRX and upsert it into the ``ETF`` table.

    Two sources are supported:

    * the KRX Open API (used when ``get_krx_client()`` returns a client), which
      only yields ticker and name here, so the remaining columns are filled
      with placeholders;
    * pykrx scraping (used when no client is available or the Open API path
      fails), which also yields asset class, market and total fee.
    """

    def __init__(self, db: Session):
        """Store the SQLAlchemy session by delegating to ``BaseCollector``."""
        super().__init__(db)

    def _classify_asset_class(self, asset_class_str: str, name: str) -> str:
        """Map a KRX asset-class label (and ETF name) to an ``AssetClass`` value.

        Args:
            asset_class_str: KRX asset-class label; may be empty, ``None`` or NaN.
            name: ETF name, only consulted for commodity ETFs to detect gold.

        Returns:
            The ``.value`` of the matching ``AssetClass`` member; ``MIXED`` when
            the label is missing, not a string, or not recognised.
        """
        # A single isinstance check covers None, NaN and any other non-string
        # value that would make the substring tests below raise TypeError.
        if not isinstance(asset_class_str, str) or not asset_class_str:
            return AssetClass.MIXED.value

        if "주식" in asset_class_str:  # equity
            return AssetClass.EQUITY.value
        if "채권" in asset_class_str:  # bond
            return AssetClass.BOND.value
        if "부동산" in asset_class_str:  # real estate is bucketed with equity
            return AssetClass.EQUITY.value
        if "원자재" in asset_class_str:  # commodity: only gold has its own class
            etf_name = name if isinstance(name, str) else ""
            if "금" in etf_name or "골드" in etf_name:
                return AssetClass.GOLD.value
        return AssetClass.MIXED.value

    def _get_openapi_ticker(self, row) -> str:
        """Extract the ticker from a KRX row, zero-padded to 6 digits if numeric.

        Despite the name, both parsers in this class use it.

        Args:
            row: Mapping-like row (anything with ``.get``) holding
                ``ISU_SRT_CD`` or, when that key is absent, ``ISU_CD``.

        Returns:
            The ticker string, or ``""`` when the value is missing/NaN.
        """
        ticker = row.get("ISU_SRT_CD", row.get("ISU_CD", ""))
        if pd.isna(ticker):
            return ""
        if isinstance(ticker, float):
            # Codes parsed as floats (e.g. 69500.0) lose their leading zeros.
            return str(int(ticker)).zfill(6)
        ticker = str(ticker).strip()
        return ticker.zfill(6) if ticker.isdigit() else ticker

    def _fetch_etf_data_openapi(self) -> pd.DataFrame:
        """Fetch the ETF daily table for the latest business day via the Open API.

        Raises:
            KRXDataError: If no Open API client is available or the response
                is ``None``/empty.
        """
        client = get_krx_client()
        if not client:
            # Fail with a domain error instead of an AttributeError on None.
            raise KRXDataError("KRX Open API client is not configured")

        biz_day = self._get_latest_biz_day()
        df = client.get_etf_daily(biz_day)
        if df is None or df.empty:
            raise KRXDataError("KRX Open API returned empty ETF data")
        return df

    def _fetch_etf_data_pykrx(self) -> pd.DataFrame:
        """Fetch ETF data via pykrx scraping, retrying once on a failed request.

        An empty response is not retried; it raises ``KRXDataError`` at once.

        Raises:
            KRXDataError: If the response is empty, or if both attempts fail
                with a decoding, network/OS, value or key error (the last
                failure is chained as ``__cause__``).
        """
        # Imported lazily so pykrx is only needed when the scraping path is used.
        from pykrx.website.krx.etx.core import ETF_전종목기본종목

        last_exc = None
        for attempt in range(2):
            try:
                df = ETF_전종목기본종목().fetch()
                if df is None or df.empty:
                    raise KRXDataError("KRX returned empty ETF data (login may be required)")
                return df
            # OSError covers the builtin ConnectionError as well as the
            # IOError-derived exceptions raised by HTTP client libraries.
            except (JSONDecodeError, OSError, ValueError, KeyError) as e:
                last_exc = e
                if attempt == 0:
                    logger.warning("ETF fetch failed (attempt 1/2), retrying in 3s: %s", e)
                    time.sleep(3)

        error_msg = f"ETF fetch failed after 2 attempts: {last_exc}"
        if isinstance(last_exc, JSONDecodeError):
            error_msg += " (KRX may require login — set KRX_ID/KRX_PW env vars)"
        logger.error(error_msg)
        raise KRXDataError(error_msg) from last_exc

    def _fetch_etf_data(self) -> pd.DataFrame:
        """Fetch the raw ETF table, preferring the Open API over pykrx.

        Any exception on the Open API path is logged and triggers the pykrx
        fallback.

        Raises:
            KRXDataError: If the pykrx fetch fails as well.
        """
        client = get_krx_client()
        if client:
            try:
                logger.info("Fetching ETF data via KRX Open API")
                return self._fetch_etf_data_openapi()
            except Exception as e:
                logger.warning("KRX Open API failed, falling back to pykrx: %s", e)

        logger.info("Fetching ETF data via pykrx (scraping)")
        return self._fetch_etf_data_pykrx()

    def _extract_identity(self, row):
        """Return ``(ticker, name)`` for a row, or ``None`` if either is missing."""
        ticker = self._get_openapi_ticker(row)
        name = row.get("ISU_ABBRV", row.get("ISU_NM"))
        if not ticker or not name or pd.isna(name):
            return None
        return str(ticker), str(name)

    @staticmethod
    def _parse_expense_ratio(raw):
        """Convert a raw fee value to ``float``; ``None`` if missing or unparsable."""
        # Explicit None/NaN test instead of truthiness so a numeric 0 is kept.
        if raw is None or pd.isna(raw):
            return None
        try:
            value = float(raw)
        except (ValueError, TypeError):
            return None
        # Strings such as "nan" parse successfully but are still missing data.
        return None if pd.isna(value) else value

    def _parse_openapi_records(self, df: pd.DataFrame) -> list[dict]:
        """Turn an Open API DataFrame into ETF row dicts.

        Only ticker and name are read from the data; ``asset_class``,
        ``market`` and ``expense_ratio`` are placeholders. Rows without a
        ticker or name are skipped.
        """
        records = []
        for _, row in df.iterrows():
            identity = self._extract_identity(row)
            if identity is None:
                continue
            ticker, name = identity
            records.append({
                "ticker": ticker,
                "name": name,
                "asset_class": AssetClass.MIXED.value,
                "market": "",
                "expense_ratio": None,
            })
        return records

    def _parse_pykrx_records(self, df: pd.DataFrame) -> list[dict]:
        """Turn a pykrx DataFrame into fully populated ETF row dicts.

        Reads ``IDX_ASST_CLSS_NM`` (asset class), ``IDX_MKT_CLSS_NM`` (market)
        and ``ETF_TOT_FEE`` (expense ratio) in addition to ticker and name.
        Rows without a ticker or name are skipped.
        """
        records = []
        for _, row in df.iterrows():
            identity = self._extract_identity(row)
            if identity is None:
                continue
            ticker, name = identity

            asset_class_str = row.get("IDX_ASST_CLSS_NM", "")
            market = row.get("IDX_MKT_CLSS_NM", "")

            records.append({
                "ticker": ticker,
                "name": name,
                "asset_class": self._classify_asset_class(asset_class_str, name),
                "market": market if market and not pd.isna(market) else "",
                "expense_ratio": self._parse_expense_ratio(row.get("ETF_TOT_FEE")),
            })
        return records

    def _load_records(self):
        """Fetch and parse ETF rows from the best available source.

        Returns:
            ``(records, has_details)`` where ``has_details`` is ``True`` only
            for pykrx data, i.e. when asset class, market and expense ratio
            hold real values rather than placeholders.
        """
        client = get_krx_client()
        if client:
            try:
                logger.info("Fetching ETF data via KRX Open API")
                df = self._fetch_etf_data_openapi()
                return self._parse_openapi_records(df), False
            except Exception as e:
                logger.warning("KRX Open API failed, falling back to pykrx: %s", e)
        else:
            logger.info("Fetching ETF data via pykrx (scraping)")

        df = self._fetch_etf_data_pykrx()
        return self._parse_pykrx_records(df), True

    @staticmethod
    def _dedupe_by_ticker(records: list[dict]) -> list[dict]:
        """Keep one record per ticker (the last one seen wins)."""
        unique = {record["ticker"]: record for record in records}
        if len(unique) != len(records):
            logger.warning(
                "Dropped %d duplicate ETF ticker rows before upsert",
                len(records) - len(unique),
            )
        return list(unique.values())

    def collect(self) -> int:
        """Collect ETF master data and upsert it keyed on ``ticker``.

        Returns:
            Number of distinct ETF records written (0 if nothing was parsed).

        Raises:
            KRXDataError: If no source yields data.
            Exception: Any database error, re-raised after rolling back.
        """
        records, has_details = self._load_records()
        # PostgreSQL rejects a multi-row INSERT ... ON CONFLICT DO UPDATE that
        # touches the same conflict key twice, so duplicates must go first.
        records = self._dedupe_by_ticker(records)

        if records:
            stmt = insert(ETF).values(records)
            update_columns = {"name": stmt.excluded.name}
            if has_details:
                # Only overwrite the descriptive columns when the source really
                # provides them; Open API rows carry placeholders that would
                # otherwise wipe values stored by an earlier pykrx run.
                update_columns["asset_class"] = stmt.excluded.asset_class
                update_columns["market"] = stmt.excluded.market
                update_columns["expense_ratio"] = stmt.excluded.expense_ratio

            stmt = stmt.on_conflict_do_update(
                index_elements=["ticker"],
                set_=update_columns,
            )
            try:
                self.db.execute(stmt)
                self.db.commit()
            except Exception:
                # Leave the session usable for the caller before propagating.
                self.db.rollback()
                raise

        logger.info("Collected %d ETF records", len(records))
        return len(records)
|