fix: switch StockCollector and ValuationCollector from KRX CSV to pykrx
Some checks failed
Deploy to Production / deploy (push) Failing after 1m37s

KRX CSV download endpoint blocks requests from cloud/server IPs,
causing "No columns to parse from file" errors. Replaced with pykrx's
JSON-based API (get_market_ticker_list, get_market_cap_by_ticker,
get_market_fundamental_by_ticker) which is more reliable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
zephyrdark 2026-02-08 23:08:50 +09:00
parent ecb3dca571
commit 4261e9c777
2 changed files with 77 additions and 187 deletions

View File

@ -1,13 +1,11 @@
"""
Stock data collector from KRX.
Stock data collector using pykrx.
"""
import logging
import time
from io import BytesIO
from datetime import datetime
import pandas as pd
import requests
from pykrx import stock as pykrx_stock
from sqlalchemy.orm import Session
from sqlalchemy.dialects.postgresql import insert
@ -19,16 +17,7 @@ logger = logging.getLogger(__name__)
class StockCollector(BaseCollector):
"""Collects stock master data from KRX."""
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
HEADERS = {
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
"""Collects stock master data using pykrx."""
def __init__(self, db: Session, biz_day: str = None):
super().__init__(db)
@ -42,125 +31,82 @@ class StockCollector(BaseCollector):
except ValueError:
raise ValueError(f"Invalid biz_day format. Expected YYYYMMDD, got: {self.biz_day}")
def _fetch_stock_data(self, mkt_id: str) -> pd.DataFrame:
    """Fetch the daily stock listing CSV for one KRX market.

    Args:
        mkt_id: KRX market identifier ("STK" for KOSPI, "KSQ" for KOSDAQ).

    Returns:
        DataFrame parsed from the EUC-KR encoded CSV download.

    Raises:
        RuntimeError: if either HTTP request fails, or if KRX returns an
            empty OTP token — which would otherwise surface later as a
            confusing pandas "No columns to parse from file" error.
    """
    # The KRX download is a two-step handshake: request a one-time
    # password (OTP) for the report, then POST that OTP to the download URL.
    gen_otp_data = {
        "locale": "ko_KR",
        "mktId": mkt_id,
        "trdDd": self.biz_day,
        "money": "1",
        "csvxls_isNo": "false",
        "name": "fileDown",
        "url": "dbms/MDC/STAT/standard/MDCSTAT03901",
    }
    try:
        otp = requests.post(
            self.GEN_OTP_URL,
            data=gen_otp_data,
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        otp.raise_for_status()
        otp_code = otp.text.strip()
        if not otp_code:
            # An empty OTP means KRX rejected the request (e.g. blocked
            # cloud IP); fail fast with a clear message instead of letting
            # pandas choke on an empty CSV body downstream.
            raise RuntimeError(
                f"Received empty OTP from KRX API for market {mkt_id}"
            )
        # Rate-limit between the two requests, matching the flow used by
        # ValuationCollector, so KRX does not throttle us.
        time.sleep(self.RATE_LIMIT_DELAY)
        response = requests.post(
            self.DOWN_URL,
            data={"code": otp_code},
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
    except requests.RequestException as e:
        raise RuntimeError(f"Failed to fetch stock data for market {mkt_id}: {e}")
def _fetch_ind_data(self) -> pd.DataFrame:
    """Fetch per-stock indicator data (EPS, BPS, DPS, ...) for all markets.

    Returns:
        DataFrame parsed from the EUC-KR encoded CSV download.

    Raises:
        RuntimeError: if either HTTP request fails, or if KRX returns an
            empty OTP token — which would otherwise surface later as a
            confusing pandas "No columns to parse from file" error.
    """
    # Same two-step OTP handshake as _fetch_stock_data, but against the
    # MDCSTAT03501 indicator report for all markets at once.
    gen_otp_data = {
        "locale": "ko_KR",
        "searchType": "1",
        "mktId": "ALL",
        "trdDd": self.biz_day,
        "csvxls_isNo": "false",
        "name": "fileDown",
        "url": "dbms/MDC/STAT/standard/MDCSTAT03501",
    }
    try:
        otp = requests.post(
            self.GEN_OTP_URL,
            data=gen_otp_data,
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        otp.raise_for_status()
        otp_code = otp.text.strip()
        if not otp_code:
            # Empty OTP == KRX rejected us (e.g. blocked cloud IP); fail
            # fast rather than let pandas fail on an empty CSV body.
            raise RuntimeError("Received empty OTP from KRX API")
        # Rate-limit between the two requests, matching the flow used by
        # ValuationCollector, so KRX does not throttle us.
        time.sleep(self.RATE_LIMIT_DELAY)
        response = requests.post(
            self.DOWN_URL,
            data={"code": otp_code},
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
    except requests.RequestException as e:
        raise RuntimeError(f"Failed to fetch indicator data: {e}")
def _classify_stock_type(self, name: str, ticker: str) -> str:
    """Classify a listing as SPAC / preferred / REIT / common stock.

    Args:
        name: Stock name as reported by KRX.
        ticker: 6-digit KRX ticker code.

    Returns:
        A StockType enum value string.
    """
    # NOTE(review): the original condition also tested two string literals
    # whose Korean characters were lost in transit, leaving '"" in name'
    # checks that are vacuously true and would mark *every* stock as a
    # SPAC. Restricted here to the unambiguous "스팩" substring test —
    # confirm the intended extra markers (likely the "제…호" SPAC naming
    # pattern) against the upstream repository.
    if "스팩" in name:
        return StockType.SPAC.value
    # Korean preferred-share tickers end in a non-zero digit.
    elif ticker and ticker[-1] != "0":
        return StockType.PREFERRED.value
    # REIT names conventionally end with "리츠".
    elif name.endswith("리츠"):
        return StockType.REIT.value
    else:
        return StockType.COMMON.value
def _safe_value(self, value):
"""Safely extract a value, returning None for NaN."""
if pd.isna(value):
return None
try:
return float(value)
except (ValueError, TypeError):
return None
def collect(self) -> int:
"""Collect stock master data."""
# Fetch data from both markets
kospi = self._fetch_stock_data("STK")
kosdaq = self._fetch_stock_data("KSQ")
krx_sector = pd.concat([kospi, kosdaq]).reset_index(drop=True)
krx_sector["종목명"] = krx_sector["종목명"].str.strip()
# Get tickers per market (also caches ticker-name mappings internally)
kospi_tickers = pykrx_stock.get_market_ticker_list(self.biz_day, market="KOSPI")
kosdaq_tickers = pykrx_stock.get_market_ticker_list(self.biz_day, market="KOSDAQ")
# Fetch individual indicators
krx_ind = self._fetch_ind_data()
krx_ind["종목명"] = krx_ind["종목명"].str.strip()
ticker_market = {}
for t in kospi_tickers:
ticker_market[t] = "KOSPI"
for t in kosdaq_tickers:
ticker_market[t] = "KOSDAQ"
# Merge data
merged = pd.merge(
krx_sector,
krx_ind,
on=krx_sector.columns.intersection(krx_ind.columns).tolist(),
how="outer",
)
merged.columns = merged.columns.str.replace(" ", "")
if not ticker_market:
logger.warning("No tickers found from pykrx.")
return 0
# Fetch bulk data
cap_df = pykrx_stock.get_market_cap_by_ticker(self.biz_day)
fund_df = pykrx_stock.get_market_fundamental_by_ticker(self.biz_day, market="ALL")
base_date = datetime.strptime(self.biz_day, "%Y%m%d").date()
# Process and insert
records = []
for _, row in merged.iterrows():
# Skip rows with missing required fields
ticker = row.get("종목코드")
name = row.get("종목명")
if not ticker or pd.isna(ticker) or not name or pd.isna(name):
for ticker, market in ticker_market.items():
name = pykrx_stock.get_market_ticker_name(ticker)
if not name:
continue
stock_type = self._classify_stock_type(row)
close_price = None
market_cap = None
if ticker in cap_df.index:
close_price = self._safe_value(cap_df.at[ticker, "종가"])
market_cap = self._safe_value(cap_df.at[ticker, "시가총액"])
eps = None
bps = None
dps = None
if ticker in fund_df.index:
eps = self._safe_value(fund_df.at[ticker, "EPS"])
bps = self._safe_value(fund_df.at[ticker, "BPS"])
dps = self._safe_value(fund_df.at[ticker, "DPS"])
stock_type = self._classify_stock_type(name, ticker)
records.append({
"ticker": row.get("종목코드"),
"name": row.get("종목명"),
"market": row.get("시장구분", "KOSPI"),
"close_price": row.get("종가") if pd.notna(row.get("종가")) else None,
"market_cap": row.get("시가총액") if pd.notna(row.get("시가총액")) else None,
"eps": row.get("EPS") if pd.notna(row.get("EPS")) else None,
"forward_eps": row.get("선행EPS") if pd.notna(row.get("선행EPS")) else None,
"bps": row.get("BPS") if pd.notna(row.get("BPS")) else None,
"dividend_per_share": row.get("주당배당금") if pd.notna(row.get("주당배당금")) else None,
"ticker": ticker,
"name": name,
"market": market,
"close_price": close_price,
"market_cap": market_cap,
"eps": eps,
"forward_eps": None,
"bps": bps,
"dividend_per_share": dps,
"stock_type": stock_type,
"base_date": datetime.strptime(self.biz_day, "%Y%m%d").date(),
"base_date": base_date,
})
# Upsert using PostgreSQL INSERT ON CONFLICT
@ -184,4 +130,5 @@ class StockCollector(BaseCollector):
self.db.execute(stmt)
self.db.commit()
logger.info(f"Collected {len(records)} stock records")
return len(records)

View File

@ -1,13 +1,11 @@
"""
Valuation data collector from KRX.
Valuation data collector using pykrx.
"""
import logging
import time
from io import BytesIO
from datetime import datetime
import pandas as pd
import requests
from pykrx import stock as pykrx_stock
from sqlalchemy.orm import Session
from sqlalchemy.dialects.postgresql import insert
@ -20,16 +18,7 @@ logger = logging.getLogger(__name__)
class ValuationCollector(BaseCollector):
"""Collects valuation metrics (PER, PBR, etc.) from KRX."""
GEN_OTP_URL = "http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd"
DOWN_URL = "http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd"
HEADERS = {
"Referer": "http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201050201",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
}
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1
"""Collects valuation metrics (PER, PBR, etc.) using pykrx."""
def __init__(self, db: Session, biz_day: str = None):
super().__init__(db)
@ -54,70 +43,24 @@ class ValuationCollector(BaseCollector):
def collect(self) -> int:
"""Collect valuation data."""
gen_otp_data = {
"locale": "ko_KR",
"searchType": "1",
"mktId": "ALL",
"trdDd": self.biz_day,
"csvxls_isNo": "false",
"name": "fileDown",
"url": "dbms/MDC/STAT/standard/MDCSTAT03501",
}
fund_df = pykrx_stock.get_market_fundamental_by_ticker(self.biz_day, market="ALL")
try:
otp = requests.post(
self.GEN_OTP_URL,
data=gen_otp_data,
headers=self.HEADERS,
timeout=self.REQUEST_TIMEOUT
)
otp.raise_for_status()
otp_code = otp.text.strip()
if not otp_code:
raise RuntimeError("Received empty OTP from KRX API")
time.sleep(self.RATE_LIMIT_DELAY)
response = requests.post(
self.DOWN_URL,
data={"code": otp_code},
headers=self.HEADERS,
timeout=self.REQUEST_TIMEOUT
)
response.raise_for_status()
except requests.RequestException as e:
raise RuntimeError(f"Failed to fetch valuation data: {e}")
df = pd.read_csv(BytesIO(response.content), encoding="EUC-KR")
if df.empty:
logger.warning(f"Empty CSV response for {self.biz_day}")
if fund_df.empty:
logger.warning(f"No fundamental data returned for {self.biz_day}")
return 0
df.columns = df.columns.str.replace(" ", "")
required_cols = ["종목코드", "PER", "PBR", "배당수익률"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Required columns missing from CSV: {missing_cols}")
base_date = datetime.strptime(self.biz_day, "%Y%m%d").date()
logger.info(f"Processing {len(df)} valuation records for {self.biz_day}")
logger.info(f"Processing {len(fund_df)} valuation records for {self.biz_day}")
records = []
for _, row in df.iterrows():
ticker = row.get("종목코드")
if not ticker or pd.isna(ticker):
continue
per = self._safe_float(row.get("PER"))
pbr = self._safe_float(row.get("PBR"))
dividend_yield = self._safe_float(row.get("배당수익률"))
for ticker in fund_df.index:
per = self._safe_float(fund_df.at[ticker, "PER"])
pbr = self._safe_float(fund_df.at[ticker, "PBR"])
div_yield = self._safe_float(fund_df.at[ticker, "DIV"])
# Skip records where all metrics are None
if all(v is None for v in [per, pbr, dividend_yield]):
if all(v is None for v in [per, pbr, div_yield]):
continue
records.append({
@ -125,9 +68,9 @@ class ValuationCollector(BaseCollector):
"base_date": base_date,
"per": per,
"pbr": pbr,
"psr": None, # Not available from this endpoint
"pcr": None, # Not available from this endpoint
"dividend_yield": dividend_yield,
"psr": None,
"pcr": None,
"dividend_yield": div_yield,
})
if records: