# galaxis-po/backend/jobs/collection_job.py

"""
Data collection orchestration jobs.
"""
import logging
from datetime import date, timedelta

from sqlalchemy import func

from app.core.database import SessionLocal
from app.models.stock import Price, ETFPrice
from app.services.collectors import (
    StockCollector,
    SectorCollector,
    PriceCollector,
    ValuationCollector,
    FinancialCollector,
    ETFCollector,
    ETFPriceCollector,
    IndexCollector,
)

logger = logging.getLogger(__name__)


def _get_daily_collectors():
    """
    Build the ordered schedule of daily collectors.

    Each entry is a ``(label, collector class, constructor kwargs)`` tuple.
    Master-data collectors come first, followed by the collectors for
    derived data. The list is assembled on every call (not at import time)
    so that tests can patch the module-level collector names.
    """
    ordered = (
        ("StockCollector", StockCollector),
        ("SectorCollector", SectorCollector),
        ("PriceCollector", PriceCollector),
        ("ValuationCollector", ValuationCollector),
        ("ETFCollector", ETFCollector),
        ("ETFPriceCollector", ETFPriceCollector),
        ("IndexCollector", IndexCollector),
    )
    # A fresh kwargs dict is created per entry so entries never share state.
    return [(label, collector_cls, {}) for label, collector_cls in ordered]


def run_daily_collection():
    """
    Run all daily data collectors in dependency order.

    Each collector gets its own DB session, which is closed whether or not
    the collector succeeds. If one collector fails, the failure is logged
    (with traceback) and recorded, and the rest continue.
    Designed to be called by APScheduler at 18:00 Mon-Fri.

    Returns:
        dict mapping each collector name to ``"success"`` or
        ``"failed: <error message>"``.
    """
    logger.info("Starting daily data collection")
    results = {}

    for name, collector_cls, kwargs in _get_daily_collectors():
        db = SessionLocal()
        try:
            collector_cls(db, **kwargs).run()
        except Exception as e:
            results[name] = f"failed: {e}"
            # exc_info keeps the traceback; str(e) alone is often not enough
            # to diagnose a failure in an unattended scheduled job.
            logger.error(f"{name} failed: {e}", exc_info=True)
        else:
            results[name] = "success"
            logger.info(f"{name} completed successfully")
        finally:
            db.close()

    logger.info(f"Daily collection finished: {results}")
    return results


def run_financial_collection():
    """
    Run the financial statement collector.

    Financial data updates quarterly, so this runs weekly (Monday)
    rather than daily. Separated from daily collection to avoid
    unnecessary FnGuide scraping on every business day.

    The DB session is always closed. A failure is logged (with traceback)
    and reported in the return value instead of being raised.

    Returns:
        ``{"FinancialCollector": "success"}`` or
        ``{"FinancialCollector": "failed: <error message>"}``.
    """
    logger.info("Starting weekly financial statement collection")
    db = SessionLocal()
    try:
        FinancialCollector(db).run()
    except Exception as e:
        # exc_info keeps the traceback; str(e) alone is often not enough
        # to diagnose a failure in an unattended scheduled job.
        logger.error(f"FinancialCollector failed: {e}", exc_info=True)
        return {"FinancialCollector": f"failed: {e}"}
    else:
        logger.info("FinancialCollector completed successfully")
        return {"FinancialCollector": "success"}
    finally:
        db.close()


def _generate_yearly_chunks(start_year: int, end_date: date) -> list[tuple[str, str]]:
    """Generate (start_date, end_date) pairs in YYYYMMDD format, one per year."""
    chunks = []
    current_start = date(start_year, 1, 1)

    while current_start < end_date:
        current_end = date(current_start.year, 12, 31)
        if current_end > end_date:
            current_end = end_date
        chunks.append((
            current_start.strftime("%Y%m%d"),
            current_end.strftime("%Y%m%d"),
        ))
        current_start = date(current_start.year + 1, 1, 1)

    return chunks


def run_backfill(start_year: int = 2000):
    """
    Collect historical price data from start_year to today.

    For each target (stock prices, ETF prices) the earliest stored date is
    looked up and only the period before it is collected, split into yearly
    chunks to avoid overloading pykrx. When data already exists, the gap
    between the latest stored date and today is collected as well; this
    forward fill runs whether or not any historical backfill was needed.

    Every chunk runs in its own DB session. A failing chunk is logged (with
    traceback) and the remaining chunks still run.
    """
    logger.info(f"Starting backfill from {start_year}")
    today = date.today()
    backfill_start = date(start_year, 1, 1)

    db = SessionLocal()
    try:
        # Determine what needs backfilling
        backfill_targets = [
            ("Price", PriceCollector, Price.date),
            ("ETFPrice", ETFPriceCollector, ETFPrice.date),
        ]

        for name, collector_cls, date_col in backfill_targets:
            # Find earliest existing data
            earliest = db.query(func.min(date_col)).scalar()

            if earliest is None:
                # No data at all - collect everything
                backfill_end = today
            else:
                # Data exists - collect from start_year to day before earliest
                backfill_end = earliest - timedelta(days=1)

            if backfill_start >= backfill_end:
                # Do not skip the rest of the iteration here: the forward
                # fill below must still run when history is already complete.
                logger.info(f"{name}: no backfill needed (data exists from {earliest})")
            else:
                chunks = _generate_yearly_chunks(start_year, backfill_end)
                logger.info(f"{name}: backfilling {len(chunks)} yearly chunks from {start_year} to {backfill_end}")

                for start_dt, end_dt in chunks:
                    chunk_db = SessionLocal()
                    try:
                        collector = collector_cls(chunk_db, start_date=start_dt, end_date=end_dt)
                        collector.run()
                        logger.info(f"{name}: chunk {start_dt}-{end_dt} completed")
                    except Exception as e:
                        logger.error(f"{name}: chunk {start_dt}-{end_dt} failed: {e}", exc_info=True)
                    finally:
                        chunk_db.close()

            # Also fill gap between latest data and today (forward fill).
            # With no pre-existing data the backfill above already ran to today.
            if earliest is not None:
                latest = db.query(func.max(date_col)).scalar()
                if latest and latest < today:
                    gap_start = (latest + timedelta(days=1)).strftime("%Y%m%d")
                    gap_end = today.strftime("%Y%m%d")
                    gap_db = SessionLocal()
                    try:
                        collector = collector_cls(gap_db, start_date=gap_start, end_date=gap_end)
                        collector.run()
                        logger.info(f"{name}: forward fill {gap_start}-{gap_end} completed")
                    except Exception as e:
                        logger.error(f"{name}: forward fill failed: {e}", exc_info=True)
                    finally:
                        gap_db.close()

    finally:
        db.close()

    logger.info("Backfill completed")