# PocketVeto/backend/app/management/backfill.py

"""
Historical data backfill script.

Usage (run inside the api or worker container):
    python -m app.management.backfill --congress 118 119
    python -m app.management.backfill --congress 119 --skip-llm

This script fetches all bills from the specified Congress numbers,
stores them in the database, and (optionally) enqueues document fetch
and LLM processing tasks for each bill.

Cost note: LLM processing 15,000+ bills can be expensive.
Consider using --skip-llm for initial backfill and processing
manually / in batches.
"""
import argparse
import logging
import sys
import time

# Root logging setup for this script: INFO and above, each record prefixed
# with its timestamp and level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)


def backfill_congress(congress_number: int, skip_llm: bool = False, dry_run: bool = False):
    """Fetch every bill of one Congress and insert the ones not yet stored.

    Pages through ``congress_api.get_bills`` 250 bills at a time. Each bill
    is parsed with ``congress_api.parse_bill_from_api``; bills whose id is
    already in the database are counted and skipped. For a new bill the
    sponsor is synced via ``_sync_sponsor`` and a ``Bill`` row is added to
    the session, which is committed every 50 new bills and at the end of
    each page.

    Document-fetch tasks are enqueued only *after* the commit that persists
    their bill, so a worker can never pick up a task for a row that is not
    yet visible (or that ends up never being committed).

    Args:
        congress_number: Congress to backfill (e.g. ``119``).
        skip_llm: When true, no ``fetch_bill_documents`` task is enqueued
            for new bills.
        dry_run: When true, bills are only logged and counted; nothing is
            inserted or enqueued.

    Returns:
        The number of new bills added.

    Note:
        ``KeyboardInterrupt`` is caught here: work done so far is committed
        and the function returns normally with the count reached.
    """
    from app.database import get_sync_db
    from app.models import Bill
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    page_size = 250      # bills requested per API page
    commit_every = 50    # commit after this many newly added bills

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0
    # Ids of bills added to the session whose document-fetch task is still
    # waiting for the next commit.
    pending_fetch = []

    def _commit_and_enqueue():
        """Commit the session, then enqueue fetches for the bills just committed."""
        db.commit()
        if not pending_fetch:
            return
        # Imported lazily so --skip-llm / dry runs never load the worker module.
        from app.workers.document_fetcher import fetch_bill_documents
        for pending_id in pending_fetch:
            # Enqueue document + LLM at low priority
            fetch_bill_documents.apply_async(args=[pending_id], priority=3)
        pending_fetch.clear()

    logger.info(f"Starting backfill for Congress {congress_number} (skip_llm={skip_llm}, dry_run={dry_run})")

    try:
        while True:
            response = congress_api.get_bills(congress=congress_number, offset=offset, limit=page_size)
            bills_data = response.get("bills", [])

            if not bills_data:
                break

            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]

                if dry_run:
                    logger.info(f"[DRY RUN] Would process: {bill_id}")
                    total_processed += 1
                    continue

                if db.get(Bill, bill_id):
                    # Already stored: count it, but do not touch it.
                    total_processed += 1
                    continue

                # Sync sponsor
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)

                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1

                if not skip_llm:
                    pending_fetch.append(bill_id)

                if total_new % commit_every == 0:
                    _commit_and_enqueue()
                    logger.info(f"Progress: {total_processed} processed, {total_new} new")

                # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec)
                time.sleep(0.25)

            _commit_and_enqueue()
            offset += page_size

            if len(bills_data) < page_size:
                break  # Last page

            logger.info(f"Fetched page ending at offset {offset}, total processed: {total_processed}")
            time.sleep(1)  # Polite pause between pages

    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        # Persist what was added so far and enqueue only those bills.
        _commit_and_enqueue()
    finally:
        db.close()

    logger.info(f"Backfill complete: {total_new} new bills added ({total_processed} total processed)")
    return total_new


def main(argv=None):
    """Parse command-line options and backfill each requested Congress in order.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read them from ``sys.argv``, which is the behaviour
            when run as a script.
    """
    parser = argparse.ArgumentParser(description="Backfill Congressional bill data")
    parser.add_argument("--congress", type=int, nargs="+", default=[119],
                        help="Congress numbers to backfill (default: 119)")
    # With this flag set, backfill_congress enqueues no task at all for new
    # bills, so the help text must not promise that documents are fetched.
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip enqueueing document fetch and LLM processing tasks "
                             "(bills are stored only)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Count bills without actually inserting them")
    args = parser.parse_args(argv)

    # Congresses are processed sequentially, in the order given.
    total = 0
    for congress_number in args.congress:
        total += backfill_congress(congress_number, skip_llm=args.skip_llm, dry_run=args.dry_run)

    logger.info(f"All done. Total new bills: {total}")


# Run the CLI only when this module is executed as a script
# (e.g. ``python -m app.management.backfill``), not when it is imported.
if __name__ == "__main__":
    main()