"""Historical data backfill script.

Usage (run inside the api or worker container):

    python -m app.management.backfill --congress 118 119
    python -m app.management.backfill --congress 119 --skip-llm

This script fetches all bills from the specified Congress numbers, stores
them in the database, and (optionally) enqueues document fetch and LLM
processing tasks for each bill.

Cost note: LLM processing 15,000+ bills can be expensive. Consider using
--skip-llm for initial backfill and processing manually / in batches.
"""

import argparse
import logging
import sys
import time

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# Congress.gov returns at most 250 bills per page.
PAGE_SIZE = 250

# Commit every N newly-inserted bills so progress survives an interrupt.
COMMIT_EVERY = 50


def backfill_congress(congress_number: int, skip_llm: bool = False, dry_run: bool = False) -> int:
    """Fetch and store every bill for one Congress.

    Args:
        congress_number: Congress to backfill (e.g. 118, 119).
        skip_llm: When True, do not enqueue document-fetch/LLM tasks per bill.
        dry_run: When True, only count bills; nothing is written or enqueued.

    Returns:
        Number of new bills inserted.
    """
    # Imports are deferred so the module can be imported without a fully
    # configured app/database environment (e.g. for --help).
    from app.database import get_sync_db
    from app.models import Bill
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0

    logger.info(
        "Starting backfill for Congress %s (skip_llm=%s, dry_run=%s)",
        congress_number, skip_llm, dry_run,
    )

    try:
        while True:
            response = congress_api.get_bills(
                congress=congress_number, offset=offset, limit=PAGE_SIZE
            )
            bills_data = response.get("bills", [])
            if not bills_data:
                break

            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]

                if dry_run:
                    logger.info("[DRY RUN] Would process: %s", bill_id)
                    total_processed += 1
                    continue

                # Skip bills already in the database so re-runs are idempotent.
                if db.get(Bill, bill_id):
                    total_processed += 1
                    continue

                # Sync sponsor first so the foreign key is valid on insert.
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)
                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1

                if total_new % COMMIT_EVERY == 0:
                    db.commit()
                    logger.info("Progress: %s processed, %s new", total_processed, total_new)

                # Enqueue document + LLM at low priority.
                # NOTE(review): --skip-llm also skips the document fetch here,
                # which contradicts its help text ("fetch documents only") —
                # confirm intended behavior.
                if not skip_llm:
                    from app.workers.document_fetcher import fetch_bill_documents
                    fetch_bill_documents.apply_async(args=[bill_id], priority=3)

                # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec).
                time.sleep(0.25)

            db.commit()
            offset += PAGE_SIZE
            if len(bills_data) < PAGE_SIZE:
                break  # Last page
            logger.info(
                "Fetched page ending at offset %s, total processed: %s",
                offset, total_processed,
            )
            time.sleep(1)  # Polite pause between pages
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        db.commit()  # Persist whatever was added before the interrupt.
    finally:
        db.close()

    logger.info(
        "Backfill complete: %s new bills added (%s total processed)",
        total_new, total_processed,
    )
    return total_new


def main():
    """CLI entry point: parse arguments and backfill each requested Congress."""
    parser = argparse.ArgumentParser(description="Backfill Congressional bill data")
    parser.add_argument(
        "--congress", type=int, nargs="+", default=[119],
        help="Congress numbers to backfill (default: 119)",
    )
    parser.add_argument(
        "--skip-llm", action="store_true",
        help="Skip LLM processing (fetch documents only, don't enqueue briefs)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Count bills without actually inserting them",
    )
    args = parser.parse_args()

    total = 0
    for congress_number in args.congress:
        total += backfill_congress(
            congress_number, skip_llm=args.skip_llm, dry_run=args.dry_run
        )
    logger.info("All done. Total new bills: %s", total)


if __name__ == "__main__":
    main()