Initial commit

This commit is contained in:
Jack Levy
2026-02-28 21:08:19 -05:00
commit e418dd9ae0
85 changed files with 5261 additions and 0 deletions

View File

@@ -0,0 +1,117 @@
"""
Historical data backfill script.
Usage (run inside the api or worker container):
python -m app.management.backfill --congress 118 119
python -m app.management.backfill --congress 119 --skip-llm
This script fetches all bills from the specified Congress numbers,
stores them in the database, and (optionally) enqueues document fetch
and LLM processing tasks for each bill.
Cost note: LLM processing 15,000+ bills can be expensive.
Consider using --skip-llm for initial backfill and processing
manually / in batches.
"""
import argparse
import logging
import sys
import time
# Module-level logging: timestamped INFO output so long-running backfills
# show progress when run inside a container.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
def backfill_congress(
    congress_number: int,
    skip_llm: bool = False,
    dry_run: bool = False,
    page_size: int = 250,
) -> int:
    """Fetch every bill for one Congress and store new ones in the database.

    Pages through the Congress.gov bill listing, inserting any bill not
    already present.  Unless *skip_llm* is set, a low-priority document-fetch
    task is enqueued for each newly inserted bill.

    Args:
        congress_number: Congress to backfill (e.g. 118, 119).
        skip_llm: If True, do not enqueue document-fetch/LLM tasks.
        dry_run: If True, only count bills; nothing is written or enqueued.
        page_size: Bills requested per API page.  Defaults to 250 —
            presumably the API maximum; confirm before raising it.

    Returns:
        Number of newly inserted bills (always 0 in dry-run mode).
    """
    # Deferred imports so the module can be imported (e.g. for --help)
    # without a configured database or API client.
    from app.database import get_sync_db
    from app.models import AppSetting, Bill, Member
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0
    # Lazy %-style args: avoid eager f-string formatting in logging calls.
    logger.info(
        "Starting backfill for Congress %s (skip_llm=%s, dry_run=%s)",
        congress_number, skip_llm, dry_run,
    )
    try:
        while True:
            response = congress_api.get_bills(
                congress=congress_number, offset=offset, limit=page_size
            )
            bills_data = response.get("bills", [])
            if not bills_data:
                break
            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]
                if dry_run:
                    logger.info("[DRY RUN] Would process: %s", bill_id)
                    total_processed += 1
                    continue
                if db.get(Bill, bill_id):
                    # Already present from a previous run: skip cheaply so
                    # the backfill is resumable.
                    total_processed += 1
                    continue
                # Ensure the sponsoring member row exists, then link the bill.
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)
                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1
                if total_new % 50 == 0:
                    # Periodic commit keeps transactions small and makes
                    # progress durable if the run is interrupted.
                    db.commit()
                    logger.info(
                        "Progress: %s processed, %s new",
                        total_processed, total_new,
                    )
                if not skip_llm:
                    # Enqueue document + LLM pipeline at low priority.
                    from app.workers.document_fetcher import fetch_bill_documents
                    fetch_bill_documents.apply_async(args=[bill_id], priority=3)
                    # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec)
                    time.sleep(0.25)
            db.commit()
            offset += page_size
            # Log every completed page — including the final short page,
            # which the original skipped by logging after the break.
            logger.info(
                "Fetched page ending at offset %s, total processed: %s",
                offset, total_processed,
            )
            if len(bills_data) < page_size:
                break  # Last page
            time.sleep(1)  # Polite pause between pages
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        db.commit()  # Keep whatever was processed before the interrupt.
    finally:
        db.close()
    logger.info(
        "Backfill complete: %s new bills added (%s total processed)",
        total_new, total_processed,
    )
    return total_new
def main(argv=None):
    """CLI entry point: parse arguments and backfill each requested Congress.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.  Exposed
            as a parameter so the CLI can be driven programmatically/tested
            without patching ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Backfill Congressional bill data")
    parser.add_argument("--congress", type=int, nargs="+", default=[119],
                        help="Congress numbers to backfill (default: 119)")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM processing (fetch documents only, don't enqueue briefs)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Count bills without actually inserting them")
    args = parser.parse_args(argv)

    # Each Congress is backfilled independently; sum the new-bill counts.
    total = sum(
        backfill_congress(congress_number,
                          skip_llm=args.skip_llm,
                          dry_run=args.dry_run)
        for congress_number in args.congress
    )
    logger.info("All done. Total new bills: %s", total)
# Script entry point: python -m app.management.backfill ...
if __name__ == "__main__":
    main()