Initial commit

This commit is contained in:
Jack Levy
2026-02-28 21:08:19 -05:00
commit e418dd9ae0
85 changed files with 5261 additions and 0 deletions

View File

@@ -0,0 +1,117 @@
"""
Historical data backfill script.
Usage (run inside the api or worker container):
python -m app.management.backfill --congress 118 119
python -m app.management.backfill --congress 119 --skip-llm
This script fetches all bills from the specified Congress numbers,
stores them in the database, and (optionally) enqueues document fetch
and LLM processing tasks for each bill.
Cost note: LLM processing 15,000+ bills can be expensive.
Consider using --skip-llm for initial backfill and processing
manually / in batches.
"""
import argparse
import logging
import sys
import time
# Module-level logging: timestamped INFO output so long-running backfills
# show progress when run inside a container.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
def backfill_congress(
    congress_number: int,
    skip_llm: bool = False,
    dry_run: bool = False,
    page_size: int = 250,
) -> int:
    """Fetch every bill for one Congress and store new ones in the database.

    Pages through the Congress.gov bill listing, inserting any bill not
    already present.  Unless *skip_llm* is set, a low-priority document-fetch
    task is enqueued for each newly inserted bill.

    Args:
        congress_number: Congress to backfill (e.g. 118, 119).
        skip_llm: If True, do not enqueue document-fetch/LLM tasks.
        dry_run: If True, only count bills; nothing is written or enqueued.
        page_size: Bills requested per API page.  Defaults to 250 —
            presumably the API maximum; confirm before raising it.

    Returns:
        Number of newly inserted bills (always 0 in dry-run mode).
    """
    # Deferred imports so the module can be imported (e.g. for --help)
    # without a configured database or API client.
    from app.database import get_sync_db
    from app.models import AppSetting, Bill, Member
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0
    # Lazy %-style args: avoid eager f-string formatting in logging calls.
    logger.info(
        "Starting backfill for Congress %s (skip_llm=%s, dry_run=%s)",
        congress_number, skip_llm, dry_run,
    )
    try:
        while True:
            response = congress_api.get_bills(
                congress=congress_number, offset=offset, limit=page_size
            )
            bills_data = response.get("bills", [])
            if not bills_data:
                break
            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]
                if dry_run:
                    logger.info("[DRY RUN] Would process: %s", bill_id)
                    total_processed += 1
                    continue
                if db.get(Bill, bill_id):
                    # Already present from a previous run: skip cheaply so
                    # the backfill is resumable.
                    total_processed += 1
                    continue
                # Ensure the sponsoring member row exists, then link the bill.
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)
                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1
                if total_new % 50 == 0:
                    # Periodic commit keeps transactions small and makes
                    # progress durable if the run is interrupted.
                    db.commit()
                    logger.info(
                        "Progress: %s processed, %s new",
                        total_processed, total_new,
                    )
                if not skip_llm:
                    # Enqueue document + LLM pipeline at low priority.
                    from app.workers.document_fetcher import fetch_bill_documents
                    fetch_bill_documents.apply_async(args=[bill_id], priority=3)
                    # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec)
                    time.sleep(0.25)
            db.commit()
            offset += page_size
            # Log every completed page — including the final short page,
            # which the original skipped by logging after the break.
            logger.info(
                "Fetched page ending at offset %s, total processed: %s",
                offset, total_processed,
            )
            if len(bills_data) < page_size:
                break  # Last page
            time.sleep(1)  # Polite pause between pages
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        db.commit()  # Keep whatever was processed before the interrupt.
    finally:
        db.close()
    logger.info(
        "Backfill complete: %s new bills added (%s total processed)",
        total_new, total_processed,
    )
    return total_new
def main(argv=None):
    """CLI entry point: parse arguments and backfill each requested Congress.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.  Exposed
            as a parameter so the CLI can be driven programmatically/tested
            without patching ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Backfill Congressional bill data")
    parser.add_argument("--congress", type=int, nargs="+", default=[119],
                        help="Congress numbers to backfill (default: 119)")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM processing (fetch documents only, don't enqueue briefs)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Count bills without actually inserting them")
    args = parser.parse_args(argv)

    # Each Congress is backfilled independently; sum the new-bill counts.
    total = sum(
        backfill_congress(congress_number,
                          skip_llm=args.skip_llm,
                          dry_run=args.dry_run)
        for congress_number in args.congress
    )
    logger.info("All done. Total new bills: %s", total)
# Script entry point: python -m app.management.backfill ...
if __name__ == "__main__":
    main()