# Self-hosted US Congress monitoring platform with AI policy briefs,
# bill/member/topic follows, ntfy + RSS + email notifications, alignment
# scoring, collections, and draft-letter generator.
# Authored by: Jack Levy
"""
|
|
Historical data backfill script.

Usage (run inside the api or worker container):
    python -m app.management.backfill --congress 118 119
    python -m app.management.backfill --congress 119 --skip-llm

This script fetches all bills from the specified Congress numbers,
stores them in the database, and (optionally) enqueues document fetch
and LLM processing tasks for each bill.

Cost note: LLM processing 15,000+ bills can be expensive.
Consider using --skip-llm for initial backfill and processing
manually / in batches.
|
|
"""
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
import time
|
|
|
|
# Configure the root logger at import time so progress messages are emitted
# with a timestamp and level when this module is run as a script.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _commit_and_enqueue(db, pending_bill_ids):
    """Commit the session, then enqueue document fetches for the committed bills.

    Tasks are dispatched only after ``db.commit()`` returns, so a worker that
    picks one up can already see the corresponding Bill row. The list is
    emptied in place once every ID has been dispatched.

    Args:
        db: The database session returned by ``get_sync_db()``.
        pending_bill_ids: Mutable list of bill IDs added since the last commit
            that still need a document-fetch task. May be empty.
    """
    db.commit()
    if not pending_bill_ids:
        return

    # Imported lazily so that dry runs and --skip-llm runs never load the
    # worker module (same laziness as before: only when a task is dispatched).
    from app.workers.document_fetcher import fetch_bill_documents

    for bill_id in pending_bill_ids:
        # priority=3 is what the original author's note calls "low priority".
        fetch_bill_documents.apply_async(args=[bill_id], priority=3)
    pending_bill_ids.clear()


def backfill_congress(congress_number: int, skip_llm: bool = False, dry_run: bool = False) -> int:
    """Fetch every bill of one Congress page by page and store the new ones.

    For each bill returned by ``congress_api.get_bills`` that is not already
    in the database, the sponsor is synced via ``_sync_sponsor`` and a ``Bill``
    row is added. Unless ``skip_llm`` is set, a ``fetch_bill_documents`` task
    is enqueued for each new bill *after* the row has been committed.

    Args:
        congress_number: Congress to backfill (e.g. 118).
        skip_llm: When true, no document-fetch task is enqueued.
        dry_run: When true, bills are only logged and counted; nothing is
            added to the database and nothing is enqueued.

    Returns:
        The number of new bills added (always 0 for a dry run).

    A ``KeyboardInterrupt`` is caught: pending work is committed and the
    count so far is returned. Any other exception propagates after the
    session has been closed.
    """
    from app.database import get_sync_db
    from app.models import Bill
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    page_size = 250     # bills requested per API page
    commit_every = 50   # intermediate commit after this many new bills

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0
    # IDs of newly added bills whose document-fetch task must wait for a commit.
    pending_bill_ids = []

    logger.info(f"Starting backfill for Congress {congress_number} (skip_llm={skip_llm}, dry_run={dry_run})")

    try:
        while True:
            response = congress_api.get_bills(congress=congress_number, offset=offset, limit=page_size)
            bills_data = response.get("bills", [])

            if not bills_data:
                break

            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]

                if dry_run:
                    logger.info(f"[DRY RUN] Would process: {bill_id}")
                    total_processed += 1
                    continue

                # Already stored: count it and move on without touching it.
                if db.get(Bill, bill_id):
                    total_processed += 1
                    continue

                # Sync sponsor
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)

                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1

                # Defer the document + LLM task until the row is committed;
                # enqueuing here would let a worker race the commit.
                if not skip_llm:
                    pending_bill_ids.append(bill_id)

                if total_new % commit_every == 0:
                    _commit_and_enqueue(db, pending_bill_ids)
                    logger.info(f"Progress: {total_processed} processed, {total_new} new")

                # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec)
                time.sleep(0.25)

            _commit_and_enqueue(db, pending_bill_ids)
            offset += page_size

            if len(bills_data) < page_size:
                break  # Last page

            logger.info(f"Fetched page ending at offset {offset}, total processed: {total_processed}")
            time.sleep(1)  # Polite pause between pages

    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        # Keep what was added so far and dispatch tasks for exactly those bills.
        _commit_and_enqueue(db, pending_bill_ids)
    finally:
        db.close()

    logger.info(f"Backfill complete: {total_new} new bills added ({total_processed} total processed)")
    return total_new
|
|
|
|
|
|
def main():
    """Parse command-line options and backfill each requested Congress in turn.

    Options:
        --congress N [N ...]  Congress numbers to backfill (defaults to 119).
        --skip-llm            Passed to ``backfill_congress`` as ``skip_llm``.
        --dry-run             Passed to ``backfill_congress`` as ``dry_run``.

    Logs the combined number of new bills once every Congress is done.
    """
    cli = argparse.ArgumentParser(description="Backfill Congressional bill data")
    cli.add_argument(
        "--congress",
        type=int,
        nargs="+",
        default=[119],
        help="Congress numbers to backfill (default: 119)",
    )
    # NOTE(review): with this flag set, backfill_congress enqueues no
    # document-fetch task at all, so the "fetch documents only" wording below
    # may be inaccurate -- confirm the intended behavior before rewording.
    cli.add_argument(
        "--skip-llm",
        action="store_true",
        help="Skip LLM processing (fetch documents only, don't enqueue briefs)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Count bills without actually inserting them",
    )
    options = cli.parse_args()

    # Congresses are processed sequentially, in the order given on the CLI.
    grand_total = sum(
        backfill_congress(number, skip_llm=options.skip_llm, dry_run=options.dry_run)
        for number in options.congress
    )

    logger.info(f"All done. Total new bills: {grand_total}")
|
|
|
|
|
|
# Run the CLI only when this module is executed directly (for example with
# `python -m app.management.backfill`), not when it is imported.
if __name__ == "__main__":
    main()
|