Initial commit
This commit is contained in:
0
backend/app/management/__init__.py
Normal file
0
backend/app/management/__init__.py
Normal file
117
backend/app/management/backfill.py
Normal file
117
backend/app/management/backfill.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
Historical data backfill script.
|
||||
|
||||
Usage (run inside the api or worker container):
|
||||
python -m app.management.backfill --congress 118 119
|
||||
python -m app.management.backfill --congress 119 --skip-llm
|
||||
|
||||
This script fetches all bills from the specified Congress numbers,
|
||||
stores them in the database, and (optionally) enqueues document fetch
|
||||
and LLM processing tasks for each bill.
|
||||
|
||||
Cost note: LLM processing 15,000+ bills can be expensive.
|
||||
Consider using --skip-llm for initial backfill and processing
|
||||
manually / in batches.
|
||||
"""
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Configure the root logger at import time so the script's progress messages
# are emitted at INFO level with a timestamp and level prefix.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def backfill_congress(congress_number: int, skip_llm: bool = False, dry_run: bool = False) -> int:
    """Backfill every bill of one Congress into the database.

    Pages through ``congress_api.get_bills`` 250 bills at a time, adds a
    ``Bill`` row for each bill that is not already stored and, unless
    ``skip_llm`` is set, enqueues ``fetch_bill_documents`` for every newly
    added bill.

    Document-fetch tasks are enqueued only *after* the rows they refer to
    have been committed, so a worker cannot receive a task for a bill that
    is not yet committed (or that is lost because the batch later failed).

    Args:
        congress_number: Congress to backfill (e.g. 119).
        skip_llm: When true, bills are stored but no task is enqueued.
        dry_run: When true, each bill is only logged; no row is looked up
            or inserted and no task is enqueued.

    Returns:
        The number of new bills added for this Congress.
    """
    from app.database import get_sync_db
    from app.models import Bill
    from app.services import congress_api
    from app.workers.congress_poller import _sync_sponsor

    page_size = 250     # bills requested per API page
    commit_every = 50   # commit after this many newly added bills

    db = get_sync_db()
    offset = 0
    total_processed = 0
    total_new = 0
    # Ids of bills added to the session whose fetch task is waiting for the
    # commit that makes the row visible to workers.
    pending_fetch = []

    def commit_and_enqueue():
        """Commit the open transaction, then enqueue the deferred fetch tasks."""
        db.commit()
        if not pending_fetch:
            return
        # Imported lazily so --skip-llm and --dry-run runs never need to
        # load the worker module.
        from app.workers.document_fetcher import fetch_bill_documents

        # Enqueue document + LLM at low priority
        for pending_id in pending_fetch:
            fetch_bill_documents.apply_async(args=[pending_id], priority=3)
        pending_fetch.clear()

    logger.info(f"Starting backfill for Congress {congress_number} (skip_llm={skip_llm}, dry_run={dry_run})")

    try:
        while True:
            response = congress_api.get_bills(congress=congress_number, offset=offset, limit=page_size)
            bills_data = response.get("bills", [])

            if not bills_data:
                break

            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, congress_number)
                bill_id = parsed["bill_id"]

                if dry_run:
                    logger.info(f"[DRY RUN] Would process: {bill_id}")
                    total_processed += 1
                    continue

                # Bills that are already stored are counted but left untouched.
                if db.get(Bill, bill_id):
                    total_processed += 1
                    continue

                # Sync sponsor
                parsed["sponsor_id"] = _sync_sponsor(db, bill_data)

                db.add(Bill(**parsed))
                total_new += 1
                total_processed += 1

                if not skip_llm:
                    pending_fetch.append(bill_id)

                if total_new % commit_every == 0:
                    commit_and_enqueue()
                    logger.info(f"Progress: {total_processed} processed, {total_new} new")

                # Stay well under Congress.gov rate limit (5,000/hr = ~1.4/sec)
                time.sleep(0.25)

            commit_and_enqueue()
            offset += page_size

            if len(bills_data) < page_size:
                break  # Last page

            logger.info(f"Fetched page ending at offset {offset}, total processed: {total_processed}")
            time.sleep(1)  # Polite pause between pages

    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        # Keep what was added so far and enqueue tasks only for those rows.
        commit_and_enqueue()
    finally:
        db.close()

    logger.info(f"Backfill complete: {total_new} new bills added ({total_processed} total processed)")
    return total_new
|
||||
|
||||
|
||||
def main():
    """Parse command-line options and backfill each requested Congress.

    Calls ``backfill_congress`` once per ``--congress`` value, in the order
    given, passing ``--skip-llm`` and ``--dry-run`` through unchanged, then
    logs the combined number of new bills.
    """
    parser = argparse.ArgumentParser(description="Backfill Congressional bill data")
    parser.add_argument(
        "--congress",
        type=int,
        nargs="+",
        default=[119],
        help="Congress numbers to backfill (default: 119)",
    )
    parser.add_argument(
        "--skip-llm",
        action="store_true",
        # With this flag backfill_congress enqueues no fetch_bill_documents
        # task at all, so the help text must not promise a document fetch.
        help="Store bills only; do not enqueue document fetch / LLM processing tasks",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Count bills without actually inserting them",
    )
    args = parser.parse_args()

    total = sum(
        backfill_congress(congress_number, skip_llm=args.skip_llm, dry_run=args.dry_run)
        for congress_number in args.congress
    )

    logger.info(f"All done. Total new bills: {total}")
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script (e.g. via
# ``python -m app.management.backfill``), not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user