""" Congress.gov poller — incremental bill and member sync. Runs on Celery Beat schedule (every 30 min by default). Uses fromDateTime to fetch only recently updated bills. All operations are idempotent. """ import logging import time from datetime import datetime, timedelta, timezone from sqlalchemy import or_ from sqlalchemy.dialects.postgresql import insert as pg_insert from app.database import get_sync_db from app.models import Bill, BillAction, Member, AppSetting from app.services import congress_api from app.workers.celery_app import celery_app logger = logging.getLogger(__name__) def _get_setting(db, key: str, default=None) -> str | None: row = db.get(AppSetting, key) return row.value if row else default def _set_setting(db, key: str, value: str) -> None: row = db.get(AppSetting, key) if row: row.value = value else: db.add(AppSetting(key=key, value=value)) db.commit() # Only track legislation that can become law. Simple/concurrent resolutions # (hres, sres, hconres, sconres) are procedural and not worth analyzing. TRACKED_BILL_TYPES = {"hr", "s", "hjres", "sjres"} @celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.poll_congress_bills") def poll_congress_bills(self): """Fetch recently updated bills from Congress.gov and enqueue document + LLM processing.""" db = get_sync_db() try: last_polled = _get_setting(db, "congress_last_polled_at") # On first run, seed from 2 months back rather than the full congress history if not last_polled: two_months_ago = datetime.now(timezone.utc) - timedelta(days=60) last_polled = two_months_ago.strftime("%Y-%m-%dT%H:%M:%SZ") current_congress = congress_api.get_current_congress() logger.info(f"Polling Congress {current_congress} (since {last_polled})") new_count = 0 updated_count = 0 offset = 0 while True: response = congress_api.get_bills( congress=current_congress, offset=offset, limit=250, from_date_time=last_polled, ) bills_data = response.get("bills", []) if not bills_data: break for bill_data in bills_data: parsed = congress_api.parse_bill_from_api(bill_data, current_congress) if parsed.get("bill_type") not in TRACKED_BILL_TYPES: continue bill_id = parsed["bill_id"] existing = db.get(Bill, bill_id) if existing is None: # Bill list endpoint has no sponsor data — fetch detail to get it try: detail = congress_api.get_bill_detail( current_congress, parsed["bill_type"], parsed["bill_number"] ) sponsor_id = _sync_sponsor(db, detail.get("bill", {})) except Exception: sponsor_id = None parsed["sponsor_id"] = sponsor_id parsed["last_checked_at"] = datetime.now(timezone.utc) db.add(Bill(**parsed)) db.commit() new_count += 1 # Enqueue document and action fetches from app.workers.document_fetcher import fetch_bill_documents fetch_bill_documents.delay(bill_id) fetch_bill_actions.delay(bill_id) else: _update_bill_if_changed(db, existing, parsed) updated_count += 1 db.commit() offset += 250 if len(bills_data) < 250: break # Update last polled timestamp _set_setting(db, "congress_last_polled_at", datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")) logger.info(f"Poll complete: {new_count} new, {updated_count} updated") return {"new": new_count, "updated": updated_count} except Exception as exc: db.rollback() logger.error(f"Poll failed: {exc}") raise self.retry(exc=exc, countdown=60) finally: db.close() @celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.sync_members") def sync_members(self): """Sync current Congress members.""" db = get_sync_db() try: offset = 0 synced = 0 while True: response = congress_api.get_members(offset=offset, limit=250, current_member=True) members_data = response.get("members", []) if not members_data: break for member_data in members_data: parsed = congress_api.parse_member_from_api(member_data) if not parsed.get("bioguide_id"): continue existing = db.get(Member, parsed["bioguide_id"]) if existing is None: db.add(Member(**parsed)) else: for k, v in parsed.items(): setattr(existing, k, v) synced += 1 db.commit() offset += 250 if len(members_data) < 250: break logger.info(f"Synced {synced} members") return {"synced": synced} except Exception as exc: db.rollback() raise self.retry(exc=exc, countdown=120) finally: db.close() def _sync_sponsor(db, bill_data: dict) -> str | None: """Ensure the bill sponsor exists in the members table. Returns bioguide_id or None.""" sponsors = bill_data.get("sponsors", []) if not sponsors: return None sponsor_raw = sponsors[0] bioguide_id = sponsor_raw.get("bioguideId") if not bioguide_id: return None existing = db.get(Member, bioguide_id) if existing is None: db.add(Member( bioguide_id=bioguide_id, name=sponsor_raw.get("fullName", ""), first_name=sponsor_raw.get("firstName"), last_name=sponsor_raw.get("lastName"), party=sponsor_raw.get("party", "")[:10] if sponsor_raw.get("party") else None, state=sponsor_raw.get("state"), )) db.commit() return bioguide_id @celery_app.task(bind=True, name="app.workers.congress_poller.backfill_sponsor_ids") def backfill_sponsor_ids(self): """Backfill sponsor_id for all bills where it is NULL by fetching bill detail from Congress.gov.""" import time db = get_sync_db() try: bills = db.query(Bill).filter(Bill.sponsor_id.is_(None)).all() total = len(bills) updated = 0 logger.info(f"Backfilling sponsors for {total} bills") for bill in bills: try: detail = congress_api.get_bill_detail(bill.congress_number, bill.bill_type, bill.bill_number) sponsor_id = _sync_sponsor(db, detail.get("bill", {})) if sponsor_id: bill.sponsor_id = sponsor_id db.commit() updated += 1 except Exception as e: logger.warning(f"Could not backfill sponsor for {bill.bill_id}: {e}") time.sleep(0.1) # ~10 req/sec, well under Congress.gov 5000/hr limit logger.info(f"Sponsor backfill complete: {updated}/{total} updated") return {"total": total, "updated": updated} finally: db.close() @celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.fetch_bill_actions") def fetch_bill_actions(self, bill_id: str): """Fetch and sync all actions for a bill from Congress.gov. Idempotent.""" db = get_sync_db() try: bill = db.get(Bill, bill_id) if not bill: logger.warning(f"fetch_bill_actions: bill {bill_id} not found") return offset = 0 inserted = 0 while True: try: response = congress_api.get_bill_actions( bill.congress_number, bill.bill_type, bill.bill_number, offset=offset ) except Exception as exc: raise self.retry(exc=exc, countdown=60) actions_data = response.get("actions", []) if not actions_data: break for action in actions_data: stmt = pg_insert(BillAction.__table__).values( bill_id=bill_id, action_date=action.get("actionDate"), action_text=action.get("text", ""), action_type=action.get("type"), chamber=action.get("chamber"), ).on_conflict_do_nothing(constraint="uq_bill_actions_bill_date_text") result = db.execute(stmt) inserted += result.rowcount db.commit() offset += 250 if len(actions_data) < 250: break bill.actions_fetched_at = datetime.now(timezone.utc) db.commit() logger.info(f"fetch_bill_actions: {bill_id} — inserted {inserted} new actions") return {"bill_id": bill_id, "inserted": inserted} except Exception as exc: db.rollback() raise finally: db.close() @celery_app.task(bind=True, name="app.workers.congress_poller.fetch_actions_for_active_bills") def fetch_actions_for_active_bills(self): """Nightly batch: enqueue action fetches for recently active bills missing action data.""" db = get_sync_db() try: cutoff = datetime.now(timezone.utc).date() - timedelta(days=30) bills = ( db.query(Bill) .filter( Bill.latest_action_date >= cutoff, or_( Bill.actions_fetched_at.is_(None), Bill.latest_action_date > Bill.actions_fetched_at, ), ) .limit(200) .all() ) queued = 0 for bill in bills: fetch_bill_actions.delay(bill.bill_id) queued += 1 time.sleep(0.2) # ~5 tasks/sec to avoid Redis burst logger.info(f"fetch_actions_for_active_bills: queued {queued} bills") return {"queued": queued} finally: db.close() @celery_app.task(bind=True, name="app.workers.congress_poller.backfill_all_bill_actions") def backfill_all_bill_actions(self): """One-time backfill: enqueue action fetches for every bill that has never had actions fetched.""" db = get_sync_db() try: bills = ( db.query(Bill) .filter(Bill.actions_fetched_at.is_(None)) .order_by(Bill.latest_action_date.desc()) .all() ) queued = 0 for bill in bills: fetch_bill_actions.delay(bill.bill_id) queued += 1 time.sleep(0.05) # ~20 tasks/sec — workers will self-throttle against Congress.gov logger.info(f"backfill_all_bill_actions: queued {queued} bills") return {"queued": queued} finally: db.close() def _update_bill_if_changed(db, existing: Bill, parsed: dict) -> bool: """Update bill fields if anything has changed. Returns True if updated.""" changed = False dirty = False # Meaningful change fields — trigger document + action fetch when updated track_fields = ["title", "short_title", "latest_action_date", "latest_action_text", "status"] for field in track_fields: new_val = parsed.get(field) if new_val and getattr(existing, field) != new_val: setattr(existing, field, new_val) changed = True dirty = True # Static fields — only fill in if currently null; no change trigger needed fill_null_fields = ["introduced_date", "congress_url", "chamber"] for field in fill_null_fields: new_val = parsed.get(field) if new_val and getattr(existing, field) is None: setattr(existing, field, new_val) dirty = True if changed: existing.last_checked_at = datetime.now(timezone.utc) if dirty: db.commit() if changed: from app.workers.document_fetcher import fetch_bill_documents fetch_bill_documents.delay(existing.bill_id) fetch_bill_actions.delay(existing.bill_id) from app.workers.notification_utils import ( emit_bill_notification, emit_member_follow_notifications, emit_topic_follow_notifications, categorize_action, ) action_text = parsed.get("latest_action_text", "") action_category = categorize_action(action_text) if action_category: emit_bill_notification(db, existing, "bill_updated", action_text, action_category=action_category) emit_member_follow_notifications(db, existing, "bill_updated", action_text, action_category=action_category) # Topic followers — pull tags from the bill's latest brief from app.models.brief import BillBrief latest_brief = ( db.query(BillBrief) .filter_by(bill_id=existing.bill_id) .order_by(BillBrief.created_at.desc()) .first() ) topic_tags = latest_brief.topic_tags or [] if latest_brief else [] emit_topic_follow_notifications( db, existing, "bill_updated", action_text, topic_tags, action_category=action_category ) return changed @celery_app.task(bind=True, name="app.workers.congress_poller.backfill_bill_metadata") def backfill_bill_metadata(self): """ Find bills with null introduced_date (or other static fields) and re-fetch their detail from Congress.gov to fill in the missing values. No document or LLM calls — metadata only. """ db = get_sync_db() try: from sqlalchemy import text as sa_text rows = db.execute(sa_text(""" SELECT bill_id, congress_number, bill_type, bill_number FROM bills WHERE introduced_date IS NULL OR congress_url IS NULL OR chamber IS NULL """)).fetchall() updated = 0 skipped = 0 for row in rows: try: detail = congress_api.get_bill_detail( row.congress_number, row.bill_type, row.bill_number ) bill_data = detail.get("bill", {}) parsed = congress_api.parse_bill_from_api( { "type": row.bill_type, "number": row.bill_number, "introducedDate": bill_data.get("introducedDate"), "title": bill_data.get("title"), "shortTitle": bill_data.get("shortTitle"), "latestAction": bill_data.get("latestAction") or {}, }, row.congress_number, ) bill = db.get(Bill, row.bill_id) if not bill: skipped += 1 continue fill_null_fields = ["introduced_date", "congress_url", "chamber", "title", "short_title"] dirty = False for field in fill_null_fields: new_val = parsed.get(field) if new_val and getattr(bill, field) is None: setattr(bill, field, new_val) dirty = True if dirty: db.commit() updated += 1 else: skipped += 1 time.sleep(0.2) # ~300 req/min — well under the 5k/hr limit except Exception as exc: logger.warning(f"backfill_bill_metadata: failed for {row.bill_id}: {exc}") skipped += 1 logger.info(f"backfill_bill_metadata: {updated} updated, {skipped} skipped") return {"updated": updated, "skipped": skipped} finally: db.close()