feat: PocketVeto v1.0.0 — initial public release
Self-hosted US Congress monitoring platform with AI policy briefs, bill/member/topic follows, ntfy + RSS + email notifications, alignment scoring, collections, and draft-letter generator. Authored by: Jack Levy
This commit is contained in:
480
backend/app/workers/congress_poller.py
Normal file
480
backend/app/workers/congress_poller.py
Normal file
@@ -0,0 +1,480 @@
|
||||
"""
|
||||
Congress.gov poller — incremental bill and member sync.
|
||||
|
||||
Runs on Celery Beat schedule (every 30 min by default).
|
||||
Uses fromDateTime to fetch only recently updated bills.
|
||||
All operations are idempotent.
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
|
||||
from app.database import get_sync_db
|
||||
from app.models import Bill, BillAction, Member, AppSetting
|
||||
from app.services import congress_api
|
||||
from app.workers.celery_app import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_setting(db, key: str, default=None) -> str | None:
    """Read a single AppSetting value by key, returning *default* when the key is absent."""
    setting = db.get(AppSetting, key)
    if setting is None:
        return default
    return setting.value
|
||||
|
||||
|
||||
def _set_setting(db, key: str, value: str) -> None:
    """Upsert a single AppSetting key/value pair and commit immediately."""
    setting = db.get(AppSetting, key)
    if setting is None:
        # No existing row — insert a fresh one.
        db.add(AppSetting(key=key, value=value))
    else:
        setting.value = value
    db.commit()
|
||||
|
||||
|
||||
# Only track legislation that can become law. Simple/concurrent resolutions
# (hres, sres, hconres, sconres) are procedural and not worth analyzing.
# Used by poll_congress_bills to filter parsed bills before persisting.
TRACKED_BILL_TYPES = {"hr", "s", "hjres", "sjres"}

# Action categories that produce new bill text versions on GovInfo.
# Procedural/administrative actions (referral to committee, calendar placement)
# rarely produce a new text version, so we skip document fetching for them.
# Consumed by _update_bill_if_changed when deciding whether to enqueue
# fetch_bill_documents for an updated bill.
_DOC_PRODUCING_CATEGORIES = {"vote", "committee_report", "presidential", "new_document", "new_amendment"}
|
||||
|
||||
|
||||
def _is_congress_off_hours() -> bool:
|
||||
"""Return True during periods when Congress.gov is unlikely to publish new content."""
|
||||
try:
|
||||
from zoneinfo import ZoneInfo
|
||||
now_est = datetime.now(ZoneInfo("America/New_York"))
|
||||
except Exception:
|
||||
return False
|
||||
# Weekends
|
||||
if now_est.weekday() >= 5:
|
||||
return True
|
||||
# Nights: before 9 AM or after 9 PM EST
|
||||
if now_est.hour < 9 or now_est.hour >= 21:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.poll_congress_bills")
def poll_congress_bills(self):
    """Fetch recently updated bills from Congress.gov and enqueue document + LLM processing.

    Pages through the bill list (250 per page) filtered by ``fromDateTime``.
    New bills are persisted immediately and follow-up fetches (documents,
    actions, sponsor, cosponsors) are enqueued; existing bills are diffed via
    _update_bill_if_changed. The last-polled watermark is captured *before*
    the sweep starts so bills updated mid-sweep are re-fetched next cycle
    (safe: all operations are idempotent).
    """
    db = get_sync_db()
    try:
        last_polled = _get_setting(db, "congress_last_polled_at")

        # Adaptive: skip off-hours polls if last poll was recent (< 1 hour ago)
        if _is_congress_off_hours() and last_polled:
            try:
                last_dt = datetime.fromisoformat(last_polled.replace("Z", "+00:00"))
                if (datetime.now(timezone.utc) - last_dt) < timedelta(hours=1):
                    logger.info("Skipping poll — off-hours and last poll < 1 hour ago")
                    return {"new": 0, "updated": 0, "skipped": "off_hours"}
            except ValueError:
                # Unparseable watermark — fall through and poll normally.
                pass

        # On first run, seed from 2 months back rather than the full congress history
        if not last_polled:
            two_months_ago = datetime.now(timezone.utc) - timedelta(days=60)
            last_polled = two_months_ago.strftime("%Y-%m-%dT%H:%M:%SZ")

        # Capture the next watermark BEFORE fetching. Writing "now" at the end
        # (the previous behavior) silently dropped any bill updated while the
        # sweep was running.
        poll_started_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        current_congress = congress_api.get_current_congress()
        logger.info("Polling Congress %s (since %s)", current_congress, last_polled)

        new_count = 0
        updated_count = 0
        offset = 0

        while True:
            response = congress_api.get_bills(
                congress=current_congress,
                offset=offset,
                limit=250,
                from_date_time=last_polled,
            )
            bills_data = response.get("bills", [])
            if not bills_data:
                break

            for bill_data in bills_data:
                parsed = congress_api.parse_bill_from_api(bill_data, current_congress)
                if parsed.get("bill_type") not in TRACKED_BILL_TYPES:
                    continue
                bill_id = parsed["bill_id"]
                existing = db.get(Bill, bill_id)

                if existing is None:
                    # Save bill immediately; fetch sponsor detail asynchronously
                    parsed["sponsor_id"] = None
                    parsed["last_checked_at"] = datetime.now(timezone.utc)
                    db.add(Bill(**parsed))
                    db.commit()
                    new_count += 1
                    # Enqueue document, action, sponsor, and cosponsor fetches
                    from app.workers.document_fetcher import fetch_bill_documents
                    fetch_bill_documents.delay(bill_id)
                    fetch_bill_actions.delay(bill_id)
                    fetch_sponsor_for_bill.delay(
                        bill_id, current_congress, parsed["bill_type"], parsed["bill_number"]
                    )
                    from app.workers.bill_classifier import fetch_bill_cosponsors
                    fetch_bill_cosponsors.delay(bill_id)
                else:
                    # Count only bills that actually changed; the previous code
                    # incremented unconditionally and over-reported updates.
                    if _update_bill_if_changed(db, existing, parsed):
                        updated_count += 1

            db.commit()
            offset += 250
            if len(bills_data) < 250:
                break

        # Advance the watermark to the sweep's start time (see note above).
        _set_setting(db, "congress_last_polled_at", poll_started_at)
        logger.info("Poll complete: %d new, %d updated", new_count, updated_count)
        return {"new": new_count, "updated": updated_count}

    except Exception as exc:
        db.rollback()
        logger.exception("Poll failed")
        raise self.retry(exc=exc, countdown=60)
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.sync_members")
def sync_members(self):
    """Sync current Congress members.

    Pages through Congress.gov's current-member list (250 per page) and
    upserts each member keyed on bioguide_id. Idempotent; safe to re-run.
    """
    db = get_sync_db()
    try:
        offset = 0
        synced = 0
        while True:
            response = congress_api.get_members(offset=offset, limit=250, current_member=True)
            members_data = response.get("members", [])
            if not members_data:
                break

            for member_data in members_data:
                parsed = congress_api.parse_member_from_api(member_data)
                if not parsed.get("bioguide_id"):
                    # Cannot key the row without a bioguide id — skip it.
                    continue
                existing = db.get(Member, parsed["bioguide_id"])
                if existing is None:
                    db.add(Member(**parsed))
                else:
                    # Overwrite all parsed fields; Congress.gov is the source of truth.
                    for k, v in parsed.items():
                        setattr(existing, k, v)
                synced += 1

            db.commit()
            offset += 250
            if len(members_data) < 250:
                break

        logger.info("Synced %d members", synced)
        return {"synced": synced}
    except Exception as exc:
        db.rollback()
        # Previously the retry was silent — log the failure so it is visible,
        # consistent with the bill-polling task's error handling.
        logger.exception("Member sync failed")
        raise self.retry(exc=exc, countdown=120)
    finally:
        db.close()
|
||||
|
||||
|
||||
def _sync_sponsor(db, bill_data: dict) -> str | None:
    """Ensure the bill sponsor exists in the members table. Returns bioguide_id or None."""
    sponsor_list = bill_data.get("sponsors", [])
    if not sponsor_list:
        return None

    primary = sponsor_list[0]
    bioguide_id = primary.get("bioguideId")
    if not bioguide_id:
        return None

    # Insert a minimal member record only when one doesn't already exist.
    if db.get(Member, bioguide_id) is None:
        raw_party = primary.get("party")
        db.add(Member(
            bioguide_id=bioguide_id,
            name=primary.get("fullName", ""),
            first_name=primary.get("firstName"),
            last_name=primary.get("lastName"),
            party=raw_party[:10] if raw_party else None,
            state=primary.get("state"),
        ))
        db.commit()
    return bioguide_id
|
||||
|
||||
|
||||
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.fetch_sponsor_for_bill")
def fetch_sponsor_for_bill(self, bill_id: str, congress: int, bill_type: str, bill_number: str):
    """Async sponsor fetch: get bill detail from Congress.gov and link the sponsor. Idempotent."""
    db = get_sync_db()
    try:
        bill = db.get(Bill, bill_id)
        # Guard clauses: unknown bill, or sponsor already linked.
        if not bill:
            return {"status": "not_found"}
        if bill.sponsor_id:
            return {"status": "already_set", "sponsor_id": bill.sponsor_id}

        detail = congress_api.get_bill_detail(congress, bill_type, bill_number)
        sponsor_id = _sync_sponsor(db, detail.get("bill", {}))
        if sponsor_id:
            bill.sponsor_id = sponsor_id
            db.commit()
        return {"status": "ok", "sponsor_id": sponsor_id}
    except Exception as exc:
        db.rollback()
        raise self.retry(exc=exc, countdown=60)
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_sponsor_ids")
def backfill_sponsor_ids(self):
    """Backfill sponsor_id for all bills where it is NULL by fetching bill detail from Congress.gov.

    One-off maintenance task; each bill is handled independently so a single
    failure does not abort the run.
    """
    db = get_sync_db()
    try:
        bills = db.query(Bill).filter(Bill.sponsor_id.is_(None)).all()
        total = len(bills)
        updated = 0
        logger.info("Backfilling sponsors for %d bills", total)
        for bill in bills:
            try:
                detail = congress_api.get_bill_detail(bill.congress_number, bill.bill_type, bill.bill_number)
                sponsor_id = _sync_sponsor(db, detail.get("bill", {}))
                if sponsor_id:
                    bill.sponsor_id = sponsor_id
                    db.commit()
                    updated += 1
            except Exception as e:
                # Roll back any half-applied state so the next bill starts with
                # a clean session (a failed flush otherwise poisons the session).
                db.rollback()
                logger.warning(f"Could not backfill sponsor for {bill.bill_id}: {e}")
            time.sleep(0.1)  # ~10 req/sec, well under Congress.gov 5000/hr limit
        logger.info("Sponsor backfill complete: %d/%d updated", updated, total)
        return {"total": total, "updated": updated}
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.fetch_bill_actions")
def fetch_bill_actions(self, bill_id: str):
    """Fetch and sync all actions for a bill from Congress.gov. Idempotent.

    Pages through the bill's action list (250 per page) and inserts each action
    with an ON CONFLICT DO NOTHING upsert, so re-runs and overlapping pages
    never create duplicates. Stamps bill.actions_fetched_at on completion.
    """
    db = get_sync_db()
    try:
        bill = db.get(Bill, bill_id)
        if not bill:
            logger.warning(f"fetch_bill_actions: bill {bill_id} not found")
            return

        offset = 0
        inserted = 0
        while True:
            try:
                response = congress_api.get_bill_actions(
                    bill.congress_number, bill.bill_type, bill.bill_number, offset=offset
                )
            except Exception as exc:
                # Retry the whole task on API failure; pages already committed
                # are safe to re-process thanks to on_conflict_do_nothing below.
                raise self.retry(exc=exc, countdown=60)

            actions_data = response.get("actions", [])
            if not actions_data:
                break

            for action in actions_data:
                # PostgreSQL insert-or-ignore keyed on the named unique
                # constraint (presumably bill_id/action_date/action_text,
                # per the constraint name — confirm against the migration).
                stmt = pg_insert(BillAction.__table__).values(
                    bill_id=bill_id,
                    action_date=action.get("actionDate"),
                    action_text=action.get("text", ""),
                    action_type=action.get("type"),
                    chamber=action.get("chamber"),
                ).on_conflict_do_nothing(constraint="uq_bill_actions_bill_date_text")
                result = db.execute(stmt)
                # rowcount is 1 for a real insert, 0 when the conflict path fired.
                inserted += result.rowcount

            # Commit per page so progress survives a mid-run failure.
            db.commit()
            offset += 250
            if len(actions_data) < 250:
                break

        # Stamp fetch time so batch schedulers can tell fresh bills from stale.
        bill.actions_fetched_at = datetime.now(timezone.utc)
        db.commit()
        logger.info(f"fetch_bill_actions: {bill_id} — inserted {inserted} new actions")
        return {"bill_id": bill_id, "inserted": inserted}
    except Exception as exc:
        # Includes the Retry raised above: roll back, then let Celery handle it.
        db.rollback()
        raise
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True, name="app.workers.congress_poller.fetch_actions_for_active_bills")
def fetch_actions_for_active_bills(self):
    """Nightly batch: enqueue action fetches for recently active bills missing action data."""
    db = get_sync_db()
    try:
        # Bills with activity in the last 30 days whose action data has either
        # never been fetched or is older than the latest recorded action.
        cutoff = datetime.now(timezone.utc).date() - timedelta(days=30)
        stale_condition = or_(
            Bill.actions_fetched_at.is_(None),
            Bill.latest_action_date > Bill.actions_fetched_at,
        )
        candidates = (
            db.query(Bill)
            .filter(Bill.latest_action_date >= cutoff, stale_condition)
            .limit(200)
            .all()
        )

        queued = 0
        for candidate in candidates:
            fetch_bill_actions.delay(candidate.bill_id)
            queued += 1
            time.sleep(0.2)  # ~5 tasks/sec to avoid Redis burst
        logger.info(f"fetch_actions_for_active_bills: queued {queued} bills")
        return {"queued": queued}
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_all_bill_actions")
def backfill_all_bill_actions(self):
    """One-time backfill: enqueue action fetches for every bill that has never had actions fetched."""
    db = get_sync_db()
    try:
        # Most recently active bills first, so fresh data lands soonest.
        pending = (
            db.query(Bill)
            .filter(Bill.actions_fetched_at.is_(None))
            .order_by(Bill.latest_action_date.desc())
            .all()
        )

        queued = 0
        for pending_bill in pending:
            fetch_bill_actions.delay(pending_bill.bill_id)
            queued += 1
            time.sleep(0.05)  # ~20 tasks/sec — workers will self-throttle against Congress.gov
        logger.info(f"backfill_all_bill_actions: queued {queued} bills")
        return {"queued": queued}
    finally:
        db.close()
|
||||
|
||||
|
||||
def _update_bill_if_changed(db, existing: Bill, parsed: dict) -> bool:
    """Update bill fields if anything has changed. Returns True if updated.

    Two tiers of fields:
      * tracked fields — a change marks the bill "changed", triggering document/
        action fetches and follower notifications;
      * fill-null fields — static metadata filled in once, never re-triggering.
    Commits when anything was written; notification side effects fire only for
    tracked-field changes.
    """
    changed = False  # a tracked field changed — triggers fetches + notifications
    dirty = False    # anything (tracked or fill-null) was written — needs commit

    # Meaningful change fields — trigger document + action fetch when updated
    track_fields = ["title", "short_title", "latest_action_date", "latest_action_text", "status"]
    for field in track_fields:
        new_val = parsed.get(field)
        # Falsy new values are ignored: never overwrite existing data with blanks.
        if new_val and getattr(existing, field) != new_val:
            setattr(existing, field, new_val)
            changed = True
            dirty = True

    # Static fields — only fill in if currently null; no change trigger needed
    fill_null_fields = ["introduced_date", "congress_url", "chamber"]
    for field in fill_null_fields:
        new_val = parsed.get(field)
        if new_val and getattr(existing, field) is None:
            setattr(existing, field, new_val)
            dirty = True

    if changed:
        existing.last_checked_at = datetime.now(timezone.utc)
    if dirty:
        db.commit()
    if changed:
        # Imported lazily to avoid a circular import at module load time —
        # TODO confirm; notification_utils may import from this module's package.
        from app.workers.notification_utils import (
            emit_bill_notification,
            emit_member_follow_notifications,
            emit_topic_follow_notifications,
            categorize_action,
        )
        action_text = parsed.get("latest_action_text", "")
        action_category = categorize_action(action_text)
        # Only fetch new documents for actions that produce new text versions on GovInfo.
        # Skip procedural/administrative actions (referral, calendar) to avoid unnecessary calls.
        # An uncategorized action (falsy category) still fetches, to be safe.
        if not action_category or action_category in _DOC_PRODUCING_CATEGORIES:
            from app.workers.document_fetcher import fetch_bill_documents
            fetch_bill_documents.delay(existing.bill_id)
        # Actions are refreshed on every tracked change, regardless of category.
        fetch_bill_actions.delay(existing.bill_id)
        if action_category:
            emit_bill_notification(db, existing, "bill_updated", action_text, action_category=action_category)
            emit_member_follow_notifications(db, existing, "bill_updated", action_text, action_category=action_category)
            # Topic followers — pull tags from the bill's latest brief
            from app.models.brief import BillBrief
            latest_brief = (
                db.query(BillBrief)
                .filter_by(bill_id=existing.bill_id)
                .order_by(BillBrief.created_at.desc())
                .first()
            )
            # No brief yet (or brief without tags) means no topic fan-out.
            topic_tags = latest_brief.topic_tags or [] if latest_brief else []
            emit_topic_follow_notifications(
                db, existing, "bill_updated", action_text, topic_tags, action_category=action_category
            )
    return changed
|
||||
|
||||
|
||||
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_bill_metadata")
def backfill_bill_metadata(self):
    """
    Find bills with null introduced_date (or other static fields) and
    re-fetch their detail from Congress.gov to fill in the missing values.
    No document or LLM calls — metadata only.

    Each row is processed independently: a failure rolls back and moves on.
    """
    db = get_sync_db()
    try:
        from sqlalchemy import text as sa_text
        rows = db.execute(sa_text("""
            SELECT bill_id, congress_number, bill_type, bill_number
            FROM bills
            WHERE introduced_date IS NULL
               OR congress_url IS NULL
               OR chamber IS NULL
        """)).fetchall()

        updated = 0
        skipped = 0
        for row in rows:
            try:
                detail = congress_api.get_bill_detail(
                    row.congress_number, row.bill_type, row.bill_number
                )
                bill_data = detail.get("bill", {})
                # Reuse the standard list-endpoint parser by reshaping the
                # detail payload into the minimal list-style dict it expects.
                parsed = congress_api.parse_bill_from_api(
                    {
                        "type": row.bill_type,
                        "number": row.bill_number,
                        "introducedDate": bill_data.get("introducedDate"),
                        "title": bill_data.get("title"),
                        "shortTitle": bill_data.get("shortTitle"),
                        "latestAction": bill_data.get("latestAction") or {},
                    },
                    row.congress_number,
                )
                bill = db.get(Bill, row.bill_id)
                if not bill:
                    skipped += 1
                    continue
                # Fill only fields that are currently NULL — never overwrite.
                fill_null_fields = ["introduced_date", "congress_url", "chamber", "title", "short_title"]
                dirty = False
                for field in fill_null_fields:
                    new_val = parsed.get(field)
                    if new_val and getattr(bill, field) is None:
                        setattr(bill, field, new_val)
                        dirty = True
                if dirty:
                    db.commit()
                    updated += 1
                else:
                    skipped += 1
            except Exception as exc:
                # Roll back so a failed flush doesn't poison the session for
                # the next row (previously the session was left dirty).
                db.rollback()
                logger.warning(f"backfill_bill_metadata: failed for {row.bill_id}: {exc}")
                skipped += 1
            # Throttle on every iteration — including failures — so repeated
            # errors don't hammer the API (previously only the success path slept).
            time.sleep(0.2)  # ~300 req/min — well under the 5k/hr limit

        logger.info(f"backfill_bill_metadata: {updated} updated, {skipped} skipped")
        return {"updated": updated, "skipped": skipped}
    finally:
        db.close()
|
||||
Reference in New Issue
Block a user