feat: PocketVeto v1.0.0 — initial public release

Self-hosted US Congress monitoring platform with AI policy briefs,
bill/member/topic follows, ntfy + RSS + email notifications,
alignment scoring, collections, and draft-letter generator.

Authored by: Jack Levy
This commit is contained in:
Jack Levy
2026-03-15 01:35:01 -04:00
commit 4c86a5b9ca
150 changed files with 19859 additions and 0 deletions

View File

@@ -0,0 +1,480 @@
"""
Congress.gov poller — incremental bill and member sync.
Runs on Celery Beat schedule (every 30 min by default).
Uses fromDateTime to fetch only recently updated bills.
All operations are idempotent.
"""
import logging
import time
from datetime import datetime, timedelta, timezone
from sqlalchemy import or_
from sqlalchemy.dialects.postgresql import insert as pg_insert
from app.database import get_sync_db
from app.models import Bill, BillAction, Member, AppSetting
from app.services import congress_api
from app.workers.celery_app import celery_app
logger = logging.getLogger(__name__)
def _get_setting(db, key: str, default=None) -> str | None:
row = db.get(AppSetting, key)
return row.value if row else default
def _set_setting(db, key: str, value: str) -> None:
row = db.get(AppSetting, key)
if row:
row.value = value
else:
db.add(AppSetting(key=key, value=value))
db.commit()
# Only track legislation that can become law. Simple/concurrent resolutions
# (hres, sres, hconres, sconres) are procedural and not worth analyzing.
TRACKED_BILL_TYPES = {"hr", "s", "hjres", "sjres"}
# Action categories that produce new bill text versions on GovInfo.
# Procedural/administrative actions (referral to committee, calendar placement)
# rarely produce a new text version, so we skip document fetching for them.
_DOC_PRODUCING_CATEGORIES = {"vote", "committee_report", "presidential", "new_document", "new_amendment"}
def _is_congress_off_hours() -> bool:
"""Return True during periods when Congress.gov is unlikely to publish new content."""
try:
from zoneinfo import ZoneInfo
now_est = datetime.now(ZoneInfo("America/New_York"))
except Exception:
return False
# Weekends
if now_est.weekday() >= 5:
return True
# Nights: before 9 AM or after 9 PM EST
if now_est.hour < 9 or now_est.hour >= 21:
return True
return False
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.poll_congress_bills")
def poll_congress_bills(self):
"""Fetch recently updated bills from Congress.gov and enqueue document + LLM processing."""
db = get_sync_db()
try:
last_polled = _get_setting(db, "congress_last_polled_at")
# Adaptive: skip off-hours polls if last poll was recent (< 1 hour ago)
if _is_congress_off_hours() and last_polled:
try:
last_dt = datetime.fromisoformat(last_polled.replace("Z", "+00:00"))
if (datetime.now(timezone.utc) - last_dt) < timedelta(hours=1):
logger.info("Skipping poll — off-hours and last poll < 1 hour ago")
return {"new": 0, "updated": 0, "skipped": "off_hours"}
except Exception:
pass
# On first run, seed from 2 months back rather than the full congress history
if not last_polled:
two_months_ago = datetime.now(timezone.utc) - timedelta(days=60)
last_polled = two_months_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
current_congress = congress_api.get_current_congress()
logger.info(f"Polling Congress {current_congress} (since {last_polled})")
new_count = 0
updated_count = 0
offset = 0
while True:
response = congress_api.get_bills(
congress=current_congress,
offset=offset,
limit=250,
from_date_time=last_polled,
)
bills_data = response.get("bills", [])
if not bills_data:
break
for bill_data in bills_data:
parsed = congress_api.parse_bill_from_api(bill_data, current_congress)
if parsed.get("bill_type") not in TRACKED_BILL_TYPES:
continue
bill_id = parsed["bill_id"]
existing = db.get(Bill, bill_id)
if existing is None:
# Save bill immediately; fetch sponsor detail asynchronously
parsed["sponsor_id"] = None
parsed["last_checked_at"] = datetime.now(timezone.utc)
db.add(Bill(**parsed))
db.commit()
new_count += 1
# Enqueue document, action, sponsor, and cosponsor fetches
from app.workers.document_fetcher import fetch_bill_documents
fetch_bill_documents.delay(bill_id)
fetch_bill_actions.delay(bill_id)
fetch_sponsor_for_bill.delay(
bill_id, current_congress, parsed["bill_type"], parsed["bill_number"]
)
from app.workers.bill_classifier import fetch_bill_cosponsors
fetch_bill_cosponsors.delay(bill_id)
else:
_update_bill_if_changed(db, existing, parsed)
updated_count += 1
db.commit()
offset += 250
if len(bills_data) < 250:
break
# Update last polled timestamp
_set_setting(db, "congress_last_polled_at", datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))
logger.info(f"Poll complete: {new_count} new, {updated_count} updated")
return {"new": new_count, "updated": updated_count}
except Exception as exc:
db.rollback()
logger.error(f"Poll failed: {exc}")
raise self.retry(exc=exc, countdown=60)
finally:
db.close()
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.sync_members")
def sync_members(self):
"""Sync current Congress members."""
db = get_sync_db()
try:
offset = 0
synced = 0
while True:
response = congress_api.get_members(offset=offset, limit=250, current_member=True)
members_data = response.get("members", [])
if not members_data:
break
for member_data in members_data:
parsed = congress_api.parse_member_from_api(member_data)
if not parsed.get("bioguide_id"):
continue
existing = db.get(Member, parsed["bioguide_id"])
if existing is None:
db.add(Member(**parsed))
else:
for k, v in parsed.items():
setattr(existing, k, v)
synced += 1
db.commit()
offset += 250
if len(members_data) < 250:
break
logger.info(f"Synced {synced} members")
return {"synced": synced}
except Exception as exc:
db.rollback()
raise self.retry(exc=exc, countdown=120)
finally:
db.close()
def _sync_sponsor(db, bill_data: dict) -> str | None:
"""Ensure the bill sponsor exists in the members table. Returns bioguide_id or None."""
sponsors = bill_data.get("sponsors", [])
if not sponsors:
return None
sponsor_raw = sponsors[0]
bioguide_id = sponsor_raw.get("bioguideId")
if not bioguide_id:
return None
existing = db.get(Member, bioguide_id)
if existing is None:
db.add(Member(
bioguide_id=bioguide_id,
name=sponsor_raw.get("fullName", ""),
first_name=sponsor_raw.get("firstName"),
last_name=sponsor_raw.get("lastName"),
party=sponsor_raw.get("party", "")[:10] if sponsor_raw.get("party") else None,
state=sponsor_raw.get("state"),
))
db.commit()
return bioguide_id
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.fetch_sponsor_for_bill")
def fetch_sponsor_for_bill(self, bill_id: str, congress: int, bill_type: str, bill_number: str):
"""Async sponsor fetch: get bill detail from Congress.gov and link the sponsor. Idempotent."""
db = get_sync_db()
try:
bill = db.get(Bill, bill_id)
if not bill:
return {"status": "not_found"}
if bill.sponsor_id:
return {"status": "already_set", "sponsor_id": bill.sponsor_id}
detail = congress_api.get_bill_detail(congress, bill_type, bill_number)
sponsor_id = _sync_sponsor(db, detail.get("bill", {}))
if sponsor_id:
bill.sponsor_id = sponsor_id
db.commit()
return {"status": "ok", "sponsor_id": sponsor_id}
except Exception as exc:
db.rollback()
raise self.retry(exc=exc, countdown=60)
finally:
db.close()
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_sponsor_ids")
def backfill_sponsor_ids(self):
"""Backfill sponsor_id for all bills where it is NULL by fetching bill detail from Congress.gov."""
import time
db = get_sync_db()
try:
bills = db.query(Bill).filter(Bill.sponsor_id.is_(None)).all()
total = len(bills)
updated = 0
logger.info(f"Backfilling sponsors for {total} bills")
for bill in bills:
try:
detail = congress_api.get_bill_detail(bill.congress_number, bill.bill_type, bill.bill_number)
sponsor_id = _sync_sponsor(db, detail.get("bill", {}))
if sponsor_id:
bill.sponsor_id = sponsor_id
db.commit()
updated += 1
except Exception as e:
logger.warning(f"Could not backfill sponsor for {bill.bill_id}: {e}")
time.sleep(0.1) # ~10 req/sec, well under Congress.gov 5000/hr limit
logger.info(f"Sponsor backfill complete: {updated}/{total} updated")
return {"total": total, "updated": updated}
finally:
db.close()
@celery_app.task(bind=True, max_retries=3, name="app.workers.congress_poller.fetch_bill_actions")
def fetch_bill_actions(self, bill_id: str):
"""Fetch and sync all actions for a bill from Congress.gov. Idempotent."""
db = get_sync_db()
try:
bill = db.get(Bill, bill_id)
if not bill:
logger.warning(f"fetch_bill_actions: bill {bill_id} not found")
return
offset = 0
inserted = 0
while True:
try:
response = congress_api.get_bill_actions(
bill.congress_number, bill.bill_type, bill.bill_number, offset=offset
)
except Exception as exc:
raise self.retry(exc=exc, countdown=60)
actions_data = response.get("actions", [])
if not actions_data:
break
for action in actions_data:
stmt = pg_insert(BillAction.__table__).values(
bill_id=bill_id,
action_date=action.get("actionDate"),
action_text=action.get("text", ""),
action_type=action.get("type"),
chamber=action.get("chamber"),
).on_conflict_do_nothing(constraint="uq_bill_actions_bill_date_text")
result = db.execute(stmt)
inserted += result.rowcount
db.commit()
offset += 250
if len(actions_data) < 250:
break
bill.actions_fetched_at = datetime.now(timezone.utc)
db.commit()
logger.info(f"fetch_bill_actions: {bill_id} — inserted {inserted} new actions")
return {"bill_id": bill_id, "inserted": inserted}
except Exception as exc:
db.rollback()
raise
finally:
db.close()
@celery_app.task(bind=True, name="app.workers.congress_poller.fetch_actions_for_active_bills")
def fetch_actions_for_active_bills(self):
"""Nightly batch: enqueue action fetches for recently active bills missing action data."""
db = get_sync_db()
try:
cutoff = datetime.now(timezone.utc).date() - timedelta(days=30)
bills = (
db.query(Bill)
.filter(
Bill.latest_action_date >= cutoff,
or_(
Bill.actions_fetched_at.is_(None),
Bill.latest_action_date > Bill.actions_fetched_at,
),
)
.limit(200)
.all()
)
queued = 0
for bill in bills:
fetch_bill_actions.delay(bill.bill_id)
queued += 1
time.sleep(0.2) # ~5 tasks/sec to avoid Redis burst
logger.info(f"fetch_actions_for_active_bills: queued {queued} bills")
return {"queued": queued}
finally:
db.close()
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_all_bill_actions")
def backfill_all_bill_actions(self):
"""One-time backfill: enqueue action fetches for every bill that has never had actions fetched."""
db = get_sync_db()
try:
bills = (
db.query(Bill)
.filter(Bill.actions_fetched_at.is_(None))
.order_by(Bill.latest_action_date.desc())
.all()
)
queued = 0
for bill in bills:
fetch_bill_actions.delay(bill.bill_id)
queued += 1
time.sleep(0.05) # ~20 tasks/sec — workers will self-throttle against Congress.gov
logger.info(f"backfill_all_bill_actions: queued {queued} bills")
return {"queued": queued}
finally:
db.close()
def _update_bill_if_changed(db, existing: Bill, parsed: dict) -> bool:
"""Update bill fields if anything has changed. Returns True if updated."""
changed = False
dirty = False
# Meaningful change fields — trigger document + action fetch when updated
track_fields = ["title", "short_title", "latest_action_date", "latest_action_text", "status"]
for field in track_fields:
new_val = parsed.get(field)
if new_val and getattr(existing, field) != new_val:
setattr(existing, field, new_val)
changed = True
dirty = True
# Static fields — only fill in if currently null; no change trigger needed
fill_null_fields = ["introduced_date", "congress_url", "chamber"]
for field in fill_null_fields:
new_val = parsed.get(field)
if new_val and getattr(existing, field) is None:
setattr(existing, field, new_val)
dirty = True
if changed:
existing.last_checked_at = datetime.now(timezone.utc)
if dirty:
db.commit()
if changed:
from app.workers.notification_utils import (
emit_bill_notification,
emit_member_follow_notifications,
emit_topic_follow_notifications,
categorize_action,
)
action_text = parsed.get("latest_action_text", "")
action_category = categorize_action(action_text)
# Only fetch new documents for actions that produce new text versions on GovInfo.
# Skip procedural/administrative actions (referral, calendar) to avoid unnecessary calls.
if not action_category or action_category in _DOC_PRODUCING_CATEGORIES:
from app.workers.document_fetcher import fetch_bill_documents
fetch_bill_documents.delay(existing.bill_id)
fetch_bill_actions.delay(existing.bill_id)
if action_category:
emit_bill_notification(db, existing, "bill_updated", action_text, action_category=action_category)
emit_member_follow_notifications(db, existing, "bill_updated", action_text, action_category=action_category)
# Topic followers — pull tags from the bill's latest brief
from app.models.brief import BillBrief
latest_brief = (
db.query(BillBrief)
.filter_by(bill_id=existing.bill_id)
.order_by(BillBrief.created_at.desc())
.first()
)
topic_tags = latest_brief.topic_tags or [] if latest_brief else []
emit_topic_follow_notifications(
db, existing, "bill_updated", action_text, topic_tags, action_category=action_category
)
return changed
@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_bill_metadata")
def backfill_bill_metadata(self):
"""
Find bills with null introduced_date (or other static fields) and
re-fetch their detail from Congress.gov to fill in the missing values.
No document or LLM calls — metadata only.
"""
db = get_sync_db()
try:
from sqlalchemy import text as sa_text
rows = db.execute(sa_text("""
SELECT bill_id, congress_number, bill_type, bill_number
FROM bills
WHERE introduced_date IS NULL
OR congress_url IS NULL
OR chamber IS NULL
""")).fetchall()
updated = 0
skipped = 0
for row in rows:
try:
detail = congress_api.get_bill_detail(
row.congress_number, row.bill_type, row.bill_number
)
bill_data = detail.get("bill", {})
parsed = congress_api.parse_bill_from_api(
{
"type": row.bill_type,
"number": row.bill_number,
"introducedDate": bill_data.get("introducedDate"),
"title": bill_data.get("title"),
"shortTitle": bill_data.get("shortTitle"),
"latestAction": bill_data.get("latestAction") or {},
},
row.congress_number,
)
bill = db.get(Bill, row.bill_id)
if not bill:
skipped += 1
continue
fill_null_fields = ["introduced_date", "congress_url", "chamber", "title", "short_title"]
dirty = False
for field in fill_null_fields:
new_val = parsed.get(field)
if new_val and getattr(bill, field) is None:
setattr(bill, field, new_val)
dirty = True
if dirty:
db.commit()
updated += 1
else:
skipped += 1
time.sleep(0.2) # ~300 req/min — well under the 5k/hr limit
except Exception as exc:
logger.warning(f"backfill_bill_metadata: failed for {row.bill_id}: {exc}")
skipped += 1
logger.info(f"backfill_bill_metadata: {updated} updated, {skipped} skipped")
return {"updated": updated, "skipped": skipped}
finally:
db.close()