Add bill action pipeline, admin health panel, and LLM provider fixes

- Fetch bill actions from Congress.gov and populate the action timeline
- Add nightly batch task and beat schedule for active bill actions
- Add admin reprocess endpoint for per-bill debugging
- Add BriefPanel with "What Changed" view and version history
- Add External API Health section with per-source latency testing
- Redesign Manual Controls as health panel with status dots and descriptions
- Add Resume Analysis task for stalled LLM jobs
- Add Backfill Dates & Links task for bills with null metadata
- Fix LLM provider/model DB overrides being ignored (env vars used instead)
- Fix Gemini 404: gemini-1.5-pro deprecated → gemini-2.0-flash
- Fix Anthropic models list: use REST API directly (SDK too old for .models)
- Replace test-LLM full analysis with lightweight ping (max_tokens=20)
- Add has_document field to BillDetail; show "No bill text published" state
- Fix "Introduced: —" showing for bills with null introduced_date
- Add bills_missing_sponsor and bills_missing_metadata to admin stats
- Add GovInfo health check using /collections endpoint (fixes 500 from /packages)

Authored-By: Jack Levy
2026-03-01 11:06:14 -05:00
parent defc2c116d
commit 5eebc2f196
10 changed files with 586 additions and 74 deletions
--- a/backend/app/workers/celery_app.py
+++ b/backend/app/workers/celery_app.py
@@ -15,6 +15,7 @@ celery_app = Celery(
        "app.workers.news_fetcher",
        "app.workers.trend_scorer",
        "app.workers.member_interest",
+        "app.workers.notification_dispatcher",
    ],
 )

@@ -37,6 +38,7 @@ celery_app.conf.update(
        "app.workers.news_fetcher.*": {"queue": "news"},
        "app.workers.trend_scorer.*": {"queue": "news"},
        "app.workers.member_interest.*": {"queue": "news"},
+        "app.workers.notification_dispatcher.*": {"queue": "polling"},
    },
    task_queues=[
        Queue("polling"),
@@ -72,5 +74,9 @@ celery_app.conf.update(
            "task": "app.workers.congress_poller.fetch_actions_for_active_bills",
            "schedule": crontab(hour=4, minute=0),  # 4 AM UTC, after trend + member scoring
        },
+        "dispatch-notifications": {
+            "task": "app.workers.notification_dispatcher.dispatch_notifications",
+            "schedule": crontab(minute="*/5"),  # Every 5 minutes
+        },
    },
 )
--- a/backend/app/workers/congress_poller.py
+++ b/backend/app/workers/congress_poller.py
@@ -300,17 +300,117 @@ def fetch_actions_for_active_bills(self):
 def _update_bill_if_changed(db, existing: Bill, parsed: dict) -> bool:
    """Update bill fields if anything has changed. Returns True if updated."""
    changed = False
+    # dirty is a superset of changed: it is also set by the null-only back-fill below.
+    dirty = False
+
+    # Meaningful change fields — trigger document + action fetch when updated
    track_fields = ["title", "short_title", "latest_action_date", "latest_action_text", "status"]
    for field in track_fields:
        new_val = parsed.get(field)
        if new_val and getattr(existing, field) != new_val:
            setattr(existing, field, new_val)
            changed = True
+            dirty = True
+
+    # Static fields — only fill in if currently null; no change trigger needed
+    fill_null_fields = ["introduced_date", "congress_url", "chamber"]
+    for field in fill_null_fields:
+        new_val = parsed.get(field)
+        if new_val and getattr(existing, field) is None:
+            setattr(existing, field, new_val)
+            dirty = True
+
    if changed:
        existing.last_checked_at = datetime.now(timezone.utc)
+    # Commit on any write, so null-only back-fills are persisted too.
+    if dirty:
        db.commit()
-        # Check for new text versions and sync actions now that the bill has changed
+    # Re-queue the document and action fetches only for a tracked-field change.
+    if changed:
        from app.workers.document_fetcher import fetch_bill_documents
        fetch_bill_documents.delay(existing.bill_id)
        fetch_bill_actions.delay(existing.bill_id)
    return changed
+
+
+@celery_app.task(bind=True, name="app.workers.congress_poller.backfill_bill_metadata")
+def backfill_bill_metadata(self):
+    """
+    Find bills with null introduced_date (or other static fields) and
+    re-fetch their detail from Congress.gov to fill in the missing values.
+    No document or LLM calls — metadata only.
+
+    Only columns that are currently NULL are written; existing values are
+    never overwritten. Each bill is committed on its own, and a failure on
+    one bill is logged, rolled back and counted as skipped so the rest of
+    the batch still runs.
+
+    Returns:
+        dict with "updated" and "skipped" counts.
+    """
+    # Pause after every Congress.gov request. 0.75 s caps this task at
+    # 4,800 requests/hour even with instant responses, staying under the
+    # 5k/hr limit; a 0.2 s pause would allow up to 18,000/hour.
+    request_pause_seconds = 0.75
+    fill_null_fields = ["introduced_date", "congress_url", "chamber", "title", "short_title"]
+
+    db = get_sync_db()
+    try:
+        from sqlalchemy import text as sa_text
+        rows = db.execute(sa_text("""
+            SELECT bill_id, congress_number, bill_type, bill_number
+            FROM bills
+            WHERE introduced_date IS NULL
+               OR congress_url IS NULL
+               OR chamber IS NULL
+        """)).fetchall()
+
+        updated = 0
+        skipped = 0
+        for row in rows:
+            try:
+                detail = congress_api.get_bill_detail(
+                    row.congress_number, row.bill_type, row.bill_number
+                )
+                bill_data = detail.get("bill", {})
+                parsed = congress_api.parse_bill_from_api(
+                    {
+                        "type": row.bill_type,
+                        "number": row.bill_number,
+                        "introducedDate": bill_data.get("introducedDate"),
+                        "title": bill_data.get("title"),
+                        "shortTitle": bill_data.get("shortTitle"),
+                        "latestAction": bill_data.get("latestAction") or {},
+                    },
+                    row.congress_number,
+                )
+                bill = db.get(Bill, row.bill_id)
+                if not bill:
+                    skipped += 1
+                    continue
+                dirty = False
+                for field in fill_null_fields:
+                    new_val = parsed.get(field)
+                    if new_val and getattr(bill, field) is None:
+                        setattr(bill, field, new_val)
+                        dirty = True
+                if dirty:
+                    db.commit()
+                    updated += 1
+                else:
+                    skipped += 1
+            except Exception as exc:
+                # A failed flush/commit leaves the session unusable until it is
+                # rolled back; without this every later bill would fail as well.
+                db.rollback()
+                logger.warning(f"backfill_bill_metadata: failed for {row.bill_id}: {exc}")
+                skipped += 1
+            finally:
+                # Runs on success, skip and failure alike so errors cannot
+                # turn the loop into a tight retry burst against the API.
+                time.sleep(request_pause_seconds)
+
+        logger.info(f"backfill_bill_metadata: {updated} updated, {skipped} skipped")
+        return {"updated": updated, "skipped": skipped}
+    finally:
+        db.close()
--- a/backend/app/workers/llm_processor.py
+++ b/backend/app/workers/llm_processor.py
@@ -199,3 +199,55 @@ def backfill_brief_citations(self):
        return {"total": total, "queued": queued, "skipped": skipped}
    finally:
        db.close()
+
+
+@celery_app.task(bind=True, name="app.workers.llm_processor.resume_pending_analysis")
+def resume_pending_analysis(self):
+    """
+    Re-queue work for bills whose analysis never completed.
+
+    Stage 1 — every BillDocument that has raw_text but no BillBrief gets a
+      fresh LLM task.
+
+    Stage 2 — every Bill with no BillDocument gets a fresh document fetch
+      task.
+
+    Returns a dict with the number of tasks queued in each stage.
+    """
+    db = get_sync_db()
+    try:
+        # Stage 1: documents holding text that were never summarised
+        pending_docs = db.execute(text("""
+            SELECT bd.id
+            FROM bill_documents bd
+            LEFT JOIN bill_briefs bb ON bb.document_id = bd.id
+            WHERE bb.id IS NULL AND bd.raw_text IS NOT NULL
+        """)).fetchall()
+
+        for doc in pending_docs:
+            process_document_with_llm.delay(doc.id)
+            time.sleep(0.1)  # brief gap between enqueues
+        queued_llm = len(pending_docs)
+
+        # Stage 2: bills that have no document row whatsoever
+        orphan_bills = db.execute(text("""
+            SELECT b.bill_id
+            FROM bills b
+            LEFT JOIN bill_documents bd ON bd.bill_id = b.bill_id
+            WHERE bd.id IS NULL
+        """)).fetchall()
+
+        from app.workers.document_fetcher import fetch_bill_documents
+        for bill in orphan_bills:
+            fetch_bill_documents.delay(bill.bill_id)
+            time.sleep(0.1)
+        queued_fetch = len(orphan_bills)
+
+        summary = {"queued_llm": queued_llm, "queued_fetch": queued_fetch}
+        logger.info(
+            f"resume_pending_analysis: {queued_llm} LLM tasks queued, "
+            f"{queued_fetch} document fetch tasks queued"
+        )
+        return summary
+    finally:
+        db.close()