feat: PocketVeto v1.0.0 — initial public release

Self-hosted US Congress monitoring platform with AI policy briefs, bill/member/topic follows, ntfy + RSS + email notifications, alignment scoring, collections, and draft-letter generator. Authored by: Jack Levy
2026-03-15 01:35:01 -04:00
commit 4c86a5b9ca
150 changed files with 19859 additions and 0 deletions
--- a/backend/app/services/govinfo_api.py
+++ b/backend/app/services/govinfo_api.py
@@ -0,0 +1,138 @@
+"""
+GovInfo API client for fetching actual bill text.
+
+Priority order for text formats: htm > html > txt > pdf
+ETag support: stores ETags in Redis so repeat fetches skip unchanged documents.
+"""
+import hashlib
+import logging
+import re
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Root URL of the GovInfo REST API; the package endpoints in this module are built on it.
+GOVINFO_BASE = "https://api.govinfo.gov"
+# File extensions to look for, most preferred first (consumed by find_best_text_url).
+FORMAT_PRIORITY = ["htm", "html", "txt", "pdf"]
+# Lifetime, in seconds, given to each ETag entry written to Redis.
+_ETAG_CACHE_TTL = 86400 * 30  # 30 days
+
+
+class DocumentUnchangedError(Exception):
+    """Signal that a conditional fetch found nothing new.
+
+    Raised when GovInfo answers an ETag-conditional request with
+    HTTP 304 Not Modified, so the caller can skip reprocessing.
+    """
+
+
+def _etag_redis():
+    """Build a Redis client for the ETag cache.
+
+    The ``redis`` package is imported at call time rather than at module
+    import. The client targets ``settings.REDIS_URL`` and is configured to
+    decode responses, so stored ETags come back as ``str``.
+    """
+    from redis import from_url
+
+    client = from_url(settings.REDIS_URL, decode_responses=True)
+    return client
+
+
+def _etag_key(url: str) -> str:
+    """Return the Redis key under which the ETag for *url* is cached.
+
+    The key is ``govinfo:etag:`` followed by the hex MD5 digest of the
+    encoded URL, which keeps keys fixed-length regardless of URL length.
+
+    MD5 serves purely as a fingerprint here, never for security, so it is
+    requested with ``usedforsecurity=False``. Without that flag
+    ``hashlib.md5`` raises ``ValueError`` on FIPS-restricted Python builds,
+    which would stop ETag caching from working there. The digest itself is
+    unaffected by the flag.
+    """
+    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
+    return f"govinfo:etag:{digest}"
+
+
+@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=15))
+def _get(url: str, params: dict = None) -> requests.Response:
+    """Issue an authenticated GET against the GovInfo API.
+
+    The key from ``settings.DATA_GOV_API_KEY`` is sent as the ``api_key``
+    query parameter; entries in *params* are merged on top of it. A 4xx/5xx
+    status raises via ``raise_for_status``. The ``@retry`` decorator stops
+    after three attempts, waiting with exponential backoff in between.
+    """
+    query = {"api_key": settings.DATA_GOV_API_KEY}
+    if params:
+        query.update(params)
+    resp = requests.get(url, params=query, timeout=60)
+    resp.raise_for_status()
+    return resp
+
+
+def get_package_summary(package_id: str) -> dict:
+    """Fetch the ``summary`` resource of a GovInfo package as parsed JSON."""
+    summary_url = f"{GOVINFO_BASE}/packages/{package_id}/summary"
+    return _get(summary_url).json()
+
+
+def get_package_content_detail(package_id: str) -> dict:
+    """Fetch the ``content-detail`` resource of a GovInfo package as parsed JSON."""
+    detail_url = f"{GOVINFO_BASE}/packages/{package_id}/content-detail"
+    return _get(detail_url).json()
+
+
+def find_best_text_url(
+    text_versions: list[dict],
+) -> tuple[Optional[str], Optional[str]]:
+    """
+    Pick the most preferred text document out of a bill's text versions.
+
+    Formats are tried in ``FORMAT_PRIORITY`` order. Within one format the
+    versions are scanned in the order given, and the first URL that ends in
+    ``.<format>`` (compared case-insensitively) wins.
+
+    Args:
+        text_versions: Text version objects (from Congress.gov API). Each is
+            read for a ``formats`` list of dicts carrying a ``url`` key.
+            Malformed entries (non-dict items, missing or null ``formats``
+            or ``url``) are skipped rather than raising.
+
+    Returns:
+        ``(url, format)`` for the best match, or ``(None, None)`` when nothing
+        matches. A 2-tuple is always returned, so the result can be unpacked
+        unconditionally.
+    """
+    for fmt in FORMAT_PRIORITY:
+        suffix = f".{fmt}"
+        for version in text_versions or []:
+            if not isinstance(version, dict):
+                continue
+            # ``or []`` also covers an explicit ``"formats": null`` in the payload.
+            for fmt_info in version.get("formats") or []:
+                if not isinstance(fmt_info, dict):
+                    continue
+                url = fmt_info.get("url")
+                # A null or non-string url cannot match any extension.
+                if isinstance(url, str) and url.lower().endswith(suffix):
+                    return url, fmt
+    return None, None
+
+
+def _load_cached_etag(url: str) -> Optional[str]:
+    """Return the ETag cached for *url*, or None if absent or the lookup fails."""
+    try:
+        return _etag_redis().get(_etag_key(url)) or None
+    except Exception as e:
+        # The cache is only an optimisation: fall back to an unconditional GET.
+        logger.warning("ETag lookup failed for %s: %s", url, e)
+        return None
+
+
+def _store_etag(url: str, etag: str) -> None:
+    """Cache *etag* for *url* for ``_ETAG_CACHE_TTL`` seconds (best effort)."""
+    try:
+        _etag_redis().setex(_etag_key(url), _ETAG_CACHE_TTL, etag)
+    except Exception as e:
+        # Losing the ETag only costs a full re-download next time.
+        logger.warning("ETag store failed for %s: %s", url, e)
+
+
+def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
+    """
+    Download and extract plain text from a GovInfo document URL.
+
+    Uses an ETag conditional GET: when an ETag is cached for *url* it is sent
+    as ``If-None-Match``, and an HTTP 304 reply raises DocumentUnchangedError
+    so the caller can skip reprocessing.
+
+    The response ETag is written to Redis only after text was extracted
+    successfully. Caching it earlier would make a failed extraction permanent,
+    because every later fetch would get a 304 and skip the document.
+
+    Args:
+        url: Document URL to download.
+        fmt: One of ``"htm"``, ``"html"``, ``"txt"`` or ``"pdf"``; selects how
+            the response body is turned into text.
+
+    Returns:
+        The extracted text, or None when the download or extraction failed or
+        *fmt* is not a supported format.
+
+    Raises:
+        DocumentUnchangedError: GovInfo answered 304 Not Modified.
+    """
+    headers = {}
+    stored_etag = _load_cached_etag(url)
+    if stored_etag:
+        headers["If-None-Match"] = stored_etag
+
+    try:
+        response = requests.get(url, headers=headers, timeout=120)
+
+        if response.status_code == 304:
+            raise DocumentUnchangedError(f"Document unchanged (ETag match): {url}")
+
+        response.raise_for_status()
+
+        if fmt in ("htm", "html"):
+            text = _extract_from_html(response.text)
+        elif fmt == "txt":
+            text = response.text
+        elif fmt == "pdf":
+            text = _extract_from_pdf(response.content)
+        else:
+            logger.warning("Unsupported text format %r for %s", fmt, url)
+            return None
+    except DocumentUnchangedError:
+        raise
+    except Exception as e:
+        logger.error("Failed to fetch text from %s: %s", url, e)
+        return None
+
+    # Persist the ETag for future conditional requests, but only once we
+    # actually hold usable text for this version of the document.
+    etag = response.headers.get("ETag")
+    if text and etag:
+        _store_etag(url, etag)
+    return text
+
+
+def _extract_from_html(html: str) -> str:
+    """Reduce an HTML document to whitespace-normalised plain text.
+
+    Script, style, nav, header and footer elements are removed before the
+    text is pulled out. Runs of three or more newlines shrink to a single
+    blank line, and runs of two or more spaces shrink to one space.
+    """
+    document = BeautifulSoup(html, "lxml")
+    for element in document.find_all(["script", "style", "nav", "header", "footer"]):
+        element.decompose()
+
+    raw = document.get_text(separator="\n")
+    fewer_newlines = re.sub(r"\n{3,}", "\n\n", raw)
+    fewer_spaces = re.sub(r" {2,}", " ", fewer_newlines)
+    return fewer_spaces.strip()
+
+
+def _extract_from_pdf(content: bytes) -> Optional[str]:
+    """Run pdfminer over in-memory PDF bytes and return the extracted text.
+
+    Any failure, including pdfminer not being importable, is logged and
+    reported as ``None`` instead of propagating.
+    """
+    try:
+        import io
+        from pdfminer import high_level
+
+        pdf_stream = io.BytesIO(content)
+        return high_level.extract_text(pdf_stream)
+    except Exception as e:
+        logger.error(f"PDF extraction failed: {e}")
+        return None