feat: PocketVeto v1.0.0 — initial public release

Self-hosted US Congress monitoring platform with AI policy briefs,
bill/member/topic follows, ntfy + RSS + email notifications,
alignment scoring, collections, and draft-letter generator.

Authored by: Jack Levy
This commit is contained in:
Jack Levy
2026-03-15 01:35:01 -04:00
commit 4c86a5b9ca
150 changed files with 19859 additions and 0 deletions

View File

@@ -0,0 +1,138 @@
"""
GovInfo API client for fetching actual bill text.
Priority order for text formats: htm/html > txt > pdf
ETag support: stores ETags in Redis so repeat fetches skip unchanged documents.
"""
import hashlib
import logging
import re
from typing import Optional
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
from app.config import settings
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Root URL used to build the GovInfo package endpoints below.
GOVINFO_BASE = "https://api.govinfo.gov"
# File extensions in descending order of preference; find_best_text_url()
# returns the first URL whose extension matches the earliest entry here.
FORMAT_PRIORITY = ["htm", "html", "txt", "pdf"]
# Expiry passed to Redis SETEX when an ETag is stored (seconds).
_ETAG_CACHE_TTL = 86400 * 30 # 30 days
class DocumentUnchangedError(Exception):
    """
    Signals that a conditional GET came back as HTTP 304 Not Modified,
    i.e. the stored ETag still matches and the document has not changed.
    """
def _etag_redis():
    """
    Build a Redis client from settings.REDIS_URL for the ETag cache.
    The redis package is imported lazily, only when this helper is called.
    """
    import redis

    client = redis.from_url(settings.REDIS_URL, decode_responses=True)
    return client
def _etag_key(url: str) -> str:
return f"govinfo:etag:{hashlib.md5(url.encode()).hexdigest()}"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=15))
def _get(url: str, params: dict = None) -> requests.Response:
    """
    GET a GovInfo API URL with the configured API key added to the query string.
    Caller-supplied params are merged on top of the api_key entry. A non-2xx
    status raises via raise_for_status(); the retry decorator allows up to
    3 attempts with exponential backoff between them.
    """
    query = {"api_key": settings.DATA_GOV_API_KEY}
    query.update(params or {})
    resp = requests.get(url, params=query, timeout=60)
    resp.raise_for_status()
    return resp
def get_package_summary(package_id: str) -> dict:
    """Request the package's ``summary`` endpoint and return the decoded JSON body."""
    summary_url = f"{GOVINFO_BASE}/packages/{package_id}/summary"
    return _get(summary_url).json()
def get_package_content_detail(package_id: str) -> dict:
    """Request the package's ``content-detail`` endpoint and return the decoded JSON body."""
    detail_url = f"{GOVINFO_BASE}/packages/{package_id}/content-detail"
    return _get(detail_url).json()
def find_best_text_url(
    text_versions: list[dict],
    format_priority: Optional[list[str]] = None,
) -> tuple[Optional[str], Optional[str]]:
    """
    From a list of text version objects (from Congress.gov API), find the best
    available text format.

    Each version is expected to be a dict with a "formats" list whose entries
    are dicts carrying a "url". Entries that do not fit that shape (non-dict
    items, missing/None/non-string URLs, a None "formats" value) are skipped
    rather than raising.

    Args:
        text_versions: Version objects to search; None or empty yields no match.
        format_priority: File extensions (without the dot) in descending order
            of preference. Defaults to the module-level FORMAT_PRIORITY.

    Returns:
        (url, format) for the first URL whose extension matches the earliest
        entry in the priority list, or (None, None) when nothing matches.
        A 2-tuple is always returned, so callers can unpack unconditionally.
    """
    priority = FORMAT_PRIORITY if format_priority is None else format_priority

    # Flatten the candidate URLs once, keeping version order then format order,
    # so ties within one format resolve the same way on every priority pass.
    candidates = []
    for version in text_versions or []:
        if not isinstance(version, dict):
            continue
        for fmt_info in version.get("formats") or []:
            if not isinstance(fmt_info, dict):
                continue
            url = fmt_info.get("url")
            if isinstance(url, str) and url:
                candidates.append(url)

    for fmt in priority:
        suffix = f".{fmt}".lower()
        for url in candidates:
            # Case-insensitive extension match; the URL is returned unmodified.
            if url.lower().endswith(suffix):
                return url, fmt
    return None, None
def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
    """
    Download and extract plain text from a GovInfo document URL.

    Uses ETag conditional GET: if a stored ETag exists it is sent as
    If-None-Match, and if the server answers 304 Not Modified,
    DocumentUnchangedError is raised so the caller can skip reprocessing.

    The new ETag is stored in Redis only after text has been extracted
    successfully. Storing it earlier would let a failed extraction poison the
    cache: every later fetch would get a 304 and the text would never be
    obtained. Redis access is best-effort; failures are logged and ignored.

    Args:
        url: Document URL to download.
        fmt: One of "htm", "html", "txt" or "pdf"; selects the extractor.

    Returns:
        The extracted text, or None if the download or extraction failed or
        the format is not supported.

    Raises:
        DocumentUnchangedError: the server returned HTTP 304.
    """
    headers = {}
    try:
        stored_etag = _etag_redis().get(_etag_key(url))
        if stored_etag:
            headers["If-None-Match"] = stored_etag
    except Exception as e:
        # Cache is optional: without it we simply do an unconditional GET.
        logger.warning("ETag lookup failed for %s: %s", url, e)

    try:
        response = requests.get(url, headers=headers, timeout=120)
        if response.status_code == 304:
            raise DocumentUnchangedError(f"Document unchanged (ETag match): {url}")
        response.raise_for_status()

        if fmt in ("htm", "html"):
            text = _extract_from_html(response.text)
        elif fmt == "txt":
            text = response.text
        elif fmt == "pdf":
            text = _extract_from_pdf(response.content)
        else:
            logger.warning("Unsupported text format %r for %s", fmt, url)
            text = None

        # Persist the ETag for future conditional requests, but only once we
        # actually hold usable text for this version of the document.
        etag = response.headers.get("ETag")
        if text and etag:
            try:
                _etag_redis().setex(_etag_key(url), _ETAG_CACHE_TTL, etag)
            except Exception as e:
                logger.warning("ETag store failed for %s: %s", url, e)
        return text
    except DocumentUnchangedError:
        # Not an error: propagate so the caller can skip this document.
        raise
    except Exception as e:
        logger.error(f"Failed to fetch text from {url}: {e}")
        return None
def _extract_from_html(html: str) -> str:
    """
    Reduce an HTML document to plain text.
    Drops script/style/nav/header/footer elements, joins the remaining text
    nodes with newlines, then collapses runs of 3+ newlines to a blank line
    and runs of 2+ spaces to a single space.
    """
    document = BeautifulSoup(html, "lxml")
    unwanted = document(["script", "style", "nav", "header", "footer"])
    for element in unwanted:
        element.decompose()
    raw = document.get_text(separator="\n")
    fewer_newlines = re.sub(r"\n{3,}", "\n\n", raw)
    return re.sub(r" {2,}", " ", fewer_newlines).strip()
def _extract_from_pdf(content: bytes) -> Optional[str]:
    """
    Run pdfminer's high-level text extraction over in-memory PDF bytes.
    Any failure (including pdfminer not being importable) is logged and
    reported as None instead of propagating.
    """
    try:
        import io
        from pdfminer import high_level

        stream = io.BytesIO(content)
        return high_level.extract_text(stream)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
    return None