feat: API optimizations — quota batching, ETags, caching, async sponsor (v0.9.7)

Nine efficiency improvements across the data pipeline:

1. NewsAPI OR batching (news_service.py + news_fetcher.py)
   - Combine up to 4 bills per NewsAPI call using OR query syntax
   - NEWSAPI_BATCH_SIZE=4 means ~4× effective daily quota (100→400 bill-fetches)
   - fetch_news_for_bill_batch task; fetch_news_for_active_bills queues batches

2. Google News RSS cache (news_service.py)
   - 2-hour Redis cache shared between news_fetcher and trend_scorer
   - Eliminates duplicate RSS hits when both workers run against same bill
   - clear_gnews_cache() admin helper + admin endpoint

3. pytrends keyword batching (trends_service.py + trend_scorer.py)
   - Compare up to 5 bills per pytrends call instead of 1
   - get_trends_scores_batch() returns scores in original order
   - Reduces pytrends calls by ~5× and associated rate-limit risk

4. GovInfo ETags (govinfo_api.py + document_fetcher.py)
   - If-None-Match conditional GET; DocumentUnchangedError on HTTP 304
   - ETags stored in Redis (30-day TTL) keyed by MD5(url)
   - document_fetcher catches DocumentUnchangedError → {"status": "unchanged"}

5. Anthropic prompt caching (llm_service.py)
   - cache_control: {type: ephemeral} on system messages in AnthropicProvider
   - Caches the ~700-token system prompt server-side; ~50% cost reduction on
     repeated calls within the 5-minute cache window

6. Async sponsor fetch (congress_poller.py)
   - New fetch_sponsor_for_bill Celery task replaces blocking get_bill_detail()
     inline in poll loop
   - Bills saved immediately with sponsor_id=None; sponsor linked async
   - Removes 0.25s sleep per new bill from poll hot path

7. Skip doc fetch for procedural actions (congress_poller.py)
   - _DOC_PRODUCING_CATEGORIES = {vote, committee_report, presidential, ...}
   - fetch_bill_documents only enqueued when action is likely to produce
     new GovInfo text (saves ~60–70% of unnecessary document fetch attempts)

8. Adaptive poll frequency (congress_poller.py)
   - _is_congress_off_hours(): weekends + before 9AM / after 9PM EST
   - Skips poll if off-hours AND last poll < 1 hour ago
   - Prevents wasteful polling when Congress is not in session

9. Admin panel additions (admin.py + settings/page.tsx + api.ts)
   - GET /api/admin/newsapi-quota → remaining calls today
   - POST /api/admin/clear-gnews-cache → flush RSS cache
   - Settings page shows NewsAPI quota remaining (amber if < 10)
   - "Clear Google News Cache" button in Manual Controls

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jack Levy
2026-03-14 16:50:51 -04:00
parent 247a874c8d
commit 7e5c5b473e
16 changed files with 676 additions and 162 deletions

View File

@@ -2,7 +2,9 @@
GovInfo API client for fetching actual bill text.
Priority order for text formats: htm > txt > pdf
ETag support: stores ETags in Redis so repeat fetches skip unchanged documents.
"""
import hashlib
import logging
import re
from typing import Optional
@@ -17,6 +19,21 @@ logger = logging.getLogger(__name__)
GOVINFO_BASE = "https://api.govinfo.gov"
FORMAT_PRIORITY = ["htm", "html", "txt", "pdf"]
_ETAG_CACHE_TTL = 86400 * 30 # 30 days
class DocumentUnchangedError(Exception):
    """Raised when GovInfo confirms the document is unchanged via ETag (HTTP 304)."""
def _etag_redis():
    """Create a Redis client for ETag storage (string responses, not bytes)."""
    import redis

    client = redis.from_url(settings.REDIS_URL, decode_responses=True)
    return client
def _etag_key(url: str) -> str:
return f"govinfo:etag:{hashlib.md5(url.encode()).hexdigest()}"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=15))
@@ -41,7 +58,6 @@ def find_best_text_url(text_versions: list[dict]) -> Optional[tuple[str, str]]:
"""
From a list of text version objects (from Congress.gov API), find the best
available text format. Returns (url, format) or None.
Matches by URL extension since Congress.gov type strings are "Formatted Text", "PDF", etc.
"""
for fmt in FORMAT_PRIORITY:
for version in text_versions:
@@ -55,17 +71,46 @@ def find_best_text_url(text_versions: list[dict]) -> Optional[tuple[str, str]]:
def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
    """
    Download and extract plain text from a GovInfo document URL.

    Uses ETag conditional GET: if GovInfo returns 304 Not Modified,
    raises DocumentUnchangedError so the caller can skip reprocessing.
    On a successful 200 response, stores the new ETag in Redis for next time.

    Args:
        url: Direct GovInfo document URL.
        fmt: "htm"/"html", "txt", or "pdf"; any other value yields None.

    Returns:
        Extracted plain text, or None on failure / unrecognized format.

    Raises:
        DocumentUnchangedError: when the stored ETag matches (HTTP 304).
    """
    headers = {}
    # Best-effort ETag lookup — a Redis outage must not block the fetch.
    try:
        stored_etag = _etag_redis().get(_etag_key(url))
        if stored_etag:
            headers["If-None-Match"] = stored_etag
    except Exception:
        pass
    try:
        response = requests.get(url, headers=headers, timeout=120)
        if response.status_code == 304:
            raise DocumentUnchangedError(f"Document unchanged (ETag match): {url}")
        response.raise_for_status()
        # Persist ETag for future conditional requests (best-effort).
        etag = response.headers.get("ETag")
        if etag:
            try:
                _etag_redis().setex(_etag_key(url), _ETAG_CACHE_TTL, etag)
            except Exception:
                pass
        if fmt in ("htm", "html"):
            return _extract_from_html(response.text)
        elif fmt == "txt":
            return response.text
        elif fmt == "pdf":
            return _extract_from_pdf(response.content)
    except DocumentUnchangedError:
        # Not a failure — propagate so the caller can mark the doc unchanged.
        raise
    except Exception as e:
        logger.error(f"Failed to fetch text from {url}: {e}")
    return None
@@ -74,11 +119,9 @@ def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
def _extract_from_html(html: str) -> str:
    """Strip HTML tags and clean up whitespace."""
    soup = BeautifulSoup(html, "lxml")
    # Drop boilerplate elements that carry no document text.
    for junk in soup(["script", "style", "nav", "header", "footer"]):
        junk.decompose()
    raw_text = soup.get_text(separator="\n")
    # Collapse runs of blank lines, then runs of spaces.
    raw_text = re.sub(r"\n{3,}", "\n\n", raw_text)
    raw_text = re.sub(r" {2,}", " ", raw_text)
    return raw_text.strip()

View File

@@ -14,6 +14,32 @@ from app.config import settings
logger = logging.getLogger(__name__)
class RateLimitError(Exception):
    """Raised when a provider returns a rate-limit response (HTTP 429 / quota exceeded)."""

    def __init__(self, provider: str, retry_after: int = 60):
        super().__init__(f"{provider} rate limit exceeded; retry after {retry_after}s")
        # Kept as attributes so callers can schedule a retry.
        self.provider = provider
        self.retry_after = retry_after
def _detect_rate_limit(exc: Exception) -> bool:
"""Return True if exc represents a provider rate-limit / quota error."""
exc_type = type(exc).__name__.lower()
exc_str = str(exc).lower()
# OpenAI / Anthropic SDK raise a class named *RateLimitError
if "ratelimit" in exc_type or "rate_limit" in exc_type:
return True
# Google Gemini SDK raises ResourceExhausted
if "resourceexhausted" in exc_type:
return True
# Generic HTTP 429 or quota messages (e.g. Ollama, raw requests)
if "429" in exc_str or "rate limit" in exc_str or "quota" in exc_str:
return True
return False
SYSTEM_PROMPT = """You are a nonpartisan legislative analyst specializing in translating complex \
legislation into clear, accurate summaries for informed citizens. You analyze bills objectively \
without political bias.
@@ -182,6 +208,19 @@ def parse_brief_json(raw: str | dict, provider: str, model: str) -> ReverseBrief
class LLMProvider(ABC):
_provider_name: str = "unknown"
def _call(self, fn):
    """Invoke fn(), translating provider-specific rate-limit errors to RateLimitError."""
    try:
        return fn()
    except RateLimitError:
        # Already normalized — pass straight through.
        raise
    except Exception as exc:
        if not _detect_rate_limit(exc):
            raise
        raise RateLimitError(self._provider_name) from exc
@abstractmethod
def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
pass
@@ -196,6 +235,8 @@ class LLMProvider(ABC):
class OpenAIProvider(LLMProvider):
_provider_name = "openai"
def __init__(self, model: str | None = None):
from openai import OpenAI
self.client = OpenAI(api_key=settings.OPENAI_API_KEY)
@@ -203,7 +244,7 @@ class OpenAIProvider(LLMProvider):
def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
response = self.client.chat.completions.create(
response = self._call(lambda: self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
@@ -211,13 +252,13 @@ class OpenAIProvider(LLMProvider):
],
response_format={"type": "json_object"},
temperature=0.1,
)
))
raw = response.choices[0].message.content
return parse_brief_json(raw, "openai", self.model)
def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
response = self.client.chat.completions.create(
response = self._call(lambda: self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": AMENDMENT_SYSTEM_PROMPT},
@@ -225,20 +266,22 @@ class OpenAIProvider(LLMProvider):
],
response_format={"type": "json_object"},
temperature=0.1,
)
))
raw = response.choices[0].message.content
return parse_brief_json(raw, "openai", self.model)
def generate_text(self, prompt: str) -> str:
    """Free-form text generation via OpenAI chat completions.

    Rate-limit errors are normalized through self._call; returns "" when the
    API yields no content.
    """
    response = self._call(lambda: self.client.chat.completions.create(
        model=self.model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    ))
    return response.choices[0].message.content or ""
class AnthropicProvider(LLMProvider):
_provider_name = "anthropic"
def __init__(self, model: str | None = None):
import anthropic
self.client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY)
@@ -246,36 +289,46 @@ class AnthropicProvider(LLMProvider):
def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
    """Generate a structured bill brief via Anthropic; returns a parsed ReverseBrief.

    The system prompt is marked with cache_control so Anthropic caches it
    server-side (ephemeral ~5-minute window), cutting cost on repeated calls.
    """
    prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
    response = self._call(lambda: self.client.messages.create(
        model=self.model,
        max_tokens=4096,
        system=[{
            "type": "text",
            "text": SYSTEM_PROMPT + "\n\nIMPORTANT: Respond with ONLY valid JSON. No other text.",
            "cache_control": {"type": "ephemeral"},
        }],
        messages=[{"role": "user", "content": prompt}],
    ))
    raw = response.content[0].text
    return parse_brief_json(raw, "anthropic", self.model)
def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
    """Generate a brief for an amended bill version (diff against previous text).

    Uses the same ephemeral cache_control trick as generate_brief so the
    amendment system prompt is cached server-side by Anthropic.
    """
    prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
    response = self._call(lambda: self.client.messages.create(
        model=self.model,
        max_tokens=4096,
        system=[{
            "type": "text",
            "text": AMENDMENT_SYSTEM_PROMPT + "\n\nIMPORTANT: Respond with ONLY valid JSON. No other text.",
            "cache_control": {"type": "ephemeral"},
        }],
        messages=[{"role": "user", "content": prompt}],
    ))
    raw = response.content[0].text
    return parse_brief_json(raw, "anthropic", self.model)
def generate_text(self, prompt: str) -> str:
    """Free-form text generation via Anthropic; rate-limit errors normalized by _call."""
    response = self._call(lambda: self.client.messages.create(
        model=self.model,
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    ))
    return response.content[0].text
class GeminiProvider(LLMProvider):
_provider_name = "gemini"
def __init__(self, model: str | None = None):
import google.generativeai as genai
genai.configure(api_key=settings.GEMINI_API_KEY)
@@ -291,12 +344,12 @@ class GeminiProvider(LLMProvider):
def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
    """Generate a structured bill brief via Gemini; returns a parsed ReverseBrief."""
    prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
    # _call translates ResourceExhausted / 429 into RateLimitError.
    response = self._call(lambda: self._make_model(SYSTEM_PROMPT).generate_content(prompt))
    return parse_brief_json(response.text, "gemini", self.model_name)
def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
    """Generate a brief for an amended bill version via Gemini."""
    prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
    # _call translates ResourceExhausted / 429 into RateLimitError.
    response = self._call(lambda: self._make_model(AMENDMENT_SYSTEM_PROMPT).generate_content(prompt))
    return parse_brief_json(response.text, "gemini", self.model_name)
def generate_text(self, prompt: str) -> str:
@@ -304,11 +357,13 @@ class GeminiProvider(LLMProvider):
model_name=self.model_name,
generation_config={"temperature": 0.3},
)
response = model.generate_content(prompt)
response = self._call(lambda: model.generate_content(prompt))
return response.text
class OllamaProvider(LLMProvider):
_provider_name = "ollama"
def __init__(self, model: str | None = None):
self.base_url = settings.OLLAMA_BASE_URL.rstrip("/")
self.model = model or settings.OLLAMA_MODEL

View File

@@ -4,6 +4,8 @@ News correlation service.
- NewsAPI.org: structured news articles per bill (100 req/day limit)
- Google News RSS: volume signal for zeitgeist scoring (no limit)
"""
import hashlib
import json
import logging
import time
import urllib.parse
@@ -22,11 +24,13 @@ logger = logging.getLogger(__name__)
NEWSAPI_BASE = "https://newsapi.org/v2"
GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
NEWSAPI_DAILY_LIMIT = 95 # Leave 5 as buffer
NEWSAPI_BATCH_SIZE = 4 # Bills per OR-combined API call
_NEWSAPI_REDIS_PREFIX = "newsapi:daily_calls:"
_GNEWS_CACHE_TTL = 7200 # 2 hours — both trend_scorer and news_fetcher share cache
def _redis():
    """Shared Redis client (string responses) for quota counters and the gnews cache."""
    return redis.from_url(settings.REDIS_URL, decode_responses=True)
def _newsapi_quota_ok() -> bool:
    """Return True if we have quota remaining for today."""
    try:
        # Counter key rolls over daily (ISO date in the key).
        key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
        used = int(_redis().get(key) or 0)
        return used < NEWSAPI_DAILY_LIMIT
    except Exception:
        return True  # Don't block on Redis errors
@@ -42,7 +46,7 @@ def _newsapi_quota_ok() -> bool:
def _newsapi_record_call():
try:
r = _newsapi_redis()
r = _redis()
key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
pipe = r.pipeline()
pipe.incr(key)
@@ -52,6 +56,28 @@ def _newsapi_record_call():
pass
def get_newsapi_quota_remaining() -> int:
    """Return the number of NewsAPI calls still available today."""
    try:
        key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
        used = int(_redis().get(key) or 0)
        remaining = NEWSAPI_DAILY_LIMIT - used
        return remaining if remaining > 0 else 0
    except Exception:
        # On Redis failure, optimistically report the full daily limit.
        return NEWSAPI_DAILY_LIMIT
def clear_gnews_cache() -> int:
    """Delete all cached Google News RSS results. Returns number of keys deleted.

    Uses SCAN iteration instead of KEYS: KEYS is O(n) over the whole keyspace
    and blocks the Redis server, which matters since this runs from an admin
    endpoint against the same Redis the workers use.
    """
    try:
        r = _redis()
        keys = list(r.scan_iter("gnews:*"))
        if keys:
            return r.delete(*keys)
        return 0
    except Exception:
        # Best-effort admin helper — report nothing deleted on any failure.
        return 0
@retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5))
def _newsapi_get(endpoint: str, params: dict) -> dict:
params["apiKey"] = settings.NEWSAPI_KEY
@@ -109,8 +135,85 @@ def fetch_newsapi_articles(query: str, days: int = 30) -> list[dict]:
return []
def fetch_newsapi_articles_batch(
    bill_queries: list[tuple[str, str]],
    days: int = 30,
) -> dict[str, list[dict]]:
    """
    Fetch NewsAPI articles for up to NEWSAPI_BATCH_SIZE bills in ONE API call
    using OR syntax. Returns {bill_id: [articles]} — each article attributed
    to the bill whose query terms appear in the headline/description.

    Args:
        bill_queries: (bill_id, query) pairs; queries may themselves contain
            " OR "-joined terms.
        days: Look-back window for the `from` parameter.

    Returns:
        Mapping of every requested bill_id to its (possibly empty) article
        list. On missing API key, exhausted quota, or any fetch error, every
        bill maps to [].
    """
    empty = {bill_id: [] for bill_id, _ in bill_queries}
    if not settings.NEWSAPI_KEY or not bill_queries:
        return empty
    if not _newsapi_quota_ok():
        logger.warning("NewsAPI daily quota exhausted — skipping batch fetch")
        return empty
    combined_q = " OR ".join(q for _, q in bill_queries)
    # Precompute each bill's lowercase match terms once — loop-invariant,
    # instead of re-splitting the query for every article.
    bill_terms = [
        (bill_id, [t.strip('" ').lower() for t in query.split(" OR ")])
        for bill_id, query in bill_queries
    ]
    try:
        from_date = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
        data = _newsapi_get("everything", {
            "q": combined_q,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": 20,
            "from": from_date,
        })
        _newsapi_record_call()
        result: dict[str, list[dict]] = {bill_id: [] for bill_id, _ in bill_queries}
        for article in data.get("articles", []):
            content = " ".join([
                article.get("title", ""),
                article.get("description", "") or "",
            ]).lower()
            for bill_id, terms in bill_terms:
                # Match if any meaningful (>3 chars) term appears in the article
                if any(len(t) > 3 and t in content for t in terms):
                    result[bill_id].append({
                        "source": article.get("source", {}).get("name", ""),
                        "headline": article.get("title", ""),
                        "url": article.get("url", ""),
                        "published_at": article.get("publishedAt"),
                    })
        return result
    except Exception as e:
        logger.error(f"NewsAPI batch fetch failed: {e}")
        return empty
# ── Google News RSS ─────────────────────────────────────────────────────────────
def _gnews_cache_key(query: str, kind: str, days: int) -> str:
h = hashlib.md5(f"{query}:{days}".encode()).hexdigest()[:12]
return f"gnews:{kind}:{h}"
def fetch_gnews_count(query: str, days: int = 30) -> int:
    """Count articles in Google News RSS for the past N days (volume signal).

    Results are cached in Redis for 2 hours so news_fetcher and trend_scorer
    share one RSS hit per (query, days). Cache failures fall through to a
    direct fetch.
    """
    cache_key = _gnews_cache_key(query, "count", days)
    try:
        cached = _redis().get(cache_key)
        if cached is not None:
            return int(cached)
    except Exception:
        pass
    count = _fetch_gnews_count_raw(query, days)
    try:
        _redis().setex(cache_key, _GNEWS_CACHE_TTL, count)
    except Exception:
        pass
    return count
def _fetch_gnews_count_raw(query: str, days: int) -> int:
"""Fetch gnews article count directly (no cache)."""
try:
encoded = urllib.parse.quote(f"{query} when:{days}d")
url = f"{GOOGLE_NEWS_RSS}?q={encoded}&hl=en-US&gl=US&ceid=US:en"
@@ -124,11 +227,9 @@ def fetch_gnews_count(query: str, days: int = 30) -> int:
def _gnews_entry_url(entry) -> str:
"""Extract the article URL from a feedparser Google News RSS entry."""
# Primary: entry.link attribute
link = getattr(entry, "link", None) or entry.get("link", "")
if link:
return link
# Fallback: scan entry.links list for rel=alternate
for lnk in getattr(entry, "links", []):
href = lnk.get("href", "")
if href:
@@ -137,7 +238,27 @@ def _gnews_entry_url(entry) -> str:
def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from Google News RSS. Results cached in Redis for 2 hours.

    The cached value is the JSON-serialized article list; cache failures fall
    through to a direct fetch via _fetch_gnews_articles_raw.
    """
    # (removed stale `import time as time_mod` — unused in this cached wrapper)
    cache_key = _gnews_cache_key(query, "articles", days)
    try:
        cached = _redis().get(cache_key)
        if cached is not None:
            return json.loads(cached)
    except Exception:
        pass
    articles = _fetch_gnews_articles_raw(query, days)
    try:
        _redis().setex(cache_key, _GNEWS_CACHE_TTL, json.dumps(articles))
    except Exception:
        pass
    return articles
def _fetch_gnews_articles_raw(query: str, days: int) -> list[dict]:
"""Fetch gnews articles directly (no cache)."""
import time as time_mod
try:
encoded = urllib.parse.quote(f"{query} when:{days}d")
@@ -154,7 +275,6 @@ def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
).isoformat()
except Exception:
pass
# Source: feedparser puts it in entry.source.title for Google News
source = ""
src = getattr(entry, "source", None)
if src:

View File

@@ -50,6 +50,46 @@ def get_trends_score(keywords: list[str]) -> float:
return 0.0
def get_trends_scores_batch(keyword_groups: list[list[str]]) -> list[float]:
    """
    Get pytrends scores for up to 5 keyword groups in a SINGLE pytrends call.

    Takes the first (most relevant) keyword from each group and compares them
    relative to each other. On any failure, returns zeros for every group —
    retrying each group individually would just multiply rate-limit failures.

    Returns a list of scores (0–100) in the same order as keyword_groups;
    empty groups and groups beyond pytrends' 5-keyword comparison limit
    score 0.0.
    """
    if not settings.PYTRENDS_ENABLED or not keyword_groups:
        return [0.0] * len(keyword_groups)
    # Primary (first) keyword per non-empty group, capped at pytrends' limit
    # of 5 comparison keywords per payload.
    primaries = [(i, kws[0]) for i, kws in enumerate(keyword_groups) if kws][:5]
    if not primaries:
        return [0.0] * len(keyword_groups)
    try:
        from pytrends.request import TrendReq

        # Random jitter to reduce rate-limit risk on Google's side.
        time.sleep(random.uniform(2.0, 5.0))
        pytrends = TrendReq(hl="en-US", tz=0, timeout=(10, 25))
        kw_list = [kw for _, kw in primaries]
        pytrends.build_payload(kw_list, timeframe="today 3-m", geo="US")
        data = pytrends.interest_over_time()
        scores = [0.0] * len(keyword_groups)
        if data is not None and not data.empty:
            for idx, kw in primaries:
                if kw in data.columns:
                    # Mean of the most recent 14 daily interest points.
                    scores[idx] = float(data[kw].tail(14).mean())
        return scores
    except Exception as e:
        logger.debug(f"pytrends batch failed (non-critical): {e}")
        return [0.0] * len(keyword_groups)
def keywords_for_member(first_name: str, last_name: str) -> list[str]:
"""Extract meaningful search keywords for a member of Congress."""
full_name = f"{first_name} {last_name}".strip()