feat: PocketVeto v1.0.0 — initial public release
Self-hosted US Congress monitoring platform with AI policy briefs, bill/member/topic follows, ntfy + RSS + email notifications, alignment scoring, collections, and draft-letter generator. Authored by: Jack Levy
This commit is contained in:
0
backend/app/services/__init__.py
Normal file
0
backend/app/services/__init__.py
Normal file
228
backend/app/services/congress_api.py
Normal file
228
backend/app/services/congress_api.py
Normal file
@@ -0,0 +1,228 @@
|
||||
"""
|
||||
Congress.gov API client.
|
||||
|
||||
Rate limit: 5,000 requests/hour (enforced server-side by Congress.gov).
|
||||
We track usage in Redis to stay well under the limit.
|
||||
"""
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.config import settings
|
||||
|
||||
BASE_URL = "https://api.congress.gov/v3"

# Maps the bill-type abbreviations used by the Congress.gov API ("hr",
# "sjres", ...) to the URL slugs used on public congress.gov bill pages
# (see build_bill_public_url).
_BILL_TYPE_SLUG = {
    "hr": "house-bill",
    "s": "senate-bill",
    "hjres": "house-joint-resolution",
    "sjres": "senate-joint-resolution",
    "hres": "house-resolution",
    "sres": "senate-resolution",
    "hconres": "house-concurrent-resolution",
    "sconres": "senate-concurrent-resolution",
}
|
||||
|
||||
|
||||
def _congress_ordinal(n: int) -> str:
|
||||
if 11 <= n % 100 <= 13:
|
||||
return f"{n}th"
|
||||
suffixes = {1: "st", 2: "nd", 3: "rd"}
|
||||
return f"{n}{suffixes.get(n % 10, 'th')}"
|
||||
|
||||
|
||||
def build_bill_public_url(congress: int, bill_type: str, bill_number: int) -> str:
    """Return the public congress.gov page URL for a bill (not the API endpoint)."""
    kind = bill_type.lower()
    # Unknown bill types fall back to the lowercased abbreviation itself.
    slug = _BILL_TYPE_SLUG.get(kind, kind)
    ordinal = _congress_ordinal(congress)
    return f"https://www.congress.gov/bill/{ordinal}-congress/{slug}/{bill_number}"
|
||||
|
||||
|
||||
def _get_current_congress() -> int:
|
||||
"""Calculate the current Congress number. 119th started Jan 3, 2025."""
|
||||
year = datetime.utcnow().year
|
||||
# Congress changes on odd years (Jan 3)
|
||||
if datetime.utcnow().month == 1 and datetime.utcnow().day < 3:
|
||||
year -= 1
|
||||
return 118 + ((year - 2023) // 2 + (1 if year % 2 == 1 else 0))
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def _get(endpoint: str, params: dict) -> dict:
    """GET a Congress.gov API endpoint and return the decoded JSON body.

    Args:
        endpoint: Path relative to BASE_URL, e.g. "/bill/119".
        params: Query parameters; never mutated (the API key and format
            are merged into a copy, so credentials don't leak back into
            the caller's dict).

    Raises:
        requests.HTTPError: On non-2xx responses (retried up to 3 times
            with exponential backoff by the @retry decorator).
    """
    query = {**params, "api_key": settings.DATA_GOV_API_KEY, "format": "json"}
    response = requests.get(f"{BASE_URL}{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def get_current_congress() -> int:
    """Public wrapper around _get_current_congress (current Congress number, e.g. 119)."""
    return _get_current_congress()
|
||||
|
||||
|
||||
def build_bill_id(congress: int, bill_type: str, bill_number: int) -> str:
    """Build our canonical bill identifier, e.g. '119-hr-25'."""
    return "-".join((str(congress), bill_type.lower(), str(bill_number)))
|
||||
|
||||
|
||||
def get_bills(
    congress: int,
    offset: int = 0,
    limit: int = 250,
    from_date_time: Optional[str] = None,
) -> dict:
    """List bills for a Congress, most recently updated first.

    Args:
        congress: Congress number, e.g. 119.
        offset: Pagination offset.
        limit: Page size.
        from_date_time: Optional timestamp filter; when given (and non-empty),
            only bills updated after this moment are returned.
    """
    query: dict = {
        "offset": offset,
        "limit": limit,
        "sort": "updateDate+desc",
    }
    if from_date_time:
        query["fromDateTime"] = from_date_time
    return _get(f"/bill/{congress}", query)
|
||||
|
||||
|
||||
def get_bill_detail(congress: int, bill_type: str, bill_number: int) -> dict:
    """Fetch the detail record for a single bill."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}"
    return _get(endpoint, {})
|
||||
|
||||
|
||||
def get_bill_actions(congress: int, bill_type: str, bill_number: int, offset: int = 0) -> dict:
    """Fetch one page (up to 250 rows) of a bill's action history."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}/actions"
    return _get(endpoint, {"offset": offset, "limit": 250})
|
||||
|
||||
|
||||
def get_bill_cosponsors(congress: int, bill_type: str, bill_number: int, offset: int = 0) -> dict:
    """Fetch one page (up to 250 rows) of a bill's cosponsor list."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}/cosponsors"
    return _get(endpoint, {"offset": offset, "limit": 250})
|
||||
|
||||
|
||||
def get_bill_text_versions(congress: int, bill_type: str, bill_number: int) -> dict:
    """Fetch the list of published text versions for a bill."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
    return _get(endpoint, {})
|
||||
|
||||
|
||||
def get_vote_detail(congress: int, chamber: str, session: int, roll_number: int) -> dict:
    """Fetch one roll-call vote.

    Any chamber value other than 'house' (case-insensitive) is treated as
    the Senate, matching the original behavior.
    """
    chamber_slug = "senate" if chamber.lower() != "house" else "house"
    return _get(f"/vote/{congress}/{chamber_slug}/{session}/{roll_number}", {})
|
||||
|
||||
|
||||
def get_members(offset: int = 0, limit: int = 250, current_member: bool = True) -> dict:
    """List members of Congress, optionally restricted to currently serving members."""
    query: dict = {"offset": offset, "limit": limit}
    if current_member:
        # The API expects the literal string "true" for this flag.
        query = {**query, "currentMember": "true"}
    return _get("/member", query)
|
||||
|
||||
|
||||
def get_member_detail(bioguide_id: str) -> dict:
    """Fetch the detail record for one member by Bioguide ID."""
    endpoint = f"/member/{bioguide_id}"
    return _get(endpoint, {})
|
||||
|
||||
|
||||
def get_committees(offset: int = 0, limit: int = 250) -> dict:
    """Fetch one page of the committee list."""
    query = {"offset": offset, "limit": limit}
    return _get("/committee", query)
|
||||
|
||||
|
||||
def parse_bill_from_api(data: dict, congress: int) -> dict:
    """Normalize raw API bill data into our model fields.

    Args:
        data: One bill object from the Congress.gov list/detail endpoint.
        congress: Congress number the bill belongs to.
    """
    bill_type = data.get("type", "").lower()
    bill_number = data.get("number", 0)
    # latestAction may be missing or an explicit null.
    latest_action = data.get("latestAction") or {}
    action_text = latest_action.get("text")
    # House bill types all start with "h" (hr, hjres, hres, hconres).
    chamber = "House" if bill_type.startswith("h") else "Senate"
    return {
        "bill_id": build_bill_id(congress, bill_type, bill_number),
        "congress_number": congress,
        "bill_type": bill_type,
        "bill_number": bill_number,
        "title": data.get("title"),
        "short_title": data.get("shortTitle"),
        "introduced_date": data.get("introducedDate"),
        "latest_action_date": latest_action.get("actionDate"),
        "latest_action_text": action_text,
        # Status is the latest action text truncated to 100 chars, or None.
        "status": action_text[:100] if action_text else None,
        "chamber": chamber,
        "congress_url": build_bill_public_url(congress, bill_type, bill_number),
    }
|
||||
|
||||
|
||||
# Full state/territory names as returned by the Congress.gov API, mapped to
# 2-letter USPS codes.  Used by _normalize_state; includes DC and the
# territories with congressional delegates.
_STATE_NAME_TO_CODE: dict[str, str] = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR", "Virgin Islands": "VI", "District of Columbia": "DC",
}
|
||||
|
||||
|
||||
def _normalize_state(state: str | None) -> str | None:
    """Normalize a state value to its 2-letter USPS code where possible.

    Unknown names fall back to the stripped input unchanged; None/empty
    input returns None.
    """
    if not state:
        return None
    cleaned = state.strip()
    # Already a 2-letter code (possibly lowercase): just uppercase it.
    if len(cleaned) == 2:
        return cleaned.upper()
    return _STATE_NAME_TO_CODE.get(cleaned, cleaned)
|
||||
|
||||
|
||||
def parse_member_from_api(data: dict) -> dict:
    """Normalize raw API member list data into our model fields.

    Args:
        data: One member object from the Congress.gov /member list endpoint.

    Returns:
        Dict of model fields; missing values become None.
    """
    terms = data.get("terms", {}).get("item", [])
    # Assumes the terms list is chronological, so the last entry is the
    # current term — TODO confirm against the API's ordering guarantee.
    current_term = terms[-1] if terms else {}
    district = data.get("district")
    return {
        "bioguide_id": data.get("bioguideId"),
        "name": data.get("name", ""),
        "first_name": data.get("firstName"),
        "last_name": data.get("lastName"),
        "party": data.get("partyName") or None,
        "state": _normalize_state(data.get("state")),
        "chamber": current_term.get("chamber"),
        # FIX: preserve district 0 (previously truthiness collapsed it to
        # None); only a genuinely missing district maps to None.
        "district": str(district) if district is not None else None,
        # FIX: guard with `or {}` so an explicit JSON null for depiction
        # doesn't crash (consistent with parse_member_detail_from_api).
        "photo_url": (data.get("depiction") or {}).get("imageUrl"),
        "official_url": data.get("officialWebsiteUrl"),
    }
|
||||
|
||||
|
||||
def parse_member_detail_from_api(data: dict) -> dict:
    """Normalize Congress.gov member detail response into enrichment fields.

    Accepts either the full response (with a top-level "member" key) or the
    member object itself.
    """
    member = data.get("member", data)
    address = member.get("addressInformation") or {}

    def _as_list(value) -> list:
        # Some responses wrap lists as {"item": [...]}; unwrap when needed.
        if isinstance(value, dict):
            return value.get("item", [])
        return value

    terms = _as_list(member.get("terms", []))
    leadership = _as_list(member.get("leadership") or [])

    first_name = member.get("firstName", "")
    last_name = member.get("lastName", "")
    bioguide_id = member.get("bioguideId", "")
    # congress.gov member pages use a "first-last" slug with spaces dashed
    # and apostrophes removed.
    slug = f"{first_name}-{last_name}".lower().replace(" ", "-").replace("'", "")

    birth_year = member.get("birthYear")
    congress_url = (
        f"https://www.congress.gov/member/{slug}/{bioguide_id}" if bioguide_id else None
    )

    term_fields = (
        "congress", "chamber", "partyName", "stateCode",
        "stateName", "startYear", "endYear", "district",
    )
    leadership_fields = ("type", "congress", "current")

    return {
        "birth_year": str(birth_year) if birth_year else None,
        "address": address.get("officeAddress"),
        "phone": address.get("phoneNumber"),
        "official_url": member.get("officialWebsiteUrl"),
        "photo_url": (member.get("depiction") or {}).get("imageUrl"),
        "congress_url": congress_url,
        "terms_json": [{key: term.get(key) for key in term_fields} for term in terms],
        "leadership_json": [
            {key: entry.get(key) for key in leadership_fields} for entry in leadership
        ],
        "sponsored_count": (member.get("sponsoredLegislation") or {}).get("count"),
        "cosponsored_count": (member.get("cosponsoredLegislation") or {}).get("count"),
    }
|
||||
138
backend/app/services/govinfo_api.py
Normal file
138
backend/app/services/govinfo_api.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
GovInfo API client for fetching actual bill text.
|
||||
|
||||
Priority order for text formats: htm > txt > pdf
|
||||
ETag support: stores ETags in Redis so repeat fetches skip unchanged documents.
|
||||
"""
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
GOVINFO_BASE = "https://api.govinfo.gov"
# Text formats in preference order (module docstring: htm > txt > pdf);
# find_best_text_url returns the first format any version offers.
FORMAT_PRIORITY = ["htm", "html", "txt", "pdf"]
# How long stored ETags stay valid in Redis.
_ETAG_CACHE_TTL = 86400 * 30  # 30 days
|
||||
|
||||
|
||||
class DocumentUnchangedError(Exception):
    """Raised when GovInfo confirms the document is unchanged via ETag (HTTP 304).

    fetch_text_from_url raises this so callers can skip reprocessing; it is
    a signal, not a failure.
    """
    pass
|
||||
|
||||
|
||||
def _etag_redis():
    """Create a Redis client (decoded str responses) for ETag storage.

    redis is imported lazily at call time; connection failures are tolerated
    by callers (ETag caching in fetch_text_from_url is best-effort).
    """
    import redis
    return redis.from_url(settings.REDIS_URL, decode_responses=True)
|
||||
|
||||
|
||||
def _etag_key(url: str) -> str:
|
||||
return f"govinfo:etag:{hashlib.md5(url.encode()).hexdigest()}"
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=15))
def _get(url: str, params: Optional[dict] = None) -> requests.Response:
    """GET a GovInfo URL with the API key attached.

    Args:
        url: Absolute URL (callers build full GovInfo endpoints).
        params: Extra query parameters; never mutated.  (Annotation fixed:
            the default is None, so the parameter is Optional[dict], not dict.)

    Raises:
        requests.HTTPError: On non-2xx responses (retried up to 3 times
            with exponential backoff via @retry).
    """
    query = {"api_key": settings.DATA_GOV_API_KEY, **(params or {})}
    response = requests.get(url, params=query, timeout=60)
    response.raise_for_status()
    return response
|
||||
|
||||
|
||||
def get_package_summary(package_id: str) -> dict:
    """Fetch the summary record for a GovInfo package."""
    return _get(f"{GOVINFO_BASE}/packages/{package_id}/summary").json()
|
||||
|
||||
|
||||
def get_package_content_detail(package_id: str) -> dict:
    """Fetch the content-detail record (available formats/links) for a GovInfo package."""
    return _get(f"{GOVINFO_BASE}/packages/{package_id}/content-detail").json()
|
||||
|
||||
|
||||
def find_best_text_url(text_versions: list[dict]) -> tuple[Optional[str], Optional[str]]:
    """Find the best available text format from Congress.gov text version objects.

    Scans all versions for each format in FORMAT_PRIORITY order (htm > html >
    txt > pdf), so a lower-priority format is only chosen when no version
    offers a better one.

    Returns:
        (url, format) for the best match, or (None, None) when nothing usable
        is found.  (Annotation/docstring fixed: the function always returns a
        2-tuple — never a bare None — so callers can unpack unconditionally.)
    """
    for fmt in FORMAT_PRIORITY:
        for version in text_versions:
            for fmt_info in version.get("formats", []):
                # Malformed entries (non-dict) are skipped defensively.
                if not isinstance(fmt_info, dict):
                    continue
                url = fmt_info.get("url", "")
                if url.lower().endswith(f".{fmt}"):
                    return url, fmt
    return None, None
|
||||
|
||||
|
||||
def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
    """
    Download and extract plain text from a GovInfo document URL.

    Uses ETag conditional GET: if GovInfo returns 304 Not Modified,
    raises DocumentUnchangedError so the caller can skip reprocessing.
    On a successful 200 response, stores the new ETag in Redis for next time.

    Args:
        url: Document URL to download.
        fmt: One of "htm"/"html", "txt", "pdf".  Any other value falls
            through every branch below and the function returns None.

    Returns:
        Extracted plain text, or None on any fetch/extraction failure
        (errors are logged, not raised).

    Raises:
        DocumentUnchangedError: When the server answers 304 (ETag match).
    """
    headers = {}
    # Best-effort: attach a stored ETag for a conditional request.  Redis
    # being down must never block a fetch, so failures are swallowed here.
    try:
        stored_etag = _etag_redis().get(_etag_key(url))
        if stored_etag:
            headers["If-None-Match"] = stored_etag
    except Exception:
        pass

    try:
        response = requests.get(url, headers=headers, timeout=120)

        # 304 means our cached ETag still matches — nothing to reprocess.
        if response.status_code == 304:
            raise DocumentUnchangedError(f"Document unchanged (ETag match): {url}")

        response.raise_for_status()

        # Persist ETag for future conditional requests
        etag = response.headers.get("ETag")
        if etag:
            try:
                _etag_redis().setex(_etag_key(url), _ETAG_CACHE_TTL, etag)
            except Exception:
                # Cache write failure is non-fatal; next fetch just won't
                # be conditional.
                pass

        if fmt in ("htm", "html"):
            return _extract_from_html(response.text)
        elif fmt == "txt":
            return response.text
        elif fmt == "pdf":
            return _extract_from_pdf(response.content)
        # Unrecognized fmt: implicit None.

    except DocumentUnchangedError:
        # Re-raise before the broad handler below can swallow it.
        raise
    except Exception as e:
        logger.error(f"Failed to fetch text from {url}: {e}")
        return None
|
||||
|
||||
|
||||
def _extract_from_html(html: str) -> str:
    """Strip HTML tags and collapse whitespace, returning plain text."""
    soup = BeautifulSoup(html, "lxml")
    # Remove non-content elements before extracting text.
    for junk in soup(["script", "style", "nav", "header", "footer"]):
        junk.decompose()
    raw_text = soup.get_text(separator="\n")
    # Collapse runs of 3+ newlines to a single blank line, and runs of
    # spaces to a single space.
    collapsed = re.sub(r"\n{3,}", "\n\n", raw_text)
    collapsed = re.sub(r" {2,}", " ", collapsed)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _extract_from_pdf(content: bytes) -> Optional[str]:
    """Extract text from PDF bytes using pdfminer; returns None on any failure."""
    try:
        from io import BytesIO
        from pdfminer.high_level import extract_text as pdf_extract
        buffer = BytesIO(content)
        return pdf_extract(buffer)
    except Exception as exc:
        # Import errors and parse errors are treated the same: log and skip.
        logger.error(f"PDF extraction failed: {exc}")
        return None
|
||||
523
backend/app/services/llm_service.py
Normal file
523
backend/app/services/llm_service.py
Normal file
@@ -0,0 +1,523 @@
|
||||
"""
|
||||
LLM provider abstraction.
|
||||
|
||||
All providers implement generate_brief(doc_text, bill_metadata) -> ReverseBrief.
|
||||
Select provider via LLM_PROVIDER env var.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RateLimitError(Exception):
    """Raised when a provider returns a rate-limit response (HTTP 429 / quota exceeded).

    Attributes:
        provider: Name of the provider that rate-limited us.
        retry_after: Suggested wait in seconds before retrying.
    """

    def __init__(self, provider: str, retry_after: int = 60):
        self.provider = provider
        self.retry_after = retry_after
        message = f"{provider} rate limit exceeded; retry after {retry_after}s"
        super().__init__(message)
|
||||
|
||||
|
||||
def _detect_rate_limit(exc: Exception) -> bool:
|
||||
"""Return True if exc represents a provider rate-limit / quota error."""
|
||||
exc_type = type(exc).__name__.lower()
|
||||
exc_str = str(exc).lower()
|
||||
# OpenAI / Anthropic SDK raise a class named *RateLimitError
|
||||
if "ratelimit" in exc_type or "rate_limit" in exc_type:
|
||||
return True
|
||||
# Google Gemini SDK raises ResourceExhausted
|
||||
if "resourceexhausted" in exc_type:
|
||||
return True
|
||||
# Generic HTTP 429 or quota messages (e.g. Ollama, raw requests)
|
||||
if "429" in exc_str or "rate limit" in exc_str or "quota" in exc_str:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """You are a nonpartisan legislative analyst specializing in translating complex \
|
||||
legislation into clear, accurate summaries for informed citizens. You analyze bills objectively \
|
||||
without political bias.
|
||||
|
||||
Always respond with valid JSON matching exactly this schema:
|
||||
{
|
||||
"summary": "2-4 paragraph plain-language summary of what this bill does",
|
||||
"key_points": [
|
||||
{"text": "specific concrete fact", "citation": "Section X(y)", "quote": "verbatim excerpt from bill ≤80 words", "label": "cited_fact"}
|
||||
],
|
||||
"risks": [
|
||||
{"text": "legitimate concern or challenge", "citation": "Section X(y)", "quote": "verbatim excerpt from bill ≤80 words", "label": "cited_fact"}
|
||||
],
|
||||
"deadlines": [{"date": "YYYY-MM-DD or null", "description": "what happens on this date"}],
|
||||
"topic_tags": ["healthcare", "taxation"]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- summary: Explain WHAT the bill does, not whether it is good or bad. Be factual and complete.
|
||||
- key_points: 5-10 specific, concrete things the bill changes, authorizes, or appropriates. \
|
||||
Each item MUST include "text" (your claim), "citation" (the section number, e.g. "Section 301(a)(2)"), \
|
||||
"quote" (a verbatim excerpt of ≤80 words from that section that supports your claim), and "label".
|
||||
- risks: Legitimate concerns from any perspective — costs, implementation challenges, \
|
||||
constitutional questions, unintended consequences. Include at least 2 even for benign bills. \
|
||||
Each item MUST include "text", "citation", "quote", and "label" just like key_points.
|
||||
- label: "cited_fact" if the claim is directly and explicitly stated in the quoted text. \
|
||||
"inference" if the claim is an analytical interpretation, projection, or implication that goes \
|
||||
beyond what the text literally says (e.g. projected costs, likely downstream effects, \
|
||||
constitutional questions). When in doubt, use "inference".
|
||||
- deadlines: Only include if explicitly stated in the text. Use null for date if a deadline \
|
||||
is mentioned without a specific date. Empty list if none.
|
||||
- topic_tags: 3-8 lowercase tags. Prefer these standard tags: healthcare, taxation, defense, \
|
||||
education, immigration, environment, housing, infrastructure, technology, agriculture, judiciary, \
|
||||
foreign-policy, veterans, social-security, trade, budget, energy, banking, transportation, \
|
||||
public-lands, labor, civil-rights, science.
|
||||
|
||||
Respond with ONLY valid JSON. No preamble, no explanation, no markdown code blocks."""
|
||||
|
||||
# Input-text token budgets passed to smart_truncate (not the model's output
# limit).  Ollama gets a smaller budget — presumably for smaller local-model
# context windows; confirm if the default local model changes.
MAX_TOKENS_DEFAULT = 6000
MAX_TOKENS_OLLAMA = 3000
TOKENS_PER_CHAR = 0.25  # rough approximation: 4 chars ≈ 1 token
|
||||
|
||||
|
||||
@dataclass
class ReverseBrief:
    """Structured AI brief for a bill, as parsed from a provider's JSON response.

    Produced by parse_brief_json; the field shapes mirror the JSON schema
    declared in SYSTEM_PROMPT / AMENDMENT_SYSTEM_PROMPT.
    """
    # Plain-language summary paragraphs.
    summary: str
    # Each entry: {"text", "citation", "quote", "label"}.
    key_points: list[dict]
    # Same entry shape as key_points.
    risks: list[dict]
    # Each entry: {"date": "YYYY-MM-DD" or None, "description": str}.
    deadlines: list[dict]
    # Lowercase topic tags, e.g. ["healthcare", "taxation"].
    topic_tags: list[str]
    # Provenance: which provider/model produced this brief.
    llm_provider: str
    llm_model: str
|
||||
|
||||
|
||||
def smart_truncate(text: str, max_tokens: int) -> str:
    """Truncate bill text intelligently if it exceeds the token budget.

    Keeps the head (~75% of the budget, where the purpose/preamble lives)
    and the tail (~25%, effective dates / enforcement), replacing the middle
    with an elision marker.
    """
    if len(text) * TOKENS_PER_CHAR <= max_tokens:
        return text

    head_len = int(max_tokens * 0.75 / TOKENS_PER_CHAR)
    tail_len = int(max_tokens * 0.25 / TOKENS_PER_CHAR)
    dropped = len(text) - head_len - tail_len
    marker = f"\n\n[... {dropped:,} characters omitted for length ...]\n\n"
    return text[:head_len] + marker + text[-tail_len:]
|
||||
|
||||
|
||||
AMENDMENT_SYSTEM_PROMPT = """You are a nonpartisan legislative analyst. A bill has been updated \
|
||||
and you must summarize what changed between the previous and new version.
|
||||
|
||||
Always respond with valid JSON matching exactly this schema:
|
||||
{
|
||||
"summary": "2-3 paragraph plain-language description of what changed in this version",
|
||||
"key_points": [
|
||||
{"text": "specific change", "citation": "Section X(y)", "quote": "verbatim excerpt from new version ≤80 words", "label": "cited_fact"}
|
||||
],
|
||||
"risks": [
|
||||
{"text": "new concern introduced by this change", "citation": "Section X(y)", "quote": "verbatim excerpt from new version ≤80 words", "label": "cited_fact"}
|
||||
],
|
||||
"deadlines": [{"date": "YYYY-MM-DD or null", "description": "new deadline added"}],
|
||||
"topic_tags": ["healthcare", "taxation"]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- summary: Focus ONLY on what is different from the previous version. Be specific.
|
||||
- key_points: List concrete additions, removals, or modifications in this version. \
|
||||
Each item MUST include "text" (your claim), "citation" (the section number, e.g. "Section 301(a)(2)"), \
|
||||
"quote" (a verbatim excerpt of ≤80 words from the NEW version that supports your claim), and "label".
|
||||
- risks: Only include risks that are new or changed relative to the previous version. \
|
||||
Each item MUST include "text", "citation", "quote", and "label" just like key_points.
|
||||
- label: "cited_fact" if the claim is directly and explicitly stated in the quoted text. \
|
||||
"inference" if the claim is an analytical interpretation, projection, or implication that goes \
|
||||
beyond what the text literally says. When in doubt, use "inference".
|
||||
- deadlines: Only new or changed deadlines. Empty list if none.
|
||||
- topic_tags: Same standard tags as before — include any new topics this version adds.
|
||||
|
||||
Respond with ONLY valid JSON. No preamble, no explanation, no markdown code blocks."""
|
||||
|
||||
|
||||
def build_amendment_prompt(new_text: str, previous_text: str, bill_metadata: dict, max_tokens: int) -> str:
    """Build the user prompt for an amendment (version-diff) brief.

    The token budget is split evenly between the previous and new versions;
    each half is truncated via smart_truncate.
    """
    per_version_budget = max_tokens // 2
    new_trunc = smart_truncate(new_text, per_version_budget)
    prev_trunc = smart_truncate(previous_text, per_version_budget)

    title = bill_metadata.get('title', 'Unknown')
    sponsor = bill_metadata.get('sponsor_name', 'Unknown')
    party = bill_metadata.get('party', '?')
    state = bill_metadata.get('state', '?')
    action = bill_metadata.get('latest_action_text', 'None')
    action_date = bill_metadata.get('latest_action_date', 'Unknown')

    return f"""A bill has been updated. Summarize what changed between the previous and new version.

BILL METADATA:
- Title: {title}
- Sponsor: {sponsor} ({party}-{state})
- Latest Action: {action} ({action_date})

PREVIOUS VERSION:
{prev_trunc}

NEW VERSION:
{new_trunc}

Produce the JSON amendment summary now:"""
|
||||
|
||||
|
||||
def build_prompt(doc_text: str, bill_metadata: dict, max_tokens: int) -> str:
    """Build the user prompt for a full-bill brief (text truncated to budget)."""
    truncated = smart_truncate(doc_text, max_tokens)

    title = bill_metadata.get('title', 'Unknown')
    sponsor = bill_metadata.get('sponsor_name', 'Unknown')
    party = bill_metadata.get('party', '?')
    state = bill_metadata.get('state', '?')
    introduced = bill_metadata.get('introduced_date', 'Unknown')
    chamber = bill_metadata.get('chamber', 'Unknown')
    action = bill_metadata.get('latest_action_text', 'None')
    action_date = bill_metadata.get('latest_action_date', 'Unknown')

    return f"""Analyze this legislation and produce a structured brief.

BILL METADATA:
- Title: {title}
- Sponsor: {sponsor} ({party}-{state})
- Introduced: {introduced}
- Chamber: {chamber}
- Latest Action: {action} ({action_date})

BILL TEXT:
{truncated}

Produce the JSON brief now:"""
|
||||
|
||||
|
||||
def parse_brief_json(raw: str | dict, provider: str, model: str) -> ReverseBrief:
    """Parse and validate LLM JSON response into a ReverseBrief.

    Accepts either a raw JSON string (possibly wrapped in markdown code
    fences) or an already-decoded dict.

    Raises:
        json.JSONDecodeError: If a string payload is not valid JSON.
    """
    if isinstance(raw, str):
        # Strip markdown code fences if present.
        cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
        cleaned = re.sub(r"\s*```$", "", cleaned.strip())
        data = json.loads(cleaned)
    else:
        data = raw

    return ReverseBrief(
        summary=str(data.get("summary", "")),
        key_points=list(data.get("key_points", [])),
        risks=list(data.get("risks", [])),
        deadlines=list(data.get("deadlines", [])),
        topic_tags=list(data.get("topic_tags", [])),
        llm_provider=provider,
        llm_model=model,
    )
|
||||
|
||||
|
||||
class LLMProvider(ABC):
    """Abstract base for LLM providers.

    Subclasses set _provider_name and implement the three generate_* methods;
    provider SDK calls should go through _call for uniform rate-limit handling.
    """

    _provider_name: str = "unknown"

    def _call(self, fn):
        """Invoke fn(), translating provider-specific rate-limit errors to RateLimitError."""
        try:
            return fn()
        except RateLimitError:
            # Already our type — propagate untouched.
            raise
        except Exception as exc:
            if not _detect_rate_limit(exc):
                raise
            raise RateLimitError(self._provider_name) from exc

    @abstractmethod
    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Produce a full ReverseBrief from bill text + metadata."""

    @abstractmethod
    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Produce a ReverseBrief describing changes between two bill versions."""

    @abstractmethod
    def generate_text(self, prompt: str) -> str:
        """Free-form text generation for a single prompt."""
|
||||
|
||||
|
||||
class OpenAIProvider(LLMProvider):
    """LLM provider backed by the OpenAI Chat Completions API."""

    _provider_name = "openai"

    def __init__(self, model: str | None = None):
        # Lazy SDK import so the package is only required when selected.
        from openai import OpenAI
        self.client = OpenAI(api_key=settings.OPENAI_API_KEY)
        self.model = model or settings.OPENAI_MODEL

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a full bill brief (JSON mode enforces a valid JSON payload)."""
        prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        ))
        raw = response.choices[0].message.content
        return parse_brief_json(raw, "openai", self.model)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a version-diff brief; same call shape, amendment prompts."""
        prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": AMENDMENT_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        ))
        raw = response.choices[0].message.content
        return parse_brief_json(raw, "openai", self.model)

    def generate_text(self, prompt: str) -> str:
        """Free-form generation (no system prompt, no JSON mode); empty string if no content."""
        response = self._call(lambda: self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
        ))
        return response.choices[0].message.content or ""
|
||||
|
||||
|
||||
class AnthropicProvider(LLMProvider):
    """LLM provider backed by the Anthropic Messages API."""

    _provider_name = "anthropic"

    def __init__(self, model: str | None = None):
        # Lazy SDK import so the package is only required when selected.
        import anthropic
        self.client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY)
        self.model = model or settings.ANTHROPIC_MODEL

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a full bill brief; the system block is marked for prompt caching."""
        prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=[{
                "type": "text",
                # Extra JSON-only reminder: Anthropic has no JSON response mode.
                "text": SYSTEM_PROMPT + "\n\nIMPORTANT: Respond with ONLY valid JSON. No other text.",
                "cache_control": {"type": "ephemeral"},
            }],
            messages=[{"role": "user", "content": prompt}],
        ))
        raw = response.content[0].text
        return parse_brief_json(raw, "anthropic", self.model)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a version-diff brief; same call shape, amendment prompts."""
        prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=[{
                "type": "text",
                "text": AMENDMENT_SYSTEM_PROMPT + "\n\nIMPORTANT: Respond with ONLY valid JSON. No other text.",
                "cache_control": {"type": "ephemeral"},
            }],
            messages=[{"role": "user", "content": prompt}],
        ))
        raw = response.content[0].text
        return parse_brief_json(raw, "anthropic", self.model)

    def generate_text(self, prompt: str) -> str:
        """Free-form generation with a smaller output cap (1024 tokens)."""
        response = self._call(lambda: self.client.messages.create(
            model=self.model,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        ))
        return response.content[0].text
|
||||
|
||||
|
||||
class GeminiProvider(LLMProvider):
    """LLM provider backed by the Google Gemini (google-generativeai) SDK."""

    _provider_name = "gemini"

    def __init__(self, model: str | None = None):
        # Lazy SDK import so the package is only required when selected.
        import google.generativeai as genai
        genai.configure(api_key=settings.GEMINI_API_KEY)
        self._genai = genai
        self.model_name = model or settings.GEMINI_MODEL

    def _make_model(self, system_prompt: str):
        """Build a GenerativeModel configured for JSON output with the given system prompt."""
        return self._genai.GenerativeModel(
            model_name=self.model_name,
            generation_config={"response_mime_type": "application/json", "temperature": 0.1},
            system_instruction=system_prompt,
        )

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a full bill brief via a JSON-configured model."""
        prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self._make_model(SYSTEM_PROMPT).generate_content(prompt))
        return parse_brief_json(response.text, "gemini", self.model_name)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a version-diff brief via a JSON-configured model."""
        prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        response = self._call(lambda: self._make_model(AMENDMENT_SYSTEM_PROMPT).generate_content(prompt))
        return parse_brief_json(response.text, "gemini", self.model_name)

    def generate_text(self, prompt: str) -> str:
        """Free-form generation: no system instruction, no JSON mime type."""
        model = self._genai.GenerativeModel(
            model_name=self.model_name,
            generation_config={"temperature": 0.3},
        )
        response = self._call(lambda: model.generate_content(prompt))
        return response.text
|
||||
|
||||
|
||||
class OllamaProvider(LLMProvider):
    """LLM provider backed by a local Ollama server's /api/generate endpoint."""

    _provider_name = "ollama"

    def __init__(self, model: str | None = None):
        self.base_url = settings.OLLAMA_BASE_URL.rstrip("/")
        self.model = model or settings.OLLAMA_MODEL

    def _generate(self, system_prompt: str, user_prompt: str) -> str:
        """Run one JSON-format generation, retrying once with a stricter prompt on invalid JSON.

        Ollama's generate endpoint has no separate system role here, so the
        system prompt is simply prepended to the user prompt.
        """
        import requests as req
        full_prompt = f"{system_prompt}\n\n{user_prompt}"
        response = req.post(
            f"{self.base_url}/api/generate",
            json={"model": self.model, "prompt": full_prompt, "stream": False, "format": "json"},
            timeout=300,
        )
        response.raise_for_status()
        raw = response.json().get("response", "")
        # BUG FIX: the original `try: return raw / except Exception:` could
        # never raise, so the strict retry below was unreachable dead code.
        # Validate the payload so malformed JSON actually triggers the retry.
        try:
            json.loads(raw)
            return raw
        except json.JSONDecodeError:
            strict = f"{full_prompt}\n\nCRITICAL: Your response MUST be valid JSON only."
            r2 = req.post(
                f"{self.base_url}/api/generate",
                json={"model": self.model, "prompt": strict, "stream": False, "format": "json"},
                timeout=300,
            )
            r2.raise_for_status()
            return r2.json().get("response", "")

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a full bill brief; retries once with a stricter instruction on parse failure."""
        prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_OLLAMA)
        raw = self._generate(SYSTEM_PROMPT, prompt)
        try:
            return parse_brief_json(raw, "ollama", self.model)
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Ollama JSON parse failed, retrying: {e}")
            raw2 = self._generate(
                SYSTEM_PROMPT,
                prompt + "\n\nCRITICAL: Your response MUST be valid JSON only. No text before or after the JSON object."
            )
            return parse_brief_json(raw2, "ollama", self.model)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Generate a version-diff brief; same retry-on-parse-failure strategy."""
        prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_OLLAMA)
        raw = self._generate(AMENDMENT_SYSTEM_PROMPT, prompt)
        try:
            return parse_brief_json(raw, "ollama", self.model)
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Ollama amendment JSON parse failed, retrying: {e}")
            raw2 = self._generate(
                AMENDMENT_SYSTEM_PROMPT,
                prompt + "\n\nCRITICAL: Your response MUST be valid JSON only. No text before or after the JSON object."
            )
            return parse_brief_json(raw2, "ollama", self.model)

    def generate_text(self, prompt: str) -> str:
        """Free-form generation (no JSON format constraint, shorter timeout)."""
        import requests as req
        response = req.post(
            f"{self.base_url}/api/generate",
            json={"model": self.model, "prompt": prompt, "stream": False},
            timeout=120,
        )
        response.raise_for_status()
        return response.json().get("response", "")
|
||||
|
||||
|
||||
def get_llm_provider(provider: str | None = None, model: str | None = None) -> LLMProvider:
|
||||
"""Factory — returns the configured LLM provider.
|
||||
|
||||
Pass ``provider`` and/or ``model`` explicitly (e.g. from DB overrides) to bypass env defaults.
|
||||
"""
|
||||
if provider is None:
|
||||
provider = settings.LLM_PROVIDER
|
||||
provider = provider.lower()
|
||||
if provider == "openai":
|
||||
return OpenAIProvider(model=model)
|
||||
elif provider == "anthropic":
|
||||
return AnthropicProvider(model=model)
|
||||
elif provider == "gemini":
|
||||
return GeminiProvider(model=model)
|
||||
elif provider == "ollama":
|
||||
return OllamaProvider(model=model)
|
||||
raise ValueError(f"Unknown LLM_PROVIDER: '{provider}'. Must be one of: openai, anthropic, gemini, ollama")
|
||||
|
||||
|
||||
# Maps lowercase Congress.gov bill-type codes to their conventional printed
# citation labels (e.g. "hr" -> "H.R.", used as "H.R. 1234").
_BILL_TYPE_LABELS: dict[str, str] = {
    "hr": "H.R.",
    "s": "S.",
    "hjres": "H.J.Res.",
    "sjres": "S.J.Res.",
    "hconres": "H.Con.Res.",
    "sconres": "S.Con.Res.",
    "hres": "H.Res.",
    "sres": "S.Res.",
}
|
||||
|
||||
# Prompt fragments injected into the draft-letter RULES section, one per
# supported tone; generate_draft_letter falls back to "polite" for any
# unrecognized tone value.
_TONE_INSTRUCTIONS: dict[str, str] = {
    "short": "Keep the letter brief — 6 to 8 sentences total.",
    "polite": "Use a respectful, formal, and courteous tone throughout the letter.",
    "firm": "Use a direct, firm tone that makes clear the constituent's strong conviction.",
}
|
||||
|
||||
|
||||
def generate_draft_letter(
    bill_label: str,
    bill_title: str,
    stance: str,
    recipient: str,
    tone: str,
    selected_points: list[str],
    include_citations: bool,
    zip_code: str | None,
    rep_name: str | None = None,
    llm_provider: str | None = None,
    llm_model: str | None = None,
) -> str:
    """Generate a plain-text constituent letter draft using the configured LLM provider.

    Assembles a tightly-constrained prompt (stance, tone, salutation, and the
    caller-selected talking points) and sends it through ``generate_text`` on
    the provider returned by get_llm_provider.
    """
    vote = "YES" if stance == "yes" else "NO"
    chamber = "House" if recipient == "house" else "Senate"
    # Unknown tones silently fall back to the polite instruction.
    tone_rule = _TONE_INSTRUCTIONS.get(tone, _TONE_INSTRUCTIONS["polite"])

    bullet_points = "\n".join(f"- {point}" for point in selected_points)

    if include_citations:
        citation_rule = "You may reference the citation label for each point (e.g. 'as noted in Section 3') if it adds clarity."
    else:
        citation_rule = "Do not include any citation references."

    location_line = f"The constituent is writing from ZIP code {zip_code}." if zip_code else ""

    if rep_name:
        honorific = "Senator" if recipient == "senate" else "Representative"
        salutation_rule = f'- Open with "Dear {honorific} {rep_name},"'
    else:
        salutation_rule = f'- Open with "Dear {chamber} Member,"'

    prompt = f"""Write a short constituent letter to a {chamber} member of Congress.

RULES:
- {tone_rule}
- 6 to 12 sentences total.
- {salutation_rule}
- Second sentence must be a clear, direct ask: "Please vote {vote} on {bill_label}."
- The body must reference ONLY the points listed below — do not invent any other claims or facts.
- {citation_rule}
- Close with a brief sign-off and the placeholder "[Your Name]".
- Plain text only. No markdown, no bullet points, no headers, no partisan framing.
- Do not mention any political party.

BILL: {bill_label} — {bill_title}
STANCE: Vote {vote}
{location_line}

SELECTED POINTS TO REFERENCE:
{bullet_points}

Write the letter now:"""

    llm = get_llm_provider(provider=llm_provider, model=llm_model)
    return llm.generate_text(prompt)
|
||||
308
backend/app/services/news_service.py
Normal file
308
backend/app/services/news_service.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""
|
||||
News correlation service.
|
||||
|
||||
- NewsAPI.org: structured news articles per bill (100 req/day limit)
|
||||
- Google News RSS: volume signal for zeitgeist scoring (no limit)
|
||||
"""
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import urllib.parse
|
||||
from datetime import date, datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
import feedparser
|
||||
import redis
|
||||
import requests
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
NEWSAPI_BASE = "https://newsapi.org/v2"
|
||||
GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
|
||||
NEWSAPI_DAILY_LIMIT = 95 # Leave 5 as buffer
|
||||
NEWSAPI_BATCH_SIZE = 4 # Bills per OR-combined API call
|
||||
|
||||
_NEWSAPI_REDIS_PREFIX = "newsapi:daily_calls:"
|
||||
_GNEWS_CACHE_TTL = 7200 # 2 hours — both trend_scorer and news_fetcher share cache
|
||||
|
||||
|
||||
def _redis():
    """Return a fresh Redis client for the configured REDIS_URL.

    ``decode_responses=True`` so values come back as ``str`` rather than bytes.
    """
    client = redis.from_url(settings.REDIS_URL, decode_responses=True)
    return client
|
||||
|
||||
|
||||
def _newsapi_quota_ok() -> bool:
    """Return True if we have quota remaining for today."""
    today_key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
    try:
        calls_used = int(_redis().get(today_key) or 0)
    except Exception:
        # A Redis outage must never block news fetching — assume quota is fine.
        return True
    return calls_used < NEWSAPI_DAILY_LIMIT
|
||||
|
||||
|
||||
def _newsapi_record_call():
    """Increment today's NewsAPI call counter (best-effort; Redis errors ignored)."""
    today_key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
    try:
        pipeline = _redis().pipeline()
        pipeline.incr(today_key)
        # 25-hour TTL (90000 s) so the counter reliably expires after midnight.
        pipeline.expire(today_key, 90000)
        pipeline.execute()
    except Exception:
        pass
|
||||
|
||||
|
||||
def get_newsapi_quota_remaining() -> int:
    """Return the number of NewsAPI calls still available today."""
    today_key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
    try:
        calls_used = int(_redis().get(today_key) or 0)
    except Exception:
        # If the counter can't be read, report a full quota.
        return NEWSAPI_DAILY_LIMIT
    return max(0, NEWSAPI_DAILY_LIMIT - calls_used)
|
||||
|
||||
|
||||
def clear_gnews_cache() -> int:
    """Delete all cached Google News RSS results. Returns number of keys deleted.

    Uses SCAN (``scan_iter``) instead of ``KEYS`` so Redis is not blocked
    while the keyspace is walked; KEYS is O(total keys) and the Redis docs
    warn against it outside of debugging. Returns 0 on any Redis error.
    """
    try:
        r = _redis()
        keys = list(r.scan_iter(match="gnews:*"))
        if keys:
            return r.delete(*keys)
        return 0
    except Exception:
        return 0
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5))
def _newsapi_get(endpoint: str, params: dict) -> dict:
    """GET a NewsAPI endpoint and return the decoded JSON body.

    Retries once via tenacity on any exception; raises ``requests.HTTPError``
    on non-2xx responses.
    """
    # BUG FIX: copy before injecting the API key — the original wrote
    # ``params["apiKey"] = ...`` into the caller's dict, a surprising
    # side effect (and it leaked the key back to every caller).
    query = {**params, "apiKey": settings.NEWSAPI_KEY}
    response = requests.get(f"{NEWSAPI_BASE}/{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def build_news_query(bill_title: str, short_title: Optional[str], sponsor_name: Optional[str],
                     bill_type: str, bill_number: int) -> str:
    """Build a NewsAPI search query for a bill.

    Prefers the short title as an exact phrase; otherwise uses the first six
    words of the full title (only when that yields at least three words).
    A quoted "TYPE NUMBER" term is always appended, and at most two terms
    are OR-joined to keep queries short and relevant.
    """
    phrases: list[str] = []
    if short_title:
        phrases.append(f'"{short_title}"')
    elif bill_title:
        leading_words = bill_title.split()[:6]
        if len(leading_words) >= 3:
            phrases.append(f'"{" ".join(leading_words)}"')
    # Bill-number phrase acts as the fallback / secondary term.
    phrases.append(f'"{bill_type.upper()} {bill_number}"')
    return " OR ".join(phrases[:2])
|
||||
|
||||
|
||||
def fetch_newsapi_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from NewsAPI.org. Returns empty list if quota is exhausted or key not set."""
    if not settings.NEWSAPI_KEY:
        return []
    if not _newsapi_quota_ok():
        logger.warning("NewsAPI daily quota exhausted — skipping fetch")
        return []
    try:
        since = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
        payload = _newsapi_get(
            "everything",
            {
                "q": query,
                "language": "en",
                "sortBy": "relevancy",
                "pageSize": 10,
                "from": since,
            },
        )
        _newsapi_record_call()
        # Normalize to our internal article shape, dropping entries that
        # lack a URL or title.
        results = []
        for item in payload.get("articles", []):
            if not (item.get("url") and item.get("title")):
                continue
            results.append({
                "source": item.get("source", {}).get("name", ""),
                "headline": item.get("title", ""),
                "url": item.get("url", ""),
                "published_at": item.get("publishedAt"),
            })
        return results
    except Exception as e:
        logger.error(f"NewsAPI fetch failed: {e}")
        return []
|
||||
|
||||
|
||||
def fetch_newsapi_articles_batch(
    bill_queries: list[tuple[str, str]],
    days: int = 30,
) -> dict[str, list[dict]]:
    """
    Fetch NewsAPI articles for up to NEWSAPI_BATCH_SIZE bills in ONE API call
    using OR syntax. Returns {bill_id: [articles]} — each article attributed
    to the bill whose query terms appear in the headline/description.

    Returns empty lists for every bill when the API key is unset, the daily
    quota is exhausted, or the request fails.
    """
    empty = {bill_id: [] for bill_id, _ in bill_queries}
    if not settings.NEWSAPI_KEY or not bill_queries:
        return empty
    if not _newsapi_quota_ok():
        logger.warning("NewsAPI daily quota exhausted — skipping batch fetch")
        return empty

    # PERF FIX: pre-parse each bill's match terms ONCE. The original re-split
    # and re-lowercased every bill's query inside the per-article loop
    # (redundant O(bills × articles) work). Terms of <= 3 chars are dropped
    # up front — the matcher never allowed them anyway.
    bill_terms: list[tuple[str, list[str]]] = [
        (bill_id, [t for t in (p.strip('" ').lower() for p in query.split(" OR ")) if len(t) > 3])
        for bill_id, query in bill_queries
    ]

    combined_q = " OR ".join(q for _, q in bill_queries)
    try:
        from_date = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
        data = _newsapi_get("everything", {
            "q": combined_q,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": 20,
            "from": from_date,
        })
        _newsapi_record_call()
        articles = data.get("articles", [])

        result: dict[str, list[dict]] = {bill_id: [] for bill_id, _ in bill_queries}
        for article in articles:
            content = " ".join([
                article.get("title", ""),
                article.get("description", "") or "",
            ]).lower()
            for bill_id, terms in bill_terms:
                # Attribute the article to any bill whose query terms appear.
                if any(t in content for t in terms):
                    result[bill_id].append({
                        "source": article.get("source", {}).get("name", ""),
                        "headline": article.get("title", ""),
                        "url": article.get("url", ""),
                        "published_at": article.get("publishedAt"),
                    })
        return result
    except Exception as e:
        logger.error(f"NewsAPI batch fetch failed: {e}")
        return empty
|
||||
|
||||
|
||||
# ── Google News RSS ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _gnews_cache_key(query: str, kind: str, days: int) -> str:
    """Return a short, deterministic Redis key for a Google News lookup.

    ``kind`` namespaces the cached value type ("count" vs "articles"); query
    and day-window are folded into a 12-hex-char MD5 digest (non-cryptographic
    use — just a compact cache key).
    """
    digest = hashlib.md5(f"{query}:{days}".encode()).hexdigest()
    return f"gnews:{kind}:{digest[:12]}"
|
||||
|
||||
|
||||
def fetch_gnews_count(query: str, days: int = 30) -> int:
    """Count articles in Google News RSS. Results cached in Redis for 2 hours."""
    cache_key = _gnews_cache_key(query, "count", days)
    try:
        hit = _redis().get(cache_key)
        if hit is not None:
            return int(hit)
    except Exception:
        # Cache-read failure degrades to a direct fetch.
        pass

    count = _fetch_gnews_count_raw(query, days)

    try:
        _redis().setex(cache_key, _GNEWS_CACHE_TTL, count)
    except Exception:
        # Cache-write failure is non-fatal — just recompute next time.
        pass
    return count
|
||||
|
||||
|
||||
def _fetch_gnews_count_raw(query: str, days: int) -> int:
    """Fetch gnews article count directly (no cache)."""
    try:
        search = urllib.parse.quote(f"{query} when:{days}d")
        feed_url = f"{GOOGLE_NEWS_RSS}?q={search}&hl=en-US&gl=US&ceid=US:en"
        time.sleep(1)  # Polite delay so we don't hammer the endpoint
        parsed = feedparser.parse(feed_url)
        return len(parsed.entries)
    except Exception as e:
        logger.error(f"Google News RSS fetch failed: {e}")
        return 0
|
||||
|
||||
|
||||
def _gnews_entry_url(entry) -> str:
    """Extract the article URL from a feedparser Google News RSS entry."""
    # Entries usually expose .link directly (attribute or mapping key).
    direct = getattr(entry, "link", None) or entry.get("link", "")
    if direct:
        return direct
    # Otherwise fall back to the first non-empty href in the links list.
    fallback_hrefs = (lnk.get("href", "") for lnk in getattr(entry, "links", []))
    return next((href for href in fallback_hrefs if href), "")
|
||||
|
||||
|
||||
def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from Google News RSS. Results cached in Redis for 2 hours.

    Cache read/write failures are swallowed — a Redis outage degrades to a
    direct (uncached) fetch rather than an error.
    """
    # Removed the original's unused local ``import time as time_mod``;
    # nothing in this function referenced it.
    cache_key = _gnews_cache_key(query, "articles", days)
    try:
        cached = _redis().get(cache_key)
        if cached is not None:
            return json.loads(cached)
    except Exception:
        pass

    articles = _fetch_gnews_articles_raw(query, days)

    try:
        _redis().setex(cache_key, _GNEWS_CACHE_TTL, json.dumps(articles))
    except Exception:
        pass
    return articles
|
||||
|
||||
|
||||
def _fetch_gnews_articles_raw(query: str, days: int) -> list[dict]:
    """Fetch gnews articles directly (no cache).

    Returns up to 20 normalized article dicts (source / headline / url /
    published_at ISO-8601 UTC timestamp), or [] on any failure.
    """
    import calendar

    try:
        encoded = urllib.parse.quote(f"{query} when:{days}d")
        url = f"{GOOGLE_NEWS_RSS}?q={encoded}&hl=en-US&gl=US&ceid=US:en"
        time.sleep(1)  # Polite delay
        feed = feedparser.parse(url)
        articles = []
        for entry in feed.entries[:20]:
            pub_at = None
            if getattr(entry, "published_parsed", None):
                try:
                    # BUG FIX: feedparser documents *_parsed structs as UTC,
                    # but the original used time.mktime, which interprets the
                    # struct as LOCAL time — skewing timestamps by the host's
                    # UTC offset. calendar.timegm is the UTC inverse.
                    pub_at = datetime.fromtimestamp(
                        calendar.timegm(entry.published_parsed), tz=timezone.utc
                    ).isoformat()
                except Exception:
                    pass
            source = ""
            src = getattr(entry, "source", None)
            if src:
                source = getattr(src, "title", "") or src.get("title", "")
            headline = entry.get("title", "") or getattr(entry, "title", "")
            article_url = _gnews_entry_url(entry)
            if article_url and headline:
                articles.append({
                    "source": source or "Google News",
                    "headline": headline,
                    "url": article_url,
                    "published_at": pub_at,
                })
        return articles
    except Exception as e:
        logger.error(f"Google News RSS article fetch failed: {e}")
        return []
|
||||
|
||||
|
||||
def build_member_query(first_name: str, last_name: str, chamber: Optional[str] = None) -> str:
    """Build a news search query for a member of Congress.

    Always quotes the full name; when the chamber is known, a second quoted
    "<title> <last name>" phrase is OR-ed in ("Senator" or "Rep.").
    """
    full_name = f"{first_name} {last_name}".strip()
    if not chamber:
        return f'"{full_name}"'
    honorific = "Senator" if "senate" in chamber.lower() else "Rep."
    return f'"{full_name}" OR "{honorific} {last_name}"'
|
||||
112
backend/app/services/trends_service.py
Normal file
112
backend/app/services/trends_service.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Google Trends service (via pytrends).
|
||||
|
||||
pytrends is unofficial web scraping — Google blocks it sporadically.
|
||||
All calls are wrapped in try/except and return 0 on any failure.
|
||||
"""
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_trends_score(keywords: list[str]) -> float:
    """
    Return a 0–100 interest score for the given keywords over the past 90 days.
    Returns 0.0 on any failure (rate limit, empty data, exception).
    """
    if not settings.PYTRENDS_ENABLED or not keywords:
        return 0.0
    try:
        from pytrends.request import TrendReq

        # Random jitter so request timing doesn't look automated.
        time.sleep(random.uniform(2.0, 5.0))

        session = TrendReq(hl="en-US", tz=0, timeout=(10, 25))
        terms = [kw for kw in keywords[:5] if kw]  # Google caps payloads at 5 terms
        if not terms:
            return 0.0

        session.build_payload(terms, timeframe="today 3-m", geo="US")
        interest = session.interest_over_time()
        if interest is None or interest.empty:
            return 0.0

        lead_term = terms[0]
        if lead_term not in interest.columns:
            return 0.0
        # Average of the most recent 14 data points for the primary keyword.
        return float(interest[lead_term].tail(14).mean())
    except Exception as e:
        logger.debug(f"pytrends failed (non-critical): {e}")
        return 0.0
|
||||
|
||||
|
||||
def get_trends_scores_batch(keyword_groups: list[list[str]]) -> list[float]:
    """
    Get pytrends scores for up to 5 keyword groups in a SINGLE pytrends call.
    Takes the first (most relevant) keyword from each group and compares them
    relative to each other.

    Returns a list of scores (0–100) in the same order as keyword_groups.
    On failure every group scores 0.0 — no per-group retry is attempted,
    since individual calls would just multiply the failures.
    """
    if not settings.PYTRENDS_ENABLED or not keyword_groups:
        return [0.0] * len(keyword_groups)

    # Primary (first) keyword per group, remembering each group's position.
    primaries = [(i, kws[0]) for i, kws in enumerate(keyword_groups) if kws]
    if not primaries:
        return [0.0] * len(keyword_groups)

    try:
        from pytrends.request import TrendReq

        time.sleep(random.uniform(2.0, 5.0))
        session = TrendReq(hl="en-US", tz=0, timeout=(10, 25))
        batch = primaries[:5]  # Google Trends compares at most 5 terms
        session.build_payload([kw for _, kw in batch], timeframe="today 3-m", geo="US")
        interest = session.interest_over_time()

        scores = [0.0] * len(keyword_groups)
        if interest is not None and not interest.empty:
            for position, kw in batch:
                if kw in interest.columns:
                    # Mean of the most recent 14 samples, as in get_trends_score.
                    scores[position] = float(interest[kw].tail(14).mean())
        return scores
    except Exception as e:
        logger.debug(f"pytrends batch failed (non-critical): {e}")
        return [0.0] * len(keyword_groups)
|
||||
|
||||
|
||||
def keywords_for_member(first_name: str, last_name: str) -> list[str]:
    """Extract meaningful search keywords for a member of Congress.

    Returns the member's full name as the single keyword, or [] when both
    name parts are blank.
    """
    combined = f"{first_name} {last_name}".strip()
    return [combined] if combined else []
|
||||
|
||||
|
||||
def keywords_for_bill(title: str, short_title: str, topic_tags: list[str]) -> list[str]:
    """Extract meaningful search keywords for a bill.

    Prefers the short title; otherwise uses the first five words of the full
    title (only when at least two words). Up to three topic tags (hyphens
    converted to spaces) are appended; the result is capped at five keywords.
    """
    selected: list[str] = []
    if short_title:
        selected.append(short_title)
    elif title:
        leading = title.split()[:5]
        if len(leading) >= 2:
            selected.append(" ".join(leading))
    for tag in (topic_tags or [])[:3]:
        selected.append(tag.replace("-", " "))
    return selected[:5]
|
||||
Reference in New Issue
Block a user