Initial commit

This commit is contained in:
Jack Levy
2026-02-28 21:08:19 -05:00
commit e418dd9ae0
85 changed files with 5261 additions and 0 deletions

View File

View File

@@ -0,0 +1,120 @@
"""
Congress.gov API client.
Rate limit: 5,000 requests/hour (enforced server-side by Congress.gov).
We track usage in Redis to stay well under the limit.
"""
import time
from datetime import datetime
from typing import Optional
import requests
from tenacity import retry, stop_after_attempt, wait_exponential
from app.config import settings
# Root of the Congress.gov v3 REST API; endpoint paths passed to _get() are appended to it.
BASE_URL = "https://api.congress.gov/v3"
def _get_current_congress(now: Optional[datetime] = None) -> int:
    """Return the number of the Congress in session at ``now``.

    A new Congress convenes on January 3 of every odd-numbered year, counting
    from the 1st in 1789, so the 118th began in 2023 and the 119th in 2025.

    Args:
        now: Moment to evaluate. Defaults to the current UTC time.

    Returns:
        The Congress number, e.g. 119 for any date from 2025-01-03 through
        2027-01-02.
    """
    if now is None:
        # Function-scope import: only ``datetime`` itself is imported at file level.
        from datetime import timezone

        now = datetime.now(timezone.utc)
    year = now.year
    # January 1-2 still belong to the Congress that was sitting the previous year.
    if now.month == 1 and now.day < 3:
        year -= 1
    # Two calendar years per Congress, starting with the 1st in 1789.
    return (year - 1789) // 2 + 1
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def _get(endpoint: str, params: dict) -> dict:
    """GET a Congress.gov endpoint and return the decoded JSON body.

    The API key and ``format=json`` are merged into a *copy* of ``params`` so
    the caller's dict is never modified (and the key cannot leak into anything
    the caller later logs or reuses).

    Args:
        endpoint: Path appended to ``BASE_URL`` (e.g. ``"/member"``).
        params: Query-string parameters for the request.

    Returns:
        The parsed JSON response.

    Raises:
        Whatever ``requests`` raises for transport errors or, via
        ``raise_for_status()``, for HTTP error statuses. The decorator makes up
        to 3 attempts with exponential backoff before giving up.
    """
    query = {**params, "api_key": settings.DATA_GOV_API_KEY, "format": "json"}
    response = requests.get(f"{BASE_URL}{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()
def get_current_congress() -> int:
    """Public accessor for the current Congress number."""
    congress_number = _get_current_congress()
    return congress_number
def build_bill_id(congress: int, bill_type: str, bill_number: int) -> str:
    """Compose the internal bill key ``<congress>-<type>-<number>``.

    The bill type is lower-cased; the other two parts are rendered as-is.
    """
    parts = (str(congress), bill_type.lower(), str(bill_number))
    return "-".join(parts)
def get_bills(
    congress: int,
    offset: int = 0,
    limit: int = 250,
    from_date_time: Optional[str] = None,
) -> dict:
    """Fetch one page of bills for ``congress``, sorted by update date descending.

    ``from_date_time`` is forwarded as the ``fromDateTime`` filter only when it
    is non-empty.
    """
    query: dict = dict(offset=offset, limit=limit, sort="updateDate+desc")
    if from_date_time:
        query.update(fromDateTime=from_date_time)
    endpoint = f"/bill/{congress}"
    return _get(endpoint, query)
def get_bill_detail(congress: int, bill_type: str, bill_number: int) -> dict:
    """Fetch the detail record for a single bill (type is lower-cased in the path)."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}"
    return _get(endpoint, {})
def get_bill_actions(congress: int, bill_type: str, bill_number: int, offset: int = 0) -> dict:
    """Fetch a page of up to 250 actions for a bill, starting at ``offset``."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}/actions"
    paging = {"offset": offset, "limit": 250}
    return _get(endpoint, paging)
def get_bill_text_versions(congress: int, bill_type: str, bill_number: int) -> dict:
    """Fetch the ``/text`` sub-resource (published text versions) of a bill."""
    endpoint = f"/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
    return _get(endpoint, {})
def get_members(offset: int = 0, limit: int = 250, current_member: bool = True) -> dict:
    """Fetch one page of members.

    When ``current_member`` is truthy the request carries
    ``currentMember=true``; otherwise the filter is omitted entirely.
    """
    query: dict = dict(offset=offset, limit=limit)
    if current_member:
        query.update(currentMember="true")
    return _get("/member", query)
def get_member_detail(bioguide_id: str) -> dict:
    """Fetch the detail record for the member identified by ``bioguide_id``."""
    endpoint = f"/member/{bioguide_id}"
    return _get(endpoint, {})
def get_committees(offset: int = 0, limit: int = 250) -> dict:
    """Fetch one page of committees."""
    paging = dict(offset=offset, limit=limit)
    return _get("/committee", paging)
def parse_bill_from_api(data: dict, congress: int) -> dict:
    """Map a raw API bill record onto our model's field names.

    Args:
        data: One bill object as returned by the API.
        congress: Congress number the bill belongs to.

    Returns:
        A dict keyed by model field. ``status`` is the latest action text
        capped at 100 characters (``None`` when there is no text), and
        ``chamber`` is "House" when the lower-cased type starts with "h",
        otherwise "Senate".
    """
    kind = data.get("type", "").lower()
    number = data.get("number", 0)
    action = data.get("latestAction") or {}
    action_text = action.get("text")

    status = action_text[:100] if action_text else None
    chamber = "House" if kind.startswith("h") else "Senate"

    return {
        "bill_id": build_bill_id(congress, kind, number),
        "congress_number": congress,
        "bill_type": kind,
        "bill_number": number,
        "title": data.get("title"),
        "short_title": data.get("shortTitle"),
        "introduced_date": data.get("introducedDate"),
        "latest_action_date": action.get("actionDate"),
        "latest_action_text": action_text,
        "status": status,
        "chamber": chamber,
        "congress_url": data.get("url"),
    }
def parse_member_from_api(data: dict) -> dict:
    """Map a raw API member record onto our model's field names.

    ``terms`` is accepted in any of these shapes: ``{"item": [...]}``,
    ``{"item": {...}}`` (a single term), a bare list of terms, or
    missing/``None``. The last term in the list is treated as the current one.
    ``depiction`` may likewise be missing or ``None``.

    Args:
        data: One member object as returned by the API.

    Returns:
        A dict keyed by model field. ``party`` is ``None`` when the API value
        is empty; ``district`` is the stringified district of the current term
        or ``None`` when absent/falsy.
    """
    terms = data.get("terms") or []
    if isinstance(terms, dict):
        terms = terms.get("item") or []
    if isinstance(terms, dict):
        # A lone term delivered as an object rather than a one-element list.
        terms = [terms]
    if not isinstance(terms, list):
        terms = []

    current_term = terms[-1] if terms else {}
    if not isinstance(current_term, dict):
        current_term = {}

    district = current_term.get("district")
    depiction = data.get("depiction") or {}

    return {
        "bioguide_id": data.get("bioguideId"),
        "name": data.get("name", ""),
        "first_name": data.get("firstName"),
        "last_name": data.get("lastName"),
        "party": data.get("partyName") or None,
        "state": data.get("state"),
        "chamber": current_term.get("chamber"),
        "district": str(district) if district else None,
        "photo_url": depiction.get("imageUrl") if isinstance(depiction, dict) else None,
        "official_url": data.get("officialWebsiteUrl"),
    }

View File

@@ -0,0 +1,95 @@
"""
GovInfo API client for fetching actual bill text.
Priority order for text formats: htm > txt > pdf
"""
import logging
import re
from typing import Optional
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
from app.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root of the GovInfo REST API; package endpoints below are built from it.
GOVINFO_BASE = "https://api.govinfo.gov"
# URL extensions tried in order by find_best_text_url(); earlier entries win.
# "htm" and "html" are both listed so either spelling of the HTML extension matches.
FORMAT_PRIORITY = ["htm", "html", "txt", "pdf"]
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=15))
def _get(url: str, params: dict = None) -> requests.Response:
    """GET ``url`` with the API key attached and return the raw response.

    Caller-supplied ``params`` are layered on top of the ``api_key`` entry, so
    a caller may override it. ``raise_for_status()`` turns HTTP error statuses
    into exceptions; the decorator makes up to 3 attempts with exponential
    backoff.
    """
    query = {"api_key": settings.DATA_GOV_API_KEY}
    query.update(params or {})
    resp = requests.get(url, params=query, timeout=60)
    resp.raise_for_status()
    return resp
def get_package_summary(package_id: str) -> dict:
    """Return the decoded JSON of a package's ``/summary`` endpoint."""
    summary_url = f"{GOVINFO_BASE}/packages/{package_id}/summary"
    return _get(summary_url).json()
def get_package_content_detail(package_id: str) -> dict:
    """Return the decoded JSON of a package's ``/content-detail`` endpoint."""
    detail_url = f"{GOVINFO_BASE}/packages/{package_id}/content-detail"
    return _get(detail_url).json()
def find_best_text_url(
    text_versions: list[dict],
    format_priority: Optional[list[str]] = None,
) -> tuple[Optional[str], Optional[str]]:
    """Pick the most preferred text URL from a list of text-version objects.

    Formats are matched by URL extension (case-insensitive) because the
    Congress.gov ``type`` strings are labels such as "Formatted Text" or "PDF".
    For each extension in priority order, versions and their formats are
    scanned in the order given and the first match wins.

    Args:
        text_versions: Text-version objects, each optionally carrying a
            ``formats`` list of ``{"url": ...}`` dicts. Entries that are not
            dicts, or whose ``url`` is missing or not a string, are ignored.
        format_priority: Extensions (without the dot) in preference order.
            Defaults to the module-level ``FORMAT_PRIORITY``.

    Returns:
        ``(url, extension)`` for the best match, or ``(None, None)`` when
        nothing matches. A 2-tuple is always returned so callers can unpack
        the result unconditionally.
    """
    priority = FORMAT_PRIORITY if format_priority is None else format_priority

    # Collect usable URLs once, preserving version/format order.
    candidates: list[str] = []
    for version in text_versions or []:
        if not isinstance(version, dict):
            continue
        for fmt_info in version.get("formats") or []:
            if not isinstance(fmt_info, dict):
                continue
            url = fmt_info.get("url")
            if isinstance(url, str) and url:
                candidates.append(url)

    for fmt in priority:
        suffix = f".{fmt}"
        for url in candidates:
            if url.lower().endswith(suffix):
                return url, fmt
    return None, None
def fetch_text_from_url(url: str, fmt: str) -> Optional[str]:
    """Download a document and return its plain text.

    ``fmt`` selects the extraction path: "htm"/"html" are stripped of markup,
    "txt" is returned verbatim, "pdf" goes through the PDF extractor. Any other
    value, or any exception during download or extraction, yields ``None``
    (exceptions are logged, not raised).
    """
    try:
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()
        if fmt == "pdf":
            return _extract_from_pdf(resp.content)
        if fmt == "txt":
            return resp.text
        if fmt in ("htm", "html"):
            return _extract_from_html(resp.text)
    except Exception as e:
        logger.error(f"Failed to fetch text from {url}: {e}")
    return None
def _extract_from_html(html: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    script/style/nav/header/footer elements are dropped before the text is
    extracted; runs of 3+ newlines become a blank line and runs of 2+ spaces
    become a single space.
    """
    soup = BeautifulSoup(html, "lxml")
    # Boilerplate and non-content elements are removed from the tree first.
    for element in soup(["script", "style", "nav", "header", "footer"]):
        element.decompose()
    raw_text = soup.get_text(separator="\n")
    # Newlines are collapsed first, then spaces — same order as before.
    collapsed = re.sub(r" {2,}", " ", re.sub(r"\n{3,}", "\n\n", raw_text))
    return collapsed.strip()
def _extract_from_pdf(content: bytes) -> Optional[str]:
    """Return the text of a PDF given as bytes, or ``None`` if extraction fails.

    pdfminer is imported lazily inside the ``try`` so that a missing or broken
    installation is logged and reported as ``None`` like any other failure.
    """
    try:
        from io import BytesIO
        from pdfminer.high_level import extract_text as pdf_extract

        buffer = BytesIO(content)
        return pdf_extract(buffer)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return None

View File

@@ -0,0 +1,327 @@
"""
LLM provider abstraction.
All providers implement generate_brief(doc_text, bill_metadata) -> ReverseBrief.
Select provider via LLM_PROVIDER env var.
"""
import json
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from app.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# System prompt for full-bill briefs. It pins the JSON schema whose keys
# (summary, key_points, risks, deadlines, topic_tags) parse_brief_json() reads.
# This is runtime text sent to the model — edit with care.
SYSTEM_PROMPT = """You are a nonpartisan legislative analyst specializing in translating complex \
legislation into clear, accurate summaries for informed citizens. You analyze bills objectively \
without political bias.
Always respond with valid JSON matching exactly this schema:
{
"summary": "2-4 paragraph plain-language summary of what this bill does",
"key_points": ["specific concrete fact 1", "specific concrete fact 2"],
"risks": ["legitimate concern or challenge 1", "legitimate concern 2"],
"deadlines": [{"date": "YYYY-MM-DD or null", "description": "what happens on this date"}],
"topic_tags": ["healthcare", "taxation"]
}
Rules:
- summary: Explain WHAT the bill does, not whether it is good or bad. Be factual and complete.
- key_points: 5-10 specific, concrete things the bill changes, authorizes, or appropriates.
- risks: Legitimate concerns from any perspective — costs, implementation challenges, \
constitutional questions, unintended consequences. Include at least 2 even for benign bills.
- deadlines: Only include if explicitly stated in the text. Use null for date if a deadline \
is mentioned without a specific date. Empty list if none.
- topic_tags: 3-8 lowercase tags. Prefer these standard tags: healthcare, taxation, defense, \
education, immigration, environment, housing, infrastructure, technology, agriculture, judiciary, \
foreign-policy, veterans, social-security, trade, budget, energy, banking, transportation, \
public-lands, labor, civil-rights, science.
Respond with ONLY valid JSON. No preamble, no explanation, no markdown code blocks."""
# Token budget for the bill text in prompts built by the OpenAI, Anthropic and Gemini providers.
MAX_TOKENS_DEFAULT = 6000
# Smaller bill-text budget used by the Ollama provider.
MAX_TOKENS_OLLAMA = 3000
# Conversion factor smart_truncate() uses to estimate tokens from character count.
TOKENS_PER_CHAR = 0.25 # rough approximation: 4 chars ≈ 1 token
@dataclass
class ReverseBrief:
    """Structured brief produced from an LLM response.

    Instances are built by parse_brief_json(); the first five fields mirror the
    JSON keys of the same name, the last two record which backend produced it.
    """
    # Plain-language summary text.
    summary: str
    # Concrete facts (or, for amendment briefs, concrete changes).
    key_points: list[str]
    # Concerns raised by the bill or by the change.
    risks: list[str]
    # Deadline objects; the prompts ask for {"date": ..., "description": ...} entries.
    deadlines: list[dict]
    # Topic tags assigned by the model.
    topic_tags: list[str]
    # Provider identifier passed by the caller (e.g. "openai", "ollama").
    llm_provider: str
    # Model name passed by the caller.
    llm_model: str
def smart_truncate(text: str, max_tokens: int) -> str:
"""Truncate bill text intelligently if it exceeds token budget."""
approx_tokens = len(text) * TOKENS_PER_CHAR
if approx_tokens <= max_tokens:
return text
# Keep first 75% of budget for the preamble (purpose section)
# and last 25% for effective dates / enforcement sections
preamble_chars = int(max_tokens * 0.75 / TOKENS_PER_CHAR)
tail_chars = int(max_tokens * 0.25 / TOKENS_PER_CHAR)
omitted_chars = len(text) - preamble_chars - tail_chars
return (
text[:preamble_chars]
+ f"\n\n[... {omitted_chars:,} characters omitted for length ...]\n\n"
+ text[-tail_chars:]
)
# System prompt for amendment briefs (what changed between two versions of a
# bill). It uses the same JSON keys as the full-bill prompt so the response can
# go through the same parser. This is runtime text sent to the model — edit with care.
AMENDMENT_SYSTEM_PROMPT = """You are a nonpartisan legislative analyst. A bill has been updated \
and you must summarize what changed between the previous and new version.
Always respond with valid JSON matching exactly this schema:
{
"summary": "2-3 paragraph plain-language description of what changed in this version",
"key_points": ["specific change 1", "specific change 2"],
"risks": ["new concern introduced by this change 1", "concern 2"],
"deadlines": [{"date": "YYYY-MM-DD or null", "description": "new deadline added"}],
"topic_tags": ["healthcare", "taxation"]
}
Rules:
- summary: Focus ONLY on what is different from the previous version. Be specific.
- key_points: List concrete additions, removals, or modifications in this version.
- risks: Only include risks that are new or changed relative to the previous version.
- deadlines: Only new or changed deadlines. Empty list if none.
- topic_tags: Same standard tags as before — include any new topics this version adds.
Respond with ONLY valid JSON. No preamble, no explanation, no markdown code blocks."""
def build_amendment_prompt(new_text: str, previous_text: str, bill_metadata: dict, max_tokens: int) -> str:
    """Build the user prompt asking the model to describe what changed between two bill versions.

    Args:
        new_text: Text of the new version.
        previous_text: Text of the previous version.
        bill_metadata: Dict read with ``.get`` for title, sponsor_name, party,
            state, latest_action_text and latest_action_date; missing keys fall
            back to the placeholder defaults in the template.
        max_tokens: Total token budget shared by both versions.

    Returns:
        The formatted prompt string.
    """
    # Each version gets half of the budget so the pair fits within max_tokens.
    half = max_tokens // 2
    truncated_new = smart_truncate(new_text, half)
    truncated_prev = smart_truncate(previous_text, half)
    # The previous version is placed before the new one in the prompt.
    return f"""A bill has been updated. Summarize what changed between the previous and new version.
BILL METADATA:
- Title: {bill_metadata.get('title', 'Unknown')}
- Sponsor: {bill_metadata.get('sponsor_name', 'Unknown')} \
({bill_metadata.get('party', '?')}-{bill_metadata.get('state', '?')})
- Latest Action: {bill_metadata.get('latest_action_text', 'None')} \
({bill_metadata.get('latest_action_date', 'Unknown')})
PREVIOUS VERSION:
{truncated_prev}
NEW VERSION:
{truncated_new}
Produce the JSON amendment summary now:"""
def build_prompt(doc_text: str, bill_metadata: dict, max_tokens: int) -> str:
    """Build the user prompt asking the model for a structured brief of one bill.

    Args:
        doc_text: Full bill text; truncated via smart_truncate() to ``max_tokens``.
        bill_metadata: Dict read with ``.get`` for title, sponsor_name, party,
            state, introduced_date, chamber, latest_action_text and
            latest_action_date; missing keys fall back to the placeholder
            defaults in the template.
        max_tokens: Token budget for the bill text.

    Returns:
        The formatted prompt string.
    """
    truncated = smart_truncate(doc_text, max_tokens)
    return f"""Analyze this legislation and produce a structured brief.
BILL METADATA:
- Title: {bill_metadata.get('title', 'Unknown')}
- Sponsor: {bill_metadata.get('sponsor_name', 'Unknown')} \
({bill_metadata.get('party', '?')}-{bill_metadata.get('state', '?')})
- Introduced: {bill_metadata.get('introduced_date', 'Unknown')}
- Chamber: {bill_metadata.get('chamber', 'Unknown')}
- Latest Action: {bill_metadata.get('latest_action_text', 'None')} \
({bill_metadata.get('latest_action_date', 'Unknown')})
BILL TEXT:
{truncated}
Produce the JSON brief now:"""
def parse_brief_json(raw: str | dict, provider: str, model: str) -> ReverseBrief:
    """Parse and normalise an LLM response into a ReverseBrief.

    Args:
        raw: Either the response text (optionally wrapped in a markdown code
            fence, which is stripped) or an already-decoded dict.
        provider: Provider identifier stored on the brief.
        model: Model name stored on the brief.

    Returns:
        A ReverseBrief. Missing or null fields become ``""`` / ``[]``; a list
        field that arrives as a single scalar (e.g. a lone string) is wrapped
        in a one-element list instead of being split into characters.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON, or the decoded
            value is not a JSON object.
    """
    if isinstance(raw, str):
        # Strip a leading ```/```json fence and a trailing ``` fence if present.
        cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
        cleaned = re.sub(r"\s*```$", "", cleaned.strip())
        data = json.loads(cleaned)
    else:
        data = raw

    if not isinstance(data, dict):
        # Reported as a decode error so callers that retry on malformed JSON
        # treat a wrong top-level type the same way.
        raise json.JSONDecodeError(
            "Expected a JSON object at the top level",
            raw if isinstance(raw, str) else "",
            0,
        )

    def _as_list(value) -> list:
        """Coerce a field to a list without exploding strings or dicts."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    summary = data.get("summary")
    return ReverseBrief(
        summary="" if summary is None else str(summary),
        key_points=_as_list(data.get("key_points")),
        risks=_as_list(data.get("risks")),
        deadlines=_as_list(data.get("deadlines")),
        topic_tags=_as_list(data.get("topic_tags")),
        llm_provider=provider,
        llm_model=model,
    )
class LLMProvider(ABC):
    """Interface every LLM backend implements."""

    @abstractmethod
    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Produce a brief from a bill's text and metadata."""

    @abstractmethod
    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Produce a brief describing what changed between two versions of a bill."""
class OpenAIProvider(LLMProvider):
    """LLM backend that calls the OpenAI chat completions API in JSON mode."""

    def __init__(self):
        # Imported lazily so the SDK is only required when this provider is selected.
        from openai import OpenAI

        self.client = OpenAI(api_key=settings.OPENAI_API_KEY)
        self.model = settings.OPENAI_MODEL

    def _request_brief(self, system_prompt: str, user_prompt: str) -> ReverseBrief:
        """Send one system+user exchange and parse the first choice as a brief."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
        )
        content = completion.choices[0].message.content
        return parse_brief_json(content, "openai", self.model)

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief a bill from its full text."""
        user_prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(SYSTEM_PROMPT, user_prompt)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief the differences between two versions of a bill."""
        user_prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(AMENDMENT_SYSTEM_PROMPT, user_prompt)
class AnthropicProvider(LLMProvider):
    """LLM backend that calls the Anthropic messages API."""

    def __init__(self):
        # Imported lazily so the SDK is only required when this provider is selected.
        import anthropic

        self.client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY)
        self.model = settings.ANTHROPIC_MODEL

    def _request_brief(self, system_prompt: str, user_prompt: str) -> ReverseBrief:
        """Send one user message under ``system_prompt`` and parse the first content block."""
        # A JSON-only reminder is appended to whichever system prompt is in use.
        reply = self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=system_prompt + "\n\nIMPORTANT: Respond with ONLY valid JSON. No other text.",
            messages=[{"role": "user", "content": user_prompt}],
        )
        text = reply.content[0].text
        return parse_brief_json(text, "anthropic", self.model)

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief a bill from its full text."""
        user_prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(SYSTEM_PROMPT, user_prompt)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief the differences between two versions of a bill."""
        user_prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(AMENDMENT_SYSTEM_PROMPT, user_prompt)
class GeminiProvider(LLMProvider):
    """LLM backend that calls Google's generative AI SDK with a JSON response MIME type."""

    def __init__(self):
        # Imported lazily so the SDK is only required when this provider is selected.
        import google.generativeai as genai

        genai.configure(api_key=settings.GEMINI_API_KEY)
        self._genai = genai
        self.model_name = settings.GEMINI_MODEL

    def _make_model(self, system_prompt: str):
        """Create a model handle bound to ``system_prompt``."""
        generation_config = {"response_mime_type": "application/json", "temperature": 0.1}
        return self._genai.GenerativeModel(
            model_name=self.model_name,
            generation_config=generation_config,
            system_instruction=system_prompt,
        )

    def _request_brief(self, system_prompt: str, user_prompt: str) -> ReverseBrief:
        """Generate content for ``user_prompt`` and parse the response text as a brief."""
        reply = self._make_model(system_prompt).generate_content(user_prompt)
        return parse_brief_json(reply.text, "gemini", self.model_name)

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief a bill from its full text."""
        user_prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(SYSTEM_PROMPT, user_prompt)

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief the differences between two versions of a bill."""
        user_prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_DEFAULT)
        return self._request_brief(AMENDMENT_SYSTEM_PROMPT, user_prompt)
class OllamaProvider(LLMProvider):
    """LLM backend that calls an Ollama server's ``/api/generate`` endpoint.

    The system and user prompts are concatenated into a single prompt. If the
    first response cannot be parsed, the request is repeated once with a
    stricter JSON-only instruction appended.
    """

    # Appended to the user prompt on the single retry after a parse failure.
    _STRICT_JSON_SUFFIX = (
        "\n\nCRITICAL: Your response MUST be valid JSON only. "
        "No text before or after the JSON object."
    )

    def __init__(self):
        self.base_url = settings.OLLAMA_BASE_URL.rstrip("/")
        self.model = settings.OLLAMA_MODEL

    def _generate(self, system_prompt: str, user_prompt: str) -> str:
        """POST one non-streaming generate request and return the ``response`` text.

        Returns an empty string when the reply has no ``response`` field.
        Raises whatever ``requests`` raises on transport or HTTP errors.
        """
        import requests as req

        response = req.post(
            f"{self.base_url}/api/generate",
            json={
                "model": self.model,
                "prompt": f"{system_prompt}\n\n{user_prompt}",
                "stream": False,
                "format": "json",
            },
            timeout=300,
        )
        response.raise_for_status()
        # Validation and retry happen in _brief_with_retry(); the text is
        # returned untouched here.
        return response.json().get("response", "")

    def _brief_with_retry(self, system_prompt: str, prompt: str, failure_label: str) -> ReverseBrief:
        """Generate and parse a brief, retrying once with a stricter prompt on parse failure."""
        raw = self._generate(system_prompt, prompt)
        try:
            return parse_brief_json(raw, "ollama", self.model)
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"{failure_label}, retrying: {e}")
            retry_raw = self._generate(system_prompt, prompt + self._STRICT_JSON_SUFFIX)
            # A second failure propagates to the caller.
            return parse_brief_json(retry_raw, "ollama", self.model)

    def generate_brief(self, doc_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief a bill from its full text, using the smaller Ollama token budget."""
        prompt = build_prompt(doc_text, bill_metadata, MAX_TOKENS_OLLAMA)
        return self._brief_with_retry(SYSTEM_PROMPT, prompt, "Ollama JSON parse failed")

    def generate_amendment_brief(self, new_text: str, previous_text: str, bill_metadata: dict) -> ReverseBrief:
        """Brief the differences between two versions of a bill."""
        prompt = build_amendment_prompt(new_text, previous_text, bill_metadata, MAX_TOKENS_OLLAMA)
        return self._brief_with_retry(
            AMENDMENT_SYSTEM_PROMPT, prompt, "Ollama amendment JSON parse failed"
        )
def get_llm_provider() -> LLMProvider:
    """Instantiate the provider named by ``settings.LLM_PROVIDER`` (case-insensitive).

    Raises:
        ValueError: If the setting does not name a known provider.
    """
    provider = settings.LLM_PROVIDER.lower()
    registry = {
        "openai": OpenAIProvider,
        "anthropic": AnthropicProvider,
        "gemini": GeminiProvider,
        "ollama": OllamaProvider,
    }
    provider_cls = registry.get(provider)
    if provider_cls is None:
        raise ValueError(f"Unknown LLM_PROVIDER: '{provider}'. Must be one of: openai, anthropic, gemini, ollama")
    return provider_cls()

View File

@@ -0,0 +1,89 @@
"""
News correlation service.
- NewsAPI.org: structured news articles per bill (100 req/day limit)
- Google News RSS: volume signal for zeitgeist scoring (no limit)
"""
import logging
import time
import urllib.parse
from datetime import datetime, timedelta, timezone
from typing import Optional
import feedparser
import requests
from tenacity import retry, stop_after_attempt, wait_exponential
from app.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root of the NewsAPI.org v2 REST API.
NEWSAPI_BASE = "https://newsapi.org/v2"
# Google News RSS search endpoint; the query string is appended in fetch_gnews_count().
GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
# NOTE(review): not referenced by any function in this module — presumably
# enforced by the caller; confirm.
NEWSAPI_DAILY_LIMIT = 95 # Leave 5 as buffer
@retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5))
def _newsapi_get(endpoint: str, params: dict) -> dict:
    """GET a NewsAPI endpoint and return the decoded JSON body.

    The API key is merged into a *copy* of ``params`` so the caller's dict is
    never modified (and the key cannot leak into anything the caller later
    logs or reuses).

    Args:
        endpoint: Path segment appended to ``NEWSAPI_BASE`` (e.g. ``"everything"``).
        params: Query-string parameters for the request.

    Returns:
        The parsed JSON response.

    Raises:
        Whatever ``requests`` raises for transport errors or, via
        ``raise_for_status()``, for HTTP error statuses. The decorator makes up
        to 2 attempts with exponential backoff before giving up.
    """
    query = {**params, "apiKey": settings.NEWSAPI_KEY}
    response = requests.get(f"{NEWSAPI_BASE}/{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()
def build_news_query(bill_title: str, short_title: Optional[str], sponsor_name: Optional[str],
                     bill_type: str, bill_number: int) -> str:
    """Build a NewsAPI search query for a bill.

    The query is the quoted short title if there is one, otherwise the quoted
    first six words of the full title (only when at least three words are
    available), OR-ed with the quoted bill designation such as "HR 12".
    ``sponsor_name`` is accepted but not used.
    """
    if short_title:
        phrases = ['"' + short_title + '"']
    else:
        lead_words = bill_title.split()[:6] if bill_title else []
        # Fewer than three words makes too generic a phrase to search on.
        phrases = ['"' + " ".join(lead_words) + '"'] if len(lead_words) >= 3 else []
    # The bill designation is always included as a fallback term.
    phrases.append('"' + bill_type.upper() + " " + str(bill_number) + '"')
    # Keep queries short for relevance.
    return " OR ".join(phrases[:2])
def fetch_newsapi_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch up to 10 relevance-sorted English articles from the past ``days`` days.

    Returns an empty list when no API key is configured or when the request
    fails for any reason (failures are logged, not raised). Articles lacking a
    URL or a title are skipped.
    """
    if not settings.NEWSAPI_KEY:
        return []
    try:
        window_start = datetime.now(timezone.utc) - timedelta(days=days)
        payload = _newsapi_get("everything", {
            "q": query,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": 10,
            "from": window_start.strftime("%Y-%m-%d"),
        })
        results = []
        for article in payload.get("articles", []):
            if not (article.get("url") and article.get("title")):
                continue
            results.append({
                "source": article.get("source", {}).get("name", ""),
                "headline": article.get("title", ""),
                "url": article.get("url", ""),
                "published_at": article.get("publishedAt"),
            })
        return results
    except Exception as e:
        logger.error(f"NewsAPI fetch failed: {e}")
        return []
def fetch_gnews_count(query: str, days: int = 30) -> int:
    """Return how many entries the Google News RSS search yields for the past ``days`` days.

    Used as a volume signal. Sleeps one second before each request as a polite
    delay; any exception is logged and reported as 0.
    """
    try:
        search = urllib.parse.quote(f"{query} when:{days}d")
        feed_url = f"{GOOGLE_NEWS_RSS}?q={search}&hl=en-US&gl=US&ceid=US:en"
        time.sleep(1)  # Polite delay
        return len(feedparser.parse(feed_url).entries)
    except Exception as e:
        logger.error(f"Google News RSS fetch failed: {e}")
        return 0

View File

@@ -0,0 +1,64 @@
"""
Google Trends service (via pytrends).
pytrends is unofficial web scraping — Google blocks it sporadically.
All calls are wrapped in try/except and return 0 on any failure.
"""
import logging
import random
import time
from app.config import settings
# Module-level logger, named after this module; pytrends failures are logged at DEBUG.
logger = logging.getLogger(__name__)
def get_trends_score(keywords: list[str]) -> float:
    """Return a 0-100 interest score for ``keywords`` over the past 3 months (US).

    Only the first five non-empty keywords are sent, and the score is the mean
    of the 14 most recent data points for the first of them. Returns 0.0 when
    pytrends is disabled, no usable keyword is given, no data comes back, or
    anything raises.
    """
    if not (settings.PYTRENDS_ENABLED and keywords):
        return 0.0
    try:
        from pytrends.request import TrendReq

        # Random pause so requests are not evenly spaced (avoid bot detection).
        time.sleep(random.uniform(2.0, 5.0))
        client = TrendReq(hl="en-US", tz=0, timeout=(10, 25))

        terms = [kw for kw in keywords[:5] if kw]  # max 5 keywords
        if not terms:
            return 0.0

        client.build_payload(terms, timeframe="today 3-m", geo="US")
        frame = client.interest_over_time()
        if frame is None or frame.empty:
            return 0.0

        lead = terms[0]
        if lead not in frame.columns:
            return 0.0
        return float(frame[lead].tail(14).mean())
    except Exception as e:
        logger.debug(f"pytrends failed (non-critical): {e}")
        return 0.0
def keywords_for_bill(title: str, short_title: str, topic_tags: list[str]) -> list[str]:
    """Derive search keywords for a bill.

    The first keyword is the short title if present, otherwise the first five
    words of the title (only when there are at least two). Up to three topic
    tags follow, with hyphens turned into spaces. At most five keywords are
    returned.
    """
    if short_title:
        primary = [short_title]
    else:
        lead_words = title.split()[:5] if title else []
        primary = [" ".join(lead_words)] if len(lead_words) >= 2 else []
    tag_terms = [tag.replace("-", " ") for tag in (topic_tags or [])[:3]]
    return (primary + tag_terms)[:5]