""" News correlation service. - NewsAPI.org: structured news articles per bill (100 req/day limit) - Google News RSS: volume signal for zeitgeist scoring (no limit) """ import logging import time import urllib.parse from datetime import datetime, timedelta, timezone from typing import Optional import feedparser import requests from tenacity import retry, stop_after_attempt, wait_exponential from app.config import settings logger = logging.getLogger(__name__) NEWSAPI_BASE = "https://newsapi.org/v2" GOOGLE_NEWS_RSS = "https://news.google.com/rss/search" NEWSAPI_DAILY_LIMIT = 95 # Leave 5 as buffer @retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5)) def _newsapi_get(endpoint: str, params: dict) -> dict: params["apiKey"] = settings.NEWSAPI_KEY response = requests.get(f"{NEWSAPI_BASE}/{endpoint}", params=params, timeout=30) response.raise_for_status() return response.json() def build_news_query(bill_title: str, short_title: Optional[str], sponsor_name: Optional[str], bill_type: str, bill_number: int) -> str: """Build a NewsAPI search query for a bill.""" terms = [] if short_title: terms.append(f'"{short_title}"') elif bill_title: # Use first 6 words of title as phrase words = bill_title.split()[:6] if len(words) >= 3: terms.append(f'"{" ".join(words)}"') # Add bill number as fallback terms.append(f'"{bill_type.upper()} {bill_number}"') return " OR ".join(terms[:2]) # Keep queries short for relevance def fetch_newsapi_articles(query: str, days: int = 30) -> list[dict]: """Fetch articles from NewsAPI.org. Returns empty list if quota is exhausted or key not set.""" if not settings.NEWSAPI_KEY: return [] try: from_date = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d") data = _newsapi_get("everything", { "q": query, "language": "en", "sortBy": "relevancy", "pageSize": 10, "from": from_date, }) articles = data.get("articles", []) return [ { "source": a.get("source", {}).get("name", ""), "headline": a.get("title", ""), "url": a.get("url", ""), "published_at": a.get("publishedAt"), } for a in articles if a.get("url") and a.get("title") ] except Exception as e: logger.error(f"NewsAPI fetch failed: {e}") return [] def fetch_gnews_count(query: str, days: int = 30) -> int: """Count articles in Google News RSS for the past N days. Used as volume signal.""" try: encoded = urllib.parse.quote(f"{query} when:{days}d") url = f"{GOOGLE_NEWS_RSS}?q={encoded}&hl=en-US&gl=US&ceid=US:en" time.sleep(1) # Polite delay feed = feedparser.parse(url) return len(feed.entries) except Exception as e: logger.error(f"Google News RSS fetch failed: {e}") return 0 def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]: """Fetch articles from Google News RSS. No rate limit — unlimited source.""" import time as time_mod try: encoded = urllib.parse.quote(f"{query} when:{days}d") url = f"{GOOGLE_NEWS_RSS}?q={encoded}&hl=en-US&gl=US&ceid=US:en" time.sleep(1) # Polite delay feed = feedparser.parse(url) articles = [] for entry in feed.entries[:20]: pub_at = None if entry.get("published_parsed"): try: pub_at = datetime.fromtimestamp( time_mod.mktime(entry.published_parsed), tz=timezone.utc ).isoformat() except Exception: pass source = "" if hasattr(entry, "source") and isinstance(entry.source, dict): source = entry.source.get("title", "") elif entry.get("tags"): source = entry.tags[0].get("term", "") if entry.tags else "" articles.append({ "source": source or "Google News", "headline": entry.get("title", ""), "url": entry.get("link", ""), "published_at": pub_at, }) return [a for a in articles if a["url"] and a["headline"]] except Exception as e: logger.error(f"Google News RSS article fetch failed: {e}") return [] def build_member_query(first_name: str, last_name: str, chamber: Optional[str] = None) -> str: """Build a news search query for a member of Congress.""" full_name = f"{first_name} {last_name}".strip() title = "" if chamber: if "senate" in chamber.lower(): title = "Senator" else: title = "Rep." if title: return f'"{full_name}" OR "{title} {last_name}"' return f'"{full_name}"'