# PocketVeto/backend/app/services/news_service.py

"""
News correlation service.

- NewsAPI.org: structured news articles per bill (100 req/day limit)
- Google News RSS: volume signal for zeitgeist scoring (no limit)
"""
import logging
import time
import urllib.parse
from datetime import date, datetime, timedelta, timezone
from typing import Optional

import feedparser
import redis
import requests
from tenacity import retry, stop_after_attempt, wait_exponential

from app.config import settings

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the NewsAPI.org v2 API; an endpoint name is appended after a "/".
NEWSAPI_BASE = "https://newsapi.org/v2"
# Google News RSS search feed; the search terms go in the "q" URL parameter.
GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
# Self-imposed daily cap on NewsAPI calls, kept below the 100 req/day limit.
NEWSAPI_DAILY_LIMIT = 95  # Leave 5 as buffer

# Prefix of the per-day Redis counter key; the ISO date is appended to it.
_NEWSAPI_REDIS_PREFIX = "newsapi:daily_calls:"


def _newsapi_redis():
    """Create a Redis client for the NewsAPI quota counter.

    A new client is built from ``settings.REDIS_URL`` on every call, with
    ``decode_responses=True`` passed to ``redis.from_url``.
    """
    client = redis.from_url(settings.REDIS_URL, decode_responses=True)
    return client


def _newsapi_quota_ok() -> bool:
    """Return True if we have NewsAPI quota remaining for today.

    Reads today's call counter from Redis (a missing counter counts as 0)
    and compares it with ``NEWSAPI_DAILY_LIMIT``.

    Fails open: if the counter cannot be read or parsed, the failure is
    logged and True is returned, so a Redis problem never blocks fetching.
    """
    key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
    try:
        used = int(_newsapi_redis().get(key) or 0)
    except Exception as exc:
        # Don't block on Redis errors, but leave a trace: while this
        # persists the daily limit is not being enforced.
        logger.warning(
            "NewsAPI quota check failed, assuming quota is available: %s", exc
        )
        return True
    return used < NEWSAPI_DAILY_LIMIT


def _newsapi_record_call():
    """Increment today's NewsAPI call counter in Redis (best effort).

    The increment and the expiry refresh are queued on one pipeline and
    executed together. Any failure is logged and otherwise ignored, so
    callers never see an exception from quota bookkeeping.
    """
    key = f"{_NEWSAPI_REDIS_PREFIX}{date.today().isoformat()}"
    try:
        pipe = _newsapi_redis().pipeline()
        pipe.incr(key)
        pipe.expire(key, 90000)  # 25 hours — expires safely after midnight
        pipe.execute()
    except Exception as exc:
        # Best effort: a missed increment only makes the quota check lenient.
        logger.warning("Failed to record NewsAPI call in Redis: %s", exc)


@retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5))
def _newsapi_get(endpoint: str, params: dict) -> dict:
    """GET a NewsAPI endpoint and return the decoded JSON body.

    Args:
        endpoint: Path segment appended to ``NEWSAPI_BASE`` (e.g. "everything").
        params: Query parameters. The API key is added to a copy, so the
            caller's dict is left untouched.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: On a 4xx/5xx response (via ``raise_for_status``).
        Other ``requests`` exceptions may surface for connection or timeout
        failures. The call is attempted at most twice, with an exponential
        wait bounded between 1 and 5 seconds (see the ``retry`` decorator).
    """
    # Copy instead of mutating: keeps the key out of the caller's dict,
    # which may be logged or reused.
    query = {**params, "apiKey": settings.NEWSAPI_KEY}
    response = requests.get(f"{NEWSAPI_BASE}/{endpoint}", params=query, timeout=30)
    response.raise_for_status()
    return response.json()


def build_news_query(bill_title: str, short_title: Optional[str], sponsor_name: Optional[str],
                     bill_type: str, bill_number: int) -> str:
    """Compose the NewsAPI search string for a bill.

    The query is an optional quoted title phrase OR-ed with the quoted bill
    reference (upper-cased type plus number). The title phrase is the short
    title when one is given; otherwise the first six words of the full
    title, used only if at least three words are available.
    ``sponsor_name`` is accepted but not used in the query.
    """
    if short_title:
        phrase = f'"{short_title}"'
    else:
        leading_words = bill_title.split()[:6] if bill_title else []
        if len(leading_words) >= 3:
            joined = " ".join(leading_words)
            phrase = f'"{joined}"'
        else:
            # Too short to be a useful exact phrase.
            phrase = None

    # The bill reference is always part of the query.
    bill_ref = f'"{bill_type.upper()} {bill_number}"'
    parts = [bill_ref] if phrase is None else [phrase, bill_ref]
    return " OR ".join(parts)


def fetch_newsapi_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from NewsAPI.org. Returns empty list if quota is exhausted or key not set.

    Args:
        query: NewsAPI search expression (sent as ``q``).
        days: How far back to search; the ``from`` date is today (UTC)
            minus this many days.

    Returns:
        A list of dicts with keys ``source``, ``headline``, ``url`` and
        ``published_at``, limited to articles that have both a URL and a
        title. An empty list is returned when no API key is configured,
        when the daily quota is used up, or when the request fails (the
        failure is logged, never raised).
    """
    if not settings.NEWSAPI_KEY:
        return []
    if not _newsapi_quota_ok():
        logger.warning("NewsAPI daily quota exhausted — skipping fetch")
        return []
    try:
        from_date = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
        data = _newsapi_get("everything", {
            "q": query,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": 10,
            "from": from_date,
        })
        # Only successful calls are counted against the local quota.
        _newsapi_record_call()
        return [
            {
                # "source" or its "name" may be JSON null; fall back to ""
                # rather than failing the whole batch or storing None.
                "source": (a.get("source") or {}).get("name") or "",
                "headline": a.get("title", ""),
                "url": a.get("url", ""),
                "published_at": a.get("publishedAt"),
            }
            # A null "articles" value is treated like an empty list.
            for a in data.get("articles") or []
            if a.get("url") and a.get("title")
        ]
    except Exception as e:
        logger.error("NewsAPI fetch failed: %s", e)
        return []


def fetch_gnews_count(query: str, days: int = 30) -> int:
    """Return how many entries the Google News RSS search yields for the query.

    The search is limited to the past ``days`` days with a ``when:<days>d``
    term appended to the query. The result is used as a volume signal.
    Any exception is logged and reported as a count of 0.
    """
    try:
        search_terms = urllib.parse.quote(f"{query} when:{days}d")
        feed_url = f"{GOOGLE_NEWS_RSS}?q={search_terms}&hl=en-US&gl=US&ceid=US:en"
        # Polite delay before each request to the feed.
        time.sleep(1)
        parsed_feed = feedparser.parse(feed_url)
        entry_count = len(parsed_feed.entries)
        return entry_count
    except Exception as err:
        logger.error(f"Google News RSS fetch failed: {err}")
        return 0


def _gnews_entry_url(entry) -> str:
    """Extract the article URL from a feedparser Google News RSS entry."""
    # Primary: entry.link attribute
    link = getattr(entry, "link", None) or entry.get("link", "")
    if link:
        return link
    # Fallback: scan entry.links list for rel=alternate
    for lnk in getattr(entry, "links", []):
        href = lnk.get("href", "")
        if href:
            return href
    return ""


def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from Google News RSS. No rate limit — unlimited source.

    Args:
        query: Search terms; a ``when:<days>d`` term is appended to limit
            the search window.
        days: Size of the search window in days.

    Returns:
        Up to 20 dicts with keys ``source``, ``headline``, ``url`` and
        ``published_at`` (ISO-8601 UTC string, or None when the entry has
        no usable publication date). Entries without a URL or a headline
        are skipped. Any exception is logged and yields an empty list.
    """
    try:
        encoded = urllib.parse.quote(f"{query} when:{days}d")
        url = f"{GOOGLE_NEWS_RSS}?q={encoded}&hl=en-US&gl=US&ceid=US:en"
        time.sleep(1)  # Polite delay
        feed = feedparser.parse(url)
        articles = []
        for entry in feed.entries[:20]:
            pub_at = None
            parsed = getattr(entry, "published_parsed", None)
            if parsed:
                try:
                    # feedparser reports *_parsed dates as UTC time tuples,
                    # so build the datetime directly in UTC. time.mktime()
                    # would read the tuple as local time and shift the
                    # result by the server's UTC offset.
                    pub_at = datetime(*parsed[:6], tzinfo=timezone.utc).isoformat()
                except (TypeError, ValueError):
                    # Malformed date tuple: keep the article, drop the date.
                    pub_at = None
            # Source: feedparser puts it in entry.source.title for Google News
            source = ""
            src = getattr(entry, "source", None)
            if src:
                source = getattr(src, "title", "") or src.get("title", "")
            headline = entry.get("title", "") or getattr(entry, "title", "")
            article_url = _gnews_entry_url(entry)
            if article_url and headline:
                articles.append({
                    "source": source or "Google News",
                    "headline": headline,
                    "url": article_url,
                    "published_at": pub_at,
                })
        return articles
    except Exception as e:
        logger.error("Google News RSS article fetch failed: %s", e)
        return []


def build_member_query(first_name: str, last_name: str, chamber: Optional[str] = None) -> str:
    """Compose the news search string for a member of Congress.

    The query always contains the quoted full name. When ``chamber`` is
    given, a second quoted phrase "<title> <last name>" is OR-ed in, where
    the title is "Senator" if the chamber mentions "senate"
    (case-insensitive) and "Rep." otherwise.
    """
    person = "{} {}".format(first_name, last_name).strip()
    exact_name = f'"{person}"'
    if not chamber:
        return exact_name

    honorific = "Senator" if "senate" in chamber.lower() else "Rep."
    return f'{exact_name} OR "{honorific} {last_name}"'