fix(news): robust gnews URL extraction + smarter lazy trigger

- Replace the fragile entry.get("link") lookup with a _gnews_entry_url() helper that checks the entry.link attribute and then falls back to the first non-empty entry.links[].href, fixing cases where feedparser puts the URL in a non-standard location.
- The lazy news re-fetch on bill detail now triggers only when the stored trend score confirms gnews_count > 0, preventing endless re-queuing for bills with genuinely no news coverage.

Co-Authored-By: Jack Levy
2026-03-01 00:49:02 -05:00
parent 50f93468db
commit b57833d4b7
2 changed files with 34 additions and 14 deletions
--- a/backend/app/api/bills.py
+++ b/backend/app/api/bills.py
@@ -110,8 +110,11 @@ async def get_bill(bill_id: str, db: AsyncSession = Depends(get_db)):
    if bill.trend_scores:
        detail.latest_trend = bill.trend_scores[0]

-    # Trigger a background news refresh if no articles are stored yet
-    if not bill.news_articles:
+    # Trigger a background news refresh if no articles are stored but trend
+    # data shows there are gnews results out there waiting to be fetched.
+    latest_trend = bill.trend_scores[0] if bill.trend_scores else None
+    has_gnews = latest_trend and (latest_trend.gnews_count or 0) > 0
+    if not bill.news_articles and has_gnews:
        try:
            from app.workers.news_fetcher import fetch_news_for_bill
            fetch_news_for_bill.delay(bill_id)
--- a/backend/app/services/news_service.py
+++ b/backend/app/services/news_service.py
@@ -89,6 +89,20 @@ def fetch_gnews_count(query: str, days: int = 30) -> int:
        return 0


+def _gnews_entry_url(entry) -> str:
+    """Extract the article URL from a feedparser Google News RSS entry."""
+    # Primary: entry.link attribute
+    link = getattr(entry, "link", None) or entry.get("link", "")
+    if link:
+        return link
+    # Fallback: first non-empty href in entry.links (rel is not checked)
+    for lnk in getattr(entry, "links", []):
+        href = lnk.get("href", "")
+        if href:
+            return href
+    return ""
+
+
 def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
    """Fetch articles from Google News RSS. No rate limit — unlimited source."""
    import time as time_mod
@@ -100,25 +114,28 @@ def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
        articles = []
        for entry in feed.entries[:20]:
            pub_at = None
-            if entry.get("published_parsed"):
+            if getattr(entry, "published_parsed", None):
                try:
                    pub_at = datetime.fromtimestamp(
                        time_mod.mktime(entry.published_parsed), tz=timezone.utc
                    ).isoformat()
                except Exception:
                    pass
+            # Source: feedparser puts it in entry.source.title for Google News
            source = ""
-            if hasattr(entry, "source") and isinstance(entry.source, dict):
-                source = entry.source.get("title", "")
-            elif entry.get("tags"):
-                source = entry.tags[0].get("term", "") if entry.tags else ""
+            src = getattr(entry, "source", None)
+            if src:
+                source = getattr(src, "title", "") or src.get("title", "")
+            headline = entry.get("title", "") or getattr(entry, "title", "")
+            article_url = _gnews_entry_url(entry)
+            if article_url and headline:
                articles.append({
                    "source": source or "Google News",
-                "headline": entry.get("title", ""),
-                "url": entry.get("link", ""),
+                    "headline": headline,
+                    "url": article_url,
                    "published_at": pub_at,
                })
-        return [a for a in articles if a["url"] and a["headline"]]
+        return articles
    except Exception as e:
        logger.error(f"Google News RSS article fetch failed: {e}")
        return []