fix(news): robust gnews URL extraction + smarter lazy trigger

- Replace fragile entry.get("link") with _gnews_entry_url() helper that
  checks entry.link attribute then falls back to entry.links[].href,
  fixing cases where feedparser puts the URL in a non-standard location
- Lazy news re-fetch on bill detail now only triggers when the stored
  trend score confirms gnews_count > 0, preventing endless re-queuing
  for bills with genuinely no news coverage

Co-Authored-By: Jack Levy
This commit is contained in:
Jack Levy
2026-03-01 00:49:02 -05:00
parent 50f93468db
commit b57833d4b7
2 changed files with 34 additions and 14 deletions

View File

@@ -110,8 +110,11 @@ async def get_bill(bill_id: str, db: AsyncSession = Depends(get_db)):
if bill.trend_scores: if bill.trend_scores:
detail.latest_trend = bill.trend_scores[0] detail.latest_trend = bill.trend_scores[0]
# Trigger a background news refresh if no articles are stored yet # Trigger a background news refresh if no articles are stored but trend
if not bill.news_articles: # data shows there are gnews results out there waiting to be fetched.
latest_trend = bill.trend_scores[0] if bill.trend_scores else None
has_gnews = latest_trend and (latest_trend.gnews_count or 0) > 0
if not bill.news_articles and has_gnews:
try: try:
from app.workers.news_fetcher import fetch_news_for_bill from app.workers.news_fetcher import fetch_news_for_bill
fetch_news_for_bill.delay(bill_id) fetch_news_for_bill.delay(bill_id)

View File

@@ -89,6 +89,20 @@ def fetch_gnews_count(query: str, days: int = 30) -> int:
return 0 return 0
def _gnews_entry_url(entry) -> str:
    """Extract the article URL from a feedparser Google News RSS entry.

    Lookup order:
      1. The entry's primary ``link`` field.
      2. The first element of ``links`` whose ``rel`` is ``"alternate"``
         and which has a non-empty ``href``.
      3. The first element of ``links`` with any non-empty ``href``.

    Fields are read by attribute first and by mapping ``.get`` second, so
    both attribute-style and dict-style entries are accepted.

    Args:
        entry: A parsed feed entry (attribute- and/or mapping-accessible).

    Returns:
        The article URL, or ``""`` when no usable URL is present.
    """

    def _field(obj, name):
        # Try attribute access, then fall back to mapping access so that
        # plain dicts (and dict-like link records) work as well.
        value = getattr(obj, name, None)
        if not value and hasattr(obj, "get"):
            value = obj.get(name)
        return value

    # Primary: the entry's own link field.
    primary = _field(entry, "link")
    if primary:
        return primary

    # Fallback: scan entry.links. A rel="alternate" link is the article
    # itself; other rels (enclosure, self, related, ...) point elsewhere,
    # so they are only used when no alternate link carries an href.
    first_href = ""
    for link_item in _field(entry, "links") or []:
        href = _field(link_item, "href")
        if not href:
            continue
        if _field(link_item, "rel") == "alternate":
            return href
        if not first_href:
            first_href = href
    return first_href
def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]: def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
"""Fetch articles from Google News RSS. No rate limit — unlimited source.""" """Fetch articles from Google News RSS. No rate limit — unlimited source."""
import time as time_mod import time as time_mod
@@ -100,25 +114,28 @@ def fetch_gnews_articles(query: str, days: int = 30) -> list[dict]:
articles = [] articles = []
for entry in feed.entries[:20]: for entry in feed.entries[:20]:
pub_at = None pub_at = None
if entry.get("published_parsed"): if getattr(entry, "published_parsed", None):
try: try:
pub_at = datetime.fromtimestamp( pub_at = datetime.fromtimestamp(
time_mod.mktime(entry.published_parsed), tz=timezone.utc time_mod.mktime(entry.published_parsed), tz=timezone.utc
).isoformat() ).isoformat()
except Exception: except Exception:
pass pass
# Source: feedparser puts it in entry.source.title for Google News
source = "" source = ""
if hasattr(entry, "source") and isinstance(entry.source, dict): src = getattr(entry, "source", None)
source = entry.source.get("title", "") if src:
elif entry.get("tags"): source = getattr(src, "title", "") or src.get("title", "")
source = entry.tags[0].get("term", "") if entry.tags else "" headline = entry.get("title", "") or getattr(entry, "title", "")
articles.append({ article_url = _gnews_entry_url(entry)
"source": source or "Google News", if article_url and headline:
"headline": entry.get("title", ""), articles.append({
"url": entry.get("link", ""), "source": source or "Google News",
"published_at": pub_at, "headline": headline,
}) "url": article_url,
return [a for a in articles if a["url"] and a["headline"]] "published_at": pub_at,
})
return articles
except Exception as e: except Exception as e:
logger.error(f"Google News RSS article fetch failed: {e}") logger.error(f"Google News RSS article fetch failed: {e}")
return [] return []