diff --git a/backend/app/workers/bill_classifier.py b/backend/app/workers/bill_classifier.py index 0bb1a44..4f21294 100644 --- a/backend/app/workers/bill_classifier.py +++ b/backend/app/workers/bill_classifier.py @@ -140,6 +140,9 @@ def fetch_bill_cosponsors(self, bill_id: str): return {"status": "skipped"} known_bioguides = {row[0] for row in db.execute(text("SELECT bioguide_id FROM members")).fetchall()} + # Track bioguide_ids already seen this run (inserted or found in the DB) to handle repeat listings + # (Congress.gov sometimes lists the same member twice with different dates) + inserted_this_run: set[str] = set() inserted = 0 offset = 0 @@ -157,12 +160,15 @@ def fetch_bill_cosponsors(self, bill_id: str): if bioguide_id and bioguide_id not in known_bioguides: bioguide_id = None - # Skip if we already have this (bioguide_id, bill_id) pair + # Skip dupes — both across runs (DB check) and within this run (in-memory set, spans all pages) if bioguide_id: + if bioguide_id in inserted_this_run: + continue exists = db.query(BillCosponsor).filter_by( bill_id=bill_id, bioguide_id=bioguide_id ).first() if exists: + inserted_this_run.add(bioguide_id) continue date_str = cs.get("sponsorshipDate") @@ -179,6 +185,8 @@ def fetch_bill_cosponsors(self, bill_id: str): state=cs.get("state"), sponsored_date=sponsored_date, )) + if bioguide_id: + inserted_this_run.add(bioguide_id) inserted += 1 db.commit()