fix: handle within-page cosponsor duplicates from Congress.gov API
Congress.gov occasionally returns the same member twice on a single page with different sponsorship dates (observed: Sen. Warnock on 119-s-1383). The DB uniqueness check didn't catch this because the first insert hadn't been committed yet when the duplicate row was processed, causing a UniqueViolation. The fix adds an `inserted_this_run` set so that bioguide_ids already added in the current fetch loop are skipped.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -140,6 +140,9 @@ def fetch_bill_cosponsors(self, bill_id: str):
|
||||
return {"status": "skipped"}
|
||||
|
||||
known_bioguides = {row[0] for row in db.execute(text("SELECT bioguide_id FROM members")).fetchall()}
|
||||
# Track bioguide_ids already inserted this run to handle within-page dupes
|
||||
# (Congress.gov sometimes lists the same member twice with different dates)
|
||||
inserted_this_run: set[str] = set()
|
||||
inserted = 0
|
||||
offset = 0
|
||||
|
||||
@@ -157,12 +160,15 @@ def fetch_bill_cosponsors(self, bill_id: str):
|
||||
if bioguide_id and bioguide_id not in known_bioguides:
|
||||
bioguide_id = None
|
||||
|
||||
# Skip if we already have this (bioguide_id, bill_id) pair
|
||||
# Skip dupes — both across runs (DB check) and within this page
|
||||
if bioguide_id:
|
||||
if bioguide_id in inserted_this_run:
|
||||
continue
|
||||
exists = db.query(BillCosponsor).filter_by(
|
||||
bill_id=bill_id, bioguide_id=bioguide_id
|
||||
).first()
|
||||
if exists:
|
||||
inserted_this_run.add(bioguide_id)
|
||||
continue
|
||||
|
||||
date_str = cs.get("sponsorshipDate")
|
||||
@@ -179,6 +185,8 @@ def fetch_bill_cosponsors(self, bill_id: str):
|
||||
state=cs.get("state"),
|
||||
sponsored_date=sponsored_date,
|
||||
))
|
||||
if bioguide_id:
|
||||
inserted_this_run.add(bioguide_id)
|
||||
inserted += 1
|
||||
|
||||
db.commit()
|
||||
|
||||
Reference in New Issue
Block a user