     VotePosition,
     VoteResult,
 )
-from .common import BeautifulSoupScraper, RequestCache, ScrapingError
+from .common import BeautifulSoupScraper, NoWorkingUrlError, RequestCache, ScrapingError
 from .helpers import (
     fill_missing_by_reference,
     normalize_name,
@@ -363,6 +363,23 @@ def _url(self) -> str:
         return f"{self.BASE_URL}/PV-{self.term}-{date}-VOT_EN.xml"
 
     def _extract_data(self, doc: BeautifulSoup) -> Iterator[Fragment | None]:
+        root = doc.select_one("file")
+
+        if not root:
+            raise ScrapingError("Missing root element `file` in VOT list")
+
+        # https://github.com/python/typeshed/issues/8755
+        language = cast(str, root["language"]).lower()
+
+        if language != "en":
+            # If an English translation isn’t yet available, requesting the English translation
+            # will return the French original. If a French document is returned, we raise
+            # `NoWorkingUrlError`. Pipelines catching this exception will usually be re-run
+            # later (rather than being marked as permanently failed).
+            raise NoWorkingUrlError(
+                f"Requested the English version of the document, but received language {language}."
+            )
+
         for vote_tag in doc.select("votes vote"):
             # The source data often contains sections with additional information (such as
             # corrections). These are also modeled as "votes" (even though there was no
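For context on the comment above, here is a minimal, hypothetical sketch of how calling code might treat the two exception types differently; the `run_scraper` wrapper and the `scraper.run()` method are illustrative assumptions and not part of this change. The idea is that `NoWorkingUrlError` signals a transient condition (the English translation simply isn’t published yet), so the run can be retried later, while other `ScrapingError`s indicate a permanent failure.

    from .common import NoWorkingUrlError, ScrapingError


    def run_scraper(scraper):
        # Hypothetical wrapper: scraper.run() is assumed here for illustration only.
        try:
            return scraper.run()
        except NoWorkingUrlError:
            # Transient: the English document doesn’t exist yet. Signal "try again later"
            # instead of marking the pipeline run as permanently failed.
            return None
        except ScrapingError:
            # Permanent: re-raise so the pipeline run is marked as failed.
            raise

Because `NoWorkingUrlError` is caught before the more general `ScrapingError`, the transient case is handled even if the former is a subclass of the latter.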