
Spider for Rondônia cases #132


Open · wants to merge 6 commits into master
3 changes: 3 additions & 0 deletions web/spiders/__init__.py
@@ -8,6 +8,7 @@
from .spider_pr import Covid19PRSpider
from .spider_rn import Covid19RNSpider
from .spider_rr import Covid19RRSpider
from .spider_ro import Covid19ROSpider


SPIDERS = [
@@ -16,6 +17,7 @@
Covid19PRSpider,
Covid19RRSpider,
Covid19RNSpider,
Covid19ROSpider,
]
STATE_SPIDERS = {SpiderClass.name: SpiderClass for SpiderClass in SPIDERS}
# TODO: do autodiscovery from base class' subclasses
@@ -29,6 +31,7 @@ def execute_spider_worker(SpiderClass):
process.start()
except Exception as exp:
import traceback

return "error", traceback.format_exc()
else:
report_fobj.seek(0)
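The TODO comment above asks for autodiscovery from the base class's subclasses. A minimal sketch of one way to do that, assuming every spider inherits directly from BaseCovid19Spider and its module has already been imported; the discover_spiders helper is hypothetical, not part of this PR:

from .base import BaseCovid19Spider


def discover_spiders():
    # Python tracks the direct subclasses of every class; each spider
    # module must be imported first so its class is registered here.
    return {cls.name: cls for cls in BaseCovid19Spider.__subclasses__()}

This would replace both the hand-maintained SPIDERS list and the STATE_SPIDERS dict built from it.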
6 changes: 3 additions & 3 deletions web/spiders/spider_pa.py
@@ -1,13 +1,13 @@
import io
import json
import os
import scrapy
from urllib.parse import urlencode

from .base import BaseCovid19Spider


class Covid19PASpider(BaseCovid19Spider):
name = "PA"
base_url = "https://www.covid-19.pa.gov.br/monitoramento-corona-service/statuscorona/casos-confirmados-obitos-por-municipio"]
base_url = "https://www.covid-19.pa.gov.br/monitoramento-corona-service/statuscorona/casos-confirmados-obitos-por-municipio"
splash_url = os.environ.get("SPLASH_URL", None)

def start_requests(self):
8 changes: 5 additions & 3 deletions web/spiders/spider_pe.py
@@ -26,11 +26,13 @@ def city_id_from_name(self):
return data

def parse(self, response):
page_jsons = response.xpath("//script[@type='application/json' and @data-for]/text()")
page_jsons = response.xpath(
"//script[@type='application/json' and @data-for]/text()"
)
case_data = None
for json_data in page_jsons.extract():
data = json.loads(json_data)["x"]
-if data['options'].get('buttons'):
+if data["options"].get("buttons"):
continue
case_data = data["data"]
break
@@ -71,7 +73,7 @@ def parse(self, response):
def fix_row(self, row):
new = row.copy()
cd_municipio = new["cd_municipio"]
-if cd_municipio == '-':
+if cd_municipio == "-":
cd_municipio = 0

if int(cd_municipio) == 0 or not new["cd_municipio"]:
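For context on the XPath above: the PE page embeds its data in script tags of type application/json that carry a data-for attribute. A standalone sketch of the extraction, using an invented HTML snippet (the real page's attribute values and payload differ):

import json

from scrapy.http import HtmlResponse

# Invented snippet mimicking the shape the spider expects.
html = b"""
<script type="application/json" data-for="widget-1">
{"x": {"options": {}, "data": [["Recife", 10, 1]]}}
</script>
"""
response = HtmlResponse(url="https://example.com", body=html, encoding="utf-8")
page_jsons = response.xpath(
    "//script[@type='application/json' and @data-for]/text()"
)
for json_data in page_jsons.extract():
    data = json.loads(json_data)["x"]
    print(data["data"])  # [['Recife', 10, 1]]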
52 changes: 52 additions & 0 deletions web/spiders/spider_ro.py
@@ -0,0 +1,52 @@
import json
import scrapy

from datetime import date
from .base import BaseCovid19Spider


class Covid19ROSpider(BaseCovid19Spider):
name = "RO"
start_urls = ["http://covid19.sesau.ro.gov.br"]

def parse(self, response):
# extract the displayed date (there's no option to change the date to a past one)
date_container = response.xpath(
"//*[text()='calendar_today']/../text()"
).extract()
report_date = [t.strip() for t in date_container if t.strip()][0]

# get the URL for the JS file with the data
js_script = response.xpath(
"//attribute::*[contains(., 'estadoRO')]/../@src"
).extract()[0]
full_url = response.url + js_script

# report_date is "dd/mm/yyyy"; reversing the split yields year, month, day
year, month, day = [int(v) for v in report_date.split("/")[::-1]]
self.add_report(date=date(year, month, day), url=full_url)

yield scrapy.Request(
url=full_url,
meta={"row": {"date": date}},
callback=self.parse_js_data_script,
)

def parse_js_data_script(self, response):
"""
The JS code only defines a variable called 'cidades' with the required JSON data to the other
JS codes work with. This parsing function cleans up the JS file to get only the JSON content.
"""
json_data = response.body_as_unicode().replace("var cidades = ", "").strip()
content = json.loads(json_data)

total_confirmed, total_deaths = 0, 0
for data in [d["properties"] for d in content["features"]]:
city, confirmed, deaths = data["NOME"], data["confirmados"], data["obitos"]
total_confirmed += confirmed
total_deaths += deaths

self.add_city_case(city=city, confirmed=confirmed, deaths=deaths)

# TODO: we might have to change this in the future once this data is available
self.add_city_case(city="Importados/Indefinidos", confirmed=None, deaths=None)
Owner:

I would add a TODO here just to flag that, in the future, we might need to change these None values if they start publishing the numbers.

Collaborator (Author):

Done

self.add_state_case(confirmed=total_confirmed, deaths=total_deaths)
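To make the cleanup in parse_js_data_script concrete, a standalone sketch with a made-up payload; the GeoJSON-like shape (features entries whose properties carry NOME, confirmados and obitos) is inferred from the parsing code above:

import json

# Made-up payload in the shape parse_js_data_script expects.
js_body = """var cidades = {
    "features": [
        {"properties": {"NOME": "Porto Velho", "confirmados": 5, "obitos": 1}},
        {"properties": {"NOME": "Ji-Parana", "confirmados": 2, "obitos": 0}}
    ]
}"""

# Same cleanup as the spider: drop the JS assignment, keep the JSON literal.
content = json.loads(js_body.replace("var cidades = ", "").strip())

total_confirmed = sum(f["properties"]["confirmados"] for f in content["features"])
total_deaths = sum(f["properties"]["obitos"] for f in content["features"])
print(total_confirmed, total_deaths)  # 7 1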