
Commit 3a35fec

Author: Cesar Smaniotto (committed)
Commit message: refactor the Ceará spider ("refatora spider do Ceará")
1 parent 6b9da88 commit 3a35fec

File tree: 1 file changed (+114, -107 lines)

corona_ce_spider.py

Lines changed: 114 additions & 107 deletions
@@ -1,132 +1,139 @@
 import json
-import os
 from collections import defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta

 import scrapy

-from scrapy_splash import SplashRequest
-

 class Covid19CESpider(scrapy.Spider):
-    url = "https://indicadores.integrasus.saude.ce.gov.br/indicadores/indicadores-coronavirus/coronavirus-ceara"
     name = "covid19ce"
-    lua_src = """
-    function main(splash, args)
-        splash:go(args.url)
-        splash:wait(0.5)
-        local data = splash:jsfunc([[
-            function() {
-                var module = {exports:[]};
-                for (var i=0; i < window.webpackJsonp.length; i++) {
-                    try {
-                        console.log(i);
-                        window.webpackJsonp[i][1]['./src/assets/data/data-municipio.json'](module);
-                        break;
-                    }
-                    catch (err) {
-                        console.log('Falha ao buscar arquivo no indice ' + i);
-                    }
-                }
-                return module.exports;
-            }
-        ]])
-        return {
-            data = data()
-        }
-    end
-    """
-
-    custom_settings = {
-        'SPLASH_URL': os.environ.get('SPLASH_URL'),
-        'DOWNLOADER_MIDDLEWARES': {
-            'scrapy_splash.SplashCookiesMiddleware': 723,
-            'scrapy_splash.SplashMiddleware': 725,
-            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
-        },
-        'SPIDER_MIDDLEWARES': {
-            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
-        },
-        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
-        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
-    }
+    base_url = "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus/qtd-por-municipio?data={date}&tipo={type}"

     def start_requests(self):
-        args = {"lua_source": self.lua_src}
-        yield SplashRequest(
-            self.url, self.parse, endpoint="execute", args=args
+        yield scrapy.Request(
+            "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus/filtro-data",
+            self.parse_filter_date,
         )

-    def parse(self, response):
-        script_return = response.body_as_unicode()
-        cases = json.loads(script_return)["data"]
+    def parse_filter_date(self, response):
+        response_json = response.body_as_unicode()
+        filter_date = json.loads(response_json)[0]

-        cities_cases = process_cities(cases, self.url)
-        state_cases = process_state(cities_cases)
+        iter_date = datetime.strptime(filter_date["dataMin"], "%d/%m/%Y").date()
+        end_date = datetime.strptime(filter_date["dataMax"], "%d/%m/%Y").date()

-        all_cases = cities_cases + state_cases
-        all_cases.sort(key=lambda d: d['date'])
+        while iter_date <= end_date:
+            yield scrapy.Request(
+                self.base_url.format(date=iter_date.isoformat(), type="Confirmado"),
+                self.parse_confirmed,
+                meta={"date": iter_date.isoformat()},
+            )

-        for case in all_cases:
-            yield case
+            iter_date += timedelta(days=1)

+    def parse_confirmed(self, response):
+        response_json = response.body_as_unicode()
+        confirmed_cases = json.loads(response_json)

-def process_cities(cases, url):
-    date_cases_map = defaultdict(lambda: defaultdict(dict))
+        fixed_cases = []
+        for case in confirmed_cases:
+            if case["tipo"] != "Positivo":
+                continue

-    for case in cases:
-        date = case["data"]
-        city = case["municipio"]
-        if case["tipo"] == "Confirmado":
-            date_cases_map[date][city]["confirmed"] = case["quantidade"]
-        elif case["tipo"] == "Óbito":
-            date_cases_map[date][city]["deaths"] = case["quantidade"]
-
-    cities_cases = []
-
-    for date, city_map in date_cases_map.items():
-        for city_name, cases in city_map.items():
-            date_obj = datetime.strptime(date, "%d/%m/%Y").date()
-            cities_cases.append(
+            fixed_cases.append(
                 {
-                    "date": date_obj.isoformat(),
-                    "state": "CE",
-                    "city": city_name,
-                    "place_type": "city",
-                    "notified": "",
-                    "confirmed": cases.get("confirmed", ""),
-                    "discarded": "",
-                    "suspect": "",
-                    "deaths": cases.get("deaths", ""),
-                    "notes": "",
-                    "source_url": url,
+                    **case,
+                    "date": response.meta["date"],
+                    "url": response.url,
+                    "confirmed": case["quantidade"],
                 }
             )

-    return cities_cases
+        yield scrapy.Request(
+            self.base_url.format(date=response.meta["date"], type="Óbito"),
+            self.parse_death,
+            meta={"confirmed": fixed_cases, "date": response.meta["date"]},
+        )
+
+    def parse_death(self, response):
+        response_json = response.body_as_unicode()
+        death_cases = json.loads(response_json)
+
+        fixed_cases = []
+        for case in death_cases:
+            if case["tipo"] != "Positivo":
+                continue
+
+            fixed_cases.append(
+                {
+                    **case,
+                    "date": response.meta["date"],
+                    "url": response.url,
+                    "deaths": case["quantidade"],
+                }
+            )
+
+        all_cases = fixed_cases + response.meta["confirmed"]
+        parsed_cases = list(process_cities(all_cases))
+
+        for case in parsed_cases:
+            yield case
+
+        state_case = process_state(parsed_cases)
+        if state_case:
+            yield state_case
+
+
+def process_cities(cases):
+    map_city_case = defaultdict(lambda: {"deaths": 0, "confirmed": 0, "source_url": []})
+
+    for case in cases:
+        municipio = case["municipio"]
+        if "confirmed" in case:
+            map_city_case[municipio]["confirmed"] = case["confirmed"]
+        if "deaths" in case:
+            map_city_case[municipio]["deaths"] = case["deaths"]
+
+        map_city_case[municipio]["source_url"].append(case["url"])
+        map_city_case[municipio].update(
+            date=case["date"], city=case["municipio"].title(),
+        )
+
+    for case in map_city_case.values():
+        yield {
+            "date": case["date"],
+            "state": "CE",
+            "city": case["city"],
+            "place_type": "city",
+            "notified": "",
+            "confirmed": case["confirmed"],
+            "discarded": "",
+            "suspect": "",
+            "deaths": case["deaths"],
+            "notes": "",
+            "source_url": ",".join(case["source_url"]),
+        }


 def process_state(cities_cases):
-    date_cases_map = {}
-
-    for city_case in cities_cases:
-        date = city_case['date']
-        if date not in date_cases_map:
-            date_cases_map[date] = {
-                "date": date,
-                "state": "CE",
-                "city": '',
-                "place_type": "state",
-                "notified": "",
-                "confirmed": city_case['confirmed'] or 0,
-                "discarded": "",
-                "suspect": "",
-                "deaths": city_case['deaths'] or 0,
-                "notes": "",
-                "source_url": city_case['source_url'],
-            }
-        else:
-            date_cases_map[date]['deaths'] += city_case['deaths'] or 0
-            date_cases_map[date]['confirmed'] += city_case['confirmed'] or 0
-
-    return list(date_cases_map.values())
+    if not cities_cases:
+        return None
+
+    sum_confirmed = sum((case["confirmed"] for case in cities_cases))
+    sum_deaths = sum((case["deaths"] for case in cities_cases))
+
+    city_case = cities_cases[0]
+
+    return {
+        "date": city_case["date"],
+        "state": "CE",
+        "city": "",
+        "place_type": "state",
+        "notified": "",
+        "confirmed": sum_confirmed,
+        "discarded": "",
+        "suspect": "",
+        "deaths": sum_deaths,
+        "notes": "",
+        "source_url": city_case["source_url"],
+    }
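For reference, the request flow introduced by this commit can be exercised outside Scrapy. The sketch below is a minimal illustration, not part of the commit: it assumes the two IntegraSUS endpoints used in the diff (filtro-data for the available date range, qtd-por-municipio for per-municipality counts by date and type) respond as the spider expects, and it uses the third-party requests library and hypothetical helper names purely for illustration.

# Minimal sketch (assumptions: endpoints respond as the spider expects;
# "requests" is used instead of Scrapy only to keep the example standalone).
from datetime import datetime, timedelta

import requests

BASE = "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus"


def fetch_counts(date_iso, case_type):
    # case_type is "Confirmado" or "Óbito", the same values the spider requests.
    url = f"{BASE}/qtd-por-municipio?data={date_iso}&tipo={case_type}"
    return requests.get(url).json()


def iter_days():
    # /filtro-data supplies dataMin/dataMax in dd/mm/yyyy, as parse_filter_date assumes.
    filter_date = requests.get(f"{BASE}/filtro-data").json()[0]
    day = datetime.strptime(filter_date["dataMin"], "%d/%m/%Y").date()
    end = datetime.strptime(filter_date["dataMax"], "%d/%m/%Y").date()
    while day <= end:
        yield day.isoformat()
        day += timedelta(days=1)


if __name__ == "__main__":
    for day in iter_days():
        # Keep only "Positivo" rows, mirroring the filter in parse_confirmed/parse_death.
        confirmed = [c for c in fetch_counts(day, "Confirmado") if c["tipo"] == "Positivo"]
        deaths = [c for c in fetch_counts(day, "Óbito") if c["tipo"] == "Positivo"]
        print(day, len(confirmed), len(deaths))

In the spider itself, each day's confirmed and death responses are then grouped per municipality by process_cities and summed into a single state-level record by process_state, as the new code above shows.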
