```diff
 import json
-import os
 from collections import defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta

 import scrapy

-from scrapy_splash import SplashRequest
-

 class Covid19CESpider(scrapy.Spider):
-    url = "https://indicadores.integrasus.saude.ce.gov.br/indicadores/indicadores-coronavirus/coronavirus-ceara"
     name = "covid19ce"
-    lua_src = """
-    function main(splash, args)
-        splash:go(args.url)
-        splash:wait(0.5)
-        local data = splash:jsfunc([[
-            function() {
-                var module = {exports:[]};
-                for (var i=0; i < window.webpackJsonp.length; i++) {
-                    try {
-                        console.log(i);
-                        window.webpackJsonp[i][1]['./src/assets/data/data-municipio.json'](module);
-                        break;
-                    }
-                    catch (err) {
-                        console.log('Falha ao buscar arquivo no indice ' + i);
-                    }
-                }
-                return module.exports;
-            }
-        ]])
-        return {
-            data = data()
-        }
-    end
-    """
-
-    custom_settings = {
-        'SPLASH_URL': os.environ.get('SPLASH_URL'),
-        'DOWNLOADER_MIDDLEWARES': {
-            'scrapy_splash.SplashCookiesMiddleware': 723,
-            'scrapy_splash.SplashMiddleware': 725,
-            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
-        },
-        'SPIDER_MIDDLEWARES': {
-            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
-        },
-        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
-        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
-    }
+    base_url = "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus/qtd-por-municipio?data={date}&tipo={type}"

```
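The Splash rendering setup (Lua script, splash middlewares, `SPLASH_URL`) is gone: the spider now queries IntegraSUS's JSON API directly. A minimal sketch of how `base_url` gets filled in per request; the date below is hypothetical, real dates come from the `filtro-data` endpoint:

```python
# Sketch only: the date value is made up for illustration.
base_url = (
    "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus/"
    "qtd-por-municipio?data={date}&tipo={type}"
)
print(base_url.format(date="2020-04-01", type="Confirmado"))
# -> ...qtd-por-municipio?data=2020-04-01&tipo=Confirmado
```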
```diff
     def start_requests(self):
-        args = {"lua_source": self.lua_src}
-        yield SplashRequest(
-            self.url, self.parse, endpoint="execute", args=args
+        yield scrapy.Request(
+            "https://indicadores.integrasus.saude.ce.gov.br/api/coronavirus/filtro-data",
+            self.parse_filter_date,
         )

-    def parse(self, response):
-        script_return = response.body_as_unicode()
-        cases = json.loads(script_return)["data"]
+    def parse_filter_date(self, response):
+        response_json = response.body_as_unicode()
+        filter_date = json.loads(response_json)[0]

-        cities_cases = process_cities(cases, self.url)
-        state_cases = process_state(cities_cases)
+        iter_date = datetime.strptime(filter_date["dataMin"], "%d/%m/%Y").date()
+        end_date = datetime.strptime(filter_date["dataMax"], "%d/%m/%Y").date()

-        all_cases = cities_cases + state_cases
-        all_cases.sort(key=lambda d: d['date'])
+        while iter_date <= end_date:
+            yield scrapy.Request(
+                self.base_url.format(date=iter_date.isoformat(), type="Confirmado"),
+                self.parse_confirmed,
+                meta={"date": iter_date.isoformat()},
+            )

-        for case in all_cases:
-            yield case
+            iter_date += timedelta(days=1)

```
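`parse_filter_date` assumes `filtro-data` returns a JSON array whose first element carries `dataMin`/`dataMax` as `dd/mm/yyyy` strings. The keys and format are taken from the parsing code above; the values in this sketch are invented:

```python
from datetime import datetime, timedelta

# Assumed response shape (values hypothetical): [{"dataMin": ..., "dataMax": ...}]
filter_date = {"dataMin": "15/03/2020", "dataMax": "30/04/2020"}

iter_date = datetime.strptime(filter_date["dataMin"], "%d/%m/%Y").date()
end_date = datetime.strptime(filter_date["dataMax"], "%d/%m/%Y").date()
assert end_date - iter_date == timedelta(days=46)  # one request per day in range
```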
```diff
+    def parse_confirmed(self, response):
+        response_json = response.body_as_unicode()
+        confirmed_cases = json.loads(response_json)

-def process_cities(cases, url):
-    date_cases_map = defaultdict(lambda: defaultdict(dict))
+        fixed_cases = []
+        for case in confirmed_cases:
+            if case["tipo"] != "Positivo":
+                continue

-    for case in cases:
-        date = case["data"]
-        city = case["municipio"]
-        if case["tipo"] == "Confirmado":
-            date_cases_map[date][city]["confirmed"] = case["quantidade"]
-        elif case["tipo"] == "Óbito":
-            date_cases_map[date][city]["deaths"] = case["quantidade"]
-
-    cities_cases = []
-
-    for date, city_map in date_cases_map.items():
-        for city_name, cases in city_map.items():
-            date_obj = datetime.strptime(date, "%d/%m/%Y").date()
-            cities_cases.append(
+            fixed_cases.append(
                 {
-                    "date": date_obj.isoformat(),
-                    "state": "CE",
-                    "city": city_name,
-                    "place_type": "city",
-                    "notified": "",
-                    "confirmed": cases.get("confirmed", ""),
-                    "discarded": "",
-                    "suspect": "",
-                    "deaths": cases.get("deaths", ""),
-                    "notes": "",
-                    "source_url": url,
+                    **case,
+                    "date": response.meta["date"],
+                    "url": response.url,
+                    "confirmed": case["quantidade"],
                 }
             )

-    return cities_cases
+        yield scrapy.Request(
+            self.base_url.format(date=response.meta["date"], type="Óbito"),
+            self.parse_death,
+            meta={"confirmed": fixed_cases, "date": response.meta["date"]},
+        )
+
```
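Note the chaining pattern: each day's `Confirmado` response is filtered, then carried to the matching `Óbito` request via `meta` so the two result sets can be merged per day. Only rows whose `tipo` is `"Positivo"` survive the filter. A sketch of the per-municipality item this assumes; the keys come from the code above, the values are invented:

```python
# Assumed qtd-por-municipio item (values hypothetical):
case = {"municipio": "FORTALEZA", "tipo": "Positivo", "quantidade": 42}

# Rows with any other "tipo" are skipped by the `continue` above.
assert case["tipo"] == "Positivo"
```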
```diff
+    def parse_death(self, response):
+        response_json = response.body_as_unicode()
+        death_cases = json.loads(response_json)
+
+        fixed_cases = []
+        for case in death_cases:
+            if case["tipo"] != "Positivo":
+                continue
+
+            fixed_cases.append(
+                {
+                    **case,
+                    "date": response.meta["date"],
+                    "url": response.url,
+                    "deaths": case["quantidade"],
+                }
+            )
+
+        all_cases = fixed_cases + response.meta["confirmed"]
+        parsed_cases = list(process_cities(all_cases))
+
+        for case in parsed_cases:
+            yield case
+
+        state_case = process_state(parsed_cases)
+        if state_case:
+            yield state_case
+
+
+def process_cities(cases):
+    map_city_case = defaultdict(lambda: {"deaths": 0, "confirmed": 0, "source_url": []})
+
+    for case in cases:
+        municipio = case["municipio"]
+        if "confirmed" in case:
+            map_city_case[municipio]["confirmed"] = case["confirmed"]
+        if "deaths" in case:
+            map_city_case[municipio]["deaths"] = case["deaths"]
+
+        map_city_case[municipio]["source_url"].append(case["url"])
+        map_city_case[municipio].update(
+            date=case["date"], city=case["municipio"].title(),
+        )
+
+    for case in map_city_case.values():
+        yield {
+            "date": case["date"],
+            "state": "CE",
+            "city": case["city"],
+            "place_type": "city",
+            "notified": "",
+            "confirmed": case["confirmed"],
+            "discarded": "",
+            "suspect": "",
+            "deaths": case["deaths"],
+            "notes": "",
+            "source_url": ",".join(case["source_url"]),
+        }

```
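`process_cities` merges each day's confirmed and death counts by municipality and yields one row per city in the flat output schema. A sketch of one output row; the field set is taken from the code, the values are hypothetical:

```python
# Example row yielded by process_cities (values hypothetical):
row = {
    "date": "2020-04-01",
    "state": "CE",
    "city": "Fortaleza",  # case["municipio"].title()
    "place_type": "city",
    "notified": "",
    "confirmed": 42,
    "discarded": "",
    "suspect": "",
    "deaths": 3,
    "notes": "",
    "source_url": "<Confirmado URL>,<Óbito URL>",  # comma-joined request URLs
}
```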
```diff

 def process_state(cities_cases):
-    date_cases_map = {}
-
-    for city_case in cities_cases:
-        date = city_case['date']
-        if date not in date_cases_map:
-            date_cases_map[date] = {
-                "date": date,
-                "state": "CE",
-                "city": '',
-                "place_type": "state",
-                "notified": "",
-                "confirmed": city_case['confirmed'] or 0,
-                "discarded": "",
-                "suspect": "",
-                "deaths": city_case['deaths'] or 0,
-                "notes": "",
-                "source_url": city_case['source_url'],
-            }
-        else:
-            date_cases_map[date]['deaths'] += city_case['deaths'] or 0
-            date_cases_map[date]['confirmed'] += city_case['confirmed'] or 0
-
-    return list(date_cases_map.values())
+    if not cities_cases:
+        return None
+
+    sum_confirmed = sum(case["confirmed"] for case in cities_cases)
+    sum_deaths = sum(case["deaths"] for case in cities_cases)
+
+    city_case = cities_cases[0]
+
+    return {
+        "date": city_case["date"],
+        "state": "CE",
+        "city": "",
+        "place_type": "state",
+        "notified": "",
+        "confirmed": sum_confirmed,
+        "discarded": "",
+        "suspect": "",
+        "deaths": sum_deaths,
+        "notes": "",
+        "source_url": city_case["source_url"],
+    }
```
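`process_state` now simply totals the already-merged city rows for the day, reusing the first row's date and `source_url`. If the module is saved as a standalone file (say `covid19ce.py`, a hypothetical filename), it should run without a project scaffold via `scrapy runspider covid19ce.py -o covid19ce.csv`. One caveat: `response.body_as_unicode()` was deprecated in favor of `response.text` from Scrapy 2.2 on, so this code assumes an older Scrapy release.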