Skip to content

[Feat] MA spider #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions covid19br/common/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ def add_warning_in_report(
report = self._get_or_create_report(date)
report.add_warning(slug, description)

def add_note_in_report(self, date: datetime.date, note: str):
    """Attach a free-form note to the report for *date* (creating the report if needed)."""
    daily_report = self._get_or_create_report(date)
    daily_report.add_note(note)

def _get_or_create_report(self, date: datetime.date):
report = self.reports.get(date)
if not report:
Expand Down
4 changes: 3 additions & 1 deletion covid19br/common/data_normalization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
CURRENT_YEAR = date.today().year
MONTHS = "jan fev mar abr mai jun jul ago set out nov dez".split()

REGEXP_IN_FULL_DATE = re.compile("([0-9]{1,2})(?: +de)? ([^ ]+)(?: de ([0-9]{4}))?")
REGEXP_IN_FULL_DATE = re.compile(
"([0-9]{1,2})(?: +de)? ([^ ]+)(?: de)?(?: ([0-9]{4}))?"
)
REGEXP_NUMERIC_DATE = re.compile("([0-9]{2})[/-]([0-9]{2})[/-]([0-9]{2,4})[ .]?")


Expand Down
44 changes: 44 additions & 0 deletions covid19br/parsers/MA/maranhao_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import csv

# Column positions in the official MA CSV bulletin (first 3 columns only).
CITY_NAME_CSV_COLUMN = 0
CONFIRMED_CASES_CSV_COLUMN = 1
DEATH_CASES_CSV_COLUMN = 2


class MaranhaoCSVBulletinExtractor:
    """Extracts per-county covid-19 numbers from MA's official CSV bulletin.

    The files are published with inconsistent encodings, so the constructor
    reads as latin-1 first and, when the expected "MUNICÍPIOS" header does
    not show up decoded correctly, falls back to mac_iceland.
    """

    def __init__(self, filename):
        self.file_content = self._read_csv(filename, "latin-1")

        # We actually just need the header, but we are not sure about its
        # position, so we inspect the first cell of the first 10 rows.
        # Blank lines parse as empty rows ([]), so skip them instead of
        # crashing on row[0].
        first_cells = [row[0].strip() for row in self.file_content[:10] if row]
        # If the encoding is wrong, we try another one
        if "MUNICÍPIOS" not in first_cells:
            self.file_content = self._read_csv(filename, "mac_iceland")

    @staticmethod
    def _read_csv(filename, encoding):
        # Materialize the rows eagerly so `data` can be iterated after the
        # file handle is closed.
        with open(filename, encoding=encoding) as fobj:
            return list(csv.reader(fobj, delimiter=";", lineterminator="\n"))

    @property
    def data(self):
        """Yield one dict per data row, values kept as raw strings.

        Rows before the "MUNICÍPIOS" header, blank rows, and rows with an
        empty city cell are skipped. Includes the "TOTAL" summary row.
        """
        # There are blank lines in the beginning of the file. Wait until
        # we find the header before yielding anything.
        header = None
        for raw_row in self.file_content:
            # Consider only non-empty cells of the first 3 columns; pad
            # short rows so the column constants are always valid indexes.
            row = [cell.strip() for cell in raw_row[:3]]
            row += [""] * (3 - len(row))
            if not any(row):
                continue
            elif not header:
                if row[CITY_NAME_CSV_COLUMN] == "MUNICÍPIOS":
                    header = row
                continue

            if row[CITY_NAME_CSV_COLUMN]:  # Ignore blank filler rows
                yield {
                    "municipio": row[CITY_NAME_CSV_COLUMN],
                    "confirmados": row[CONFIRMED_CASES_CSV_COLUMN],
                    "mortes": row[DEATH_CASES_CSV_COLUMN],
                }
52 changes: 52 additions & 0 deletions covid19br/parsers/MA/maranhao_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import re
import rows

from covid19br.common.data_normalization_utils import NormalizationUtils


# Compiled once at import time instead of on every call: a non-empty run of
# digits and dots (e.g. "1.234" thousand separators), and nothing else.
_ONLY_NUMBER_REGEXP = re.compile(r"^([0-9.]+)$")


def is_only_number(value):
    """Return a truthy result when *value* (after stripping) is purely numeric.

    Keeps the original ``findall`` return value (a list) so callers that rely
    on its truthiness are unaffected.
    """
    return _ONLY_NUMBER_REGEXP.findall(value.strip())


class MaranhaoPdfBulletinExtractor:
    """Extracts the bulletin date and the official state totals from MA's PDF.

    Locates numbers by comparing the page coordinates (x0/y0/x1) of text
    objects against the coordinates of their labels ("confirmados"/"óbitos").
    Per the comments below, smaller y0 means higher up on the page.
    """

    def __init__(self, filename):
        # `filename` is whatever the rows PyMuPDF backend accepts (the spider
        # passes an io.BytesIO object).
        self.doc = rows.plugins.pdf.PyMuPDFBackend(filename)
        # Only the first page is needed; take its text objects.
        self.first_page_objs = next(self.doc.text_objects())

    @property
    def date(self):
        # Returns the parsed date, or None when no text object contains
        # "BOLETIM ATUALIZADO".
        for obj in self.first_page_objs:
            if "BOLETIM ATUALIZADO" in obj.text:
                return NormalizationUtils.extract_numeric_date(obj.text)

    @property
    def official_total(self):
        # NOTE(review): next() raises StopIteration if a label is missing —
        # confirm that is the intended failure mode for malformed PDFs.
        confirmed_cases_label = next(
            obj for obj in self.first_page_objs if obj.text.lower() == "confirmados"
        )
        deaths_label = next(
            obj for obj in self.first_page_objs if obj.text.lower() == "óbitos"
        )

        # select the number above and on the left of confirmed_cases_label
        confirmed_cases = next(
            obj
            for obj in self.first_page_objs
            if is_only_number(obj.text)
            and obj.y0 < confirmed_cases_label.y0
            and obj.x0 < confirmed_cases_label.x0
        )
        # select the numbers above and that are in the same column as deaths_label and pick the closest one
        deaths, *_ = sorted(
            [
                obj
                for obj in self.first_page_objs
                if is_only_number(obj.text)
                and obj.y0 < deaths_label.y0
                and obj.x0 < deaths_label.x0
                and obj.x1 > deaths_label.x1
            ],
            key=lambda obj: deaths_label.y0 - obj.y0,
        ) or [None]  # NOTE(review): when no candidate matches, deaths is None
        # and deaths.text below raises AttributeError — confirm acceptable.

        # NOTE(review): the keys here look swapped (the deaths object is stored
        # under "confirmados" and the cases object under "mortes"), but
        # SpiderMA.parse_report_pdf reads them swapped back, so the net result
        # is correct. Rename both sides together, never just one.
        return {"confirmados": deaths.text, "mortes": confirmed_cases.text}
2 changes: 2 additions & 0 deletions covid19br/run_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.spiders.spider_ba import SpiderBA
from covid19br.spiders.spider_ce import SpiderCE
from covid19br.spiders.spider_ma import SpiderMA
from covid19br.spiders.spider_pr import SpiderPR
from covid19br.spiders.spider_sp import SpiderSP
from covid19br.spiders.spider_to import SpiderTO
Expand All @@ -19,6 +20,7 @@
AVAILABLE_SPIDERS = [
SpiderBA,
SpiderCE,
SpiderMA,
SpiderPR,
SpiderSP,
SpiderTO,
Expand Down
133 changes: 133 additions & 0 deletions covid19br/spiders/spider_ma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import io
import re
import scrapy
from collections import defaultdict
from datetime import datetime
import tempfile

from covid19br.common.base_spider import BaseCovid19Spider
from covid19br.common.constants import State, ReportQuality
from covid19br.common.models.bulletin_models import (
CountyBulletinModel,
StateTotalBulletinModel,
)
from covid19br.parsers.MA.maranhao_csv import MaranhaoCSVBulletinExtractor
from covid19br.parsers.MA.maranhao_pdf import MaranhaoPdfBulletinExtractor


class SpiderMA(BaseCovid19Spider):
    """Scrapes covid-19 bulletins for the state of Maranhão (MA).

    County-level numbers come from a "dados gerais em csv" link published on
    the bulletins page; the state totals are also extracted from the PDF
    bulletin for the same date.
    """

    state = State.MA
    name = State.MA.value
    information_delay_in_days = 0
    report_qualities = [ReportQuality.COUNTY_BULLETINS]

    base_url = "https://www.saude.ma.gov.br/boletins-covid-19"

    def pre_init(self):
        # Materialize so the dates can be iterated more than once
        # (start_requests and parse both loop over them).
        self.requested_dates = list(self.requested_dates)

    def start_requests(self):
        # Past years live under "<base_url>-<year>/"; the current year's
        # bulletins are on the unsuffixed page.
        current_year = self.today.year
        requested_years = {date.year for date in self.requested_dates}
        for year in requested_years:
            if year == current_year:
                yield scrapy.Request(self.base_url + "/")
            else:
                yield scrapy.Request(f"{self.base_url}-{year}/")

    def parse(self, response, **kwargs):
        """Collect the csv/pdf links per date and schedule downloads for the
        requested dates."""
        bulletins_per_date = defaultdict(dict)
        divs = response.xpath("//div[@class='wpb_wrapper']//a")
        for div in divs:
            div_text = self.normalizer.remove_accentuation(
                (div.xpath("./text()").get() or "").lower()
            )
            div_url = div.xpath("./@href").get()
            if "dados gerais em csv" in div_text:
                date = self._extract_date_from_csv_name(div_url)
                bulletins_per_date[date]["csv"] = div_url
            elif "boletim epidemiologic" in div_text:
                # "epidemiologic" (no trailing vowel) matches both the -o and
                # -a spellings after accents were stripped above.
                date = self.normalizer.extract_in_full_date(div_text)
                bulletins_per_date[date]["pdf"] = div_url

        for date in self.requested_dates:
            if date in bulletins_per_date:
                urls = bulletins_per_date[date]
                csv_url = urls.get("csv")
                pdf_url = urls.get("pdf")
                if csv_url:
                    yield scrapy.Request(
                        csv_url,
                        callback=self.parse_reports_csv,
                        cb_kwargs={"date": date},
                    )
                if pdf_url:
                    yield scrapy.Request(
                        pdf_url,
                        callback=self.parse_report_pdf,
                        cb_kwargs={"date": date},
                    )

    def parse_reports_csv(self, response, date):
        """Build county bulletins (and the state total) from the CSV body."""
        # The extractor opens the file by name, so spool the response body to
        # a temporary file.
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv") as tmp:
            tmp.write(response.body)
            tmp.flush()  # ensure the extractor sees the whole payload on disk
            extractor = MaranhaoCSVBulletinExtractor(tmp.name)
            for report in extractor.data:
                name = report["municipio"].lower()
                deaths = report["mortes"]
                cases = report["confirmados"]

                if "revisão" in name:
                    # Footnote-style rows (e.g. data under revision) become
                    # report notes instead of bulletins.
                    self.add_note_in_report(date, f"- Nota no csv: {name}")
                elif "total" in name:
                    bulletin = StateTotalBulletinModel(
                        date=date,
                        state=self.state,
                        deaths=deaths,
                        confirmed_cases=cases,
                        source=response.request.url,
                    )
                    self.add_new_bulletin_to_report(bulletin, date)
                else:
                    bulletin = CountyBulletinModel(
                        date=date,
                        state=self.state,
                        city=name,
                        confirmed_cases=cases,
                        deaths=deaths,
                        source=response.request.url,
                    )
                    self.add_new_bulletin_to_report(bulletin, date)

    def parse_report_pdf(self, response, date):
        """Extract the official state total from the PDF bulletin."""
        source = response.request.url
        file = io.BytesIO(response.body)
        extractor = MaranhaoPdfBulletinExtractor(file)

        # Guard against the site linking a PDF for a different day.
        pdf_date = extractor.date
        if pdf_date and pdf_date != date:
            self.logger.warning(
                f"PDF date does not match for pdf {source}. Aborting extraction."
            )
            return

        official_total = extractor.official_total
        # NOTE: MaranhaoPdfBulletinExtractor.official_total stores the death
        # toll under the key "confirmados" and the case count under "mortes"
        # (its labels are swapped), so the assignments below compensate for
        # that. If the extractor is ever fixed, swap these reads too.
        bulletin = StateTotalBulletinModel(
            date=date,
            state=self.state,
            deaths=official_total["confirmados"],
            confirmed_cases=official_total["mortes"],
            source=source,
        )
        self.add_new_bulletin_to_report(bulletin, date)

    @staticmethod
    def _extract_date_from_csv_name(csv_name) -> datetime.date:
        """Parse the bulletin date out of a CSV url.

        The year comes from the "uploads/<year>/..." path segment; day and
        month come from the digits in the file name, which appear either as
        DDMM (4 digits or fewer) or as DDMM followed by extra digits.
        Returns None implicitly when the file name has no digits.
        """
        year = csv_name.split("uploads/")[-1].split("/")[0]
        date_month, *_ = re.findall(r"[0-9]+", csv_name.split("/")[-1]) or [None]
        if date_month:
            if len(date_month) <= 4:
                month, day = date_month[-2:], date_month[-4:-2]
            else:
                month, day = date_month[2:4], date_month[0:2]
            return datetime(int(year), int(month), int(day)).date()
Empty file added tests/__init__.py
Empty file.
Loading