Skip to content

[Feat] MA spider #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions covid19br/common/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ def add_warning_in_report(
report = self._get_or_create_report(date)
report.add_warning(slug, description)

def add_note_in_report(self, date: datetime.date, note: str):
    """Attach a free-form note to the report for *date* (creating the report if needed)."""
    daily_report = self._get_or_create_report(date)
    daily_report.add_note(note)

def _get_or_create_report(self, date: datetime.date):
report = self.reports.get(date)
if not report:
Expand Down
4 changes: 3 additions & 1 deletion covid19br/common/data_normalization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
CURRENT_YEAR = date.today().year
MONTHS = "jan fev mar abr mai jun jul ago set out nov dez".split()

REGEXP_IN_FULL_DATE = re.compile("([0-9]{1,2})(?: +de)? ([^ ]+)(?: de ([0-9]{4}))?")
REGEXP_IN_FULL_DATE = re.compile(
"([0-9]{1,2})(?: +de)? ([^ ]+)(?: de)?(?: ([0-9]{4}))?"
)
REGEXP_NUMERIC_DATE = re.compile("([0-9]{2})[/-]([0-9]{2})[/-]([0-9]{2,4})[ .]?")


Expand Down
44 changes: 44 additions & 0 deletions covid19br/parsers/MA/maranhao_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import csv

# Column positions in the official MA CSV bulletin (first 3 columns only).
CITY_NAME_CSV_COLUMN = 0
CONFIRMED_CASES_CSV_COLUMN = 1
DEATH_CASES_CSV_COLUMN = 2


class MaranhaoCSVBulletinExtractor:
    """Extracts per-county covid-19 numbers from MA's official CSV bulletin.

    The files are published with inconsistent encodings, so the constructor
    reads as latin-1 first and, when the expected "MUNICÍPIOS" header does
    not show up decoded correctly, falls back to mac_iceland.
    """

    def __init__(self, filename):
        self.file_content = self._read_csv(filename, "latin-1")

        # We actually just need the header, but we are not sure about its
        # position, so we inspect the first cell of the first 10 rows.
        # Blank lines parse as empty rows ([]), so skip them instead of
        # crashing on row[0].
        first_cells = [row[0].strip() for row in self.file_content[:10] if row]
        # If the encoding is wrong, we try another one
        if "MUNICÍPIOS" not in first_cells:
            self.file_content = self._read_csv(filename, "mac_iceland")

    @staticmethod
    def _read_csv(filename, encoding):
        # Materialize the rows eagerly so `data` can be iterated after the
        # file handle is closed.
        with open(filename, encoding=encoding) as fobj:
            return list(csv.reader(fobj, delimiter=";", lineterminator="\n"))

    @property
    def data(self):
        """Yield one dict per data row, values kept as raw strings.

        Rows before the "MUNICÍPIOS" header, blank rows, and rows with an
        empty city cell are skipped. Includes the "TOTAL" summary row.
        """
        # There are blank lines in the beginning of the file. Wait until
        # we find the header before yielding anything.
        header = None
        for raw_row in self.file_content:
            # Consider only non-empty cells of the first 3 columns; pad
            # short rows so the column constants are always valid indexes.
            row = [cell.strip() for cell in raw_row[:3]]
            row += [""] * (3 - len(row))
            if not any(row):
                continue
            elif not header:
                if row[CITY_NAME_CSV_COLUMN] == "MUNICÍPIOS":
                    header = row
                continue

            if row[CITY_NAME_CSV_COLUMN]:  # Ignore blank filler rows
                yield {
                    "municipio": row[CITY_NAME_CSV_COLUMN],
                    "confirmados": row[CONFIRMED_CASES_CSV_COLUMN],
                    "mortes": row[DEATH_CASES_CSV_COLUMN],
                }
52 changes: 52 additions & 0 deletions covid19br/parsers/MA/maranhao_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import re
import rows

from covid19br.common.data_normalization_utils import NormalizationUtils


# Compiled once at import time instead of on every call: a non-empty run of
# digits and dots (e.g. "1.234" thousand separators), and nothing else.
_ONLY_NUMBER_REGEXP = re.compile(r"^([0-9.]+)$")


def is_only_number(value):
    """Return a truthy result when *value* (after stripping) is purely numeric.

    Keeps the original ``findall`` return value (a list) so callers that rely
    on its truthiness are unaffected.
    """
    return _ONLY_NUMBER_REGEXP.findall(value.strip())


class MaranhaoPdfBulletinExtractor:
    """Extracts the bulletin date and the official state totals from MA's PDF.

    Locates numbers by comparing the page coordinates (x0/y0/x1) of text
    objects against the coordinates of their labels ("confirmados"/"óbitos").
    Per the comments below, smaller y0 means higher up on the page.
    """

    def __init__(self, filename):
        # `filename` is whatever the rows PyMuPDF backend accepts (the spider
        # passes an io.BytesIO object).
        self.doc = rows.plugins.pdf.PyMuPDFBackend(filename)
        # Only the first page is needed; take its text objects.
        self.first_page_objs = next(self.doc.text_objects())

    @property
    def date(self):
        # Returns the parsed date, or None when no text object contains
        # "BOLETIM ATUALIZADO".
        for obj in self.first_page_objs:
            if "BOLETIM ATUALIZADO" in obj.text:
                return NormalizationUtils.extract_numeric_date(obj.text)

    @property
    def official_total(self):
        # NOTE(review): next() raises StopIteration if a label is missing —
        # confirm that is the intended failure mode for malformed PDFs.
        confirmed_cases_label = next(
            obj for obj in self.first_page_objs if obj.text.lower() == "confirmados"
        )
        deaths_label = next(
            obj for obj in self.first_page_objs if obj.text.lower() == "óbitos"
        )

        # select the number above and on the left of confirmed_cases_label
        confirmed_cases = next(
            obj
            for obj in self.first_page_objs
            if is_only_number(obj.text)
            and obj.y0 < confirmed_cases_label.y0
            and obj.x0 < confirmed_cases_label.x0
        )
        # select the numbers above and that are in the same column as deaths_label and pick the closest one
        deaths, *_ = sorted(
            [
                obj
                for obj in self.first_page_objs
                if is_only_number(obj.text)
                and obj.y0 < deaths_label.y0
                and obj.x0 < deaths_label.x0
                and obj.x1 > deaths_label.x1
            ],
            key=lambda obj: deaths_label.y0 - obj.y0,
        ) or [None]  # NOTE(review): when no candidate matches, deaths is None
        # and deaths.text below raises AttributeError — confirm acceptable.

        # NOTE(review): the keys here look swapped (the deaths object is stored
        # under "confirmados" and the cases object under "mortes"), but
        # SpiderMA.parse_report_pdf reads them swapped back, so the net result
        # is correct. Rename both sides together, never just one.
        return {"confirmados": deaths.text, "mortes": confirmed_cases.text}
2 changes: 2 additions & 0 deletions covid19br/run_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.spiders.spider_ba import SpiderBA
from covid19br.spiders.spider_ce import SpiderCE
from covid19br.spiders.spider_ma import SpiderMA
from covid19br.spiders.spider_pr import SpiderPR
from covid19br.spiders.spider_sp import SpiderSP
from covid19br.spiders.spider_to import SpiderTO
Expand All @@ -19,6 +20,7 @@
AVAILABLE_SPIDERS = [
SpiderBA,
SpiderCE,
SpiderMA,
SpiderPR,
SpiderSP,
SpiderTO,
Expand Down
133 changes: 133 additions & 0 deletions covid19br/spiders/spider_ma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import io
import re
import scrapy
from collections import defaultdict
from datetime import datetime
import tempfile

from covid19br.common.base_spider import BaseCovid19Spider
from covid19br.common.constants import State, ReportQuality
from covid19br.common.models.bulletin_models import (
CountyBulletinModel,
StateTotalBulletinModel,
)
from covid19br.parsers.MA.maranhao_csv import MaranhaoCSVBulletinExtractor
from covid19br.parsers.MA.maranhao_pdf import MaranhaoPdfBulletinExtractor


class SpiderMA(BaseCovid19Spider):
    """Scrapes covid-19 bulletins for the state of Maranhão (MA).

    County-level numbers come from a "dados gerais em csv" link published on
    the bulletins page; the state totals are also extracted from the PDF
    bulletin for the same date.
    """

    state = State.MA
    name = State.MA.value
    information_delay_in_days = 0
    report_qualities = [ReportQuality.COUNTY_BULLETINS]

    base_url = "https://www.saude.ma.gov.br/boletins-covid-19"

    def pre_init(self):
        # Materialize so the dates can be iterated more than once
        # (start_requests and parse both loop over them).
        self.requested_dates = list(self.requested_dates)

    def start_requests(self):
        # Past years live under "<base_url>-<year>/"; the current year's
        # bulletins are on the unsuffixed page.
        current_year = self.today.year
        requested_years = {date.year for date in self.requested_dates}
        for year in requested_years:
            if year == current_year:
                yield scrapy.Request(self.base_url + "/")
            else:
                yield scrapy.Request(f"{self.base_url}-{year}/")

    def parse(self, response, **kwargs):
        """Collect the csv/pdf links per date and schedule downloads for the
        requested dates."""
        bulletins_per_date = defaultdict(dict)
        divs = response.xpath("//div[@class='wpb_wrapper']//a")
        for div in divs:
            div_text = self.normalizer.remove_accentuation(
                (div.xpath("./text()").get() or "").lower()
            )
            div_url = div.xpath("./@href").get()
            if "dados gerais em csv" in div_text:
                date = self._extract_date_from_csv_name(div_url)
                bulletins_per_date[date]["csv"] = div_url
            elif "boletim epidemiologic" in div_text:
                # "epidemiologic" (no trailing vowel) matches both the -o and
                # -a spellings after accents were stripped above.
                date = self.normalizer.extract_in_full_date(div_text)
                bulletins_per_date[date]["pdf"] = div_url

        for date in self.requested_dates:
            if date in bulletins_per_date:
                urls = bulletins_per_date[date]
                csv_url = urls.get("csv")
                pdf_url = urls.get("pdf")
                if csv_url:
                    yield scrapy.Request(
                        csv_url,
                        callback=self.parse_reports_csv,
                        cb_kwargs={"date": date},
                    )
                if pdf_url:
                    yield scrapy.Request(
                        pdf_url,
                        callback=self.parse_report_pdf,
                        cb_kwargs={"date": date},
                    )

    def parse_reports_csv(self, response, date):
        """Build county bulletins (and the state total) from the CSV body."""
        # The extractor opens the file by name, so spool the response body to
        # a temporary file.
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv") as tmp:
            tmp.write(response.body)
            tmp.flush()  # ensure the extractor sees the whole payload on disk
            extractor = MaranhaoCSVBulletinExtractor(tmp.name)
            for report in extractor.data:
                name = report["municipio"].lower()
                deaths = report["mortes"]
                cases = report["confirmados"]

                if "revisão" in name:
                    # Footnote-style rows (e.g. data under revision) become
                    # report notes instead of bulletins.
                    self.add_note_in_report(date, f"- Nota no csv: {name}")
                elif "total" in name:
                    bulletin = StateTotalBulletinModel(
                        date=date,
                        state=self.state,
                        deaths=deaths,
                        confirmed_cases=cases,
                        source=response.request.url,
                    )
                    self.add_new_bulletin_to_report(bulletin, date)
                else:
                    bulletin = CountyBulletinModel(
                        date=date,
                        state=self.state,
                        city=name,
                        confirmed_cases=cases,
                        deaths=deaths,
                        source=response.request.url,
                    )
                    self.add_new_bulletin_to_report(bulletin, date)

    def parse_report_pdf(self, response, date):
        """Extract the official state total from the PDF bulletin."""
        source = response.request.url
        file = io.BytesIO(response.body)
        extractor = MaranhaoPdfBulletinExtractor(file)

        # Guard against the site linking a PDF for a different day.
        pdf_date = extractor.date
        if pdf_date and pdf_date != date:
            self.logger.warning(
                f"PDF date does not match for pdf {source}. Aborting extraction."
            )
            return

        official_total = extractor.official_total
        # NOTE: MaranhaoPdfBulletinExtractor.official_total stores the death
        # toll under the key "confirmados" and the case count under "mortes"
        # (its labels are swapped), so the assignments below compensate for
        # that. If the extractor is ever fixed, swap these reads too.
        bulletin = StateTotalBulletinModel(
            date=date,
            state=self.state,
            deaths=official_total["confirmados"],
            confirmed_cases=official_total["mortes"],
            source=source,
        )
        self.add_new_bulletin_to_report(bulletin, date)

    @staticmethod
    def _extract_date_from_csv_name(csv_name) -> datetime.date:
        """Parse the bulletin date out of a CSV url.

        The year comes from the "uploads/<year>/..." path segment; day and
        month come from the digits in the file name, which appear either as
        DDMM (4 digits or fewer) or as DDMM followed by extra digits.
        Returns None implicitly when the file name has no digits.
        """
        year = csv_name.split("uploads/")[-1].split("/")[0]
        date_month, *_ = re.findall(r"[0-9]+", csv_name.split("/")[-1]) or [None]
        if date_month:
            if len(date_month) <= 4:
                month, day = date_month[-2:], date_month[-4:-2]
            else:
                month, day = date_month[2:4], date_month[0:2]
            return datetime(int(year), int(month), int(day)).date()
Empty file added tests/__init__.py
Empty file.
Loading