diff --git a/pipelines/br_rj_riodejaneiro_brt_gps/constants.py b/pipelines/br_rj_riodejaneiro_brt_gps/constants.py new file mode 100644 index 000000000..c94c21f50 --- /dev/null +++ b/pipelines/br_rj_riodejaneiro_brt_gps/constants.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +""" +Valores constantes para pipelines br_rj_riodejaneiro_brt_gps +""" + +from enum import Enum + + +class constants(Enum): # pylint: disable=c0103 + """ + Valores constantes para pipelines br_rj_riodejaneiro_brt_gps + """ + + GPS_BRT_RAW_DATASET_ID = "br_rj_riodejaneiro_brt_gps" + GPS_BRT_RAW_TABLE_ID = "registros" + GPS_BRT_DATASET_ID = "br_rj_riodejaneiro_veiculos" + GPS_BRT_TREATED_TABLE_ID = "gps_brt" + GPS_BRT_MATERIALIZE_DELAY_HOURS = 0 + GPS_BRT_API_URL = "https://zn4.m2mcontrol.com.br/api/integracao/veiculos" + GPS_BRT_API_SECRET_PATH = "brt_api_v2" + + GPS_BRT_MAPPING_KEYS = { + "codigo": "id_veiculo", + "linha": "servico", + "latitude": "latitude", + "longitude": "longitude", + "dataHora": "timestamp_gps", + "velocidade": "velocidade", + "sentido": "sentido", + "trajeto": "vista", + # "inicio_viagem": "timestamp_inicio_viagem", + } diff --git a/pipelines/br_rj_riodejaneiro_brt_gps/flows.py b/pipelines/br_rj_riodejaneiro_brt_gps/flows.py index b9ebf67a3..808c75e5f 100644 --- a/pipelines/br_rj_riodejaneiro_brt_gps/flows.py +++ b/pipelines/br_rj_riodejaneiro_brt_gps/flows.py @@ -22,12 +22,12 @@ # isort: on # SMTR Imports # +from pipelines.br_rj_riodejaneiro_brt_gps.constants import constants as gps_constants from pipelines.br_rj_riodejaneiro_brt_gps.tasks import ( pre_treatment_br_rj_riodejaneiro_brt_gps, ) -from pipelines.constants import constants from pipelines.schedules import every_hour, every_minute -from pipelines.tasks import ( # get_local_dbt_client,; setup_task, +from pipelines.utils.backup.tasks import ( # get_local_dbt_client,; setup_task, bq_upload, create_date_hour_partition, create_local_partition_path, @@ -58,10 +58,10 @@ ) # Get default parameters # - raw_dataset_id = Parameter("raw_dataset_id", default=constants.GPS_BRT_RAW_DATASET_ID.value) - raw_table_id = Parameter("raw_table_id", default=constants.GPS_BRT_RAW_TABLE_ID.value) - dataset_id = Parameter("dataset_id", default=constants.GPS_BRT_DATASET_ID.value) - table_id = Parameter("table_id", default=constants.GPS_BRT_TREATED_TABLE_ID.value) + raw_dataset_id = Parameter("raw_dataset_id", default=gps_constants.GPS_BRT_RAW_DATASET_ID.value) + raw_table_id = Parameter("raw_table_id", default=gps_constants.GPS_BRT_RAW_TABLE_ID.value) + dataset_id = Parameter("dataset_id", default=gps_constants.GPS_BRT_DATASET_ID.value) + table_id = Parameter("table_id", default=gps_constants.GPS_BRT_TREATED_TABLE_ID.value) rebuild = Parameter("rebuild", False) LABELS = get_current_flow_labels() @@ -80,7 +80,7 @@ raw_table_id=raw_table_id, table_run_datetime_column_name="timestamp_gps", mode=MODE, - delay_hours=constants.GPS_BRT_MATERIALIZE_DELAY_HOURS.value, + delay_hours=gps_constants.GPS_BRT_MATERIALIZE_DELAY_HOURS.value, ) dataset_sha = fetch_dataset_sha( dataset_id=dataset_id, @@ -144,16 +144,16 @@ filename = parse_timestamp_to_string(timestamp) filepath = create_local_partition_path( - dataset_id=constants.GPS_BRT_RAW_DATASET_ID.value, - table_id=constants.GPS_BRT_RAW_TABLE_ID.value, + dataset_id=gps_constants.GPS_BRT_RAW_DATASET_ID.value, + table_id=gps_constants.GPS_BRT_RAW_TABLE_ID.value, filename=filename, partitions=partitions, ) # EXTRACT raw_status = get_raw( - url=constants.GPS_BRT_API_URL.value, - headers=constants.GPS_BRT_API_SECRET_PATH.value, + 
url=gps_constants.GPS_BRT_API_URL.value, + headers=gps_constants.GPS_BRT_API_SECRET_PATH.value, ) raw_filepath = save_raw_local(status=raw_status, file_path=filepath) @@ -165,16 +165,16 @@ treated_filepath = save_treated_local(status=treated_status, file_path=filepath) # LOAD error = bq_upload( - dataset_id=constants.GPS_BRT_RAW_DATASET_ID.value, - table_id=constants.GPS_BRT_RAW_TABLE_ID.value, + dataset_id=gps_constants.GPS_BRT_RAW_DATASET_ID.value, + table_id=gps_constants.GPS_BRT_RAW_TABLE_ID.value, filepath=treated_filepath, raw_filepath=raw_filepath, partitions=partitions, status=treated_status, ) upload_logs_to_bq( - dataset_id=constants.GPS_BRT_RAW_DATASET_ID.value, - parent_table_id=constants.GPS_BRT_RAW_TABLE_ID.value, + dataset_id=gps_constants.GPS_BRT_RAW_DATASET_ID.value, + parent_table_id=gps_constants.GPS_BRT_RAW_TABLE_ID.value, timestamp=timestamp, error=error, ) diff --git a/pipelines/br_rj_riodejaneiro_brt_gps/tasks.py b/pipelines/br_rj_riodejaneiro_brt_gps/tasks.py index 7f04a30cd..26db9cc21 100644 --- a/pipelines/br_rj_riodejaneiro_brt_gps/tasks.py +++ b/pipelines/br_rj_riodejaneiro_brt_gps/tasks.py @@ -20,7 +20,10 @@ # SMTR Imports # from pipelines.constants import constants -from pipelines.utils.utils import log_critical, map_dict_keys +from pipelines.utils.backup.utils import log_critical, map_dict_keys + + +from pipelines.br_rj_riodejaneiro_brt_gps.constants import constants as gps_constants # Tasks # @@ -58,7 +61,9 @@ def pre_treatment_br_rj_riodejaneiro_brt_gps(status: dict, timestamp): df = pd.DataFrame(columns=columns) # pylint: disable=c0103 # map_dict_keys change data keys to match project data structure - df["content"] = [map_dict_keys(piece, constants.GPS_BRT_MAPPING_KEYS.value) for piece in data] + df["content"] = [ + map_dict_keys(piece, gps_constants.GPS_BRT_MAPPING_KEYS.value) for piece in data + ] df[key_column] = [piece[key_column] for piece in data] df["timestamp_gps"] = [piece["timestamp_gps"] for piece in data] df["timestamp_captura"] = timestamp diff --git a/pipelines/capture/__init__.py b/pipelines/capture/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/capture/jae/__init__.py b/pipelines/capture/jae/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/capture/jae/constants.py b/pipelines/capture/jae/constants.py new file mode 100644 index 000000000..949a64627 --- /dev/null +++ b/pipelines/capture/jae/constants.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +""" +Valores constantes para captura de dados da JAE +""" + +from enum import Enum + +from pipelines.utils.incremental_capture_strategy import ( + DatetimeIncremental, + IDIncremental, +) + + +class constants(Enum): + """ + Valores constantes para captura de dados da JAE + """ + + JAE_SOURCE_NAME = "jae" + + JAE_DATABASES = { + "principal_db": { + "engine": "mysql", + "host": "10.5.114.121", + }, + "tarifa_db": { + "engine": "postgresql", + "host": "10.5.113.254", + }, + "transacao_db": { + "engine": "postgresql", + "host": "10.5.115.1", + }, + "tracking_db": { + "engine": "postgresql", + "host": "10.5.15.25", + }, + "ressarcimento_db": { + "engine": "postgresql", + "host": "10.5.15.127", + }, + "gratuidade_db": { + "engine": "postgresql", + "host": "10.5.12.107", + }, + } + + JAE_PRIVATE_BUCKET = {"dev": "br-rj-smtr-jae-private-dev", "prod": "br-rj-smtr-jae-private-dev"} + + JAE_RAW_FILETYPE = "json" + + TRANSACAO_DEFAULT_PARAMS = { + "table_id": "transacao", + "raw_filetype": JAE_RAW_FILETYPE, + "incremental_capture_strategy": 
DatetimeIncremental( + max_incremental_window={"hours": 3}, first_value="2024-02-15 00:00:00" + ).to_dict(), + "data_extractor_params": { + "database": "transacao_db", + "query": """ + SELECT + * + FROM + transacao + WHERE + data_processamento > '{{ start }}' + AND data_processamento <= '{{ end }}' + """, + }, + "primary_keys": ["id"], + } + + GPS_VALIDADOR_CAPTURE_PARAMS = { + "table_id": "gps_validador", + "raw_filetype": JAE_RAW_FILETYPE, + "incremental_capture_strategy": IDIncremental( + max_incremental_window=100_000, + id_column_name="id", + first_value=406_064_585, + ).to_dict(), + "data_extractor_params": { + "database": "tracking_db", + "query": """ + SELECT + * + FROM + tracking_detalhe + WHERE + id > {{ start }} AND id <= {{ end }} + """, + "page_size": 1000, + "max_pages": 100, + }, + "primary_keys": ["id"], + } + + AUXILIAR_GENERAL_CAPTURE_PARAMS = { + "incremental_capture_strategy": DatetimeIncremental( + max_incremental_window={"hours": 5} + ).to_dict(), + "raw_filetype": JAE_RAW_FILETYPE, + } + + AUXILIAR_TABLE_CAPTURE_PARAMS = [ + { + "table_id": "linha", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + LINHA + {% if is_incremental() %} + WHERE + DT_INCLUSAO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["CD_LINHA"], + }, + { + "table_id": "operadora_transporte", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + OPERADORA_TRANSPORTE + {% if is_incremental() %} + WHERE + DT_INCLUSAO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["CD_OPERADORA_TRANSPORTE"], + }, + { + "table_id": "cliente", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + c.* + FROM + CLIENTE c + {% if is_incremental() %} + WHERE + DT_CADASTRO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["CD_CLIENTE"], + "save_bucket_names": JAE_PRIVATE_BUCKET, + }, + { + "table_id": "pessoa_fisica", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + p.*, + c.DT_CADASTRO + FROM + PESSOA_FISICA p + JOIN + CLIENTE c + ON + p.CD_CLIENTE = c.CD_CLIENTE + {% if is_incremental() %} + WHERE + c.DT_CADASTRO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["CD_CLIENTE"], + "save_bucket_names": JAE_PRIVATE_BUCKET, + }, + { + "table_id": "gratuidade", + "data_extractor_params": { + "database": "gratuidade_db", + "query": """ + SELECT + g.*, + t.descricao AS tipo_gratuidade + FROM + gratuidade g + LEFT JOIN + tipo_gratuidade t + ON + g.id_tipo_gratuidade = t.id + {% if is_incremental() %} + WHERE + g.data_inclusao BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["id"], + "save_bucket_names": JAE_PRIVATE_BUCKET, + }, + { + "table_id": "consorcio", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + CONSORCIO + {% if is_incremental() %} + WHERE + DT_INCLUSAO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["CD_CONSORCIO"], + }, + { + "table_id": "percentual_rateio_integracao", + "data_extractor_params": { + "database": "ressarcimento_db", + "query": """ + SELECT + * + FROM + percentual_rateio_integracao + {% if is_incremental() %} + WHERE + dt_inclusao BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": ["id"], + }, + { + "table_id": "conta_bancaria", + "data_extractor_params": { + 
"database": "principal_db", + "query": """ + SELECT + c.*, + b.NM_BANCO + FROM + CONTA_BANCARIA c + JOIN + BANCO b + ON + b.NR_BANCO = c.NR_BANCO + JOIN + OPERADORA_TRANSPORTE o + ON + o.CD_CLIENTE = c.CD_CLIENTE + WHERE + {{ update }} + """, + "get_updates": [ + "c.cd_cliente", + "c.cd_agencia", + "c.cd_tipo_conta", + "c.nr_banco", + "c.nr_conta", + ], + }, + "primary_keys": ["CD_CLIENTE"], + "save_bucket_names": JAE_PRIVATE_BUCKET, + }, + { + "table_id": "contato_pessoa_juridica", + "data_extractor_params": { + "database": "principal_db", + "query": """ + SELECT + * + FROM + CONTATO_PESSOA_JURIDICA + {% if is_incremental() %} + WHERE + DT_INCLUSAO BETWEEN '{{ start }}' + AND '{{ end }}' + {% endif %} + """, + }, + "primary_keys": [ + "NR_SEQ_CONTATO", + "CD_CLIENTE", + ], + "save_bucket_names": JAE_PRIVATE_BUCKET, + }, + ] diff --git a/pipelines/capture/jae/flows.py b/pipelines/capture/jae/flows.py new file mode 100644 index 000000000..0045b5879 --- /dev/null +++ b/pipelines/capture/jae/flows.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +"""Capture flows for Jae""" +from datetime import timedelta + +from pipelines.capture.jae.constants import constants +from pipelines.capture.jae.tasks import create_extractor_jae +from pipelines.capture.templates.flows import create_default_capture_flow +from pipelines.constants import constants as smtr_constants +from pipelines.schedules import generate_interval_schedule + +# Transação + +JAE_TRANSACAO_CAPTURE = create_default_capture_flow( + flow_name="Jaé Transação - Captura", + source_name=constants.JAE_SOURCE_NAME.value, + partition_date_only=False, + create_extractor_task=create_extractor_jae, + overwrite_flow_params=constants.TRANSACAO_DEFAULT_PARAMS.value, + agent_label=smtr_constants.RJ_SMTR_AGENT_LABEL.value, +) + +JAE_TRANSACAO_CAPTURE.schedule = generate_interval_schedule( + interval=timedelta(minutes=5), + agent_label=smtr_constants.RJ_SMTR_AGENT_LABEL.value, +) + +# GPS Validador + +JAE_GPS_VALIDADOR_CAPTURE = create_default_capture_flow( + flow_name="Jaé GPS Validador - Captura", + source_name=constants.JAE_SOURCE_NAME.value, + partition_date_only=False, + create_extractor_task=create_extractor_jae, + overwrite_flow_params=constants.GPS_VALIDADOR_CAPTURE_PARAMS.value, + agent_label=smtr_constants.RJ_SMTR_AGENT_LABEL.value, +) + +JAE_GPS_VALIDADOR_CAPTURE.schedule = generate_interval_schedule( + interval=timedelta(minutes=5), + agent_label=smtr_constants.RJ_SMTR_AGENT_LABEL.value, +) + +# Tabelas Auxiliares + +JAE_AUXILIAR_CAPTURE = create_default_capture_flow( + flow_name="Jaé Auxiliar - Captura (subflow)", + source_name=constants.JAE_SOURCE_NAME.value, + partition_date_only=True, + create_extractor_task=create_extractor_jae, + overwrite_flow_params=constants.AUXILIAR_GENERAL_CAPTURE_PARAMS.value, + agent_label=smtr_constants.RJ_SMTR_AGENT_LABEL.value, + skip_if_running=False, +) diff --git a/pipelines/capture/jae/tasks.py b/pipelines/capture/jae/tasks.py new file mode 100644 index 000000000..8aa2baa65 --- /dev/null +++ b/pipelines/capture/jae/tasks.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +"""Tasks for pipelines.capture.jae""" +from datetime import datetime +from typing import Union + +from pipelines.capture.jae.constants import constants as jae_constants +from pipelines.utils.capture.db import DBExtractor, PaginatedDBExtractor +from pipelines.utils.incremental_capture_strategy import IncrementalInfo +from pipelines.utils.jinja import render_template +from pipelines.utils.prefect import extractor_task +from pipelines.utils.secret 
import get_secret +from pipelines.utils.utils import create_sql_update_filter + + +@extractor_task +def create_extractor_jae( + env: str, + dataset_id: str, + table_id: str, + save_filepath: str, + data_extractor_params: dict, + incremental_info: IncrementalInfo, +) -> Union[DBExtractor, PaginatedDBExtractor]: + """Cria o extrator de dados para capturas da JAE""" + credentials = get_secret("smtr_jae_access_data") + database = data_extractor_params["database"] + database_details = jae_constants.JAE_DATABASES.value[database] + + start = incremental_info.start_value + end = incremental_info.end_value + + if isinstance(start, datetime): + start = start.strftime("%Y-%m-%d %H:%M:%S") + + if isinstance(end, datetime): + end = end.strftime("%Y-%m-%d %H:%M:%S") + + template_variables = { + "start": start, + "end": end, + } + + if "get_updates" in data_extractor_params.keys(): + template_variables["update"] = create_sql_update_filter( + env=env, + dataset_id=dataset_id, + table_id=table_id, + columns_to_search=data_extractor_params["get_updates"], + ) + + query = render_template( + template_string=data_extractor_params["query"], + execution_mode=incremental_info.execution_mode, + _vars=template_variables, + ) + + extractor_general_args = { + "query": query, + "engine": database_details["engine"], + "host": database_details["host"], + "user": credentials["user"], + "password": credentials["password"], + "database": database, + "save_filepath": save_filepath, + } + + if table_id == jae_constants.GPS_VALIDADOR_CAPTURE_PARAMS.value["table_id"]: + return PaginatedDBExtractor( + page_size=data_extractor_params["page_size"], + max_pages=data_extractor_params["max_pages"], + **extractor_general_args, + ) + + return DBExtractor(**extractor_general_args) diff --git a/pipelines/capture/templates/__init__.py b/pipelines/capture/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/capture/templates/flows.py b/pipelines/capture/templates/flows.py new file mode 100644 index 000000000..341982e4f --- /dev/null +++ b/pipelines/capture/templates/flows.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime +from types import NoneType +from typing import Callable + +import pandas as pd +from prefect import Parameter +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS +from prefect.tasks.core.function import FunctionTask +from prefeitura_rio.pipelines_utils.custom import Flow +from prefeitura_rio.pipelines_utils.state_handlers import ( + handler_inject_bd_credentials, + handler_skip_if_running, +) + +from pipelines.capture.templates.tasks import ( + create_incremental_strategy, + create_table_object, + get_raw_data, + rename_capture_flow, + save_incremental_redis, + transform_raw_to_nested_structure, + upload_raw_file_to_gcs, + upload_source_data_to_gcs, +) +from pipelines.constants import constants +from pipelines.tasks import get_current_timestamp, get_run_env +from pipelines.utils.prefect import TypedParameter + +# from pipelines.utils.pretreatment import strip_string_columns + + +def create_default_capture_flow( + flow_name: str, + source_name: str, + partition_date_only: bool, + create_extractor_task: FunctionTask, + overwrite_flow_params: dict, + agent_label: str, + pretreat_funcs: list[Callable[[pd.DataFrame, datetime, list], pd.DataFrame]] = None, + skip_if_running=True, +): # pylint: disable=R0914, R0913 + """ + Cria um flow de captura + + Args: + flow_name (str): O nome do flow + source_name (str): Nome da fonte do dado 
(exemplo: jae) + partition_date_only (bool): True se o particionamento deve ser feito apenas por data + False se o particionamento deve ser feito por data e hora + create_extractor_task (FunctionTask): + A task que cria o DataExtractor + Pode receber os argumentos: + env (str): dev ou prod + source_name (str): O nome do source + table_id (str): table_id no BigQuery + save_filepath (str): O caminho para salvar o arquivo raw localmente + data_extractor_params (dict): Dicionario com parametros personalizados + incremental_info (IncrementalInfo): Objeto contendo informações sobre + a execução incremental + Deve retornar uma classe derivada de DataExtractor + overwrite_optional_flow_params (dict): Dicionário para substituir + o valor padrão dos parâmetros opcionais do flow + agent_label (str): Label do flow + pretreat_funcs (list[Callable[[pd.DataFrame, datetime, list], pd.DataFrame]], optional): + Lista de funções de pre-tratamento para serem executadas antes de aninhar os dados + A função pode receber os argumentos: + data (pd.DataFrame): O DataFrame para ser tratado + timestamp (datetime): A timestamp do flow + primary_key (list): A lista de primary keys + Deve retornar um DataFrame + + Returns: + Flow: The capture flow + """ + + if pretreat_funcs is None: + pretreat_funcs = [] + + with Flow(flow_name) as capture_flow: + # Parâmetros Gerais # + + # table_id no BigQuery + table_id = TypedParameter( + name="table_id", + default=overwrite_flow_params.get("table_id"), + accepted_types=str, + ) + # Tipo do arquivo raw (json, csv...) + raw_filetype = TypedParameter( + name="raw_filetype", + default=overwrite_flow_params.get("raw_filetype"), + accepted_types=str, + ) + + # Parâmetros Incremental # + + # Dicionário para gerar o objeto de estratégia incremental + # Modo de usar: + # Instancie o objeto da estrategia escolhida e chame o metodo to_dict() + # ex.: DatetimeIncremental(max_incremental_window={"hours": 3}).to_dict() + incremental_capture_strategy = TypedParameter( + name="incremental_capture_strategy", + default=overwrite_flow_params.get("incremental_capture_strategy"), + accepted_types=(dict, NoneType), + ) + # Valor inicial de captura para sobrescrever o padrão + # valor inicial padrão = valor do salvo no Redis + # para incrementais do tipo datetime, o valor deve ser uma string + # de data no formato iso (timezone padrão = UTC) + # para incrementais de id deve ser um inteiro + incremental_start_value = TypedParameter( + name="incremental_start_value", + default=overwrite_flow_params.get("incremental_start_value"), + accepted_types=(str, int, NoneType), + ) + # Valor final de captura para sobrescrever o padrão + # valor final padrão = valor inicial + max_incremental_window + # para incrementais do tipo datetime, o valor deve ser uma string + # de data no formato iso (timezone padrão = UTC) + # para incrementais de id deve ser um inteiro + incremental_end_value = TypedParameter( + name="incremental_end_value", + default=overwrite_flow_params.get("incremental_end_value"), + accepted_types=(str, int, NoneType), + ) + + # Parâmetros para Captura # + + # Dicionário com valores personalizados para serem acessados na task + # passada no argumento create_extractor_task + data_extractor_params = Parameter( + "data_extractor_params", + default=overwrite_flow_params.get("data_extractor_params"), + ) + + # Parâmetros para Pré-tratamento # + + # Lista de primary keys da tabela + primary_keys = TypedParameter( + name="primary_keys", + default=overwrite_flow_params.get("primary_keys"), + 
accepted_types=(list, NoneType), + ) + # Dicionário com argumentos para serem passados na função de ler os dados raw: + # pd.read_csv ou pd.read_json + pretreatment_reader_args = TypedParameter( + name="pretreatment_reader_args", + default=overwrite_flow_params.get("pretreatment_reader_args"), + accepted_types=(dict, NoneType), + ) + + # Parâmetros para Carregamento de Dados # + + # Nome do bucket para salvar os dados + # Se for None, salva no bucket padrão do ambiente atual + save_bucket_names = TypedParameter( + name="save_bucket_names", + default=overwrite_flow_params.get("save_bucket_names"), + accepted_types=(dict, NoneType), + ) + + # Preparar execução # + + timestamp = get_current_timestamp() + dataset_id = source_name + "_source" + + env = get_run_env() + + table = create_table_object( + env=env, + dataset_id=dataset_id, + table_id=table_id, + bucket_names=save_bucket_names, + timestamp=timestamp, + partition_date_only=partition_date_only, + raw_filetype=raw_filetype, + ) + + incremental_capture_strategy = create_incremental_strategy( + strategy_dict=incremental_capture_strategy, + table=table, + overwrite_start_value=incremental_start_value, + overwrite_end_value=incremental_end_value, + ) + + incremental_info = incremental_capture_strategy["incremental_info"] + + rename_flow_run = rename_capture_flow( + dataset_id=dataset_id, + table_id=table_id, + timestamp=timestamp, + incremental_info=incremental_info, + ) + + # Extração # + + data_extractor = create_extractor_task( + env=env, + dataset_id=dataset_id, + table_id=table_id, + save_filepath=table["raw_filepath"], + data_extractor_params=data_extractor_params, + incremental_info=incremental_info, + ) + + data_extractor.set_upstream(rename_flow_run) + + get_raw = get_raw_data(data_extractor=data_extractor) + + upload_raw_gcs = upload_raw_file_to_gcs(table=table, upstream_tasks=[get_raw]) + + # Pré-tratamento # + + pretreatment = transform_raw_to_nested_structure( + pretreat_funcs=pretreat_funcs, + raw_filepath=table["raw_filepath"], + source_filepath=table["source_filepath"], + timestamp=timestamp, + primary_keys=primary_keys, + print_inputs=save_bucket_names.is_equal(None), + reader_args=pretreatment_reader_args, + upstream_tasks=[get_raw], + ) + + upload_source_gcs = upload_source_data_to_gcs(table=table, upstream_tasks=[pretreatment]) + + # Finalizar Flow # + + save_incremental_redis( + incremental_capture_strategy=incremental_capture_strategy, + upstream_tasks=[upload_source_gcs, upload_raw_gcs], + ) + + capture_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) + capture_flow.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[agent_label], + ) + capture_flow.state_handlers = [ + handler_inject_bd_credentials, + ] + + if skip_if_running: + capture_flow.state_handlers.append(handler_skip_if_running) + + return capture_flow diff --git a/pipelines/capture/templates/tasks.py b/pipelines/capture/templates/tasks.py new file mode 100644 index 000000000..ee995a20e --- /dev/null +++ b/pipelines/capture/templates/tasks.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- +""" +Tasks for rj_smtr +""" +from datetime import datetime, timedelta +from typing import Any, Callable, Union + +import pandas as pd +from prefect import task +from prefeitura_rio.pipelines_utils.logging import log +from pytz import timezone + +from pipelines.constants import constants +from pipelines.utils.capture.base import DataExtractor +from pipelines.utils.fs import read_raw_data, save_local_file +from pipelines.utils.gcp import BQTable +from 
pipelines.utils.incremental_capture_strategy import ( + IncrementalCaptureStrategy, + IncrementalInfo, + incremental_strategy_from_dict, +) +from pipelines.utils.prefect import flow_is_running_local, rename_current_flow_run +from pipelines.utils.pretreatment import transform_to_nested_structure +from pipelines.utils.utils import create_timestamp_captura, data_info_str + +############################ +# Flow Configuration Tasks # +############################ + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def create_table_object( + env: str, + dataset_id: str, + table_id: str, + bucket_names: Union[None, dict], + timestamp: datetime, + partition_date_only: bool, + raw_filetype: str, +) -> BQTable: + """ + Cria um objeto de tabela para interagir com o BigQuery + Creates basedosdados Table object + + Args: + env (str): dev ou prod, + dataset_id (str): dataset_id no BigQuery, + table_id (str): table_id no BigQuery, + bucket_name (Union[None, str]): Nome do bucket com os dados da tabela no GCS, + se for None, usa o bucket padrão do ambiente + timestamp (datetime): timestamp gerado pela execução do flow, + partition_date_only (bool): True se o particionamento deve ser feito apenas por data + False se o particionamento deve ser feito por data e hora, + raw_filetype (str): Tipo do arquivo raw (json, csv...), + + Returns: + BQTable: Objeto para manipular a tabela no BigQuery + """ + + return BQTable( + env=env, + dataset_id=dataset_id, + table_id=table_id, + bucket_names=bucket_names, + timestamp=timestamp, + partition_date_only=partition_date_only, + raw_filetype=raw_filetype, + ) + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def rename_capture_flow( + dataset_id: str, + table_id: str, + timestamp: datetime, + incremental_info: IncrementalInfo, +) -> bool: + """ + Renomeia a run atual do Flow de captura com o formato: + [ | ] .: from to + + Returns: + bool: Se o flow foi renomeado + """ + name = f"[{timestamp.astimezone(tz=timezone(constants.TIMEZONE.value))} | \ +{incremental_info.execution_mode.upper()}] {dataset_id}.{table_id}: from \ +{incremental_info.start_value} to {incremental_info.end_value}" + return rename_current_flow_run(name=name) + + +##################### +# Raw Capture Tasks # +##################### + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def get_raw_data(data_extractor: DataExtractor): + """ + Faz a extração dos dados raw e salva localmente + + Args: + data_extractor (DataExtractor): Extrator de dados a ser executado + """ + data_extractor.extract() + data_extractor.save_raw_local() + + +################ +# Upload Tasks # +################ + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def upload_raw_file_to_gcs(table: BQTable): + """ + Sobe o arquivo raw para o GCS + + Args: + table (BQTable): Objeto de tabela para BigQuery + """ + table.upload_raw_file() + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def upload_source_data_to_gcs(table: BQTable): + """ + Sobe os dados aninhados e o log do Flow para a pasta source do GCS + + Args: + table (BQTable): Objeto de tabela para BigQuery + """ + + if not table.exists(): + log("Staging Table does not exist, creating table...") + table.create() + else: + log("Staging 
Table already exists, appending to it...") + table.append() + + +###################### +# Pretreatment Tasks # +###################### + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def transform_raw_to_nested_structure( + pretreat_funcs: list[Callable[[pd.DataFrame, datetime, list], pd.DataFrame]], + raw_filepath: str, + source_filepath: str, + timestamp: datetime, + primary_keys: Union[list, str], + print_inputs: bool, + reader_args: dict, +): + """ + Task para aplicar pre-tratamentos e transformar os dados para o formato aninhado + + Args: + pretreat_funcs (list[Callable[[pd.DataFrame, datetime, list], pd.DataFrame]]): + Lista de funções para serem executadas antes de aninhar os dados + A função pode receber os argumentos: + data (pd.DataFrame): O DataFrame a ser tratado + timestamp (datetime): A timestamp da execução do Flow + primary_keys (list): Lista de primary keys da tabela + Deve retornar um DataFrame + raw_filepath (str): Caminho para ler os dados raw + source_filepath (str): Caminho para salvar os dados tratados + timestamp (datetime): A timestamp da execução do Flow + primary_keys (list): Lista de primary keys da tabela + print_inputs (bool): Se a task deve exibir os dados lidos no log ou não + reader_args (dict): Dicionário de argumentos para serem passados no leitor de dados raw + (pd.read_json ou pd.read_csv) + """ + data = read_raw_data(filepath=raw_filepath, reader_args=reader_args) + + if print_inputs: + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) + + if data.empty: + log("Empty dataframe, skipping transformation...") + data = pd.DataFrame() + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") + + for step in pretreat_funcs: + log(f"Starting treatment step: {step.__name__}...") + data = step(data=data, timestamp=timestamp, primary_keys=primary_keys) + log(f"Step {step.__name__} finished") + + log("Creating nested structure...", level="info") + + data = transform_to_nested_structure(data=data, primary_keys=primary_keys) + + timestamp = create_timestamp_captura(timestamp=timestamp) + data["timestamp_captura"] = timestamp + log(f"timestamp column = {timestamp}", level="info") + + log( + f"Finished nested structure! 
Data:\n{data_info_str(data)}", + level="info", + ) + + save_local_file(filepath=source_filepath, data=data) + log(f"Data saved in {source_filepath}") + + +##################### +# Incremental Tasks # +##################### + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def create_incremental_strategy( + strategy_dict: Union[None, dict], + table: BQTable, + overwrite_start_value: Any, + overwrite_end_value: Any, +) -> Union[dict, IncrementalCaptureStrategy]: + """ + Cria a estratégia de captura incremental + + Args: + strategy_dict (Union[None, dict]): dicionario retornado pelo + método .to_dict() do objeto de IncrementalCaptureStrategy + table (BQTable): Objeto de tabela para BigQuery + overwrite_start_value: Valor para substituir o inicial manualmente + overwrite_end_value: Valor para substituir o final manualmente + + Returns: + Union[dict, IncrementalCaptureStrategy]: Se strategy_dict for None, retorna um Dicionário + contendo um objeto IncrementalInfo com os valores de start e end sendo + overwrite_start_value e overwrite_end_value respectivamente + e execution_mode full + Se houver valor no argumento strategy_dict, retorna um objeto IncrementalCaptureStrategy + de acordo com as especificações descritas no dicionário + """ + if strategy_dict: + incremental_strategy = incremental_strategy_from_dict(strategy_dict=strategy_dict) + incremental_strategy.initialize( + table=table, + overwrite_start_value=overwrite_start_value, + overwrite_end_value=overwrite_end_value, + ) + + log( + f"""Incremental Strategy created: + Mode: {incremental_strategy.incremental_info.execution_mode} + Start Value: {incremental_strategy.incremental_info.start_value} + End Value: {incremental_strategy.incremental_info.end_value} + """ + ) + + return incremental_strategy + + log( + f"""Empty incremental: + Mode: {constants.MODE_FULL.value} + Start Value: {overwrite_start_value} + End Value: {overwrite_end_value} + """ + ) + return { + "incremental_info": IncrementalInfo( + start_value=overwrite_start_value, + end_value=overwrite_end_value, + execution_mode=constants.MODE_FULL.value, + ) + } + + +@task( + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def save_incremental_redis( + incremental_capture_strategy: Union[dict, IncrementalCaptureStrategy], +): + """ + Salva o último valor incremental capturado no Redis + + + Args: + incremental_capture_strategy: Union[dict, IncrementalCaptureStrategy]: Objeto de estratégia + de captura incremental. 
apenas salva no Redis se for do tipo IncrementalCaptureStrategy + """ + is_local_run = flow_is_running_local() + if isinstance(incremental_capture_strategy, IncrementalCaptureStrategy) and not is_local_run: + incremental_capture_strategy.save_on_redis() + else: + log( + f"""Save on Redis skipped: + incremental_capture_strategy type: {type(incremental_capture_strategy)} + flow is running local: {is_local_run} + """ + ) diff --git a/pipelines/constants.py b/pipelines/constants.py index 1aba4eff3..fdc543f5a 100644 --- a/pipelines/constants.py +++ b/pipelines/constants.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Constant values for the rj_smtr projects +Valores constantes gerais para pipelines da rj-smtr """ from enum import Enum @@ -8,7 +8,7 @@ class constants(Enum): # pylint: disable=c0103 """ - Constant values for the rj_smtr projects + Valores constantes gerais para pipelines da rj-smtr """ # CONFIGS # @@ -16,6 +16,11 @@ class constants(Enum): # pylint: disable=c0103 DOCKER_IMAGE_NAME = "AUTO_REPLACE_DOCKER_IMAGE" DOCKER_IMAGE = f"{DOCKER_IMAGE_NAME}:{DOCKER_TAG}" GCS_FLOWS_BUCKET = "datario-public" + # PROJECT_NAME = {"dev": "rj-smtr-dev", "prod": "rj-smtr"} + # DEFAULT_BUCKET_NAME = {"dev": "br-rj-smtr-dev", "prod": "br-rj-smtr"} + PROJECT_NAME = {"dev": "rj-smtr-dev", "prod": "rj-smtr-dev"} + DEFAULT_BUCKET_NAME = {"dev": "br-rj-smtr-dev", "prod": "br-rj-smtr-dev"} + FILE_MAX_SIZE = 20_000 # AGENT LABELS # RJ_SMTR_AGENT_LABEL = "rj-smtr" @@ -32,540 +37,509 @@ class constants(Enum): # pylint: disable=c0103 MAX_RETRIES = 3 RETRY_DELAY = 10 + # REDIS DEFAULT KEYS # + REDIS_LAST_CAPTURED_VALUE_KEY = "last_captured_value" + + # PATTERNS # + FILENAME_PATTERN = "%Y-%m-%d-%H-%M-%S" + SOURCE_DATASET_ID_PATTERN = "{source_name}_source" + MODE_FULL = "full" + MODE_INCR = "incr" + FLOW_RUN_URL_PATTERN = "https://pipelines.dados.rio/smtr/flow-run/{run_id}" + # GPS STPL # - GPS_STPL_API_BASE_URL = "http://zn4.m2mcontrol.com.br/api/integracao/veiculos" - GPS_STPL_API_SECRET_PATH = "stpl_api" - - GPS_STPL_DATASET_ID = "br_rj_riodejaneiro_veiculos" - GPS_STPL_RAW_DATASET_ID = "br_rj_riodejaneiro_stpl_gps" - GPS_STPL_RAW_TABLE_ID = "registros" - GPS_STPL_TREATED_TABLE_ID = "gps_stpl" - - # GPS SPPO # - GPS_SPPO_API_BASE_URL = ( - "http://ccomobility.com.br/WebServices/Binder/WSConecta/EnvioInformacoesIplan?" - ) - GPS_SPPO_API_BASE_URL_V2 = "http://ccomobility.com.br/WebServices/Binder/wsconecta/EnvioIplan?" 
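# --- Illustrative sketch (assumption, not part of the original diff) ----------------
# REDIS_LAST_CAPTURED_VALUE_KEY and save_incremental_redis (added above) implement a
# watermark pattern: the last captured value is read from Redis to open the next
# capture window, and it is only advanced after both GCS uploads succeed (see the
# upstream_tasks wiring in pipelines/capture/templates/flows.py). The snippet below is
# an assumed, self-contained version of that cycle using redis-py directly; the real
# logic lives in pipelines.utils.incremental_capture_strategy, which is not in this diff.
from datetime import datetime, timedelta

import redis

REDIS_LAST_CAPTURED_VALUE_KEY = "last_captured_value"


def get_capture_window(
    client: redis.Redis, redis_key: str, max_window: timedelta, first_value: str
) -> tuple:
    """Returns (start, end), bounded by max_window, starting at the stored watermark."""
    stored = client.hget(redis_key, REDIS_LAST_CAPTURED_VALUE_KEY)
    start = datetime.fromisoformat(stored.decode() if stored else first_value)
    end = min(start + max_window, datetime.utcnow())
    return start, end


def save_last_captured_value(client: redis.Redis, redis_key: str, end: datetime) -> None:
    """Persists the new watermark; called only after raw and source uploads succeed."""
    client.hset(redis_key, REDIS_LAST_CAPTURED_VALUE_KEY, end.isoformat())
# -------------------------------------------------------------------------------------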
- GPS_SPPO_API_SECRET_PATH = "sppo_api" - GPS_SPPO_API_SECRET_PATH_V2 = "sppo_api_v2" - - GPS_SPPO_RAW_DATASET_ID = "br_rj_riodejaneiro_onibus_gps" - GPS_SPPO_RAW_TABLE_ID = "registros" - GPS_SPPO_DATASET_ID = "br_rj_riodejaneiro_veiculos" - GPS_SPPO_TREATED_TABLE_ID = "gps_sppo" - GPS_SPPO_CAPTURE_DELAY_V1 = 1 - GPS_SPPO_CAPTURE_DELAY_V2 = 60 - GPS_SPPO_RECAPTURE_DELAY_V2 = 6 - GPS_SPPO_MATERIALIZE_DELAY_HOURS = 1 - - # REALOCAÇÃO # - GPS_SPPO_REALOCACAO_RAW_TABLE_ID = "realocacao" - GPS_SPPO_REALOCACAO_TREATED_TABLE_ID = "realocacao" - GPS_SPPO_REALOCACAO_SECRET_PATH = "realocacao_api" - - # GPS BRT # - GPS_BRT_API_SECRET_PATH = "brt_api_v2" - GPS_BRT_API_URL = "https://zn4.m2mcontrol.com.br/api/integracao/veiculos" - GPS_BRT_DATASET_ID = "br_rj_riodejaneiro_veiculos" - GPS_BRT_RAW_DATASET_ID = "br_rj_riodejaneiro_brt_gps" - GPS_BRT_RAW_TABLE_ID = "registros" - GPS_BRT_TREATED_TABLE_ID = "gps_brt" - GPS_BRT_MAPPING_KEYS = { - "codigo": "id_veiculo", - "linha": "servico", - "latitude": "latitude", - "longitude": "longitude", - "dataHora": "timestamp_gps", - "velocidade": "velocidade", - "sentido": "sentido", - "trajeto": "vista", - # "inicio_viagem": "timestamp_inicio_viagem", - } - GPS_BRT_MATERIALIZE_DELAY_HOURS = 0 - - # SIGMOB (GTFS) # - SIGMOB_GET_REQUESTS_TIMEOUT = 60 - SIGMOB_PAGES_FOR_CSV_FILE = 10 - TASK_MAX_RETRIES = 3 - TASK_RETRY_DELAY = 10 - - SIGMOB_DATASET_ID = "br_rj_riodejaneiro_sigmob" - SIGMOB_ENDPOINTS = { - "agency": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_agency.rule?sys=MOB", - "key_column": "agency_id", - }, - "calendar": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_calendar.rule?sys=MOB", - "key_column": "service_id", - }, - "frota_determinada": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_frota_determinada.rule?sys=MOB", - "key_column": "route_id", - }, - "holidays": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_holiday.rule?sys=MOB", - "key_column": "Data", - }, - "linhas": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_linhas.rule?sys=MOB", - "key_column": "linha_id", - }, - "routes": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_routes.rule?sys=MOB", - "key_column": "route_id", - }, - "shapes": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_shapes.rule?sys=MOB&INDICE=0", - "key_column": "shape_id", - }, - "stops": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_stops.rule?sys=MOB&INDICE=0", - "key_column": "stop_id", - }, - "stop_times": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_stop_times.rule?sys=MOB", - "key_column": "stop_id", - }, - "stop_details": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_stops_details.rule?sys=MOB&INDICE=0", - "key_column": "stop_id", - }, - "trips": { - "url": "http://jeap.rio.rj.gov.br/MOB/get_trips.rule?sys=MOB", - "key_column": "trip_id", - }, - } - - # RDO/RHO - RDO_FTP_ALLOWED_PATHS = ["SPPO", "STPL"] - RDO_FTPS_SECRET_PATH = "smtr_rdo_ftps" - RDO_DATASET_ID = "br_rj_riodejaneiro_rdo" + # GPS_STPL_API_BASE_URL = "http://zn4.m2mcontrol.com.br/api/integracao/veiculos" + # GPS_STPL_API_SECRET_PATH = "stpl_api" + + # GPS_STPL_DATASET_ID = "br_rj_riodejaneiro_veiculos" + # GPS_STPL_RAW_DATASET_ID = "br_rj_riodejaneiro_stpl_gps" + # GPS_STPL_RAW_TABLE_ID = "registros" + # GPS_STPL_TREATED_TABLE_ID = "gps_stpl" + + # # GPS SPPO # + # GPS_SPPO_API_BASE_URL = ( + # "http://ccomobility.com.br/WebServices/Binder/WSConecta/EnvioInformacoesIplan?" + # ) + # GPS_SPPO_API_BASE_URL_V2 = "http://ccomobility.com.br/WebServices/Binder/wsconecta + # /EnvioIplan?" 
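# --- Illustrative sketch (assumption, not part of the original diff) ----------------
# GPS_BRT_MAPPING_KEYS, moved earlier in this diff into
# pipelines/br_rj_riodejaneiro_brt_gps/constants.py, is consumed by map_dict_keys
# (now imported from pipelines.utils.backup.utils, whose source is not shown here).
# The stand-in below shows one plausible behaviour implied by that mapping: renaming
# the API field names to the project's column names.
def map_dict_keys_sketch(record: dict, mapping: dict) -> dict:
    """Returns a copy of `record` with keys renamed according to `mapping`."""
    return {new: record[old] for old, new in mapping.items() if old in record}


api_record = {"codigo": "B27123", "linha": "22", "dataHora": "2024-02-15 10:00:00", "velocidade": 40}
mapping = {"codigo": "id_veiculo", "linha": "servico", "dataHora": "timestamp_gps", "velocidade": "velocidade"}
print(map_dict_keys_sketch(api_record, mapping))
# {'id_veiculo': 'B27123', 'servico': '22', 'timestamp_gps': '2024-02-15 10:00:00', 'velocidade': 40}
# -------------------------------------------------------------------------------------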
+ # GPS_SPPO_API_SECRET_PATH = "sppo_api" + # GPS_SPPO_API_SECRET_PATH_V2 = "sppo_api_v2" + + # GPS_SPPO_RAW_DATASET_ID = "br_rj_riodejaneiro_onibus_gps" + # GPS_SPPO_RAW_TABLE_ID = "registros" + # GPS_SPPO_DATASET_ID = "br_rj_riodejaneiro_veiculos" + # GPS_SPPO_TREATED_TABLE_ID = "gps_sppo" + # GPS_SPPO_CAPTURE_DELAY_V1 = 1 + # GPS_SPPO_CAPTURE_DELAY_V2 = 60 + # GPS_SPPO_RECAPTURE_DELAY_V2 = 6 + # GPS_SPPO_MATERIALIZE_DELAY_HOURS = 1 + + # # REALOCAÇÃO # + # GPS_SPPO_REALOCACAO_RAW_TABLE_ID = "realocacao" + # GPS_SPPO_REALOCACAO_TREATED_TABLE_ID = "realocacao" + # GPS_SPPO_REALOCACAO_SECRET_PATH = "realocacao_api" + + # # GPS BRT # + # GPS_BRT_API_SECRET_PATH = "brt_api_v2" + # GPS_BRT_API_URL = "https://zn4.m2mcontrol.com.br/api/integracao/veiculos" + # GPS_BRT_DATASET_ID = "br_rj_riodejaneiro_veiculos" + # GPS_BRT_RAW_DATASET_ID = "br_rj_riodejaneiro_brt_gps" + # GPS_BRT_RAW_TABLE_ID = "registros" + # GPS_BRT_TREATED_TABLE_ID = "gps_brt" + # GPS_BRT_MAPPING_KEYS = { + # "codigo": "id_veiculo", + # "linha": "servico", + # "latitude": "latitude", + # "longitude": "longitude", + # "dataHora": "timestamp_gps", + # "velocidade": "velocidade", + # "sentido": "sentido", + # "trajeto": "vista", + # # "inicio_viagem": "timestamp_inicio_viagem", + # } + # GPS_BRT_MATERIALIZE_DELAY_HOURS = 0 + + # # SIGMOB (GTFS) # + # SIGMOB_GET_REQUESTS_TIMEOUT = 60 + # SIGMOB_PAGES_FOR_CSV_FILE = 10 + # TASK_MAX_RETRIES = 3 + # TASK_RETRY_DELAY = 10 + + # SIGMOB_DATASET_ID = "br_rj_riodejaneiro_sigmob" + # SIGMOB_ENDPOINTS = { + # "agency": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_agency.rule?sys=MOB", + # "key_column": "agency_id", + # }, + # "calendar": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_calendar.rule?sys=MOB", + # "key_column": "service_id", + # }, + # "frota_determinada": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_frota_determinada.rule?sys=MOB", + # "key_column": "route_id", + # }, + # "holidays": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_holiday.rule?sys=MOB", + # "key_column": "Data", + # }, + # "linhas": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_linhas.rule?sys=MOB", + # "key_column": "linha_id", + # }, + # "routes": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_routes.rule?sys=MOB", + # "key_column": "route_id", + # }, + # "shapes": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_shapes.rule?sys=MOB&INDICE=0", + # "key_column": "shape_id", + # }, + # "stops": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_stops.rule?sys=MOB&INDICE=0", + # "key_column": "stop_id", + # }, + # "stop_times": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_stop_times.rule?sys=MOB", + # "key_column": "stop_id", + # }, + # "stop_details": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_stops_details.rule?sys=MOB&INDICE=0", + # "key_column": "stop_id", + # }, + # "trips": { + # "url": "http://jeap.rio.rj.gov.br/MOB/get_trips.rule?sys=MOB", + # "key_column": "trip_id", + # }, + # } + + # # RDO/RHO + # RDO_FTP_ALLOWED_PATHS = ["SPPO", "STPL"] + # RDO_FTPS_SECRET_PATH = "smtr_rdo_ftps" + # RDO_DATASET_ID = "br_rj_riodejaneiro_rdo" SPPO_RDO_TABLE_ID = "rdo_registros_sppo" SPPO_RHO_TABLE_ID = "rho_registros_sppo" STPL_RDO_TABLE_ID = "rdo_registros_stpl" STPL_RHO_TABLE_ID = "rho_registros_stpl" - RDO_MATERIALIZE_START_DATE = "2022-12-07" - # ROCK IN RIO - RIR_DATASET_ID = "dashboards" - RIR_TABLE_ID = "registros_ocr_rir" - RIR_START_DATE = "2022-08-30 12:00:00" - RIR_SECRET_PATH = "smtr_rir_ftp" - RIR_OCR_PRIMARY_COLUMNS = { - "CodCET": "codigo_cet", - "Placa": "placa", - "UF": "uf", - 
"LOCAL": "local", - "datahora": "datahora", - } - RIR_OCR_SECONDARY_COLUMNS = { - "RiR": "flag_rir", - "Apoio": "flag_apoio", - } - - # SUBSÍDIO - SUBSIDIO_SPPO_DATASET_ID = "projeto_subsidio_sppo" - SUBSIDIO_SPPO_TABLE_ID = "viagem_completa" - - # SUBSÍDIO DASHBOARD - SUBSIDIO_SPPO_DASHBOARD_DATASET_ID = "dashboard_subsidio_sppo" - SUBSIDIO_SPPO_DASHBOARD_TABLE_ID = "sumario_servico_dia" - - # BILHETAGEM - BILHETAGEM_DATASET_ID = "br_rj_riodejaneiro_bilhetagem" - - BILHETAGEM_GENERAL_CAPTURE_PARAMS = { - "databases": { - "principal_db": { - "engine": "mysql", - "host": "10.5.114.121", - }, - "tarifa_db": { - "engine": "postgresql", - "host": "10.5.113.254", - }, - "transacao_db": { - "engine": "postgresql", - "host": "10.5.115.1", - }, - "tracking_db": { - "engine": "postgresql", - "host": "10.5.15.25", - }, - "ressarcimento_db": { - "engine": "postgresql", - "host": "10.5.15.127", - }, - }, - "source_type": "db", - } - - BILHETAGEM_TRANSACAO_CAPTURE_PARAMS = { - "table_id": "transacao", - "partition_date_only": False, - "extract_params": { - "database": "transacao_db", - "query": """ - SELECT - * - FROM - transacao - WHERE - data_processamento BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["id"], - "interval_minutes": 1, - } - - BILHETAGEM_TRACKING_CAPTURE_PARAMS = { - "table_id": "gps_validador", - "partition_date_only": False, - "extract_params": { - "database": "tracking_db", - "query": """ - SELECT - * - FROM - tracking_detalhe - WHERE - data_tracking BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["id"], - "interval_minutes": 1, - } - - BILHETAGEM_ORDEM_PAGAMENTO_CAPTURE_PARAMS = [ - { - "table_id": "ordem_ressarcimento", - "partition_date_only": True, - "extract_params": { - "database": "ressarcimento_db", - "query": """ - SELECT - * - FROM - ordem_ressarcimento - WHERE - data_inclusao BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["id"], - "interval_minutes": 1440, - }, - { - "table_id": "ordem_pagamento", - "partition_date_only": True, - "extract_params": { - "database": "ressarcimento_db", - "query": """ - SELECT - * - FROM - ordem_pagamento - WHERE - data_inclusao BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["id"], - "interval_minutes": 1440, - }, - ] - - BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" - - BILHETAGEM_TRATAMENTO_INTERVAL = 60 - - BILHETAGEM_CAPTURE_PARAMS = [ - { - "table_id": "linha", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_LINHA"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "grupo", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_GRUPO"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "grupo_linha", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - GRUPO_LINHA - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_GRUPO", "CD_LINHA"], - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "matriz_integracao", - "partition_date_only": True, - "extract_params": { - "database": "tarifa_db", - "query": """ - SELECT - * - FROM - 
matriz_integracao - WHERE - dt_inclusao BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": [ - "cd_versao_matriz", - "cd_integracao", - ], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "operadora_transporte", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - OPERADORA_TRANSPORTE - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_OPERADORA_TRANSPORTE"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "pessoa_juridica", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - PESSOA_JURIDICA - """, - }, - "primary_key": ["CD_CLIENTE"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "consorcio", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - CONSORCIO - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_CONSORCIO"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - { - "table_id": "linha_consorcio", - "partition_date_only": True, - "extract_params": { - "database": "principal_db", - "query": """ - SELECT - * - FROM - LINHA_CONSORCIO - WHERE - DT_INCLUSAO BETWEEN '{start}' - AND '{end}' - """, - }, - "primary_key": ["CD_CONSORCIO", "CD_LINHA"], # id column to nest data on - "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, - }, - ] - - BILHETAGEM_MATERIALIZACAO_TRANSACAO_PARAMS = { - "dataset_id": BILHETAGEM_DATASET_ID, - "table_id": BILHETAGEM_TRANSACAO_CAPTURE_PARAMS["table_id"], - "upstream": True, - "dbt_vars": { - "date_range": { - "table_run_datetime_column_name": "datetime_transacao", - "delay_hours": 1, - }, - "version": {}, - }, - } - - BILHETAGEM_MATERIALIZACAO_ORDEM_PAGAMENTO_PARAMS = { - "dataset_id": BILHETAGEM_DATASET_ID, - "table_id": "ordem_pagamento", - "upstream": True, - "exclude": f"+{BILHETAGEM_MATERIALIZACAO_TRANSACAO_PARAMS['table_id']}", - "dbt_vars": { - "date_range": { - "table_run_datetime_column_name": "data_ordem", - "delay_hours": 0, - }, - "version": {}, - }, - } - - BILHETAGEM_GENERAL_CAPTURE_DEFAULT_PARAMS = { - "dataset_id": BILHETAGEM_DATASET_ID, - "secret_path": BILHETAGEM_SECRET_PATH, - "source_type": BILHETAGEM_GENERAL_CAPTURE_PARAMS["source_type"], - } + # RDO_MATERIALIZE_START_DATE = "2022-12-07" + # # ROCK IN RIO + # RIR_DATASET_ID = "dashboards" + # RIR_TABLE_ID = "registros_ocr_rir" + # RIR_START_DATE = "2022-08-30 12:00:00" + # RIR_SECRET_PATH = "smtr_rir_ftp" + # RIR_OCR_PRIMARY_COLUMNS = { + # "CodCET": "codigo_cet", + # "Placa": "placa", + # "UF": "uf", + # "LOCAL": "local", + # "datahora": "datahora", + # } + # RIR_OCR_SECONDARY_COLUMNS = { + # "RiR": "flag_rir", + # "Apoio": "flag_apoio", + # } + + # # SUBSÍDIO + # SUBSIDIO_SPPO_DATASET_ID = "projeto_subsidio_sppo" + # SUBSIDIO_SPPO_TABLE_ID = "viagem_completa" + + # # SUBSÍDIO DASHBOARD + # SUBSIDIO_SPPO_DASHBOARD_DATASET_ID = "dashboard_subsidio_sppo" + # SUBSIDIO_SPPO_DASHBOARD_TABLE_ID = "sumario_servico_dia" + BILHETAGEM_DATASET_ID = "bilhetagem" + CADASTRO_DATASET_ID = "cadastro" + + # CAPTURA # + + # JAE + + # BILHETAGEM_TRACKING_CAPTURE_PARAMS = { + # "table_id": "gps_validador", + # "partition_date_only": False, + # "extract_params": { + # "database": "tracking_db", + # "query": """ + # SELECT 
+ # * + # FROM + # tracking_detalhe + # WHERE + # data_tracking BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["id"], + # "interval_minutes": 1, + # } + + # BILHETAGEM_ORDEM_PAGAMENTO_CAPTURE_PARAMS = [ + # { + # "table_id": "ordem_ressarcimento", + # "partition_date_only": True, + # "extract_params": { + # "database": "ressarcimento_db", + # "query": """ + # SELECT + # * + # FROM + # ordem_ressarcimento + # WHERE + # data_inclusao BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["id"], + # "interval_minutes": 1440, + # }, + # { + # "table_id": "ordem_pagamento", + # "partition_date_only": True, + # "extract_params": { + # "database": "ressarcimento_db", + # "query": """ + # SELECT + # * + # FROM + # ordem_pagamento + # WHERE + # data_inclusao BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["id"], + # "interval_minutes": 1440, + # }, + # ] + + # BILHETAGEM_SECRET_PATH = "smtr_jae_access_data" + + # BILHETAGEM_TRATAMENTO_INTERVAL = 60 + + # BILHETAGEM_CAPTURE_PARAMS = [ + # { + # "table_id": "linha", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # LINHA + # WHERE + # DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_LINHA"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "grupo", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # GRUPO + # WHERE + # DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_GRUPO"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "grupo_linha", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # GRUPO_LINHA + # WHERE + # DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_GRUPO", "CD_LINHA"], + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "matriz_integracao", + # "partition_date_only": True, + # "extract_params": { + # "database": "tarifa_db", + # "query": """ + # SELECT + # * + # FROM + # matriz_integracao + # WHERE + # dt_inclusao BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": [ + # "cd_versao_matriz", + # "cd_integracao", + # ], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "operadora_transporte", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # OPERADORA_TRANSPORTE + # WHERE + # DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_OPERADORA_TRANSPORTE"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "pessoa_juridica", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # PESSOA_JURIDICA + # """, + # }, + # "primary_key": ["CD_CLIENTE"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "consorcio", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # CONSORCIO + # WHERE + # 
DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_CONSORCIO"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # { + # "table_id": "linha_consorcio", + # "partition_date_only": True, + # "extract_params": { + # "database": "principal_db", + # "query": """ + # SELECT + # * + # FROM + # LINHA_CONSORCIO + # WHERE + # DT_INCLUSAO BETWEEN '{start}' + # AND '{end}' + # """, + # }, + # "primary_key": ["CD_CONSORCIO", "CD_LINHA"], # id column to nest data on + # "interval_minutes": BILHETAGEM_TRATAMENTO_INTERVAL, + # }, + # ] + + # BILHETAGEM_MATERIALIZACAO_TRANSACAO_PARAMS = { + # "dataset_id": BILHETAGEM_DATASET_ID, + # "table_id": BILHETAGEM_TRANSACAO_CAPTURE_PARAMS["table_id"], + # "upstream": True, + # "dbt_vars": { + # "date_range": { + # "table_run_datetime_column_name": "datetime_transacao", + # "delay_hours": 1, + # }, + # "version": {}, + # }, + # } + + # BILHETAGEM_MATERIALIZACAO_ORDEM_PAGAMENTO_PARAMS = { + # "dataset_id": BILHETAGEM_DATASET_ID, + # "table_id": "ordem_pagamento", + # "upstream": True, + # "exclude": f"+{BILHETAGEM_MATERIALIZACAO_TRANSACAO_PARAMS['table_id']}", + # "dbt_vars": { + # "date_range": { + # "table_run_datetime_column_name": "data_ordem", + # "delay_hours": 0, + # }, + # "version": {}, + # }, + # } + + # BILHETAGEM_GENERAL_CAPTURE_DEFAULT_PARAMS = { + # "dataset_id": BILHETAGEM_DATASET_ID, + # "secret_path": BILHETAGEM_SECRET_PATH, + # "source_type": BILHETAGEM_GENERAL_CAPTURE_PARAMS["source_type"], + # } # GTFS - GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" - - GTFS_GENERAL_CAPTURE_PARAMS = { - "partition_date_only": True, - "source_type": "gcs", - "dataset_id": "br_rj_riodejaneiro_gtfs", - "extract_params": {"filename": "gtfs"}, - "partition_date_name": "data_versao", - } - - GTFS_TABLE_CAPTURE_PARAMS = [ - { - "table_id": "shapes", - "primary_key": ["shape_id", "shape_pt_sequence"], - }, - { - "table_id": "agency", - "primary_key": ["agency_id"], - }, - { - "table_id": "calendar_dates", - "primary_key": ["service_id", "date"], - }, - { - "table_id": "calendar", - "primary_key": ["service_id"], - }, - { - "table_id": "feed_info", - "primary_key": ["feed_publisher_name"], - }, - { - "table_id": "frequencies", - "primary_key": ["trip_id", "start_time"], - }, - { - "table_id": "routes", - "primary_key": ["route_id"], - }, - { - "table_id": "stops", - "primary_key": ["stop_id"], - }, - { - "table_id": "trips", - "primary_key": ["trip_id"], - }, - { - "table_id": "fare_attributes", - "primary_key": ["fare_id"], - }, - { - "table_id": "fare_rules", - "primary_key": [], - }, - { - "table_id": "ordem_servico", - "primary_key": ["servico"], - "extract_params": {"filename": "ordem_servico"}, - }, - { - "table_id": "stop_times", - "primary_key": ["trip_id", "stop_sequence"], - }, - ] - - GTFS_MATERIALIZACAO_PARAMS = { - "dataset_id": GTFS_DATASET_ID, - "dbt_vars": { - "data_versao_gtfs": "", - "version": {}, - }, - } - - # SUBSÍDIO RECURSOS VIAGENS INDIVIDUAIS - SUBSIDIO_SPPO_RECURSOS_DATASET_ID = "br_rj_riodejaneiro_recurso" - SUBSIDIO_SPPO_RECURSO_API_BASE_URL = "https://api.movidesk.com/public/v1/tickets?" 
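# --- Illustrative sketch (assumption, not part of the original diff) ----------------
# The "# id column to nest data on" comments above and the primary_keys parameter of
# transform_raw_to_nested_structure refer to the nested layout written to the source
# files: primary keys stay as columns, everything else is packed into a JSON "content"
# column plus timestamp_captura. The real implementation is
# pipelines.utils.pretreatment.transform_to_nested_structure (not in this diff); the
# pandas version below only illustrates the resulting shape.
import json
from datetime import datetime

import pandas as pd


def nest_by_primary_keys(data: pd.DataFrame, primary_keys: list, timestamp: datetime) -> pd.DataFrame:
    """Keeps primary keys as columns and packs the remaining columns into JSON 'content'."""
    other_cols = [c for c in data.columns if c not in primary_keys]
    nested = data[primary_keys].copy()
    nested["content"] = data[other_cols].apply(
        lambda row: json.dumps(row.to_dict(), default=str), axis=1
    )
    nested["timestamp_captura"] = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    return nested


raw = pd.DataFrame(
    {"CD_LINHA": [1, 2], "NM_LINHA": ["100", "232"], "DT_INCLUSAO": ["2024-01-01", "2024-01-02"]}
)
print(nest_by_primary_keys(raw, ["CD_LINHA"], datetime(2024, 2, 15)))
# -------------------------------------------------------------------------------------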
- SUBSIDIO_SPPO_RECURSO_API_SECRET_PATH = "sppo_subsidio_recursos_api" - SUBSIDIO_SPPO_RECURSO_SERVICE = "serviceFull eq 'SPPO'" - SUBSIDIO_SPPO_RECURSO_CAPTURE_PARAMS = { - "partition_date_only": True, - "table_id": "recurso_sppo", - "dataset_id": SUBSIDIO_SPPO_RECURSOS_DATASET_ID, - "extract_params": { - "token": "", - "$select": "id,protocol,createdDate", - "$filter": "{dates} and serviceFull/any(serviceFull: {service})", - "$expand": "customFieldValues,customFieldValues($expand=items)", - "$orderby": "createdDate asc", - }, - "interval_minutes": 1440, - "source_type": "movidesk", - "primary_key": ["protocol"], - } - - SUBSIDIO_SPPO_RECURSOS_MATERIALIZACAO_PARAMS = { - "dataset_id": SUBSIDIO_SPPO_RECURSOS_DATASET_ID, - "table_id": SUBSIDIO_SPPO_RECURSO_CAPTURE_PARAMS["table_id"], - "upstream": True, - "dbt_vars": { - "date_range": { - "table_run_datetime_column_name": "data_recurso", - "delay_hours": 0, - }, - "version": {}, - }, - } + # GTFS_DATASET_ID = "br_rj_riodejaneiro_gtfs" + + # GTFS_GENERAL_CAPTURE_PARAMS = { + # "partition_date_only": True, + # "source_type": "gcs", + # "dataset_id": "br_rj_riodejaneiro_gtfs", + # "extract_params": {"filename": "gtfs"}, + # "partition_date_name": "data_versao", + # } + + # GTFS_TABLE_CAPTURE_PARAMS = [ + # { + # "table_id": "shapes", + # "primary_key": ["shape_id", "shape_pt_sequence"], + # }, + # { + # "table_id": "agency", + # "primary_key": ["agency_id"], + # }, + # { + # "table_id": "calendar_dates", + # "primary_key": ["service_id", "date"], + # }, + # { + # "table_id": "calendar", + # "primary_key": ["service_id"], + # }, + # { + # "table_id": "feed_info", + # "primary_key": ["feed_publisher_name"], + # }, + # { + # "table_id": "frequencies", + # "primary_key": ["trip_id", "start_time"], + # }, + # { + # "table_id": "routes", + # "primary_key": ["route_id"], + # }, + # { + # "table_id": "stops", + # "primary_key": ["stop_id"], + # }, + # { + # "table_id": "trips", + # "primary_key": ["trip_id"], + # }, + # { + # "table_id": "fare_attributes", + # "primary_key": ["fare_id"], + # }, + # { + # "table_id": "fare_rules", + # "primary_key": [], + # }, + # { + # "table_id": "ordem_servico", + # "primary_key": ["servico"], + # "extract_params": {"filename": "ordem_servico"}, + # }, + # { + # "table_id": "stop_times", + # "primary_key": ["trip_id", "stop_sequence"], + # }, + # ] + + # GTFS_MATERIALIZACAO_PARAMS = { + # "dataset_id": GTFS_DATASET_ID, + # "dbt_vars": { + # "data_versao_gtfs": "", + # "version": {}, + # }, + # } + + # # SUBSÍDIO RECURSOS VIAGENS INDIVIDUAIS + # SUBSIDIO_SPPO_RECURSOS_DATASET_ID = "br_rj_riodejaneiro_recurso" + # SUBSIDIO_SPPO_RECURSO_API_BASE_URL = "https://api.movidesk.com/public/v1/tickets?" 
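# --- Illustrative sketch (assumption, not part of the original diff) ----------------
# The new capture queries in pipelines/capture/jae/constants.py use Jinja templates
# ("{{ start }}", "{{ end }}", "{% if is_incremental() %}") instead of the old
# "'{start}' / '{end}'" str.format placeholders commented out above. Rendering is done
# by pipelines.utils.jinja.render_template (not in this diff); the stand-alone version
# below uses jinja2 directly and exposes is_incremental() as a callable, keyed on the
# MODE_INCR = "incr" constant added to pipelines/constants.py.
from jinja2 import Template

QUERY = """
SELECT * FROM LINHA
{% if is_incremental() %}
WHERE DT_INCLUSAO BETWEEN '{{ start }}' AND '{{ end }}'
{% endif %}
"""


def render_capture_query(query: str, execution_mode: str, start: str, end: str) -> str:
    """Renders a capture query, enabling the incremental filter only in 'incr' mode."""
    return Template(query).render(
        is_incremental=lambda: execution_mode == "incr",
        start=start,
        end=end,
    )


print(render_capture_query(QUERY, "incr", "2024-02-15 00:00:00", "2024-02-15 03:00:00"))
# -------------------------------------------------------------------------------------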
+ # SUBSIDIO_SPPO_RECURSO_API_SECRET_PATH = "sppo_subsidio_recursos_api" + # SUBSIDIO_SPPO_RECURSO_SERVICE = "serviceFull eq 'SPPO'" + # SUBSIDIO_SPPO_RECURSO_CAPTURE_PARAMS = { + # "partition_date_only": True, + # "table_id": "recurso_sppo", + # "dataset_id": SUBSIDIO_SPPO_RECURSOS_DATASET_ID, + # "extract_params": { + # "token": "", + # "$select": "id,protocol,createdDate", + # "$filter": "{dates} and serviceFull/any(serviceFull: {service})", + # "$expand": "customFieldValues,customFieldValues($expand=items)", + # "$orderby": "createdDate asc", + # }, + # "interval_minutes": 1440, + # "source_type": "movidesk", + # "primary_key": ["protocol"], + # } + + # SUBSIDIO_SPPO_RECURSOS_MATERIALIZACAO_PARAMS = { + # "dataset_id": SUBSIDIO_SPPO_RECURSOS_DATASET_ID, + # "table_id": SUBSIDIO_SPPO_RECURSO_CAPTURE_PARAMS["table_id"], + # "upstream": True, + # "dbt_vars": { + # "date_range": { + # "table_run_datetime_column_name": "data_recurso", + # "delay_hours": 0, + # }, + # "version": {}, + # }, + # } diff --git a/pipelines/flows.py b/pipelines/flows.py index 71646f9e2..f949fbbd7 100644 --- a/pipelines/flows.py +++ b/pipelines/flows.py @@ -2,5 +2,8 @@ """ Imports all flows for every project so we can register all of them. """ -# from pipelines.br_rj_riodejaneiro_brt_gps.flows import * +from pipelines.br_rj_riodejaneiro_brt_gps.flows import * # noqa +from pipelines.capture.jae.flows import * # noqa +from pipelines.capture.templates.flows import * # noqa from pipelines.exemplo import * # noqa +from pipelines.treatment.bilhetagem.flows import * # noqa diff --git a/pipelines/schedules.py b/pipelines/schedules.py index dafb810a3..f87fcf1ff 100644 --- a/pipelines/schedules.py +++ b/pipelines/schedules.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Schedules for rj_smtr +Modulo com schedules para os Flows da rj-smtr """ from datetime import datetime, timedelta @@ -11,7 +11,36 @@ from pipelines.constants import constants from pipelines.constants import constants as emd_constants -from pipelines.utils.utils import generate_ftp_schedules +from pipelines.utils.backup.utils import generate_ftp_schedules + + +def generate_interval_schedule( + interval: timedelta, agent_label: str, params: dict = None +) -> Schedule: + """ + Cria um Schedule para os flows do prefect + + Args: + interval (timedelta): Frequência do agendamento do flow + agent_label (str): Label para executar o flow + params (dict, optional): Parâmetros para ser passados ao flow no + momento da execução + """ + if not params: + params = {} + return Schedule( + [ + IntervalClock( + interval=interval, + start_date=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone(constants.TIMEZONE.value)), + labels=[ + agent_label, + ], + parameter_defaults=params, + ) + ] + ) + every_minute = Schedule( clocks=[ diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 78355251e..76c051707 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -1,1485 +1,162 @@ # -*- coding: utf-8 -*- -# pylint: disable=W0703, W0511 -""" -Tasks for rj_smtr -""" -import io -import json -import os -import traceback -from datetime import date, datetime, timedelta -from pathlib import Path -from typing import Any, Dict, Iterable, List, Union +"""Module containing general purpose tasks""" +from datetime import datetime +from typing import Any, Union -import basedosdados as bd -import pandas as pd -import pendulum import prefect -import requests -from basedosdados import Storage, Table -from prefect import Client, task -from prefect.backend import FlowRunView -from 
prefeitura_rio.pipelines_utils.dbt import run_dbt_model as run_dbt_model_func -from prefeitura_rio.pipelines_utils.infisical import inject_bd_credentials +from prefect import task from prefeitura_rio.pipelines_utils.logging import log -from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client +from prefeitura_rio.pipelines_utils.prefect import get_flow_run_mode from pytz import timezone from pipelines.constants import constants -from pipelines.utils.secret import get_secret -from pipelines.utils.utils import ( # normalize_keys, - bq_project, - create_or_append_table, - data_info_str, - dict_contains_keys, - get_datetime_range, - get_last_run_timestamp, - get_raw_data_api, - get_raw_data_db, - get_raw_data_gcs, - get_raw_recursos, - get_table_min_max_value, - log_critical, - read_raw_data, - save_raw_local_func, - save_treated_local_func, - upload_run_logs_to_bq, -) +from pipelines.utils.prefect import FailedSubFlow, create_subflow_run, wait_subflow_run -############### -# -# SETUP -# -############### @task -def setup_task(): - return inject_bd_credentials() - - -@task -def get_current_flow_labels() -> List[str]: - """ - Get the labels of the current flow. - """ - flow_run_id = prefect.context.get("flow_run_id") - flow_run_view = FlowRunView.from_flow_run_id(flow_run_id) - return flow_run_view.labels - - -############### -# -# DBT -# -############### - - -@task -def run_dbt_model( - dataset_id: str = None, - table_id: str = None, - dbt_alias: bool = False, - upstream: bool = None, - downstream: bool = None, - exclude: str = None, - flags: str = None, - _vars: dict | List[Dict] = None, -): - return run_dbt_model_func( - dataset_id=dataset_id, - table_id=table_id, - dbt_alias=dbt_alias, - upstream=upstream, - downstream=downstream, - exclude=exclude, - flags=flags, - _vars=_vars, - ) - - -@task(max_retries=3, retry_delay=timedelta(seconds=10)) -def build_incremental_model( # pylint: disable=too-many-arguments - dataset_id: str, - base_table_id: str, - mat_table_id: str, - field_name: str = "data_versao", - refresh: bool = False, - wait=None, # pylint: disable=unused-argument -): - """ - Utility task for backfilling table in predetermined steps. - Assumes the step sizes will be defined on the .sql file. - - Args: - dbt_client (DbtClient): DBT interface object - dataset_id (str): Dataset id on BigQuery - base_table_id (str): Base table from which to materialize (usually, an external table) - mat_table_id (str): Target table id for materialization - field_name (str, optional): Key field (column) for dbt incremental filters. - Defaults to "data_versao". - refresh (bool, optional): If True, rebuild the table from scratch. Defaults to False. - wait (NoneType, optional): Placeholder parameter, used to wait previous tasks finish. - Defaults to None. - - Returns: - bool: whether the table was fully built or not. 
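# A minimal sketch of how the generate_interval_schedule helper added to
# pipelines/schedules.py above might be attached to a flow. The flow itself, the
# "rj-smtr" agent label string and the parameter defaults are placeholders for
# illustration only; they are not part of this patch.
from datetime import timedelta

from prefect import Flow

from pipelines.schedules import generate_interval_schedule

with Flow("exemplo_captura") as exemplo_captura:  # hypothetical flow
    ...

exemplo_captura.schedule = generate_interval_schedule(
    interval=timedelta(minutes=10),       # run every 10 minutes
    agent_label="rj-smtr",                # placeholder agent label
    params={"table_id": "registros"},     # default parameters passed to every scheduled run
)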
- """ - - query_project_id = bq_project() - last_mat_date = get_table_min_max_value( - query_project_id, dataset_id, mat_table_id, field_name, "max" - ) - last_base_date = get_table_min_max_value( - query_project_id, dataset_id, base_table_id, field_name, "max" - ) - log( - f""" - Base table last version: {last_base_date} - Materialized table last version: {last_mat_date} - """ - ) - run_command = f"run --select models/{dataset_id}/{mat_table_id}.sql" - - if refresh: - log("Running in full refresh mode") - log(f"DBT will run the following command:\n{run_command+' --full-refresh'}") - run_dbt_model_func(dataset_id=dataset_id, table_id=mat_table_id, flags="--full-refresh") - last_mat_date = get_table_min_max_value( - query_project_id, dataset_id, mat_table_id, field_name, "max" - ) - - if last_base_date > last_mat_date: - log("Running interval step materialization") - log(f"DBT will run the following command:\n{run_command}") - while last_base_date > last_mat_date: - running = run_dbt_model_func(dataset_id=dataset_id, table_id=mat_table_id) - # running = dbt_client.cli(run_command, sync=True) - last_mat_date = get_table_min_max_value( - query_project_id, - dataset_id, - mat_table_id, - field_name, - "max", - wait=running, - ) - log(f"After this step, materialized table last version is: {last_mat_date}") - if last_mat_date == last_base_date: - log("Materialized table reached base table version!") - return True - log("Did not run interval step materialization...") - return False - - -@task(checkpoint=False, nout=3) -def create_dbt_run_vars( - dataset_id: str, - dbt_vars: dict, - table_id: str, - raw_dataset_id: str, - raw_table_id: str, - mode: str, - timestamp: datetime, -) -> tuple[list[dict], Union[list[dict], dict, None], bool]: - """ - Create the variables to be used in dbt materialization based on a dict +def task_value_is_none(task_value: Union[Any, None]) -> bool: + """Testa se o valor retornado por uma Task é None Args: - dataset_id (str): the dataset_id to get the variables - dbt_vars (dict): dict containing the parameters - table_id (str): the table_id get the date_range variable - raw_dataset_id (str): the raw_dataset_id get the date_range variable - raw_table_id (str): the raw_table_id get the date_range variable - mode (str): the mode to get the date_range variable + task_value (Union[Any, None]): Valor retornado por uma Task Returns: - list[dict]: the variables to be used in DBT - Union[list[dict], dict, None]: the date variable (date_range or run_date) - bool: a flag that indicates if the date_range variable came from Redis + bool: Se o valor é None ou não """ - - log(f"Creating DBT variables. Parameter received: {dbt_vars}") - - if not dbt_vars: - log("dbt_vars are blank. Skiping task...") - return [None], None, False - - final_vars = [] - date_var = None - flag_date_range = False - - if "date_range" in dbt_vars.keys(): - log("Creating date_range variable") - - # Set date_range variable manually - if dict_contains_keys(dbt_vars["date_range"], ["date_range_start", "date_range_end"]): - date_var = { - "date_range_start": dbt_vars["date_range"]["date_range_start"], - "date_range_end": dbt_vars["date_range"]["date_range_end"], - } - # Create date_range using Redis - else: - if not table_id: - log("table_id are blank. 
Skiping task...") - return [None], None, False - - raw_table_id = raw_table_id or table_id - - date_var = get_materialization_date_range.run( - dataset_id=dataset_id, - table_id=table_id, - raw_dataset_id=raw_dataset_id, - raw_table_id=raw_table_id, - table_run_datetime_column_name=dbt_vars["date_range"].get( - "table_run_datetime_column_name" - ), - mode=mode, - delay_hours=dbt_vars["date_range"].get("delay_hours", 0), - end_ts=timestamp, - ) - - flag_date_range = True - - final_vars.append(date_var.copy()) - - log(f"date_range created: {date_var}") - - elif "run_date" in dbt_vars.keys(): - log("Creating run_date variable") - - date_var = get_run_dates.run( - date_range_start=dbt_vars["run_date"].get("date_range_start", False), - date_range_end=dbt_vars["run_date"].get("date_range_end", False), - day_datetime=timestamp, - ) - - final_vars.append([d.copy() for d in date_var]) - - log(f"run_date created: {date_var}") - - elif "data_versao_gtfs" in dbt_vars.keys(): - log("Creating data_versao_gtfs variable") - - date_var = {"data_versao_gtfs": dbt_vars["data_versao_gtfs"]} - - final_vars.append(date_var.copy()) - - if "version" in dbt_vars.keys(): - log("Creating version variable") - dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) - - # if there are other variables inside the list, update each item adding the version variable - if final_vars: - final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) - else: - final_vars.append(dataset_sha) - - log(f"version created: {dataset_sha}") - - log(f"All variables was created, final value is: {final_vars}") - - return final_vars, date_var, flag_date_range - - -############### -# -# Local file management -# -############### + return task_value is None @task -def get_rounded_timestamp( - timestamp: Union[str, datetime, None] = None, - interval_minutes: Union[int, None] = None, +def get_current_timestamp( + truncate_minute: bool = True, ) -> datetime: """ - Calculate rounded timestamp for flow run. + Retorna a timestamp atual em UTC Args: - timestamp (Union[str, datetime, None]): timestamp to be used as reference - interval_minutes (Union[int, None], optional): interval in minutes between each recapture + truncate_minute: Se for True, substitui os segundos e os microssegundos por 0 Returns: - datetime: timestamp for flow run - """ - if isinstance(timestamp, str): - timestamp = datetime.fromisoformat(timestamp) - - if not timestamp: - timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) - - timestamp = timestamp.replace(second=0, microsecond=0) - - if interval_minutes: - if interval_minutes >= 60: - hours = interval_minutes / 60 - interval_minutes = round(((hours) % 1) * 60) - - if interval_minutes == 0: - rounded_minutes = interval_minutes - else: - rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes - - timestamp = timestamp.replace(minute=rounded_minutes) - - return timestamp - - -@task -def get_current_timestamp( - timestamp=None, truncate_minute: bool = True, return_str: bool = False -) -> Union[datetime, str]: + Union[datetime, str]: A timestamp atual """ - Get current timestamp for flow run. 
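# A minimal sketch of the new task_value_is_none task guarding a branch with
# prefect.case; the flow name and the downstream step are invented for illustration.
from prefect import Flow, case

from pipelines.tasks import get_current_timestamp, task_value_is_none

with Flow("exemplo_branch") as exemplo_branch:  # hypothetical flow
    timestamp = get_current_timestamp()
    timestamp_missing = task_value_is_none(timestamp)
    with case(timestamp_missing, False):
        # downstream capture/treatment tasks would be declared here (placeholder)
        ...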
- Args: - timestamp: timestamp to be used as reference (optionally, it can be a string) - truncate_minute: whether to truncate the timestamp to the minute or not - return_str: if True, the return will be an isoformatted datetime string - otherwise it returns a datetime object - - Returns: - Union[datetime, str]: timestamp for flow run - """ - if isinstance(timestamp, str): - timestamp = datetime.fromisoformat(timestamp) - if not timestamp: - timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + timestamp = datetime.now(tz=timezone("UTC")) if truncate_minute: timestamp = timestamp.replace(second=0, microsecond=0) - if return_str: - timestamp = timestamp.isoformat() return timestamp @task -def create_date_hour_partition( - timestamp: datetime, - partition_date_name: str = "data", - partition_date_only: bool = False, -) -> str: +def parse_timestamp_to_string(timestamp: datetime, pattern: str = "%Y-%m-%d-%H-%M-%S") -> str: """ - Create a date (and hour) Hive partition structure from timestamp. + Converte um datetime em string Args: - timestamp (datetime): timestamp to be used as reference - partition_date_name (str, optional): partition name. Defaults to "data". - partition_date_only (bool, optional): whether to add hour partition or not + timestamp (datetime): O datetime a ser convertido + pattern (str): O formato da string de data retornado - Returns: - str: partition string - """ - partition = f"{partition_date_name}={timestamp.strftime('%Y-%m-%d')}" - if not partition_date_only: - partition += f"/hora={timestamp.strftime('%H')}" - return partition - - -@task -def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") -> str: - """ - Parse timestamp to string pattern. """ + if pattern.lower() == "iso": + return timestamp.isoformat() return timestamp.strftime(pattern) @task -def create_local_partition_path( - dataset_id: str, table_id: str, filename: str, partitions: str = None -) -> str: - """ - Create the full path sctructure which to save data locally before - upload. - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - filename (str, optional): Single csv name - partitions (str, optional): Partitioned directory structure, ie "ano=2022/mes=03/data=01" - Returns: - str: String path having `mode` and `filetype` to be replaced afterwards, - either to save raw or staging files. +def get_run_env() -> str: """ - data_folder = os.getenv("DATA_FOLDER", "data") - file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" - file_path += f"/{partitions}/{filename}.{{filetype}}" - log(f"Creating file path: {file_path}") - return file_path + Retorna o ambiente de execução atual com base no projeto do Prefect - -@task -def save_raw_local(file_path: str, status: dict, mode: str = "raw") -> str: - """ - Saves json response from API to .json file. - Args: - file_path (str): Path which to save raw file - status (dict): Must contain keys - * data: json returned from API - * error: error catched from API request - mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
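# Expected behaviour of the reworked get_current_timestamp and parse_timestamp_to_string
# tasks above; calling .run() directly (outside a flow) and the sample values are for
# illustration only.
from datetime import datetime

from pipelines.tasks import get_current_timestamp, parse_timestamp_to_string

now_utc = get_current_timestamp.run()             # timezone-aware UTC datetime, seconds and microseconds zeroed
ts = datetime(2024, 1, 15, 13, 45)
parse_timestamp_to_string.run(ts)                 # "2024-01-15-13-45-00" (default pattern)
parse_timestamp_to_string.run(ts, pattern="iso")  # "2024-01-15T13:45:00" (new "iso" shortcut)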
Returns: - str: Path to the saved file - """ - _file_path = file_path.format(mode=mode, filetype="json") - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - json.dump(status["data"], Path(_file_path).open("w", encoding="utf-8")) - log(f"Raw data saved to: {_file_path}") - return _file_path - - -@task -def save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + str: "dev" ou "prod" """ - Save treated file to CSV. - - Args: - file_path (str): Path which to save treated file - status (dict): Must contain keys - * `data`: dataframe returned from treatement - * `error`: error catched from data treatement - mode (str, optional): Folder to save locally, later folder which to upload to GCS. - - Returns: - str: Path to the saved file - """ - - log(f"Saving treated data to: {file_path}, {status}") - - _file_path = file_path.format(mode=mode, filetype="csv") - - Path(_file_path).parent.mkdir(parents=True, exist_ok=True) - if status["error"] is None: - status["data"].to_csv(_file_path, index=False) - log(f"Treated data saved to: {_file_path}") - - return _file_path - - -############### -# -# Extract data -# -############### -@task(nout=3, max_retries=3, retry_delay=timedelta(seconds=5)) -def query_logs( - dataset_id: str, - table_id: str, - datetime_filter=None, - max_recaptures: int = 90, - interval_minutes: int = 1, - recapture_window_days: int = 1, -): - """ - Queries capture logs to check for errors - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - datetime_filter (pendulum.datetime.DateTime, optional): - filter passed to query. This task will query the logs table - for the last n (n = recapture_window_days) days before datetime_filter - max_recaptures (int, optional): maximum number of recaptures to be done - interval_minutes (int, optional): interval in minutes between each recapture - recapture_window_days (int, optional): Number of days to query for erros - - Returns: - lists: errors (bool), - timestamps (list of pendulum.datetime.DateTime), - previous_errors (list of previous errors) - """ - - if not datetime_filter: - datetime_filter = pendulum.now(constants.TIMEZONE.value).replace(second=0, microsecond=0) - elif isinstance(datetime_filter, str): - datetime_filter = datetime.fromisoformat(datetime_filter).replace(second=0, microsecond=0) - - datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") - - query = f""" - WITH - t AS ( - SELECT - DATETIME(timestamp_array) AS timestamp_array - FROM - UNNEST( - GENERATE_TIMESTAMP_ARRAY( - TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), - TIMESTAMP('{datetime_filter}'), - INTERVAL {interval_minutes} minute) ) - AS timestamp_array - WHERE - timestamp_array < '{datetime_filter}' ), - logs_table AS ( - SELECT - SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), - "America/Sao_Paulo") AS DATETIME) timestamp_captura, - SAFE_CAST(sucesso AS BOOLEAN) sucesso, - SAFE_CAST(erro AS STRING) erro, - SAFE_CAST(DATA AS DATE) DATA - FROM - rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t - ), - logs AS ( - SELECT - *, - TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array - FROM - logs_table - WHERE - DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', - INTERVAL {recapture_window_days} day)) - AND DATE('{datetime_filter}') - AND timestamp_captura BETWEEN - DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) - AND '{datetime_filter}' - ) - SELECT - CASE - WHEN logs.timestamp_captura IS 
NOT NULL THEN logs.timestamp_captura - ELSE - t.timestamp_array - END - AS timestamp_captura, - logs.erro - FROM - t - LEFT JOIN - logs - ON - logs.timestamp_array = t.timestamp_array - WHERE - logs.sucesso IS NOT TRUE - """ - log(f"Run query to check logs:\n{query}") - results = bd.read_sql(query=query, billing_project_id=bq_project()) - - if len(results) > 0: - results = results.sort_values(["timestamp_captura"]) - results["timestamp_captura"] = ( - pd.to_datetime(results["timestamp_captura"]) - .dt.tz_localize(constants.TIMEZONE.value) - .to_list() - ) - log(f"Recapture data for the following {len(results)} timestamps:\n{results}") - if len(results) > max_recaptures: - message = f""" - [SPPO - Recaptures] - Encontradas {len(results)} timestamps para serem recapturadas. - Essa run processará as seguintes: - ##### - {results[:max_recaptures]} - ##### - Sobraram as seguintes para serem recapturadas na próxima run: - ##### - {results[max_recaptures:]} - ##### - """ - log_critical(message) - results = results[:max_recaptures] - return True, results["timestamp_captura"].to_list(), results["erro"].to_list() - return False, [], [] - - -@task -def get_raw( # pylint: disable=R0912 - url: str, - headers: str = None, - filetype: str = "json", - csv_args: dict = None, - params: dict = None, -) -> Dict: - """ - Request data from URL API - - Args: - url (str): URL to send request - headers (str, optional): Path to headers guardeded on Vault, if needed. - filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) - csv_args (dict, optional): Arguments for read_csv, if needed - params (dict, optional): Params to be sent on request - - Returns: - dict: Containing keys - * `data` (json): data result - * `error` (str): catched error, if any. Otherwise, returns None - """ - data = None - error = None - try: - if headers is not None: - headers = get_secret(secret_path=headers) - # remove from headers, if present - remove_headers = ["host", "databases"] - for remove_header in remove_headers: - if remove_header in list(headers.keys()): - del headers[remove_header] - - response = requests.get( - url, - headers=headers, - timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=params, - ) - - if response.ok: # status code is less than 400 - if filetype == "json": - data = response.json() - - # todo: move to data check on specfic API # pylint: disable=W0102 - if isinstance(data, dict) and "DescricaoErro" in data.keys(): - error = data["DescricaoErro"] - - elif filetype in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict(orient="records") - else: - error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return {"data": data, "error": error} - - -@task(checkpoint=False, nout=2) -def create_request_params( - extract_params: dict, - table_id: str, - dataset_id: str, - timestamp: datetime, - interval_minutes: int, -) -> tuple[str, str]: - """ - Task to create request params - - Args: - extract_params (dict): extract parameters - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - timestamp (datetime): timestamp for flow run - interval_minutes (int): interval in minutes between each capture - - Returns: - request_params: host, database and query to request data - request_url: url to request data - """ - request_params = None - request_url = None - - if dataset_id == constants.BILHETAGEM_DATASET_ID.value: - database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ - extract_params["database"] - ] - request_url = database["host"] - - datetime_range = get_datetime_range( - timestamp=timestamp, interval=timedelta(minutes=interval_minutes) - ) - - request_params = { - "database": extract_params["database"], - "engine": database["engine"], - "query": extract_params["query"].format(**datetime_range), - } - - elif dataset_id == constants.GTFS_DATASET_ID.value: - request_params = extract_params["filename"] - - elif dataset_id == constants.SUBSIDIO_SPPO_RECURSOS_DATASET_ID.value: - extract_params["token"] = get_secret( - secret_path=constants.SUBSIDIO_SPPO_RECURSO_API_SECRET_PATH.value - )["token"] - start = datetime.strftime( - timestamp - timedelta(minutes=interval_minutes), "%Y-%m-%dT%H:%M:%S.%MZ" - ) - end = datetime.strftime(timestamp, "%Y-%m-%dT%H:%M:%S.%MZ") - log(f" Start date {start}, end date {end}") - recurso_params = { - "dates": f"createdDate ge {start} and createdDate le {end}", - "service": constants.SUBSIDIO_SPPO_RECURSO_SERVICE.value, - } - extract_params["$filter"] = extract_params["$filter"].format(**recurso_params) - request_params = extract_params - - request_url = constants.SUBSIDIO_SPPO_RECURSO_API_BASE_URL.value - - return request_params, request_url - - -@task(checkpoint=False, nout=2) -def get_raw_from_sources( - source_type: str, - local_filepath: str, - source_path: str = None, - dataset_id: str = None, - table_id: str = None, - secret_path: str = None, - request_params: dict = None, -) -> tuple[str, str]: - """ - Task to get raw data from sources - - Args: - source_type (str): source type - local_filepath (str): local filepath - source_path (str, optional): source path. Defaults to None. - dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. - table_id (str, optional): table_id on BigQuery. Defaults to None. - secret_path (str, optional): secret path. Defaults to None. - request_params (dict, optional): request parameters. Defaults to None. 
- - Returns: - error: error catched from upstream tasks - filepath: filepath to raw data - """ - error = None - filepath = None - data = None - - source_values = source_type.split("-", 1) - - source_type, filetype = source_values if len(source_values) == 2 else (source_values[0], None) - - log(f"Getting raw data from source type: {source_type}") - - try: - if source_type == "api": - error, data, filetype = get_raw_data_api( - url=source_path, - secret_path=secret_path, - api_params=request_params, - filetype=filetype, - ) - elif source_type == "gcs": - error, data, filetype = get_raw_data_gcs( - dataset_id=dataset_id, table_id=table_id, zip_filename=request_params - ) - elif source_type == "db": - error, data, filetype = get_raw_data_db( - host=source_path, secret_path=secret_path, **request_params - ) - elif source_type == "movidesk": - error, data, filetype = get_raw_recursos( - request_url=source_path, request_params=request_params - ) - else: - raise NotImplementedError(f"{source_type} not supported") - - filepath = save_raw_local_func(data=data, filepath=local_filepath, filetype=filetype) - - except NotImplementedError: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - log(f"Raw extraction ended returned values: {error}, {filepath}") - return error, filepath - - -############### -# -# Load data -# -############### + run_mode = get_flow_run_mode() + if run_mode == "staging": + return "dev" + return run_mode + except ValueError as err: + if "Invalid project name: None" in str(err): + return "dev" + raise err @task -def bq_upload( - dataset_id: str, - table_id: str, - filepath: str, - raw_filepath: str = None, - partitions: str = None, - status: dict = None, -): # pylint: disable=R0913 +def flow_log(msg, level: str = "info"): """ - Upload raw and treated data to GCS and BigQuery. - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - filepath (str): Path to the saved treated .csv file - raw_filepath (str, optional): Path to raw .json file. Defaults to None. - partitions (str, optional): Partitioned directory structure, ie "ano=2022/mes=03/data=01". - Defaults to None. - status (dict, optional): Dict containing `error` key from - upstream tasks. - - Returns: - None - """ - log( - f""" - Received inputs: - raw_filepath = {raw_filepath}, type = {type(raw_filepath)} - treated_filepath = {filepath}, type = {type(filepath)} - dataset_id = {dataset_id}, type = {type(dataset_id)} - table_id = {table_id}, type = {type(table_id)} - partitions = {partitions}, type = {type(partitions)} - """ - ) - if status["error"] is not None: - return status["error"] - - error = None - - try: - # Upload raw to staging - if raw_filepath: - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) - - # Creates and publish table if it does not exist, append to it otherwise - create_or_append_table( - dataset_id=dataset_id, - table_id=table_id, - path=filepath, - partitions=partitions, - ) - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error - - -@task -def bq_upload_from_dict(paths: dict, dataset_id: str, partition_levels: int = 1): - """Upload multiple tables from a dict structured as {table_id: csv_path}. 
- Present use case assumes table partitioned once. Adjust the parameter - 'partition_levels' to best suit new uses. - i.e. if your csv is saved as: - /date=/.csv - it has 1 level of partition. - if your csv file is saved as: - /date=/hour=/.csv - it has 2 levels of partition + Task para Debug, executa a função log no nível do flow Args: - paths (dict): _description_ - dataset_id (str): _description_ - - Returns: - _type_: _description_ + msg: Texto para exibir no log + level (str): Level do log do Prefect """ - for key in paths.keys(): - log("#" * 80) - log(f"KEY = {key}") - tb_dir = paths[key].parent - # climb up the partition directories to reach the table dir - for i in range(partition_levels): # pylint: disable=unused-variable - tb_dir = tb_dir.parent - log(f"tb_dir = {tb_dir}") - create_or_append_table(dataset_id=dataset_id, table_id=key, path=tb_dir) - - log(f"Returning -> {tb_dir.parent}") - - return tb_dir.parent + log(msg, level=level) @task -def upload_logs_to_bq( # pylint: disable=R0913 - dataset_id: str, - parent_table_id: str, - timestamp: str, - error: str = None, - previous_error: str = None, - recapture: bool = False, +def run_subflow( + flow_name: str, + parameters: Union[list[dict], dict], + project_name: str = None, + labels: list[str] = None, + maximum_parallelism: int = None, ): """ - Upload execution status table to BigQuery. - Table is uploaded to the same dataset, named {parent_table_id}_logs. - If passing status_dict, should not pass timestamp and error. - - Args: - dataset_id (str): dataset_id on BigQuery - parent_table_id (str): Parent table id related to the status table - timestamp (str): ISO formatted timestamp string - error (str, optional): String associated with error caught during execution - Returns: - None - """ - table_id = parent_table_id + "_logs" - # Create partition directory - filename = f"{table_id}_{timestamp.isoformat()}" - partition = f"data={timestamp.date()}" - filepath = Path(f"""data/staging/{dataset_id}/{table_id}/{partition}/{filename}.csv""") - filepath.parent.mkdir(exist_ok=True, parents=True) - # Create dataframe to be uploaded - if not error and recapture is True: - # if the recapture is succeeded, update the column erro - dataframe = pd.DataFrame( - { - "timestamp_captura": [timestamp], - "sucesso": [True], - "erro": [f"[recapturado]{previous_error}"], - } - ) - log(f"Recapturing {timestamp} with previous error:\n{error}") - else: - # not recapturing or error during flow execution - dataframe = pd.DataFrame( - { - "timestamp_captura": [timestamp], - "sucesso": [error is None], - "erro": [error], - } - ) - # Save data local - dataframe.to_csv(filepath, index=False) - # Upload to Storage - create_or_append_table( - dataset_id=dataset_id, - table_id=table_id, - path=filepath.as_posix(), - partitions=partition, - ) - if error is not None: - raise Exception(f"Pipeline failed with error: {error}") - - -@task -def upload_raw_data_to_gcs( - error: str, - raw_filepath: str, - table_id: str, - dataset_id: str, - partitions: list, -) -> Union[str, None]: - """ - Upload raw data to GCS. + Executa e espera a execução de um flow Args: - error (str): Error catched from upstream tasks. 
- raw_filepath (str): Path to the saved raw .json file - table_id (str): table_id on BigQuery - dataset_id (str): dataset_id on BigQuery - partitions (list): list of partition strings - - Returns: - Union[str, None]: if there is an error returns it traceback, otherwise returns None - """ - if error is None: - try: - st_obj = Storage(table_id=table_id, dataset_id=dataset_id) - log( - f"""Uploading raw file to bucket {st_obj.bucket_name} at - {st_obj.bucket_name}/{dataset_id}/{table_id}""" - ) - st_obj.upload( - path=raw_filepath, - partitions=partitions, - mode="raw", - if_exists="replace", - ) - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error - - -@task -def upload_staging_data_to_gcs( - error: str, - staging_filepath: str, - timestamp: datetime, - table_id: str, - dataset_id: str, - partitions: list, - previous_error: str = None, - recapture: bool = False, -) -> Union[str, None]: + flow_name (str): Nome do flow a ser executado. + parameters (dict): Parâmetros para executar o flow + project_name (str, optional): Nome do projeto no Prefect para executar o flow, + se não for especificado, é utilizado o nome do projeto do flow atual + labels (list[str]): Labels para executar o flow, + se não for especificado, são utilizadas as labels do flow atual """ - Upload staging data to GCS. - Args: - error (str): Error catched from upstream tasks. - staging_filepath (str): Path to the saved treated .csv file. - timestamp (datetime): timestamp for flow run. - table_id (str): table_id on BigQuery. - dataset_id (str): dataset_id on BigQuery. - partitions (list): list of partition strings. + if not isinstance(parameters, (dict, list)): + raise ValueError("parameters must be a list or a dict") - Returns: - Union[str, None]: if there is an error returns it traceback, otherwise returns None - """ - if error is None: - try: - # Creates and publish table if it does not exist, append to it otherwise - create_or_append_table( - dataset_id=dataset_id, - table_id=table_id, - path=staging_filepath, - partitions=partitions, - ) - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - upload_run_logs_to_bq( - dataset_id=dataset_id, - parent_table_id=table_id, - error=error, - timestamp=timestamp, - mode="staging", - previous_error=previous_error, - recapture=recapture, - ) + if maximum_parallelism is not None and isinstance(parameters, list): + execution_list = [ + parameters[i : i + maximum_parallelism] # noqa + for i in range(0, len(parameters), maximum_parallelism) + ] - return error + idempotency_key = prefect.context.get("task_run_id") + map_index = prefect.context.get("map_index") + if idempotency_key and map_index is not None: + idempotency_key += f"-{map_index}" + flow_run_results = [] -############### -# -# Daterange tasks -# -############### + for idx, param_list in enumerate(execution_list): + if not isinstance(param_list, list): + param_list = [param_list] -@task( - checkpoint=False, - max_retries=constants.MAX_RETRIES.value, - retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), -) -def get_materialization_date_range( # pylint: disable=R0913 - dataset_id: str, - table_id: str, - raw_dataset_id: str, - raw_table_id: str, - table_run_datetime_column_name: str = None, - mode: str = "prod", - delay_hours: int = 0, - end_ts: datetime = None, -): - """ - Task for generating dict with variables to be passed to the - --vars argument on DBT. 
- Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): model filename on the queries repo. - eg: if you have a model defined in the file .sql, - the table_id should be - table_date_column_name (Optional, str): if it's the first time this - is ran, will query the table for the maximum value on this field. - If rebuild is true, will query the table for the minimum value - on this field. - rebuild (Optional, bool): if true, queries the minimum date value on the - table and return a date range from that value to the datetime.now() time - delay(Optional, int): hours delayed from now time for materialization range - end_ts(Optional, datetime): date range's final date - Returns: - dict: containing date_range_start and date_range_end - """ - timestr = "%Y-%m-%dT%H:%M:%S" - # get start from redis - last_run = get_last_run_timestamp(dataset_id=dataset_id, table_id=table_id, mode=mode) - # if there's no timestamp set on redis, get max timestamp on source table - if last_run is None: - log("Failed to fetch key from Redis...\n Querying tables for last suceeded run") - if Table(dataset_id=dataset_id, table_id=table_id).table_exists("prod"): - last_run = get_table_min_max_value( - query_project_id=bq_project(), - dataset_id=dataset_id, - table_id=table_id, - field_name=table_run_datetime_column_name, - kind="max", - ) - log( - f""" - Queried last run from {dataset_id}.{table_id} - Got: - {last_run} as type {type(last_run)} - """ + runs_ids = [ + create_subflow_run( + flow_name=flow_name, + parameters=params, + idempotency_key=idempotency_key + f"-{idx}-{sub_idx}", + project_name=project_name, + labels=labels, ) - else: - last_run = get_table_min_max_value( - query_project_id=bq_project(), - dataset_id=raw_dataset_id, - table_id=raw_table_id, - field_name=table_run_datetime_column_name, - kind="max", - ) - log( - f""" - Queried last run from {raw_dataset_id}.{raw_table_id} - Got: - {last_run} as type {type(last_run)} - """ - ) - else: - last_run = datetime.strptime(last_run, timestr) - - if (not isinstance(last_run, datetime)) and (isinstance(last_run, date)): - last_run = datetime(last_run.year, last_run.month, last_run.day) - - # set start to last run hour (H) - start_ts = last_run.replace(minute=0, second=0, microsecond=0).strftime(timestr) - - # set end to now - delay - - if not end_ts: - end_ts = pendulum.now(constants.TIMEZONE.value).replace( - tzinfo=None, minute=0, second=0, microsecond=0 - ) - - end_ts = (end_ts - timedelta(hours=delay_hours)).replace(minute=0, second=0, microsecond=0) - - end_ts = end_ts.strftime(timestr) - - date_range = {"date_range_start": start_ts, "date_range_end": end_ts} - log(f"Got date_range as: {date_range}") - return date_range - - -@task -def set_last_run_timestamp( - dataset_id: str, table_id: str, timestamp: str, mode: str = "prod", wait=None -): # pylint: disable=unused-argument - """ - Set the `last_run_timestamp` key for the dataset_id/table_id pair - to datetime.now() time. Used after running a materialization to set the - stage for the next to come - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): model filename on the queries repo. - timestamp: Last run timestamp end. - wait (Any, optional): Used for defining dependencies inside the flow, - in general, pass the output of the task which should be run imediately - before this. Defaults to None. - - Returns: - _type_: _description_ - """ - log(f"Saving timestamp {timestamp} on Redis for {dataset_id}.{table_id}") - redis_client = get_redis_client() - key = dataset_id + "." 
+ table_id - if mode == "dev": - key = f"{mode}.{key}" - content = redis_client.get(key) - if not content: - content = {} - content["last_run_timestamp"] = timestamp - redis_client.set(key, content) - return True - - -@task -def delay_now_time(timestamp: str, delay_minutes=6): - """Return timestamp string delayed by - - Args: - timestamp (str): Isoformat timestamp string - delay_minutes (int, optional): Minutes to delay timestamp by Defaults to 6. - - Returns: - str : timestamp string formatted as "%Y-%m-%dT%H-%M-%S" - """ - ts_obj = datetime.fromisoformat(timestamp) - ts_obj = ts_obj - timedelta(minutes=delay_minutes) - return ts_obj.strftime("%Y-%m-%dT%H-%M-%S") - - -@task -def fetch_dataset_sha(dataset_id: str): - """Fetches the SHA of a branch from Github""" - url = "https://api.github.com/repos/prefeitura-rio/queries-rj-smtr" - url += f"/commits?queries-rj-smtr/rj_smtr/{dataset_id}" - response = requests.get(url) - - if response.status_code != 200: - return None - - dataset_version = response.json()[0]["sha"] - return {"version": dataset_version} - - -@task -def get_run_dates( - date_range_start: str, date_range_end: str, day_datetime: datetime = None -) -> List: - """ - Generates a list of dates between date_range_start and date_range_end. - - Args: - date_range_start (str): the start date to create the date range - date_range_end (str): the end date to create the date range - day_datetime (datetime, Optional): a timestamp to use as run_date - if the range start or end is False - - Returns: - list: the list of run_dates - """ - if (date_range_start is False) or (date_range_end is False): - if day_datetime: - run_date = day_datetime.strftime("%Y-%m-%d") - else: - run_date = get_now_date.run() - dates = [{"run_date": run_date}] - else: - dates = [ - {"run_date": d.strftime("%Y-%m-%d")} - for d in pd.date_range(start=date_range_start, end=date_range_end) + for sub_idx, params in enumerate(param_list) ] - log(f"Will run the following dates: {dates}") - return dates - - -@task -def get_join_dict(dict_list: list, new_dict: dict) -> List: - """ - Updates a list of dictionaries with a new dictionary. - """ - for dict_temp in dict_list: - dict_temp.update(new_dict) - - log(f"get_join_dict: {dict_list}") - return dict_list - - -@task(checkpoint=False) -def get_previous_date(days): - """ - Returns the date of {days} days ago in YYYY-MM-DD. 
- """ - now = pendulum.now(pendulum.timezone("America/Sao_Paulo")).subtract(days=days) - return now.to_date_string() - - -############### -# -# Pretreat data -# -############### - - -@task(nout=2) -def transform_raw_to_nested_structure( - raw_filepath: str, - filepath: str, - error: str, - timestamp: datetime, - primary_key: list = None, -) -> tuple[str, str]: - """ - Task to transform raw data to nested structure - - Args: - raw_filepath (str): Path to the saved raw .json file - filepath (str): Path to the saved treated .csv file - error (str): Error catched from upstream tasks - timestamp (datetime): timestamp for flow run - primary_key (list, optional): Primary key to be used on nested structure - - Returns: - str: Error traceback - str: Path to the saved treated .csv file - """ - if error is None: - try: - # leitura do dado raw - error, data = read_raw_data(filepath=raw_filepath) - - if primary_key is None: - primary_key = [] - - log( - f""" - Received inputs: - - timestamp:\n{timestamp} - - data:\n{data.head()}""" + for run_id in runs_ids: + result = wait_subflow_run(flow_run_id=run_id) + flow_run_results.append(result) + + failed_message = "The following runs failed:" + flag_failed_runs = False + for res in flow_run_results: + if res.state.is_failed(): + flag_failed_runs = True + failed_message += "\n" + constants.FLOW_RUN_URL_PATTERN.value.format( + run_id=res.flow_run_id ) - # Check empty dataframe - if data.empty: - log("Empty dataframe, skipping transformation...") - - else: - log(f"Raw data:\n{data_info_str(data)}", level="info") - - log("Adding captured timestamp column...", level="info") - data["timestamp_captura"] = timestamp - - if "customFieldValues" not in data: - log("Striping string columns...", level="info") - for col in data.columns[data.dtypes == "object"].to_list(): - data[col] = data[col].str.strip() - - log(f"Finished cleaning! Data:\n{data_info_str(data)}", level="info") - - log("Creating nested structure...", level="info") - pk_cols = primary_key + ["timestamp_captura"] - data = ( - data.groupby(pk_cols) - .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records")) - .str.strip("[]") - .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] - ) - - log( - f"Finished nested structure! 
Data:\n{data_info_str(data)}", - level="info", - ) - - # save treated local - filepath = save_treated_local_func(data=data, error=error, filepath=filepath) - - except Exception: # pylint: disable=W0703 - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error, filepath - - -############### -# -# Utilitary tasks -# -############### - - -@task(checkpoint=False) -def coalesce_task(value_list: Iterable): - """ - Task to get the first non None value of a list - - Args: - value_list (Iterable): a iterable object with the values - Returns: - any: value_list's first non None item - """ - - try: - return next(value for value in value_list if value is not None) - except StopIteration: - return None - - -@task(checkpoint=False, nout=2) -def unpack_mapped_results_nout2( - mapped_results: Iterable, -) -> tuple[list[Any], list[Any]]: - """ - Task to unpack the results from an nout=2 tasks in 2 lists when it is mapped - - Args: - mapped_results (Iterable): The mapped task return - - Returns: - tuple[list[Any], list[Any]]: The task original return splited in 2 lists: - - 1st list being all the first return - - 2nd list being all the second return - - """ - return [r[0] for r in mapped_results], [r[1] for r in mapped_results] - - -@task -def check_mapped_query_logs_output(query_logs_output: list[tuple]) -> bool: - """ - Task to check if there is recaptures pending - - Args: - query_logs_output (list[tuple]): the return from a mapped query_logs execution - - Returns: - bool: True if there is recaptures to do, otherwise False - """ - - if len(query_logs_output) == 0: - return False - - recapture_list = [i[0] for i in query_logs_output] - return any(recapture_list) - - -@task -def get_scheduled_start_times( - timestamp: datetime, parameters: list, intervals: Union[None, dict] = None -): - """ - Task to get start times to schedule flows - - Args: - timestamp (datetime): initial flow run timestamp - parameters (list): parameters for the flow - intervals (Union[None, dict], optional): intervals between each flow run. Defaults to None. - Optionally, you can pass specific intervals for some table_ids. - Suggests to pass intervals based on previous table observed execution times. - Defaults to dict(default=timedelta(minutes=2)). - - Returns: - list[datetime]: list of scheduled start times - """ - - if intervals is None: - intervals = dict() - - if "default" not in intervals.keys(): - intervals["default"] = timedelta(minutes=2) - - timestamps = [None] - last_schedule = timestamp - - for param in parameters[1:]: - last_schedule += intervals.get(param.get("table_id", "default"), intervals["default"]) - timestamps.append(last_schedule) - - return timestamps - - -@task -def rename_current_flow_run_now_time(prefix: str, now_time=None, wait=None) -> None: - """ - Rename the current flow run. - """ - flow_run_id = prefect.context.get("flow_run_id") - client = Client() - return client.set_flow_run_name(flow_run_id, f"{prefix}{now_time}") - - -@prefect.task(checkpoint=False) -def get_now_time(): - """ - Returns the HH:MM. - """ - now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) - - return f"{now.hour}:{f'0{now.minute}' if len(str(now.minute))==1 else now.minute}" - - -@prefect.task(checkpoint=False) -def get_now_date(): - """ - Returns the current date in YYYY-MM-DD. 
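# A toy illustration of the nesting step performed by transform_raw_to_nested_structure
# above: rows are grouped by the primary key plus timestamp_captura and the remaining
# columns are packed into a JSON "content" column. The sample records are invented.
import pandas as pd

data = pd.DataFrame(
    {
        "id_veiculo": ["A1", "B2"],
        "latitude": [-22.90, -22.80],
        "longitude": [-43.20, -43.30],
        "timestamp_captura": ["2024-01-15 13:00:00", "2024-01-15 13:00:00"],
    }
)
pk_cols = ["id_veiculo", "timestamp_captura"]
nested = (
    data.groupby(pk_cols)
    .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
    .str.strip("[]")
    .reset_index(name="content")[["id_veiculo", "content", "timestamp_captura"]]
)
# nested["content"] now holds strings such as '{"latitude":-22.9,"longitude":-43.2}'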
- """ - now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) - - return now.to_date_string() - - -@task -def get_current_flow_mode(labels: List[str]) -> str: - """ - Get the mode (prod/dev/staging) of the current flow. - """ - if labels[0].endswith("-dev"): - return "dev" - if labels[0].endswith("-staging"): - return "staging" - return "prod" + if flag_failed_runs: + raise FailedSubFlow(failed_message) diff --git a/pipelines/templates/__init__.py b/pipelines/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/templates/backup/__init__.py b/pipelines/templates/backup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/templates/backup/tasks.py b/pipelines/templates/backup/tasks.py new file mode 100644 index 000000000..fd953cfbe --- /dev/null +++ b/pipelines/templates/backup/tasks.py @@ -0,0 +1,1485 @@ +# -*- coding: utf-8 -*- +# pylint: disable=W0703, W0511 +""" +Tasks for rj_smtr +""" +import io +import json +import os +import traceback +from datetime import date, datetime, timedelta +from pathlib import Path +from typing import Any, Dict, Iterable, List, Union + +import basedosdados as bd +import pandas as pd +import pendulum +import prefect +import requests +from basedosdados import Storage, Table +from prefect import Client, task +from prefect.backend import FlowRunView +from prefeitura_rio.pipelines_utils.dbt import run_dbt_model +from prefeitura_rio.pipelines_utils.infisical import inject_bd_credentials +from prefeitura_rio.pipelines_utils.logging import log +from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client +from pytz import timezone + +from pipelines.constants import constants +from pipelines.utils.backup.utils import ( # normalize_keys, + bq_project, + create_or_append_table, + data_info_str, + dict_contains_keys, + get_datetime_range, + get_last_run_timestamp, + get_raw_data_api, + get_raw_data_db, + get_raw_data_gcs, + get_raw_recursos, + get_table_min_max_value, + log_critical, + read_raw_data, + save_raw_local_func, + save_treated_local_func, + upload_run_logs_to_bq, +) +from pipelines.utils.secret import get_secret + + +############### +# +# SETUP +# +############### +@task +def setup_task(): + return inject_bd_credentials() + + +@task +def get_current_flow_labels() -> List[str]: + """ + Get the labels of the current flow. + """ + flow_run_id = prefect.context.get("flow_run_id") + flow_run_view = FlowRunView.from_flow_run_id(flow_run_id) + return flow_run_view.labels + + +############### +# +# DBT +# +############### + + +@task +def run_dbt_model_task( + dataset_id: str = None, + table_id: str = None, + dbt_alias: bool = False, + upstream: bool = None, + downstream: bool = None, + exclude: str = None, + flags: str = None, + _vars: dict | List[Dict] = None, +): + return run_dbt_model( + dataset_id=dataset_id, + table_id=table_id, + dbt_alias=dbt_alias, + upstream=upstream, + downstream=downstream, + exclude=exclude, + flags=flags, + _vars=_vars, + ) + + +@task(max_retries=3, retry_delay=timedelta(seconds=10)) +def build_incremental_model( # pylint: disable=too-many-arguments + dataset_id: str, + base_table_id: str, + mat_table_id: str, + field_name: str = "data_versao", + refresh: bool = False, + wait=None, # pylint: disable=unused-argument +): + """ + Utility task for backfilling table in predetermined steps. + Assumes the step sizes will be defined on the .sql file. 
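# A minimal sketch of calling the new run_subflow task (closed just above) from a parent
# flow, with flow_log used only to show its role. The parent flow, the registered subflow
# name and the parameter payloads are invented; project_name and labels are omitted so the
# current flow's values are reused, as documented in the task's docstring.
from prefect import Flow

from pipelines.tasks import flow_log, run_subflow

with Flow("exemplo_orquestrador") as exemplo_orquestrador:  # hypothetical parent flow
    logged = flow_log("triggering capture subflows")
    run_subflow(
        flow_name="SMTR: Captura - exemplo",   # placeholder subflow name
        parameters=[{"table_id": "transacao"}, {"table_id": "gps"}],  # invented payloads
        maximum_parallelism=2,                 # dispatch at most 2 parameter sets per batch
        upstream_tasks=[logged],
    )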
+ + Args: + dbt_client (DbtClient): DBT interface object + dataset_id (str): Dataset id on BigQuery + base_table_id (str): Base table from which to materialize (usually, an external table) + mat_table_id (str): Target table id for materialization + field_name (str, optional): Key field (column) for dbt incremental filters. + Defaults to "data_versao". + refresh (bool, optional): If True, rebuild the table from scratch. Defaults to False. + wait (NoneType, optional): Placeholder parameter, used to wait previous tasks finish. + Defaults to None. + + Returns: + bool: whether the table was fully built or not. + """ + + query_project_id = bq_project() + last_mat_date = get_table_min_max_value( + query_project_id, dataset_id, mat_table_id, field_name, "max" + ) + last_base_date = get_table_min_max_value( + query_project_id, dataset_id, base_table_id, field_name, "max" + ) + log( + f""" + Base table last version: {last_base_date} + Materialized table last version: {last_mat_date} + """ + ) + run_command = f"run --select models/{dataset_id}/{mat_table_id}.sql" + + if refresh: + log("Running in full refresh mode") + log(f"DBT will run the following command:\n{run_command+' --full-refresh'}") + run_dbt_model(dataset_id=dataset_id, table_id=mat_table_id, flags="--full-refresh") + last_mat_date = get_table_min_max_value( + query_project_id, dataset_id, mat_table_id, field_name, "max" + ) + + if last_base_date > last_mat_date: + log("Running interval step materialization") + log(f"DBT will run the following command:\n{run_command}") + while last_base_date > last_mat_date: + running = run_dbt_model(dataset_id=dataset_id, table_id=mat_table_id) + # running = dbt_client.cli(run_command, sync=True) + last_mat_date = get_table_min_max_value( + query_project_id, + dataset_id, + mat_table_id, + field_name, + "max", + wait=running, + ) + log(f"After this step, materialized table last version is: {last_mat_date}") + if last_mat_date == last_base_date: + log("Materialized table reached base table version!") + return True + log("Did not run interval step materialization...") + return False + + +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, + timestamp: datetime, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if not dbt_vars: + log("dbt_vars are blank. 
Skiping task...") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys(dbt_vars["date_range"], ["date_range_start", "date_range_end"]): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + if not table_id: + log("table_id are blank. Skiping task...") + return [None], None, False + + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + end_ts=timestamp, + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + date_range_start=dbt_vars["run_date"].get("date_range_start", False), + date_range_end=dbt_vars["run_date"].get("date_range_end", False), + day_datetime=timestamp, + ) + + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + elif "data_versao_gtfs" in dbt_vars.keys(): + log("Creating data_versao_gtfs variable") + + date_var = {"data_versao_gtfs": dbt_vars["data_versao_gtfs"]} + + final_vars.append(date_var.copy()) + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range + + +############### +# +# Local file management +# +############### + + +@task +def get_rounded_timestamp( + timestamp: Union[str, datetime, None] = None, + interval_minutes: Union[int, None] = None, +) -> datetime: + """ + Calculate rounded timestamp for flow run. + + Args: + timestamp (Union[str, datetime, None]): timestamp to be used as reference + interval_minutes (Union[int, None], optional): interval in minutes between each recapture + + Returns: + datetime: timestamp for flow run + """ + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp) + + if not timestamp: + timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + + timestamp = timestamp.replace(second=0, microsecond=0) + + if interval_minutes: + if interval_minutes >= 60: + hours = interval_minutes / 60 + interval_minutes = round(((hours) % 1) * 60) + + if interval_minutes == 0: + rounded_minutes = interval_minutes + else: + rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes + + timestamp = timestamp.replace(minute=rounded_minutes) + + return timestamp + + +@task +def get_current_timestamp( + timestamp=None, truncate_minute: bool = True, return_str: bool = False +) -> Union[datetime, str]: + """ + Get current timestamp for flow run. 
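# Worked example of the rounding performed by get_rounded_timestamp above; .run() is
# called directly only for illustration and the input timestamp is arbitrary.
from datetime import datetime

from pipelines.templates.backup.tasks import get_rounded_timestamp

ts = datetime(2024, 1, 15, 13, 47)
get_rounded_timestamp.run(timestamp=ts, interval_minutes=10)  # 2024-01-15 13:40 (floored to the 10-minute mark)
get_rounded_timestamp.run(timestamp=ts, interval_minutes=60)  # 2024-01-15 13:00 (whole hour)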
+
+    Args:
+        timestamp: timestamp to be used as reference (optionally, it can be a string)
+        truncate_minute: whether to truncate the timestamp to the minute or not
+        return_str: if True, the return will be an isoformatted datetime string;
+            otherwise it returns a datetime object
+
+    Returns:
+        Union[datetime, str]: timestamp for flow run
+    """
+    if isinstance(timestamp, str):
+        timestamp = datetime.fromisoformat(timestamp)
+    if not timestamp:
+        timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value))
+    if truncate_minute:
+        timestamp = timestamp.replace(second=0, microsecond=0)
+    if return_str:
+        timestamp = timestamp.isoformat()
+
+    return timestamp
+
+
+@task
+def create_date_hour_partition(
+    timestamp: datetime,
+    partition_date_name: str = "data",
+    partition_date_only: bool = False,
+) -> str:
+    """
+    Create a date (and hour) Hive partition structure from timestamp.
+
+    Args:
+        timestamp (datetime): timestamp to be used as reference
+        partition_date_name (str, optional): partition name. Defaults to "data".
+        partition_date_only (bool, optional): whether to add hour partition or not
+
+    Returns:
+        str: partition string
+    """
+    partition = f"{partition_date_name}={timestamp.strftime('%Y-%m-%d')}"
+    if not partition_date_only:
+        partition += f"/hora={timestamp.strftime('%H')}"
+    return partition
+
+
+@task
+def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") -> str:
+    """
+    Parse timestamp to string pattern.
+    """
+    return timestamp.strftime(pattern)
+
+
+@task
+def create_local_partition_path(
+    dataset_id: str, table_id: str, filename: str, partitions: str = None
+) -> str:
+    """
+    Create the full path structure in which to save data locally before
+    upload.
+
+    Args:
+        dataset_id (str): dataset_id on BigQuery
+        table_id (str): table_id on BigQuery
+        filename (str, optional): Single csv name
+        partitions (str, optional): Partitioned directory structure, i.e. "ano=2022/mes=03/data=01"
+    Returns:
+        str: String path having `mode` and `filetype` to be replaced afterwards,
+            either to save raw or staging files.
+    """
+    data_folder = os.getenv("DATA_FOLDER", "data")
+    file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}"
+    file_path += f"/{partitions}/{filename}.{{filetype}}"
+    log(f"Creating file path: {file_path}")
+    return file_path
+
+
+@task
+def save_raw_local(file_path: str, status: dict, mode: str = "raw") -> str:
+    """
+    Saves json response from API to .json file.
+    Args:
+        file_path (str): Path in which to save the raw file
+        status (dict): Must contain keys
+            * data: json returned from API
+            * error: error caught from the API request
+        mode (str, optional): Folder to save locally, and later the folder to upload to GCS.
+    Returns:
+        str: Path to the saved file
+    """
+    _file_path = file_path.format(mode=mode, filetype="json")
+    Path(_file_path).parent.mkdir(parents=True, exist_ok=True)
+    if status["error"] is None:
+        json.dump(status["data"], Path(_file_path).open("w", encoding="utf-8"))
+        log(f"Raw data saved to: {_file_path}")
+    return _file_path
+
+
+@task
+def save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str:
+    """
+    Save treated file to CSV.
+
+    Args:
+        file_path (str): Path in which to save the treated file
+        status (dict): Must contain keys
+            * `data`: dataframe returned from treatment
+            * `error`: error caught from data treatment
+        mode (str, optional): Folder to save locally, and later the folder to upload to GCS.
+ + Returns: + str: Path to the saved file + """ + + log(f"Saving treated data to: {file_path}, {status}") + + _file_path = file_path.format(mode=mode, filetype="csv") + + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + + return _file_path + + +############### +# +# Extract data +# +############### +@task(nout=3, max_retries=3, retry_delay=timedelta(seconds=5)) +def query_logs( + dataset_id: str, + table_id: str, + datetime_filter=None, + max_recaptures: int = 90, + interval_minutes: int = 1, + recapture_window_days: int = 1, +): + """ + Queries capture logs to check for errors + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + datetime_filter (pendulum.datetime.DateTime, optional): + filter passed to query. This task will query the logs table + for the last n (n = recapture_window_days) days before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros + + Returns: + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) + """ + + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace(second=0, microsecond=0) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace(second=0, microsecond=0) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL {recapture_window_days} day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) + AND '{datetime_filter}' + ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + + if len(results) > 0: + results = results.sort_values(["timestamp_captura"]) + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps 
para serem recapturadas.
+            Essa run processará as seguintes:
+            #####
+            {results[:max_recaptures]}
+            #####
+            Sobraram as seguintes para serem recapturadas na próxima run:
+            #####
+            {results[max_recaptures:]}
+            #####
+            """
+            log_critical(message)
+            results = results[:max_recaptures]
+        return True, results["timestamp_captura"].to_list(), results["erro"].to_list()
+    return False, [], []
+
+
+@task
+def get_raw(  # pylint: disable=R0912
+    url: str,
+    headers: str = None,
+    filetype: str = "json",
+    csv_args: dict = None,
+    params: dict = None,
+) -> Dict:
+    """
+    Request data from URL API
+
+    Args:
+        url (str): URL to send request
+        headers (str, optional): Path to headers guarded on Vault, if needed.
+        filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt)
+        csv_args (dict, optional): Arguments for read_csv, if needed
+        params (dict, optional): Params to be sent on request
+
+    Returns:
+        dict: Containing keys
+          * `data` (json): data result
+          * `error` (str): caught error, if any. Otherwise, returns None
+    """
+    data = None
+    error = None
+
+    try:
+        if headers is not None:
+            headers = get_secret(secret_path=headers)
+            # remove from headers, if present
+            remove_headers = ["host", "databases"]
+            for remove_header in remove_headers:
+                if remove_header in list(headers.keys()):
+                    del headers[remove_header]
+
+        response = requests.get(
+            url,
+            headers=headers,
+            timeout=constants.MAX_TIMEOUT_SECONDS.value,
+            params=params,
+        )
+
+        if response.ok:  # status code is less than 400
+            if filetype == "json":
+                data = response.json()
+
+                # todo: move to data check on specific API # pylint: disable=W0102
+                if isinstance(data, dict) and "DescricaoErro" in data.keys():
+                    error = data["DescricaoErro"]
+
+            elif filetype in ("txt", "csv"):
+                if csv_args is None:
+                    csv_args = {}
+                data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict(orient="records")
+            else:
+                error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +@task(checkpoint=False, nout=2) +def create_request_params( + extract_params: dict, + table_id: str, + dataset_id: str, + timestamp: datetime, + interval_minutes: int, +) -> tuple[str, str]: + """ + Task to create request params + + Args: + extract_params (dict): extract parameters + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + timestamp (datetime): timestamp for flow run + interval_minutes (int): interval in minutes between each capture + + Returns: + request_params: host, database and query to request data + request_url: url to request data + """ + request_params = None + request_url = None + + if dataset_id == constants.BILHETAGEM_DATASET_ID.value: + database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ + extract_params["database"] + ] + request_url = database["host"] + + datetime_range = get_datetime_range( + timestamp=timestamp, interval=timedelta(minutes=interval_minutes) + ) + + request_params = { + "database": extract_params["database"], + "engine": database["engine"], + "query": extract_params["query"].format(**datetime_range), + } + + elif dataset_id == constants.GTFS_DATASET_ID.value: + request_params = extract_params["filename"] + + elif dataset_id == constants.SUBSIDIO_SPPO_RECURSOS_DATASET_ID.value: + extract_params["token"] = get_secret( + secret_path=constants.SUBSIDIO_SPPO_RECURSO_API_SECRET_PATH.value + )["token"] + start = datetime.strftime( + timestamp - timedelta(minutes=interval_minutes), "%Y-%m-%dT%H:%M:%S.%MZ" + ) + end = datetime.strftime(timestamp, "%Y-%m-%dT%H:%M:%S.%MZ") + log(f" Start date {start}, end date {end}") + recurso_params = { + "dates": f"createdDate ge {start} and createdDate le {end}", + "service": constants.SUBSIDIO_SPPO_RECURSO_SERVICE.value, + } + extract_params["$filter"] = extract_params["$filter"].format(**recurso_params) + request_params = extract_params + + request_url = constants.SUBSIDIO_SPPO_RECURSO_API_BASE_URL.value + + return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + dataset_id: str = None, + table_id: str = None, + secret_path: str = None, + request_params: dict = None, +) -> tuple[str, str]: + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
+ + Returns: + error: error catched from upstream tasks + filepath: filepath to raw data + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = source_values if len(source_values) == 2 else (source_values[0], None) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=request_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params + ) + elif source_type == "db": + error, data, filetype = get_raw_data_db( + host=source_path, secret_path=secret_path, **request_params + ) + elif source_type == "movidesk": + error, data, filetype = get_raw_recursos( + request_url=source_path, request_params=request_params + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func(data=data, filepath=local_filepath, filetype=filetype) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + +############### +# +# Load data +# +############### + + +@task +def bq_upload( + dataset_id: str, + table_id: str, + filepath: str, + raw_filepath: str = None, + partitions: str = None, + status: dict = None, +): # pylint: disable=R0913 + """ + Upload raw and treated data to GCS and BigQuery. + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + filepath (str): Path to the saved treated .csv file + raw_filepath (str, optional): Path to raw .json file. Defaults to None. + partitions (str, optional): Partitioned directory structure, ie "ano=2022/mes=03/data=01". + Defaults to None. + status (dict, optional): Dict containing `error` key from + upstream tasks. + + Returns: + None + """ + log( + f""" + Received inputs: + raw_filepath = {raw_filepath}, type = {type(raw_filepath)} + treated_filepath = {filepath}, type = {type(filepath)} + dataset_id = {dataset_id}, type = {type(dataset_id)} + table_id = {table_id}, type = {type(table_id)} + partitions = {partitions}, type = {type(partitions)} + """ + ) + if status["error"] is not None: + return status["error"] + + error = None + + try: + # Upload raw to staging + if raw_filepath: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def bq_upload_from_dict(paths: dict, dataset_id: str, partition_levels: int = 1): + """Upload multiple tables from a dict structured as {table_id: csv_path}. + Present use case assumes table partitioned once. Adjust the parameter + 'partition_levels' to best suit new uses. + i.e. if your csv is saved as: + /date=/.csv + it has 1 level of partition. 
+ if your csv file is saved as: + /date=/hour=/.csv + it has 2 levels of partition + + Args: + paths (dict): _description_ + dataset_id (str): _description_ + + Returns: + _type_: _description_ + """ + for key in paths.keys(): + log("#" * 80) + log(f"KEY = {key}") + tb_dir = paths[key].parent + # climb up the partition directories to reach the table dir + for i in range(partition_levels): # pylint: disable=unused-variable + tb_dir = tb_dir.parent + log(f"tb_dir = {tb_dir}") + create_or_append_table(dataset_id=dataset_id, table_id=key, path=tb_dir) + + log(f"Returning -> {tb_dir.parent}") + + return tb_dir.parent + + +@task +def upload_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path(f"""data/staging/{dataset_id}/{table_id}/{partition}/{filename}.csv""") + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +@task +def upload_raw_data_to_gcs( + error: str, + raw_filepath: str, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload raw data to GCS. + + Args: + error (str): Error catched from upstream tasks. 
+ raw_filepath (str): Path to the saved raw .json file + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def upload_staging_data_to_gcs( + error: str, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, + previous_error: str = None, + recapture: bool = False, +) -> Union[str, None]: + """ + Upload staging data to GCS. + + Args: + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=staging_filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging", + previous_error=previous_error, + recapture=recapture, + ) + + return error + + +############### +# +# Daterange tasks +# +############### + + +@task( + checkpoint=False, + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def get_materialization_date_range( # pylint: disable=R0913 + dataset_id: str, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + table_run_datetime_column_name: str = None, + mode: str = "prod", + delay_hours: int = 0, + end_ts: datetime = None, +): + """ + Task for generating dict with variables to be passed to the + --vars argument on DBT. + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): model filename on the queries repo. + eg: if you have a model defined in the file .sql, + the table_id should be + table_date_column_name (Optional, str): if it's the first time this + is ran, will query the table for the maximum value on this field. + If rebuild is true, will query the table for the minimum value + on this field. 
+        rebuild (Optional, bool): if true, queries the minimum date value on the
+            table and returns a date range from that value to the datetime.now() time
+        delay_hours (Optional, int): hours delayed from now time for materialization range
+        end_ts (Optional, datetime): date range's final date
+    Returns:
+        dict: containing date_range_start and date_range_end
+    """
+    timestr = "%Y-%m-%dT%H:%M:%S"
+    # get start from redis
+    last_run = get_last_run_timestamp(dataset_id=dataset_id, table_id=table_id, mode=mode)
+    # if there's no timestamp set on redis, get max timestamp on source table
+    if last_run is None:
+        log("Failed to fetch key from Redis...\n Querying tables for last succeeded run")
+        if Table(dataset_id=dataset_id, table_id=table_id).table_exists("prod"):
+            last_run = get_table_min_max_value(
+                query_project_id=bq_project(),
+                dataset_id=dataset_id,
+                table_id=table_id,
+                field_name=table_run_datetime_column_name,
+                kind="max",
+            )
+            log(
+                f"""
+            Queried last run from {dataset_id}.{table_id}
+            Got:
+            {last_run} as type {type(last_run)}
+            """
+            )
+        else:
+            last_run = get_table_min_max_value(
+                query_project_id=bq_project(),
+                dataset_id=raw_dataset_id,
+                table_id=raw_table_id,
+                field_name=table_run_datetime_column_name,
+                kind="max",
+            )
+            log(
+                f"""
+            Queried last run from {raw_dataset_id}.{raw_table_id}
+            Got:
+            {last_run} as type {type(last_run)}
+            """
+            )
+    else:
+        last_run = datetime.strptime(last_run, timestr)
+
+    if (not isinstance(last_run, datetime)) and (isinstance(last_run, date)):
+        last_run = datetime(last_run.year, last_run.month, last_run.day)
+
+    # set start to last run hour (H)
+    start_ts = last_run.replace(minute=0, second=0, microsecond=0).strftime(timestr)
+
+    # set end to now - delay
+
+    if not end_ts:
+        end_ts = pendulum.now(constants.TIMEZONE.value).replace(
+            tzinfo=None, minute=0, second=0, microsecond=0
+        )
+
+    end_ts = (end_ts - timedelta(hours=delay_hours)).replace(minute=0, second=0, microsecond=0)
+
+    end_ts = end_ts.strftime(timestr)
+
+    date_range = {"date_range_start": start_ts, "date_range_end": end_ts}
+    log(f"Got date_range as: {date_range}")
+    return date_range
+
+
+@task
+def set_last_run_timestamp(
+    dataset_id: str, table_id: str, timestamp: str, mode: str = "prod", wait=None
+):  # pylint: disable=unused-argument
+    """
+    Set the `last_run_timestamp` key for the dataset_id/table_id pair
+    to the given timestamp. Used after running a materialization to set the
+    stage for the next one to come
+
+    Args:
+        dataset_id (str): dataset_id on BigQuery
+        table_id (str): model filename on the queries repo.
+        timestamp: Last run timestamp end.
+        wait (Any, optional): Used for defining dependencies inside the flow,
+            in general, pass the output of the task which should be run immediately
+            before this. Defaults to None.
+
+    Returns:
+        bool: True once the timestamp is saved on Redis
+    """
+    log(f"Saving timestamp {timestamp} on Redis for {dataset_id}.{table_id}")
+    redis_client = get_redis_client()
+    key = dataset_id + "." + table_id
+    if mode == "dev":
+        key = f"{mode}.{key}"
+    content = redis_client.get(key)
+    if not content:
+        content = {}
+    content["last_run_timestamp"] = timestamp
+    redis_client.set(key, content)
+    return True
+
+
+@task
+def delay_now_time(timestamp: str, delay_minutes=6):
+    """Return timestamp string delayed by `delay_minutes`.
+
+    Args:
+        timestamp (str): Isoformat timestamp string
+        delay_minutes (int, optional): Minutes to delay timestamp by. Defaults to 6.
+ + Returns: + str : timestamp string formatted as "%Y-%m-%dT%H-%M-%S" + """ + ts_obj = datetime.fromisoformat(timestamp) + ts_obj = ts_obj - timedelta(minutes=delay_minutes) + return ts_obj.strftime("%Y-%m-%dT%H-%M-%S") + + +@task +def fetch_dataset_sha(dataset_id: str): + """Fetches the SHA of a branch from Github""" + url = "https://api.github.com/repos/prefeitura-rio/queries-rj-smtr" + url += f"/commits?queries-rj-smtr/rj_smtr/{dataset_id}" + response = requests.get(url) + + if response.status_code != 200: + return None + + dataset_version = response.json()[0]["sha"] + return {"version": dataset_version} + + +@task +def get_run_dates( + date_range_start: str, date_range_end: str, day_datetime: datetime = None +) -> List: + """ + Generates a list of dates between date_range_start and date_range_end. + + Args: + date_range_start (str): the start date to create the date range + date_range_end (str): the end date to create the date range + day_datetime (datetime, Optional): a timestamp to use as run_date + if the range start or end is False + + Returns: + list: the list of run_dates + """ + if (date_range_start is False) or (date_range_end is False): + if day_datetime: + run_date = day_datetime.strftime("%Y-%m-%d") + else: + run_date = get_now_date.run() + dates = [{"run_date": run_date}] + else: + dates = [ + {"run_date": d.strftime("%Y-%m-%d")} + for d in pd.date_range(start=date_range_start, end=date_range_end) + ] + log(f"Will run the following dates: {dates}") + return dates + + +@task +def get_join_dict(dict_list: list, new_dict: dict) -> List: + """ + Updates a list of dictionaries with a new dictionary. + """ + for dict_temp in dict_list: + dict_temp.update(new_dict) + + log(f"get_join_dict: {dict_list}") + return dict_list + + +@task(checkpoint=False) +def get_previous_date(days): + """ + Returns the date of {days} days ago in YYYY-MM-DD. + """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")).subtract(days=days) + + return now.to_date_string() + + +############### +# +# Pretreat data +# +############### + + +@task(nout=2) +def transform_raw_to_nested_structure( + raw_filepath: str, + filepath: str, + error: str, + timestamp: datetime, + primary_key: list = None, +) -> tuple[str, str]: + """ + Task to transform raw data to nested structure + + Args: + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure + + Returns: + str: Error traceback + str: Path to the saved treated .csv file + """ + if error is None: + try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + + if primary_key is None: + primary_key = [] + + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) + + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") + + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp + + if "customFieldValues" not in data: + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() + + log(f"Finished cleaning! 
Data:\n{data_info_str(data)}", level="info")
+
+            log("Creating nested structure...", level="info")
+            pk_cols = primary_key + ["timestamp_captura"]
+            data = (
+                data.groupby(pk_cols)
+                .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records"))
+                .str.strip("[]")
+                .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]]
+            )
+
+            log(
+                f"Finished nested structure! Data:\n{data_info_str(data)}",
+                level="info",
+            )
+
+            # save treated local
+            filepath = save_treated_local_func(data=data, error=error, filepath=filepath)
+
+        except Exception:  # pylint: disable=W0703
+            error = traceback.format_exc()
+            log(f"[CATCHED] Task failed with error: \n{error}", level="error")
+
+    return error, filepath
+
+
+###############
+#
+# Utility tasks
+#
+###############
+
+
+@task(checkpoint=False)
+def coalesce_task(value_list: Iterable):
+    """
+    Task to get the first non-None value of a list
+
+    Args:
+        value_list (Iterable): an iterable object with the values
+    Returns:
+        any: value_list's first non-None item
+    """
+
+    try:
+        return next(value for value in value_list if value is not None)
+    except StopIteration:
+        return None
+
+
+@task(checkpoint=False, nout=2)
+def unpack_mapped_results_nout2(
+    mapped_results: Iterable,
+) -> tuple[list[Any], list[Any]]:
+    """
+    Task to unpack the results from an nout=2 task into 2 lists when it is mapped
+
+    Args:
+        mapped_results (Iterable): The mapped task return
+
+    Returns:
+        tuple[list[Any], list[Any]]: The task's original return split into 2 lists:
+            - 1st list being all the first returns
+            - 2nd list being all the second returns
+
+    """
+    return [r[0] for r in mapped_results], [r[1] for r in mapped_results]
+
+
+@task
+def check_mapped_query_logs_output(query_logs_output: list[tuple]) -> bool:
+    """
+    Task to check if there are recaptures pending
+
+    Args:
+        query_logs_output (list[tuple]): the return from a mapped query_logs execution
+
+    Returns:
+        bool: True if there are recaptures to do, otherwise False
+    """
+
+    if len(query_logs_output) == 0:
+        return False
+
+    recapture_list = [i[0] for i in query_logs_output]
+    return any(recapture_list)
+
+
+@task
+def get_scheduled_start_times(
+    timestamp: datetime, parameters: list, intervals: Union[None, dict] = None
+):
+    """
+    Task to get start times to schedule flows
+
+    Args:
+        timestamp (datetime): initial flow run timestamp
+        parameters (list): parameters for the flow
+        intervals (Union[None, dict], optional): intervals between each flow run. Defaults to None.
+            Optionally, you can pass specific intervals for some table_ids.
+            It is suggested to pass intervals based on previously observed execution times
+            for each table. Defaults to dict(default=timedelta(minutes=2)).
+
+    Returns:
+        list[datetime]: list of scheduled start times
+    """
+
+    if intervals is None:
+        intervals = dict()
+
+    if "default" not in intervals.keys():
+        intervals["default"] = timedelta(minutes=2)
+
+    timestamps = [None]
+    last_schedule = timestamp
+
+    for param in parameters[1:]:
+        last_schedule += intervals.get(param.get("table_id", "default"), intervals["default"])
+        timestamps.append(last_schedule)
+
+    return timestamps
+
+
+@task
+def rename_current_flow_run_now_time(prefix: str, now_time=None, wait=None) -> None:
+    """
+    Rename the current flow run.
+    """
+    flow_run_id = prefect.context.get("flow_run_id")
+    client = Client()
+    return client.set_flow_run_name(flow_run_id, f"{prefix}{now_time}")
+
+
+@prefect.task(checkpoint=False)
+def get_now_time():
+    """
+    Returns the current time as HH:MM.
+ """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) + + return f"{now.hour}:{f'0{now.minute}' if len(str(now.minute))==1 else now.minute}" + + +@prefect.task(checkpoint=False) +def get_now_date(): + """ + Returns the current date in YYYY-MM-DD. + """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) + + return now.to_date_string() + + +@task +def get_current_flow_mode(labels: List[str]) -> str: + """ + Get the mode (prod/dev/staging) of the current flow. + """ + if labels[0].endswith("-dev"): + return "dev" + if labels[0].endswith("-staging"): + return "staging" + return "prod" diff --git a/pipelines/treatment/__init__.py b/pipelines/treatment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/treatment/bilhetagem/__init__.py b/pipelines/treatment/bilhetagem/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/treatment/bilhetagem/flows.py b/pipelines/treatment/bilhetagem/flows.py new file mode 100644 index 000000000..4c79580c1 --- /dev/null +++ b/pipelines/treatment/bilhetagem/flows.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +"""Flows de tratamento da bilhetagem""" +from datetime import timedelta + +from prefect.run_configs import KubernetesRun +from prefect.storage import GCS +from prefeitura_rio.pipelines_utils.custom import Flow +from prefeitura_rio.pipelines_utils.state_handlers import ( + handler_inject_bd_credentials, + handler_skip_if_running, +) + +from pipelines.capture.jae.constants import constants as jae_capture_constants +from pipelines.capture.jae.flows import JAE_AUXILIAR_CAPTURE +from pipelines.constants import constants +from pipelines.schedules import generate_interval_schedule +from pipelines.tasks import run_subflow + +with Flow("Bilhetagem - Tratamento") as bilhetagem_tratamento: + + AUXILIAR_CAPTURE = run_subflow( + flow_name=JAE_AUXILIAR_CAPTURE.name, + parameters=jae_capture_constants.AUXILIAR_TABLE_CAPTURE_PARAMS.value, + maximum_parallelism=3, + ) + + AUXILIAR_CAPTURE.name = "run_captura_auxiliar_jae" + + +bilhetagem_tratamento.storage = GCS(constants.GCS_FLOWS_BUCKET.value) +bilhetagem_tratamento.run_config = KubernetesRun( + image=constants.DOCKER_IMAGE.value, + labels=[constants.RJ_SMTR_AGENT_LABEL.value], +) + +bilhetagem_tratamento.state_handlers = [ + handler_inject_bd_credentials, + handler_skip_if_running, +] + +bilhetagem_tratamento.schedule = generate_interval_schedule( + interval=timedelta(hours=1), + agent_label=constants.RJ_SMTR_AGENT_LABEL.value, +) diff --git a/pipelines/treatment/templates/__init__.py b/pipelines/treatment/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/treatment/templates/flows.py b/pipelines/treatment/templates/flows.py new file mode 100644 index 000000000..1c058eb58 --- /dev/null +++ b/pipelines/treatment/templates/flows.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +"""Flows de Tratamento de dados Genéricos""" diff --git a/pipelines/treatment/templates/tasks.py b/pipelines/treatment/templates/tasks.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/utils/backup/__init__.py b/pipelines/utils/backup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/utils/backup/tasks.py b/pipelines/utils/backup/tasks.py new file mode 100644 index 000000000..2474448a3 --- /dev/null +++ b/pipelines/utils/backup/tasks.py @@ -0,0 +1,1485 @@ +# -*- coding: utf-8 -*- +# pylint: disable=W0703, W0511 +""" +Tasks for rj_smtr +""" +import io +import json +import os 
+import traceback +from datetime import date, datetime, timedelta +from pathlib import Path +from typing import Any, Dict, Iterable, List, Union + +import basedosdados as bd +import pandas as pd +import pendulum +import prefect +import requests +from basedosdados import Storage, Table +from prefect import Client, task +from prefect.backend import FlowRunView +from prefeitura_rio.pipelines_utils.dbt import run_dbt_model as run_dbt_model_func +from prefeitura_rio.pipelines_utils.infisical import inject_bd_credentials +from prefeitura_rio.pipelines_utils.logging import log +from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client +from pytz import timezone + +from pipelines.constants import constants +from pipelines.utils.backup.utils import ( + bq_project, + create_or_append_table, + data_info_str, + dict_contains_keys, + get_last_run_timestamp, + get_raw_data_api, + get_raw_data_db, + get_raw_data_gcs, + get_raw_recursos, + get_table_min_max_value, + log_critical, + read_raw_data, + save_raw_local_func, + save_treated_local_func, + upload_run_logs_to_bq, +) +from pipelines.utils.secret import get_secret + + +############### +# +# SETUP +# +############### +@task +def setup_task(): + return inject_bd_credentials() + + +@task +def get_current_flow_labels() -> List[str]: + """ + Get the labels of the current flow. + """ + flow_run_id = prefect.context.get("flow_run_id") + flow_run_view = FlowRunView.from_flow_run_id(flow_run_id) + return flow_run_view.labels + + +############### +# +# DBT +# +############### + + +@task +def run_dbt_model( + dataset_id: str = None, + table_id: str = None, + dbt_alias: bool = False, + upstream: bool = None, + downstream: bool = None, + exclude: str = None, + flags: str = None, + _vars: dict | List[Dict] = None, +): + return run_dbt_model_func( + dataset_id=dataset_id, + table_id=table_id, + dbt_alias=dbt_alias, + upstream=upstream, + downstream=downstream, + exclude=exclude, + flags=flags, + _vars=_vars, + ) + + +# @task(max_retries=3, retry_delay=timedelta(seconds=10)) +# def build_incremental_model( # pylint: disable=too-many-arguments +# dataset_id: str, +# base_table_id: str, +# mat_table_id: str, +# field_name: str = "data_versao", +# refresh: bool = False, +# wait=None, # pylint: disable=unused-argument +# ): +# """ +# Utility task for backfilling table in predetermined steps. +# Assumes the step sizes will be defined on the .sql file. + +# Args: +# dbt_client (DbtClient): DBT interface object +# dataset_id (str): Dataset id on BigQuery +# base_table_id (str): Base table from which to materialize (usually, an external table) +# mat_table_id (str): Target table id for materialization +# field_name (str, optional): Key field (column) for dbt incremental filters. +# Defaults to "data_versao". +# refresh (bool, optional): If True, rebuild the table from scratch. Defaults to False. +# wait (NoneType, optional): Placeholder parameter, used to wait previous tasks finish. +# Defaults to None. + +# Returns: +# bool: whether the table was fully built or not. 
+# """ + +# query_project_id = bq_project() +# last_mat_date = get_table_min_max_value( +# query_project_id, dataset_id, mat_table_id, field_name, "max" +# ) +# last_base_date = get_table_min_max_value( +# query_project_id, dataset_id, base_table_id, field_name, "max" +# ) +# log( +# f""" +# Base table last version: {last_base_date} +# Materialized table last version: {last_mat_date} +# """ +# ) +# run_command = f"run --select models/{dataset_id}/{mat_table_id}.sql" + +# if refresh: +# log("Running in full refresh mode") +# log(f"DBT will run the following command:\n{run_command+' --full-refresh'}") +# run_dbt_model_func(dataset_id=dataset_id, table_id=mat_table_id, flags="--full-refresh") +# last_mat_date = get_table_min_max_value( +# query_project_id, dataset_id, mat_table_id, field_name, "max" +# ) + +# if last_base_date > last_mat_date: +# log("Running interval step materialization") +# log(f"DBT will run the following command:\n{run_command}") +# while last_base_date > last_mat_date: +# running = run_dbt_model_func(dataset_id=dataset_id, table_id=mat_table_id) +# # running = dbt_client.cli(run_command, sync=True) +# last_mat_date = get_table_min_max_value( +# query_project_id, +# dataset_id, +# mat_table_id, +# field_name, +# "max", +# wait=running, +# ) +# log(f"After this step, materialized table last version is: {last_mat_date}") +# if last_mat_date == last_base_date: +# log("Materialized table reached base table version!") +# return True +# log("Did not run interval step materialization...") +# return False + + +@task(checkpoint=False, nout=3) +def create_dbt_run_vars( + dataset_id: str, + dbt_vars: dict, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + mode: str, + timestamp: datetime, +) -> tuple[list[dict], Union[list[dict], dict, None], bool]: + """ + Create the variables to be used in dbt materialization based on a dict + + Args: + dataset_id (str): the dataset_id to get the variables + dbt_vars (dict): dict containing the parameters + table_id (str): the table_id get the date_range variable + raw_dataset_id (str): the raw_dataset_id get the date_range variable + raw_table_id (str): the raw_table_id get the date_range variable + mode (str): the mode to get the date_range variable + + Returns: + list[dict]: the variables to be used in DBT + Union[list[dict], dict, None]: the date variable (date_range or run_date) + bool: a flag that indicates if the date_range variable came from Redis + """ + + log(f"Creating DBT variables. Parameter received: {dbt_vars}") + + if not dbt_vars: + log("dbt_vars are blank. Skiping task...") + return [None], None, False + + final_vars = [] + date_var = None + flag_date_range = False + + if "date_range" in dbt_vars.keys(): + log("Creating date_range variable") + + # Set date_range variable manually + if dict_contains_keys(dbt_vars["date_range"], ["date_range_start", "date_range_end"]): + date_var = { + "date_range_start": dbt_vars["date_range"]["date_range_start"], + "date_range_end": dbt_vars["date_range"]["date_range_end"], + } + # Create date_range using Redis + else: + if not table_id: + log("table_id are blank. 
Skiping task...") + return [None], None, False + + raw_table_id = raw_table_id or table_id + + date_var = get_materialization_date_range.run( + dataset_id=dataset_id, + table_id=table_id, + raw_dataset_id=raw_dataset_id, + raw_table_id=raw_table_id, + table_run_datetime_column_name=dbt_vars["date_range"].get( + "table_run_datetime_column_name" + ), + mode=mode, + delay_hours=dbt_vars["date_range"].get("delay_hours", 0), + end_ts=timestamp, + ) + + flag_date_range = True + + final_vars.append(date_var.copy()) + + log(f"date_range created: {date_var}") + + elif "run_date" in dbt_vars.keys(): + log("Creating run_date variable") + + date_var = get_run_dates.run( + date_range_start=dbt_vars["run_date"].get("date_range_start", False), + date_range_end=dbt_vars["run_date"].get("date_range_end", False), + day_datetime=timestamp, + ) + + final_vars.append([d.copy() for d in date_var]) + + log(f"run_date created: {date_var}") + + elif "data_versao_gtfs" in dbt_vars.keys(): + log("Creating data_versao_gtfs variable") + + date_var = {"data_versao_gtfs": dbt_vars["data_versao_gtfs"]} + + final_vars.append(date_var.copy()) + + if "version" in dbt_vars.keys(): + log("Creating version variable") + dataset_sha = fetch_dataset_sha.run(dataset_id=dataset_id) + + # if there are other variables inside the list, update each item adding the version variable + if final_vars: + final_vars = get_join_dict.run(dict_list=final_vars, new_dict=dataset_sha) + else: + final_vars.append(dataset_sha) + + log(f"version created: {dataset_sha}") + + log(f"All variables was created, final value is: {final_vars}") + + return final_vars, date_var, flag_date_range + + +############### +# +# Local file management +# +############### + + +@task +def get_rounded_timestamp( + timestamp: Union[str, datetime, None] = None, + interval_minutes: Union[int, None] = None, +) -> datetime: + """ + Calculate rounded timestamp for flow run. + + Args: + timestamp (Union[str, datetime, None]): timestamp to be used as reference + interval_minutes (Union[int, None], optional): interval in minutes between each recapture + + Returns: + datetime: timestamp for flow run + """ + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp) + + if not timestamp: + timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + + timestamp = timestamp.replace(second=0, microsecond=0) + + if interval_minutes: + if interval_minutes >= 60: + hours = interval_minutes / 60 + interval_minutes = round(((hours) % 1) * 60) + + if interval_minutes == 0: + rounded_minutes = interval_minutes + else: + rounded_minutes = (timestamp.minute // interval_minutes) * interval_minutes + + timestamp = timestamp.replace(minute=rounded_minutes) + + return timestamp + + +@task +def get_current_timestamp( + timestamp=None, truncate_minute: bool = True, return_str: bool = False +) -> Union[datetime, str]: + """ + Get current timestamp for flow run. 
+ + Args: + timestamp: timestamp to be used as reference (optionally, it can be a string) + truncate_minute: whether to truncate the timestamp to the minute or not + return_str: if True, the return will be an isoformatted datetime string + otherwise it returns a datetime object + + Returns: + Union[datetime, str]: timestamp for flow run + """ + if isinstance(timestamp, str): + timestamp = datetime.fromisoformat(timestamp) + if not timestamp: + timestamp = datetime.now(tz=timezone(constants.TIMEZONE.value)) + if truncate_minute: + timestamp = timestamp.replace(second=0, microsecond=0) + if return_str: + timestamp = timestamp.isoformat() + + return timestamp + + +@task +def create_date_hour_partition( + timestamp: datetime, + partition_date_name: str = "data", + partition_date_only: bool = False, +) -> str: + """ + Create a date (and hour) Hive partition structure from timestamp. + + Args: + timestamp (datetime): timestamp to be used as reference + partition_date_name (str, optional): partition name. Defaults to "data". + partition_date_only (bool, optional): whether to add hour partition or not + + Returns: + str: partition string + """ + partition = f"{partition_date_name}={timestamp.strftime('%Y-%m-%d')}" + if not partition_date_only: + partition += f"/hora={timestamp.strftime('%H')}" + return partition + + +@task +def parse_timestamp_to_string(timestamp: datetime, pattern="%Y-%m-%d-%H-%M-%S") -> str: + """ + Parse timestamp to string pattern. + """ + return timestamp.strftime(pattern) + + +@task +def create_local_partition_path( + dataset_id: str, table_id: str, filename: str, partitions: str = None +) -> str: + """ + Create the full path sctructure which to save data locally before + upload. + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + filename (str, optional): Single csv name + partitions (str, optional): Partitioned directory structure, ie "ano=2022/mes=03/data=01" + Returns: + str: String path having `mode` and `filetype` to be replaced afterwards, + either to save raw or staging files. + """ + data_folder = os.getenv("DATA_FOLDER", "data") + file_path = f"{os.getcwd()}/{data_folder}/{{mode}}/{dataset_id}/{table_id}" + file_path += f"/{partitions}/{filename}.{{filetype}}" + log(f"Creating file path: {file_path}") + return file_path + + +@task +def save_raw_local(file_path: str, status: dict, mode: str = "raw") -> str: + """ + Saves json response from API to .json file. + Args: + file_path (str): Path which to save raw file + status (dict): Must contain keys + * data: json returned from API + * error: error catched from API request + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + Returns: + str: Path to the saved file + """ + _file_path = file_path.format(mode=mode, filetype="json") + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + json.dump(status["data"], Path(_file_path).open("w", encoding="utf-8")) + log(f"Raw data saved to: {_file_path}") + return _file_path + + +@task +def save_treated_local(file_path: str, status: dict, mode: str = "staging") -> str: + """ + Save treated file to CSV. + + Args: + file_path (str): Path which to save treated file + status (dict): Must contain keys + * `data`: dataframe returned from treatement + * `error`: error catched from data treatement + mode (str, optional): Folder to save locally, later folder which to upload to GCS. 
+ + Returns: + str: Path to the saved file + """ + + log(f"Saving treated data to: {file_path}, {status}") + + _file_path = file_path.format(mode=mode, filetype="csv") + + Path(_file_path).parent.mkdir(parents=True, exist_ok=True) + if status["error"] is None: + status["data"].to_csv(_file_path, index=False) + log(f"Treated data saved to: {_file_path}") + + return _file_path + + +############### +# +# Extract data +# +############### +@task(nout=3, max_retries=3, retry_delay=timedelta(seconds=5)) +def query_logs( + dataset_id: str, + table_id: str, + datetime_filter=None, + max_recaptures: int = 90, + interval_minutes: int = 1, + recapture_window_days: int = 1, +): + """ + Queries capture logs to check for errors + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + datetime_filter (pendulum.datetime.DateTime, optional): + filter passed to query. This task will query the logs table + for the last n (n = recapture_window_days) days before datetime_filter + max_recaptures (int, optional): maximum number of recaptures to be done + interval_minutes (int, optional): interval in minutes between each recapture + recapture_window_days (int, optional): Number of days to query for erros + + Returns: + lists: errors (bool), + timestamps (list of pendulum.datetime.DateTime), + previous_errors (list of previous errors) + """ + + if not datetime_filter: + datetime_filter = pendulum.now(constants.TIMEZONE.value).replace(second=0, microsecond=0) + elif isinstance(datetime_filter, str): + datetime_filter = datetime.fromisoformat(datetime_filter).replace(second=0, microsecond=0) + + datetime_filter = datetime_filter.strftime("%Y-%m-%d %H:%M:%S") + + query = f""" + WITH + t AS ( + SELECT + DATETIME(timestamp_array) AS timestamp_array + FROM + UNNEST( + GENERATE_TIMESTAMP_ARRAY( + TIMESTAMP_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day), + TIMESTAMP('{datetime_filter}'), + INTERVAL {interval_minutes} minute) ) + AS timestamp_array + WHERE + timestamp_array < '{datetime_filter}' ), + logs_table AS ( + SELECT + SAFE_CAST(DATETIME(TIMESTAMP(timestamp_captura), + "America/Sao_Paulo") AS DATETIME) timestamp_captura, + SAFE_CAST(sucesso AS BOOLEAN) sucesso, + SAFE_CAST(erro AS STRING) erro, + SAFE_CAST(DATA AS DATE) DATA + FROM + rj-smtr-staging.{dataset_id}_staging.{table_id}_logs AS t + ), + logs AS ( + SELECT + *, + TIMESTAMP_TRUNC(timestamp_captura, minute) AS timestamp_array + FROM + logs_table + WHERE + DATA BETWEEN DATE(DATETIME_SUB('{datetime_filter}', + INTERVAL {recapture_window_days} day)) + AND DATE('{datetime_filter}') + AND timestamp_captura BETWEEN + DATETIME_SUB('{datetime_filter}', INTERVAL {recapture_window_days} day) + AND '{datetime_filter}' + ) + SELECT + CASE + WHEN logs.timestamp_captura IS NOT NULL THEN logs.timestamp_captura + ELSE + t.timestamp_array + END + AS timestamp_captura, + logs.erro + FROM + t + LEFT JOIN + logs + ON + logs.timestamp_array = t.timestamp_array + WHERE + logs.sucesso IS NOT TRUE + """ + log(f"Run query to check logs:\n{query}") + results = bd.read_sql(query=query, billing_project_id=bq_project()) + + if len(results) > 0: + results = results.sort_values(["timestamp_captura"]) + results["timestamp_captura"] = ( + pd.to_datetime(results["timestamp_captura"]) + .dt.tz_localize(constants.TIMEZONE.value) + .to_list() + ) + log(f"Recapture data for the following {len(results)} timestamps:\n{results}") + if len(results) > max_recaptures: + message = f""" + [SPPO - Recaptures] + Encontradas {len(results)} timestamps 
para serem recapturadas. + Essa run processará as seguintes: + ##### + {results[:max_recaptures]} + ##### + Sobraram as seguintes para serem recapturadas na próxima run: + ##### + {results[max_recaptures:]} + ##### + """ + log_critical(message) + + results = results[:max_recaptures] + return True, results["timestamp_captura"].to_list(), results["erro"].to_list() + return False, [], [] + + +@task +def get_raw( # pylint: disable=R0912 + url: str, + headers: str = None, + filetype: str = "json", + csv_args: dict = None, + params: dict = None, +) -> Dict: + """ + Request data from URL API + + Args: + url (str): URL to send request + headers (str, optional): Path to headers guardeded on Vault, if needed. + filetype (str, optional): Filetype to be formatted (supported only: json, csv and txt) + csv_args (dict, optional): Arguments for read_csv, if needed + params (dict, optional): Params to be sent on request + + Returns: + dict: Containing keys + * `data` (json): data result + * `error` (str): catched error, if any. Otherwise, returns None + """ + data = None + error = None + + try: + if headers is not None: + headers = get_secret(secret_path=headers) + # remove from headers, if present + remove_headers = ["host", "databases"] + for remove_header in remove_headers: + if remove_header in list(headers.keys()): + del headers[remove_header] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=params, + ) + + if response.ok: # status code is less than 400 + if filetype == "json": + data = response.json() + + # todo: move to data check on specfic API # pylint: disable=W0102 + if isinstance(data, dict) and "DescricaoErro" in data.keys(): + error = data["DescricaoErro"] + + elif filetype in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(io.StringIO(response.text), **csv_args).to_dict(orient="records") + else: + error = "Unsupported raw file extension. 
Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return {"data": data, "error": error} + + +# @task(checkpoint=False, nout=2) +# def create_request_params( +# extract_params: dict, +# table_id: str, +# dataset_id: str, +# timestamp: datetime, +# interval_minutes: int, +# ) -> tuple[str, str]: +# """ +# Task to create request params + +# Args: +# extract_params (dict): extract parameters +# table_id (str): table_id on BigQuery +# dataset_id (str): dataset_id on BigQuery +# timestamp (datetime): timestamp for flow run +# interval_minutes (int): interval in minutes between each capture + +# Returns: +# request_params: host, database and query to request data +# request_url: url to request data +# """ +# request_params = None +# request_url = None + +# if dataset_id == constants.BILHETAGEM_DATASET_ID.value: +# database = constants.BILHETAGEM_GENERAL_CAPTURE_PARAMS.value["databases"][ +# extract_params["database"] +# ] +# request_url = database["host"] + +# datetime_range = get_datetime_range( +# timestamp=timestamp, interval=timedelta(minutes=interval_minutes) +# ) + +# request_params = { +# "database": extract_params["database"], +# "engine": database["engine"], +# "query": extract_params["query"].format(**datetime_range), +# } + +# elif dataset_id == constants.GTFS_DATASET_ID.value: +# request_params = extract_params["filename"] + +# elif dataset_id == constants.SUBSIDIO_SPPO_RECURSOS_DATASET_ID.value: +# extract_params["token"] = get_secret( +# secret_path=constants.SUBSIDIO_SPPO_RECURSO_API_SECRET_PATH.value +# )["token"] +# start = datetime.strftime( +# timestamp - timedelta(minutes=interval_minutes), "%Y-%m-%dT%H:%M:%S.%MZ" +# ) +# end = datetime.strftime(timestamp, "%Y-%m-%dT%H:%M:%S.%MZ") +# log(f" Start date {start}, end date {end}") +# recurso_params = { +# "dates": f"createdDate ge {start} and createdDate le {end}", +# "service": constants.SUBSIDIO_SPPO_RECURSO_SERVICE.value, +# } +# extract_params["$filter"] = extract_params["$filter"].format(**recurso_params) +# request_params = extract_params + +# request_url = constants.SUBSIDIO_SPPO_RECURSO_API_BASE_URL.value + +# return request_params, request_url + + +@task(checkpoint=False, nout=2) +def get_raw_from_sources( + source_type: str, + local_filepath: str, + source_path: str = None, + dataset_id: str = None, + table_id: str = None, + secret_path: str = None, + request_params: dict = None, +) -> tuple[str, str]: + """ + Task to get raw data from sources + + Args: + source_type (str): source type + local_filepath (str): local filepath + source_path (str, optional): source path. Defaults to None. + dataset_id (str, optional): dataset_id on BigQuery. Defaults to None. + table_id (str, optional): table_id on BigQuery. Defaults to None. + secret_path (str, optional): secret path. Defaults to None. + request_params (dict, optional): request parameters. Defaults to None. 
+ + Returns: + error: error catched from upstream tasks + filepath: filepath to raw data + """ + error = None + filepath = None + data = None + + source_values = source_type.split("-", 1) + + source_type, filetype = source_values if len(source_values) == 2 else (source_values[0], None) + + log(f"Getting raw data from source type: {source_type}") + + try: + if source_type == "api": + error, data, filetype = get_raw_data_api( + url=source_path, + secret_path=secret_path, + api_params=request_params, + filetype=filetype, + ) + elif source_type == "gcs": + error, data, filetype = get_raw_data_gcs( + dataset_id=dataset_id, table_id=table_id, zip_filename=request_params + ) + elif source_type == "db": + error, data, filetype = get_raw_data_db( + host=source_path, secret_path=secret_path, **request_params + ) + elif source_type == "movidesk": + error, data, filetype = get_raw_recursos( + request_url=source_path, request_params=request_params + ) + else: + raise NotImplementedError(f"{source_type} not supported") + + filepath = save_raw_local_func(data=data, filepath=local_filepath, filetype=filetype) + + except NotImplementedError: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + log(f"Raw extraction ended returned values: {error}, {filepath}") + return error, filepath + + +############### +# +# Load data +# +############### + + +@task +def bq_upload( + dataset_id: str, + table_id: str, + filepath: str, + raw_filepath: str = None, + partitions: str = None, + status: dict = None, +): # pylint: disable=R0913 + """ + Upload raw and treated data to GCS and BigQuery. + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + filepath (str): Path to the saved treated .csv file + raw_filepath (str, optional): Path to raw .json file. Defaults to None. + partitions (str, optional): Partitioned directory structure, ie "ano=2022/mes=03/data=01". + Defaults to None. + status (dict, optional): Dict containing `error` key from + upstream tasks. + + Returns: + None + """ + log( + f""" + Received inputs: + raw_filepath = {raw_filepath}, type = {type(raw_filepath)} + treated_filepath = {filepath}, type = {type(filepath)} + dataset_id = {dataset_id}, type = {type(dataset_id)} + table_id = {table_id}, type = {type(table_id)} + partitions = {partitions}, type = {type(partitions)} + """ + ) + if status["error"] is not None: + return status["error"] + + error = None + + try: + # Upload raw to staging + if raw_filepath: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def bq_upload_from_dict(paths: dict, dataset_id: str, partition_levels: int = 1): + """Upload multiple tables from a dict structured as {table_id: csv_path}. + Present use case assumes table partitioned once. Adjust the parameter + 'partition_levels' to best suit new uses. + i.e. if your csv is saved as: + /date=/.csv + it has 1 level of partition. 
+ if your csv file is saved as: + /date=/hour=/.csv + it has 2 levels of partition + + Args: + paths (dict): _description_ + dataset_id (str): _description_ + + Returns: + _type_: _description_ + """ + for key in paths.keys(): + log("#" * 80) + log(f"KEY = {key}") + tb_dir = paths[key].parent + # climb up the partition directories to reach the table dir + for i in range(partition_levels): # pylint: disable=unused-variable + tb_dir = tb_dir.parent + log(f"tb_dir = {tb_dir}") + create_or_append_table(dataset_id=dataset_id, table_id=key, path=tb_dir) + + log(f"Returning -> {tb_dir.parent}") + + return tb_dir.parent + + +@task +def upload_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): Parent table id related to the status table + timestamp (str): ISO formatted timestamp string + error (str, optional): String associated with error caught during execution + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path(f"""data/staging/{dataset_id}/{table_id}/{partition}/{filename}.csv""") + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +@task +def upload_raw_data_to_gcs( + error: str, + raw_filepath: str, + table_id: str, + dataset_id: str, + partitions: list, +) -> Union[str, None]: + """ + Upload raw data to GCS. + + Args: + error (str): Error catched from upstream tasks. 
+ raw_filepath (str): Path to the saved raw .json file + table_id (str): table_id on BigQuery + dataset_id (str): dataset_id on BigQuery + partitions (list): list of partition strings + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + st_obj = Storage(table_id=table_id, dataset_id=dataset_id) + log( + f"""Uploading raw file to bucket {st_obj.bucket_name} at + {st_obj.bucket_name}/{dataset_id}/{table_id}""" + ) + st_obj.upload( + path=raw_filepath, + partitions=partitions, + mode="raw", + if_exists="replace", + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error + + +@task +def upload_staging_data_to_gcs( + error: str, + staging_filepath: str, + timestamp: datetime, + table_id: str, + dataset_id: str, + partitions: list, + previous_error: str = None, + recapture: bool = False, +) -> Union[str, None]: + """ + Upload staging data to GCS. + + Args: + error (str): Error catched from upstream tasks. + staging_filepath (str): Path to the saved treated .csv file. + timestamp (datetime): timestamp for flow run. + table_id (str): table_id on BigQuery. + dataset_id (str): dataset_id on BigQuery. + partitions (list): list of partition strings. + + Returns: + Union[str, None]: if there is an error returns it traceback, otherwise returns None + """ + if error is None: + try: + # Creates and publish table if it does not exist, append to it otherwise + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=staging_filepath, + partitions=partitions, + ) + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + upload_run_logs_to_bq( + dataset_id=dataset_id, + parent_table_id=table_id, + error=error, + timestamp=timestamp, + mode="staging", + previous_error=previous_error, + recapture=recapture, + ) + + return error + + +############### +# +# Daterange tasks +# +############### + + +@task( + checkpoint=False, + max_retries=constants.MAX_RETRIES.value, + retry_delay=timedelta(seconds=constants.RETRY_DELAY.value), +) +def get_materialization_date_range( # pylint: disable=R0913 + dataset_id: str, + table_id: str, + raw_dataset_id: str, + raw_table_id: str, + table_run_datetime_column_name: str = None, + mode: str = "prod", + delay_hours: int = 0, + end_ts: datetime = None, +): + """ + Task for generating dict with variables to be passed to the + --vars argument on DBT. + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): model filename on the queries repo. + eg: if you have a model defined in the file .sql, + the table_id should be + table_date_column_name (Optional, str): if it's the first time this + is ran, will query the table for the maximum value on this field. + If rebuild is true, will query the table for the minimum value + on this field. 
+        rebuild (Optional, bool): if true, queries the minimum date value on the
+            table and returns a date range from that value to the datetime.now() time
+        delay_hours (Optional, int): hours to subtract from the current time when
+            setting the materialization range end
+        end_ts (Optional, datetime): date range's final date
+    Returns:
+        dict: containing date_range_start and date_range_end
+    """
+    timestr = "%Y-%m-%dT%H:%M:%S"
+    # get start from redis
+    last_run = get_last_run_timestamp(dataset_id=dataset_id, table_id=table_id, mode=mode)
+    # if there's no timestamp set on redis, get max timestamp on source table
+    if last_run is None:
+        log("Failed to fetch key from Redis...\n Querying tables for last succeeded run")
+        if Table(dataset_id=dataset_id, table_id=table_id).table_exists("prod"):
+            last_run = get_table_min_max_value(
+                query_project_id=bq_project(),
+                dataset_id=dataset_id,
+                table_id=table_id,
+                field_name=table_run_datetime_column_name,
+                kind="max",
+            )
+            log(
+                f"""
+            Queried last run from {dataset_id}.{table_id}
+            Got:
+            {last_run} as type {type(last_run)}
+            """
+            )
+        else:
+            last_run = get_table_min_max_value(
+                query_project_id=bq_project(),
+                dataset_id=raw_dataset_id,
+                table_id=raw_table_id,
+                field_name=table_run_datetime_column_name,
+                kind="max",
+            )
+            log(
+                f"""
+            Queried last run from {raw_dataset_id}.{raw_table_id}
+            Got:
+            {last_run} as type {type(last_run)}
+            """
+            )
+    else:
+        last_run = datetime.strptime(last_run, timestr)
+
+    if (not isinstance(last_run, datetime)) and (isinstance(last_run, date)):
+        last_run = datetime(last_run.year, last_run.month, last_run.day)
+
+    # set start to last run hour (H)
+    start_ts = last_run.replace(minute=0, second=0, microsecond=0).strftime(timestr)
+
+    # set end to now - delay
+
+    if not end_ts:
+        end_ts = pendulum.now(constants.TIMEZONE.value).replace(
+            tzinfo=None, minute=0, second=0, microsecond=0
+        )
+
+    end_ts = (end_ts - timedelta(hours=delay_hours)).replace(minute=0, second=0, microsecond=0)
+
+    end_ts = end_ts.strftime(timestr)
+
+    date_range = {"date_range_start": start_ts, "date_range_end": end_ts}
+    log(f"Got date_range as: {date_range}")
+    return date_range
+
+
+@task
+def set_last_run_timestamp(
+    dataset_id: str, table_id: str, timestamp: str, mode: str = "prod", wait=None
+):  # pylint: disable=unused-argument
+    """
+    Set the `last_run_timestamp` key for the dataset_id/table_id pair
+    to the given timestamp. Used after running a materialization to set the
+    stage for the next run.
+
+    Args:
+        dataset_id (str): dataset_id on BigQuery
+        table_id (str): model filename on the queries repo.
+        timestamp: Last run timestamp end.
+        wait (Any, optional): Used for defining dependencies inside the flow,
+            in general, pass the output of the task which should be run immediately
+            before this. Defaults to None.
+
+    Returns:
+        bool: True when the timestamp is saved on Redis.
+    """
+    log(f"Saving timestamp {timestamp} on Redis for {dataset_id}.{table_id}")
+    redis_client = get_redis_client()
+    key = dataset_id + "." + table_id
+    if mode == "dev":
+        key = f"{mode}.{key}"
+    content = redis_client.get(key)
+    if not content:
+        content = {}
+    content["last_run_timestamp"] = timestamp
+    redis_client.set(key, content)
+    return True
+
+
+@task
+def delay_now_time(timestamp: str, delay_minutes=6):
+    """Return the timestamp string delayed by the given number of minutes
+
+    Args:
+        timestamp (str): Isoformat timestamp string
+        delay_minutes (int, optional): Minutes to delay the timestamp by. Defaults to 6.
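+
+        Example (illustrative):
+            delay_now_time.run("2022-01-01T12:00:00", delay_minutes=6)
+            # -> "2022-01-01T11-54-00"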
+ + Returns: + str : timestamp string formatted as "%Y-%m-%dT%H-%M-%S" + """ + ts_obj = datetime.fromisoformat(timestamp) + ts_obj = ts_obj - timedelta(minutes=delay_minutes) + return ts_obj.strftime("%Y-%m-%dT%H-%M-%S") + + +@task +def fetch_dataset_sha(dataset_id: str): + """Fetches the SHA of a branch from Github""" + url = "https://api.github.com/repos/prefeitura-rio/queries-rj-smtr" + url += f"/commits?queries-rj-smtr/rj_smtr/{dataset_id}" + response = requests.get(url) + + if response.status_code != 200: + return None + + dataset_version = response.json()[0]["sha"] + return {"version": dataset_version} + + +@task +def get_run_dates( + date_range_start: str, date_range_end: str, day_datetime: datetime = None +) -> List: + """ + Generates a list of dates between date_range_start and date_range_end. + + Args: + date_range_start (str): the start date to create the date range + date_range_end (str): the end date to create the date range + day_datetime (datetime, Optional): a timestamp to use as run_date + if the range start or end is False + + Returns: + list: the list of run_dates + """ + if (date_range_start is False) or (date_range_end is False): + if day_datetime: + run_date = day_datetime.strftime("%Y-%m-%d") + else: + run_date = get_now_date.run() + dates = [{"run_date": run_date}] + else: + dates = [ + {"run_date": d.strftime("%Y-%m-%d")} + for d in pd.date_range(start=date_range_start, end=date_range_end) + ] + log(f"Will run the following dates: {dates}") + return dates + + +@task +def get_join_dict(dict_list: list, new_dict: dict) -> List: + """ + Updates a list of dictionaries with a new dictionary. + """ + for dict_temp in dict_list: + dict_temp.update(new_dict) + + log(f"get_join_dict: {dict_list}") + return dict_list + + +@task(checkpoint=False) +def get_previous_date(days): + """ + Returns the date of {days} days ago in YYYY-MM-DD. + """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")).subtract(days=days) + + return now.to_date_string() + + +############### +# +# Pretreat data +# +############### + + +@task(nout=2) +def transform_raw_to_nested_structure( + raw_filepath: str, + filepath: str, + error: str, + timestamp: datetime, + primary_key: list = None, +) -> tuple[str, str]: + """ + Task to transform raw data to nested structure + + Args: + raw_filepath (str): Path to the saved raw .json file + filepath (str): Path to the saved treated .csv file + error (str): Error catched from upstream tasks + timestamp (datetime): timestamp for flow run + primary_key (list, optional): Primary key to be used on nested structure + + Returns: + str: Error traceback + str: Path to the saved treated .csv file + """ + if error is None: + try: + # leitura do dado raw + error, data = read_raw_data(filepath=raw_filepath) + + if primary_key is None: + primary_key = [] + + log( + f""" + Received inputs: + - timestamp:\n{timestamp} + - data:\n{data.head()}""" + ) + + # Check empty dataframe + if data.empty: + log("Empty dataframe, skipping transformation...") + + else: + log(f"Raw data:\n{data_info_str(data)}", level="info") + + log("Adding captured timestamp column...", level="info") + data["timestamp_captura"] = timestamp + + if "customFieldValues" not in data: + log("Striping string columns...", level="info") + for col in data.columns[data.dtypes == "object"].to_list(): + data[col] = data[col].str.strip() + + log(f"Finished cleaning! 
Data:\n{data_info_str(data)}", level="info") + + log("Creating nested structure...", level="info") + pk_cols = primary_key + ["timestamp_captura"] + data = ( + data.groupby(pk_cols) + .apply(lambda x: x[data.columns.difference(pk_cols)].to_json(orient="records")) + .str.strip("[]") + .reset_index(name="content")[primary_key + ["content", "timestamp_captura"]] + ) + + log( + f"Finished nested structure! Data:\n{data_info_str(data)}", + level="info", + ) + + # save treated local + filepath = save_treated_local_func(data=data, error=error, filepath=filepath) + + except Exception: # pylint: disable=W0703 + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, filepath + + +############### +# +# Utilitary tasks +# +############### + + +@task(checkpoint=False) +def coalesce_task(value_list: Iterable): + """ + Task to get the first non None value of a list + + Args: + value_list (Iterable): a iterable object with the values + Returns: + any: value_list's first non None item + """ + + try: + return next(value for value in value_list if value is not None) + except StopIteration: + return None + + +@task(checkpoint=False, nout=2) +def unpack_mapped_results_nout2( + mapped_results: Iterable, +) -> tuple[list[Any], list[Any]]: + """ + Task to unpack the results from an nout=2 tasks in 2 lists when it is mapped + + Args: + mapped_results (Iterable): The mapped task return + + Returns: + tuple[list[Any], list[Any]]: The task original return splited in 2 lists: + - 1st list being all the first return + - 2nd list being all the second return + + """ + return [r[0] for r in mapped_results], [r[1] for r in mapped_results] + + +@task +def check_mapped_query_logs_output(query_logs_output: list[tuple]) -> bool: + """ + Task to check if there is recaptures pending + + Args: + query_logs_output (list[tuple]): the return from a mapped query_logs execution + + Returns: + bool: True if there is recaptures to do, otherwise False + """ + + if len(query_logs_output) == 0: + return False + + recapture_list = [i[0] for i in query_logs_output] + return any(recapture_list) + + +@task +def get_scheduled_start_times( + timestamp: datetime, parameters: list, intervals: Union[None, dict] = None +): + """ + Task to get start times to schedule flows + + Args: + timestamp (datetime): initial flow run timestamp + parameters (list): parameters for the flow + intervals (Union[None, dict], optional): intervals between each flow run. Defaults to None. + Optionally, you can pass specific intervals for some table_ids. + Suggests to pass intervals based on previous table observed execution times. + Defaults to dict(default=timedelta(minutes=2)). + + Returns: + list[datetime]: list of scheduled start times + """ + + if intervals is None: + intervals = dict() + + if "default" not in intervals.keys(): + intervals["default"] = timedelta(minutes=2) + + timestamps = [None] + last_schedule = timestamp + + for param in parameters[1:]: + last_schedule += intervals.get(param.get("table_id", "default"), intervals["default"]) + timestamps.append(last_schedule) + + return timestamps + + +@task +def rename_current_flow_run_now_time(prefix: str, now_time=None, wait=None) -> None: + """ + Rename the current flow run. + """ + flow_run_id = prefect.context.get("flow_run_id") + client = Client() + return client.set_flow_run_name(flow_run_id, f"{prefix}{now_time}") + + +@prefect.task(checkpoint=False) +def get_now_time(): + """ + Returns the HH:MM. 
+ """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) + + return f"{now.hour}:{f'0{now.minute}' if len(str(now.minute))==1 else now.minute}" + + +@prefect.task(checkpoint=False) +def get_now_date(): + """ + Returns the current date in YYYY-MM-DD. + """ + now = pendulum.now(pendulum.timezone("America/Sao_Paulo")) + + return now.to_date_string() + + +@task +def get_current_flow_mode(labels: List[str]) -> str: + """ + Get the mode (prod/dev/staging) of the current flow. + """ + if labels[0].endswith("-dev"): + return "dev" + if labels[0].endswith("-staging"): + return "staging" + return "prod" diff --git a/pipelines/utils/backup/utils.py b/pipelines/utils/backup/utils.py new file mode 100644 index 000000000..628b0b387 --- /dev/null +++ b/pipelines/utils/backup/utils.py @@ -0,0 +1,925 @@ +# -*- coding: utf-8 -*- +# flake8: noqa: E501 +""" +General purpose functions for rj_smtr +""" + +import io +import json +import time +import traceback +import zipfile +from datetime import date, datetime, timedelta +from ftplib import FTP +from pathlib import Path +from typing import Any, List, Union + +import basedosdados as bd +import pandas as pd +import psycopg2 +import psycopg2.extras +import pymysql +import pytz +import requests +from basedosdados import Table +from google.cloud.storage.blob import Blob +from prefect.schedules.clocks import IntervalClock +from prefeitura_rio.pipelines_utils.infisical import get_secret +from prefeitura_rio.pipelines_utils.logging import log +from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client +from pytz import timezone + +from pipelines.constants import constants +from pipelines.utils.implicit_ftp import ImplicitFtpTls + +# Set BD config to run on cloud # +bd.config.from_file = True + + +def send_discord_message( + message: str, + webhook_url: str, +) -> None: + """ + Sends a message to a Discord channel. + """ + requests.post( + webhook_url, + data={"content": message}, + ) + + +def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH.value): + """Logs message to critical discord channel specified + + Args: + message (str): Message to post on the channel + secret_path (str, optional): Secret path storing the webhook to critical channel. + Defaults to constants.CRITICAL_SECRETPATH.value. + + """ + url = get_secret(secret_path)["url"] + return send_discord_message(message=message, webhook_url=url) + + +def create_or_append_table(dataset_id: str, table_id: str, path: str, partitions: str = None): + """Conditionally create table or append data to its relative GCS folder. 
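+
+    If the table does not exist in staging it is created from the partition root
+    directory; otherwise the file at `path` is appended to it.
+
+    Example (illustrative; dataset, table and path below are hypothetical):
+        create_or_append_table(
+            dataset_id="my_dataset",
+            table_id="my_table",
+            path="/tmp/data/staging/my_dataset/my_table/data=2022-03-01/my_table.csv",
+            partitions="data=2022-03-01",
+        )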
+ + Args: + dataset_id (str): target dataset_id on BigQuery + table_id (str): target table_id on BigQuery + path (str): Path to .csv data file + """ + tb_obj = Table(table_id=table_id, dataset_id=dataset_id) + if not tb_obj.table_exists("staging"): + log("Table does not exist in STAGING, creating table...") + dirpath = path.split(partitions)[0] + tb_obj.create( + path=dirpath, + if_table_exists="pass", + if_storage_data_exists="replace", + ) + log("Table created in STAGING") + else: + log("Table already exists in STAGING, appending to it...") + tb_obj.append(filepath=path, if_exists="replace", timeout=600, partitions=partitions) + log("Appended to table on STAGING successfully.") + + +def generate_df_and_save(data: dict, fname: Path): + """Save DataFrame as csv + + Args: + data (dict): dict with the data which to build the DataFrame + fname (Path): _description_ + """ + # Generate dataframe + dataframe = pd.DataFrame() + dataframe[data["key_column"]] = [piece[data["key_column"]] for piece in data["data"]] + dataframe["content"] = list(data["data"]) + + # Save dataframe to CSV + dataframe.to_csv(fname, index=False) + + +def bq_project(kind: str = "bigquery_prod"): + """Get the set BigQuery project_id + + Args: + kind (str, optional): Which client to get the project name from. + Options are 'bigquery_staging', 'bigquery_prod' and 'storage_staging' + Defaults to 'bigquery_prod'. + + Returns: + str: the requested project_id + """ + return bd.upload.base.Base().client[kind].project + + +def get_table_min_max_value( # pylint: disable=R0913 + query_project_id: str, + dataset_id: str, + table_id: str, + field_name: str, + kind: str, + wait=None, # pylint: disable=unused-argument +): + """Query a table to get the maximum value for the chosen field. + Useful to incrementally materialize tables via DBT + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + field_name (str): column name to query + kind (str): which value to get. Accepts min and max + """ + log(f"Getting {kind} value for {table_id}") + query = f""" + SELECT + {kind}({field_name}) + FROM {query_project_id}.{dataset_id}.{table_id} + """ + log(f"Will run query:\n{query}") + result = bd.read_sql(query=query, billing_project_id=bq_project()) + + return result.iloc[0][0] + + +def get_last_run_timestamp(dataset_id: str, table_id: str, mode: str = "prod") -> str: + """ + Query redis to retrive the time for when the last materialization + ran. + + Args: + dataset_id (str): dataset_id on BigQuery + table_id (str): model filename on the queries repo. + eg: if you have a model defined in the file .sql, + the table_id should be + mode (str): + + Returns: + Union[str, None]: _description_ + """ + redis_client = get_redis_client() + key = dataset_id + "." + table_id + log(f"Fetching key {key} from redis, working on mode {mode}") + if mode == "dev": + key = f"{mode}.{key}" + runs = redis_client.get(key) + # if runs is None: + # redis_client.set(key, "") + try: + last_run_timestamp = runs["last_run_timestamp"] + except KeyError: + return None + except TypeError: + return None + log(f"Got value {last_run_timestamp}") + return last_run_timestamp + + +def map_dict_keys(data: dict, mapping: dict) -> None: + """ + Map old keys to new keys in a dict. 
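+
+    Example (illustrative):
+        map_dict_keys({"old_name": 1}, {"old_name": "new_name"})
+        # -> {"new_name": 1}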
+ """ + for old_key, new_key in mapping.items(): + data[new_key] = data.pop(old_key) + return data + + +def normalize_keys(data: dict): + _data = {key.lower(): value for key, value in data.items()} + return _data + + +def connect_ftp(secret_path: str = None, secure: bool = True): + """Connect to FTP + + Returns: + ImplicitFTP_TLS: ftp client + """ + + ftp_data = get_secret(secret_path)["data"] + if secure: + ftp_client = ImplicitFtpTls() + else: + ftp_client = FTP() + ftp_client.connect(host=ftp_data["host"], port=int(ftp_data["port"])) + ftp_client.login(user=ftp_data["username"], passwd=ftp_data["pwd"]) + if secure: + ftp_client.prot_p() + return ftp_client + + +def safe_cast(val, to_type, default=None): + """ + Safe cast value. + """ + try: + return to_type(val) + except ValueError: + return default + + +def set_redis_rdo_files(redis_client, dataset_id: str, table_id: str): + """ + Register downloaded files to Redis + + Args: + redis_client (_type_): _description_ + dataset_id (str): dataset_id on BigQuery + table_id (str): table_id on BigQuery + + Returns: + bool: if the key was properly set + """ + try: + content = redis_client.get(f"{dataset_id}.{table_id}")["files"] + except TypeError as e: + log(f"Caught error {e}. Will set unexisting key") + # set key to empty dict for filling later + redis_client.set(f"{dataset_id}.{table_id}", {"files": []}) + content = redis_client.get(f"{dataset_id}.{table_id}") + # update content + st_client = bd.Storage(dataset_id=dataset_id, table_id=table_id) + blob_names = [ + blob.name + for blob in st_client.client["storage_staging"].list_blobs( + st_client.bucket, prefix=f"staging/{dataset_id}/{table_id}" + ) + ] + files = [blob_name.split("/")[-1].replace(".csv", "") for blob_name in blob_names] + log(f"When setting key, found {len(files)} files. Will register on redis...") + content["files"] = files + # set key + return redis_client.set(f"{dataset_id}.{table_id}", content) + + +# PRE TREAT # + + +def check_not_null(data: pd.DataFrame, columns: list, subset_query: str = None): + """ + Check if there are null values in columns. + + Args: + columns (list): list of columns to check + subset_query (str): query to check if there are important data + being removed + + Returns: + None + """ + + for col in columns: + remove = data.query(f"{col} != {col}") # null values + log( + f"[data-check] There are {len(remove)} rows with null values in '{col}'", + level="info", + ) + + if subset_query is not None: + # Check if there are important data being removed + remove = remove.query(subset_query) + if len(remove) > 0: + log( + f"""[data-check] There are {len(remove)} critical rows with + null values in '{col}' (query: {subset_query})""", + level="warning", + ) + + +def filter_null(data: pd.DataFrame, columns: list, subset_query: str = None): + """ + Filter null values in columns. 
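+
+    Example (illustrative; column name is hypothetical):
+        filter_null(data, columns=["speed"])
+        # drops every row where "speed" is null and logs how many rows were removed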
+ + Args: + columns (list): list of columns to check + subset_query (str): query to check if there are important data + being removed + + Returns: + pandas.DataFrame: data without null values + """ + + for col in columns: + remove = data.query(f"{col} != {col}") # null values + data = data.drop(remove.index) + log( + f"[data-filter] Removed {len(remove)} rows with null '{col}'", + level="info", + ) + + if subset_query is not None: + # Check if there are important data being removed + remove = remove.query(subset_query) + if len(remove) > 0: + log( + f"[data-filter] Removed {len(remove)} critical rows with null '{col}'", + level="warning", + ) + + return data + + +def filter_data(data: pd.DataFrame, filters: list, subset_query: str = None): + """ + Filter data from a dataframe + + Args: + data (pd.DataFrame): data DataFrame + filters (list): list of queries to filter data + + Returns: + pandas.DataFrame: data without filter data + """ + for item in filters: + remove = data.query(item) + data = data.drop(remove.index) + log( + f"[data-filter] Removed {len(remove)} rows from filter: {item}", + level="info", + ) + + if subset_query is not None: + # Check if there are important data being removed + remove = remove.query(subset_query) + if len(remove) > 0: + log( + f"""[data-filter] Removed {len(remove)} critical rows + from filter: {item} (subquery: {subset_query})""", + level="warning", + ) + + return data + + +def check_relation(data: pd.DataFrame, columns: list): + """ + Check relation between collumns. + + Args: + data (pd.DataFrame): dataframe to be modified + columns (list): list of lists of columns to be checked + + Returns: + None + """ + + for cols in columns: + df_dup = data[~data.duplicated(subset=cols)].groupby(cols).count().reset_index().iloc[:, :1] + + for col in cols: + df_dup_col = ( + data[~data.duplicated(subset=col)].groupby(col).count().reset_index().iloc[:, :1] + ) + + if len(df_dup_col[~df_dup_col[col].duplicated()]) == len(df_dup): + log( + f"[data-check] Comparing '{col}' in '{cols}', there are no duplicated values", + level="info", + ) + else: + log( + f"[data-check] Comparing '{col}' in '{cols}', there are duplicated values", + level="warning", + ) + + +def data_info_str(data: pd.DataFrame): + """ + Return dataframe info as a str to log + + Args: + data (pd.DataFrame): dataframe + + Returns: + data.info() as a string + """ + buffer = io.StringIO() + data.info(buf=buffer) + return buffer.getvalue() + + +def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals + clock_interval: timedelta, + labels: List[str], + table_parameters: Union[list[dict], dict], + runs_interval_minutes: int = 15, + start_date: datetime = datetime(2020, 1, 1, tzinfo=pytz.timezone(constants.TIMEZONE.value)), + **general_flow_params, +) -> List[IntervalClock]: + """ + Generates multiple schedules + + Args: + clock_interval (timedelta): The interval to run the schedule + labels (List[str]): The labels to be added to the schedule + table_parameters (list): The table parameters to iterate over + runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. + start_date (datetime, optional): The start date of the schedule. + Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(constants.TIMEZONE.value)). 
+ general_flow_params: Any param that you want to pass to the flow + Returns: + List[IntervalClock]: The list of schedules + + """ + if isinstance(table_parameters, dict): + table_parameters = [table_parameters] + + clocks = [] + for count, parameters in enumerate(table_parameters): + parameter_defaults = parameters | general_flow_params + clocks.append( + IntervalClock( + interval=clock_interval, + start_date=start_date + timedelta(minutes=runs_interval_minutes * count), + labels=labels, + parameter_defaults=parameter_defaults, + ) + ) + return clocks + + +def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: + """ + Test if the input dict has all keys present in the list + + Args: + input_dict (dict): the dict to test if has the keys + keys (list[str]): the list containing the keys to check + Returns: + bool: True if the input_dict has all the keys otherwise False + """ + return all(x in input_dict.keys() for x in keys) + + +def custom_serialization(obj: Any) -> Any: + """ + Function to serialize not JSON serializable objects + + Args: + obj (Any): Object to serialize + + Returns: + Any: Serialized object + """ + if isinstance(obj, (pd.Timestamp, date)): + if isinstance(obj, pd.Timestamp): + if obj.tzinfo is None: + obj = obj.tz_localize("UTC").tz_convert(constants.TIMEZONE.value) + return obj.isoformat() + + raise TypeError(f"Object of type {type(obj)} is not JSON serializable") + + +def save_raw_local_func( + data: Union[dict, str], + filepath: str, + mode: str = "raw", + filetype: str = "json", +) -> str: + """ + Saves json response from API to .json file. + Args: + data (Union[dict, str]): Raw data to save + filepath (str): Path which to save raw file + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + filetype (str, optional): The file format + Returns: + str: Path to the saved file + """ + + # diferentes tipos de arquivos para salvar + _filepath = filepath.format(mode=mode, filetype=filetype) + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + + if filetype == "json": + if isinstance(data, str): + data = json.loads(data) + with Path(_filepath).open("w", encoding="utf-8") as fi: + json.dump(data, fi, default=custom_serialization) + + if filetype in ("txt", "csv"): + with open(_filepath, "w", encoding="utf-8") as file: + file.write(data) + + log(f"Raw data saved to: {_filepath}") + return _filepath + + +def get_raw_data_api( # pylint: disable=R0912 + url: str, + secret_path: str = None, + api_params: dict = None, + filetype: str = None, +) -> tuple[str, str, str]: + """ + Request data from URL API + + Args: + url (str): URL to request data + secret_path (str, optional): Secret path to get headers. Defaults to None. + api_params (dict, optional): Parameters to pass to API. Defaults to None. + filetype (str, optional): Filetype to save raw file. Defaults to None. 
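+
+        Example (illustrative sketch; URL, secret path and params are hypothetical):
+            error, data, filetype = get_raw_data_api(
+                url="https://example.com/api/records",
+                secret_path="my_api_secret",
+                api_params={"since": "2024-01-01"},
+                filetype="json",
+            )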
+ + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + try: + if secret_path is None: + headers = secret_path + else: + headers = get_secret(secret_path)["data"] + + response = requests.get( + url, + headers=headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=api_params, + ) + + response.raise_for_status() + + if filetype == "json": + data = response.json() + else: + data = response.text + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_upload_storage_blob( + dataset_id: str, + filename: str, +) -> Blob: + """ + Get a blob from upload zone in storage + + Args: + dataset_id (str): The dataset id on BigQuery. + filename (str): The filename in GCS. + + Returns: + Blob: blob object + """ + bucket = bd.Storage(dataset_id="", table_id="") + log(f"Filename: {filename}, dataset_id: {dataset_id}") + blob_list = list( + bucket.client["storage_staging"] + .bucket(bucket.bucket_name) + .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") + ) + + return blob_list[0] + + +def get_raw_data_gcs( + dataset_id: str, + table_id: str, + zip_filename: str = None, +) -> tuple[str, str, str]: + """ + Get raw data from GCS + + Args: + dataset_id (str): The dataset id on BigQuery. + table_id (str): The table id on BigQuery. + zip_filename (str, optional): The zip file name. Defaults to None. + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + error = None + data = None + filetype = None + + try: + blob_search_name = zip_filename or table_id + blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) + + filename = blob.name + filetype = filename.split(".")[-1] + + data = blob.download_as_bytes() + + if filetype == "zip": + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + filenames = zipped_file.namelist() + filename = list(filter(lambda x: x.split(".")[0] == table_id, filenames))[0] + filetype = filename.split(".")[-1] + data = zipped_file.read(filename) + + data = data.decode(encoding="utf-8") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def get_raw_data_db( + query: str, engine: str, host: str, secret_path: str, database: str +) -> tuple[str, str, str]: + """ + Get data from Databases + + Args: + query (str): the SQL Query to execute + engine (str): The datase management system + host (str): The database host + secret_path (str): Secret path to get credentials + database (str): The database to connect + + Returns: + tuple[str, str, str]: Error, data and filetype + """ + connector_mapping = { + "postgresql": psycopg2.connect, + "mysql": pymysql.connect, + } + + data = None + error = None + filetype = "json" + + try: + credentials = get_secret(secret_path)["data"] + + with connector_mapping[engine]( + host=host, + user=credentials["user"], + password=credentials["password"], + database=database, + ) as connection: + data = pd.read_sql(sql=query, con=connection).to_dict(orient="records") + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data, filetype + + +def save_treated_local_func( + filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" +) -> str: + """ + Save treated file to CSV. 
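+
+    Example (illustrative; the path template is hypothetical):
+        save_treated_local_func(
+            filepath="/tmp/data/{mode}/my_dataset/my_table/file.{filetype}",
+            data=treated_df,
+            error=None,
+        )
+        # -> "/tmp/data/staging/my_dataset/my_table/file.csv"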
+ + Args: + filepath (str): Path to save file + data (pd.DataFrame): Dataframe to save + error (str): Error catched during execution + mode (str, optional): Folder to save locally, later folder which to upload to GCS. + + Returns: + str: Path to the saved file + """ + _filepath = filepath.format(mode=mode, filetype="csv") + Path(_filepath).parent.mkdir(parents=True, exist_ok=True) + if error is None: + data.to_csv(_filepath, index=False) + log(f"Treated data saved to: {_filepath}") + return _filepath + + +def upload_run_logs_to_bq( # pylint: disable=R0913 + dataset_id: str, + parent_table_id: str, + timestamp: str, + error: str = None, + previous_error: str = None, + recapture: bool = False, + mode: str = "raw", +): + """ + Upload execution status table to BigQuery. + Table is uploaded to the same dataset, named {parent_table_id}_logs. + If passing status_dict, should not pass timestamp and error. + + Args: + dataset_id (str): dataset_id on BigQuery + parent_table_id (str): table_id on BigQuery + timestamp (str): timestamp to get datetime range + error (str): error catched during execution + previous_error (str): previous error catched during execution + recapture (bool): if the execution was a recapture + mode (str): folder to save locally, later folder which to upload to GCS + + Returns: + None + """ + table_id = parent_table_id + "_logs" + # Create partition directory + filename = f"{table_id}_{timestamp.isoformat()}" + partition = f"data={timestamp.date()}" + filepath = Path(f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""") + filepath.parent.mkdir(exist_ok=True, parents=True) + # Create dataframe to be uploaded + if not error and recapture is True: + # if the recapture is succeeded, update the column erro + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [True], + "erro": [f"[recapturado]{previous_error}"], + } + ) + log(f"Recapturing {timestamp} with previous error:\n{previous_error}") + else: + # not recapturing or error during flow execution + dataframe = pd.DataFrame( + { + "timestamp_captura": [timestamp], + "sucesso": [error is None], + "erro": [error], + } + ) + # Save data local + dataframe.to_csv(filepath, index=False) + # Upload to Storage + create_or_append_table( + dataset_id=dataset_id, + table_id=table_id, + path=filepath.as_posix(), + partitions=partition, + ) + if error is not None: + raise Exception(f"Pipeline failed with error: {error}") + + +def get_datetime_range( + timestamp: datetime, + interval: timedelta, +) -> dict: + """ + Task to get datetime range in UTC + + Args: + timestamp (datetime): timestamp to get datetime range + interval (timedelta): interval to get datetime range + + Returns: + dict: datetime range + """ + + start = (timestamp - interval).astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") + + return {"start": start, "end": end} + + +def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + """ + Read raw data from file + + Args: + filepath (str): filepath to read + csv_args (dict): arguments to pass to pandas.read_csv + + Returns: + tuple[str, pd.DataFrame]: error and data + """ + error = None + data = None + try: + file_type = filepath.split(".")[-1] + + if file_type == "json": + data = pd.read_json(filepath) + + # data = json.loads(data) + elif file_type in ("txt", "csv"): + if csv_args is None: + csv_args = {} + data = pd.read_csv(filepath, **csv_args) + else: + 
error = "Unsupported raw file extension. Supported only: json, csv and txt" + + except Exception: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + + return error, data + + +def get_raw_recursos(request_url: str, request_params: dict) -> tuple[str, str, str]: + """ + Returns a dataframe with recursos data from movidesk api. + """ + all_records = False + top = 1000 + skip = 0 + error = None + filetype = "json" + data = [] + + while not all_records: + try: + request_params["$top"] = top + request_params["$skip"] = skip + + log(f"Request url {request_url}") + + response = requests.get( + request_url, + params=request_params, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + ) + response.raise_for_status() + + paginated_data = response.json() + + if isinstance(paginated_data, dict): + paginated_data = [paginated_data] + + if len(paginated_data) == top: + skip += top + time.sleep(36) + else: + if len(paginated_data) == 0: + log("Nenhum dado para tratar.") + break + all_records = True + data += paginated_data + + log(f"Dados (paginados): {len(data)}") + + except Exception as error: + error = traceback.format_exc() + log(f"[CATCHED] Task failed with error: \n{error}", level="error") + data = [] + break + + log(f"Request concluído, tamanho dos dados: {len(data)}.") + + return error, data, filetype + + +def build_table_id(mode: str, report_type: str): + """Build table_id based on which table is the target + of current flow run + + Args: + mode (str): SPPO or STPL + report_type (str): RHO or RDO + + Returns: + str: table_id + """ + if mode == "SPPO": + if report_type == "RDO": + table_id = constants.SPPO_RDO_TABLE_ID.value + else: + table_id = constants.SPPO_RHO_TABLE_ID.value + if mode == "STPL": + # slice the string to get rid of V at end of + # STPL reports filenames + if report_type[:3] == "RDO": + table_id = constants.STPL_RDO_TABLE_ID.value + else: + table_id = constants.STPL_RHO_TABLE_ID.value + return table_id + + +def generate_ftp_schedules(interval_minutes: int, label: str = constants.RJ_SMTR_AGENT_LABEL.value): + """Generates IntervalClocks with the parameters needed to capture + each report. + + Args: + interval_minutes (int): interval which this flow will be run. + label (str, optional): Prefect label, defines which agent to use when launching flow run. + Defaults to constants.RJ_SMTR_AGENT_LABEL.value. 
+ + Returns: + List(IntervalClock): containing the clocks for scheduling runs + """ + modes = ["SPPO", "STPL"] + reports = ["RDO", "RHO"] + clocks = [] + for mode in modes: + for report in reports: + clocks.append( + IntervalClock( + interval=timedelta(minutes=interval_minutes), + start_date=datetime( + 2022, 12, 16, 5, 0, tzinfo=timezone(constants.TIMEZONE.value) + ), + parameter_defaults={ + "transport_mode": mode, + "report_type": report, + "table_id": build_table_id(mode=mode, report_type=report), + }, + labels=[label], + ) + ) + return clocks diff --git a/pipelines/utils/capture/__init__.py b/pipelines/utils/capture/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pipelines/utils/capture/api.py b/pipelines/utils/capture/api.py new file mode 100644 index 000000000..11e1ec3d1 --- /dev/null +++ b/pipelines/utils/capture/api.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""Module to get data from apis""" +import time +from typing import Union + +import requests +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.constants import constants +from pipelines.utils.capture.base import DataExtractor +from pipelines.utils.fs import get_filetype + + +class APIExtractor(DataExtractor): + """ + Classe para extrair dados de API com uma página + + Args: + url (str): URL para fazer o request + headers (Union[None, dict]): Headers para o request + params (Union[None, dict]): Paramêtros para o request + save_filepath (str): Caminho para salvar os dados + """ + + def __init__( + self, + url: str, + headers: Union[None, dict], + params: Union[None, dict], + save_filepath: str, + ) -> None: + super().__init__(save_filepath=save_filepath) + self.url = url + self.params = params + self.headers = headers + self.filetype = get_filetype(save_filepath) + + def _get_data(self) -> Union[list[dict], dict, str]: + """ + Extrai os dados da API + + Returns: + Union[list[dict], dict, str]: list[dict] ou dict para APIs json + str para outros tipos + """ + for retry in range(constants.MAX_RETRIES.value): + response = requests.get( + self.url, + headers=self.headers, + timeout=constants.MAX_TIMEOUT_SECONDS.value, + params=self.params, + ) + + if response.ok: + break + if response.status_code >= 500: + log(f"Server error {response.status_code}") + if retry == constants.MAX_RETRIES.value - 1: + response.raise_for_status() + time.sleep(60) + else: + response.raise_for_status() + + if self.filetype == "json": + data = response.json() + else: + data = response.text + + return data + + +class APIExtractorTopSkip(APIExtractor): + """ + Classe para extrair dados de uma API paginada do tipo Top/Skip + + Args: + url (str): URL para fazer o request + headers (Union[None, dict]): Headers para o request + params (Union[None, dict]): Paramêtros para o request (exceto os de top e skip) + top_param_name (str): Nome do parâmetro de top (que define o tamanho da página) + skip_param_name (str): Nome do parâmetro de skip (quantidade de linhas a serem puladas) + page_size (int): Número de registros por página (valor a ser passado no parâmetro de top) + max_pages (int): Limite de páginas a ser extraídas + save_filepath (str): Caminho para salvar os dados + """ + + def __init__( + self, + url: str, + headers: Union[dict, None], + params: dict, + top_param_name: str, + skip_param_name: str, + page_size: int, + max_pages: int, + save_filepath: str, + ) -> None: + super().__init__( + url=url, + headers=headers, + params=params, + save_filepath=save_filepath, + ) + + if self.filetype != "json": + raise 
ValueError("File Type must be json") + + self.params[top_param_name] = page_size + self.skip_param_name = skip_param_name + self.params[skip_param_name] = 0 + self.page_size = page_size + self.max_pages = max_pages + + def _prepare_next_page(self): + """ + Incrementa o valor do skip para buscar a próxima página + """ + super()._prepare_next_page() + self.params[self.skip_param_name] += self.page_size + + def _check_if_last_page(self) -> bool: + """ + Verifica se a página tem menos registros do que o máximo + ou se chegou ao limite de páginas + """ + page_data_len = len(self.page_data) + current_page = self.current_page + 1 + log( + f""" + Page size: {self.page_size} + Current page: {current_page}/{self.max_pages} + Current page returned {page_data_len} rows""" + ) + + last_page = page_data_len < self.page_size or self.max_pages == current_page + if last_page: + log("Last page, ending extraction") + return last_page diff --git a/pipelines/utils/capture/base.py b/pipelines/utils/capture/base.py new file mode 100644 index 000000000..056aa1f4d --- /dev/null +++ b/pipelines/utils/capture/base.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +"""Module with the base class for data extractions""" +from abc import ABC, abstractmethod +from typing import Union + +from pipelines.utils.fs import save_local_file + + +class DataExtractor(ABC): + """ + Classe abstrata para criar Data Extractors + + Para criar extrações com uma página: + - Implemente o método "_get_data" + + Para criar extrações com várias páginas: + - Implemente o método "_get_data" + - Sobrescreva os métodos "_prepare_next_page" and "_check_if_last_page" + de acordo com a lógica de paginação da sua extração + + Args: + save_filepath (str): O caminho para salvar os dados extraídos + + Attributes: + save_filepath (str): O caminho para salvar os dados extraídos + data (list): Os dados extraídos de todas as páginas + last_page (bool): Se é a última página da captura ou não + page_data: Os dados extraídos da página atual + current_page (int): o número da página atual, iniciando em 0 + """ + + def __init__(self, save_filepath: str) -> None: + self.save_filepath = save_filepath + self.data = [] + self.last_page = False + self.page_data = None + self.current_page = 0 + + @abstractmethod + def _get_data(self) -> Union[list[dict], dict, str]: + """ + Método abstrato para extrair dos dados de uma página + + Para implementar, crie a lógica da extração, retornando + uma lista de dicionários, um dicionário ou uma string + + Returns: + Union[list[dict], dict, str]: Os dados extraídos + """ + + def _prepare_next_page(self): + """ + Prepara o objeto para extrair a próxima página + + Coloca os dados da página na lista de dados gerais + Verifica se é a última página + Incrementa o atributo current_page em 1 + """ + if isinstance(self.page_data, list): + self.data += self.page_data + else: + self.data.append(self.page_data) + + self.last_page = self._check_if_last_page() + + self.current_page += 1 + + def _check_if_last_page(self) -> bool: + """ + Verifica se é a última página + Para implementar uma extração paginada, + sobrescreva esse método com a lógica de verificação + """ + return True + + def extract(self) -> Union[dict, list[dict], str]: + """ + Extrai os dados completos de todas as páginas + + Returns: + Union[dict, list[dict], str]: Os dados retornados + """ + while not self.last_page: + self.page_data = self._get_data() + self._prepare_next_page() + + return self.data + + def save_raw_local(self): + """ + Salva os dados extraídos localmente + """ + 
save_local_file( + filepath=self.save_filepath, + data=self.data, + ) diff --git a/pipelines/utils/capture/db.py b/pipelines/utils/capture/db.py new file mode 100644 index 000000000..2cef6faf8 --- /dev/null +++ b/pipelines/utils/capture/db.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- +"""Module to get data from databases""" +import pandas as pd +from prefeitura_rio.pipelines_utils.logging import log +from sqlalchemy import create_engine + +from pipelines.utils.capture.base import DataExtractor +from pipelines.utils.fs import get_filetype + + +class DBExtractor(DataExtractor): + """ + Classe para extrair dados de banco de dados + + Args: + query (str): o SELECT para ser executada + engine (str): O banco de dados (postgres ou mysql) + host (str): O host do banco de dados + user (str): O usuário para se conectar + password (str): A senha do usuário + database (str): O nome da base (schema) + save_filepath (str): Caminho para salvar os dados + """ + + def __init__( + self, + query: str, + engine: str, + host: str, + user: str, + password: str, + database: str, + save_filepath: str, + ) -> None: + super().__init__(save_filepath=save_filepath) + if get_filetype(save_filepath) != "json": + raise ValueError("File type must be json") + + self.query = query + engine_mapping = { + "mysql": {"driver": "pymysql", "port": "3306"}, + "postgresql": {"driver": "psycopg2", "port": "5432"}, + } + engine_details = engine_mapping[engine] + driver = engine_details["driver"] + port = engine_details["port"] + connection_string = f"{engine}+{driver}://{user}:{password}@{host}:{port}/{database}" + self.connection = create_engine(connection_string) + + def _get_data(self) -> list[dict]: + """ + Executa a query e retorna os dados como JSON + + Returns: + list[dict]: Os dados retornados pela query + """ + max_retries = 10 + for retry in range(1, max_retries + 1): + try: + log(f"[ATTEMPT {retry}/{max_retries}]: {self.query}") + data = pd.read_sql(sql=self.query, con=self.connection).to_dict(orient="records") + for d in data: + for k, v in d.items(): + if pd.isna(v): + d[k] = None + break + except Exception as err: + if retry == max_retries: + raise err + + return data + + +class PaginatedDBExtractor(DBExtractor): + """ + Classe para extrair dados de um banco de dados com paginação offset/limit + + Args: + query (str): o SELECT para ser executada (sem o limit e offset) + engine (str): O banco de dados (postgres ou mysql) + host (str): O host do banco de dados + user (str): O usuário para se conectar + password (str): A senha do usuário + database (str): O nome da base (schema) + page_size (int): Número de linhas por página + max_pages (int): Número máximo de páginas para serem extraídas + save_filepath (str): Caminho para salvar os dados + """ + + def __init__( + self, + query: str, + engine: str, + host: str, + user: str, + password: str, + database: str, + page_size: int, + max_pages: int, + save_filepath: str, + ) -> None: + super().__init__( + query=query, + engine=engine, + host=host, + user=user, + password=password, + database=database, + save_filepath=save_filepath, + ) + self.offset = 0 + self.base_query = f"{query} LIMIT {page_size}" + self.query = f"{self.base_query} OFFSET 0" + self.max_pages = max_pages + self.page_size = page_size + + def _prepare_next_page(self): + """ + Incrementa o offset e concatena na query + """ + super()._prepare_next_page() + self.offset += self.page_size + self.query = f"{self.base_query} OFFSET {self.offset}" + + def _check_if_last_page(self): + """ + Verifica se o número de 
dados retornados na última página é menor que o máximo + ou se chegou ao limite de numero de páginas + """ + page_data_len = len(self.page_data) + current_page = self.current_page + 1 + log( + f""" + Page size: {self.page_size} + Current page: {current_page}/{self.max_pages} + Current page returned {page_data_len} rows""" + ) + + last_page = page_data_len < self.page_size or self.max_pages == current_page + if last_page: + log("Last page, ending extraction") + return last_page diff --git a/pipelines/utils/capture/gcs.py b/pipelines/utils/capture/gcs.py new file mode 100644 index 000000000..4f51c7f28 --- /dev/null +++ b/pipelines/utils/capture/gcs.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +"""Module to get data from GCS""" +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.utils.capture.base import DataExtractor +from pipelines.utils.fs import get_filetype +from pipelines.utils.gcp import Storage + + +class GCSExtractor(DataExtractor): + """ + Classe para extrair dados do GCS + + Args: + env (str): dev ou prod + folder (str): pasta que está o arquivo + filename (str): nome do arquivo sem extensão + save_filepath (str): Caminho para salvar o arquivo + (deve ter a mesma extensão do arquivo no GCS) + bucket_name (str): Nome do bucket no GCS + """ + + def __init__( + self, + env: str, + folder: str, + filename: str, + save_filepath: str, + bucket_name: str = None, + ) -> None: + super().__init__(save_filepath=save_filepath) + filetype = get_filetype(filepath=save_filepath) + self.complete_filename = f"{filename}.{filetype}" + self.storage = Storage(env=env, dataset_id=folder, bucket_name=bucket_name) + + def _get_data(self) -> str: + """Baixa o arquivo como string + + Returns: + str: conteúdo do arquivo + """ + log(f"Getting file: {self.complete_filename}") + data = self.storage.get_blob_string(mode="upload", filename=self.complete_filename) + + return data diff --git a/pipelines/utils/dbt_vars.py b/pipelines/utils/dbt_vars.py new file mode 100644 index 000000000..e7fa6cde3 --- /dev/null +++ b/pipelines/utils/dbt_vars.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +"""Módulo para criação de variáveis para execução do DBT""" + +from datetime import date, datetime, timedelta +from typing import Union + +import basedosdados as bd +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.constants import constants +from pipelines.utils.gcp import BQTable + +# from pytz import timezone + + +class DateRange: + def __init__( + self, + datetime_column_name: str, + truncate_hour: bool = True, + delay_hours: int = 0, + first_daterange_start: datetime = None, + ): + self.first_daterange_start = first_daterange_start + self.datetime_column_name = datetime_column_name + self.truncate_hour = truncate_hour + self.value_to_save = None + self.delay_hours = delay_hours + + def get_last_run_from_redis(self, redis_key: str) -> Union[None, datetime]: + pass + + def get_last_run_from_bq(self, table: BQTable) -> Union[None, datetime]: + last_run = None + if table.exists(): + project = constants.PROJECT_NAME.value[table.env] + query = f""" + SELECT + max({self.datetime_column_name}) + FROM + {project}.{table.dataset_id}.{table.table_id} + """ + + log(f"Will run query:\n{query}") + last_run = bd.read_sql(query=query, billing_project_id=project).iloc[0][0] + + if (not isinstance(last_run, datetime)) and (isinstance(last_run, date)): + last_run = datetime(last_run.year, last_run.month, last_run.day) + + return last_run + + def create_var( + self, + redis_key: str, + table: BQTable, 
+ timestamp: datetime, + ) -> dict: + + last_run = ( + self.get_last_run_from_redis(redis_key=redis_key) + or self.get_last_run_from_bq(table=table) + or self.first_daterange_start + ) + + if last_run is None: + return {} + + ts_format = "%Y-%m-%dT%H:%M:%S" + + start_ts = last_run.replace(second=0, microsecond=0) + if self.truncate_hour: + start_ts = start_ts.replace(minute=0) + + start_ts = start_ts.strftime(ts_format) + + end_ts = timestamp - timedelta(hours=self.delay_hours) + + end_ts = end_ts.replace(second=0, microsecond=0) + + if self.truncate_hour: + end_ts = end_ts.replace(minute=0) + + end_ts = end_ts.strftime(ts_format) + + date_range = {"date_range_start": start_ts, "date_range_end": end_ts} + self.value_to_save = end_ts + + log(f"Got date_range as: {date_range}") diff --git a/pipelines/utils/fs.py b/pipelines/utils/fs.py new file mode 100644 index 000000000..a3ed00981 --- /dev/null +++ b/pipelines/utils/fs.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +"""Module to deal with the filesystem""" +import json +import os +from datetime import datetime +from pathlib import Path +from typing import Union + +import pandas as pd +import pytz +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.constants import constants +from pipelines.utils.utils import custom_serialization + + +def get_data_folder_path() -> str: + """ + Retorna a pasta raíz para salvar os dados + + Returns: + str: Caminho para a pasta data + """ + return os.path.join(os.getcwd(), os.getenv("DATA_FOLDER", "data")) + + +def create_partition( + timestamp: datetime, + partition_date_only: bool, +) -> str: + """ + Cria a partição Hive de acordo com a timestamp + + Args: + timestamp (datetime): timestamp de referência + partition_date_only (bool): True se o particionamento deve ser feito apenas por data + False se o particionamento deve ser feito por data e hora + Returns: + str: string com o particionamento + """ + log("Creating file partition...") + log(f"Timestamp received: {timestamp}") + timestamp = timestamp.astimezone(tz=pytz.timezone(constants.TIMEZONE.value)) + log(f"Timestamp converted to {constants.TIMEZONE.value}: {timestamp}") + partition = f"data={timestamp.strftime('%Y-%m-%d')}" + if not partition_date_only: + partition = os.path.join(partition, f"hora={timestamp.strftime('%H')}") + + log(f"Partition created successfully: {partition}") + return partition + + +def create_capture_filepath( + dataset_id: str, + table_id: str, + timestamp: datetime, + raw_filetype: str, + partition: str = None, +) -> dict[str, str]: + """ + Cria os caminhos para salvar os dados localmente + + Args: + dataset_id (str): dataset_id no BigQuery + table_id (str): table_id no BigQuery + timestamp (datetime): timestamp da captura + partition (str, optional): Partição dos dados em formato Hive, ie "data=2020-01-01/hora=06" + Returns: + dict: caminhos para os dados raw e source + """ + log("Creating filepaths...") + log(f"Timestamp received: {timestamp}") + timestamp = timestamp.astimezone(tz=pytz.timezone(constants.TIMEZONE.value)) + log(f"Timestamp converted to {constants.TIMEZONE.value}: {timestamp}") + data_folder = get_data_folder_path() + log(f"Data folder: {data_folder}") + template_filepath = os.path.join( + data_folder, + "{mode}", + dataset_id, + table_id, + ) + + if partition is not None: + log("Received partition, appending it to filepath template") + template_filepath = os.path.join(template_filepath, partition) + + template_filepath = os.path.join( + template_filepath, + 
f"{timestamp.strftime(constants.FILENAME_PATTERN.value)}.{{filetype}}", + ) + + log(f"Filepath template: {template_filepath}") + + filepath = { + "raw": template_filepath.format(mode="raw", filetype=raw_filetype), + "source": template_filepath.format(mode="source", filetype="csv"), + } + + log(f"Created filepaths successfully: {filepath}") + + return filepath + + +def get_filetype(filepath: str): + """Retorna a extensão de um arquivo + + Args: + filepath (str): caminho para o arquivo + """ + return os.path.splitext(filepath)[1].removeprefix(".") + + +def save_local_file(filepath: str, data: Union[str, dict, list[dict], pd.DataFrame]): + """ + Salva um arquivo localmente + + Args: + filepath (str): Caminho para salvar o arquivo + data Union[str, dict, list[dict], pd.DataFrame]: Dados que serão salvos no arquivo + """ + log(f"Saving data on local file: {filepath}") + + log("Creating parent folder...") + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + log("Parent folder created!") + + if isinstance(data, pd.DataFrame): + log("Received a DataFrame, saving file as CSV") + data.to_csv(filepath, index=False) + log("File saved!") + return + + filetype = get_filetype(filepath) + log(f"Saving {filetype.upper()}") + with open(filepath, "w", encoding="utf-8") as file: + if filetype == "json": + if isinstance(data, str): + log("Converting string to python object") + data = json.loads(data) + + json.dump(data, file, default=custom_serialization) + + elif filetype in ("txt", "csv"): + file.write(data) + + else: + raise NotImplementedError( + "Unsupported raw file extension. Supported only: json, csv and txt" + ) + + log("File saved!") + + +def read_raw_data(filepath: str, reader_args: dict = None) -> pd.DataFrame: + """ + Lê os dados de um arquivo Raw + + Args: + filepath (str): Caminho do arquivo + reader_args (dict, optional): Argumentos para passar na função + de leitura (pd.read_csv ou pd.read_json) + + Returns: + pd.DataFrame: DataFrame com os dados lidos + """ + + log(f"Reading raw data in {filepath}") + if reader_args is None: + reader_args = {} + + filetype = get_filetype(filepath=filepath) + + log(f"Reading {filetype.upper()}") + if filetype == "json": + data = pd.read_json(filepath, **reader_args) + + elif filetype in ("txt", "csv"): + data = pd.read_csv(filepath, **reader_args) + else: + raise NotImplementedError( + "Unsupported raw file extension. 
Supported only: json, csv and txt" + ) + + return data diff --git a/pipelines/utils/gcp.py b/pipelines/utils/gcp.py new file mode 100644 index 000000000..daedd61df --- /dev/null +++ b/pipelines/utils/gcp.py @@ -0,0 +1,391 @@ +# -*- coding: utf-8 -*- +"""Module to interact with GCP""" +import csv +import inspect +import io +import zipfile +from dataclasses import dataclass +from datetime import datetime +from mimetypes import MimeTypes +from pathlib import Path +from typing import Type, TypeVar, Union + +from google.api_core.exceptions import NotFound +from google.cloud import bigquery, storage +from google.cloud.bigquery.external_config import HivePartitioningOptions +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.constants import constants +from pipelines.utils.fs import create_capture_filepath, create_partition + +T = TypeVar("T") + + +@dataclass +class GCPBase: + dataset_id: str + table_id: str + bucket_names: dict + env: str + + def __post_init__(self): + if self.bucket_names is None: + self.bucket_name = constants.DEFAULT_BUCKET_NAME.value[self.env] + else: + self.bucket_name = self.bucket_names[self.env] + + def __getitem__(self, key): + return self.__dict__[key] + + def client(self, service: str) -> Union[storage.Client, bigquery.Client]: + service_map = {"storage": storage.Client, "bigquery": bigquery.Client} + return service_map[service](project=constants.PROJECT_NAME.value[self.env]) + + def transfer_gcp_obj(self, target_class: Type[T], **additional_kwargs) -> T: + base_args = list(inspect.signature(GCPBase).parameters.keys()) + init_args = list(inspect.signature(target_class).parameters.keys()) + kwargs = {k: self[k] for k in init_args if k in base_args} | additional_kwargs + return target_class(**kwargs) + + +class Storage(GCPBase): + def __init__( + self, + env: str, + dataset_id: str, + table_id: str = None, + bucket_names: str = None, + ): + super().__init__( + dataset_id=dataset_id, + table_id=table_id, + bucket_names=bucket_names, + env=env, + ) + + self.bucket = self.client("storage").bucket(self.bucket_name) + + def create_blob_name( + self, + mode: str, + filename: str = None, + filetype: str = None, + partition: str = None, + ) -> str: + blob_name = f"{mode}/{self.dataset_id}" + if self.table_id is not None: + blob_name += f"/{self.table_id}" + + if partition is not None: + partition = partition.strip("/") + blob_name += f"/{partition}" + + if filename is not None: + blob_name += f"/{filename}" + + if filetype is not None: + blob_name += f".{filetype}" + else: + blob_name += "/" + + return blob_name + + def _check_mode(self, mode: str): + accept = ["upload", "raw", "source"] + if mode not in accept: + raise ValueError(f"mode must be: {', '.join(accept)}. 
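# --- Illustrative sketch (not part of this diff) --------------------------
# Mirror of Storage.create_blob_name above, with sample values, to make the
# resulting GCS object names concrete. Note that when no filetype is given the
# name ends with "/" and works as a prefix ("folder").
def blob_name(mode, dataset_id, table_id=None, partition=None, filename=None, filetype=None):
    name = f"{mode}/{dataset_id}"
    if table_id is not None:
        name += f"/{table_id}"
    if partition is not None:
        name += f"/{partition.strip('/')}"
    if filename is not None:
        name += f"/{filename}"
    if filetype is not None:
        name += f".{filetype}"
    else:
        name += "/"
    return name


assert (
    blob_name("raw", "example_dataset", "example_table", "data=2024-05-01/hora=09", "file", "json")
    == "raw/example_dataset/example_table/data=2024-05-01/hora=09/file.json"
)
assert blob_name("source", "example_dataset", "example_table") == "source/example_dataset/example_table/"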
Received {mode}") + + def upload_file( + self, + mode: str, + filepath: Union[str, Path], + partition: str = None, + if_exists="replace", + chunk_size=None, + **upload_kwargs, + ): + filepath = Path(filepath) + + if filepath.is_dir(): + raise IsADirectoryError("filepath is a directory") + + filename_parts = filepath.name.rsplit(".", 1) + filetype = filename_parts[1] if len(filename_parts) > 1 else None + blob_name = self.create_blob_name( + mode=mode, + partition=partition, + filename=filename_parts[0], + filetype=filetype, + ) + + blob = self.bucket.blob(blob_name, chunk_size=chunk_size) + + if not blob.exists() or if_exists == "replace": + log(f"Uploading file {filepath} to {self.bucket.name}/{blob_name}") + upload_kwargs["timeout"] = upload_kwargs.get("timeout", None) + + blob.upload_from_filename(str(filepath), **upload_kwargs) + log("File uploaded!") + + elif if_exists == "pass": + log("Blob already exists skipping upload") + + else: + raise FileExistsError("Blob already exists") + + def get_blob_obj( + self, + mode: str, + filename: str, + filetype: str = None, + partition: str = None, + ): + blob_name = self.create_blob_name( + mode=mode, + partition=partition, + filename=filename, + filetype=filetype, + ) + return self.bucket.get_blob(blob_name=blob_name) + + def get_blob_bytes( + self, + mode: str, + filename: str, + filetype: str = None, + partition: str = None, + ) -> bytes: + blob_name = self.create_blob_name( + mode=mode, + partition=partition, + filename=filename, + filetype=filetype, + ) + return self.bucket.get_blob(blob_name=blob_name).download_as_bytes() + + def get_blob_string( + self, + mode: str, + filename: str, + filetype: str = None, + partition: str = None, + ) -> str: + blob_name = self.create_blob_name( + mode=mode, + partition=partition, + filename=filename, + filetype=filetype, + ) + return self.bucket.get_blob(blob_name=blob_name).download_as_text() + + def unzip_file(self, mode: str, zip_filename: str, unzip_to: str): + data = self.get_blob_bytes(mode=mode, filename=zip_filename) + mime = MimeTypes() + with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: + for name in zipped_file.namelist(): + unzipped_data = zipped_file.read(name=name) + + filename_parts = name.rsplit(".", 1) + + filetype = filename_parts[1] if len(filename_parts) > 1 else None + + blob_name = self.create_blob_name( + mode=mode, + partition=unzip_to, + filename=filename_parts[0], + filetype=filetype, + ) + + self.bucket.blob(blob_name).upload_from_string( + data=unzipped_data, + content_type=mime.guess_type(name)[0], + ) + + def move_folder( + self, + new_storage: "Storage", + old_mode: str, + new_mode: str, + partitions: Union[str, list[str]] = None, + ): + partitions = ( + [partitions] if isinstance(partitions, str) or partitions is None else partitions + ) + + blobs = [] + + for partition in partitions: + blob_prefix = self.create_blob_name(mode=old_mode, partition=partition) + source_blobs = list(self.bucket.list_blobs(prefix=blob_prefix)) + + blob_mapping = [ + { + "source_blob": blob, + "new_name": blob.name.replace( + blob_prefix, + new_storage.create_blob_name(mode=new_mode, partition=partition), + 1, + ), + } + for blob in source_blobs + if not blob.name.endswith("/") + ] + + blobs += blob_mapping + + if new_storage.bucket_name != self.bucket_name: + for blob in blobs: + source_blob: storage.Blob = blob["source_blob"] + self.bucket.copy_blob(source_blob, new_storage.bucket, new_name=blob["new_name"]) + source_blob.delete() + else: + for blob in blobs: + 
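# --- Hypothetical usage sketch (not part of this diff) --------------------
# How the Storage helper above might be used, assuming GCP credentials are
# configured and the dev bucket exists. Dataset, table and paths are made up.
from pipelines.utils.gcp import Storage

storage = Storage(env="dev", dataset_id="example_dataset", table_id="example_table")
storage.upload_file(
    mode="raw",
    filepath="/tmp/data/raw/example_dataset/example_table/data=2024-05-01/file.json",
    partition="data=2024-05-01",
    if_exists="replace",
)
content = storage.get_blob_string(
    mode="raw",
    filename="file",
    filetype="json",
    partition="data=2024-05-01",
)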
self.bucket.rename_blob(blob["source_blob"], new_name=blob["new_name"]) + + +class Dataset(GCPBase): + def __init__(self, dataset_id: str, env: str, location: str = "southamerica-east1") -> None: + super().__init__( + dataset_id=dataset_id, + table_id="", + bucket_names=None, + env=env, + ) + self.location = location + + def exists(self) -> bool: + try: + self.client("bigquery").get_dataset(self.dataset_id) + return True + except NotFound: + return False + + def create(self): + if not self.exists(): + dataset_full_name = f"{constants.PROJECT_NAME.value[self.env]}.{self.dataset_id}" + dataset_obj = bigquery.Dataset(dataset_full_name) + dataset_obj.location = self.location + log(f"Creating dataset {dataset_full_name} | location: {self.location}") + self.client("bigquery").create_dataset(dataset_obj) + log("Dataset created!") + else: + log("Dataset already exists") + + +class BQTable(GCPBase): + def __init__( + self, + env: str, + dataset_id: str, + table_id: str, + bucket_names: dict = None, + timestamp: datetime = None, + partition_date_only: bool = False, + raw_filetype: str = "json", + ) -> None: + super().__init__( + dataset_id=dataset_id, + table_id=table_id, + bucket_names=bucket_names, + env=env, + ) + + self.table_full_name = ( + f"{constants.PROJECT_NAME.value[env]}.{self.dataset_id}.{self.table_id}" + ) + + self.partition = create_partition( + timestamp=timestamp, + partition_date_only=partition_date_only, + ) + + filepaths = create_capture_filepath( + dataset_id=dataset_id, + table_id=table_id, + timestamp=timestamp, + raw_filetype=raw_filetype, + partition=self.partition, + ) + + self.raw_filepath = filepaths.get("raw") + self.source_filepath = filepaths.get("source") + + self.timestamp = timestamp + + def _create_table_schema(self) -> list[bigquery.SchemaField]: + log("Creating table schema...") + with open(self.source_filepath, "r", encoding="utf-8") as fi: + columns = next(csv.reader(fi)) + + log(f"Columns found: {columns}") + schema = [ + bigquery.SchemaField(name=col, field_type="STRING", description=None) for col in columns + ] + log("Schema created!") + return schema + + def _create_table_config(self) -> bigquery.ExternalConfig: + if self.source_filepath is None: + raise AttributeError("source_filepath is None") + + external_config = bigquery.ExternalConfig("CSV") + external_config.options.skip_leading_rows = 1 + external_config.options.allow_quoted_newlines = True + external_config.autodetect = False + external_config.schema = self._create_table_schema() + external_config.options.field_delimiter = "," + external_config.options.allow_jagged_rows = False + + uri = f"gs://{self.bucket_name}/source/{self.dataset_id}/{self.table_id}/*" + external_config.source_uris = uri + hive_partitioning = HivePartitioningOptions() + hive_partitioning.mode = "STRINGS" + hive_partitioning.source_uri_prefix = uri.replace("*", "") + external_config.hive_partitioning = hive_partitioning + + return external_config + + def upload_raw_file(self): + if self.raw_filepath is None: + raise AttributeError("raw_filepath is None") + + st_obj = self.transfer_gcp_obj(target_class=Storage) + + st_obj.upload_file( + mode="raw", + filepath=self.raw_filepath, + partition=self.partition, + ) + + def exists(self) -> bool: + try: + return bool(self.client("bigquery").get_table(self.table_full_name)) + except NotFound: + return False + + def create(self, location: str = "southamerica-east1"): + log(f"Creating External Table: {self.table_full_name}") + self.append() + dataset_obj = 
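# --- Hypothetical usage sketch (not part of this diff) --------------------
# Typical lifecycle of the BQTable helper defined above, assuming GCP
# credentials for the dev project. Names and the timestamp are made up.
from datetime import datetime

import pytz

from pipelines.utils.gcp import BQTable

table = BQTable(
    env="dev",
    dataset_id="example_dataset",
    table_id="example_table",
    timestamp=datetime(2024, 5, 1, 12, tzinfo=pytz.UTC),
)
table.upload_raw_file()      # sends the raw capture file to the raw/ prefix
if not table.exists():
    table.create()           # uploads the source partition and creates the external table
else:
    table.append()           # only uploads the new source partition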
self.transfer_gcp_obj(target_class=Dataset, location=location) + dataset_obj.create() + + client = self.client("bigquery") + + bq_table = bigquery.Table(self.table_full_name) + bq_table.description = f"staging table for `{self.table_full_name}`" + bq_table.external_data_configuration = self._create_table_config() + + client.create_table(bq_table) + log("Table created!") + + def append(self): + if self.source_filepath is None: + raise ValueError("source_filepath is None") + + st_obj = self.transfer_gcp_obj(target_class=Storage) + + st_obj.upload_file( + mode="source", + filepath=self.source_filepath, + partition=self.partition, + ) diff --git a/pipelines/utils/incremental_capture_strategy.py b/pipelines/utils/incremental_capture_strategy.py new file mode 100644 index 000000000..cd123726c --- /dev/null +++ b/pipelines/utils/incremental_capture_strategy.py @@ -0,0 +1,414 @@ +# -*- coding: utf-8 -*- +"""Module to get incremental capture values""" +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Any, Union + +from prefeitura_rio.pipelines_utils.logging import log +from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client +from pytz import timezone + +from pipelines.constants import constants +from pipelines.utils.fs import read_raw_data +from pipelines.utils.gcp import BQTable +from pipelines.utils.utils import isostr_to_datetime + + +@dataclass +class IncrementalInfo: + start_value: Any + end_value: Any + execution_mode: str + + def __getitem__(self, key): + return self.__dict__[key] + + +class IncrementalCaptureStrategy(ABC): + """ + Classe base para criar estratégias de captura incremental + Para criar uma nova estratégia incremental: + 1. herde essa classe + 2. 
Implemente os métodos: + to_dict: Deve retornar um dicionário com uma chave (o nome da estratégia) + e os valores do dicionário devem ser os argumentos para instânciar a classe + _get_end_value: Deve receber um start_value m e retornar o valor final da captura + get_value_to_save: Deve retornar o valor a ser salvo no Redis no final do Flow + parse_redis_value: Deve receber um valor retornado do Redis e converter ele para o + tipo que será usado no Flow + + Args: + max_incremental_window: A janela máxima para calcular o valor final + first_value (optional): O valor inicial para a primeira execução + """ + + def __init__( + self, + max_incremental_window: Any, + first_value: Any = None, + ) -> None: + self._max_incremental_window = max_incremental_window + self._first_value = first_value + self._redis_key = None + self.incremental_info = None + + def __getitem__(self, key): + return self.__dict__[key] + + def initialize( + self, + table: BQTable, + overwrite_start_value: Any = None, + overwrite_end_value: Any = None, + ): + """ + Define o modo de execução e inicializa os valores iniciais e finais + + Args: + table (BQTable): O objeto de tabela usada na extração + overwrite_start_value (optional): Sobrescreve o valor inicial + (deve ter o mesmo formato do valor retornado pelo Redis) + overwrite_end_value (optional): Sobrescreve o valor final + (deve ter o mesmo formato do valor retornado pelo Redis) + """ + self._redis_key = f"{table.env}.{table.dataset_id}.{table.table_id}" + + last_redis_value = self.query_redis().get(constants.REDIS_LAST_CAPTURED_VALUE_KEY.value) + + execution_mode = ( + constants.MODE_FULL.value if last_redis_value is None else constants.MODE_INCR.value + ) + + if execution_mode == constants.MODE_FULL.value and self._first_value is not None: + last_redis_value = self.parse_redis_value(self._first_value) + + else: + last_redis_value = self.parse_redis_value(last_redis_value) + + start_value = ( + self.parse_redis_value(overwrite_start_value) + if overwrite_start_value is not None + else last_redis_value + ) + + end_value = ( + self.parse_redis_value(overwrite_end_value) + if overwrite_end_value is not None + else self._get_end_value(start_value=start_value) + ) + + if start_value is not None: + assert start_value <= end_value, "start_value greater than end_value" + + self.incremental_info = IncrementalInfo( + start_value=start_value, + end_value=end_value, + execution_mode=execution_mode, + ) + + @abstractmethod + def to_dict(self) -> dict: + """ + Retorna o dicionário para ser passado como parâmetro em um Flow + + Returns: + dict: Dicionário com uma chave (o nome da estratégia) + e os valores do dicionário devem ser os argumentos para instânciar a classe + """ + + @abstractmethod + def _get_end_value(self, start_value: Any) -> Any: + """ + Calcula o valor final com base no valor inicial + + Args: + start_value: Valor inicial da captura + Returns: + Any: Valor final da captura + """ + + @abstractmethod + def get_value_to_save(self) -> Any: + """ + Retorna o valor para salvar no Redis + """ + + def query_redis(self) -> dict: + """ + Retorna o valor salvo no Redis + + Returns: + dict: o conteúdo da key no Redis + """ + redis_client = get_redis_client() + content = redis_client.get(self._redis_key) + if content is None: + content = {} + return content + + @abstractmethod + def parse_redis_value(self, value: Any) -> Any: + """ + Converte o valor retornado do Redis no tipo a ser usado no Flow + + Args: + value: valor a ser convertido + + Returns: + Any: Valor convertido + """ + 
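# --- Illustrative sketch (not part of this diff) --------------------------
# A minimal concrete strategy following the recipe in the class docstring
# above: implement to_dict, _get_end_value, get_value_to_save and
# parse_redis_value. This toy "version" strategy walks an integer forward by
# max_incremental_window on each run; to be usable it would also need an entry
# in incremental_strategy_from_dict's class_map.
from typing import Union

from pipelines.utils.incremental_capture_strategy import IncrementalCaptureStrategy


class VersionIncremental(IncrementalCaptureStrategy):
    def to_dict(self) -> dict:
        return {
            "version": {
                "max_incremental_window": self._max_incremental_window,
                "first_value": self._first_value,
            }
        }

    def _get_end_value(self, start_value: int) -> int:
        if start_value is not None:
            return start_value + int(self._max_incremental_window)

    def get_value_to_save(self) -> int:
        return self.incremental_info.end_value

    def parse_redis_value(self, value: Union[int, str]) -> int:
        return int(value) if value is not None else None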
+ def save_on_redis(self): + """ + Salva o valor no Redis se ele for maior que o atual + + Args: + value_to_save: Valor a ser salvo no Redis + """ + value_to_save = self.get_value_to_save() + log(f"Saving value {value_to_save} on Redis") + content = self.query_redis() + old_value = content.get(constants.REDIS_LAST_CAPTURED_VALUE_KEY.value) + log(f"Value currently saved on key {self._redis_key} = {old_value}") + + if old_value is None: + flag_save = True + else: + old_value = self.parse_redis_value(old_value) + flag_save = self.parse_redis_value(value_to_save) > old_value + + if flag_save: + redis_client = get_redis_client() + content[constants.REDIS_LAST_CAPTURED_VALUE_KEY.value] = value_to_save + + redis_client.set(self._redis_key, content) + log(f"[key: {self._redis_key}] Value {value_to_save} saved on Redis!") + else: + log("Value already saved greater than value to save, task skipped") + + +class IDIncremental(IncrementalCaptureStrategy): + """ + Classe para fazer capturas incrementais com base em um ID sequencial inteiro + + Valor inicial: Valor salvo no Redis (tipo int) + Valor final: Valor inicial + max_incremental_window (tipo int) + + Salva no Redis o último id presente na captura + + Args: + max_incremental_window (int): Range máximo de ids a serem capturados + id_column_name (str): Nome da coluna de ID + first_value (optional): O valor inicial para a primeira execução + """ + + def __init__( + self, + max_incremental_window: int, + id_column_name: str, + first_value: int = None, + ) -> None: + super().__init__( + max_incremental_window=max_incremental_window, + first_value=first_value, + ) + self.id_column_name = id_column_name + self._raw_filepath = None + + def initialize( + self, + table: BQTable, + overwrite_start_value: int = None, + overwrite_end_value: int = None, + ): + """ + Executa o método da classe Base e pega o raw_filepath da tabela + + Args: + table (BQTable): O objeto de tabela usada na extração + overwrite_start_value (int, optional): Sobrescreve o valor inicial + overwrite_end_value (int, optional): Sobrescreve o valor final + """ + super().initialize( + table=table, + overwrite_start_value=overwrite_start_value, + overwrite_end_value=overwrite_end_value, + ) + self._raw_filepath = table.raw_filepath + + def to_dict(self) -> dict: + """ + Converte o objeto em um dicionário para ser passado como parâmetro no Flow + + Returns: + dict: Dicionário com a key "id" e o valor contendo argumentos para intanciação + """ + return { + "id": { + "max_incremental_window": self._max_incremental_window, + "first_value": self._first_value, + "id_column_name": self.id_column_name, + } + } + + def _get_end_value(self, start_value: int) -> int: + """ + Calcula o valor final + """ + if start_value is not None: + return start_value + int(self._max_incremental_window) + + def get_value_to_save(self) -> int: + """ + Busca no arquivo raw o último ID capturado + """ + df = read_raw_data(filepath=self._raw_filepath) + return df[self.id_column_name].dropna().astype(str).str.replace(".0", "").astype(int).max() + + def parse_redis_value(self, value: Union[int, str]) -> int: + """ + Converte o valor para inteiro + + Args: + value (Union[int, str]): Valor a ser convertido + Returns: + int: Valor convertido para inteiro + """ + if value is not None: + value = int(value) + + return value + + +class DatetimeIncremental(IncrementalCaptureStrategy): + """ + Classe para fazer capturas incrementais com base em uma data + + Valor inicial: Última data salva no Redis (tipo datetime) + Valor final: 
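# --- Hypothetical usage sketch (not part of this diff) --------------------
# Building an IDIncremental, serializing it as flow parameters and rebuilding
# it with incremental_strategy_from_dict (defined at the end of this module).
from pipelines.utils.incremental_capture_strategy import (
    IDIncremental,
    incremental_strategy_from_dict,
)

strategy = IDIncremental(max_incremental_window=1000, id_column_name="id", first_value=0)
params = strategy.to_dict()
# {"id": {"max_incremental_window": 1000, "first_value": 0, "id_column_name": "id"}}

rebuilt = incremental_strategy_from_dict(params)
assert isinstance(rebuilt, IDIncremental)
assert rebuilt._get_end_value(start_value=500) == 1500  # window of 1000 ids ahead of the start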
timestamp da tabela ou + valor inicial + max_incremental_window (caso seja menor que a timestamp) + (tipo datetime) + + Salva no Redis o valor final + + Args: + max_incremental_window (dict): Dicionário com os argumentos de timedelta + que representam o range máximo de datas a ser capturado + (ex.: {"days": 1} captura no maximo 1 dia depois da data inicial) + first_value (str, optional): O valor inicial para a primeira execução + (deve ser uma string de datetime no formato iso) + """ + + def __init__( + self, + max_incremental_window: dict, + first_value: str = None, + ) -> None: + super().__init__( + max_incremental_window=max_incremental_window, + first_value=first_value, + ) + self._timestamp = None + + def initialize( + self, + table: BQTable, + overwrite_start_value: str = None, + overwrite_end_value: str = None, + ): + """ + Executa o método da classe Base e pega o timestamp da tabela + + Args: + table (BQTable): O objeto de tabela usada na extração + overwrite_start_value (str, optional): Sobrescreve o valor inicial + (deve ser uma string de datetime no formato iso) + overwrite_end_value (str, optional): Sobrescreve o valor final + (deve ser uma string de datetime no formato iso) + """ + self._timestamp = table.timestamp + return super().initialize( + table=table, + overwrite_start_value=overwrite_start_value, + overwrite_end_value=overwrite_end_value, + ) + + def to_dict(self) -> dict: + """ + Converte o objeto em um dicionário para ser passado como parâmetro no Flow + + Returns: + dict: Dicionário com a key "datetime" e o valor contendo argumentos para intanciação + """ + return { + "datetime": { + "max_incremental_window": self._max_incremental_window, + "first_value": self._first_value, + } + } + + def _get_end_value(self, start_value: datetime) -> datetime: + """ + Calcula o valor final + """ + if start_value is not None: + end_value = min( + self._timestamp, start_value + timedelta(**self._max_incremental_window) + ) + else: + end_value = self._timestamp + + if not end_value.tzinfo: + end_value = end_value.replace(tzinfo=timezone("UTC")) + else: + end_value = end_value.astimezone(tz=timezone("UTC")) + + return end_value + + def get_value_to_save(self) -> str: + """ + Transforma o valor final em string para salvar no Redis + """ + return self.incremental_info.end_value.isoformat() + + def parse_redis_value(self, value: Union[datetime, str]) -> datetime: + """ + Converte o valor em um datetime com a timezone UTC + + Args: + value (Union[datetime, str]): Valor a ser convertido + Returns: + datetime: Valor convertido para datetime UTC + """ + if value is not None: + if isinstance(value, str): + value = isostr_to_datetime(value) + elif isinstance(value, datetime): + if value.tzinfo is None: + value = value.replace(tzinfo=timezone("UTC")) + else: + value = value.astimezone(tz=timezone("UTC")) + else: + raise ValueError("value must be str or datetime") + + return value + + +def incremental_strategy_from_dict(strategy_dict: dict) -> IncrementalCaptureStrategy: + """ + Instancia uma IncrementalCaptureStrategy com base em um dicionário + + Args: + strategy_dict (dict): Dicionário com uma key (tipo da incremental: id ou datetime) + e valores sendo os argumentos para passar ao construtor do objeto + + Returns: + IncrementalCaptureStrategy: classe concreta instanciada + """ + incremental_type = list(strategy_dict.keys())[0] + class_map = { + "id": IDIncremental, + "datetime": DatetimeIncremental, + } + return class_map[incremental_type](**strategy_dict[incremental_type]) diff --git 
a/pipelines/utils/jinja.py b/pipelines/utils/jinja.py new file mode 100644 index 000000000..76c989657 --- /dev/null +++ b/pipelines/utils/jinja.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +"""Module to render jinja templates""" +import re + +from jinja2 import Environment + +from pipelines.constants import constants + + +def render_template( + template_string: str, + execution_mode: str, + _vars: dict, + normalize: bool = False, +) -> str: + """ + Renderiza um template Jinja + + a macro is_incremental() pode ser usada da mesma forma que no DBT + + Args: + template_string (str): A string a ser tratada + execution_mode (str): full ou incr + _vars (dict): Dicionário no formato {nome_variavel: valor_variavel, ...} + normalize (bool, optional): Se True, remove quebras de linha, espaços duplos e tabs, + criando a string final com uma apenas linha. Defaults to False + + Returns: + str: A string renderizada + + """ + + def is_incremental() -> bool: + return execution_mode == constants.MODE_INCR.value + + template_env = Environment() + template_env.globals["is_incremental"] = is_incremental + template = template_env.from_string(template_string) + + rendered_string = template.render(_vars) + + if normalize: + rendered_string = re.sub(r"\s+", " ", rendered_string).strip() + + return rendered_string diff --git a/pipelines/utils/prefect.py b/pipelines/utils/prefect.py new file mode 100644 index 000000000..be484c84f --- /dev/null +++ b/pipelines/utils/prefect.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +"""Prefect functions""" +import inspect + +# import json +from typing import Any, Callable, Dict, Type, Union + +import prefect +from prefect import unmapped +from prefect.backend.flow_run import FlowRunView, FlowView, watch_flow_run + +# from prefect.engine.signals import PrefectStateSignal, signal_from_state +from prefect.tasks.prefect import create_flow_run, wait_for_flow_run +from prefeitura_rio.pipelines_utils.logging import log + +from pipelines.constants import constants +from pipelines.utils.capture.base import DataExtractor + +# from prefect.engine.state import Cancelled, State + + +class TypedParameter(prefect.Parameter): + """ + Parâmetro do Prefect com verificação de tipos + + Args: + accepted_types Union[tuple[Type], Type]: Tipo ou tupla de tipos aceitos pelo parâmetro + **parameter_kwargs: Parâmetros para ser passados à classe Parametro padrão do Prefect + """ + + def __init__(self, accepted_types: Union[tuple[Type], Type], **parameter_kwargs): + self.accepted_types = accepted_types + super().__init__(**parameter_kwargs) + + def run(self) -> Any: + """ + Metodo padrão do parâmetro do Prefect, mas com teste de tipagem + """ + param_value = super().run() + assert isinstance( + param_value, self.accepted_types + ), f"Param {self.name} must be {self.accepted_types}. Received {type(param_value)}" + + return param_value + + +def extractor_task(func: Callable, **task_init_kwargs): + """ + Decorator para tasks create_extractor_task do flow generico de captura + Usado da mesma forma que o decorator task padrão do Prefect. 
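# --- Hypothetical usage sketch (not part of this diff) --------------------
# render_template (pipelines/utils/jinja.py above) switching a WHERE clause
# with the is_incremental() macro. The query and variable names are made up;
# the execution modes come from pipelines.constants.
from pipelines.constants import constants
from pipelines.utils.jinja import render_template

QUERY_TEMPLATE = """
    SELECT * FROM {{ source_table }}
    {% if is_incremental() %}
    WHERE updated_at > '{{ date_range_start }}'
    {% endif %}
"""

full = render_template(
    QUERY_TEMPLATE,
    execution_mode=constants.MODE_FULL.value,
    _vars={"source_table": "raw.users"},
    normalize=True,
)
incr = render_template(
    QUERY_TEMPLATE,
    execution_mode=constants.MODE_INCR.value,
    _vars={"source_table": "raw.users", "date_range_start": "2024-05-01T00:00:00"},
    normalize=True,
)
assert full == "SELECT * FROM raw.users"
assert "WHERE updated_at > '2024-05-01T00:00:00'" in incr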
+ + A função da task pode receber os seguintes argumentos: + env: str, + dataset_id: str, + table_id: str, + save_filepath: str, + data_extractor_params: dict, + incremental_info: IncrementalInfo + + Garante que os argumentos e retorno da task estão corretos e + possibilita que a task seja criada sem precisar de todos os argumentos passados pelo flow + """ + task_init_kwargs["name"] = task_init_kwargs.get("name", func.__name__) + signature = inspect.signature(func) + assert task_init_kwargs.get("nout", 1) == 1, "nout must be 1" + + return_annotation = signature.return_annotation + + if hasattr(return_annotation, "__origin__") and return_annotation.__origin__ is Union: + return_assertion = all(issubclass(t, DataExtractor) for t in return_annotation.__args__) + else: + return_assertion = issubclass( + signature.return_annotation, + DataExtractor, + ) + + assert return_assertion, "return must be DataExtractor subclass" + + def decorator(func): + expected_arguments = [ + "env", + "dataset_id", + "table_id", + "save_filepath", + "data_extractor_params", + "incremental_info", + ] + + function_arguments = [p.name for p in signature.parameters.values()] + + invalid_args = [a for a in function_arguments if a not in expected_arguments] + + if len(invalid_args) > 0: + raise ValueError(f"Invalid arguments: {', '.join(invalid_args)}") + + def wrapper(**kwargs): + return func(**{k: v for k, v in kwargs.items() if k in function_arguments}) + + task_init_kwargs["checkpoint"] = False + return prefect.task(wrapper, **task_init_kwargs) + + if func is None: + return decorator + return decorator(func=func) + + +def run_local(flow: prefect.Flow, parameters: Dict[str, Any] = None): + """ + Executa um flow localmente + """ + # Setup for local run + flow.storage = None + flow.run_config = None + flow.schedule = None + flow.state_handlers = [] + + # Run flow + return flow.run(parameters=parameters) if parameters else flow.run() + + +def flow_is_running_local() -> bool: + """ + Testa se o flow está rodando localmente + + Returns: + bool: True se está rodando local, False se está na nuvem + """ + return prefect.context.get("project_name") is None + + +def rename_current_flow_run(name: str) -> bool: + """ + Renomeia a run atual do Flow + + Returns: + bool: Se o flow foi renomeado + """ + if not flow_is_running_local(): + flow_run_id = prefect.context.get("flow_run_id") + client = prefect.Client() + return client.set_flow_run_name(flow_run_id, name) + return False + + +def get_current_flow_labels() -> list[str]: + """ + Get the labels of the current flow. + """ + flow_run_id = prefect.context.get("flow_run_id") + flow_run_view = FlowRunView.from_flow_run_id(flow_run_id) + return flow_run_view.labels + + +def create_subflow_run( + flow_name: str, + parameters: dict, + idempotency_key: str, + project_name: str = None, + labels: list[str] = None, +) -> str: + """ + Executa um subflow + + Args: + flow_name (str): Nome do flow a ser executado. 
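# --- Illustrative sketch (not part of this diff) --------------------------
# A capture task built with the extractor_task decorator above. The function
# declares only the arguments it needs; the generic capture flow passes the
# full set and the wrapper filters them. Folder/filename values are made up.
from pipelines.utils.capture.gcs import GCSExtractor
from pipelines.utils.prefect import extractor_task


@extractor_task
def create_example_extractor(env: str, dataset_id: str, save_filepath: str) -> GCSExtractor:
    return GCSExtractor(
        env=env,
        folder=dataset_id,
        filename="example_file",
        save_filepath=save_filepath,
    )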
+ parameters (dict): Parâmetros para executar o flow + idempotency_key (str): Uma chave única para a run do flow, execuções de flows + com a mesma idempotency_key são consideradas a mesma + project_name (str, optional): Nome do projeto no Prefect para executar o flow, + se não for especificado, é utilizado o nome do projeto do flow atual + labels (list[str]): Labels para executar o flow, + se não for especificado, são utilizadas as labels do flow atual + + Returns: + str: o id da execução do flow + """ + + if prefect.context["flow_name"] == flow_name: + raise RecursionError("Can not run recursive flows") + + if project_name is None: + project_name = prefect.context.get("project_name") + + if labels is None: + labels = get_current_flow_labels() + + log( + f"""Will run flow with the following data: + flow name: {flow_name} + project name: {project_name} + labels: {labels} + parameters: {parameters} + """ + ) + + flow = FlowView.from_flow_name(flow_name, project_name=project_name) + + client = prefect.Client() + + flow_run_id = client.create_flow_run( + flow_id=flow.flow_id, + parameters=parameters, + labels=labels, + idempotency_key=idempotency_key, + ) + + # try: + # prefect.context["_subflow_ids"].append(flow_run_id) + # except KeyError: + # prefect.context["_subflow_ids"] = [flow_run_id] + + run_url = constants.FLOW_RUN_URL_PATTERN.value.format(run_id=flow_run_id) + + log(f"Created flow run: {run_url}") + + return flow_run_id + + +def wait_subflow_run(flow_run_id: str) -> FlowRunView: + flow_run = FlowRunView.from_flow_run_id(flow_run_id) + + for exec_log in watch_flow_run( + flow_run_id, + stream_states=True, + stream_logs=True, + ): + message = f"Flow {flow_run.name!r}: {exec_log.message}" + prefect.context.logger.log(exec_log.level, message) + + flow_run = flow_run.get_latest() + + # state_signal = signal_from_state(flow_run.state)( + # message=f"{flow_run_id} finished in state {flow_run.state}", + # result=flow_run, + # ) + return flow_run + + +def run_flow_mapped( + flow_name: str, + parameters: list[dict], + project_name: str = None, + labels: list[str] = None, + maximum_parallelism: int = None, +): + """ + Executa e espera várias execuções de um mesmo flow em paralelo + com diferentes argumentos + + Args: + flow_name (str): Nome do flow a ser executado. + parameters (list[dict]): Lista de parâmetros para cada execução do flow. 
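# --- Hypothetical usage sketch (not part of this diff) --------------------
# Launching and waiting on a subflow with the helpers above, called from
# inside a running flow (they rely on prefect.context). Flow name, parameters
# and the idempotency key are made up.
from pipelines.utils.prefect import create_subflow_run, wait_subflow_run

flow_run_id = create_subflow_run(
    flow_name="SMTR: Example capture flow",
    parameters={"table_id": "example_table"},
    idempotency_key="example_table-2024-05-01T09:00:00",
)
flow_run = wait_subflow_run(flow_run_id=flow_run_id)
print(flow_run.state)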
+ project_name (str, optional): Nome do projeto no Prefect para executar o flow, + se não for especificado, é utilizado o nome do projeto do flow atual + labels (list[str]): Labels para executar o flow, + se não for especificado, são utilizadas as labels do flow atual + + Returns: + FunctionTask: retorno da task wait_for_flow_run + """ + if not isinstance(parameters, list): + raise ValueError("Parameters must be a list") + + if prefect.context["flow_name"] == flow_name: + raise ValueError("Can not run recursive flows") + + if project_name is None: + project_name = prefect.context.get("project_name") + + if labels is None: + labels = get_current_flow_labels() + + if maximum_parallelism is None: + execution_list = [parameters] + else: + execution_list = [ + parameters[i : i + maximum_parallelism] # noqa + for i in range(0, len(parameters), maximum_parallelism) + ] + + complete_wait = [] + for params in execution_list: + subflow_runs = create_flow_run.map( + flow_name=unmapped(flow_name), + project_name=unmapped(project_name), + labels=unmapped(labels), + parameters=params, + ) + + wait_runs = wait_for_flow_run.map( + subflow_runs, + stream_states=unmapped(True), + stream_logs=unmapped(True), + raise_final_state=unmapped(True), + ) + complete_wait.append(wait_runs) + + return complete_wait + + +# def handler_cancel_subflows(obj, old_state: State, new_state: State) -> State: +# if isinstance(new_state, Cancelled): +# client = prefect.Client() +# subflows = prefect.context.get("_subflow_ids", []) +# if len(subflows) > 0: +# query = f""" +# query {{ +# flow_run( +# where: {{ +# _and: [ +# {{state: {{_in: ["Running", "Submitted", "Scheduled"]}}}}, +# {{id: {{_in: {json.dumps(subflows)}}}}} +# ] +# }} +# ) {{ +# id +# }} +# }} +# """ +# # pylint: disable=no-member +# response = client.graphql(query=query) +# active_subflow_runs = response["data"]["flow_run"] +# if active_subflow_runs: +# logger = prefect.context.get("logger") +# logger.info(f"Found {len(active_subflow_runs)} subflows running") +# for subflow_run_id in active_subflow_runs: +# logger.info(f"cancelling run: {subflow_run_id}") +# client.cancel_flow_run(flow_run_id=subflow_run_id) +# logger("Run cancelled!") +# return new_state + + +class FailedSubFlow(Exception): + """Erro para ser usado quando um subflow falha""" diff --git a/pipelines/utils/pretreatment.py b/pipelines/utils/pretreatment.py new file mode 100644 index 000000000..0301c4608 --- /dev/null +++ b/pipelines/utils/pretreatment.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +"""Functions to pretreat data""" +import inspect +from datetime import datetime + +import pandas as pd +from prefeitura_rio.pipelines_utils.logging import log + + +def transform_to_nested_structure(data: pd.DataFrame, primary_keys: list) -> pd.DataFrame: + """ + Transforma colunas do DataFrame na coluna content no formato Json + agrupando pelas primary keys + + Args: + data (pd.DataFrame): DataFrame para aplicar o tratamento + primary_keys (list): Lista de primary keys + + Returns: + pd.DataFrame: Dataframe contendo as colunas listadas nas primary keys + coluna content + """ + return ( + data.groupby(primary_keys) + .apply(lambda x: x[data.columns.difference(primary_keys)].to_json(orient="records")) + .str.strip("[]") + .reset_index(name="content")[primary_keys + ["content"]] + ) + + +def pretreatment_func(func): + """ + Decorator para ajudar no desenvolvimento de funções + de pre-tratamento para serem passadas no flow generico de captura + + Faz a checagem dos parâmetros e do retorno da função + e 
possibilita a criação da função sem precisar de todos + os parâmetros passados pela Task + """ + + def wrapper(**kwargs): + signature = inspect.signature(func) + assert issubclass( + signature.return_annotation, + pd.DataFrame, + ), "return must be pandas DataFrame" + func_parameter_names = signature.parameters.keys() + func_parameters = signature.parameters.values() + expected_arguments = {"data": pd.DataFrame, "timestamp": datetime, "primary_keys": list} + + invalid_args = [ + a.name + for a in func_parameters + if a.name not in expected_arguments + or isinstance(a.annotation, expected_arguments[a.name]) + ] + + if len(invalid_args) > 0: + raise ValueError(f"Invalid arguments: {', '.join(invalid_args)}") + + kwargs = {k: v for k, v in kwargs.items() if k in func_parameter_names} + return func(**kwargs) + + return wrapper + + +@pretreatment_func +def strip_string_columns(data: pd.DataFrame) -> pd.DataFrame: + """ + Aplica a função strip em todas as colunas do formato string + de um DataFrame + + Args: + data (pd.DataFrame): Dataframe a ser tratado + + Returns: + pd.DataFrame: Dataframe tratado + """ + for col in data.columns[data.dtypes == "object"].to_list(): + try: + data[col] = data[col].str.strip() + except AttributeError as e: + log(f"Error {e} on column {col}") + return data diff --git a/pipelines/utils/secret.py b/pipelines/utils/secret.py index 46b1905be..ad35e519a 100644 --- a/pipelines/utils/secret.py +++ b/pipelines/utils/secret.py @@ -1,27 +1,25 @@ # -*- coding: utf-8 -*- from prefeitura_rio.pipelines_utils.infisical import get_infisical_client -from pipelines.utils.utils import normalize_keys - def get_secret(secret_path: str = "/", secret_name: str = None, environment: str = "dev"): """ - Fetches secrets from Infisical. If passing only `secret_path` and - no `secret_name`, returns all secrets inside a folder. + Pega os dados de um secret no Infisical. Se for passado somente um secret_path + sem o argumento secret_name, retorna todos os secrets dentro da pasta. Args: - secret_name (str, optional): _description_. Defaults to None. - secret_path (str, optional): _description_. Defaults to '/'. - environment (str, optional): _description_. Defaults to 'dev'. + secret_path (str, optional): Pasta do secret no infisical. Defaults to "/". + secret_name (str, optional): Nome do secret. Defaults to None. + environment (str, optional): Ambiente para ler o secret. Defaults to 'dev'. 
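# --- Illustrative example (not part of this diff) -------------------------
# What transform_to_nested_structure (pipelines/utils/pretreatment.py above)
# does to a small made-up DataFrame: non-key columns are packed into a JSON
# "content" column, grouped by the primary keys.
import pandas as pd

from pipelines.utils.pretreatment import transform_to_nested_structure

data = pd.DataFrame(
    {
        "id_veiculo": ["A1", "B2"],
        "servico": ["10", "20"],
        "latitude": [-22.9, -22.8],
        "longitude": [-43.2, -43.3],
    }
)
nested = transform_to_nested_structure(data, primary_keys=["id_veiculo", "servico"])
# resulting columns: id_veiculo, servico, content
# content example: {"latitude":-22.9,"longitude":-43.2}
print(nested)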
Returns: - _type_: _description_ + dict: Dicionário com os dados retornados do Infisical """ client = get_infisical_client() if not secret_path.startswith("/"): secret_path = f"/{secret_path}" if secret_path and not secret_name: secrets = client.get_all_secrets(path=secret_path) - return normalize_keys({s.secret_name: s.secret_value for s in secrets}) + return {s.secret_name.lower(): s.secret_value for s in secrets} secret = client.get_secret(secret_name=secret_name, path=secret_path, environment=environment) - return {secret_name: secret.secret_value} + return {secret_name.lower(): secret.secret_value} diff --git a/pipelines/utils/utils.py b/pipelines/utils/utils.py index 80cc11c3a..996284f97 100644 --- a/pipelines/utils/utils.py +++ b/pipelines/utils/utils.py @@ -1,470 +1,31 @@ # -*- coding: utf-8 -*- -# flake8: noqa: E501 -""" -General purpose functions for rj_smtr -""" - +"""General purpose functions""" import io -import json -import math -import time -import traceback -import zipfile -from datetime import date, datetime, timedelta -from ftplib import FTP -from pathlib import Path -from typing import Any, List, Union +from datetime import date, datetime +from typing import Any import basedosdados as bd import pandas as pd -import psycopg2 -import psycopg2.extras -import pymysql import pytz -import requests -from basedosdados import Storage, Table -from google.cloud.storage.blob import Blob -from prefect.schedules.clocks import IntervalClock -from prefeitura_rio.pipelines_utils.infisical import get_secret -from prefeitura_rio.pipelines_utils.logging import log # TODO: add or relocate imports -from prefeitura_rio.pipelines_utils.redis_pal import get_redis_client -from pytz import timezone -from redis_pal import RedisPal +from pandas_gbq.exceptions import GenericGBQException +from prefeitura_rio.pipelines_utils.logging import log from pipelines.constants import constants -from pipelines.utils.implicit_ftp import ImplicitFtpTls # Set BD config to run on cloud # bd.config.from_file = True -def send_discord_message( - message: str, - webhook_url: str, -) -> None: - """ - Sends a message to a Discord channel. - """ - requests.post( - webhook_url, - data={"content": message}, - ) - - -def log_critical(message: str, secret_path: str = constants.CRITICAL_SECRET_PATH.value): - """Logs message to critical discord channel specified - - Args: - message (str): Message to post on the channel - secret_path (str, optional): Secret path storing the webhook to critical channel. - Defaults to constants.CRITICAL_SECRETPATH.value. - - """ - url = get_secret(secret_path=secret_path)["data"]["url"] - return send_discord_message(message=message, webhook_url=url) - - -def create_or_append_table(dataset_id: str, table_id: str, path: str, partitions: str = None): - """Conditionally create table or append data to its relative GCS folder. 
- - Args: - dataset_id (str): target dataset_id on BigQuery - table_id (str): target table_id on BigQuery - path (str): Path to .csv data file - """ - tb_obj = Table(table_id=table_id, dataset_id=dataset_id) - if not tb_obj.table_exists("staging"): - log("Table does not exist in STAGING, creating table...") - dirpath = path.split(partitions)[0] - tb_obj.create( - path=dirpath, - if_table_exists="pass", - if_storage_data_exists="replace", - ) - log("Table created in STAGING") - else: - log("Table already exists in STAGING, appending to it...") - tb_obj.append(filepath=path, if_exists="replace", timeout=600, partitions=partitions) - log("Appended to table on STAGING successfully.") - - -def generate_df_and_save(data: dict, fname: Path): - """Save DataFrame as csv - - Args: - data (dict): dict with the data which to build the DataFrame - fname (Path): _description_ - """ - # Generate dataframe - dataframe = pd.DataFrame() - dataframe[data["key_column"]] = [piece[data["key_column"]] for piece in data["data"]] - dataframe["content"] = list(data["data"]) - - # Save dataframe to CSV - dataframe.to_csv(fname, index=False) - - -def bq_project(kind: str = "bigquery_prod"): - """Get the set BigQuery project_id - - Args: - kind (str, optional): Which client to get the project name from. - Options are 'bigquery_staging', 'bigquery_prod' and 'storage_staging' - Defaults to 'bigquery_prod'. - - Returns: - str: the requested project_id - """ - return bd.upload.base.Base().client[kind].project - - -def get_table_min_max_value( # pylint: disable=R0913 - query_project_id: str, - dataset_id: str, - table_id: str, - field_name: str, - kind: str, - wait=None, # pylint: disable=unused-argument -): - """Query a table to get the maximum value for the chosen field. - Useful to incrementally materialize tables via DBT - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - field_name (str): column name to query - kind (str): which value to get. Accepts min and max - """ - log(f"Getting {kind} value for {table_id}") - query = f""" - SELECT - {kind}({field_name}) - FROM {query_project_id}.{dataset_id}.{table_id} - """ - log(f"Will run query:\n{query}") - result = bd.read_sql(query=query, billing_project_id=bq_project()) - - return result.iloc[0][0] - - -def get_last_run_timestamp(dataset_id: str, table_id: str, mode: str = "prod") -> str: - """ - Query redis to retrive the time for when the last materialization - ran. - - Args: - dataset_id (str): dataset_id on BigQuery - table_id (str): model filename on the queries repo. - eg: if you have a model defined in the file .sql, - the table_id should be - mode (str): - - Returns: - Union[str, None]: _description_ - """ - redis_client = get_redis_client() - key = dataset_id + "." + table_id - log(f"Fetching key {key} from redis, working on mode {mode}") - if mode == "dev": - key = f"{mode}.{key}" - runs = redis_client.get(key) - # if runs is None: - # redis_client.set(key, "") - try: - last_run_timestamp = runs["last_run_timestamp"] - except KeyError: - return None - except TypeError: - return None - log(f"Got value {last_run_timestamp}") - return last_run_timestamp - - -def map_dict_keys(data: dict, mapping: dict) -> None: - """ - Map old keys to new keys in a dict. 
- """ - for old_key, new_key in mapping.items(): - data[new_key] = data.pop(old_key) - return data - - -def normalize_keys(data: dict): - _data = {key.lower(): value for key, value in data.items()} - return _data - - -def connect_ftp(secret_path: str = None, secure: bool = True): - """Connect to FTP - - Returns: - ImplicitFTP_TLS: ftp client - """ - - ftp_data = get_secret(secret_path)["data"] - if secure: - ftp_client = ImplicitFtpTls() - else: - ftp_client = FTP() - ftp_client.connect(host=ftp_data["host"], port=int(ftp_data["port"])) - ftp_client.login(user=ftp_data["username"], passwd=ftp_data["pwd"]) - if secure: - ftp_client.prot_p() - return ftp_client - - -def safe_cast(val, to_type, default=None): - """ - Safe cast value. - """ - try: - return to_type(val) - except ValueError: - return default - - -def set_redis_rdo_files(redis_client, dataset_id: str, table_id: str): - """ - Register downloaded files to Redis - - Args: - redis_client (_type_): _description_ - dataset_id (str): dataset_id on BigQuery - table_id (str): table_id on BigQuery - - Returns: - bool: if the key was properly set - """ - try: - content = redis_client.get(f"{dataset_id}.{table_id}")["files"] - except TypeError as e: - log(f"Caught error {e}. Will set unexisting key") - # set key to empty dict for filling later - redis_client.set(f"{dataset_id}.{table_id}", {"files": []}) - content = redis_client.get(f"{dataset_id}.{table_id}") - # update content - st_client = bd.Storage(dataset_id=dataset_id, table_id=table_id) - blob_names = [ - blob.name - for blob in st_client.client["storage_staging"].list_blobs( - st_client.bucket, prefix=f"staging/{dataset_id}/{table_id}" - ) - ] - files = [blob_name.split("/")[-1].replace(".csv", "") for blob_name in blob_names] - log(f"When setting key, found {len(files)} files. Will register on redis...") - content["files"] = files - # set key - return redis_client.set(f"{dataset_id}.{table_id}", content) - - -# PRE TREAT # - - -def check_not_null(data: pd.DataFrame, columns: list, subset_query: str = None): - """ - Check if there are null values in columns. - - Args: - columns (list): list of columns to check - subset_query (str): query to check if there are important data - being removed - - Returns: - None - """ - - for col in columns: - remove = data.query(f"{col} != {col}") # null values - log( - f"[data-check] There are {len(remove)} rows with null values in '{col}'", - level="info", - ) - - if subset_query is not None: - # Check if there are important data being removed - remove = remove.query(subset_query) - if len(remove) > 0: - log( - f"""[data-check] There are {len(remove)} critical rows with - null values in '{col}' (query: {subset_query})""", - level="warning", - ) - - -def filter_null(data: pd.DataFrame, columns: list, subset_query: str = None): - """ - Filter null values in columns. 
- - Args: - columns (list): list of columns to check - subset_query (str): query to check if there are important data - being removed - - Returns: - pandas.DataFrame: data without null values - """ - - for col in columns: - remove = data.query(f"{col} != {col}") # null values - data = data.drop(remove.index) - log( - f"[data-filter] Removed {len(remove)} rows with null '{col}'", - level="info", - ) - - if subset_query is not None: - # Check if there are important data being removed - remove = remove.query(subset_query) - if len(remove) > 0: - log( - f"[data-filter] Removed {len(remove)} critical rows with null '{col}'", - level="warning", - ) - - return data - - -def filter_data(data: pd.DataFrame, filters: list, subset_query: str = None): - """ - Filter data from a dataframe - - Args: - data (pd.DataFrame): data DataFrame - filters (list): list of queries to filter data - - Returns: - pandas.DataFrame: data without filter data - """ - for item in filters: - remove = data.query(item) - data = data.drop(remove.index) - log( - f"[data-filter] Removed {len(remove)} rows from filter: {item}", - level="info", - ) - - if subset_query is not None: - # Check if there are important data being removed - remove = remove.query(subset_query) - if len(remove) > 0: - log( - f"""[data-filter] Removed {len(remove)} critical rows - from filter: {item} (subquery: {subset_query})""", - level="warning", - ) - - return data - - -def check_relation(data: pd.DataFrame, columns: list): - """ - Check relation between collumns. - - Args: - data (pd.DataFrame): dataframe to be modified - columns (list): list of lists of columns to be checked - - Returns: - None - """ - - for cols in columns: - df_dup = data[~data.duplicated(subset=cols)].groupby(cols).count().reset_index().iloc[:, :1] - - for col in cols: - df_dup_col = ( - data[~data.duplicated(subset=col)].groupby(col).count().reset_index().iloc[:, :1] - ) - - if len(df_dup_col[~df_dup_col[col].duplicated()]) == len(df_dup): - log( - f"[data-check] Comparing '{col}' in '{cols}', there are no duplicated values", - level="info", - ) - else: - log( - f"[data-check] Comparing '{col}' in '{cols}', there are duplicated values", - level="warning", - ) - - -def data_info_str(data: pd.DataFrame): - """ - Return dataframe info as a str to log - - Args: - data (pd.DataFrame): dataframe - - Returns: - data.info() as a string - """ - buffer = io.StringIO() - data.info(buf=buffer) - return buffer.getvalue() - - -def generate_execute_schedules( # pylint: disable=too-many-arguments,too-many-locals - clock_interval: timedelta, - labels: List[str], - table_parameters: Union[list[dict], dict], - runs_interval_minutes: int = 15, - start_date: datetime = datetime(2020, 1, 1, tzinfo=pytz.timezone(constants.TIMEZONE.value)), - **general_flow_params, -) -> List[IntervalClock]: - """ - Generates multiple schedules - - Args: - clock_interval (timedelta): The interval to run the schedule - labels (List[str]): The labels to be added to the schedule - table_parameters (list): The table parameters to iterate over - runs_interval_minutes (int, optional): The interval between each schedule. Defaults to 15. - start_date (datetime, optional): The start date of the schedule. - Defaults to datetime(2020, 1, 1, tzinfo=pytz.timezone(constants.TIMEZONE.value)). 
- general_flow_params: Any param that you want to pass to the flow - Returns: - List[IntervalClock]: The list of schedules - - """ - if isinstance(table_parameters, dict): - table_parameters = [table_parameters] - - clocks = [] - for count, parameters in enumerate(table_parameters): - parameter_defaults = parameters | general_flow_params - clocks.append( - IntervalClock( - interval=clock_interval, - start_date=start_date + timedelta(minutes=runs_interval_minutes * count), - labels=labels, - parameter_defaults=parameter_defaults, - ) - ) - return clocks - - -def dict_contains_keys(input_dict: dict, keys: list[str]) -> bool: - """ - Test if the input dict has all keys present in the list - - Args: - input_dict (dict): the dict to test if has the keys - keys (list[str]): the list containing the keys to check - Returns: - bool: True if the input_dict has all the keys otherwise False - """ - return all(x in input_dict.keys() for x in keys) - - def custom_serialization(obj: Any) -> Any: """ - Function to serialize not JSON serializable objects + Função para serializar objetos não serializaveis + pela função json.dump Args: - obj (Any): Object to serialize + obj (Any): Objeto a ser serializado Returns: - Any: Serialized object + Any: Object serializado """ if isinstance(obj, (pd.Timestamp, date)): if isinstance(obj, pd.Timestamp): @@ -475,453 +36,104 @@ def custom_serialization(obj: Any) -> Any: raise TypeError(f"Object of type {type(obj)} is not JSON serializable") -def save_raw_local_func( - data: Union[dict, str], - filepath: str, - mode: str = "raw", - filetype: str = "json", -) -> str: - """ - Saves json response from API to .json file. - Args: - data (Union[dict, str]): Raw data to save - filepath (str): Path which to save raw file - mode (str, optional): Folder to save locally, later folder which to upload to GCS. - filetype (str, optional): The file format - Returns: - str: Path to the saved file - """ - - # diferentes tipos de arquivos para salvar - _filepath = filepath.format(mode=mode, filetype=filetype) - Path(_filepath).parent.mkdir(parents=True, exist_ok=True) - - if filetype == "json": - if isinstance(data, str): - data = json.loads(data) - with Path(_filepath).open("w", encoding="utf-8") as fi: - json.dump(data, fi, default=custom_serialization) - - if filetype in ("txt", "csv"): - with open(_filepath, "w", encoding="utf-8") as file: - file.write(data) - - log(f"Raw data saved to: {_filepath}") - return _filepath - - -def get_raw_data_api( # pylint: disable=R0912 - url: str, - secret_path: str = None, - api_params: dict = None, - filetype: str = None, -) -> tuple[str, str, str]: - """ - Request data from URL API - - Args: - url (str): URL to request data - secret_path (str, optional): Secret path to get headers. Defaults to None. - api_params (dict, optional): Parameters to pass to API. Defaults to None. - filetype (str, optional): Filetype to save raw file. Defaults to None. 
- - Returns: - tuple[str, str, str]: Error, data and filetype - """ - error = None - data = None - try: - if secret_path is None: - headers = secret_path - else: - headers = get_secret(secret_path)["data"] - - response = requests.get( - url, - headers=headers, - timeout=constants.MAX_TIMEOUT_SECONDS.value, - params=api_params, - ) - - response.raise_for_status() - - if filetype == "json": - data = response.json() - else: - data = response.text - - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error, data, filetype - - -def get_upload_storage_blob( - dataset_id: str, - filename: str, -) -> Blob: +def data_info_str(data: pd.DataFrame): """ - Get a blob from upload zone in storage + Retorna as informações de um Dataframe como string Args: - dataset_id (str): The dataset id on BigQuery. - filename (str): The filename in GCS. + data (pd.DataFrame): Dataframe para extrair as informações Returns: - Blob: blob object + str: retorno do método data.info() """ - bucket = bd.Storage(dataset_id="", table_id="") - log(f"Filename: {filename}, dataset_id: {dataset_id}") - blob_list = list( - bucket.client["storage_staging"] - .bucket(bucket.bucket_name) - .list_blobs(prefix=f"upload/{dataset_id}/{filename}.") - ) - - return blob_list[0] + buffer = io.StringIO() + data.info(buf=buffer) + return buffer.getvalue() -def get_raw_data_gcs( - dataset_id: str, - table_id: str, - zip_filename: str = None, -) -> tuple[str, str, str]: +def create_timestamp_captura(timestamp: datetime) -> str: """ - Get raw data from GCS + Cria o valor para a coluna timestamp_captura Args: - dataset_id (str): The dataset id on BigQuery. - table_id (str): The table id on BigQuery. - zip_filename (str, optional): The zip file name. Defaults to None. 
+ timestamp (datetime): timestamp a ser escrita Returns: - tuple[str, str, str]: Error, data and filetype + str: Valor a ser escrito na coluna timestamp_captura """ - error = None - data = None - filetype = None - - try: - blob_search_name = zip_filename or table_id - blob = get_upload_storage_blob(dataset_id=dataset_id, filename=blob_search_name) - - filename = blob.name - filetype = filename.split(".")[-1] - - data = blob.download_as_bytes() - - if filetype == "zip": - with zipfile.ZipFile(io.BytesIO(data), "r") as zipped_file: - filenames = zipped_file.namelist() - filename = list(filter(lambda x: x.split(".")[0] == table_id, filenames))[0] - filetype = filename.split(".")[-1] - data = zipped_file.read(filename) + if timestamp.tzinfo is None: + timestamp = timestamp.replace(tzinfo=pytz.UTC) - data = data.decode(encoding="utf-8") - - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error, data, filetype + return timestamp.astimezone(tz=pytz.timezone(constants.TIMEZONE.value)).strftime( + "%Y-%m-%d %H:%M:%S-03:00" + ) -def get_raw_data_db( - query: str, engine: str, host: str, secret_path: str, database: str -) -> tuple[str, str, str]: +def isostr_to_datetime(datetime_str: str) -> datetime: """ - Get data from Databases + Converte uma string de data no formato iso em um datetime em UTC Args: - query (str): the SQL Query to execute - engine (str): The datase management system - host (str): The database host - secret_path (str): Secret path to get credentials - database (str): The database to connect + datetime_str (str): String a ser convertida Returns: - tuple[str, str, str]: Error, data and filetype - """ - connector_mapping = { - "postgresql": psycopg2.connect, - "mysql": pymysql.connect, - } - - data = None - error = None - filetype = "json" - - try: - credentials = get_secret(secret_path)["data"] - - with connector_mapping[engine]( - host=host, - user=credentials["user"], - password=credentials["password"], - database=database, - ) as connection: - data = pd.read_sql(sql=query, con=connection).to_dict(orient="records") - - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error, data, filetype - - -def save_treated_local_func( - filepath: str, data: pd.DataFrame, error: str, mode: str = "staging" -) -> str: + datetime: String convertida em datetime """ - Save treated file to CSV. + converted = datetime.fromisoformat(datetime_str) + if converted.tzinfo is None: + converted = converted.replace(tzinfo=pytz.UTC) + else: + converted = converted.astimezone(tz=pytz.timezone("UTC")) - Args: - filepath (str): Path to save file - data (pd.DataFrame): Dataframe to save - error (str): Error catched during execution - mode (str, optional): Folder to save locally, later folder which to upload to GCS. + return converted - Returns: - str: Path to the saved file - """ - _filepath = filepath.format(mode=mode, filetype="csv") - Path(_filepath).parent.mkdir(parents=True, exist_ok=True) - if error is None: - data.to_csv(_filepath, index=False) - log(f"Treated data saved to: {_filepath}") - return _filepath - -def upload_run_logs_to_bq( # pylint: disable=R0913 +def create_sql_update_filter( + env: str, dataset_id: str, - parent_table_id: str, - timestamp: str, - error: str = None, - previous_error: str = None, - recapture: bool = False, - mode: str = "raw", -): - """ - Upload execution status table to BigQuery. 
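# --- Illustrative example (not part of this diff) -------------------------
# Behaviour of the new timestamp helpers added above, assuming
# constants.TIMEZONE is America/Sao_Paulo (-03:00): naive datetimes are
# treated as UTC before conversion.
from datetime import datetime

import pytz

from pipelines.utils.utils import create_timestamp_captura, isostr_to_datetime

assert create_timestamp_captura(datetime(2024, 5, 1, 12, 0)) == "2024-05-01 09:00:00-03:00"

parsed = isostr_to_datetime("2024-05-01T12:00:00")
assert parsed == datetime(2024, 5, 1, 12, 0, tzinfo=pytz.UTC)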
- Table is uploaded to the same dataset, named {parent_table_id}_logs. - If passing status_dict, should not pass timestamp and error. - - Args: - dataset_id (str): dataset_id on BigQuery - parent_table_id (str): table_id on BigQuery - timestamp (str): timestamp to get datetime range - error (str): error catched during execution - previous_error (str): previous error catched during execution - recapture (bool): if the execution was a recapture - mode (str): folder to save locally, later folder which to upload to GCS - - Returns: - None - """ - table_id = parent_table_id + "_logs" - # Create partition directory - filename = f"{table_id}_{timestamp.isoformat()}" - partition = f"data={timestamp.date()}" - filepath = Path(f"""data/{mode}/{dataset_id}/{table_id}/{partition}/{filename}.csv""") - filepath.parent.mkdir(exist_ok=True, parents=True) - # Create dataframe to be uploaded - if not error and recapture is True: - # if the recapture is succeeded, update the column erro - dataframe = pd.DataFrame( - { - "timestamp_captura": [timestamp], - "sucesso": [True], - "erro": [f"[recapturado]{previous_error}"], - } - ) - log(f"Recapturing {timestamp} with previous error:\n{previous_error}") - else: - # not recapturing or error during flow execution - dataframe = pd.DataFrame( - { - "timestamp_captura": [timestamp], - "sucesso": [error is None], - "erro": [error], - } - ) - # Save data local - dataframe.to_csv(filepath, index=False) - # Upload to Storage - create_or_append_table( - dataset_id=dataset_id, - table_id=table_id, - path=filepath.as_posix(), - partitions=partition, - ) - if error is not None: - raise Exception(f"Pipeline failed with error: {error}") - - -def get_datetime_range( - timestamp: datetime, - interval: timedelta, -) -> dict: + table_id: str, + columns_to_search: list[str], +) -> str: """ - Task to get datetime range in UTC + Cria condição para ser usada no WHERE de queries SQL + de modo a buscar por mudanças em um conjunto de colunas + com base na tabela do BQ. Args: - timestamp (datetime): timestamp to get datetime range - interval (timedelta): interval to get datetime range + env (str): Dev ou prod. + dataset_id (str): Dataset_id no BigQuery. + table_id (str): Table_id no BigQuery. + columns_to_search (list[str]): Lista de nomes das colunas + para buscar por alterações. Returns: - dict: datetime range - """ - - start = (timestamp - interval).astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - - end = timestamp.astimezone(tz=pytz.timezone("UTC")).strftime("%Y-%m-%d %H:%M:%S") - - return {"start": start, "end": end} - - -def read_raw_data(filepath: str, csv_args: dict = None) -> tuple[str, pd.DataFrame]: + str: Condição para ser adicionada na query. Se a tabela não existir no BQ, retorna 1=1 """ - Read raw data from file - - Args: - filepath (str): filepath to read - csv_args (dict): arguments to pass to pandas.read_csv + project = constants.PROJECT_NAME.value[env] + log(f"project = {project}") + columns_to_concat_bq = [c.split(".")[-1] for c in columns_to_search] + concat_arg = ",'_'," - Returns: - tuple[str, pd.DataFrame]: error and data - """ - error = None - data = None try: - file_type = filepath.split(".")[-1] - - if file_type == "json": - data = pd.read_json(filepath) - - # data = json.loads(data) - elif file_type in ("txt", "csv"): - if csv_args is None: - csv_args = {} - data = pd.read_csv(filepath, **csv_args) - else: - error = "Unsupported raw file extension. 
Supported only: json, csv and txt" - - except Exception: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - - return error, data - - -def get_raw_recursos(request_url: str, request_params: dict) -> tuple[str, str, str]: - """ - Returns a dataframe with recursos data from movidesk api. - """ - all_records = False - top = 1000 - skip = 0 - error = None - filetype = "json" - data = [] - - while not all_records: - try: - request_params["$top"] = top - request_params["$skip"] = skip - - log(f"Request url {request_url}") - - response = requests.get( - request_url, - params=request_params, - timeout=constants.MAX_TIMEOUT_SECONDS.value, - ) - response.raise_for_status() - - paginated_data = response.json() - - if isinstance(paginated_data, dict): - paginated_data = [paginated_data] - - if len(paginated_data) == top: - skip += top - time.sleep(36) - else: - if len(paginated_data) == 0: - log("Nenhum dado para tratar.") - break - all_records = True - data += paginated_data - - log(f"Dados (paginados): {len(data)}") - - except Exception as error: - error = traceback.format_exc() - log(f"[CATCHED] Task failed with error: \n{error}", level="error") - data = [] - break - - log(f"Request concluído, tamanho dos dados: {len(data)}.") - - return error, data, filetype - - -def build_table_id(mode: str, report_type: str): - """Build table_id based on which table is the target - of current flow run - - Args: - mode (str): SPPO or STPL - report_type (str): RHO or RDO - - Returns: - str: table_id - """ - if mode == "SPPO": - if report_type == "RDO": - table_id = constants.SPPO_RDO_TABLE_ID.value - else: - table_id = constants.SPPO_RHO_TABLE_ID.value - if mode == "STPL": - # slice the string to get rid of V at end of - # STPL reports filenames - if report_type[:3] == "RDO": - table_id = constants.STPL_RDO_TABLE_ID.value - else: - table_id = constants.STPL_RHO_TABLE_ID.value - return table_id - - -def generate_ftp_schedules(interval_minutes: int, label: str = constants.RJ_SMTR_AGENT_LABEL.value): - """Generates IntervalClocks with the parameters needed to capture - each report. - - Args: - interval_minutes (int): interval which this flow will be run. - label (str, optional): Prefect label, defines which agent to use when launching flow run. - Defaults to constants.RJ_SMTR_AGENT_LABEL.value. 
- - Returns: - List(IntervalClock): containing the clocks for scheduling runs - """ - modes = ["SPPO", "STPL"] - reports = ["RDO", "RHO"] - clocks = [] - for mode in modes: - for report in reports: - clocks.append( - IntervalClock( - interval=timedelta(minutes=interval_minutes), - start_date=datetime( - 2022, 12, 16, 5, 0, tzinfo=timezone(constants.TIMEZONE.value) - ), - parameter_defaults={ - "transport_mode": mode, - "report_type": report, - "table_id": build_table_id(mode=mode, report_type=report), - }, - labels=[label], - ) - ) - return clocks + query = f""" + SELECT + CONCAT("'", {concat_arg.join(columns_to_concat_bq)}, "'") + FROM + `{project}.{dataset_id}.{table_id}` + """ + log(query) + last_values = bd.read_sql(query=query, billing_project_id=project) + + last_values = last_values.iloc[:, 0].to_list() + last_values = ", ".join(last_values) + update_condition = f"""CONCAT( + {concat_arg.join(columns_to_search)} + ) NOT IN ({last_values}) + """ + + except GenericGBQException as err: + if "404 Not found" in str(err): + log("table not found, setting updates to 1=1") + update_condition = "1=1" + + return update_condition diff --git a/poetry.lock b/poetry.lock index 613c0f27e..7f0c63736 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "agate" @@ -1059,6 +1059,77 @@ files = [ {file = "graphql_core-3.2.3-py3-none-any.whl", hash = "sha256:5766780452bd5ec8ba133f8bf287dc92713e3868ddd83aee4faab9fc3e303dc3"}, ] +[[package]] +name = "greenlet" +version = "3.0.3" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=3.7" +files = [ + {file = "greenlet-3.0.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9da2bd29ed9e4f15955dd1595ad7bc9320308a3b766ef7f837e23ad4b4aac31a"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d353cadd6083fdb056bb46ed07e4340b0869c305c8ca54ef9da3421acbdf6881"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dca1e2f3ca00b84a396bc1bce13dd21f680f035314d2379c4160c98153b2059b"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ed7fb269f15dc662787f4119ec300ad0702fa1b19d2135a37c2c4de6fadfd4a"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd4f49ae60e10adbc94b45c0b5e6a179acc1736cf7a90160b404076ee283cf83"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:73a411ef564e0e097dbe7e866bb2dda0f027e072b04da387282b02c308807405"}, + {file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7f362975f2d179f9e26928c5b517524e89dd48530a0202570d55ad6ca5d8a56f"}, + {file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:649dde7de1a5eceb258f9cb00bdf50e978c9db1b996964cd80703614c86495eb"}, + {file = "greenlet-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:68834da854554926fbedd38c76e60c4a2e3198c6fbed520b106a8986445caaf9"}, + {file = "greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:b1b5667cced97081bf57b8fa1d6bfca67814b0afd38208d52538316e9422fc61"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:52f59dd9c96ad2fc0d5724107444f76eb20aaccb675bf825df6435acb7703559"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:afaff6cf5200befd5cec055b07d1c0a5a06c040fe5ad148abcd11ba6ab9b114e"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe754d231288e1e64323cfad462fcee8f0288654c10bdf4f603a39ed923bef33"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2797aa5aedac23af156bbb5a6aa2cd3427ada2972c828244eb7d1b9255846379"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7f009caad047246ed379e1c4dbcb8b020f0a390667ea74d2387be2998f58a22"}, + {file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c5e1536de2aad7bf62e27baf79225d0d64360d4168cf2e6becb91baf1ed074f3"}, + {file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:894393ce10ceac937e56ec00bb71c4c2f8209ad516e96033e4b3b1de270e200d"}, + {file = "greenlet-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:1ea188d4f49089fc6fb283845ab18a2518d279c7cd9da1065d7a84e991748728"}, + {file = "greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:70fb482fdf2c707765ab5f0b6655e9cfcf3780d8d87355a063547b41177599be"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4d1ac74f5c0c0524e4a24335350edad7e5f03b9532da7ea4d3c54d527784f2e"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149e94a2dd82d19838fe4b2259f1b6b9957d5ba1b25640d2380bea9c5df37676"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15d79dd26056573940fcb8c7413d84118086f2ec1a8acdfa854631084393efcc"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b7db1ebff4ba09aaaeae6aa491daeb226c8150fc20e836ad00041bcb11230"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fcd2469d6a2cf298f198f0487e0a5b1a47a42ca0fa4dfd1b6862c999f018ebbf"}, + {file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f672519db1796ca0d8753f9e78ec02355e862d0998193038c7073045899f305"}, + {file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2516a9957eed41dd8f1ec0c604f1cdc86758b587d964668b5b196a9db5bfcde6"}, + {file = "greenlet-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:bba5387a6975598857d86de9eac14210a49d554a77eb8261cc68b7d082f78ce2"}, + {file = "greenlet-3.0.3-cp37-cp37m-macosx_11_0_universal2.whl", hash = "sha256:5b51e85cb5ceda94e79d019ed36b35386e8c37d22f07d6a751cb659b180d5274"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:daf3cb43b7cf2ba96d614252ce1684c1bccee6b2183a01328c98d36fcd7d5cb0"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99bf650dc5d69546e076f413a87481ee1d2d09aaaaaca058c9251b6d8c14783f"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dd6e660effd852586b6a8478a1d244b8dc90ab5b1321751d2ea15deb49ed414"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3391d1e16e2a5a1507d83e4a8b100f4ee626e8eca43cf2cadb543de69827c4c"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:e1f145462f1fa6e4a4ae3c0f782e580ce44d57c8f2c7aae1b6fa88c0b2efdb41"}, + {file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1a7191e42732df52cb5f39d3527217e7ab73cae2cb3694d241e18f53d84ea9a7"}, + {file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0448abc479fab28b00cb472d278828b3ccca164531daab4e970a0458786055d6"}, + {file = "greenlet-3.0.3-cp37-cp37m-win32.whl", hash = "sha256:b542be2440edc2d48547b5923c408cbe0fc94afb9f18741faa6ae970dbcb9b6d"}, + {file = "greenlet-3.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:01bc7ea167cf943b4c802068e178bbf70ae2e8c080467070d01bfa02f337ee67"}, + {file = "greenlet-3.0.3-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1996cb9306c8595335bb157d133daf5cf9f693ef413e7673cb07e3e5871379ca"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc0f794e6ad661e321caa8d2f0a55ce01213c74722587256fb6566049a8b04"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9db1c18f0eaad2f804728c67d6c610778456e3e1cc4ab4bbd5eeb8e6053c6fc"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7170375bcc99f1a2fbd9c306f5be8764eaf3ac6b5cb968862cad4c7057756506"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b66c9c1e7ccabad3a7d037b2bcb740122a7b17a53734b7d72a344ce39882a1b"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:098d86f528c855ead3479afe84b49242e174ed262456c342d70fc7f972bc13c4"}, + {file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:81bb9c6d52e8321f09c3d165b2a78c680506d9af285bfccbad9fb7ad5a5da3e5"}, + {file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fd096eb7ffef17c456cfa587523c5f92321ae02427ff955bebe9e3c63bc9f0da"}, + {file = "greenlet-3.0.3-cp38-cp38-win32.whl", hash = "sha256:d46677c85c5ba00a9cb6f7a00b2bfa6f812192d2c9f7d9c4f6a55b60216712f3"}, + {file = "greenlet-3.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:419b386f84949bf0e7c73e6032e3457b82a787c1ab4a0e43732898a761cc9dbf"}, + {file = "greenlet-3.0.3-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:da70d4d51c8b306bb7a031d5cff6cc25ad253affe89b70352af5f1cb68e74b53"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086152f8fbc5955df88382e8a75984e2bb1c892ad2e3c80a2508954e52295257"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d73a9fe764d77f87f8ec26a0c85144d6a951a6c438dfe50487df5595c6373eac"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7dcbe92cc99f08c8dd11f930de4d99ef756c3591a5377d1d9cd7dd5e896da71"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1551a8195c0d4a68fac7a4325efac0d541b48def35feb49d803674ac32582f61"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64d7675ad83578e3fc149b617a444fab8efdafc9385471f868eb5ff83e446b8b"}, + {file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b37eef18ea55f2ffd8f00ff8fe7c8d3818abd3e25fb73fae2ca3b672e333a7a6"}, + {file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:77457465d89b8263bca14759d7c1684df840b6811b2499838cc5b040a8b5b113"}, + {file = "greenlet-3.0.3-cp39-cp39-win32.whl", hash = 
"sha256:57e8974f23e47dac22b83436bdcf23080ade568ce77df33159e019d161ce1d1e"}, + {file = "greenlet-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c5ee858cfe08f34712f548c3c363e807e7186f03ad7a5039ebadb29e8c6be067"}, + {file = "greenlet-3.0.3.tar.gz", hash = "sha256:43374442353259554ce33599da8b692d5aa96f8976d567d4badf263371fbe491"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil"] + [[package]] name = "grpc-google-iam-v1" version = "0.13.0" @@ -1433,16 +1504,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2249,7 +2310,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2258,8 +2318,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -3051,6 +3109,93 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "sqlalchemy" +version = "2.0.25" +description = "Database Abstraction Library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "SQLAlchemy-2.0.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4344d059265cc8b1b1be351bfb88749294b87a8b2bbe21dfbe066c4199541ebd"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6f9e2e59cbcc6ba1488404aad43de005d05ca56e069477b33ff74e91b6319735"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84daa0a2055df9ca0f148a64fdde12ac635e30edbca80e87df9b3aaf419e144a"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc8b7dabe8e67c4832891a5d322cec6d44ef02f432b4588390017f5cec186a84"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f5693145220517b5f42393e07a6898acdfe820e136c98663b971906120549da5"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db854730a25db7c956423bb9fb4bdd1216c839a689bf9cc15fada0a7fb2f4570"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-win32.whl", hash = "sha256:14a6f68e8fc96e5e8f5647ef6cda6250c780612a573d99e4d881581432ef1669"}, + {file = "SQLAlchemy-2.0.25-cp310-cp310-win_amd64.whl", hash = "sha256:87f6e732bccd7dcf1741c00f1ecf33797383128bd1c90144ac8adc02cbb98643"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:342d365988ba88ada8af320d43df4e0b13a694dbd75951f537b2d5e4cb5cd002"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f37c0caf14b9e9b9e8f6dbc81bc56db06acb4363eba5a633167781a48ef036ed"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa9373708763ef46782d10e950b49d0235bfe58facebd76917d3f5cbf5971aed"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d24f571990c05f6b36a396218f251f3e0dda916e0c687ef6fdca5072743208f5"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75432b5b14dc2fff43c50435e248b45c7cdadef73388e5610852b95280ffd0e9"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:884272dcd3ad97f47702965a0e902b540541890f468d24bd1d98bcfe41c3f018"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-win32.whl", hash = "sha256:e607cdd99cbf9bb80391f54446b86e16eea6ad309361942bf88318bcd452363c"}, + {file = "SQLAlchemy-2.0.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d505815ac340568fd03f719446a589162d55c52f08abd77ba8964fbb7eb5b5f"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0dacf67aee53b16f365c589ce72e766efaabd2b145f9de7c917777b575e3659d"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b801154027107461ee992ff4b5c09aa7cc6ec91ddfe50d02bca344918c3265c6"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59a21853f5daeb50412d459cfb13cb82c089ad4c04ec208cd14dddd99fc23b39"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29049e2c299b5ace92cbed0c1610a7a236f3baf4c6b66eb9547c01179f638ec5"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b64b183d610b424a160b0d4d880995e935208fc043d0302dd29fee32d1ee3f95"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4f7a7d7fcc675d3d85fbf3b3828ecd5990b8d61bd6de3f1b260080b3beccf215"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-win32.whl", hash = "sha256:cf18ff7fc9941b8fc23437cc3e68ed4ebeff3599eec6ef5eebf305f3d2e9a7c2"}, + {file = "SQLAlchemy-2.0.25-cp312-cp312-win_amd64.whl", hash = "sha256:91f7d9d1c4dd1f4f6e092874c128c11165eafcf7c963128f79e28f8445de82d5"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:bb209a73b8307f8fe4fe46f6ad5979649be01607f11af1eb94aa9e8a3aaf77f0"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:798f717ae7c806d67145f6ae94dc7c342d3222d3b9a311a784f371a4333212c7"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fdd402169aa00df3142149940b3bf9ce7dde075928c1886d9a1df63d4b8de62"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0d3cab3076af2e4aa5693f89622bef7fa770c6fec967143e4da7508b3dceb9b9"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:74b080c897563f81062b74e44f5a72fa44c2b373741a9ade701d5f789a10ba23"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-win32.whl", hash = "sha256:87d91043ea0dc65ee583026cb18e1b458d8ec5fc0a93637126b5fc0bc3ea68c4"}, + {file = "SQLAlchemy-2.0.25-cp37-cp37m-win_amd64.whl", hash = "sha256:75f99202324383d613ddd1f7455ac908dca9c2dd729ec8584c9541dd41822a2c"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:420362338681eec03f53467804541a854617faed7272fe71a1bfdb07336a381e"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c88f0c7dcc5f99bdb34b4fd9b69b93c89f893f454f40219fe923a3a2fd11625"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3be4987e3ee9d9a380b66393b77a4cd6d742480c951a1c56a23c335caca4ce3"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a159111a0f58fb034c93eeba211b4141137ec4b0a6e75789ab7a3ef3c7e7e3"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8b8cb63d3ea63b29074dcd29da4dc6a97ad1349151f2d2949495418fd6e48db9"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:736ea78cd06de6c21ecba7416499e7236a22374561493b456a1f7ffbe3f6cdb4"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-win32.whl", hash = "sha256:10331f129982a19df4284ceac6fe87353ca3ca6b4ca77ff7d697209ae0a5915e"}, + {file = "SQLAlchemy-2.0.25-cp38-cp38-win_amd64.whl", hash = "sha256:c55731c116806836a5d678a70c84cb13f2cedba920212ba7dcad53260997666d"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:605b6b059f4b57b277f75ace81cc5bc6335efcbcc4ccb9066695e515dbdb3900"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:665f0a3954635b5b777a55111ababf44b4fc12b1f3ba0a435b602b6387ffd7cf"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecf6d4cda1f9f6cb0b45803a01ea7f034e2f1aed9475e883410812d9f9e3cfcf"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c51db269513917394faec5e5c00d6f83829742ba62e2ac4fa5c98d58be91662f"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:790f533fa5c8901a62b6fef5811d48980adeb2f51f1290ade8b5e7ba990ba3de"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1b1180cda6df7af84fe72e4530f192231b1f29a7496951db4ff38dac1687202d"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-win32.whl", hash = "sha256:555651adbb503ac7f4cb35834c5e4ae0819aab2cd24857a123370764dc7d7e24"}, + {file = "SQLAlchemy-2.0.25-cp39-cp39-win_amd64.whl", hash = "sha256:dc55990143cbd853a5d038c05e79284baedf3e299661389654551bd02a6a68d7"}, + {file = "SQLAlchemy-2.0.25-py3-none-any.whl", hash = "sha256:a86b4240e67d4753dc3092d9511886795b3c2852abe599cffe108952f7af7ac3"}, + {file = "SQLAlchemy-2.0.25.tar.gz", hash = "sha256:a2c69a7664fb2d54b8682dd774c3b54f67f84fa123cf84dda2a5f40dcaa04e08"}, +] + +[package.dependencies] +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} +typing-extensions = ">=4.6.0" + +[package.extras] +aiomysql = ["aiomysql (>=0.2.0)", "greenlet (!=0.4.17)"] +aioodbc = ["aioodbc", "greenlet (!=0.4.17)"] +aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing_extensions (!=3.10.0.1)"] +asyncio = ["greenlet (!=0.4.17)"] +asyncmy = ["asyncmy (>=0.2.3,!=0.2.4,!=0.2.6)", "greenlet (!=0.4.17)"] +mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2,!=1.1.5)"] +mssql = ["pyodbc"] +mssql-pymssql = ["pymssql"] +mssql-pyodbc = ["pyodbc"] +mypy = ["mypy (>=0.910)"] +mysql = ["mysqlclient (>=1.4.0)"] +mysql-connector = ["mysql-connector-python"] +oracle = ["cx_oracle (>=8)"] +oracle-oracledb = 
["oracledb (>=1.0.1)"] +postgresql = ["psycopg2 (>=2.7)"] +postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] +postgresql-pg8000 = ["pg8000 (>=1.29.1)"] +postgresql-psycopg = ["psycopg (>=3.0.7)"] +postgresql-psycopg2binary = ["psycopg2-binary"] +postgresql-psycopg2cffi = ["psycopg2cffi"] +postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] +pymysql = ["pymysql"] +sqlcipher = ["sqlcipher3_binary"] + [[package]] name = "sqlparse" version = "0.4.4" @@ -3440,4 +3585,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "85fe9e6473ac4080266366c7979715797c706a03ca5a4beef307ffa3ba9aac85" +content-hash = "54a8e7450ae4d994a18d35b69b9b9da2fa10b6d881acd774cc4118cd5890b414" diff --git a/pyproject.toml b/pyproject.toml index 56699abbc..72bca4757 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -128,6 +128,7 @@ zipp = "3.17.0" pymysql = "^1.1.0" psycopg2-binary = "^2.9.9" redis-pal = "^1.0.0" +sqlalchemy = "^2.0.25" [tool.poetry.group.dev]