Skip to content

Commit 9c02703

Browse files
authored
Merge branch 'master' into pre-commit-ci-update-config
2 parents 0eefa98 + 83756bf commit 9c02703

File tree

5 files changed

+254
-24
lines changed

5 files changed

+254
-24
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Changelog - br_rj_riodejaneiro_onibus_gps
2+
3+
## [1.0.1] - 2024-04-26
4+
5+
### Adicionado
6+
7+
- Cria task `clean_br_rj_riodejaneiro_onibus_gps` (https://github.com/prefeitura-rio/pipelines/pull/673)
8+
9+
### Alterado
10+
11+
- Otimiza e inclui parâmetros de rematerialização no flow `materialize_sppo` (https://github.com/prefeitura-rio/pipelines/pull/673)
12+
13+
## [1.0.0] - 2024-04-26
14+
15+
### Adicionado
16+
17+
- Adiciona flow `recaptura_realocacao_sppo` (https://github.com/prefeitura-rio/pipelines/pull/668)
18+
19+
### Alterado
20+
21+
- Altera flow `recaptura`, incluindo acionamento do `recaptura_realocacao_sppo` (https://github.com/prefeitura-rio/pipelines/pull/668)
22+
23+
### Corrigido
24+
25+
- Corrigido parâmetro `timestamp` do flow `realocacao_sppo` (https://github.com/prefeitura-rio/pipelines/pull/668)

pipelines/rj_smtr/br_rj_riodejaneiro_onibus_gps/flows.py

Lines changed: 150 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
Flows for br_rj_riodejaneiro_onibus_gps
44
"""
55

6-
from prefect import Parameter, case
6+
from prefect import Parameter, case, task
77
from prefect.run_configs import KubernetesRun
88
from prefect.storage import GCS
99
from prefect.tasks.prefect import create_flow_run, wait_for_flow_run
1010
from prefect.utilities.edges import unmapped
11+
from prefect.tasks.control_flow import merge
1112

1213

1314
# EMD Imports #
@@ -34,6 +35,7 @@
3435
get_materialization_date_range,
3536
# get_local_dbt_client,
3637
get_raw,
38+
get_rounded_timestamp,
3739
parse_timestamp_to_string,
3840
query_logs,
3941
save_raw_local,
@@ -47,6 +49,7 @@
4749
create_api_url_onibus_gps,
4850
create_api_url_onibus_realocacao,
4951
pre_treatment_br_rj_riodejaneiro_onibus_realocacao,
52+
clean_br_rj_riodejaneiro_onibus_gps,
5053
)
5154

5255
from pipelines.rj_smtr.schedules import (
@@ -79,7 +82,7 @@
7982
rebuild = Parameter("rebuild", False)
8083

8184
# SETUP
82-
timestamp = get_current_timestamp()
85+
timestamp = get_rounded_timestamp(interval_minutes=10)
8386

8487
rename_flow_run = rename_current_flow_run_now_time(
8588
prefix=realocacao_sppo.name + ": ", now_time=timestamp
@@ -154,6 +157,9 @@
154157
dataset_id = Parameter("dataset_id", default=constants.GPS_SPPO_DATASET_ID.value)
155158
table_id = Parameter("table_id", default=constants.GPS_SPPO_TREATED_TABLE_ID.value)
156159
rebuild = Parameter("rebuild", False)
160+
rematerialization = Parameter("rematerialization", default=False)
161+
date_range_start = Parameter("date_range_start", default=None)
162+
date_range_end = Parameter("date_range_end", default=None)
157163

158164
LABELS = get_current_flow_labels()
159165
MODE = get_current_flow_mode(LABELS)
@@ -164,22 +170,35 @@
164170
# dbt_client = get_local_dbt_client(host="localhost", port=3001)
165171

166172
# Set specific run parameters #
167-
date_range = get_materialization_date_range(
168-
dataset_id=dataset_id,
169-
table_id=table_id,
170-
raw_dataset_id=raw_dataset_id,
171-
raw_table_id=raw_table_id,
172-
table_run_datetime_column_name="timestamp_gps",
173-
mode=MODE,
174-
delay_hours=constants.GPS_SPPO_MATERIALIZE_DELAY_HOURS.value,
173+
with case(rematerialization, False):
174+
rematerialization_dates_false = date_range = get_materialization_date_range(
175+
dataset_id=dataset_id,
176+
table_id=table_id,
177+
raw_dataset_id=raw_dataset_id,
178+
raw_table_id=raw_table_id,
179+
table_run_datetime_column_name="timestamp_gps",
180+
mode=MODE,
181+
delay_hours=constants.GPS_SPPO_MATERIALIZE_DELAY_HOURS.value,
182+
)
183+
with case(rematerialization, True):
184+
date_range = {
185+
"date_range_start": date_range_start,
186+
"date_range_end": date_range_end,
187+
}
188+
rematerialization_dates_true = clean_br_rj_riodejaneiro_onibus_gps(date_range)
189+
190+
rematerialization_dates = merge(
191+
rematerialization_dates_true, rematerialization_dates_false
175192
)
193+
176194
dataset_sha = fetch_dataset_sha(
177195
dataset_id=dataset_id,
196+
upstream_tasks=[rematerialization_dates],
178197
)
179198

180199
# Run materialization #
181200
with case(rebuild, True):
182-
RUN = run_dbt_model(
201+
RUN_TRUE = run_dbt_model(
183202
dbt_client=dbt_client,
184203
dataset_id=dataset_id,
185204
table_id=table_id,
@@ -188,30 +207,39 @@
188207
_vars=[date_range, dataset_sha],
189208
flags="--full-refresh",
190209
)
191-
set_last_run_timestamp(
192-
dataset_id=dataset_id,
193-
table_id=table_id,
194-
timestamp=date_range["date_range_end"],
195-
wait=RUN,
196-
mode=MODE,
197-
)
210+
198211
with case(rebuild, False):
199-
RUN = run_dbt_model(
212+
RUN_FALSE = run_dbt_model(
200213
dbt_client=dbt_client,
201214
dataset_id=dataset_id,
202215
table_id=table_id,
203216
exclude="+data_versao_efetiva",
204217
_vars=[date_range, dataset_sha],
205218
upstream=True,
206219
)
207-
set_last_run_timestamp(
220+
221+
RUN = merge(RUN_TRUE, RUN_FALSE)
222+
223+
with case(rematerialization, False):
224+
SET_FALSE = set_last_run_timestamp(
208225
dataset_id=dataset_id,
209226
table_id=table_id,
210227
timestamp=date_range["date_range_end"],
211228
wait=RUN,
212229
mode=MODE,
213230
)
214231

232+
with case(rematerialization, True):
233+
SET_TRUE = task(
234+
lambda: [None],
235+
checkpoint=False,
236+
name="assign_none_to_previous_runs",
237+
)()
238+
239+
SET = merge(SET_TRUE, SET_FALSE)
240+
241+
materialize_sppo.set_reference_tasks([RUN, rematerialization_dates, SET])
242+
215243
materialize_sppo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value)
216244
materialize_sppo.run_config = KubernetesRun(
217245
image=emd_constants.DOCKER_IMAGE.value,
@@ -282,20 +310,120 @@
282310
)
283311
captura_sppo_v2.schedule = every_minute
284312

313+
with Flow(
314+
"SMTR: GPS SPPO Realocação - Recaptura (subflow)",
315+
code_owners=["caio", "fernanda", "boris", "rodrigo"],
316+
) as recaptura_realocacao_sppo:
317+
timestamp = Parameter("timestamp", default=None)
318+
recapture_window_days = Parameter("recapture_window_days", default=1)
319+
320+
# SETUP #
321+
LABELS = get_current_flow_labels()
322+
323+
# Consulta de logs para verificar erros
324+
errors, timestamps, previous_errors = query_logs(
325+
dataset_id=constants.GPS_SPPO_RAW_DATASET_ID.value,
326+
table_id=constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value,
327+
datetime_filter=get_rounded_timestamp(timestamp=timestamp, interval_minutes=10),
328+
interval_minutes=10,
329+
recapture_window_days=recapture_window_days,
330+
)
331+
332+
rename_flow_run = rename_current_flow_run_now_time(
333+
prefix=recaptura_realocacao_sppo.name + ": ",
334+
now_time=get_now_time(),
335+
wait=timestamps,
336+
)
337+
338+
# Em caso de erros, executa a recaptura
339+
with case(errors, True):
340+
# SETUP #
341+
partitions = create_date_hour_partition.map(timestamps)
342+
filename = parse_timestamp_to_string.map(timestamps)
343+
344+
filepath = create_local_partition_path.map(
345+
dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
346+
table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
347+
filename=filename,
348+
partitions=partitions,
349+
)
350+
351+
url = create_api_url_onibus_realocacao.map(timestamp=timestamps)
352+
353+
# EXTRACT #
354+
raw_status = get_raw.map(url)
355+
356+
raw_filepath = save_raw_local.map(status=raw_status, file_path=filepath)
357+
358+
# CLEAN #
359+
treated_status = pre_treatment_br_rj_riodejaneiro_onibus_realocacao.map(
360+
status=raw_status, timestamp=timestamps
361+
)
362+
363+
treated_filepath = save_treated_local.map(
364+
status=treated_status, file_path=filepath
365+
)
366+
367+
# LOAD #
368+
error = bq_upload.map(
369+
dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
370+
table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
371+
filepath=treated_filepath,
372+
raw_filepath=raw_filepath,
373+
partitions=partitions,
374+
status=treated_status,
375+
)
376+
377+
upload_logs_to_bq.map(
378+
dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
379+
parent_table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
380+
error=error,
381+
previous_error=previous_errors,
382+
timestamp=timestamps,
383+
recapture=unmapped(True),
384+
)
385+
386+
recaptura_realocacao_sppo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value)
387+
recaptura_realocacao_sppo.run_config = KubernetesRun(
388+
image=emd_constants.DOCKER_IMAGE.value,
389+
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
390+
)
285391

286392
with Flow(
287393
"SMTR: GPS SPPO - Tratamento", code_owners=["caio", "fernanda", "boris", "rodrigo"]
288394
) as recaptura:
289395
version = Parameter("version", default=2)
290-
datetime_filter = Parameter("datetime_filter", default=None)
396+
datetime_filter_gps = Parameter("datetime_filter_gps", default=None)
291397
materialize = Parameter("materialize", default=True)
292398
# SETUP #
293399
LABELS = get_current_flow_labels()
294400

401+
rounded_timestamp = get_rounded_timestamp(interval_minutes=60)
402+
rounded_timestamp_str = parse_timestamp_to_string(
403+
timestamp=rounded_timestamp, pattern="%Y-%m-%d %H:%M:%S"
404+
)
405+
406+
# roda o subflow de recaptura da realocação
407+
run_recaptura_realocacao_sppo = create_flow_run(
408+
flow_name=recaptura_realocacao_sppo.name,
409+
project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value,
410+
labels=LABELS,
411+
run_name=recaptura_realocacao_sppo.name,
412+
parameters={"timestamp": rounded_timestamp_str},
413+
)
414+
415+
wait_recaptura_realocacao_sppo = wait_for_flow_run(
416+
run_recaptura_realocacao_sppo,
417+
stream_states=True,
418+
stream_logs=True,
419+
raise_final_state=True,
420+
)
421+
295422
errors, timestamps, previous_errors = query_logs(
296423
dataset_id=constants.GPS_SPPO_RAW_DATASET_ID.value,
297424
table_id=constants.GPS_SPPO_RAW_TABLE_ID.value,
298-
datetime_filter=datetime_filter,
425+
datetime_filter=datetime_filter_gps,
426+
upstream_tasks=[wait_recaptura_realocacao_sppo],
299427
)
300428

301429
rename_flow_run = rename_current_flow_run_now_time(

pipelines/rj_smtr/br_rj_riodejaneiro_onibus_gps/tasks.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import pandas as pd
1010
from prefect import task
1111
import pendulum
12+
import basedosdados as bd
13+
from typing import Union
1214

1315
# EMD Imports #
1416

@@ -254,3 +256,71 @@ def pre_treatment_br_rj_riodejaneiro_onibus_gps(
254256
log(f"[CATCHED] Task failed with error: \n{error}", level="error")
255257

256258
return {"data": df_gps, "error": error}
259+
260+
261+
@task
262+
def clean_br_rj_riodejaneiro_onibus_gps(date_range: dict) -> Union[str, None]:
263+
"""
264+
Clean GPS data for a given date range.
265+
266+
This function deletes records from three different tables in the database:
267+
- `rj-smtr.br_rj_riodejaneiro_onibus_gps.sppo_aux_registros_filtrada`
268+
- `rj-smtr.br_rj_riodejaneiro_onibus_gps.sppo_aux_registros_realocacao`
269+
- `rj-smtr.br_rj_riodejaneiro_veiculos.gps_sppo`
270+
271+
The records to be deleted are determined by the provided
272+
date range and the timestamp_gps column.
273+
274+
Parameters:
275+
- date_range (dict): A dictionary containing the start
276+
and end dates for the data to be cleaned.
277+
278+
Returns:
279+
- str or None: If an error occurs during the cleaning process,
280+
the error message is returned. Otherwise, None is returned.
281+
282+
"""
283+
error = None
284+
285+
try:
286+
q = f"""
287+
DELETE
288+
FROM
289+
`rj-smtr.br_rj_riodejaneiro_onibus_gps.sppo_aux_registros_filtrada`
290+
WHERE
291+
(data BETWEEN DATE("{date_range['date_range_start']}")
292+
AND DATE("{date_range['date_range_end']}"))
293+
AND (timestamp_gps > "{date_range['date_range_start']}"
294+
AND timestamp_gps <= "{date_range['date_range_end']}");
295+
DELETE
296+
FROM
297+
`rj-smtr.br_rj_riodejaneiro_onibus_gps.sppo_aux_registros_realocacao`
298+
WHERE
299+
(data BETWEEN DATE("{date_range['date_range_start']}")
300+
AND DATE("{date_range['date_range_end']}"))
301+
AND (timestamp_gps > "{date_range['date_range_start']}"
302+
AND timestamp_gps <= "{date_range['date_range_end']}");
303+
DELETE
304+
FROM
305+
`rj-smtr.br_rj_riodejaneiro_veiculos.gps_sppo`
306+
WHERE
307+
(data BETWEEN DATE("{date_range['date_range_start']}")
308+
AND DATE("{date_range['date_range_end']}"))
309+
AND (timestamp_gps > "{date_range['date_range_start']}"
310+
AND timestamp_gps <= "{date_range['date_range_end']}");
311+
"""
312+
log(q)
313+
314+
results = bd.read_sql(q)
315+
316+
log(
317+
f"""Cleaned GPS data for
318+
{date_range['date_range_start']} to {date_range['date_range_end']}\n
319+
Resulting:\n
320+
{results}"""
321+
)
322+
except Exception: # pylint: disable = W0703
323+
error = traceback.format_exc()
324+
log(f"[CATCHED] Task failed with error: \n{error}", level="error")
325+
326+
return error
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Changelog - veiculo
2+
3+
## [1.0.0] - 2024-04-25
4+
5+
### Alterado
6+
7+
- Desliga schedule dos flows `sppo_infracao_captura` e `sppo_licenciamento_captura` em razão de indisponibilidade e geração de dados imprecisos na fonte (SIURB) (https://github.com/prefeitura-rio/pipelines/pull/672)

pipelines/rj_smtr/veiculo/flows.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@
140140
image=emd_constants.DOCKER_IMAGE.value,
141141
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
142142
)
143-
sppo_licenciamento_captura.schedule = every_day_hour_seven
143+
# sppo_licenciamento_captura.schedule = every_day_hour_seven
144144

145145
with Flow(
146146
f"SMTR: {constants.VEICULO_DATASET_ID.value} {constants.SPPO_INFRACAO_TABLE_ID.value} - Captura",
@@ -218,7 +218,7 @@
218218
image=emd_constants.DOCKER_IMAGE.value,
219219
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
220220
)
221-
sppo_infracao_captura.schedule = every_day_hour_seven
221+
# sppo_infracao_captura.schedule = every_day_hour_seven
222222

223223
# flake8: noqa: E501
224224
with Flow(

0 commit comments

Comments
 (0)