|
3 | 3 | Flows for br_rj_riodejaneiro_onibus_gps |
4 | 4 | """ |
5 | 5 |
|
6 | | -from prefect import Parameter, case |
| 6 | +from prefect import Parameter, case, task |
7 | 7 | from prefect.run_configs import KubernetesRun |
8 | 8 | from prefect.storage import GCS |
9 | 9 | from prefect.tasks.prefect import create_flow_run, wait_for_flow_run |
10 | 10 | from prefect.utilities.edges import unmapped |
| 11 | +from prefect.tasks.control_flow import merge |
11 | 12 |
|
12 | 13 |
|
13 | 14 | # EMD Imports # |
|
34 | 35 | get_materialization_date_range, |
35 | 36 | # get_local_dbt_client, |
36 | 37 | get_raw, |
| 38 | + get_rounded_timestamp, |
37 | 39 | parse_timestamp_to_string, |
38 | 40 | query_logs, |
39 | 41 | save_raw_local, |
|
47 | 49 | create_api_url_onibus_gps, |
48 | 50 | create_api_url_onibus_realocacao, |
49 | 51 | pre_treatment_br_rj_riodejaneiro_onibus_realocacao, |
| 52 | + clean_br_rj_riodejaneiro_onibus_gps, |
50 | 53 | ) |
51 | 54 |
|
52 | 55 | from pipelines.rj_smtr.schedules import ( |
|
79 | 82 | rebuild = Parameter("rebuild", False) |
80 | 83 |
|
81 | 84 | # SETUP |
82 | | - timestamp = get_current_timestamp() |
| 85 | + timestamp = get_rounded_timestamp(interval_minutes=10) |
83 | 86 |
|
84 | 87 | rename_flow_run = rename_current_flow_run_now_time( |
85 | 88 | prefix=realocacao_sppo.name + ": ", now_time=timestamp |
|
154 | 157 | dataset_id = Parameter("dataset_id", default=constants.GPS_SPPO_DATASET_ID.value) |
155 | 158 | table_id = Parameter("table_id", default=constants.GPS_SPPO_TREATED_TABLE_ID.value) |
156 | 159 | rebuild = Parameter("rebuild", False) |
| 160 | + rematerialization = Parameter("rematerialization", default=False) |
| 161 | + date_range_start = Parameter("date_range_start", default=None) |
| 162 | + date_range_end = Parameter("date_range_end", default=None) |
157 | 163 |
|
158 | 164 | LABELS = get_current_flow_labels() |
159 | 165 | MODE = get_current_flow_mode(LABELS) |
|
164 | 170 | # dbt_client = get_local_dbt_client(host="localhost", port=3001) |
165 | 171 |
|
166 | 172 | # Set specific run parameters # |
167 | | - date_range = get_materialization_date_range( |
168 | | - dataset_id=dataset_id, |
169 | | - table_id=table_id, |
170 | | - raw_dataset_id=raw_dataset_id, |
171 | | - raw_table_id=raw_table_id, |
172 | | - table_run_datetime_column_name="timestamp_gps", |
173 | | - mode=MODE, |
174 | | - delay_hours=constants.GPS_SPPO_MATERIALIZE_DELAY_HOURS.value, |
| 173 | + with case(rematerialization, False): |
| 174 | + rematerialization_dates_false = date_range = get_materialization_date_range( |
| 175 | + dataset_id=dataset_id, |
| 176 | + table_id=table_id, |
| 177 | + raw_dataset_id=raw_dataset_id, |
| 178 | + raw_table_id=raw_table_id, |
| 179 | + table_run_datetime_column_name="timestamp_gps", |
| 180 | + mode=MODE, |
| 181 | + delay_hours=constants.GPS_SPPO_MATERIALIZE_DELAY_HOURS.value, |
| 182 | + ) |
| 183 | + with case(rematerialization, True): |
| 184 | + date_range = { |
| 185 | + "date_range_start": date_range_start, |
| 186 | + "date_range_end": date_range_end, |
| 187 | + } |
| 188 | + rematerialization_dates_true = clean_br_rj_riodejaneiro_onibus_gps(date_range) |
| 189 | + |
| 190 | + rematerialization_dates = merge( |
| 191 | + rematerialization_dates_true, rematerialization_dates_false |
175 | 192 | ) |
| 193 | + |
176 | 194 | dataset_sha = fetch_dataset_sha( |
177 | 195 | dataset_id=dataset_id, |
| 196 | + upstream_tasks=[rematerialization_dates], |
178 | 197 | ) |
179 | 198 |
|
180 | 199 | # Run materialization # |
181 | 200 | with case(rebuild, True): |
182 | | - RUN = run_dbt_model( |
| 201 | + RUN_TRUE = run_dbt_model( |
183 | 202 | dbt_client=dbt_client, |
184 | 203 | dataset_id=dataset_id, |
185 | 204 | table_id=table_id, |
|
188 | 207 | _vars=[date_range, dataset_sha], |
189 | 208 | flags="--full-refresh", |
190 | 209 | ) |
191 | | - set_last_run_timestamp( |
192 | | - dataset_id=dataset_id, |
193 | | - table_id=table_id, |
194 | | - timestamp=date_range["date_range_end"], |
195 | | - wait=RUN, |
196 | | - mode=MODE, |
197 | | - ) |
| 210 | + |
198 | 211 | with case(rebuild, False): |
199 | | - RUN = run_dbt_model( |
| 212 | + RUN_FALSE = run_dbt_model( |
200 | 213 | dbt_client=dbt_client, |
201 | 214 | dataset_id=dataset_id, |
202 | 215 | table_id=table_id, |
203 | 216 | exclude="+data_versao_efetiva", |
204 | 217 | _vars=[date_range, dataset_sha], |
205 | 218 | upstream=True, |
206 | 219 | ) |
207 | | - set_last_run_timestamp( |
| 220 | + |
| 221 | + RUN = merge(RUN_TRUE, RUN_FALSE) |
| 222 | + |
| 223 | + with case(rematerialization, False): |
| 224 | + SET_FALSE = set_last_run_timestamp( |
208 | 225 | dataset_id=dataset_id, |
209 | 226 | table_id=table_id, |
210 | 227 | timestamp=date_range["date_range_end"], |
211 | 228 | wait=RUN, |
212 | 229 | mode=MODE, |
213 | 230 | ) |
214 | 231 |
|
| 232 | + with case(rematerialization, True): |
| 233 | + SET_TRUE = task( |
| 234 | + lambda: [None], |
| 235 | + checkpoint=False, |
| 236 | + name="assign_none_to_previous_runs", |
| 237 | + )() |
| 238 | + |
| 239 | + SET = merge(SET_TRUE, SET_FALSE) |
| 240 | + |
| 241 | + materialize_sppo.set_reference_tasks([RUN, rematerialization_dates, SET]) |
| 242 | + |
215 | 243 | materialize_sppo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value) |
216 | 244 | materialize_sppo.run_config = KubernetesRun( |
217 | 245 | image=emd_constants.DOCKER_IMAGE.value, |
|
282 | 310 | ) |
283 | 311 | captura_sppo_v2.schedule = every_minute |
284 | 312 |
|
| 313 | +with Flow(  # Subflow: queries logs for errors and re-runs the realocacao capture for the returned timestamps
| 314 | + "SMTR: GPS SPPO Realocação - Recaptura (subflow)",
| 315 | + code_owners=["caio", "fernanda", "boris", "rodrigo"],
| 316 | +) as recaptura_realocacao_sppo:
| 317 | + timestamp = Parameter("timestamp", default=None)  # forwarded to get_rounded_timestamp below
| 318 | + recapture_window_days = Parameter("recapture_window_days", default=1)  # forwarded to query_logs
| 319 | +
| 320 | + # SETUP #
| 321 | + LABELS = get_current_flow_labels()
| 322 | +
| 323 | + # Query the logs to check for errors
| 324 | + errors, timestamps, previous_errors = query_logs(
| 325 | + dataset_id=constants.GPS_SPPO_RAW_DATASET_ID.value,
| 326 | + table_id=constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value,
| 327 | + datetime_filter=get_rounded_timestamp(timestamp=timestamp, interval_minutes=10),
| 328 | + interval_minutes=10,
| 329 | + recapture_window_days=recapture_window_days,
| 330 | + )
| 331 | +
| 332 | + rename_flow_run = rename_current_flow_run_now_time(
| 333 | + prefix=recaptura_realocacao_sppo.name + ": ",
| 334 | + now_time=get_now_time(),
| 335 | + wait=timestamps,  # passes the query_logs output as `wait` (presumably only to order this after query_logs; confirm)
| 336 | + )
| 337 | +
| 338 | + # If there are errors, run the recapture
| 339 | + with case(errors, True):
| 340 | + # SETUP #
| 341 | + partitions = create_date_hour_partition.map(timestamps)  # one mapped run per entry in timestamps
| 342 | + filename = parse_timestamp_to_string.map(timestamps)
| 343 | +
| 344 | + filepath = create_local_partition_path.map(
| 345 | + dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
| 346 | + table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
| 347 | + filename=filename,
| 348 | + partitions=partitions,
| 349 | + )
| 350 | +
| 351 | + url = create_api_url_onibus_realocacao.map(timestamp=timestamps)
| 352 | +
| 353 | + # EXTRACT #
| 354 | + raw_status = get_raw.map(url)
| 355 | +
| 356 | + raw_filepath = save_raw_local.map(status=raw_status, file_path=filepath)
| 357 | +
| 358 | + # CLEAN #
| 359 | + treated_status = pre_treatment_br_rj_riodejaneiro_onibus_realocacao.map(
| 360 | + status=raw_status, timestamp=timestamps
| 361 | + )
| 362 | +
| 363 | + treated_filepath = save_treated_local.map(
| 364 | + status=treated_status, file_path=filepath
| 365 | + )
| 366 | +
| 367 | + # LOAD #
| 368 | + error = bq_upload.map(
| 369 | + dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
| 370 | + table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
| 371 | + filepath=treated_filepath,
| 372 | + raw_filepath=raw_filepath,
| 373 | + partitions=partitions,
| 374 | + status=treated_status,
| 375 | + )
| 376 | +
| 377 | + upload_logs_to_bq.map(
| 378 | + dataset_id=unmapped(constants.GPS_SPPO_RAW_DATASET_ID.value),
| 379 | + parent_table_id=unmapped(constants.GPS_SPPO_REALOCACAO_RAW_TABLE_ID.value),
| 380 | + error=error,
| 381 | + previous_error=previous_errors,
| 382 | + timestamp=timestamps,
| 383 | + recapture=unmapped(True),  # passes recapture=True to every mapped upload_logs_to_bq run
| 384 | + )
| 385 | +
| 386 | +recaptura_realocacao_sppo.storage = GCS(emd_constants.GCS_FLOWS_BUCKET.value)  # same storage setup as materialize_sppo
| 387 | +recaptura_realocacao_sppo.run_config = KubernetesRun(
| 388 | + image=emd_constants.DOCKER_IMAGE.value,
| 389 | + labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
| 390 | +)
285 | 391 |
|
286 | 392 | with Flow( |
287 | 393 | "SMTR: GPS SPPO - Tratamento", code_owners=["caio", "fernanda", "boris", "rodrigo"] |
288 | 394 | ) as recaptura: |
289 | 395 | version = Parameter("version", default=2) |
290 | | - datetime_filter = Parameter("datetime_filter", default=None) |
| 396 | + datetime_filter_gps = Parameter("datetime_filter_gps", default=None) |
291 | 397 | materialize = Parameter("materialize", default=True) |
292 | 398 | # SETUP # |
293 | 399 | LABELS = get_current_flow_labels() |
294 | 400 |
|
| 401 | + rounded_timestamp = get_rounded_timestamp(interval_minutes=60) |
| 402 | + rounded_timestamp_str = parse_timestamp_to_string( |
| 403 | + timestamp=rounded_timestamp, pattern="%Y-%m-%d %H:%M:%S" |
| 404 | + ) |
| 405 | + |
| 406 | + # roda o subflow de recaptura da realocação |
| 407 | + run_recaptura_realocacao_sppo = create_flow_run( |
| 408 | + flow_name=recaptura_realocacao_sppo.name, |
| 409 | + project_name=emd_constants.PREFECT_DEFAULT_PROJECT.value, |
| 410 | + labels=LABELS, |
| 411 | + run_name=recaptura_realocacao_sppo.name, |
| 412 | + parameters={"timestamp": rounded_timestamp_str}, |
| 413 | + ) |
| 414 | + |
| 415 | + wait_recaptura_realocacao_sppo = wait_for_flow_run( |
| 416 | + run_recaptura_realocacao_sppo, |
| 417 | + stream_states=True, |
| 418 | + stream_logs=True, |
| 419 | + raise_final_state=True, |
| 420 | + ) |
| 421 | + |
295 | 422 | errors, timestamps, previous_errors = query_logs( |
296 | 423 | dataset_id=constants.GPS_SPPO_RAW_DATASET_ID.value, |
297 | 424 | table_id=constants.GPS_SPPO_RAW_TABLE_ID.value, |
298 | | - datetime_filter=datetime_filter, |
| 425 | + datetime_filter=datetime_filter_gps, |
| 426 | + upstream_tasks=[wait_recaptura_realocacao_sppo], |
299 | 427 | ) |
300 | 428 |
|
301 | 429 | rename_flow_run = rename_current_flow_run_now_time( |
|
0 commit comments