-from collections import defaultdict
-from datetime import datetime, timedelta
-import dateutil
-from copy import deepcopy
 import asyncio
+import functools
 import json
+import operator
+import re
+from collections import Counter, defaultdict
+from copy import deepcopy
+from datetime import datetime, timedelta
 
-from util.job_submitter import try_submit_mozart_job
-
-from data_subscriber.es_conn_util import get_document_timestamp_min_max
+import dateutil
 
 from data_subscriber.cmr import CMR_TIME_FORMAT, async_query_cmr
-from data_subscriber.url import determine_acquisition_cycle, rtc_for_dist_unique_id
 from data_subscriber.cslc_utils import save_blocked_download_job, parse_r2_product_file_name
-from data_subscriber.query import BaseQuery, get_query_timerange, DateTimeRange
 from data_subscriber.dist_s1_utils import (localize_dist_burst_db, process_dist_burst_db, compute_dist_s1_triggering,
                                            extend_rtc_for_dist_records, build_rtc_native_ids, rtc_granules_by_acq_index,
                                            basic_decorate_granule, add_unique_rtc_granules, get_unique_rtc_id_for_dist,
-                                           parse_k_parameter, decorate_granule, PENDING_TYPE_RTC_FOR_DIST_DOWNLOAD)
+                                           parse_k_parameter, PENDING_TYPE_RTC_FOR_DIST_DOWNLOAD)
+from data_subscriber.es_conn_util import get_document_timestamp_min_max
+from data_subscriber.query import BaseQuery, DateTimeRange
 from data_subscriber.rtc_for_dist.dist_dependency import DistDependency, CMR_RTC_CACHE_INDEX
-
+from rtc_utils import rtc_granule_regex
 from tools.populate_cmr_rtc_cache import populate_cmr_rtc_cache, parse_rtc_granule_metadata
+from util.job_submitter import try_submit_mozart_job
 
 DIST_K_MULT_FACTOR = 2  # TODO: This should be a setting in probably settings.yaml; must be an integer
 EARLIEST_POSSIBLE_RTC_DATE = "2016-01-01T00:00:00Z"
@@ -298,7 +299,7 @@ def retrieve_baseline_granules(self, product_id, downloads, args, k_offsets_and_
         self.logger.info(f"Retrieving K-1 granules {start_date=} {end_date=} for {product_id=}")
         self.logger.debug(new_args)
 
-        # Step 1 of 2: This will return a dict of acquisition_cycle -> set of granules for only the ones that match the burst pattern
+        # Step 1 of 3: This will return a dict of acquisition_cycle -> set of granules for only the ones that match the burst pattern
         granules = asyncio.run(async_query_cmr(new_args, self.token, self.cmr, self.settings, query_timerange, datetime.now(), verbose=verbose))
         for granule in granules:
             basic_decorate_granule(granule)
@@ -307,29 +308,72 @@ def retrieve_baseline_granules(self, product_id, downloads, args, k_offsets_and_
         granules = self.unique_latest_granules(granules)
         granules_map = rtc_granules_by_acq_index(granules)
 
-        # Step 2 of 2 ...Sort that by acquisition_cycle in decreasing order and then pick the first k-1 frames
+        # Step 2 of 3 ...Sort that by acquisition_cycle in decreasing order and then pick the first k-1 frames
         acq_day_indices = sorted(granules_map.keys(), reverse=True)
+        possible_k_granules = []
         for acq_day_index in acq_day_indices:
             granules = granules_map[acq_day_index]
-            k_granules.extend(granules)
+            possible_k_granules.extend(granules)
             k_satisfied += 1
             self.logger.info(f"{product_id=} {acq_day_index=} satisfies. {k_satisfied=} {k_offset=} {k_count=} {len(granules)=}")
             if k_satisfied == k_count:
                 break
 
         counter += 1
 
+        # Step 3 of 3: Only copy over k_count per burst_id from possible_k_granules to k_granules
+        burst_id_to_granules_map = defaultdict(list)
+        for granule in possible_k_granules:
+            match_product_id = re.match(rtc_granule_regex, granule["granule_id"])
+            burst_id = match_product_id.group("burst_id")
+            if len(burst_id_to_granules_map[burst_id]) >= k_count:
+                continue  # skip any extra baseline granules per burst_id, capping the number to k_count, per k_offset
+
+            burst_id_to_granules_map[burst_id].append(granule)
+        burst_id_to_granules_map = dict(burst_id_to_granules_map)
+
+        possible_k_granules = functools.reduce(operator.add, burst_id_to_granules_map.values(), [])
+        k_granules.extend(possible_k_granules)
+
+        self.logger.info(f"{len(k_granules)=}")
         return k_granules
 
     def download_job_submission_handler(self, total_granules, query_timerange):
 
         def add_filtered_urls(granule, filtered_urls: list):
             if granule.get("filtered_urls"):
+                polarizations = []
                 for filter_url in granule.get("filtered_urls"):
-                    # Get rid of .h and mask.tif files that aren't used
-                    # NOTE: If we want to enable https downloads in the download worker, we need to change this
-                    if "s3://" in filter_url and (filter_url[-6:] in ["VV.tif", "VH.tif", "HH.tif", "HV.tif"]):
-                        filtered_urls.append(filter_url)
+                    if filter_url.endswith("VV.tif"):
+                        polarizations.append("VVVH")
+                    if filter_url.endswith("HH.tif"):
+                        polarizations.append("HHHV")
+
+                most_common_polarization = Counter(polarizations).most_common(1)
+
+                if most_common_polarization and most_common_polarization[0][0] == "VVVH":
+                    for filter_url in granule.get("filtered_urls"):
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if not filter_url.startswith("s3://"):
+                            continue
+
+                        if any(filter_url.endswith(s) for s in ["VV.tif", "VH.tif"]):
+                            filtered_urls.append(filter_url)
+                elif most_common_polarization and most_common_polarization[0][0] == "HHHV":
+                    for filter_url in granule.get("filtered_urls"):
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if not filter_url.startswith("s3://"):
+                            continue
+
+                        if any(filter_url.endswith(s) for s in ["HH.tif", "HV.tif"]):
+                            filtered_urls.append(filter_url)
+                else:
+                    self.logger.error(f"Unexpected polarization {most_common_polarization=}. Falling back to regular filtering.")
+                    for filter_url in granule.get("filtered_urls"):
+                        # Get rid of .h and mask.tif files that aren't used
+                        # NOTE: If we want to enable https downloads in the download worker, we need to change this
+                        if "s3://" in filter_url and (filter_url[-6:] in ["VV.tif", "VH.tif", "HH.tif", "HV.tif"]):
+                            filtered_urls.append(filter_url)
 
         batch_id_to_urls_map = defaultdict(list)
         batch_id_to_baseline_urls = defaultdict(list)
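The Step 3 change above groups the candidate baseline granules by burst_id, keeps at most k_count of them per burst, and then flattens the per-burst lists back into one list with functools.reduce(operator.add, ...). A minimal standalone sketch of that capping-and-flattening pattern follows; the regex and granule IDs are illustrative stand-ins, not the real rtc_granule_regex or CMR records.

import functools
import operator
import re
from collections import defaultdict

# Illustrative stand-in for rtc_granule_regex (the real pattern lives in rtc_utils).
burst_id_regex = r"OPERA_L2_RTC-S1_(?P<burst_id>T\d{3}-\d{6}-IW\d)_"

def cap_granules_per_burst(granules, k_count):
    """Keep at most k_count granules per burst_id, then flatten back into one list."""
    burst_id_to_granules = defaultdict(list)
    for granule in granules:
        burst_id = re.match(burst_id_regex, granule["granule_id"]).group("burst_id")
        if len(burst_id_to_granules[burst_id]) < k_count:
            burst_id_to_granules[burst_id].append(granule)
    # functools.reduce with operator.add concatenates the per-burst lists.
    return functools.reduce(operator.add, burst_id_to_granules.values(), [])

granules = [
    {"granule_id": "OPERA_L2_RTC-S1_T020-041121-IW1_20240101T000000Z"},
    {"granule_id": "OPERA_L2_RTC-S1_T020-041121-IW1_20240113T000000Z"},
    {"granule_id": "OPERA_L2_RTC-S1_T020-041121-IW1_20240125T000000Z"},
    {"granule_id": "OPERA_L2_RTC-S1_T020-041122-IW2_20240101T000000Z"},
]
print(len(cap_granules_per_burst(granules, k_count=2)))  # 3: two kept for IW1, one for IW2

The empty-list initializer passed to reduce keeps the call safe when no granules survive the capping, mirroring the behavior in the diff.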
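add_filtered_urls now tallies which polarization family a granule's URLs belong to (VV/VH vs. HH/HV) with collections.Counter, keeps only the s3:// GeoTIFFs of the dominant pair, and falls back to the previous four-polarization filter (plus an error log) when neither family is detected. A standalone sketch of that majority-vote selection, using made-up URLs rather than real CMR filtered_urls:

from collections import Counter

def select_polarization_urls(filtered_urls):
    """Keep s3:// tifs of the dominant polarization pair; fall back to all four pols."""
    polarizations = []
    for url in filtered_urls:
        if url.endswith("VV.tif"):
            polarizations.append("VVVH")
        if url.endswith("HH.tif"):
            polarizations.append("HHHV")

    # most_common(1) returns [] when no VV or HH tifs were seen, which triggers the fallback.
    most_common = Counter(polarizations).most_common(1)

    if most_common and most_common[0][0] == "VVVH":
        suffixes = ("VV.tif", "VH.tif")
    elif most_common and most_common[0][0] == "HHHV":
        suffixes = ("HH.tif", "HV.tif")
    else:
        suffixes = ("VV.tif", "VH.tif", "HH.tif", "HV.tif")

    return [u for u in filtered_urls if u.startswith("s3://") and u.endswith(suffixes)]

urls = [
    "s3://bucket/granule_VV.tif",
    "s3://bucket/granule_VH.tif",
    "s3://bucket/granule_mask.tif",
    "https://example.com/granule_VV.tif",
]
print(select_polarization_urls(urls))  # only the two s3:// VV/VH tifs survive

The diff counts only the co-pol files (VV or HH) when deciding the family, which assumes a cross-pol layer always ships alongside its matching co-pol layer.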