@@ -234,29 +234,30 @@ def __cmr_collection_query(provider, short_name):
234234 return search_results ['feed' ]['entry' ]
235235
236236def __cmr_query (provider , short_name , version , time_start , time_end , ** kwargs ):
237- """Perform a scrolling CMR query for files matching input criteria."""
237+ """Perform a search-after CMR query for files matching input criteria."""
238238 kwargs .setdefault ('polygon' ,None )
239239 kwargs .setdefault ('name_filter' ,None )
240240 kwargs .setdefault ('return_metadata' ,False )
241241 # build params
242242 params = '&short_name={0}' .format (short_name )
243243 if version != None :
244244 params += '&version={0}' .format (version )
245- if time_start != None and time_end != None :
245+ if time_start is not None and time_end is not None :
246246 params += '&temporal[]={0},{1}' .format (time_start , time_end )
247247 if kwargs ['polygon' ]:
248248 params += '&polygon={0}' .format (kwargs ['polygon' ])
249249 if kwargs ['name_filter' ]:
250250 params += '&options[producer_granule_id][pattern]=true'
251251 params += '&producer_granule_id[]=' + kwargs ['name_filter' ]
252+
252253 CMR_URL = 'https://cmr.earthdata.nasa.gov'
253254 cmr_query_url = ('{0}/search/granules.json?provider={1}'
254255 '&sort_key[]=start_date&sort_key[]=producer_granule_id'
255- '&scroll=true& page_size={2}' .format (CMR_URL , provider , CMR_PAGE_SIZE ))
256+ '&page_size={2}' .format (CMR_URL , provider , CMR_PAGE_SIZE ))
256257 cmr_query_url += params
257- logger .debug ('cmr request={0} \n ' . format ( cmr_query_url ) )
258+ logger .debug (f'Initial CMR request: { cmr_query_url } ' )
258259
259- cmr_scroll_id = None
260+ cmr_search_after = None
260261 ctx = ssl .create_default_context ()
261262 ctx .check_hostname = False
262263 ctx .verify_mode = ssl .CERT_NONE
@@ -266,15 +267,18 @@ def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
266267 metadata = sliderule .emptyframe ()
267268 while True :
268269 req = urllib .request .Request (cmr_query_url )
269- if cmr_scroll_id :
270- req .add_header ('cmr-scroll-id' , cmr_scroll_id )
270+ if cmr_search_after :
271+ req .add_header ('CMR-Search-After' , cmr_search_after )
272+ logger .debug (f'Requesting next page with CMR-Search-After: { cmr_search_after } ' )
273+
271274 response = urllib .request .urlopen (req , context = ctx )
272- if not cmr_scroll_id :
273- # Python 2 and 3 have different case for the http headers
274- headers = { k . lower (): v for k , v in dict ( response . info ()). items ()}
275- cmr_scroll_id = headers [ 'cmr-scroll-id' ]
275+
276+ headers = { k . lower (): v for k , v in dict ( response . info ()). items ()}
277+ cmr_search_after = headers . get ( 'cmr-search-after' )
278+
276279 search_page = response .read ()
277280 search_page = json .loads (search_page .decode ('utf-8' ))
281+
278282 url_scroll_results = __cmr_filter_urls (search_page , DATASETS [short_name ]["formats" ])
279283 if not url_scroll_results :
280284 break
@@ -284,10 +288,22 @@ def __cmr_query(provider, short_name, version, time_start, time_end, **kwargs):
284288 metadata_results = __cmr_granule_metadata (search_page )
285289 else :
286290 metadata_results = geopandas .pd .DataFrame ([None for _ in url_scroll_results ])
291+
287292 # append granule metadata
288293 metadata = geopandas .pd .concat ([metadata , metadata_results ])
289294
290- return (urls ,metadata )
295+ # Two ways to determine that there is no more data available:
296+ # 1. The number of granules in the current response is less than the requested 'page_size'.
297+ # 2. The response does not include a 'CMR-Search-After' header.
298+ result_count = len (search_page ['feed' ]['entry' ])
299+ if result_count < CMR_PAGE_SIZE :
300+ logger .debug (f'Received { result_count } results, fewer than page size. Ending pagination after processing.' )
301+ break
302+ if not cmr_search_after :
303+ logger .debug ('No CMR-Search-After header found, no more pages.' )
304+ break
305+
306+ return urls , metadata
291307
292308###############################################################################
293309# CMR UTILITIES
@@ -389,7 +405,12 @@ def __cmr_max_version(provider, short_name):
389405#
390406def __build_geojson (rsps ):
391407 geojson = rsps .json ()
392- del geojson ["links" ]
408+ next = None
409+ if "links" in geojson :
410+ for link in geojson ["links" ]:
411+ if link ["rel" ] == "next" :
412+ next = link ["href" ]
413+ del geojson ["links" ]
393414 if 'numberMatched' in geojson :
394415 del geojson ['numberMatched' ]
395416 if 'numberReturned' in geojson :
@@ -410,7 +431,7 @@ def __build_geojson(rsps):
410431 if "href" in assetsDict [val ]:
411432 propertiesDict [val ] = assetsDict [val ]["href" ]
412433 del geojson ["features" ][i ]["assets" ]
413- return geojson
434+ return geojson , next
414435
415436#
416437# Perform a STAC Query
@@ -450,22 +471,16 @@ def __stac_search(provider, short_name, collections, polygons, time_start, time_
450471 # make initial stac request
451472 data = context .post (url , data = json .dumps (rqst ), headers = headers )
452473 data .raise_for_status ()
453- geojson = __build_geojson (data )
454-
455- # iterate through additional pages if not all returned
456- num_returned = geojson ["context" ]["returned" ]
457- num_matched = geojson ["context" ]["matched" ]
458- if num_matched > max_requested_resources :
459- logger .warn ("Number of matched resources truncated from {} to {}" .format (num_matched , max_requested_resources ))
460- num_matched = max_requested_resources
461- num_pages = int ((num_matched + (num_returned - 1 )) / num_returned )
462- for page in range (2 , num_pages + 1 ):
463- rqst ["page" ] = page
464- data = context .post (url , data = json .dumps (rqst ), headers = headers )
474+ geojson , next_link = __build_geojson (data )
475+
476+ # Continue fetching pages while a 'next' link is available
477+ while next_link :
478+ data = context .get (next_link , headers = headers )
465479 data .raise_for_status ()
466- _geojson = __build_geojson (data )
480+ _geojson , next_link = __build_geojson (data )
467481 geojson ["features" ] += _geojson ["features" ]
468- geojson ["context" ]["returned" ] = num_matched
482+
483+ geojson ["context" ]["returned" ] = len (geojson ["features" ])
469484 geojson ["context" ]["limit" ] = max_requested_resources
470485
471486 # return geojson dictionary
0 commit comments