Skip to content

Fixing the retraction logic #57

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions create_filtered_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,29 @@
from functools import reduce
from tqdm.autonotebook import tqdm
from datetime import date
from retractions import query_retraction_retry
from retractions import query_retraction

gcs = gcsfs.GCSFileSystem()
catalog_url = "https://cmip6.storage.googleapis.com/pangeo-cmip6.csv"
node_urls = [
"https://esgf-node.llnl.gov/esg-search/search",
"https://esgf-data.dkrz.de/esg-search/search",
"https://esgf-index1.ceda.ac.uk/esg-search/search",
# "https://esgf-index1.ceda.ac.uk/esg-search/search",
"https://esgf-node.ipsl.upmc.fr/esg-search/search",
]

params = {
"type": "Dataset",
"mip_era": "CMIP6",
"replica": "false",
# "replica": "false", #FIXME: Somehow this still does not give me the same results from every node...very strange
"distrib": "true",
"retracted": "true",
"format": "application/solr+json",
"fields": "instance_id",
}
# query every one of the nodes
retracted_ids = {
url.split('.')[1] :query_retraction_retry(
url.split('.')[1] :query_retraction(
url, params, batchsize=10000
) for url in node_urls
}
Expand Down Expand Up @@ -79,8 +79,6 @@ def unique_instances(df, df_full):
backup_df = pd.read_csv(f"https://cmip6.storage.googleapis.com/{backup_filename}")
print(f'Backed up catalog has {len(backup_df)} items')



# FILTER THE CURRENT CATALOG
pangeo_df["instance_id"] = pangeo_df["zstore"].apply(
lambda x: ".".join(x.replace("gs://cmip6/", "").split("/")[0:-1])
Expand Down
50 changes: 20 additions & 30 deletions retractions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from tqdm.autonotebook import tqdm
import requests
import json
import time
Expand All @@ -7,23 +6,30 @@ def query_retraction(url, params, batchsize):
print(f"Downloading Retraction Data from {url}...")
resp = requests.get(url=url, params=params)
header = resp.json() # Check the JSON Response Content documentation below
n_items = header["response"]["numFound"]
print(f"Found {n_items} items.")

batches = range(0, n_items+1, batchsize) # if I offset these, can
params["limit"] = batchsize

batch_jsons = []

print(f"batchsize= {batchsize}")

for batch in tqdm(batches):
params["offset"] = batch
has_data = True
offset = 0
while has_data:
print('----------')
print(f"offset={offset}")
params["offset"] = offset
resp = requests.get(url=url, params=params)
if resp.status_code != 200:
print(batch)
print(resp.status_code)
data = resp.json()
n_data = len(data["response"]["docs"])
print(f"{n_data} entries found")

batch_jsons.append(data)
offset +=batchsize

if len(data["response"]["docs"]) == 0:
has_data = False
print('No more data found')

# Convert to list of instance ids
print("Extracting instance_ids...")
Expand All @@ -32,29 +38,13 @@ def query_retraction(url, params, batchsize):
extracted = [i["instance_id"] for i in data["response"]["docs"]]
all_retracted_instance_ids.extend(extracted)

# Fail out here if the total number of items is not what was promised in the header
# I had a few instances today where that was the case; I think a simple retry is
# a good enough solution for now.
n_retracted = len(all_retracted_instance_ids)
if n_retracted == n_items:
print('Successfully downloaded all retraction info')
else:
raise RuntimeError(f'Downloaded retraction info is incomplete. Found {n_retracted} items, expected {n_items}')


print(f'Downloaded {n_retracted} retraction info')

# There is the possibility that we are getting duplicate instance_ids here because we query replicas
# Make sure duplicates are not carried forward
# TODO: Do we need to check that retracted is indeed true here?
retracted_instance_ids = set(all_retracted_instance_ids)
print(f"{len(all_retracted_instance_ids)-len(retracted_instance_ids)} replicas found")
return retracted_instance_ids

def query_retraction_retry(url, params, batchsize = 10000):
"""Retrys query if it fails"""
status = 0
while status == 0:
try:
query_result = query_retraction(url, params, batchsize)
status = 1
except RuntimeError as e:
print(f"{e}.\nRetrying")

return query_result