Skip to content

Fixing the retraction logic #57

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions create_filtered_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,29 @@
from functools import reduce
from tqdm.autonotebook import tqdm
from datetime import date
from retractions import query_retraction_retry
from retractions import query_retraction

gcs = gcsfs.GCSFileSystem()
catalog_url = "https://cmip6.storage.googleapis.com/pangeo-cmip6.csv"
node_urls = [
"https://esgf-node.llnl.gov/esg-search/search",
"https://esgf-data.dkrz.de/esg-search/search",
"https://esgf-index1.ceda.ac.uk/esg-search/search",
# "https://esgf-index1.ceda.ac.uk/esg-search/search",
"https://esgf-node.ipsl.upmc.fr/esg-search/search",
]

params = {
"type": "Dataset",
"mip_era": "CMIP6",
"replica": "false",
# "replica": "false", #FIXME: Somehow this still does not give me the same results from every node...very strange
"distrib": "true",
"retracted": "true",
"format": "application/solr+json",
"fields": "instance_id",
}
# query every one of the nodes
retracted_ids = {
url.split('.')[1] :query_retraction_retry(
url.split('.')[1] :query_retraction(
url, params, batchsize=10000
) for url in node_urls
}
Expand Down Expand Up @@ -79,8 +79,6 @@ def unique_instances(df, df_full):
backup_df = pd.read_csv(f"https://cmip6.storage.googleapis.com/{backup_filename}")
print(f'Backed up catalog has {len(backup_df)} items')



# FILTER THE CURRENT CATALOG
pangeo_df["instance_id"] = pangeo_df["zstore"].apply(
lambda x: ".".join(x.replace("gs://cmip6/", "").split("/")[0:-1])
Expand Down
50 changes: 20 additions & 30 deletions retractions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from tqdm.autonotebook import tqdm
import requests
import json
import time
Expand All @@ -7,23 +6,30 @@ def query_retraction(url, params, batchsize):
print(f"Downloading Retraction Data from {url}...")
resp = requests.get(url=url, params=params)
header = resp.json() # Check the JSON Response Content documentation below
n_items = header["response"]["numFound"]
print(f"Found {n_items} items.")

batches = range(0, n_items+1, batchsize) # if I offset these, can
params["limit"] = batchsize

batch_jsons = []

print(f"batchsize= {batchsize}")

for batch in tqdm(batches):
params["offset"] = batch
has_data = True
offset = 0
while has_data:
print('----------')
print(f"offset={offset}")
params["offset"] = offset
resp = requests.get(url=url, params=params)
if resp.status_code != 200:
print(batch)
print(resp.status_code)
data = resp.json()
n_data = len(data["response"]["docs"])
print(f"{n_data} entries found")

batch_jsons.append(data)
offset +=batchsize

if len(data["response"]["docs"]) == 0:
has_data = False
print('No more data found')

# Convert to list of instance ids
print("Extracting instance_ids...")
Expand All @@ -32,29 +38,13 @@ def query_retraction(url, params, batchsize):
extracted = [i["instance_id"] for i in data["response"]["docs"]]
all_retracted_instance_ids.extend(extracted)

# Fail out here if the total number of items is not what was promised in the header
# I had a few instances today where that was the case; I think a simple retry is
# a good enough solution for now.
n_retracted = len(all_retracted_instance_ids)
if n_retracted == n_items:
print('Successfully downloaded all retraction info')
else:
raise RuntimeError(f'Downloaded retraction info is incomplete. Found {n_retracted} items, expected {n_items}')


print(f'Downloaded {n_retracted} retraction info')

# There is the possibility that we are getting duplicate instance_ids here because we query replicas
# Make sure duplicates are not carried forward
# TODO: Do we need to check that retracted is indeed true here?
retracted_instance_ids = set(all_retracted_instance_ids)
print(f"{len(all_retracted_instance_ids)-len(retracted_instance_ids)} replicas found")
return retracted_instance_ids

def query_retraction_retry(url, params, batchsize = 10000):
"""Retrys query if it fails"""
status = 0
while status == 0:
try:
query_result = query_retraction(url, params, batchsize)
status = 1
except RuntimeError as e:
print(f"{e}.\nRetrying")

return query_result