Skip to content

Commit b9373d5

Browse files
Merge pull request #237 from ARGA-Genomes/alaBiocache
ALA Biocache
2 parents 608e13d + 7e935f4 commit b9373d5

File tree

5 files changed

+174
-47
lines changed

5 files changed

+174
-47
lines changed

dataSources/ala/avh/config.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

dataSources/ala/avh/scripts/processing.py

Lines changed: 0 additions & 26 deletions
This file was deleted.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"datasetID": "0007001",
3+
"retrieveType": "script",
4+
"downloading": {
5+
"path": "./processing.py",
6+
"function": "collectBiocache",
7+
"args": [
8+
{
9+
"q": "*:*",
10+
"fq": "(basis_of_record:\"PRESERVED_SPECIMEN\" OR basis_of_record:\"MATERIAL_SAMPLE\" OR basis_of_record:\"LIVING_SPECIMEN\" OR basis_of_record:\"MATERIAL_CITATION\")",
11+
"disableAllQualityFilters": true,
12+
"qualityProfile": "ALA",
13+
"qc": "-_nest_parent_:*"
14+
},
15+
"{OUT-PATH}"
16+
],
17+
"output": "biocache.zip"
18+
},
19+
"processing": {
20+
"linear": [
21+
{
22+
"path": ".lib/zipping.py",
23+
"function": "extract",
24+
"args": [
25+
"{IN-PATH}",
26+
"{OUT-DIR}"
27+
],
28+
"output": "biocache"
29+
},
30+
{
31+
"path": "./processing.py",
32+
"function": "cleanup",
33+
"args": [
34+
"{IN-PATH}",
35+
"{OUT-PATH}"
36+
],
37+
"output": "biocache.csv"
38+
}
39+
]
40+
},
41+
"conversion": {
42+
"mapColumnName": "ala-biocache mappings"
43+
},
44+
"update": {
45+
"type": "weekly",
46+
"day": "sunday",
47+
"time": 9,
48+
"repeat": 2
49+
}
50+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import requests
2+
from pathlib import Path
3+
import lib.downloading as dl
4+
import logging
5+
from lib.secrets import secrets
6+
import lib.bigFiles as bf
7+
8+
def collectBiocache(queryParamters: dict, outputFilePath: Path) -> None:
    """Request an offline ALA biocache download and save it to outputFilePath.

    Submits the query to the ALA occurrences offline-download endpoint, then
    polls the returned status URL until the export is ready and downloads it.

    Args:
        queryParamters: Query parameters merged over the base email parameters
            (e.g. "q", "fq", "qualityProfile"). Name kept for caller compatibility.
        outputFilePath: Destination path for the downloaded archive.

    Raises:
        requests.HTTPError: If the initial download request is rejected.
    """
    parameters = {
        "email": secrets.general.email,
        "emailNotify": False
    }

    baseURL = "https://api.ala.org.au/occurrences/occurrences/offline/download?"
    url = dl.urlBuilder(baseURL, parameters | queryParamters)

    # Bounded timeout so a dead endpoint can't hang the pipeline; fail fast on
    # an HTTP error instead of raising an opaque KeyError on the JSON below.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()

    statusURL = data["statusUrl"]
    totalRecords = data["totalRecords"]
    logging.info(f"Found {totalRecords} total records")

    dl.asyncRunner(statusURL, "status", "finished", "downloadUrl", outputFilePath)
25+
26+
def cleanup(folderPath: Path, outputFilePath: Path) -> None:
    """Strip ALA metadata files from an extracted biocache folder, then merge
    the remaining data files into a single output file.

    Args:
        folderPath: Directory containing the extracted biocache download.
        outputFilePath: Path of the combined output file to create.
    """
    # Metadata shipped alongside the records; not wanted in the combined output.
    for surplus in ("citation.csv", "headings.csv", "README.html"):
        (folderPath / surplus).unlink(missing_ok=True)

    bf.combineDirectoryFiles(outputFilePath, folderPath)
38+
39+
# status = {
40+
# "inQueue": [
41+
# "totalRecords",
42+
# "queueSize",
43+
# "statusUrl",
44+
# "cancelUrl",
45+
# "searchUrl"
46+
# ],
47+
# "running": [
48+
# "totalRecords",
49+
# "records",
50+
# "statusUrl",
51+
# "cancelUrl",
52+
# "searchUrl"
53+
# ],
54+
# "finished": [
55+
# "totalRecords",
56+
# "queueSize",
57+
# "downloadUrl",
58+
# "statusUrl",
59+
# "cancelUrl",
60+
# "searchUrl"
61+
# ]
62+
# }

src/lib/downloading.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from requests.exceptions import HTTPError
55
import logging
66
from lib.progressBar import ProgressBar
7+
from urllib.parse import quote
8+
import time
79

810
class RepeatDownloader:
911
def __init__(self, headers: dict = {}, username: str = "", password: str = "", chunkSize: int = 1024*1024, verbose: bool = False):
@@ -57,3 +59,63 @@ def download(url: str, filePath: Path, chunkSize: int = 1024*1024, verbose: bool
5759
print(f"Downloaded chunk: {idx}", end="\r")
5860

5961
return True
62+
63+
def urlBuilder(url: str, parameters: dict) -> str:
    """Append URL-encoded query parameters to *url*.

    The caller supplies *url* already ending in "?" (or "&"); this function
    only appends "key=value" pairs joined with "&". List values are expanded
    into one repeated "key=item" pair per item.

    Args:
        url: Base URL, including the trailing "?" separator.
        parameters: Mapping of parameter name to value. Values may be str,
            bool, int, float, or a list of any of those.

    Returns:
        The URL with the encoded query string appended.
    """
    def encode(key: str, value: object) -> str:
        # bool must be tested before the generic fallback: ALA expects
        # lowercase "true"/"false", not Python's "True"/"False".
        if isinstance(value, bool):
            value = str(value).lower()
        elif not isinstance(value, str):
            # Generalized from int-only: floats and other scalars stringify too.
            value = str(value)

        return f"{key}={quote(value)}"

    flatParams = []
    for parameterName, parameterValue in parameters.items():
        if isinstance(parameterValue, list):
            flatParams.extend(encode(parameterName, item) for item in parameterValue)
        else:
            flatParams.append(encode(parameterName, parameterValue))

    return url + "&".join(flatParams)
83+
84+
def asyncRunner(checkURL: str, statusField: str, completedStr: str, downloadField: str, outputFilePath: Path, recheckDelay: int = 10) -> bool:
    """Poll a status endpoint until a job completes, then download its result.

    Repeatedly GETs *checkURL* until the JSON field *statusField* equals
    *completedStr*, then downloads the URL found in *downloadField* to
    *outputFilePath*. A non-200 status response aborts polling.

    Args:
        checkURL: Status endpoint to poll.
        statusField: JSON key holding the job status string.
        completedStr: Status value that signals completion.
        downloadField: JSON key holding the result download URL.
        outputFilePath: Destination path for the downloaded result.
        recheckDelay: Seconds between polls; clamped to a minimum of 5.

    Returns:
        True if the result was downloaded successfully, False otherwise.
    """
    session = requests.Session()

    def getCompleted() -> tuple[bool, str, str]:
        response = session.get(checkURL)
        if response.status_code != 200:
            logging.warning(f"Failed to retrieve {checkURL}, received status code {response.status_code}. Reason: {response.reason}")
            # Terminal: report "completed" with no status so the caller bails out.
            return True, None, None

        data = response.json()

        statusValue = data.get(statusField, "Unknown")
        downloadURL = data.get(downloadField, "")

        return statusValue == completedStr, statusValue, downloadURL

    loading = "|/-\\"
    frame = 0
    recheckDelay = max(recheckDelay, 5)  # don't hammer the API with sub-5s polls
    reprintsPerSecond = 2

    logging.info(f"Polling {checkURL} for status...")
    completed, status, downloadURL = getCompleted()
    while not completed:
        for _ in range(recheckDelay * reprintsPerSecond):
            print(f"> ({loading[frame % len(loading)]}) Status: {status}", end="\r")
            # BUGFIX: advance the spinner per reprint (the old counter only
            # moved once per recheck cycle, so the spinner appeared frozen).
            frame += 1
            time.sleep(1 / reprintsPerSecond)

        completed, status, downloadURL = getCompleted()

    if status is None:
        logging.error("Failed to check status of download.")
        return False

    return download(downloadURL, outputFilePath, verbose=True)

0 commit comments

Comments
 (0)