Skip to content

Commit c568c26

Browse files
Merge pull request #141 from ARGA-Genomes/ALALists
ALA Lists
2 parents 241c3d7 + c470319 commit c568c26

File tree

2 files changed

+70
-0
lines changed

2 files changed

+70
-0
lines changed

dataSources/ala/lists/config.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"retrieveType": "script",
3+
"download": {
4+
"path": "./processing.py",
5+
"function": "collect",
6+
"args": [
7+
"{OUTPATH}"
8+
],
9+
"output": "lists.csv"
10+
},
11+
"conversion": {}
12+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from pathlib import Path
2+
import pandas as pd
3+
import requests
4+
5+
def collect(outputPath: Path) -> None:
    """Download all ALA species lists tagged "arga" and write them to *outputPath*.

    Two-stage collection against the ALA lists web service:
      1. Fetch (or reload from a cached ``metadata.csv`` next to *outputPath*)
         the metadata of every list tagged "arga".
      2. Page through every item of each list and write one merged CSV that
         joins each item row with its list's metadata.

    Args:
        outputPath: Destination CSV path; its parent directory is also used
            for the ``metadata.csv`` cache.
    """
    baseURL = "https://lists-ws.test.ala.org.au/"
    session = requests.Session()
    recordsPerPage = 100

    def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1) -> dict:
        # Let requests build and percent-encode the query string; the previous
        # hand-joined "k=v" string broke on values needing URL escaping.
        fields = dict(params)
        fields["page"] = page
        fields["pageSize"] = pageSize
        response = session.get(f"{baseURL}{endpoint}", params=fields)
        return response.json()

    # Stage 1: list metadata, cached on disk so reruns skip the API calls.
    listsMetadata = outputPath.parent / "metadata.csv"
    if not listsMetadata.exists():
        records = []
        metadataEndpoint = "speciesList/"
        query = {"tag": "arga"}

        data = getURL(metadataEndpoint, query, recordsPerPage)
        records.extend(data["lists"])
        totalItems = data["listCount"]
        # Integer ceil-division replaces the float `(x / y).__ceil__()` trick;
        # minus one because the first page was already fetched above.
        remainingCalls = -(-totalItems // recordsPerPage) - 1

        for pageNumber in range(2, remainingCalls + 2):
            data = getURL(metadataEndpoint, query, recordsPerPage, pageNumber)
            records.extend(data["lists"])

        df = pd.DataFrame.from_records(records)
        df = df.drop(["description"], axis=1)
        df.to_csv(listsMetadata, index=False)
    else:
        df = pd.read_csv(listsMetadata)

    # Stage 2: page through every item of every list until an empty page.
    records = []
    for listID in df["id"]:  # renamed from `id` to avoid shadowing the builtin
        page = 1
        while True:
            print(f"Getting page #{page} for id {listID}", end="\r")
            data = getURL(f"speciesListItems/{listID}", {}, recordsPerPage, page)
            if not data:
                break

            records.extend(data)
            page += 1

        print()

    df2 = pd.DataFrame.from_records(records)
    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})
    df = df.merge(df2, "outer", on="speciesListID")
    # BUG FIX: the original wrote the unmerged `df2`, discarding the merge it
    # had just computed; write the merged frame so list metadata is kept.
    df.to_csv(outputPath, index=False)

0 commit comments

Comments (0)