
Commit 7738aa6

Merge pull request #244 from ARGA-Genomes/alaLists
ALA Lists
2 parents: eb8df18 + 2c22644

1 file changed: +23 -53 lines changed

@@ -1,58 +1,28 @@
 from pathlib import Path
-import pandas as pd
-import requests
+import lib.downloading as dl
+import lib.bigFiles as bf
+import lib.common as cmn
+
+relevantLists = {
+    "ARGA Threatened Species": "dr23195",
+    "ARGA Useful Species": "dr23194",
+    "ARGA Venomous and Poisonous Species": "dr23195",
+    "ARGA Migratory Species": "dr23193",
+    "ARGA Native Species": "dr23205",
+    "ARGA Milestone Species": "dr23177",
+    "ARGA Edible Species": "dr23094",
+    "ARGA Exotic Species": "dr23197",
+    "ARGA Bushfire Recovery": "dr25948",
+    "ARGA Commercial Species": "dr23169",
+    "ARGA Crop Wild Relatives": "dr23173",
+}
 
 def collect(outputPath: Path) -> None:
-    baseURL = "https://lists-ws.test.ala.org.au/"
-    session = requests.Session()
-    recordsPerPage = 100
-
-    def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1) -> dict:
-        fields = dict(params)
-        fields["page"] = page
-        fields["pageSize"] = pageSize
+    subDir = outputPath.parent / "sections"
+    subDir.mkdir()
 
-        url = f"{baseURL}{endpoint}?" + "&".join(f"{k}={v}" for k, v in fields.items())
-        response = session.get(url)
-        data = response.json()
-        return data
+    for listName, dataResourceUID in relevantLists.items():
+        dl.download(f"https://lists-ws.test.ala.org.au/v2/download/{dataResourceUID}?zip=false", subDir / f"{listName.replace(' ', '_')}.csv", verbose=True)
 
-    listsMetadata = outputPath.parent / "metadata.csv"
-    if not listsMetadata.exists():
-        records = []
-        metadataEndpoint = "speciesList/"
-
-        query = {"tag": "arga"}
-        data = getURL(metadataEndpoint, query, recordsPerPage)
-        records.extend(data["lists"])
-        totalItems = data["listCount"]
-        remainingCalls = ((totalItems / recordsPerPage).__ceil__()) - 1
-
-        for call, _ in enumerate(range(remainingCalls), start=2):
-            data = getURL(metadataEndpoint, query, recordsPerPage, call)
-            records.extend(data["lists"])
-
-        df = pd.DataFrame.from_records(records)
-        df = df.drop(["description"], axis=1)
-        df.to_csv(listsMetadata, index=False)
-    else:
-        df = pd.read_csv(listsMetadata)
-
-    records = []
-    for id in df["id"]:
-        page = 1
-        while True:
-            print(f"Getting page #{page} for id {id}", end="\r")
-            data = getURL(f"speciesListItems/{id}", {}, recordsPerPage, page)
-            if not data:
-                break
-
-            records.extend(data)
-            page += 1
-
-        print()
-
-    df2 = pd.DataFrame.from_records(records)
-    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})
-    df = df.merge(df2, "outer", on="speciesListID")
-    df2.to_csv(outputPath, index=False)
+    bf.combineDirectoryFiles(outputPath, subDir)
+    cmn.clearFolder(subDir, True)
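
Note on the new helpers: collect now delegates to three functions from the repository's internal lib package (lib.downloading.download, lib.bigFiles.combineDirectoryFiles, lib.common.clearFolder) whose implementations are not part of this diff. The sketch below is a minimal, hypothetical reading of those interfaces, inferred only from the call sites above; the names, signatures, and behaviour are assumptions, not the repo's actual code.

# Hypothetical stand-ins for the lib helpers used above, inferred from their
# call sites only; the real implementations live elsewhere in the repository.
from pathlib import Path

import pandas as pd
import requests


def download(url: str, filePath: Path, verbose: bool = False) -> None:
    # Stream the response to disk so large list downloads stay out of memory.
    if verbose:
        print(f"Downloading {url} -> {filePath}")
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filePath, "wb") as fp:
        for chunk in response.iter_content(chunk_size=8192):
            fp.write(chunk)


def combineDirectoryFiles(outputPath: Path, directory: Path) -> None:
    # Concatenate every per-list CSV in the directory into one output file.
    frames = [pd.read_csv(csvFile) for csvFile in sorted(directory.glob("*.csv"))]
    pd.concat(frames, ignore_index=True).to_csv(outputPath, index=False)


def clearFolder(folder: Path, removeFolder: bool = False) -> None:
    # Remove the downloaded section files, and optionally the folder itself.
    for item in folder.iterdir():
        item.unlink()
    if removeFolder:
        folder.rmdir()

Under those assumptions, calling collect(Path("data/raw.csv")) downloads each list to data/sections/<List_Name>.csv, concatenates the sections into data/raw.csv, and removes the intermediate folder.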
