@@ -1,58 +1,28 @@
 from pathlib import Path
-import pandas as pd
-import requests
+import lib.downloading as dl
+import lib.bigFiles as bf
+import lib.common as cmn
+
+relevantLists = {
+    "ARGA Threatened Species": "dr23195",
+    "ARGA Useful Species": "dr23194",
+    "ARGA Venomous and Poisonous Species": "dr23195",
+    "ARGA Migratory Species": "dr23193",
+    "ARGA Native Species": "dr23205",
+    "ARGA Milestone Species": "dr23177",
+    "ARGA Edible Species": "dr23094",
+    "ARGA Exotic Species": "dr23197",
+    "ARGA Bushfire Recovery": "dr25948",
+    "ARGA Commercial Species": "dr23169",
+    "ARGA Crop Wild Relatives": "dr23173",
+}
 
 def collect(outputPath: Path) -> None:
-    baseURL = "https://lists-ws.test.ala.org.au/"
-    session = requests.Session()
-    recordsPerPage = 100
-
-    def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1) -> dict:
-        fields = dict(params)
-        fields["page"] = page
-        fields["pageSize"] = pageSize
+    subDir = outputPath.parent / "sections"
+    subDir.mkdir()
 
-        url = f"{baseURL}{endpoint}?" + "&".join(f"{k}={v}" for k, v in fields.items())
-        response = session.get(url)
-        data = response.json()
-        return data
+    for listName, dataResourceUID in relevantLists.items():
+        dl.download(f"https://lists-ws.test.ala.org.au/v2/download/{dataResourceUID}?zip=false", subDir / f"{listName.replace(' ', '_')}.csv", verbose=True)
 
-    listsMetadata = outputPath.parent / "metadata.csv"
-    if not listsMetadata.exists():
-        records = []
-        metadataEndpoint = "speciesList/"
-
-        query = {"tag": "arga"}
-        data = getURL(metadataEndpoint, query, recordsPerPage)
-        records.extend(data["lists"])
-        totalItems = data["listCount"]
-        remainingCalls = ((totalItems / recordsPerPage).__ceil__()) - 1
-
-        for call, _ in enumerate(range(remainingCalls), start=2):
-            data = getURL(metadataEndpoint, query, recordsPerPage, call)
-            records.extend(data["lists"])
-
-        df = pd.DataFrame.from_records(records)
-        df = df.drop(["description"], axis=1)
-        df.to_csv(listsMetadata, index=False)
-    else:
-        df = pd.read_csv(listsMetadata)
-
-    records = []
-    for id in df["id"]:
-        page = 1
-        while True:
-            print(f"Getting page #{page} for id {id}", end="\r")
-            data = getURL(f"speciesListItems/{id}", {}, recordsPerPage, page)
-            if not data:
-                break
-
-            records.extend(data)
-            page += 1
-
-        print()
-
-    df2 = pd.DataFrame.from_records(records)
-    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})
-    df = df.merge(df2, "outer", on="speciesListID")
-    df2.to_csv(outputPath, index=False)
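
The `dl.download`, `bf.combineDirectoryFiles`, and `cmn.clearFolder` calls come from the repository's own `lib` helpers, which are not part of this diff. Below is a minimal sketch of the behaviour the new `collect()` appears to rely on, assuming `dl.download` streams a URL to a file, `bf.combineDirectoryFiles` concatenates the per-list CSVs into a single file, and `cmn.clearFolder` removes the temporary folder; the names, signatures, and bodies are illustrative stand-ins, not the actual `lib` implementations.

# Hypothetical stand-ins for the lib.downloading, lib.bigFiles and lib.common
# helpers used above; sketched with the standard library and requests only.
from pathlib import Path
import requests


def download(url: str, outputFile: Path, verbose: bool = False) -> None:
    # Stream the response to disk so large lists are not held in memory.
    if verbose:
        print(f"Downloading {url} -> {outputFile}")
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(outputFile, "wb") as fp:
            for chunk in response.iter_content(chunk_size=8192):
                fp.write(chunk)


def combineDirectoryFiles(outputPath: Path, directory: Path) -> None:
    # Concatenate every per-list CSV, keeping only the first file's header row.
    # Assumes all lists share the same columns.
    with open(outputPath, "w", encoding="utf-8") as out:
        for index, csvFile in enumerate(sorted(directory.glob("*.csv"))):
            with open(csvFile, encoding="utf-8") as fp:
                lines = fp.readlines()
            out.writelines(lines if index == 0 else lines[1:])


def clearFolder(directory: Path, deleteFolder: bool = False) -> None:
    # Remove the downloaded per-list files, and the folder itself if requested.
    for item in directory.iterdir():
        item.unlink()
    if deleteFolder:
        directory.rmdir()

The net effect of the change: instead of crawling the lists-ws `speciesList/` and `speciesListItems/` endpoints page by page with `requests` and merging the results with pandas, `collect()` now downloads each ARGA list directly as CSV from the `/v2/download/{dataResourceUID}?zip=false` endpoint into a temporary `sections` folder, combines the per-list files into `outputPath`, and deletes the temporary folder.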