|
| 1 | +from pathlib import Path |
| 2 | +import pandas as pd |
| 3 | +import requests |
| 4 | + |
def collect(outputPath: Path) -> None:
    """Download the ALA species lists tagged "arga" and save them as one CSV.

    List metadata comes from the ``speciesList/`` endpoint and is cached as
    ``metadata.csv`` next to ``outputPath``; when that file already exists it
    is read instead of querying the service again. The items of every list
    are then paged through ``speciesListItems/<id>`` until an empty page is
    returned, joined to their list's metadata on ``speciesListID``, and the
    merged table is written to ``outputPath``.

    Args:
        outputPath: Destination CSV file for the merged table.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If a request exceeds the timeout.
    """
    baseURL = "https://lists-ws.test.ala.org.au/"
    recordsPerPage = 100
    requestTimeout = 60  # seconds; without a timeout a stalled connection hangs forever

    # The context manager closes the pooled connections when we are done.
    with requests.Session() as session:

        def getURL(endpoint: str, params: dict, pageSize: int, page: int = 1):
            """Fetch one page from ``endpoint`` and return the decoded JSON body."""
            fields = dict(params)  # copy so the caller's dict is never mutated
            fields["page"] = page
            fields["pageSize"] = pageSize

            # Let requests build the query string so values are URL-encoded.
            response = session.get(f"{baseURL}{endpoint}", params=fields, timeout=requestTimeout)
            # Fail loudly on HTTP errors instead of treating an error payload as data.
            response.raise_for_status()
            return response.json()

        listsMetadata = outputPath.parent / "metadata.csv"
        if listsMetadata.exists():
            df = pd.read_csv(listsMetadata)
        else:
            metadataEndpoint = "speciesList/"
            query = {"tag": "arga"}

            data = getURL(metadataEndpoint, query, recordsPerPage)
            records = list(data["lists"])

            # Ceiling division: number of pages needed to cover every list.
            totalPages = -(-data["listCount"] // recordsPerPage)
            for page in range(2, totalPages + 1):
                data = getURL(metadataEndpoint, query, recordsPerPage, page)
                records.extend(data["lists"])

            df = pd.DataFrame.from_records(records)
            df = df.drop(columns=["description"])
            df.to_csv(listsMetadata, index=False)

        records = []
        for listID in df["id"]:
            page = 1
            while True:
                print(f"Getting page #{page} for id {listID}", end="\r")
                data = getURL(f"speciesListItems/{listID}", {}, recordsPerPage, page)
                if not data:  # an empty page marks the end of this list
                    break

                records.extend(data)
                page += 1

            print()  # finish the carriage-return progress line

    df2 = pd.DataFrame.from_records(records)
    # Rename so the list-level columns cannot collide with item-level ones.
    df = df.rename(columns={"id": "speciesListID", "version": "speciesListVersion"})
    # With no items at all df2 has no join column; keep the metadata on its own.
    if "speciesListID" in df2.columns:
        df = df.merge(df2, "outer", on="speciesListID")

    # Write the merged table; previously the merge result was discarded and
    # only the raw items frame was saved.
    df.to_csv(outputPath, index=False)
0 commit comments