Commit 999cdfa

Merge pull request #167 from ARGA-Genomes/develop
Merge Develop into Main
2 parents 398bfd8 + d713815 commit 999cdfa

112 files changed: +1587 / -1879 lines


config.toml

Lines changed: 6 additions & 4 deletions
@@ -1,7 +1,9 @@
-[files]
-
 [folders]
 src = "./src" # Source folder for all python code
 dataSources = "./dataSources" # Location of all source related files
-mapping = "./mapping" # Location for map files
-logs = "./logs" # Location of all logging files
+logs = "./logs" # Location of all logging files
+package = "" # Location for packaged files to be put in, leave blank to leave in respective dataSource location
+
+[settings]
+logToConsole = true
+logLevel = "info" # Log levels: debug, info, warning, error, critical

dataSources/42bp/genomeArk/config.json

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 {
     "datasetID": "ARGA:TL:0000193",
     "retrieveType": "script",
-    "download": {
+    "downloading": {
         "path": "./processing.py",
         "function": "build",
         "args": [
-            "{OUTPATH}",
+            "{OUT-PATH}",
             "./species.json"
         ],
         "output": "genomeArk.csv"

dataSources/42bp/genomeArk/processing.py

Lines changed: 3 additions & 4 deletions
@@ -1,11 +1,11 @@
 import requests
 from pathlib import Path
-import lib.commonFuncs as cmn
+import lib.common as cmn
 import yaml
 import csv
 from yaml.scanner import ScannerError
 import json
-from lib.tools.downloader import Downloader
+import lib.downloading as dl
 
 def build(outputFilePath: Path, savedFilePath: Path) -> None:
     location = "https://42basepairs.com/api/v1/files/s3/genomeark/species/"
@@ -25,7 +25,6 @@ def build(outputFilePath: Path, savedFilePath: Path) -> None:
 
     allData = []
     columns = []
-    downloader = Downloader()
     for species in speciesList:
         name = species.get("name", "")
 
@@ -35,7 +34,7 @@ def build(outputFilePath: Path, savedFilePath: Path) -> None:
         downloadURL = baseDLUrl + name + "metadata.yaml"
         filePath = outputFilePath.parent / f"{name[:-1]}_metadata.yaml"
         if not filePath.exists():
-            success = downloader.download(downloadURL, filePath)
+            success = dl.download(downloadURL, filePath)
             if not success:
                 continue
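
The per-instance Downloader class from lib.tools has been swapped for a module-level lib.downloading helper whose download(url, filePath) call is expected to return a success flag. That module is outside this diff; a rough sketch of a compatible helper, assuming it streams the response with requests, could look like:

import logging
import requests
from pathlib import Path

def download(url: str, outputPath: Path, chunkSize: int = 1024 * 1024) -> bool:
    # Stream the response to disk so large files never sit fully in memory
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(outputPath, "wb") as fp:
                for chunk in response.iter_content(chunk_size=chunkSize):
                    fp.write(chunk)
    except requests.RequestException:
        logging.warning(f"Failed to download {url}")
        return False

    return True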

dataSources/afd/checklist/config.json

Lines changed: 9 additions & 9 deletions
@@ -1,40 +1,40 @@
 {
     "datasetID": "ARGA:TL:0001000",
     "retrieveType": "script",
-    "download": {
+    "downloading": {
         "path": "./processing.py",
         "function": "retrieve",
         "args": [
-            "{OUTPATH}"
+            "{OUT-PATH}"
         ],
         "output": "rawAFD.csv"
     },
     "processing": {
-        "final": [
+        "linear": [
             {
                 "path": "./processing.py",
                 "function": "cleanup",
                 "args": [
-                    "{INPATH}",
-                    "{OUTPATH}"
+                    "{IN-PATH}",
+                    "{OUT-PATH}"
                 ],
                 "output": "cleanedAFD.csv"
             },
             {
                 "path": "./processing.py",
                 "function": "addParents",
                 "args": [
-                    "{INPATH}",
-                    "{OUTPATH}"
+                    "{IN-PATH}",
+                    "{OUT-PATH}"
                 ],
                 "output": "afd.csv"
            },
            {
                "path": "./processing.py",
                "function": "enrich",
                "args": [
-                    "{INPATH}",
-                    "{OUTPATH}"
+                    "{IN-PATH}",
+                    "{OUT-PATH}"
                ],
                "output": "enrichedAFD.csv"
            }

dataSources/afd/checklist/processing.py

Lines changed: 44 additions & 41 deletions
@@ -3,9 +3,9 @@
 from pathlib import Path
 import pandas as pd
 from io import BytesIO
-from lib.tools.bigFileWriter import BigFileWriter, Format
+from lib.bigFileWriter import BigFileWriter, Format
 from bs4 import BeautifulSoup
-from lib.tools.progressBar import SteppableProgressBar
+from lib.progressBar import SteppableProgressBar
 import re
 import traceback
 
@@ -82,19 +82,8 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df = df.drop([
         "CAVS_CODE",
         "CAAB_CODE",
-        "PUB_PUB_AUTHOR",
-        "PUB_PUB_YEAR",
-        "PUB_PUB_TITLE",
-        "PUB_PUB_PAGES",
-        "PUB_PUB_PARENT_BOOK_TITLE",
-        "PUB_PUB_PARENT_JOURNAL_TITLE",
-        "PUB_PUB_PARENT_ARTICLE_TITLE",
-        "PUB_PUB_PUBLICATION_DATE",
-        "PUB_PUB_PUBLISHER",
         "PUB_PUB_FORMATTED",
         "PUB_PUB_QUALIFICATION",
-        "PUB_PUB_TYPE",
-        "PUBLICATION_GUID",
         "PUBLICATION_LAST_UPDATE",
         "PARENT_PUBLICATION_GUID"
     ], axis=1)
@@ -105,9 +94,24 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
         "NAME_GUID": "name_id",
         "TAXON_GUID": "taxon_id",
         "TAXON_LAST_UPDATE": "updated_at",
-        "PARENT_TAXON_GUID": "parent_taxon_id"
+        "PARENT_TAXON_GUID": "parent_taxon_id",
+        "PUB_PUB_AUTHOR": "publication_author",
+        "PUB_PUB_YEAR": "publication_year",
+        "PUB_PUB_TITLE": "publication_title",
+        "PUB_PUB_PAGES": "publication_pages",
+        "PUB_PUB_PUBLICATION_DATE": "publication_date",
+        "PUB_PUB_PUBLISHER": "publisher",
+        "PUB_PUB_TYPE": "publication_type",
+        "PUBLICATION_GUID": "publication_id"
     })
 
+    df["published_media_title"] = df["PUB_PUB_PARENT_BOOK_TITLE"] + df["PUB_PUB_PARENT_JOURNAL_TITLE"] + df["PUB_PUB_PARENT_ARTICLE_TITLE"]
+    df = df.drop([
+        "PUB_PUB_PARENT_BOOK_TITLE",
+        "PUB_PUB_PARENT_JOURNAL_TITLE",
+        "PUB_PUB_PARENT_ARTICLE_TITLE"
+    ], axis=1)
+
     df = df.rename(columns={column: column.lower() for column in df.columns})
     df = df.rename(columns={"qualification": "notes"})
     df = df[df["scientific_name"] != "Unplaced Synonym(s)"]
@@ -147,7 +151,7 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df["canonical_name"] = df.apply(lambda row: f"{row['canonical_genus']} {row['species']}" if row["taxon_rank"] == "Species" else f"{row['canonical_genus']} {row['species']} {row['subspecies']}" if row["taxon_rank"] == "subspecies" else row["names_various"], axis=1)
     df["authorship"] = df.apply(lambda row: f"{row['author']}, {row['year']}" if row["author"] not in ("", "NaN", "nan") else "", axis=1)
     df["scientific_name_authorship"] = df.apply(lambda row: f"({row['authorship']})" if row['orig_combination'] == 'N' and row["authorship"] not in ("", "NaN", "nan") else row["authorship"], axis=1)
-
+
     df.to_csv(outputFilePath, index=False)
 
 def addParents(filePath: Path, outputFilePath: Path) -> None:
@@ -187,36 +191,35 @@ def enrich(filePath: Path, outputFilePath: Path) -> None:
         subDF = df[df["taxon_rank"] == rank]
 
         enrichmentPath = outputFilePath.parent / f"{rank}.csv"
-        if enrichmentPath.exists():
-            continue
+        if not enrichmentPath.exists():
+            writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
+            writer.populateFromFolder(writer.subfileDir)
+            subfileNames = [file.fileName for file in writer.writtenFiles]
 
-        writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
-        writer.populateFromFolder(writer.subfileDir)
-        subfileNames = [file.fileName for file in writer.writtenFiles]
-
-        uniqueSeries = subDF["taxon_id"].unique()
-        uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]
-
-        bar = SteppableProgressBar(len(uniqueSeries), processName=f"{rank} Progress")
-        for taxonID in uniqueSeries:
-            bar.update()
-
-            response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
-            try:
-                records = _parseContent(response.text, taxonID, rank.lower())
-            except:
-                print(taxonID)
-                print(traceback.format_exc())
-                return
+            uniqueSeries = subDF["taxon_id"].unique()
+            uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]
 
-            recordDF = pd.DataFrame.from_records(records)
-            writer.writeDF(recordDF, taxonID)
+            bar = SteppableProgressBar(50, len(uniqueSeries), f"{rank} Progress")
+            for taxonID in uniqueSeries:
+                bar.update()
+
+                response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
+                try:
+                    records = _parseContent(response.text, taxonID, rank.lower())
+                except:
+                    print(taxonID)
+                    print(traceback.format_exc())
+                    return
+
+                recordDF = pd.DataFrame.from_records(records)
+                writer.writeDF(recordDF, taxonID)
+
+            writer.oneFile(False)
 
-        writer.oneFile(False)
         enrichmentDF = pd.read_csv(enrichmentPath, dtype=object)
-        df = df.merge(enrichmentDF, "left", ["taxon_id", rank.lower()])
+        df = df.merge(enrichmentDF, "left", left_on=["taxon_id", "canonical_name"], right_on=["taxon_id", rank.lower()])
 
-        df.to_csv(outputFilePath)
+        df.to_csv(outputFilePath, index=False)
 
 def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
     soup = BeautifulSoup(content, "html.parser")
@@ -285,7 +288,7 @@ def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
         for typeData in synonymData.find_all("div"):
             data[typeData.find("h5").text.lower().replace(" ", "_")[:-1]] = synonymData.find("span").text
 
-        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text.split()[-1]} | data
+        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text} | data
         records.append(record | distributionData | descriptorData)
 
     return records
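
The reworked merge joins on taxon_id plus the frame's canonical_name against the enrichment table's rank-named column (e.g. genus or species), rather than assuming both frames share a column named after the rank. A small illustration of that left_on/right_on pattern in pandas, using made-up rows rather than the AFD tables:

import pandas as pd

df = pd.DataFrame({
    "taxon_id": ["t1", "t2"],
    "canonical_name": ["Macropus rufus", "Macropus giganteus"],
})
enrichmentDF = pd.DataFrame({
    "taxon_id": ["t1", "t2"],
    "species": ["Macropus rufus", "Macropus giganteus"],
    "distribution": ["Arid zones", "Eastern Australia"],
})

# Left join where the key column names differ between the two frames
merged = df.merge(
    enrichmentDF,
    how="left",
    left_on=["taxon_id", "canonical_name"],
    right_on=["taxon_id", "species"],
)
print(merged[["taxon_id", "canonical_name", "distribution"]])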

dataSources/ala/avh/config.json

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 {
     "datasetID": "0007001",
     "retrieveType": "script",
-    "download": {
+    "downloading": {
         "path": "./processing.py",
         "function": "build",
         "args": [
-            "{OUTPATH}"
+            "{OUT-PATH}"
         ],
         "output": "avh.csv"
     },

dataSources/ala/avh/processing.py

Lines changed: 2 additions & 33 deletions
@@ -1,9 +1,7 @@
 import pandas as pd
 import requests
-import json
-import math
 from pathlib import Path
-import lib.dataframeFuncs as dff
+
 
 def build(outputFilePath: Path) -> None:
     baseURL = "https://biocache-ws.ala.org.au/ws/occurrences/search?q=*%3A*&disableAllQualityFilters=true&qualityProfile=AVH&fq=type_status%3A*&fq=country%3A%22Australia%22&qc=data_hub_uid%3Adh9"
@@ -14,7 +12,7 @@ def build(outputFilePath: Path) -> None:
     jsData = rawData.json()
 
     records = jsData["totalRecords"]
-    totalCalls = math.ceil(records / readSize)
+    totalCalls = (records / readSize).__ceil__()
 
     occurrences = []
     for call in range(totalCalls):
@@ -26,32 +24,3 @@ def build(outputFilePath: Path) -> None:
 
     df = pd.DataFrame.from_records(occurrences)
     df.to_csv(outputFilePath, index=False)
-
-def collect(outputDir: Path, profile: str, tokenFilePath: Path) -> None:
-    with open(tokenFilePath) as fp:
-        token = json.load(fp)
-
-    bearerToken = token["access_token"]
-    baseURL = "https://api.ala.org.au/profiles"
-    endpoint = f"/api/opus/{profile}/profile?pageSize=1000"
-    response = requests.get(baseURL + endpoint, headers={"Authorization": f"Bearer {bearerToken}"})
-    data = response.json()
-
-    if "message" in data and "not authorized" in data["message"]:
-        print("Failed to authorize, please make sure bearer token is valid.")
-        return
-
-    print(f"Accessing profile: {profile}")
-
-    records = []
-    for idx, entry in enumerate(data, start=1):
-        uuid = entry["uuid"]
-        print(f"At record: {idx}", end="\r")
-
-        response = requests.get(baseURL + f"/api/opus/{profile}/profile/{uuid}", headers={"Authorization": f"Bearer {bearerToken}"})
-        records.append(response.json())
-    print()
-
-    df = pd.DataFrame.from_records(records)
-    df = dff.removeSpaces(df)
-    df.to_csv(outputDir / f"{profile}.csv", index=False)
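
The math import goes away because (records / readSize).__ceil__() is what math.ceil ends up calling on a float anyway, so both forms give the same count of paginated API calls. A quick check with illustrative numbers:

import math

records, readSize = 1234, 100  # illustrative values, not taken from the API
assert math.ceil(records / readSize) == (records / readSize).__ceil__() == 13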

dataSources/ala/lists/config.json

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 {
     "retrieveType": "script",
-    "download": {
+    "downloading": {
         "path": "./processing.py",
         "function": "collect",
         "args": [
-            "{OUTPATH}"
+            "{OUT-PATH}"
         ],
         "output": "lists.csv"
     },

dataSources/ala/profiles/config.json

Lines changed: 4 additions & 4 deletions
@@ -7,13 +7,13 @@
         "mangrovewatch": {},
         "weeds-australia": {}
     },
-    "download": {
-        "path": "sourceProcessing/ala.py",
+    "downloading": {
+        "path": "../processing.py",
         "function": "collect",
         "args": [
-            "{OUTDIR}",
+            "{OUT-DIR}",
             "{SUBSECTION}",
-            "./token.json"
+            "../token.json"
         ],
         "output": "{SUBSECTION}.csv"
     },

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import json
+import requests
+from pathlib import Path
+import lib.dataframes as dff
+import pandas as pd
+
+def collect(outputDir: Path, profile: str, tokenFilePath: Path) -> None:
+    with open(tokenFilePath) as fp:
+        token = json.load(fp)
+
+    bearerToken = token["access_token"]
+    baseURL = "https://api.ala.org.au/profiles"
+    endpoint = f"/api/opus/{profile}/profile?pageSize=1000"
+    response = requests.get(baseURL + endpoint, headers={"Authorization": f"Bearer {bearerToken}"})
+    data = response.json()
+
+    if "message" in data and "not authorized" in data["message"]:
+        print("Failed to authorize, please make sure bearer token is valid.")
+        return
+
+    print(f"Accessing profile: {profile}")
+
+    records = []
+    for idx, entry in enumerate(data, start=1):
+        uuid = entry["uuid"]
+        print(f"At record: {idx}", end="\r")
+
+        response = requests.get(baseURL + f"/api/opus/{profile}/profile/{uuid}", headers={"Authorization": f"Bearer {bearerToken}"})
+        records.append(response.json())
+    print()
+
+    df = pd.DataFrame.from_records(records)
+    df = dff.removeSpaces(df)
+    df.to_csv(outputDir / f"{profile}.csv", index=False)
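
The profiles config above drives this new collect() function with an output directory, a subsection name, and a token file. A hedged usage sketch, assuming token.json holds a valid ALA access_token; the import line and both paths are illustrative, not the pipeline's actual call site:

from pathlib import Path

from processing import collect  # assumed import path for the module shown above

# Fetch every record of the "weeds-australia" profile and write weeds-australia.csv
collect(
    outputDir=Path("dataSources/ala/profiles"),
    profile="weeds-australia",
    tokenFilePath=Path("dataSources/ala/token.json"),
)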
