 from pathlib import Path
 import pandas as pd
 from io import BytesIO
-from lib.tools.bigFileWriter import BigFileWriter, Format
+from lib.bigFileWriter import BigFileWriter, Format
 from bs4 import BeautifulSoup
-from lib.tools.progressBar import SteppableProgressBar
+from lib.progressBar import SteppableProgressBar
 import re
 import traceback

@@ -82,19 +82,8 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df = df.drop([
         "CAVS_CODE",
         "CAAB_CODE",
-        "PUB_PUB_AUTHOR",
-        "PUB_PUB_YEAR",
-        "PUB_PUB_TITLE",
-        "PUB_PUB_PAGES",
-        "PUB_PUB_PARENT_BOOK_TITLE",
-        "PUB_PUB_PARENT_JOURNAL_TITLE",
-        "PUB_PUB_PARENT_ARTICLE_TITLE",
-        "PUB_PUB_PUBLICATION_DATE",
-        "PUB_PUB_PUBLISHER",
         "PUB_PUB_FORMATTED",
         "PUB_PUB_QUALIFICATION",
-        "PUB_PUB_TYPE",
-        "PUBLICATION_GUID",
         "PUBLICATION_LAST_UPDATE",
         "PARENT_PUBLICATION_GUID"
     ], axis=1)
@@ -105,9 +94,24 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
         "NAME_GUID": "name_id",
         "TAXON_GUID": "taxon_id",
         "TAXON_LAST_UPDATE": "updated_at",
-        "PARENT_TAXON_GUID": "parent_taxon_id"
+        "PARENT_TAXON_GUID": "parent_taxon_id",
+        "PUB_PUB_AUTHOR": "publication_author",
+        "PUB_PUB_YEAR": "publication_year",
+        "PUB_PUB_TITLE": "publication_title",
+        "PUB_PUB_PAGES": "publication_pages",
+        "PUB_PUB_PUBLICATION_DATE": "publication_date",
+        "PUB_PUB_PUBLISHER": "publisher",
+        "PUB_PUB_TYPE": "publication_type",
+        "PUBLICATION_GUID": "publication_id"
     })

+    df["published_media_title"] = df["PUB_PUB_PARENT_BOOK_TITLE"] + df["PUB_PUB_PARENT_JOURNAL_TITLE"] + df["PUB_PUB_PARENT_ARTICLE_TITLE"]
+    df = df.drop([
+        "PUB_PUB_PARENT_BOOK_TITLE",
+        "PUB_PUB_PARENT_JOURNAL_TITLE",
+        "PUB_PUB_PARENT_ARTICLE_TITLE"
+    ], axis=1)
+
     df = df.rename(columns={column: column.lower() for column in df.columns})
     df = df.rename(columns={"qualification": "notes"})
     df = df[df["scientific_name"] != "Unplaced Synonym(s)"]
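Note on the published_media_title line above: pandas "+" on string columns concatenates element-wise, and any NaN operand makes the whole result NaN, so summing the three parent-title columns only picks out the populated title if the empty ones hold "" rather than NaN. A minimal sketch of the behaviour (toy data, not from the dataset):

import pandas as pd

df = pd.DataFrame({
    "PUB_PUB_PARENT_BOOK_TITLE": ["A Book", ""],
    "PUB_PUB_PARENT_JOURNAL_TITLE": ["", "A Journal"],
    "PUB_PUB_PARENT_ARTICLE_TITLE": ["", ""],
})

# Element-wise concatenation; with mutually exclusive non-empty values
# this yields whichever title column is populated per row.
combined = (df["PUB_PUB_PARENT_BOOK_TITLE"]
            + df["PUB_PUB_PARENT_JOURNAL_TITLE"]
            + df["PUB_PUB_PARENT_ARTICLE_TITLE"])
print(combined.tolist())  # ['A Book', 'A Journal']

# If NaN can appear instead of "", fill before concatenating,
# otherwise the combined value comes out NaN:
combined = df.fillna("").sum(axis=1)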
@@ -147,7 +151,7 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df["canonical_name"] = df.apply(lambda row: f"{row['canonical_genus']} {row['species']}" if row["taxon_rank"] == "Species" else f"{row['canonical_genus']} {row['species']} {row['subspecies']}" if row["taxon_rank"] == "subspecies" else row["names_various"], axis=1)
     df["authorship"] = df.apply(lambda row: f"{row['author']}, {row['year']}" if row["author"] not in ("", "NaN", "nan") else "", axis=1)
     df["scientific_name_authorship"] = df.apply(lambda row: f"({row['authorship']})" if row['orig_combination'] == 'N' and row["authorship"] not in ("", "NaN", "nan") else row["authorship"], axis=1)
-
+
     df.to_csv(outputFilePath, index=False)

 def addParents(filePath: Path, outputFilePath: Path) -> None:
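Note on the authorship lines in this hunk: parenthesising the authorship when orig_combination is 'N' follows the zoological convention that author and year are cited in parentheses once a species no longer sits in its original genus combination (e.g. Balaena musculus Linnaeus, 1758 becomes Balaenoptera musculus (Linnaeus, 1758)). The comparisons against the literal strings "NaN"/"nan" suggest the frame has been round-tripped through CSV, where missing values arrive as text.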
@@ -187,36 +191,35 @@ def enrich(filePath: Path, outputFilePath: Path) -> None:
         subDF = df[df["taxon_rank"] == rank]

         enrichmentPath = outputFilePath.parent / f"{rank}.csv"
-        if enrichmentPath.exists():
-            continue
+        if not enrichmentPath.exists():
+            writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
+            writer.populateFromFolder(writer.subfileDir)
+            subfileNames = [file.fileName for file in writer.writtenFiles]

-        writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
-        writer.populateFromFolder(writer.subfileDir)
-        subfileNames = [file.fileName for file in writer.writtenFiles]
-
-        uniqueSeries = subDF["taxon_id"].unique()
-        uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]
-
-        bar = SteppableProgressBar(len(uniqueSeries), processName=f"{rank} Progress")
-        for taxonID in uniqueSeries:
-            bar.update()
-
-            response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
-            try:
-                records = _parseContent(response.text, taxonID, rank.lower())
-            except:
-                print(taxonID)
-                print(traceback.format_exc())
-                return
+            uniqueSeries = subDF["taxon_id"].unique()
+            uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]

-            recordDF = pd.DataFrame.from_records(records)
-            writer.writeDF(recordDF, taxonID)
+            bar = SteppableProgressBar(50, len(uniqueSeries), f"{rank} Progress")
+            for taxonID in uniqueSeries:
+                bar.update()
+
+                response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
+                try:
+                    records = _parseContent(response.text, taxonID, rank.lower())
+                except:
+                    print(taxonID)
+                    print(traceback.format_exc())
+                    return
+
+                recordDF = pd.DataFrame.from_records(records)
+                writer.writeDF(recordDF, taxonID)
+
+            writer.oneFile(False)

-        writer.oneFile(False)
         enrichmentDF = pd.read_csv(enrichmentPath, dtype=object)
-        df = df.merge(enrichmentDF, "left", ["taxon_id", rank.lower()])
+        df = df.merge(enrichmentDF, "left", left_on=["taxon_id", "canonical_name"], right_on=["taxon_id", rank.lower()])

-    df.to_csv(outputFilePath)
+    df.to_csv(outputFilePath, index=False)

 def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
     soup = BeautifulSoup(content, "html.parser")
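Note on the merge change above: with left_on/right_on the enrichment rows now join on taxon_id plus the full canonical_name, matched against the rank-named column that _parseContent emits (e.g. "species"), instead of requiring the taxon frame to carry a column literally named after the rank. A minimal sketch of the left_on/right_on semantics (toy frames, column values assumed):

import pandas as pd

taxa = pd.DataFrame({"taxon_id": ["t1", "t2"],
                     "canonical_name": ["Aus bus", "Aus cus"]})
scraped = pd.DataFrame({"taxon_id": ["t1"],
                        "species": ["Aus bus"],
                        "extra": ["..."]})

# Key columns pair up by position: taxon_id <-> taxon_id,
# canonical_name <-> species; left rows without a match keep NaN.
merged = taxa.merge(scraped, "left",
                    left_on=["taxon_id", "canonical_name"],
                    right_on=["taxon_id", "species"])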
@@ -285,7 +288,7 @@ def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
         for typeData in synonymData.find_all("div"):
             data[typeData.find("h5").text.lower().replace(" ", "_")[:-1]] = synonymData.find("span").text

-        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text.split()[-1]} | data
+        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text} | data
         records.append(record | distributionData | descriptorData)

     return records
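Note on the last hunk: keeping synonymTitle.find("strong").text whole, rather than its final whitespace-split token, records the complete synonym name instead of just the terminal epithet, which is what lets these rows match the full canonical_name key used in the enrich merge above. Illustrated with bs4 on hypothetical markup:

from bs4 import BeautifulSoup

title = BeautifulSoup("<h4><strong>Aus bus</strong></h4>", "html.parser")
strong = title.find("strong")
print(strong.text.split()[-1])  # 'bus'     - previous behaviour, last token only
print(strong.text)              # 'Aus bus' - new behaviour, full name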