Updated NCBI API processing to require an API key

christopher-mangion · christopher-mangion · commit 524ec23d6a77 · 2025-10-07T14:02:09.000+10:00
Updated merge function to not attempt to run unless both input files exist
Updated apiWorker to use the v2 API instead of v2alpha
diff --git a/dataSources/ncbi/llib/apiWorker.py b/dataSources/ncbi/llib/apiWorker.py
@@ -44,7 +44,7 @@ def apiWorker(queue: Queue, id: int, apiKey: str, recordsPerCall: int, accession
 
     try:
         for string in accessionStrings:
-            url = f"https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{string}/dataset_report"
+            url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{string}/dataset_report"
             response = session.get(url, headers=headers, params=params)
             data = response.json()
             records = data.get("reports", [])
diff --git a/dataSources/ncbi/llib/sharedProcessing.py b/dataSources/ncbi/llib/sharedProcessing.py
@@ -13,15 +13,12 @@
 
 def getStats(summaryFile: DataFile, outputPath: Path):
     apiKey = secrets.ncbi.key
+    if not isinstance(apiKey, str):
+        logging.error("No API key found in secrets file, and is required to access NCBI api. Please update 'secrets.toml' with 'key' field under 'ncbi'.")
+        return
+    
+    logging.info("Found API key")
     processes = 10
-    if apiKey is None:
-        apiKey = "" # Use empty string
-        processes = 3 # Reduce processes count due to reduced api call rate
-        logging.info("No API key found")
-
-    else:
-        logging.info("Found API key")
-
     recordsPerCall = 200
     recordsPerSubsection = 30000
     accessionCol = "#assembly_accession"
@@ -68,6 +65,14 @@ def getStats(summaryFile: DataFile, outputPath: Path):
     writer.combine(False, index=False)
 
 def merge(summaryFile: DataFile, statsFilePath: Path, outputPath: Path) -> None:
+    if not summaryFile.exists():
+        logging.error("Unable to merge files as summary file doesn't exist")
+        return
+    
+    if not statsFilePath.exists():
+        logging.error("Unable to merge files as stats file doesn't exist")
+        return
+
     df = summaryFile.read(low_memory=False)
     df2 = pd.read_csv(statsFilePath, low_memory=False)
     df = df.merge(df2, how="outer", left_on="#assembly_accession", right_on="current_accession")