 from pathlib import Path
 import pandas as pd
 from io import BytesIO
-from lib.tools.bigFileWriter import BigFileWriter, Format
+from lib.bigFileWriter import BigFileWriter, Format
 from bs4 import BeautifulSoup
-from lib.tools.progressBar import SteppableProgressBar
+from lib.progressBar import SteppableProgressBar
 import re
 import traceback

@@ -82,19 +82,8 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df = df.drop([
         "CAVS_CODE",
         "CAAB_CODE",
-        "PUB_PUB_AUTHOR",
-        "PUB_PUB_YEAR",
-        "PUB_PUB_TITLE",
-        "PUB_PUB_PAGES",
-        "PUB_PUB_PARENT_BOOK_TITLE",
-        "PUB_PUB_PARENT_JOURNAL_TITLE",
-        "PUB_PUB_PARENT_ARTICLE_TITLE",
-        "PUB_PUB_PUBLICATION_DATE",
-        "PUB_PUB_PUBLISHER",
         "PUB_PUB_FORMATTED",
         "PUB_PUB_QUALIFICATION",
-        "PUB_PUB_TYPE",
-        "PUBLICATION_GUID",
         "PUBLICATION_LAST_UPDATE",
         "PARENT_PUBLICATION_GUID"
     ], axis=1)
@@ -105,9 +94,24 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
         "NAME_GUID": "name_id",
         "TAXON_GUID": "taxon_id",
         "TAXON_LAST_UPDATE": "updated_at",
-        "PARENT_TAXON_GUID": "parent_taxon_id"
+        "PARENT_TAXON_GUID": "parent_taxon_id",
+        "PUB_PUB_AUTHOR": "publication_author",
+        "PUB_PUB_YEAR": "publication_year",
+        "PUB_PUB_TITLE": "publication_title",
+        "PUB_PUB_PAGES": "publication_pages",
+        "PUB_PUB_PUBLICATION_DATE": "publication_date",
+        "PUB_PUB_PUBLISHER": "publisher",
+        "PUB_PUB_TYPE": "publication_type",
+        "PUBLICATION_GUID": "publication_id"
     })

+    df["published_media_title"] = df["PUB_PUB_PARENT_BOOK_TITLE"] + df["PUB_PUB_PARENT_JOURNAL_TITLE"] + df["PUB_PUB_PARENT_ARTICLE_TITLE"]
+    df = df.drop([
+        "PUB_PUB_PARENT_BOOK_TITLE",
+        "PUB_PUB_PARENT_JOURNAL_TITLE",
+        "PUB_PUB_PARENT_ARTICLE_TITLE"
+    ], axis=1)
+
     df = df.rename(columns={column: column.lower() for column in df.columns})
     df = df.rename(columns={"qualification": "notes"})
     df = df[df["scientific_name"] != "Unplaced Synonym(s)"]
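Note on the published_media_title line above: pandas "+" on string columns concatenates element-wise, and any NaN operand makes the whole result NaN, so summing the three parent-title columns only picks out the populated title if the empty ones hold "" rather than NaN. A minimal sketch of the behaviour (toy data, not from the dataset):

import pandas as pd

df = pd.DataFrame({
    "PUB_PUB_PARENT_BOOK_TITLE": ["A Book", ""],
    "PUB_PUB_PARENT_JOURNAL_TITLE": ["", "A Journal"],
    "PUB_PUB_PARENT_ARTICLE_TITLE": ["", ""],
})

# Element-wise concatenation; with mutually exclusive non-empty values
# this yields whichever title column is populated per row.
combined = (df["PUB_PUB_PARENT_BOOK_TITLE"]
            + df["PUB_PUB_PARENT_JOURNAL_TITLE"]
            + df["PUB_PUB_PARENT_ARTICLE_TITLE"])
print(combined.tolist())  # ['A Book', 'A Journal']

# If NaN can appear instead of "", fill before concatenating,
# otherwise the combined value comes out NaN:
combined = df.fillna("").sum(axis=1)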
@@ -147,7 +151,7 @@ def cleanup(filePath: Path, outputFilePath: Path) -> None:
     df["canonical_name"] = df.apply(lambda row: f"{row['canonical_genus']} {row['species']}" if row["taxon_rank"] == "Species" else f"{row['canonical_genus']} {row['species']} {row['subspecies']}" if row["taxon_rank"] == "subspecies" else row["names_various"], axis=1)
     df["authorship"] = df.apply(lambda row: f"{row['author']}, {row['year']}" if row["author"] not in ("", "NaN", "nan") else "", axis=1)
     df["scientific_name_authorship"] = df.apply(lambda row: f"({row['authorship']})" if row['orig_combination'] == 'N' and row["authorship"] not in ("", "NaN", "nan") else row["authorship"], axis=1)
-
+
     df.to_csv(outputFilePath, index=False)

 def addParents(filePath: Path, outputFilePath: Path) -> None:
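Note on the authorship lines in this hunk: parenthesising the authorship when orig_combination is 'N' follows the zoological convention that author and year are cited in parentheses once a species no longer sits in its original genus combination (e.g. Balaena musculus Linnaeus, 1758 becomes Balaenoptera musculus (Linnaeus, 1758)). The comparisons against the literal strings "NaN"/"nan" suggest the frame has been round-tripped through CSV, where missing values arrive as text.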
@@ -187,36 +191,35 @@ def enrich(filePath: Path, outputFilePath: Path) -> None:
         subDF = df[df["taxon_rank"] == rank]

         enrichmentPath = outputFilePath.parent / f"{rank}.csv"
-        if enrichmentPath.exists():
-            continue
+        if not enrichmentPath.exists():
+            writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
+            writer.populateFromFolder(writer.subfileDir)
+            subfileNames = [file.fileName for file in writer.writtenFiles]

-        writer = BigFileWriter(enrichmentPath, rank, subfileType=Format.CSV)
-        writer.populateFromFolder(writer.subfileDir)
-        subfileNames = [file.fileName for file in writer.writtenFiles]
-
-        uniqueSeries = subDF["taxon_id"].unique()
-        uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]
-
-        bar = SteppableProgressBar(len(uniqueSeries), processName=f"{rank} Progress")
-        for taxonID in uniqueSeries:
-            bar.update()
-
-            response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
-            try:
-                records = _parseContent(response.text, taxonID, rank.lower())
-            except:
-                print(taxonID)
-                print(traceback.format_exc())
-                return
+            uniqueSeries = subDF["taxon_id"].unique()
+            uniqueSeries = [item for item in uniqueSeries if item not in subfileNames]

-            recordDF = pd.DataFrame.from_records(records)
-            writer.writeDF(recordDF, taxonID)
+            bar = SteppableProgressBar(50, len(uniqueSeries), f"{rank} Progress")
+            for taxonID in uniqueSeries:
+                bar.update()
+
+                response = session.get(f"https://biodiversity.org.au/afd/taxa/{taxonID}/complete")
+                try:
+                    records = _parseContent(response.text, taxonID, rank.lower())
+                except:
+                    print(taxonID)
+                    print(traceback.format_exc())
+                    return
+
+                recordDF = pd.DataFrame.from_records(records)
+                writer.writeDF(recordDF, taxonID)
+
+            writer.oneFile(False)

-        writer.oneFile(False)
         enrichmentDF = pd.read_csv(enrichmentPath, dtype=object)
-        df = df.merge(enrichmentDF, "left", ["taxon_id", rank.lower()])
+        df = df.merge(enrichmentDF, "left", left_on=["taxon_id", "canonical_name"], right_on=["taxon_id", rank.lower()])

-    df.to_csv(outputFilePath)
+    df.to_csv(outputFilePath, index=False)

 def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
     soup = BeautifulSoup(content, "html.parser")
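Note on the merge change above: with left_on/right_on the enrichment rows now join on taxon_id plus the full canonical_name, matched against the rank-named column that _parseContent emits (e.g. "species"), instead of requiring the taxon frame to carry a column literally named after the rank. A minimal sketch of the left_on/right_on semantics (toy frames, column values assumed):

import pandas as pd

taxa = pd.DataFrame({"taxon_id": ["t1", "t2"],
                     "canonical_name": ["Aus bus", "Aus cus"]})
scraped = pd.DataFrame({"taxon_id": ["t1"],
                        "species": ["Aus bus"],
                        "extra": ["..."]})

# Key columns pair up by position: taxon_id <-> taxon_id,
# canonical_name <-> species; left rows without a match keep NaN.
merged = taxa.merge(scraped, "left",
                    left_on=["taxon_id", "canonical_name"],
                    right_on=["taxon_id", "species"])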
@@ -285,7 +288,7 @@ def _parseContent(content: str, taxonID: str, rank: str) -> list[dict]:
         for typeData in synonymData.find_all("div"):
             data[typeData.find("h5").text.lower().replace(" ", "_")[:-1]] = synonymData.find("span").text

-        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text.split()[-1]} | data
+        record = {"taxon_id": taxonID, rank: synonymTitle.find("strong").text} | data
         records.append(record | distributionData | descriptorData)

     return records
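Note on the last hunk: keeping synonymTitle.find("strong").text whole, rather than its final whitespace-split token, records the complete synonym name instead of just the terminal epithet, which is what lets these rows match the full canonical_name key used in the enrich merge above. Illustrated with bs4 on hypothetical markup:

from bs4 import BeautifulSoup

title = BeautifulSoup("<h4><strong>Aus bus</strong></h4>", "html.parser")
strong = title.find("strong")
print(strong.text.split()[-1])  # 'bus'     - previous behaviour, last token only
print(strong.text)              # 'Aus bus' - new behaviour, full name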