11import re
2- from typing import TYPE_CHECKING , Any
2+ from typing import TYPE_CHECKING , Any , Optional
33
44import openpyxl
5+ from openpyxl .worksheet .worksheet import Worksheet
56
67from bam_masterdata .logger import logger
78
8- # logger = logger()
99
10-
11- def index_to_excel_column (index ):
10+ def index_to_excel_column (index : int ) -> str :
1211 """
1312 Converts a 1-based index to an Excel column name.
1413
@@ -25,7 +24,7 @@ def index_to_excel_column(index):
2524 return column
2625
2726
28- def get_last_non_empty_row (sheet , start_index ) :
27+ def get_last_non_empty_row (sheet : Worksheet , start_index : int ) -> Optional [ int ] :
2928 """
3029 Finds the last non-empty row before encountering a completely empty row.
3130
@@ -50,7 +49,7 @@ def get_last_non_empty_row(sheet, start_index):
5049 return last_non_empty_row # If no empty row is encountered, return the last non-empty row
5150
5251
53- def is_reduced_version (generated_code_value , code ) :
52+ def is_reduced_version (generated_code_value : str , code : str ) -> bool :
5453 """
5554 Check if generated_code_value is a reduced version of code.
5655
@@ -96,7 +95,9 @@ def is_reduced_version(generated_code_value, code):
9695 return True
9796
9897
99- def properties_to_dict (sheet , start_index_row , last_non_empty_row ):
98+ def properties_to_dict (
99+ sheet : Worksheet , start_index_row : int , last_non_empty_row : int
100+ ) -> dict [str , dict [str , Any ]]:
100101 """
101102 Extracts properties from an Entity type block in the Excel sheet and returns them as a dictionary.
102103
@@ -119,6 +120,7 @@ def properties_to_dict(sheet, start_index_row, last_non_empty_row):
119120 "Property label" ,
120121 "Data type" ,
121122 "Vocabulary code" ,
123+ # "Object code",
122124 ]
123125
124126 # Determine the header row index
@@ -280,7 +282,21 @@ def properties_to_dict(sheet, start_index_row, last_non_empty_row):
280282 return property_dict
281283
282284
283- def terms_to_dict (sheet , start_index_row , last_non_empty_row ):
285+ def terms_to_dict (
286+ sheet : Worksheet , start_index_row : int , last_non_empty_row : int
287+ ) -> dict [str , dict [str , Any ]]:
288+ """
289+ Extracts terms from a Vocabulary block in the Excel sheet and returns them as a dictionary.
290+
291+ Args:
292+ sheet: The worksheet object.
293+ start_index_row: Row where the current entity type begins (1-based index).
294+ last_non_empty_row: Row where the current entity type finishes (1-based index).
295+
296+ Returns:
297+ A dictionary where each key is a vocabulary term code and the value is a dictionary
298+ containing the attributes of the vocabulary term.
299+ """
284300 terms_dict = {}
285301 expected_terms = ["Code" , "Description" , "Url template" , "Label" , "Official" ]
286302
@@ -374,12 +390,31 @@ def terms_to_dict(sheet, start_index_row, last_non_empty_row):
374390 return terms_dict
375391
376392
377- def block_to_entity_dict (sheet , start_index_row , last_non_empty_row , complete_dict ):
393+ def block_to_entity_dict (
394+ sheet : Worksheet ,
395+ start_index_row : int ,
396+ last_non_empty_row : int ,
397+ complete_dict : dict [str , Any ],
398+ ) -> dict [str , Any ]:
399+ """
400+ Extracts entity attributes from an Excel sheet block and returns them as a dictionary.
401+
402+ Args:
403+ sheet: The worksheet object.
404+ start_index_row: The row where the current entity type begins (1-based index).
405+ last_non_empty_row: The row where the current entity type finishes (1-based index).
406+ complete_dict: The dictionary to store the extracted entity attributes.
407+
408+ Returns:
409+ A dictionary containing the entity attributes.
410+ """
378411 attributes_dict = {}
379412
413+ # Get the entity type from the specified cell
380414 entity_type_position = f"A{ start_index_row } "
381415 entity_type = sheet [entity_type_position ].value
382416
417+ # Define the valid entity types
383418 entity_types = [
384419 "OBJECT_TYPE" ,
385420 "SAMPLE_TYPE" ,
@@ -389,14 +424,16 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
389424 "VOCABULARY_TYPE" ,
390425 ]
391426
427+ # Get the header terms from the row below the entity type row
392428 header_terms = [cell .value for cell in sheet [start_index_row + 1 ]]
393429
430+ # Check if the entity type is valid
394431 if entity_type not in entity_types :
395432 print (
396433 "The entity type (cell A1) should be one of the following: SAMPLE_TYPE/OBJECT_TYPE, EXPERIMENT_TYPE/COLLECTION_TYPE, DATASET_TYPE, PROPERTY_TYPE, VOCABULARY_TYPE"
397434 )
398- # return "\n".join(errors)
399435 else :
436+ # Process based on the entity type
400437 if entity_type == "SAMPLE_TYPE" or entity_type == "OBJECT_TYPE" :
401438 expected_terms = [
402439 "Code" ,
@@ -478,12 +515,14 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
478515 auto_generate_value = auto_generate_value == "true"
479516 attributes_dict ["autoGeneratedCode" ] = auto_generate_value
480517
518+ # Assign the properties dictionary as a field for the entity dictionary
481519 attributes_dict ["properties" ] = properties_to_dict (
482520 sheet , start_index_row , last_non_empty_row
483521 )
484522
485523 complete_dict [code_value ] = attributes_dict
486524
525+ # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
487526 return dict (
488527 sorted (complete_dict .items (), key = lambda item : item [0 ].count ("." ))
489528 )
@@ -537,12 +576,14 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
537576 validation_value = ""
538577 attributes_dict ["validationPlugin" ] = validation_value
539578
579+ # Assign the properties dictionary as a field for the entity dictionary
540580 attributes_dict ["properties" ] = properties_to_dict (
541581 sheet , start_index_row , last_non_empty_row
542582 )
543583
544584 complete_dict [code_value ] = attributes_dict
545585
586+ # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
546587 return dict (
547588 sorted (complete_dict .items (), key = lambda item : item [0 ].count ("." ))
548589 )
@@ -668,6 +709,7 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
668709
669710 complete_dict [code_value ] = attributes_dict
670711
712+ # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
671713 return dict (
672714 sorted (complete_dict .items (), key = lambda item : item [0 ].count ("." ))
673715 )
@@ -722,20 +764,36 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
722764 url_value = ""
723765 attributes_dict ["url_template" ] = url_value
724766
767+ # Assign the terms dictionary as a field for the vocabulary dictionary
725768 attributes_dict ["terms" ] = terms_to_dict (
726769 sheet , start_index_row , last_non_empty_row
727770 )
728771
729772 complete_dict [code_value ] = attributes_dict
730773
774+ # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
731775 return dict (
732776 sorted (complete_dict .items (), key = lambda item : item [0 ].count ("." ))
733777 )
734778
735779
736- def excel_to_entities (excel_path , output_directory = "./artifacts/tmp/" ):
780+ def excel_to_entities (
781+ excel_path : str , output_directory : str = "./artifacts/tmp/"
782+ ) -> dict [str , dict [str , Any ]]:
783+ """
784+ Extracts entities from an Excel file and returns them as a dictionary.
785+
786+ Args:
787+ excel_path: The path to the Excel file.
788+ output_directory: The directory to store the output files.
789+
790+ Returns:
791+ A dictionary where each key is a normalized sheet name and the value is a dictionary
792+ containing the extracted entities.
793+ """
737794 sheets_dict = {}
738795
796+ # Load the workbook and get the sheet names
739797 workbook = openpyxl .load_workbook (excel_path )
740798 sheet_names = workbook .sheetnames
741799
0 commit comments