Added missing docstrings, comments, typings and review notes

carlosmada22 · carlosmada22 · commit ca498d31d187 · 2025-01-27T14:13:59.000+01:00
diff --git a/bam_masterdata/cli/cli.py b/bam_masterdata/cli/cli.py
@@ -39,7 +39,7 @@ def cli():
     """,
 )
 @click.option(
-    "--path",
+    "--excel-file",
     type=click.Path(exists=True, dir_okay=False),
     required=False,
     help="""
@@ -50,7 +50,7 @@ def fill_masterdata(url, path):
     start_time = time.time()
 
     # Define output directory
-    output_directory = "./artifacts/tmp/" if path else DATAMODEL_DIR
+    output_directory = os.path.join(DATAMODEL_DIR, "tmp") if path else DATAMODEL_DIR
 
     # Ensure the output directory exists
     os.makedirs(output_directory, exist_ok=True)
diff --git a/bam_masterdata/cli/excel_to_entities.py b/bam_masterdata/cli/excel_to_entities.py
@@ -1,14 +1,13 @@
 import re
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional, dict
 
 import openpyxl
+from openpyxl.worksheet.worksheet import Worksheet
 
 from bam_masterdata.logger import logger
 
-# logger = logger()
 
-
-def index_to_excel_column(index):
+def index_to_excel_column(index: int) -> str:
     """
     Converts a 1-based index to an Excel column name.
 
@@ -25,7 +24,7 @@ def index_to_excel_column(index):
     return column
 
 
-def get_last_non_empty_row(sheet, start_index):
+def get_last_non_empty_row(sheet: Worksheet, start_index: int) -> Optional[int]:
     """
     Finds the last non-empty row before encountering a completely empty row.
 
@@ -50,7 +49,7 @@ def get_last_non_empty_row(sheet, start_index):
     return last_non_empty_row  # If no empty row is encountered, return the last non-empty row
 
 
-def is_reduced_version(generated_code_value, code):
+def is_reduced_version(generated_code_value: str, code: str) -> bool:
     """
     Check if generated_code_value is a reduced version of code.
 
@@ -96,7 +95,9 @@ def is_reduced_version(generated_code_value, code):
     return True
 
 
-def properties_to_dict(sheet, start_index_row, last_non_empty_row):
+def properties_to_dict(
+    sheet: Worksheet, start_index_row: int, last_non_empty_row: int
+) -> dict[str, dict[str, Any]]:
     """
     Extracts properties from an Entity type block in the Excel sheet and returns them as a dictionary.
 
@@ -119,6 +120,7 @@ def properties_to_dict(sheet, start_index_row, last_non_empty_row):
         "Property label",
         "Data type",
         "Vocabulary code",
+        # "Object code",
     ]
 
     # Determine the header row index
@@ -280,7 +282,21 @@ def properties_to_dict(sheet, start_index_row, last_non_empty_row):
     return property_dict
 
 
-def terms_to_dict(sheet, start_index_row, last_non_empty_row):
+def terms_to_dict(
+    sheet: Worksheet, start_index_row: int, last_non_empty_row: int
+) -> dict[str, dict[str, Any]]:
+    """
+    Extracts terms from a Vocabulary block in the Excel sheet and returns them as a dictionary.
+
+    Args:
+        sheet: The worksheet object.
+        start_index_row: Row where the current entity type begins (1-based index).
+        last_non_empty_row: Row where the current entity type finishes (1-based index).
+
+    Returns:
+        A dictionary where each key is a vocabulary term code and the value is a dictionary
+        containing the attributes of the vocabulary term.
+    """
     terms_dict = {}
     expected_terms = ["Code", "Description", "Url template", "Label", "Official"]
 
@@ -374,12 +390,31 @@ def terms_to_dict(sheet, start_index_row, last_non_empty_row):
     return terms_dict
 
 
-def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_dict):
+def block_to_entity_dict(
+    sheet: Worksheet,
+    start_index_row: int,
+    last_non_empty_row: int,
+    complete_dict: dict[str, Any],
+) -> dict[str, Any]:
+    """
+    Extracts entity attributes from an Excel sheet block and returns them as a dictionary.
+
+    Args:
+        sheet: The worksheet object.
+        start_index_row: The row where the current entity type begins (1-based index).
+        last_non_empty_row: The row where the current entity type finishes (1-based index).
+        complete_dict: The dictionary to store the extracted entity attributes.
+
+    Returns:
+        A dictionary containing the entity attributes.
+    """
     attributes_dict = {}
 
+    # Get the entity type from the specified cell
     entity_type_position = f"A{start_index_row}"
     entity_type = sheet[entity_type_position].value
 
+    # Define the valid entity types
     entity_types = [
         "OBJECT_TYPE",
         "SAMPLE_TYPE",
@@ -389,14 +424,16 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
         "VOCABULARY_TYPE",
     ]
 
+    # Get the header terms from the row below the entity type row
     header_terms = [cell.value for cell in sheet[start_index_row + 1]]
 
+    # Check if the entity type is valid
     if entity_type not in entity_types:
         print(
             "The entity type (cell A1) should be one of the following: SAMPLE_TYPE/OBJECT_TYPE, EXPERIMENT_TYPE/COLLECTION_TYPE, DATASET_TYPE, PROPERTY_TYPE, VOCABULARY_TYPE"
         )
-        # return "\n".join(errors)
     else:
+        # Process based on the entity type
         if entity_type == "SAMPLE_TYPE" or entity_type == "OBJECT_TYPE":
             expected_terms = [
                 "Code",
@@ -478,12 +515,14 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
                         auto_generate_value = auto_generate_value == "true"
                         attributes_dict["autoGeneratedCode"] = auto_generate_value
 
+            # Assign the properties dictionary as a field for the entity dictionary
             attributes_dict["properties"] = properties_to_dict(
                 sheet, start_index_row, last_non_empty_row
             )
 
             complete_dict[code_value] = attributes_dict
 
+            # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
             return dict(
                 sorted(complete_dict.items(), key=lambda item: item[0].count("."))
             )
@@ -537,12 +576,14 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
                             validation_value = ""
                         attributes_dict["validationPlugin"] = validation_value
 
+            # Assign the properties dictionary as a field for the entity dictionary
             attributes_dict["properties"] = properties_to_dict(
                 sheet, start_index_row, last_non_empty_row
             )
 
             complete_dict[code_value] = attributes_dict
 
+            # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
             return dict(
                 sorted(complete_dict.items(), key=lambda item: item[0].count("."))
             )
@@ -668,6 +709,7 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
 
             complete_dict[code_value] = attributes_dict
 
+            # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
             return dict(
                 sorted(complete_dict.items(), key=lambda item: item[0].count("."))
             )
@@ -722,20 +764,36 @@ def block_to_entity_dict(sheet, start_index_row, last_non_empty_row, complete_di
                             url_value = ""
                         attributes_dict["url_template"] = url_value
 
+            # Assign the terms dictionary as a field for the vocabulary dictionary
             attributes_dict["terms"] = terms_to_dict(
                 sheet, start_index_row, last_non_empty_row
             )
 
             complete_dict[code_value] = attributes_dict
 
+            # Return the sorted dictionary (by inheritance, using dots "." as criteria for sorting)
             return dict(
                 sorted(complete_dict.items(), key=lambda item: item[0].count("."))
             )
 
 
-def excel_to_entities(excel_path, output_directory="./artifacts/tmp/"):
+def excel_to_entities(
+    excel_path: str, output_directory: str = "./artifacts/tmp/"
+) -> dict[str, dict[str, Any]]:
+    """
+    Extracts entities from an Excel file and returns them as a dictionary.
+
+    Args:
+        excel_path: The path to the Excel file.
+        output_directory: The directory to store the output files.
+
+    Returns:
+        A dictionary where each key is a normalized sheet name and the value is a dictionary
+        containing the extracted entities.
+    """
     sheets_dict = {}
 
+    # Load the workbook and get the sheet names
     workbook = openpyxl.load_workbook(excel_path)
     sheet_names = workbook.sheetnames