Commit e348c82

Extract datamodel to json and excel (#18)
* Added CLI commands to extract entities to JSON and Excel; added utils functions
* Added descriptions to the CLI functions
* Changed the path of the generated artifact files
* Added tests for entities_to_json and entities_to_excel
* Fixed ruff issues
* Fixed fill_masterdata; fixed cls_name in entities.py
* Added a --url option to the fill_masterdata command; defined a global DATAMODEL_DIR in cli.py
* Added a comment for a bug in get_entities and get_property_assignments; bumped pybis to 1.37.0
* Added a subprocess running ruff after generating the masterdata
* Fixed ruff issues
* Fixed reading the input URL from the environment
1 parent dd9fb05 commit e348c82
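
In short, the commit adds three commands to the `bam_masterdata` console entry point (the VS Code launch configuration below points at `.venv/bin/bam_masterdata`). A rough usage sketch, assuming the package is installed in the active environment and that `OPENBIS_URL` is set in the environment whenever `--url` is omitted:

    # Regenerate the datamodel modules from an openBIS instance
    bam_masterdata fill_masterdata --url=https://devel.datastore.bam.de/

    # Export every entity to ./artifacts/<module>/<CODE>.json
    bam_masterdata export_to_json

    # Export the whole datamodel to ./artifacts/masterdata.xlsx
    bam_masterdata export_to_excel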

File tree: 16 files changed, +653 -34 lines

.gitignore

Lines changed: 4 additions & 1 deletion

@@ -171,4 +171,7 @@ cython_debug/
 
 # pytest coverage
 pytest.xml
-pytest-coverage.txt
+pytest-coverage.txt
+
+# artifacts
+artifacts

.vscode/settings.json

Lines changed: 4 additions & 1 deletion

@@ -57,7 +57,10 @@
       "cwd": "${workspaceFolder}",
       "program": "${workspaceFolder}/.venv/bin/bam_masterdata",
       "justMyCode": false,
-      "args": ["fill_masterdata"]
+      "args": [
+        "fill_masterdata",
+        // "--url=https://devel.datastore.bam.de/"
+      ]
     },
     {
       "name": "BM export-to-json",

bam_masterdata/cli/cli.py

Lines changed: 99 additions & 10 deletions

@@ -1,10 +1,23 @@
 import os
+import subprocess
 import time
 from pathlib import Path
 
 import click
+from decouple import config as environ
+from openpyxl import Workbook
 
+from bam_masterdata.cli.entities_to_excel import entities_to_excel
+from bam_masterdata.cli.entities_to_json import entities_to_json
 from bam_masterdata.cli.fill_masterdata import MasterdataCodeGenerator
+from bam_masterdata.logger import logger
+from bam_masterdata.utils import (
+    delete_and_create_dir,
+    import_module,
+    listdir_py_modules,
+)
+
+DATAMODEL_DIR = os.path.join(".", "bam_masterdata", "datamodel")
 
 
 @click.group(help="Entry point to run `bam_masterdata` CLI commands.")
@@ -14,19 +27,31 @@ def cli():
 
 @cli.command(
     name="fill_masterdata",
-    help="Fill the masterdata from the openBIS instance specified in the `.env` in the bam_masterdata/datamodel/ subfolder.",
+    help="Fill the masterdata from the openBIS instance and stores it in the bam_masterdata/datamodel/ modules.",
+)
+@click.option(
+    "--url",
+    type=str,
+    required=False,
+    help="""
+    (Optional) The URL of the openBIS instance from which to extract the data model. If not defined,
+    it is using the value of the `OPENBIS_URL` environment variable.
+    """,
 )
-def fill_masterdata():
+def fill_masterdata(url):
     start_time = time.time()
 
     # ! this takes a lot of time loading all the entities in Openbis
-    generator = MasterdataCodeGenerator()
+    # Use the URL if provided, otherwise fall back to defaults
+    if not url:
+        url = environ("OPENBIS_URL")
+    click.echo(f"Using the openBIS instance: {url}\n")
+    generator = MasterdataCodeGenerator(url=url)
 
     # Add each module to the `bam_masterdata/datamodel` directory
-    output_dir = os.path.join(".", "bam_masterdata", "datamodel")
     for module_name in ["property", "collection", "dataset", "object", "vocabulary"]:
         module_start_time = time.perf_counter()  # more precise time measurement
-        output_file = Path(os.path.join(output_dir, f"{module_name}_types.py"))
+        output_file = Path(os.path.join(DATAMODEL_DIR, f"{module_name}_types.py"))
 
         # Get the method from `MasterdataCodeGenerator`
         code = getattr(generator, f"generate_{module_name}_types")()
@@ -40,12 +65,76 @@ def fill_masterdata():
     elapsed_time = time.time() - start_time
     click.echo(f"Generated all types in {elapsed_time:.2f} seconds\n\n")
 
-    # ! this could be automated in the CLI
-    click.echo(
-        "Don't forget to apply ruff at the end after generating the files by doing:\n"
+    try:
+        # Run ruff check
+        click.echo("Running `ruff check .`...")
+        subprocess.run(["ruff", "check", "."], check=True)
+
+        # Run ruff format
+        click.echo("Running `ruff format .`...")
+        subprocess.run(["ruff", "format", "."], check=True)
+    except subprocess.CalledProcessError as e:
+        click.echo(f"Error during ruff execution: {e}", err=True)
+    else:
+        click.echo("Ruff checks and formatting completed successfully!")
+
+
+@cli.command(
+    name="export_to_json",
+    help="Export entities to JSON files to the `./artifacts/` folder.",
+)
+def export_to_json():
+    # Get the directories from the Python modules and the export directory for the static artifacts
+    export_dir = os.path.join(".", "artifacts")
+
+    # Delete and create the export directory
+    delete_and_create_dir(directory_path=export_dir, logger=logger)
+
+    # Get the Python modules to process the datamodel
+    py_modules = listdir_py_modules(directory_path=DATAMODEL_DIR, logger=logger)
+
+    # Process each module using the `to_json` method of each entity
+    for module_path in py_modules:
+        entities_to_json(module_path=module_path, export_dir=export_dir, logger=logger)
+
+    click.echo(f"All entity artifacts have been generated and saved to {export_dir}")
+
+
+@cli.command(
+    name="export_to_excel",
+    help="Export entities to an Excel file in the path `./artifacts/masterdata.xlsx`.",
+)
+def export_to_excel():
+    # Get the Python modules to process the datamodel
+    py_modules = listdir_py_modules(directory_path=DATAMODEL_DIR, logger=logger)
+
+    # Load the definitions module classes
+    definitions_module = import_module(
+        module_path="./bam_masterdata/metadata/definitions.py"
     )
-    click.echo(" ruff check .\n")
-    click.echo(" ruff format .\n")
+
+    # Process the modules and save the entities to the openBIS masterdata Excel file
+    masterdata_file = os.path.join(".", "artifacts", "masterdata.xlsx")
+    wb = Workbook()
+    for i, module_path in enumerate(py_modules):
+        if i == 0:
+            ws = wb.active
+        else:
+            ws = wb.create_sheet()
+        ws.title = (
+            os.path.basename(module_path)
+            .capitalize()
+            .replace(".py", "")
+            .replace("_", " ")
+        )
+        entities_to_excel(
+            worksheet=ws,
+            module_path=module_path,
+            definitions_module=definitions_module,
+        )
+    wb.save(masterdata_file)
+
+    click.echo(f"All masterdata have been generated and saved to {masterdata_file}")
 
 
 if __name__ == "__main__":
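
A side note on `export_to_excel` above: each worksheet title is derived from the datamodel module filename through the `capitalize`/`replace` chain in the diff. A minimal sketch of that transformation, using a hypothetical module path:

    import os

    module_path = "./bam_masterdata/datamodel/object_types.py"  # hypothetical path
    title = (
        os.path.basename(module_path)  # "object_types.py"
        .capitalize()                  # "Object_types.py"
        .replace(".py", "")            # "Object_types"
        .replace("_", " ")             # "Object types"
    )
    print(title)  # Object types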
bam_masterdata/cli/entities_to_excel.py

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+import inspect
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from openpyxl.worksheet.worksheet import Worksheet
+
+from bam_masterdata.utils import import_module
+
+
+def entities_to_excel(
+    worksheet: "Worksheet",
+    module_path: str,
+    definitions_module: Any,
+) -> None:
+    """
+    Export entities to the Excel file. The Python modules are imported using the function `import_module`,
+    and their contents are inspected (using `inspect`) to find the classes in the datamodel containing
+    `defs` and with a `to_json` method defined. Each row is then appended to the `worksheet`.
+
+    Args:
+        worksheet (Worksheet): The worksheet to append the entities.
+        module_path (str): Path to the Python module file.
+        definitions_module (Any): The module containing the definitions of the entities. This is used
+            to match the header definitions of the entities.
+    """
+    def_members = inspect.getmembers(definitions_module, inspect.isclass)
+    module = import_module(module_path=module_path)
+
+    # Special case of `PropertyTypeDef` in `property_types.py`
+    if "property_types.py" in module_path:
+        for name, obj in inspect.getmembers(module):
+            if name.startswith("_") or name == "PropertyTypeDef":
+                continue
+
+            # Entity title
+            worksheet.append([obj.excel_name])
+
+            # Entity header definitions and values
+            worksheet.append(obj.excel_headers)
+            row = []
+            for f_set in obj.model_fields.keys():
+                if f_set == "data_type":
+                    val = obj.data_type.value
+                else:
+                    val = getattr(obj, f_set)
+                row.append(val)
+            worksheet.append(row)
+            worksheet.append([""])  # empty row after entity definitions
+        return None
+
+    # All other datamodel modules
+    for _, obj in inspect.getmembers(module, inspect.isclass):
+        # Ensure the class has the `to_json` method
+        if not hasattr(obj, "defs") or not callable(getattr(obj, "to_json")):
+            continue
+
+        obj_instance = obj()
+
+        # Entity title
+        obj_definitions = obj_instance.defs
+        worksheet.append([obj_definitions.excel_name])
+
+        # Entity header definitions and values
+        for def_name, def_cls in def_members:
+            if def_name == obj_definitions.name:
+                break
+        worksheet.append(obj_definitions.excel_headers)
+        header_values = [
+            getattr(obj_definitions, f_set) for f_set in def_cls.model_fields.keys()
+        ]
+        worksheet.append(header_values)
+
+        # Properties assignment for ObjectType, DatasetType, and CollectionType
+        if obj_instance.cls_name in ["ObjectType", "DatasetType", "CollectionType"]:
+            if not obj_instance.properties:
+                continue
+            worksheet.append(obj_instance.properties[0].excel_headers)
+            for prop in obj_instance.properties:
+                row = []
+                for f_set in prop.model_fields.keys():
+                    if f_set == "data_type":
+                        val = prop.data_type.value
+                    else:
+                        val = getattr(prop, f_set)
+                    row.append(val)
+                worksheet.append(row)
+        # Terms assignment for VocabularyType
+        elif obj_instance.cls_name == "VocabularyType":
+            if not obj_instance.terms:
+                continue
+            worksheet.append(obj_instance.terms[0].excel_headers)
+            for term in obj_instance.terms:
+                worksheet.append(
+                    getattr(term, f_set) for f_set in term.model_fields.keys()
+                )
+
+        # ? do the PropertyTypeDef need to be exported to Excel?
+
+        worksheet.append([""])  # empty row after entity definitions
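
A minimal sketch of driving `entities_to_excel` outside the CLI, mirroring what `export_to_excel` does in cli.py; the module paths are taken from the diff, while the output filename here is hypothetical:

    from openpyxl import Workbook

    from bam_masterdata.cli.entities_to_excel import entities_to_excel
    from bam_masterdata.utils import import_module

    # The definitions module provides the classes whose `model_fields` order the header columns
    definitions_module = import_module(
        module_path="./bam_masterdata/metadata/definitions.py"
    )

    wb = Workbook()
    entities_to_excel(
        worksheet=wb.active,
        module_path="./bam_masterdata/datamodel/object_types.py",
        definitions_module=definitions_module,
    )
    wb.save("object_types.xlsx")  # hypothetical output file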
bam_masterdata/cli/entities_to_json.py

Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
+import inspect
+import json
+import os
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from structlog._config import BoundLoggerLazyProxy
+
+import click
+
+from bam_masterdata.utils import delete_and_create_dir, import_module
+
+
+def entities_to_json(
+    module_path: str, export_dir: str, logger: "BoundLoggerLazyProxy"
+) -> None:
+    """
+    Export entities to JSON files. The Python modules are imported using the function `import_module`,
+    and their contents are inspected (using `inspect`) to find the classes in the datamodel containing
+    `defs` and with a `to_json` method defined.
+
+    Args:
+        module_path (str): Path to the Python module file.
+        export_dir (str): Path to the directory where the JSON files will be saved.
+        logger (BoundLoggerLazyProxy): The logger to log messages.
+    """
+    module = import_module(module_path=module_path)
+    # export to specific subfolders for each type of entity (each module)
+    module_export_dir = os.path.join(
+        export_dir, os.path.basename(module_path).replace(".py", "")
+    )
+    delete_and_create_dir(directory_path=module_export_dir, logger=logger)
+
+    # Special case of `PropertyTypeDef` in `property_types.py`
+    if "property_types.py" in module_path:
+        for name, obj in inspect.getmembers(module):
+            if name.startswith("_") or name == "PropertyTypeDef":
+                continue
+            try:
+                json_data = json.dumps(obj.model_dump(), indent=2)
+                output_file = os.path.join(module_export_dir, f"{obj.code}.json")
+                with open(output_file, "w", encoding="utf-8") as f:
+                    f.write(json_data)
+
+                click.echo(f"Saved JSON for class {name} to {output_file}")
+            except Exception as err:
+                click.echo(f"Failed to process class {name} in {module_path}: {err}")
+        return None
+
+    # All other datamodel modules
+    for name, obj in inspect.getmembers(module, inspect.isclass):
+        # Ensure the class has the `to_json` method
+        if not hasattr(obj, "defs") or not callable(getattr(obj, "to_json")):
+            continue
+
+        try:
+            # Instantiate the class and call the method
+            json_data = obj().to_json(indent=2)
+
+            # Write JSON data to file
+            output_file = os.path.join(module_export_dir, f"{obj.defs.code}.json")
+            with open(output_file, "w", encoding="utf-8") as f:
+                f.write(json_data)
+
+            click.echo(f"Saved JSON for class {name} to {output_file}")
+        except Exception as err:
+            click.echo(f"Failed to process class {name} in {module_path}: {err}")

bam_masterdata/cli/fill_masterdata.py

Lines changed: 11 additions & 10 deletions

@@ -3,6 +3,7 @@
 import click
 
 from bam_masterdata.openbis import OpenbisEntities
+from bam_masterdata.openbis.login import environ
 
 
 class MasterdataCodeGenerator:
@@ -11,14 +12,14 @@ class MasterdataCodeGenerator:
     openBIS instance.
     """
 
-    def __init__(self):
+    def __init__(self, url: str = ""):
         start_time = time.time()
         # * This part takes some time due to the loading of all entities from Openbis
-        self.properties = OpenbisEntities().get_property_dict()
-        self.collections = OpenbisEntities().get_collection_dict()
-        self.datasets = OpenbisEntities().get_dataset_dict()
-        self.objects = OpenbisEntities().get_object_dict()
-        self.vocabularies = OpenbisEntities().get_vocabulary_dict()
+        self.properties = OpenbisEntities(url=url).get_property_dict()
+        self.collections = OpenbisEntities(url=url).get_collection_dict()
+        self.datasets = OpenbisEntities(url=url).get_dataset_dict()
+        self.objects = OpenbisEntities(url=url).get_object_dict()
+        self.vocabularies = OpenbisEntities(url=url).get_vocabulary_dict()
         elapsed_time = time.time() - start_time
         click.echo(
             f"Loaded OpenBIS entities in `MasterdataCodeGenerator` initialization {elapsed_time:.2f} seconds\n"
@@ -103,7 +104,7 @@ def add_properties(
         # ! patching dataType=SAMPLE instead of OBJECT
         if prop_data.get("dataType", "") == "SAMPLE":
             prop_data["dataType"] = "OBJECT"
-        lines.append(f" data_type=\"{prop_data.get('dataType', '')}\",")
+        lines.append(f' data_type="{prop_data.get("dataType", "")}",')
         property_label = (prop_data.get("label") or "").replace("\n", "\\n")
         lines.append(f' property_label="{property_label}",')
         description = (
@@ -163,7 +164,7 @@ def generate_property_types(self) -> str:
         # ! patching dataType=SAMPLE instead of OBJECT
         if data.get("dataType", "") == "SAMPLE":
             data["dataType"] = "OBJECT"
-        lines.append(f" data_type=\"{data.get('dataType', '')}\",")
+        lines.append(f' data_type="{data.get("dataType", "")}",')
         property_label = (
             (data.get("label") or "").replace('"', '\\"').replace("\n", "\\n")
         )
@@ -222,7 +223,7 @@ def generate_collection_types(self) -> str:
         lines.append(f' description="""{description}""",')
         if data.get("validationPlugin") != "":
             lines.append(
-                f" validation_script=\"{data.get('validationPlugin')}\","
+                f' validation_script="{data.get("validationPlugin")}",'
             )
         lines.append(" )")
         lines.append("")
@@ -327,7 +328,7 @@ def generate_object_types(self) -> str:
         )
         lines.append(f' description="""{description}""",')
         lines.append(
-            f" generated_code_prefix=\"{data.get('generatedCodePrefix', '')}\","
+            f' generated_code_prefix="{data.get("generatedCodePrefix", "")}",'
         )
         lines.append(" )")
         lines.append("")
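
The `data_type`, `validation_script`, and `generated_code_prefix` hunks above only swap the f-string quoting style (escaped double quotes for ruff's preferred unescaped form); the generated source is unchanged. A quick check with a hypothetical entry (leading indentation inside the real generated lines omitted here):

    data = {"dataType": "OBJECT"}  # hypothetical openBIS property entry

    old = f"data_type=\"{data.get('dataType', '')}\","  # escaped-quote form (removed)
    new = f'data_type="{data.get("dataType", "")}",'    # ruff-preferred form (added)

    assert old == new  # both render as: data_type="OBJECT",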
