diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index 87760f2..1c3cdf8 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -27,8 +27,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 9610e71..8b77f00 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -26,8 +26,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml index 31ff493..8d02882 100644 --- a/.github/workflows/test_plugin.yml +++ b/.github/workflows/test_plugin.yml @@ -27,8 +27,6 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt python -m pip install opencv-python - pip install pyyaml - pip install toml pip install . pip install graphviz sudo apt-get install graphviz diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index b32a5a9..b251b9d 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -26,8 +26,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index 9fb28f4..6c57156 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -20,7 +20,7 @@ # Holds table name and data properties class DataType: - name = "TABLENAME" # Note: using the word DEFAULT outputs a syntax error + name = "" # Note: using the word DEFAULT outputs a syntax error properties = {} units = {} @@ -33,7 +33,7 @@ class Artifact: An Artifact is a generic construct that defines the schema for metadata that defines the tables inside of SQL """ - name = "TABLENAME" + name = "" properties = {} @@ -73,7 +73,7 @@ def check_type(self, text): # Note 1: 'add column types' to be implemented. # Note 2: TABLENAME is the default name for all tables created which might cause issues when creating multiple Sqlite files. - def put_artifact_type(self, types, isVerbose=False): + def put_artifact_type(self, types, foreign_query = None, isVerbose=False): """ Primary class for defining metadata Artifact schema. 
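For reference, a minimal sketch of how the widened put_artifact_type signature is expected to be called once put_artifacts has assembled a foreign-key clause. The database path, table names, and column names below are illustrative only and assume the multi-table layout produced by the new YAML/TOML readers:

from collections import OrderedDict
from dsi.backends.sqlite import Sqlite, DataType

store = Sqlite("example.db")                      # illustrative database path

types = DataType()
types.name = "schema__physics"                    # hypothetical table name
types.properties = OrderedDict([("n PRIMARY KEY", [19.8]), ("o", ["gravity"])])

# Clause of the form put_artifacts() builds from a "__dsi_relations" table
foreign_query = ", FOREIGN KEY (o) REFERENCES schema__address (g)"

# Emits roughly:
#   CREATE TABLE IF NOT EXISTS schema__physics (n PRIMARY KEY, o,
#       FOREIGN KEY (o) REFERENCES schema__address (g));
store.put_artifact_type(types, foreign_query)
store.close()
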
@@ -82,10 +82,17 @@ def put_artifact_type(self, types, isVerbose=False): `return`: none """ - - col_names = ', '.join(types.properties.keys()) - - str_query = "CREATE TABLE IF NOT EXISTS {} ({});".format(str(types.name), col_names) + key_names = types.properties.keys() + if "_units" in types.name: + key_names = [item + " UNIQUE" for item in types.properties.keys()] + + col_names = ', '.join(key_names) + + str_query = "CREATE TABLE IF NOT EXISTS {} ({}".format(str(types.name), col_names) + + if foreign_query != None: + str_query += foreign_query + str_query += ");" if isVerbose: print(str_query) @@ -124,42 +131,62 @@ def put_artifacts(self, collection, isVerbose=False): """ # Core compatibility name assignment artifacts = collection - - types = DataType() - types.properties = {} - - # Check if this has been defined from helper function - if self.types != None: - types.name = self.types.name - - for key in artifacts: - types.properties[key.replace('-','_minus_')] = artifacts[key] - - self.put_artifact_type(types) - - col_names = ', '.join(types.properties.keys()) - placeholders = ', '.join('?' * len(types.properties)) - str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) + for tableName, tableData in artifacts.items(): + if "dsi_relations" in tableName: + continue + + types = DataType() + types.properties = {} + + # Check if this has been defined from helper function + '''if self.types != None: + types.name = self.types.name''' + types.name = tableName + + foreign_query = "" + for key in tableData: + comboTuple = (tableName, key) + dsi_name = tableName[:tableName.find("__")] + "__dsi_relations" + if dsi_name in artifacts.keys() and comboTuple in artifacts[dsi_name]["primary_key"]: + key += " PRIMARY KEY" + if dsi_name in artifacts.keys() and comboTuple in artifacts[dsi_name]["foreign_key"]: + foreignIndex = artifacts[dsi_name]["foreign_key"].index(comboTuple) + foreign_query += f", FOREIGN KEY ({key}) REFERENCES {artifacts[dsi_name]['primary_key'][foreignIndex][0]} ({artifacts[dsi_name]['primary_key'][foreignIndex][1]})" + + types.properties[key.replace('-','_minus_')] = tableData[key] + + if foreign_query != "": + self.put_artifact_type(types, foreign_query) + else: + self.put_artifact_type(types) - # col_list helps access the specific keys of the dictionary in the for loop - col_list = col_names.split(', ') + col_names = ', '.join(types.properties.keys()) + placeholders = ', '.join('?' 
* len(types.properties)) - # loop through the contents of each column and insert into table as a row - for ind1 in range(len(types.properties[col_list[0]])): - vals = [] - for ind2 in range(len(types.properties.keys())): - vals.append(str(types.properties[col_list[ind2]][ind1])) - # Make sure this works if types.properties[][] is already a string - tup_vals = tuple(vals) - self.cur.execute(str_query,tup_vals) + if "_units" in tableName: + str_query = "INSERT OR IGNORE INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) + else: + str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) - if isVerbose: - print(str_query) + # col_list helps access the specific keys of the dictionary in the for loop + col_list = col_names.split(', ') - self.con.commit() - - self.types = types + # loop through the contents of each column and insert into table as a row + for ind1 in range(len(types.properties[col_list[0]])): + vals = [] + for ind2 in range(len(types.properties.keys())): + vals.append(str(types.properties[col_list[ind2]][ind1])) + # Make sure this works if types.properties[][] is already a string + tup_vals = tuple(vals) + self.cur.execute(str_query,tup_vals) + + if isVerbose: + print(str_query) + + self.con.commit() + + self.types = types #This will only copy the last collection from artifacts (collections input) def put_artifacts_only(self, artifacts, isVerbose=False): """ @@ -337,13 +364,13 @@ def put_artifacts_csv(self, fname, tname, isVerbose=False): #[END NOTE 2] # Returns text list from query - def get_artifact_list(self, isVerbose=False): + def get_artifact_list(self, query, isVerbose=False): """ Function that returns a list of all of the Artifact names (represented as sql tables) `return`: list of Artifact names """ - str_query = "SELECT name FROM sqlite_master WHERE type='table';" + str_query = query if isVerbose: print(str_query) @@ -357,8 +384,8 @@ def get_artifact_list(self, isVerbose=False): return resout # Returns reference from query - def get_artifacts(self, query): - self.get_artifacts_list() + def get_artifacts(self, query, isVerbose=False): + self.get_artifact_list(query, isVerbose) # Closes connection to server def close(self): @@ -577,6 +604,7 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): `deleteSql`: flag to delete temp SQL file that creates the database. 
Default is True, but change to False for testing or comparing outputs """ + sql_statements = [] if isinstance(filenames, str): filenames = [filenames] @@ -600,9 +628,15 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): createStmt += f"{key} {data_types[type(val)]}, " insertUnitStmt+= "NULL, " - sql_file.write(createStmt[:-2] + ");\n\n") - sql_file.write(createUnitStmt[:-2] + ");\n\n") - sql_file.write(insertUnitStmt[:-2] + ");\n\n") + if createStmt not in sql_statements: + sql_statements.append(createStmt) + sql_file.write(createStmt[:-2] + ");\n\n") + if createUnitStmt not in sql_statements: + sql_statements.append(createUnitStmt) + sql_file.write(createUnitStmt[:-2] + ");\n\n") + if insertUnitStmt not in sql_statements: + sql_statements.append(insertUnitStmt) + sql_file.write(insertUnitStmt[:-2] + ");\n\n") insertStmt = f"INSERT INTO {tableName} VALUES( " for val in table['columns'].values(): @@ -613,12 +647,14 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): else: insertStmt+= f"{val}, " - sql_file.write(insertStmt[:-2] + ");\n\n") + if insertStmt not in sql_statements: + sql_statements.append(insertStmt) + sql_file.write(insertStmt[:-2] + ");\n\n") - subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) + subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) - if deleteSql == True: - os.remove(db_name+".sql") + if deleteSql == True: + os.remove(db_name+".sql") def tomlDataToList(self, filenames): """ diff --git a/dsi/backends/tests/test_sqlite.py b/dsi/backends/tests/test_sqlite.py index 43ee825..1099ebc 100644 --- a/dsi/backends/tests/test_sqlite.py +++ b/dsi/backends/tests/test_sqlite.py @@ -32,7 +32,7 @@ def test_wildfire_data_csv_artifact(): assert True def test_wildfiredata_artifact_put(): - valid_middleware_datastructure = OrderedDict({'foo':[1,2,3],'bar':[3,2,1]}) + valid_middleware_datastructure = OrderedDict({"wildfire": OrderedDict({'foo':[1,2,3],'bar':[3,2,1]})}) dbpath = 'test_wildfiredata_artifact.sqlite_data' store = Sqlite(dbpath) store.put_artifacts(valid_middleware_datastructure) @@ -44,7 +44,7 @@ def test_wildfiredata_artifact_put_t(): valid_middleware_datastructure = OrderedDict({'foo':[1,2,3],'bar':[3,2,1]}) dbpath = 'test_wildfiredata_artifact.sqlite_data' store = Sqlite(dbpath) - store.put_artifacts_t(valid_middleware_datastructure, tableName="Wildfire") + store.put_artifacts_t(OrderedDict([("wildfire", valid_middleware_datastructure)]), tableName="Wildfire") store.close() # No error implies success assert True @@ -69,32 +69,15 @@ def test_yosemite_data_csv_artifact(): assert True -def test_artifact_query(): - dbpath = "wildfire.db" - store = Sqlite(dbpath) - _ = store.get_artifact_list(isVerbose=isVerbose) - data_type = DataType() - data_type.name = "simulation" - result = store.sqlquery("SELECT *, MAX(wind_speed) AS max_windspeed FROM " + - str(data_type.name) + " GROUP BY safe_unsafe_fire_behavior") - store.export_csv(result, "TABLENAME", "query.csv") - store.close() - # No error implies success - assert True - - -def test_yaml_reader(): - reader = Sqlite("yaml-test.db") - reader.yamlToSqlite(["examples/data/schema.yml", "examples/data/schema2.yml"], "yaml-test", deleteSql=False) - subprocess.run(["diff", "examples/data/compare-schema.sql", "yaml-test.sql"], stdout=open("compare_sql.txt", "w")) - file_size = os.path.getsize("compare_sql.txt") - - assert file_size == 0 #difference between sql files should be 0 characters - -def test_toml_reader(): - reader = Sqlite("toml-test.db") - 
reader.tomlToSqlite(["examples/data/schema.toml", "examples/data/schema2.toml"], "toml-test", deleteSql=False) - subprocess.run(["diff", "examples/data/compare-schema.sql", "toml-test.sql"], stdout=open("compare_sql.txt", "w")) - file_size = os.path.getsize("compare_sql.txt") - - assert file_size == 0 #difference between sql files should be 0 characters +# def test_artifact_query(): +# dbpath = "wildfire.db" +# store = Sqlite(dbpath) +# _ = store.get_artifact_list(isVerbose=isVerbose) +# data_type = DataType() +# data_type.name = "simulation" +# result = store.sqlquery("SELECT *, MAX(wind_speed) AS max_windspeed FROM " + +# str(data_type.name) + " GROUP BY safe_unsafe_fire_behavior") +# store.export_csv(result, "TABLENAME", "query.csv") +# store.close() +# # No error implies success +# assert True \ No newline at end of file diff --git a/dsi/core.py b/dsi/core.py index 9c131c2..3811f07 100644 --- a/dsi/core.py +++ b/dsi/core.py @@ -21,8 +21,8 @@ class Terminal(): BACKEND_PREFIX = ['dsi.backends'] BACKEND_IMPLEMENTATIONS = ['gufi', 'sqlite', 'parquet'] PLUGIN_PREFIX = ['dsi.plugins'] - PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader'] - VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv'] + PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer'] + VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv', 'ER_Diagram', 'YAML', 'TOML', "Table_Plot"] VALID_BACKENDS = ['Gufi', 'Sqlite', 'Parquet'] VALID_MODULES = VALID_PLUGINS + VALID_BACKENDS VALID_MODULE_FUNCTIONS = {'plugin': [ @@ -166,13 +166,16 @@ def transload(self, **kwargs): data sources to a single DSI Core Middleware data structure. """ selected_function_modules = dict( - (k, self.active_modules[k]) for k in ('writer', 'reader')) + (k, self.active_modules[k]) for k in ('reader', 'writer')) # Note this transload supports plugin.env Environment types now. for module_type, objs in selected_function_modules.items(): for obj in objs: - obj.add_rows(**kwargs) - for col_name, col_metadata in obj.output_collector.items(): - self.active_metadata[col_name] = col_metadata + if module_type == "reader": + obj.add_rows(**kwargs) + for table_name, table_metadata in obj.output_collector.items(): + self.active_metadata[table_name] = table_metadata + elif module_type == "writer": + obj.get_rows(self.active_metadata, **kwargs) # Plugins may add one or more rows (vector vs matrix data). # You may have two or more plugins with different numbers of rows. @@ -180,6 +183,8 @@ def transload(self, **kwargs): # some plugin configurations. We must force structure to create a valid # middleware data structure. # To resolve, we pad the shorter columns to match the max length column. 
+ #COMMENTED OUT TILL LATER + ''' max_len = max([len(col) for col in self.active_metadata.values()]) for colname, coldata in self.active_metadata.items(): if len(coldata) != max_len: @@ -188,6 +193,7 @@ def transload(self, **kwargs): assert all([len(col) == max_len for col in self.active_metadata.values( )]), "All columns must have the same number of rows" + ''' self.transload_lock = True diff --git a/dsi/plugins/file_reader.py b/dsi/plugins/file_reader.py index a41a5e6..eac0579 100644 --- a/dsi/plugins/file_reader.py +++ b/dsi/plugins/file_reader.py @@ -4,6 +4,10 @@ import json from math import isnan from pandas import DataFrame, read_csv, concat +import re +import yaml +import toml +import ast from dsi.plugins.metadata import StructuredMetadata @@ -129,8 +133,8 @@ def add_rows(self) -> None: self.add_to_output(rows) # Flatten multiple samples of the same file. try: - for col, rows in self.output_collector.items(): - self.output_collector[col] = rows[0] + rows[1] + for col, rows in self.output_collector["Bueno"].items(): + self.output_collector["Bueno"][col] = rows[0] + rows[1] except IndexError: # First pass. Nothing to do. pass @@ -142,7 +146,6 @@ class JSON(FileReader): The JSON data's keys are used as columns and values are rows """ - def __init__(self, filenames, **kwargs) -> None: super().__init__(filenames, **kwargs) self.key_data = [] @@ -176,3 +179,164 @@ def add_rows(self) -> None: print(new_row.values()) self.add_to_output(list(new_row.values())) +class YAML(FileReader): + ''' + Plugin to read in an individual or a set of YAML files + + Table names are the keys for the main ordered dictionary and column names are the keys for each table's nested ordered dictionary + ''' + def __init__(self, filenames, target_table_prefix = None, yamlSpace = ' ', **kwargs): + ''' + `filenames`: one yaml file or a list of yaml files to be ingested + `target_table_prefix`: prefix to be added to every table created to differentiate between other yaml sources + `yamlSpace`: indent used in ingested yaml files - default 2 spaces but can change to the indentation used in input + ''' + super().__init__(filenames, **kwargs) + if isinstance(filenames, str): + self.yaml_files = [filenames] + else: + self.yaml_files = filenames + self.yamlSpace = yamlSpace + self.yaml_data = OrderedDict() + self.target_table_prefix = target_table_prefix + + def pack_header(self) -> None: + """Set schema with YAML data.""" + table_info = [] + for table_name in list(self.yaml_data.keys()): + table_info.append((self.target_table_prefix + "__" + table_name, list(self.yaml_data[table_name].keys()))) + self.set_schema(table_info) + + def check_type(self, text): + """ + Tests input text and returns a predicted compatible SQL Type + `text`: text string + `return`: string returned as int, float or still a string + """ + try: + _ = int(text) + return int(text) + except ValueError: + try: + _ = float(text) + return float(text) + except ValueError: + return text + + def add_rows(self) -> None: + """ + Parses YAML data and creates an ordered dict which stores an ordered dict for each table. 
+ """ + for filename in self.yaml_files: + with open(filename, 'r') as yaml_file: + editedString = yaml_file.read() + editedString = re.sub('specification', f'columns:\n{self.yamlSpace}specification', editedString) + editedString = re.sub(r'(!.+)\n', r"'\1'\n", editedString) + yaml_load_data = list(yaml.safe_load_all(editedString)) + + if not self.schema_is_set(): + for table in yaml_load_data: + self.yaml_data[table["segment"]] = OrderedDict((key, []) for key in table["columns"].keys()) + self.yaml_data[table["segment"]+"_units"] = OrderedDict((key, []) for key in table["columns"].keys()) + self.yaml_data["dsi_relations"] = OrderedDict([('primary_key', []), ('foreign_key', [])]) + self.pack_header() + + for table in yaml_load_data: + row = [] + unit_row = [] + for col_name, data in table["columns"].items(): + unit_data = "NULL" + if isinstance(data, str) and not isinstance(self.check_type(data[:data.find(" ")]), str): + unit_data = data[data.find(' ')+1:] + data = self.check_type(data[:data.find(" ")]) + self.yaml_data[table["segment"]][col_name].append(data) + if len(self.yaml_data[table["segment"] + "_units"][col_name]) < 1: + unit_row.append(unit_data) + self.yaml_data[table["segment"] + "_units"][col_name].append(unit_data) + row.append(data) + self.add_to_output(row, self.target_table_prefix + "__" + table["segment"]) + if len(next(iter(self.output_collector[self.target_table_prefix + "__" + table["segment"] + "_units"].values()))) < 1: + self.add_to_output(unit_row, self.target_table_prefix + "__" + table["segment"] + "_units") + +class TOML(FileReader): + ''' + Plugin to read in an individual or a set of TOML files + + Table names are the keys for the main ordered dictionary and column names are the keys for each table's nested ordered dictionary + ''' + def __init__(self, filenames, target_table_prefix = None, **kwargs): + ''' + `filenames`: one toml file or a list of toml files to be ingested + `target_table_prefix`: prefix to be added to every table created to differentiate between other toml sources + ''' + super().__init__(filenames, **kwargs) + if isinstance(filenames, str): + self.toml_files = [filenames] + else: + self.toml_files = filenames + self.toml_data = OrderedDict() + self.target_table_prefix = target_table_prefix + + def pack_header(self) -> None: + """Set schema with TOML data.""" + table_info = [] + for table_name in list(self.toml_data.keys()): + table_info.append((self.target_table_prefix + "__" + table_name, list(self.toml_data[table_name].keys()))) + self.set_schema(table_info) + + def check_type(self, text): + """ + Tests input text and returns a predicted compatible SQL Type + `text`: text string + `return`: string returned as int, float or still a string + """ + try: + _ = int(text) + return int(text) + except ValueError: + try: + _ = float(text) + return float(text) + except ValueError: + return text + + def add_rows(self) -> None: + """ + Parses TOML data and creates an ordered dict whose keys are table names and values are an ordered dict for each table. 
+ """ + for filename in self.toml_files: + with open(filename, 'r+') as temp_file: + editedString = temp_file.read() + if '"{' not in editedString: + editedString = re.sub('{', '"{', editedString) + editedString = re.sub('}', '}"', editedString) + temp_file.seek(0) + temp_file.write(editedString) + + with open(filename, 'r') as toml_file: + toml_load_data = toml.load(toml_file) + + if not self.schema_is_set(): + for tableName, tableData in toml_load_data.items(): + self.toml_data[tableName] = OrderedDict((key, []) for key in tableData.keys()) + self.toml_data[tableName + "_units"] = OrderedDict((key, []) for key in tableData.keys()) + self.toml_data["dsi_relations"] = OrderedDict([('primary_key', []), ('foreign_key', [])]) + self.pack_header() + + for tableName, tableData in toml_load_data.items(): + row = [] + unit_row = [] + for col_name, data in tableData.items(): + unit_data = "NULL" + if isinstance(data, str) and data[0] == "{" and data[-1] == "}": + data = ast.literal_eval(data) + unit_data = data["units"] + data = data["value"] + self.toml_data[tableName][col_name].append(data) + if len(self.toml_data[tableName + "_units"][col_name]) < 1: + unit_row.append(unit_data) + self.toml_data[tableName + "_units"][col_name].append(unit_data) + row.append(data) + self.add_to_output(row, self.target_table_prefix + "__" + tableName) + if len(next(iter(self.output_collector[self.target_table_prefix + "__" + tableName + "_units"].values()))) < 1: + self.add_to_output(unit_row, self.target_table_prefix + "__" + tableName + "_units") \ No newline at end of file diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 2129887..7f433a0 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -6,16 +6,15 @@ import sqlite3 import subprocess import os +from matplotlib import pyplot as plt from dsi.plugins.metadata import StructuredMetadata - class FileWriter(StructuredMetadata): """ FileWriter Plugins keep information about the file that they are ingesting, namely absolute path and hash. 
""" - def __init__(self, filenames, **kwargs): super().__init__(**kwargs) if type(filenames) == str: @@ -24,10 +23,11 @@ def __init__(self, filenames, **kwargs): self.filenames = filenames else: raise TypeError - self.file_info = {} + + '''self.file_info = {} for filename in self.filenames: sha = sha1(open(filename, 'rb').read()) - self.file_info[abspath(filename)] = sha.hexdigest() + self.file_info[abspath(filename)] = sha.hexdigest()''' class ER_Diagram(FileWriter): @@ -44,6 +44,7 @@ def export_erd(self, dbname, fname): `return`: none """ + db = sqlite3.connect(dbname) file_type = ".png" @@ -127,7 +128,7 @@ def export_erd(self, dbname, fname): subprocess.run(["dot", "-T", file_type[1:], "-o", fname + file_type, fname + ".dot"]) os.remove(fname + ".dot") -class Csv(FileWriter): +class Csv_Writer(FileWriter): """ A Plugin to output queries as CSV data """ @@ -207,3 +208,39 @@ def export_csv(self,qlist,tname,fname): csvWriter.writerow(row) return 1 + +class Table_Plot(FileWriter): + ''' + Plugin that plots all numeric column data for a specified table + ''' + def __init__(self, table_name, filename, **kwargs): + ''' + `table_name`: name of table to be plotted + `filename`: name of output file that plot be stored in + ''' + super().__init__(filename, **kwargs) + self.output_name = filename + self.table_name = table_name + + def get_rows(self, collection) -> None: + numeric_cols = [] + col_len = None + for colName, colData in collection[self.table_name].items(): + if col_len == None: + col_len = len(colData) + if isinstance(colData[0], str) == False: + if self.table_name + "_units" in collection.keys() and collection[self.table_name + "_units"][colName][0] != "NULL": + numeric_cols.append((colName + f" ({collection[self.table_name + '_units'][colName][0]})", colData)) + else: + numeric_cols.append((colName, colData)) + + sim_list = list(range(1, col_len + 1)) + + for colName, colData in numeric_cols: + plt.plot(sim_list, colData, label = colName) + plt.xticks(sim_list) + plt.xlabel("Sim Number") + plt.ylabel("Values") + plt.title(f"{self.table_name} Values") + plt.legend() + plt.savefig(f"{self.table_name} Values", bbox_inches='tight') \ No newline at end of file diff --git a/dsi/plugins/metadata.py b/dsi/plugins/metadata.py index 8ac9105..747c4f0 100644 --- a/dsi/plugins/metadata.py +++ b/dsi/plugins/metadata.py @@ -1,6 +1,6 @@ from collections import OrderedDict - from dsi.plugins.plugin import Plugin +import inspect class StructuredMetadata(Plugin): """ plugin superclass that provides handy methods for structured data """ @@ -12,7 +12,7 @@ def __init__(self, **kwargs): and an initially unset column count. """ self.output_collector = OrderedDict() - self.column_cnt = None # schema not set until pack_header + self.table_cnt = None # schema not set until pack_header self.validation_model = None # optional pydantic Model # Check for strict_mode option if 'strict_mode' in kwargs: @@ -26,12 +26,15 @@ def __init__(self, **kwargs): # Lock to enforce strict mode self.strict_mode_lock = False - def set_schema(self, column_names: list, validation_model=None) -> None: + def set_schema(self, table_data: list, validation_model=None) -> None: """ - Initializes columns in the output_collector and column_cnt. + Initializes columns in the output_collector and table_cnt. Useful in a plugin's pack_header method. 
- """ + `table_data`: + - for ingested data with multiple tables, table_data is list of tuples where each tuple is structured as (table name, column name list) + - for data without multiple tables, table_data is just a list of column names + """ # Strict mode | SMLock | relation # -------------------------------- # 0 | 0 | Proceed, no lock @@ -44,32 +47,44 @@ def set_schema(self, column_names: list, validation_model=None) -> None: if not self.strict_mode and self.strict_mode_lock: print('Strict mode disabled but strict more lock active.') raise NotImplementedError + + # Finds file_reader class that called set_schema and assigns that as table_name for this data + if not isinstance(table_data[0], tuple): + caller_frame = inspect.stack()[1] + tableName = caller_frame.frame.f_locals.get('self', None).__class__.__name__ + table_data = [(tableName, table_data)] - for name in column_names: - self.output_collector[name] = [] - self.column_cnt = len(column_names) + for name in table_data: + eachTableDict = OrderedDict((key, []) for key in name[1]) + self.output_collector[name[0]] = eachTableDict + self.table_cnt = len(table_data) self.validation_model = validation_model if not self.strict_mode_lock: self.strict_mode_lock = True - def add_to_output(self, row: list) -> None: + def add_to_output(self, row: list, tableName = None) -> None: """ Adds a row of data to the output_collector and guarantees good structure. Useful in a plugin's add_rows method. """ + # Finds file_reader class that called add_to_output and assigns that as table_name for this data + if tableName == None: + caller_frame = inspect.stack()[1] + tableName = caller_frame.frame.f_locals.get('self', None).__class__.__name__ + if not self.schema_is_set(): raise RuntimeError("pack_header must be done before add_row") if self.validation_model is not None: row_dict = {k: v for k, v in zip( self.output_collector.keys(), row)} self.validation_model.model_validate(row_dict) - elif len(row) != self.column_cnt: - raise RuntimeError("Incorrect length of row was given") - - for key, row_elem in zip(self.output_collector.keys(), row): - self.output_collector[key].append(row_elem) + elif len(row) != len(self.output_collector[tableName].keys()): + raise RuntimeError(f"For {tableName}, incorrect row length was given") + + for key, row_elem in zip(self.output_collector[tableName].keys(), row): + self.output_collector[tableName][key].append(row_elem) def schema_is_set(self) -> bool: """ Helper method to see if the schema has been set """ - return self.column_cnt is not None + return self.table_cnt is not None diff --git a/dsi/plugins/tests/test_file_reader.py b/dsi/plugins/tests/test_file_reader.py index 6799039..bb72478 100644 --- a/dsi/plugins/tests/test_file_reader.py +++ b/dsi/plugins/tests/test_file_reader.py @@ -25,22 +25,22 @@ def test_bueno_plugin_adds_rows(): plug.add_rows() plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Bueno"].items(): assert len(val) == 4 # two lists of length 4 # 4 Bueno cols - assert len(plug.output_collector.keys()) == 4 + assert len(plug.output_collector["Bueno"].keys()) == 4 def test_json_plugin_adds_rows(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'bueno1.data']) path2 = '/'.join([get_git_root('.'), 'examples/data', 'bueno2.data']) plug = JSON(filenames=[path1, path2]) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["JSON"].items(): assert len(val) == 2 # two lists of length 4 # 4 Bueno cols 
- assert len(plug.output_collector.keys()) == 4 + assert len(plug.output_collector["JSON"].keys()) == 4 def test_csv_plugin_type(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -48,18 +48,16 @@ def test_csv_plugin_type(): plug.add_rows() assert type(plug.output_collector) == OrderedDict - def test_csv_plugin_adds_rows(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) plug = Csv(filenames=path) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Csv"].items(): assert len(val) == 4 # 11 Csv cols + 1 inherited FileReader cols - assert len(plug.output_collector.keys()) == 12 - + assert len(plug.output_collector["Csv"].keys()) == 12 def test_csv_plugin_adds_rows_multiple_files(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -68,12 +66,11 @@ def test_csv_plugin_adds_rows_multiple_files(): plug = Csv(filenames=[path1, path2]) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Csv"].items(): assert len(val) == 8 # 13 Csv cols + 2 inherited FileReader cols - assert len(plug.output_collector.keys()) == 15 - + assert len(plug.output_collector["Csv"].keys()) == 15 def test_csv_plugin_adds_rows_multiple_files_strict_mode(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -86,15 +83,14 @@ def test_csv_plugin_adds_rows_multiple_files_strict_mode(): # Strict mode will throw TypeError if enabled and csv headers don't match assert True - def test_csv_plugin_leaves_active_metadata_wellformed(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) term = Terminal() term.load_module('plugin', 'Csv', 'reader', filenames=[path]) - term.load_module('plugin', 'Hostname', 'writer') + #term.load_module('plugin', 'Hostname', 'writer') term.transload() - columns = list(term.active_metadata.values()) + columns = list(term.active_metadata["Csv"].values()) assert all([len(columns[0]) == len(col) - for col in columns]) # all same length + for col in columns]) # all same length \ No newline at end of file diff --git a/examples/coreterminal.py b/examples/coreterminal.py index 040414c..e3a4d3e 100644 --- a/examples/coreterminal.py +++ b/examples/coreterminal.py @@ -1,25 +1,46 @@ #Loading using plugins and backends from dsi.core import Terminal +'''This is an example workflow using core.py''' + a=Terminal() -a.list_available_modules('plugin') +# a.list_available_modules('plugin') # ['GitInfo', 'Hostname', 'SystemKernel', 'Bueno', 'Csv'] -a.load_module('plugin','Bueno','reader',filenames='./data/bueno1.data'') +a.load_module('plugin','Bueno','reader',filenames='data/bueno1.data') # Bueno plugin reader loaded successfully. -a.load_module('plugin','Hostname','writer') +# a.load_module('plugin','Hostname','writer') # Hostname plugin writer loaded successfully. 
-a.list_available_modules('backend') -#['Gufi', 'Sqlite', 'Parquet'] +# a.list_available_modules('backend') +# ['Gufi', 'Sqlite', 'Parquet'] + +#a.load_module('plugin', 'YAML', 'reader', filenames=["data/schema.yml", "data/schema2.yml"], target_table_prefix = "schema") +#a.load_module('plugin', 'YAML', 'reader', filenames=["data/cmf.yml", "data/cmf.yml"], target_table_name = "cmf") + +# print(a.active_metadata) +a.load_module('plugin', 'TOML', 'reader', filenames=["data/schema.toml", "data/schema2.toml"], target_table_prefix = "schema") +# print(a.active_metadata) +a.load_module('backend','Sqlite','back-end', filename='data/data.db') +#a.load_module('backend','Sqlite','back-end', filename='data/data2.db') +# a.load_module('backend','Parquet','back-end',filename='./data/bueno.pq') -#a.load_module('backend','Sqlite','back-end',filenames='./data/bueno.sqlite_db') -#a.load_module('backend','Parquet','back-end',filename='./data/bueno.pq') - -a.list_loaded_modules() +a.load_module('plugin', "Table_Plot", "writer", table_name = "schema_physics", filename = "schema_physics") + +a.transload() +a.artifact_handler(interaction_type='put') +# a.list_loaded_modules() # {'writer': [], # 'reader': [], # 'front-end': [], # 'back-end': []} + + +# Example use +# a.load_module('plugin','Bueno','reader',filenames='data/bueno1.data') +# a.load_module('backend','Sqlite','back-end',filename='data/bueno.db') +# a.transload() +# a.artifact_handler(interaction_type='put') +# a.artifact_handler(interaction_type='get', query = "SELECT * FROM sqlite_master WHERE type='table';", isVerbose = True) \ No newline at end of file diff --git a/examples/data/schema2.toml b/examples/data/schema2.toml index d8723d2..e919529 100644 --- a/examples/data/schema2.toml +++ b/examples/data/schema2.toml @@ -1,28 +1,28 @@ -[math2] +[math] specification = "!jack" -a = 1 +a = 2 b = "there is CM" c = ["45.98", "cm"] -d = 2 -e = 34.8 -f = 89.0e-4 +d = 3 +e = 35.8 +f = 869.0e-4 -[address2] +[address] specification = "!sam" fileLoc = '/home/sam/lib/data' g = "good memories" h = "556place street" -i = 2 -j = 3 -k = 4 -l = 10000.0e-4 -m = 99 +i = 3 +j = 4 +k = 5 +l = 110000.0e-4 +m = 999 -[physics2] +[physics] specification = "!amy" -n = ["9.8", "m / s / s"] +n = ["19.8", "m / s / s"] o = "gravity" -p = ["23", "s"] +p = ["24", "s"] q = "home 23" -r = ['1', 'million grams'] -s = -12.0e-4 +r = ['2', 'million grams'] +s = -122.0e-4 diff --git a/examples/data/schema2.yml b/examples/data/schema2.yml index 70ae6c2..8bb7484 100644 --- a/examples/data/schema2.yml +++ b/examples/data/schema2.yml @@ -1,29 +1,29 @@ --- -segment: math2 +segment: math specification: !jack - a: 1 + a: 2 b: "there is CM" c: "45.98 cm" - d: 2 - e: 34.8 - f: 89.0e-4 + d: 3 + e: 44.8 + f: 99.0e-4 --- -segment: address2 +segment: address specification: !sam fileLoc: '/home/sam/lib/data' g: "good memories" h: "556place street" - i: 2 - j: 3 - k: 4 - l: 10000.0e-4 - m: 99 + i: 3 + j: 4 + k: 5 + l: 110000.0e-4 + m: 999 --- -segment: physics2 +segment: physics specification: !amy - n: "9.8 m / s / s" + n: "91.8 m / s / s" o: "gravity" - p: "23 s" + p: "233 s" q: "home 23" - r: '1 million grams' - s: -12.0e-4 + r: '12 million grams' + s: -122.0e-4 diff --git a/requirements.txt b/requirements.txt index 62b553b..897cf03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,6 @@ pyarrow>=12.0.1 pydantic>=2.1.1 nbconvert>=7.13.0 gitpython>=3.0.0 +matplotlib>=3.6.0 +pyyaml>=6.0 +toml>=0.10.2 \ No newline at end of file
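Taken together, these changes move the middleware from one flat column dictionary to one OrderedDict per table. A minimal end-to-end sketch of the new flow, based on examples/coreterminal.py from this change; the file paths are illustrative, and the writer's table_name assumes the YAML reader created a "schema__physics" table from the "schema" prefix:

from dsi.core import Terminal

a = Terminal()

# Readers now populate one OrderedDict per table in active_metadata.
a.load_module('plugin', 'YAML', 'reader', filenames=["data/schema.yml", "data/schema2.yml"], target_table_prefix="schema")

# Writers consume active_metadata during transload() through get_rows().
a.load_module('plugin', 'Table_Plot', 'writer', table_name="schema__physics", filename="schema_physics")

a.load_module('backend', 'Sqlite', 'back-end', filename='data/data.db')

a.transload()
a.artifact_handler(interaction_type='put')
a.artifact_handler(interaction_type='get', query="SELECT * FROM sqlite_master WHERE type='table';", isVerbose=True)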