This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit 29d0ed2

Author: Epstein
Dbaas 3992 (#63)
* working on metadata, cleanup other code
* added metadata function
* forgot to call
* syntax
* syntax
* case sensitive
* syntax
* case sensitive
* new table design
* syntax change
1 parent c05d67a commit 29d0ed2

File tree: 3 files changed (+129, -42 lines)

splicemachine/mlflow_support/constants.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -62,3 +62,12 @@ def get_valid() -> tuple:
         return (
             FileExtensions.spark, FileExtensions.keras, FileExtensions.h2o, FileExtensions.sklearn
         )
+
+class ModelStatuses():
+    """
+    Class containing names
+    for In Database Model Deployments
+    """
+    deployed: str = 'DEPLOYED'
+    deleted: str = 'DELETED'
+    SUPPORTED_STATUSES = [deployed, deleted]
```
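
The new `ModelStatuses` class gives deployment code a single source of truth for status strings. A minimal sketch of how a caller might validate a status against `SUPPORTED_STATUSES` (the `validate_status` helper is illustrative, not part of this commit):

```python
from splicemachine.mlflow_support.constants import ModelStatuses

def validate_status(status: str) -> str:
    """Illustrative helper: reject any status not defined in ModelStatuses."""
    if status not in ModelStatuses.SUPPORTED_STATUSES:
        raise ValueError(f"Status must be one of {ModelStatuses.SUPPORTED_STATUSES}, got '{status}'")
    return status

print(validate_status(ModelStatuses.deployed))  # DEPLOYED
```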

splicemachine/mlflow_support/mlflow_support.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -625,13 +625,10 @@ def _deploy_db(db_schema_name,
         if library == DBLibraries.MLeap:
             # Mleap needs a dataframe in order to serialize the model
             df = get_df_for_mleap(mlflow._splice_context, schema_table_name, df)
-            model_type, classes = SparkUtils.prep_model_for_deployment(mlflow._splice_context, fitted_model, df, classes, run_id)
-        elif library == DBLibraries.H2OMOJO:
-            model_type, classes = H2OUtils.prep_model_for_deployment(mlflow._splice_context, fitted_model, classes, run_id)
-        elif library == DBLibraries.SKLearn:
-            model_type, classes = SKUtils.prep_model_for_deployment(mlflow._splice_context, fitted_model, classes, run_id, sklearn_args)
-        elif library == DBLibraries.Keras:
-            model_type, classes = KerasUtils.prep_model_for_deployment(mlflow._splice_context, fitted_model, classes, run_id, pred_threshold)
+
+        model_type, classes, model_already_exists = ModelUtils[library].prep_model_for_deployment(mlflow._splice_context,
+                                                                                                   fitted_model, classes, run_id,
+                                                                                                   df, pred_threshold, sklearn_args)


        print(f'Deploying model {run_id} to table {schema_table_name}')
```
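
This hunk replaces the per-library `if/elif` chain with a lookup into the `ModelUtils` dict added at the bottom of utilities.py, which maps each `DBLibraries` member to its utility class. The refactor works because every `prep_model_for_deployment` now takes the same parameters, even ones a given library ignores. A self-contained sketch of the dispatch-table pattern (the enum and handler names here are simplified stand-ins, not the library's API):

```python
from enum import Enum

class Library(Enum):          # stand-in for DBLibraries
    SKLEARN = 'sklearn'
    KERAS = 'keras'

class SKLearnHandler:
    @staticmethod
    def prep(model, df, pred_threshold):   # uniform signature; pred_threshold unused here
        return 'sklearn-prepped'

class KerasHandler:
    @staticmethod
    def prep(model, df, pred_threshold):   # uniform signature; pred_threshold used
        return f'keras-prepped@{pred_threshold}'

HANDLERS = {Library.SKLEARN: SKLearnHandler, Library.KERAS: KerasHandler}

# One call site replaces the entire if/elif chain
print(HANDLERS[Library.KERAS].prep(model=None, df=None, pred_threshold=0.5))
```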
```diff
@@ -668,18 +665,34 @@ def _deploy_db(db_schema_name,
             create_parsing_trigger(mlflow._splice_context, schema_table_name, primary_key, run_id, classes, model_type, verbose)
         print('Done.')

+        add_model_to_metadata(mlflow._splice_context, run_id, schema_table_name)
+
+
     except Exception as e:
         import traceback
-        print('Model deployment failed. Rolling back transactions')
-        # drop_tables_on_failure(mlflow._splice_context, schema_table_name, run_id)
         exc = 'Model deployment failed. Rolling back transactions.\n'
+        print(exc)
+        drop_tables_on_failure(mlflow._splice_context, schema_table_name, run_id, model_already_exists)
         if not verbose:
             exc += 'For more insight into the SQL statement that generated this error, rerun with verbose=True'
         traceback.print_exc()
         raise SpliceMachineException(exc)

     print('Model Deployed.')

+@_mlflow_patch('get_deployed_models')
+def _get_deployed_models() -> PandasDF:
+    """
+    Get the currently deployed models in the database
+    :return: Pandas df
+    """
+
+    return mlflow._splice_context.df(
+        """
+        SELECT * FROM MLMANAGER.LIVE_MODEL_STATUS
+        """
+    ).toPandas()
+

 def apply_patches():
     """
```
```diff
@@ -689,7 +702,7 @@ def apply_patches():
     targets = [_register_splice_context, _lp, _lm, _timer, _log_artifact, _log_feature_transformations,
                _log_model_params, _log_pipeline_stages, _log_model, _load_model, _download_artifact,
                _start_run, _current_run_id, _current_exp_id, _deploy_aws, _deploy_azure, _deploy_db, _login_director,
-               _get_run_ids_by_name]
+               _get_run_ids_by_name, _get_deployed_models]

     for target in targets:
         gorilla.apply(gorilla.Patch(mlflow, target.__name__.lstrip('_'), target, settings=_GORILLA_SETTINGS))
```
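
Because gorilla strips the leading underscore at patch time, adding `_get_deployed_models` to `targets` exposes it as `mlflow.get_deployed_models()`. A hypothetical session sketch, assuming the package applies its patches on import and a Splice context has already been registered:

```python
from splicemachine.mlflow_support import *  # assumed to apply the gorilla patches and expose mlflow

# assumes mlflow.register_splice_context(splice_ctx) was called earlier
deployed = mlflow.get_deployed_models()  # pandas DataFrame of MLMANAGER.LIVE_MODEL_STATUS
print(deployed.head())
```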

splicemachine/mlflow_support/utilities.py

Lines changed: 97 additions & 32 deletions
```diff
@@ -53,7 +53,10 @@ class H2OUtils:
     def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: H2OModel,
                                   classes: List[str],
-                                  run_id: str) -> (H2OModelType, List[str]):
+                                  run_id: str,
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str,str] or None) -> (H2OModelType, List[str]):
         """
         Gets the H2O mojo model
         Gets the model type
@@ -69,7 +72,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,

         # Get the H2O MOJO model and insert it into the MODELS table
         h2omojo, rawmojo = H2OUtils.get_h2omojo_model(splice_context, model)
-        H2OUtils.insert_h2omojo_model(splice_context, run_id, h2omojo)
+        model_already_exists = H2OUtils.insert_h2omojo_model(splice_context, run_id, h2omojo)

         # Get model type
         model_type, model_category = H2OUtils.get_model_type(h2omojo)
@@ -106,7 +109,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         elif model_category == 'AnomalyDetection':
             classes = ['score', 'normalizedScore']

-        return model_type, classes
+        return model_type, classes, model_already_exists

     @staticmethod
     def get_model_type(h2omojo: object) -> (H2OModelType, str):
```
```diff
@@ -153,14 +156,14 @@ def load_h2o_model(model_blob: bytes) -> H2OModel:
         return model

     @staticmethod
-    def insert_h2omojo_model(splice_context: PySpliceContext, run_id: str, model: object) -> None:
+    def insert_h2omojo_model(splice_context: PySpliceContext, run_id: str, model: object) -> bool:
         baos = splice_context.jvm.java.io.ByteArrayOutputStream()
         oos = splice_context.jvm.java.io.ObjectOutputStream(baos)
         oos.writeObject(model)
         oos.flush()
         oos.close()
         byte_array = baos.toByteArray()
-        insert_model(splice_context, run_id, byte_array, 'h2omojo', h2o.__version__)
+        return insert_model(splice_context, run_id, byte_array, 'h2omojo', h2o.__version__)


 class SKUtils:
@@ -174,9 +177,9 @@ def load_sklearn_model(model_blob: bytes):
         return load_pickle_string(model_blob)

     @staticmethod
-    def insert_sklearn_model(splice_context: PySpliceContext, run_id: str, model: ScikitModel) -> None:
+    def insert_sklearn_model(splice_context: PySpliceContext, run_id: str, model: ScikitModel) -> bool:
         byte_stream = save_pickle_string(model)
-        insert_model(splice_context, run_id, byte_stream, 'sklearn', sklearn_version)
+        return insert_model(splice_context, run_id, byte_stream, 'sklearn', sklearn_version)

     @staticmethod
     def validate_sklearn_args(model: ScikitModel, sklearn_args: Dict[str, str]) -> Dict[str, str]:
@@ -207,7 +210,11 @@ def validate_sklearn_args(model: ScikitModel, sklearn_args: Dict[str, str]) -> Dict[str, str]:
             t = ('return_std', 'return_cov')
             exc = f'predict_args value is invalid. Available options are {t}'
         else:
-            model_params = get_model_params(model.predict) if hasattr(model, 'predict') else get_model_params(model.transform)
+            if isinstance(model, SKPipeline): # If we are working with a Pipeline, we want to check the last step for arguments
+                m = model.steps[-1][-1]
+                model_params = get_model_params(m.predict) if hasattr(m, 'predict') else get_model_params(m.transform)
+            else:
+                model_params = get_model_params(model.predict) if hasattr(model, 'predict') else get_model_params(model.transform)
         if p not in model_params.parameters:
             exc = f'predict_args set to {p} but that parameter is not available for this model!'
     elif sklearn_args and 'predict_args' not in sklearn_args and 'predict_call' not in sklearn_args:
```
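
The new `SKPipeline` branch matters because a scikit-learn `Pipeline` stores its stages as `(name, estimator)` tuples in `pipeline.steps`, so predict-argument validation must inspect the final estimator rather than the `Pipeline` wrapper. A small illustration of what `model.steps[-1][-1]` retrieves:

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])

last_step = pipe.steps[-1][-1]          # the LogisticRegression estimator, not the Pipeline
print(type(last_step).__name__)         # LogisticRegression
print(hasattr(last_step, 'predict'))    # True, so its predict() signature gets inspected
```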
```diff
@@ -224,12 +231,14 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: ScikitModel,
                                   classes: List[str],
                                   run_id: str,
-                                  sklearn_args: Dict[str, str]) -> (SklearnModelType, List[str]):
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str,str] or None) -> (SklearnModelType, List[str]):

         sklearn_args = SKUtils.validate_sklearn_args(model, sklearn_args)

         model_type = SKUtils.get_model_type(model, sklearn_args)
-        SKUtils.insert_sklearn_model(splice_context, run_id, model)
+        model_already_exists = SKUtils.insert_sklearn_model(splice_context, run_id, model)
         if classes and model_type != SklearnModelType.KEY_VALUE:
             print('Prediction labels found but model is not type Classification. Removing labels')
             classes = None
@@ -260,7 +269,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         if classes:
             print(f'Prediction labels found. Using {classes} as labels for predictions {list(range(0, len(classes)))} respectively')

-        return model_type, classes
+        return model_type, classes, model_already_exists

     @staticmethod
     def get_pipeline_model_type(pipeline: SKPipeline) -> SklearnModelType:
@@ -322,12 +331,12 @@ def load_keras_model(model_blob):
         return load_kr_model(hfile)

     @staticmethod
-    def insert_keras_model(splice_context: PySpliceContext, run_id: str, model: KerasModel) -> None:
+    def insert_keras_model(splice_context: PySpliceContext, run_id: str, model: KerasModel) -> bool:
         model.save('/tmp/model.h5')
         with open('/tmp/model.h5', 'rb') as f:
             byte_stream = bytearray(bytes(f.read()))
-        insert_model(splice_context, run_id, byte_stream, 'keras', KERAS_VERSION)
         remove('/tmp/model.h5')
+        return insert_model(splice_context, run_id, byte_stream, 'keras', KERAS_VERSION)

     @staticmethod
     def get_keras_model_type(model: KerasModel, pred_threshold: float) -> KerasModelType:
@@ -364,7 +373,9 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: KerasModel,
                                   classes: List[str],
                                   run_id: str,
-                                  pred_threshold: float) -> (KerasModelType, List[str]):
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str,str] or None)-> (KerasModelType, List[str]):
         """
         Inserts the model into the MODELS table for deployment
         Gets the Keras model type
@@ -377,7 +388,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         :return: (KerasModelType, List[str]) the modelType and the classes
         """
         KerasUtils.validate_keras_model(model)
-        KerasUtils.insert_keras_model(splice_context, run_id, model)
+        model_already_exists = KerasUtils.insert_keras_model(splice_context, run_id, model)
         model_type: KerasModelType = KerasUtils.get_keras_model_type(model, pred_threshold)
         if model_type == KerasModelType.KEY_VALUE:
             output_shape = model.layers[-1].output_shape
@@ -390,7 +401,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         classes = ['prediction'] + classes
         if len(classes) > 2 and pred_threshold:
             print(f"Found multiclass model with pred_threshold {pred_threshold}. Ignoring threshold.")
-        return model_type, classes
+        return model_type, classes, model_already_exists



```
```diff
@@ -610,9 +621,11 @@ def load_spark_model(splice_ctx, spark_pipeline_blob):
     @staticmethod
     def prep_model_for_deployment(splice_context: PySpliceContext,
                                   fittedPipe: PipelineModel,
-                                  df: SparkDF,
                                   classes: List[str],
-                                  run_id: str) -> (SparkModelType, List[str]):
+                                  run_id: str,
+                                  df: SparkDF,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str,str] or None) -> (SparkModelType, List[str]):
         """
         All preprocessing steps to prepare for in DB deployment. Get the mleap model, get class labels
         :param fittedPipe:
@@ -644,9 +657,9 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         df = fittedPipe.transform(df)
         # Get the Mleap model and insert it into the MODELS table
         mleap_model = get_mleap_model(splice_context, fittedPipe, df, run_id)
-        insert_mleap_model(splice_context, run_id, mleap_model)
+        model_already_exists = insert_mleap_model(splice_context, run_id, mleap_model)

-        return model_type, classes
+        return model_type, classes, model_already_exists


 def get_model_library(model) -> DBLibraries:
```
```diff
@@ -697,7 +710,7 @@ def get_user():
                             " Cloud Jupyter is currently unsupported")


-def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytearray, library: str, version: str) -> None:
+def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytearray, library: str, version: str) -> bool:
     """
     Insert a serialized model into the Mlmanager models table
     :param splice_context: pysplicectx
@@ -713,6 +726,7 @@ def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytearray, library: str, version: str):
     if model_exists:
         print(
             'A model with this run ID is already deployed. We are NOT replacing it. We will use the currently existing model.\nTo replace, use a new run_id')
+        return True

     else:
         db_connection = splice_context.getConnection()
@@ -728,6 +742,7 @@ def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytearray, library: str, version: str):

         prepared_statement.execute()
         prepared_statement.close()
+        return False


 def insert_artifact(splice_context, name, byte_array, run_uuid, file_ext=None):
```
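
`insert_model` now reports whether a blob for the run already existed: `True` means the existing row was kept, `False` means a fresh insert happened. `drop_tables_on_failure` later uses that flag to decide whether deleting the MODELS row is a safe rollback. A minimal sketch of the contract, with a dict standing in for the MODELS table (simplified names, not the library's code):

```python
def insert_if_absent(models: dict, run_id: str, blob: bytes) -> bool:
    """Return True if a model already existed for run_id (kept as-is), False if newly inserted."""
    if run_id in models:
        return True   # pre-existing deployment: rollback must NOT delete it
    models[run_id] = blob
    return False      # fresh insert: rollback may delete it

models = {}
assert insert_if_absent(models, 'run-1', b'model-bytes') is False  # first deployment
assert insert_if_absent(models, 'run-1', b'model-bytes') is True   # redeployment, row kept
```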
```diff
@@ -793,7 +808,9 @@ def get_mleap_model(splice_context, fittedPipe, df, run_id: str):
     return obj


-def insert_mleap_model(splice_context, run_id, model):
+def insert_mleap_model(splice_context: PySpliceContext,
+                       run_id: str,
+                       model: PipelineModel or SparkModel) -> bool:
     """
     Insert an MLeap Transformer model into the database as a Blob
     :param splice_context: pysplicectx
@@ -809,7 +826,7 @@ def insert_mleap_model(splice_context, run_id, model):
     oos.flush()
     oos.close()
     byte_array = baos.toByteArray()
-    insert_model(splice_context, run_id, byte_array, 'mleap', MLEAP_VERSION)
+    return insert_model(splice_context, run_id, byte_array, 'mleap', MLEAP_VERSION)


 def validate_primary_key(splice_ctx: PySpliceContext,
```
```diff
@@ -994,8 +1011,8 @@ def create_vti_prediction_trigger(splice_context: PySpliceContext,
         prediction_call += f", '{pred_threshold}'"

     prediction_call += ')'
-
-    SQL_PRED_TRIGGER = f'CREATE TRIGGER runModel_{schema_table_name.replace(".", "_")}_{run_id}\n \tAFTER INSERT\n ' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PRED_TRIGGER = f'CREATE TRIGGER {schema}.runModel_{schema_table_name.replace(".", "_")}_{run_id}\n \tAFTER INSERT\n ' \
                        f'\tON {schema_table_name}\n \tREFERENCING NEW AS NEWROW\n \tFOR EACH ROW\n \t\tUPDATE ' \
                        f'{schema_table_name} SET ('

@@ -1068,7 +1085,8 @@ def create_prediction_trigger(splice_context, schema_table_name, run_id, feature
     elif model_type == H2OModelType.KEY_VALUE:
         prediction_call = 'MLMANAGER.PREDICT_KEY_VALUE'

-    SQL_PRED_TRIGGER = f'CREATE TRIGGER runModel_{schema_table_name.replace(".", "_")}_{run_id}\n \tBEFORE INSERT\n ' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PRED_TRIGGER = f'CREATE TRIGGER {schema}.runModel_{schema_table_name.replace(".", "_")}_{run_id}\n \tBEFORE INSERT\n ' \
                        f'\tON {schema_table_name}\n \tREFERENCING NEW AS NEWROW\n \tFOR EACH ROW\n \tBEGIN ATOMIC \t\t' \
                        f'SET NEWROW.PREDICTION='

@@ -1105,7 +1123,8 @@ def create_parsing_trigger(splice_context, schema_table_name, primary_key, run_i
     :param model_type: (Enum) the model type (H2OModelType or SparkModelType)
     :param verbose: (bool) whether to print the SQL query
     """
-    SQL_PARSE_TRIGGER = f'CREATE TRIGGER PARSERESULT_{schema_table_name.replace(".", "_")}_{run_id}' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PARSE_TRIGGER = f'CREATE TRIGGER {schema}.PARSERESULT_{schema_table_name.replace(".", "_")}_{run_id}' \
                         f'\n \tBEFORE INSERT\n \tON {schema_table_name}\n \tREFERENCING NEW AS NEWROW\n' \
                         f' \tFOR EACH ROW\n \t\tBEGIN ATOMIC\n\t set '
     set_prediction_case_str = 'NEWROW.PREDICTION=\n\t\tCASE\n'
```
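
All three trigger builders now derive the schema from `schema_table_name` and qualify the trigger name with it, so the `CREATE TRIGGER` lands in the target table's schema rather than the session's default schema. An illustration of the name change, using a hypothetical table RETAIL.CLV_PREDICTIONS and run ID abc123:

```python
schema_table_name = 'RETAIL.CLV_PREDICTIONS'
run_id = 'abc123'

schema = schema_table_name.split('.')[0]
old_name = f'runModel_{schema_table_name.replace(".", "_")}_{run_id}'
new_name = f'{schema}.runModel_{schema_table_name.replace(".", "_")}_{run_id}'

print(old_name)  # runModel_RETAIL_CLV_PREDICTIONS_abc123 -> created in the default schema
print(new_name)  # RETAIL.runModel_RETAIL_CLV_PREDICTIONS_abc123 -> created in RETAIL
```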
```diff
@@ -1187,11 +1206,57 @@ def get_df_for_mleap(splice_ctx: PySpliceContext,

     return df

+def add_model_to_metadata(splice_context: PySpliceContext,
+                          run_id: str,
+                          schema_table_name: str) -> None:
+
+    if splice_context.tableExists(f'{SQL.MLMANAGER_SCHEMA}.MODEL_METADATA'):
+        schema_table_name = schema_table_name.upper()
+        schema, table = schema_table_name.split('.')
+
+        table_id = splice_context.df(f"select a.tableid from sys.systables a join sys.sysschemas b on a.schemaid=b.schemaid "
+                                     f"where a.tablename='{table}' and b.schemaname='{schema}'").collect()[0][0]
+
+        trigger_name_1 = f"RUNMODEL_{schema_table_name.replace('.','_')}_{run_id}".upper()
+        trigger_id_1, create_ts = splice_context.df(f"select triggerid, varchar(creationtimestamp) from sys.systriggers "
+                                                    f"where triggername='{trigger_name_1}' and tableid='{table_id}'")\
+                                                    .collect()[0]

-def drop_tables_on_failure(splice_context, schema_table_name, run_id) -> None:
+        # Not all models will have a second trigger
+        trigger_name_2 = f"PARSERESULT_{schema_table_name.replace('.', '_')}_{run_id}".upper()
+        trigger_id_2 = splice_context.df(f"select triggerid from sys.systriggers where triggername='{trigger_name_2}' "
+                                         f"and tableid='{table_id}'").collect()
+
+        # Adding the single quotes to trigger_id_2 here so the NULL case stays unquoted
+        trigger_id_2 = f"'{trigger_id_2[0][0]}'" if trigger_id_2 else 'NULL'
+
+        # We don't add quotes around trigger_id_2 below because the NULL case is handled above
+        splice_context.execute(f"INSERT INTO {SQL.MLMANAGER_SCHEMA}.MODEL_METADATA"
+                               f"(RUN_UUID, ACTION, TABLEID, TRIGGER_TYPE, TRIGGERID, TRIGGERID_2, DB_ENV, DB_USER, ACTION_DATE)"
+                               f"values ('{run_id}', 'DEPLOYED', '{table_id}', 'INSERT', '{trigger_id_1}', {trigger_id_2},"
+                               f"'PROD', '{get_user()}', '{create_ts}')")
+
+
+
+def drop_tables_on_failure(splice_context: PySpliceContext,
+                           schema_table_name: str,
+                           run_id: str,
+                           model_already_exists: bool) -> None:
     """
-    Drop the tables if the db deployment fails
+    Due to limitations (DB-7726) we can't fully utilize a single consistent JDBC connection using NSDS,
+    so we try to roll back on failure using basic logic.
+
+    If the model was already in the MODELS table (i.e. it had been deployed before), we leave it. Otherwise, delete it.
+    Leave the tables.
     """
-    splice_context.execute(f'DROP TABLE IF EXISTS {schema_table_name}')
-    splice_context.execute(f'DROP TABLE IF EXISTS {schema_table_name}_preds')
-    splice_context.execute(f'DELETE FROM {SQL.MLMANAGER_SCHEMA}.MODELS WHERE RUN_UUID=\'{run_id}\'')
+
+    # splice_context.execute(f'DROP TABLE IF EXISTS {schema_table_name}')
+    if not model_already_exists:
+        splice_context.execute(f'DELETE FROM {SQL.MLMANAGER_SCHEMA}.MODELS WHERE RUN_UUID=\'{run_id}\'')
+
+ModelUtils = {
+    DBLibraries.MLeap: SparkUtils,
+    DBLibraries.H2OMOJO: H2OUtils,
+    DBLibraries.Keras: KerasUtils,
+    DBLibraries.SKLearn: SKUtils,
+}
```
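
`add_model_to_metadata` resolves the table and trigger IDs from the Splice system catalogs (sys.systables, sys.sysschemas, sys.systriggers) and records the deployment in MLMANAGER.MODEL_METADATA; `TRIGGERID_2` becomes the unquoted literal `NULL` when no parsing trigger exists, which is why quoting is applied before the INSERT is built. A sketch of the resulting statement with made-up catalog IDs and timestamp (illustrative values only):

```python
run_id = 'abc123'
table_id = 'e1f2a3b4-0000-1111-2222-333344445555'      # made-up sys.systables ID
trigger_id_1 = 'c9d8e7f6-0000-1111-2222-333344445555'  # made-up sys.systriggers ID
trigger_id_2 = 'NULL'  # no PARSERESULT trigger for this model, so no quotes added

sql = (f"INSERT INTO MLMANAGER.MODEL_METADATA"
       f"(RUN_UUID, ACTION, TABLEID, TRIGGER_TYPE, TRIGGERID, TRIGGERID_2, DB_ENV, DB_USER, ACTION_DATE)"
       f"values ('{run_id}', 'DEPLOYED', '{table_id}', 'INSERT', '{trigger_id_1}', {trigger_id_2},"
       f"'PROD', 'splice', '2020-06-01 12:00:00.0')")
print(sql)
```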
