@@ -53,7 +53,10 @@ class H2OUtils:
     def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: H2OModel,
                                   classes: List[str],
-                                  run_id: str) -> (H2OModelType, List[str]):
+                                  run_id: str,
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str, str] or None) -> (H2OModelType, List[str]):
         """
         Gets the H2O mojo model
         Gets the model type
@@ -69,7 +72,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
 
         # Get the H2O MOJO model and insert it into the MODELS table
         h2omojo, rawmojo = H2OUtils.get_h2omojo_model(splice_context, model)
-        H2OUtils.insert_h2omojo_model(splice_context, run_id, h2omojo)
+        model_already_exists = H2OUtils.insert_h2omojo_model(splice_context, run_id, h2omojo)
 
         # Get model type
         model_type, model_category = H2OUtils.get_model_type(h2omojo)
@@ -106,7 +109,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         elif model_category == 'AnomalyDetection':
             classes = ['score', 'normalizedScore']
 
-        return model_type, classes
+        return model_type, classes, model_already_exists
 
     @staticmethod
     def get_model_type(h2omojo: object) -> (H2OModelType, str):
@@ -153,14 +156,14 @@ def load_h2o_model(model_blob: bytes) -> H2OModel:
         return model
 
     @staticmethod
-    def insert_h2omojo_model(splice_context: PySpliceContext, run_id: str, model: object) -> None:
+    def insert_h2omojo_model(splice_context: PySpliceContext, run_id: str, model: object) -> bool:
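+        # Serializes the Java MOJO wrapper to bytes via the JVM gateway so it can be
+        # stored through insert_model, which now reports whether a row already existed.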
         baos = splice_context.jvm.java.io.ByteArrayOutputStream()
         oos = splice_context.jvm.java.io.ObjectOutputStream(baos)
         oos.writeObject(model)
         oos.flush()
         oos.close()
         byte_array = baos.toByteArray()
-        insert_model(splice_context, run_id, byte_array, 'h2omojo', h2o.__version__)
+        return insert_model(splice_context, run_id, byte_array, 'h2omojo', h2o.__version__)
 
 
 class SKUtils:
@@ -174,9 +177,9 @@ def load_sklearn_model(model_blob: bytes):
         return load_pickle_string(model_blob)
 
     @staticmethod
-    def insert_sklearn_model(splice_context: PySpliceContext, run_id: str, model: ScikitModel) -> None:
+    def insert_sklearn_model(splice_context: PySpliceContext, run_id: str, model: ScikitModel) -> bool:
         byte_stream = save_pickle_string(model)
-        insert_model(splice_context, run_id, byte_stream, 'sklearn', sklearn_version)
+        return insert_model(splice_context, run_id, byte_stream, 'sklearn', sklearn_version)
 
     @staticmethod
     def validate_sklearn_args(model: ScikitModel, sklearn_args: Dict[str, str]) -> Dict[str, str]:
@@ -207,7 +210,11 @@ def validate_sklearn_args(model: ScikitModel, sklearn_args: Dict[str, str]) -> D
                 t = ('return_std', 'return_cov')
                 exc = f'predict_args value is invalid. Available options are {t}'
             else:
-                model_params = get_model_params(model.predict) if hasattr(model, 'predict') else get_model_params(model.transform)
+                if isinstance(model, SKPipeline):  # If we are working with a Pipeline, check the last step for arguments
+                    m = model.steps[-1][-1]
+                    model_params = get_model_params(m.predict) if hasattr(m, 'predict') else get_model_params(m.transform)
+                else:
+                    model_params = get_model_params(model.predict) if hasattr(model, 'predict') else get_model_params(model.transform)
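+            # Illustration (hypothetical models, not part of this patch): for
+            #   SKPipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])
+            # steps[-1] is the ('clf', LogisticRegression()) tuple, so steps[-1][-1] is the
+            # final estimator whose predict()/transform() parameters get inspected here.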
             if p not in model_params.parameters:
                 exc = f'predict_args set to {p} but that parameter is not available for this model!'
         elif sklearn_args and 'predict_args' not in sklearn_args and 'predict_call' not in sklearn_args:
@@ -224,12 +231,14 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: ScikitModel,
                                   classes: List[str],
                                   run_id: str,
-                                  sklearn_args: Dict[str, str]) -> (SklearnModelType, List[str]):
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str, str] or None) -> (SklearnModelType, List[str]):
 
         sklearn_args = SKUtils.validate_sklearn_args(model, sklearn_args)
 
         model_type = SKUtils.get_model_type(model, sklearn_args)
-        SKUtils.insert_sklearn_model(splice_context, run_id, model)
+        model_already_exists = SKUtils.insert_sklearn_model(splice_context, run_id, model)
         if classes and model_type != SklearnModelType.KEY_VALUE:
             print('Prediction labels found but model is not type Classification. Removing labels')
             classes = None
@@ -260,7 +269,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         if classes:
             print(f'Prediction labels found. Using {classes} as labels for predictions {list(range(0, len(classes)))} respectively')
 
-        return model_type, classes
+        return model_type, classes, model_already_exists
 
     @staticmethod
     def get_pipeline_model_type(pipeline: SKPipeline) -> SklearnModelType:
@@ -322,12 +331,12 @@ def load_keras_model(model_blob):
         return load_kr_model(hfile)
 
     @staticmethod
-    def insert_keras_model(splice_context: PySpliceContext, run_id: str, model: KerasModel) -> None:
+    def insert_keras_model(splice_context: PySpliceContext, run_id: str, model: KerasModel) -> bool:
         model.save('/tmp/model.h5')
         with open('/tmp/model.h5', 'rb') as f:
             byte_stream = bytearray(bytes(f.read()))
-        insert_model(splice_context, run_id, byte_stream, 'keras', KERAS_VERSION)
         remove('/tmp/model.h5')
+        return insert_model(splice_context, run_id, byte_stream, 'keras', KERAS_VERSION)
 
     @staticmethod
     def get_keras_model_type(model: KerasModel, pred_threshold: float) -> KerasModelType:
@@ -364,7 +373,9 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
                                   model: KerasModel,
                                   classes: List[str],
                                   run_id: str,
-                                  pred_threshold: float) -> (KerasModelType, List[str]):
+                                  df: SparkDF or None,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str, str] or None) -> (KerasModelType, List[str]):
         """
         Inserts the model into the MODELS table for deployment
         Gets the Keras model type
@@ -377,7 +388,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         :return: (KerasModelType, List[str]) the modelType and the classes
         """
         KerasUtils.validate_keras_model(model)
-        KerasUtils.insert_keras_model(splice_context, run_id, model)
+        model_already_exists = KerasUtils.insert_keras_model(splice_context, run_id, model)
         model_type: KerasModelType = KerasUtils.get_keras_model_type(model, pred_threshold)
         if model_type == KerasModelType.KEY_VALUE:
             output_shape = model.layers[-1].output_shape
@@ -390,7 +401,7 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
             classes = ['prediction'] + classes
         if len(classes) > 2 and pred_threshold:
             print(f"Found multiclass model with pred_threshold {pred_threshold}. Ignoring threshold.")
-        return model_type, classes
+        return model_type, classes, model_already_exists
 
 
 
@@ -610,9 +621,11 @@ def load_spark_model(splice_ctx, spark_pipeline_blob):
     @staticmethod
     def prep_model_for_deployment(splice_context: PySpliceContext,
                                   fittedPipe: PipelineModel,
-                                  df: SparkDF,
                                   classes: List[str],
-                                  run_id: str) -> (SparkModelType, List[str]):
+                                  run_id: str,
+                                  df: SparkDF,
+                                  pred_threshold: float or None,
+                                  sklearn_args: Dict[str, str] or None) -> (SparkModelType, List[str]):
         """
         All preprocessing steps to prepare for in-DB deployment. Get the mleap model, get class labels
         :param fittedPipe:
@@ -644,9 +657,9 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
         df = fittedPipe.transform(df)
         # Get the Mleap model and insert it into the MODELS table
         mleap_model = get_mleap_model(splice_context, fittedPipe, df, run_id)
-        insert_mleap_model(splice_context, run_id, mleap_model)
+        model_already_exists = insert_mleap_model(splice_context, run_id, mleap_model)
 
-        return model_type, classes
+        return model_type, classes, model_already_exists
 
 
 def get_model_library(model) -> DBLibraries:
@@ -697,7 +710,7 @@ def get_user():
697710 " Cloud Jupyter is currently unsupported" )
698711
699712
700- def insert_model (splice_context : PySpliceContext , run_id : str , byte_array : bytearray , library : str , version : str ) -> None :
713+ def insert_model (splice_context : PySpliceContext , run_id : str , byte_array : bytearray , library : str , version : str ) -> bool :
701714 """
702715 Insert a serialized model into the Mlmanager models table
703716 :param splice_context: pysplicectx
@@ -713,6 +726,7 @@ def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytea
     if model_exists:
         print(
             'A model with this run ID is already deployed. We are NOT replacing it. We will use the currently existing model.\nTo replace, use a new run_id')
+        return True
 
     else:
         db_connection = splice_context.getConnection()
@@ -728,6 +742,7 @@ def insert_model(splice_context: PySpliceContext, run_id: str, byte_array: bytea
 
         prepared_statement.execute()
         prepared_statement.close()
+        return False
 
 
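+# Usage sketch (illustrative, not part of this patch): the returned flag feeds the
+# failure path so a pre-existing MODELS row is never deleted during rollback:
+#   model_already_exists = insert_model(splice_context, run_id, byte_array, 'keras', KERAS_VERSION)
+#   ...
+#   drop_tables_on_failure(splice_context, schema_table_name, run_id, model_already_exists)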
 def insert_artifact(splice_context, name, byte_array, run_uuid, file_ext=None):
@@ -793,7 +808,9 @@ def get_mleap_model(splice_context, fittedPipe, df, run_id: str):
     return obj
 
 
-def insert_mleap_model(splice_context, run_id, model):
+def insert_mleap_model(splice_context: PySpliceContext,
+                       run_id: str,
+                       model: PipelineModel or SparkModel) -> bool:
     """
     Insert an MLeap Transformer model into the database as a Blob
     :param splice_context: pysplicectx
@@ -809,7 +826,7 @@ def insert_mleap_model(splice_context, run_id, model):
     oos.flush()
     oos.close()
     byte_array = baos.toByteArray()
-    insert_model(splice_context, run_id, byte_array, 'mleap', MLEAP_VERSION)
+    return insert_model(splice_context, run_id, byte_array, 'mleap', MLEAP_VERSION)
 
 
 def validate_primary_key(splice_ctx: PySpliceContext,
@@ -994,8 +1011,8 @@ def create_vti_prediction_trigger(splice_context: PySpliceContext,
         prediction_call += f", '{pred_threshold}'"
 
     prediction_call += ')'
-
-    SQL_PRED_TRIGGER = f'CREATE TRIGGER runModel_{schema_table_name.replace(".", "_")}_{run_id}\n\tAFTER INSERT\n' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PRED_TRIGGER = f'CREATE TRIGGER {schema}.runModel_{schema_table_name.replace(".", "_")}_{run_id}\n\tAFTER INSERT\n' \
                        f'\tON {schema_table_name}\n\tREFERENCING NEW AS NEWROW\n\tFOR EACH ROW\n\t\tUPDATE ' \
                        f'{schema_table_name} SET ('
@@ -1068,7 +1085,8 @@ def create_prediction_trigger(splice_context, schema_table_name, run_id, feature
     elif model_type == H2OModelType.KEY_VALUE:
         prediction_call = 'MLMANAGER.PREDICT_KEY_VALUE'
 
-    SQL_PRED_TRIGGER = f'CREATE TRIGGER runModel_{schema_table_name.replace(".", "_")}_{run_id}\n\tBEFORE INSERT\n' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PRED_TRIGGER = f'CREATE TRIGGER {schema}.runModel_{schema_table_name.replace(".", "_")}_{run_id}\n\tBEFORE INSERT\n' \
                        f'\tON {schema_table_name}\n\tREFERENCING NEW AS NEWROW\n\tFOR EACH ROW\n\tBEGIN ATOMIC \t\t' \
                        f'SET NEWROW.PREDICTION='
 
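+    # Illustration (hypothetical values, not in this patch): with schema_table_name
+    # 'SPLICE.IRIS' and run_id 'abc123', the generated DDL begins
+    #   CREATE TRIGGER SPLICE.runModel_SPLICE_IRIS_abc123
+    #       BEFORE INSERT ON SPLICE.IRIS REFERENCING NEW AS NEWROW FOR EACH ROW ...
+    # so the trigger is now created in the same schema as the target table.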
@@ -1105,7 +1123,8 @@ def create_parsing_trigger(splice_context, schema_table_name, primary_key, run_i
     :param model_type: (Enum) the model type (H2OModelType or SparkModelType)
     :param verbose: (bool) whether to print the SQL query
     """
-    SQL_PARSE_TRIGGER = f'CREATE TRIGGER PARSERESULT_{schema_table_name.replace(".", "_")}_{run_id}' \
+    schema = schema_table_name.split('.')[0]
+    SQL_PARSE_TRIGGER = f'CREATE TRIGGER {schema}.PARSERESULT_{schema_table_name.replace(".", "_")}_{run_id}' \
                         f'\n\tBEFORE INSERT\n\tON {schema_table_name}\n\tREFERENCING NEW AS NEWROW\n' \
                         f'\tFOR EACH ROW\n\t\tBEGIN ATOMIC\n\tset '
     set_prediction_case_str = 'NEWROW.PREDICTION=\n\t\tCASE\n'
@@ -1187,11 +1206,57 @@ def get_df_for_mleap(splice_ctx: PySpliceContext,
 
     return df
 
+def add_model_to_metadata(splice_context: PySpliceContext,
+                          run_id: str,
+                          schema_table_name: str) -> None:
+
+    if splice_context.tableExists(f'{SQL.MLMANAGER_SCHEMA}.MODEL_METADATA'):
+        schema_table_name = schema_table_name.upper()
+        schema, table = schema_table_name.split('.')
+
+        table_id = splice_context.df(f"select a.tableid from sys.systables a join sys.sysschemas b on a.schemaid=b.schemaid "
+                                     f"where a.tablename='{table}' and b.schemaname='{schema}'").collect()[0][0]
+
+        trigger_name_1 = f"RUNMODEL_{schema_table_name.replace('.', '_')}_{run_id}".upper()
+        trigger_id_1, create_ts = splice_context.df(f"select triggerid, varchar(creationtimestamp) from sys.systriggers "
+                                                    f"where triggername='{trigger_name_1}' and tableid='{table_id}'")\
+            .collect()[0]
 
-def drop_tables_on_failure(splice_context, schema_table_name, run_id) -> None:
+        # Not all models will have a second trigger
+        trigger_name_2 = f"PARSERESULT_{schema_table_name.replace('.', '_')}_{run_id}".upper()
+        trigger_id_2 = splice_context.df(f"select triggerid from sys.systriggers where triggername='{trigger_name_2}' "
+                                         f"and tableid='{table_id}'").collect()
+
+        # Quote trigger_id_2 ourselves so the no-trigger case can be passed through as SQL NULL
+        trigger_id_2 = f"'{trigger_id_2[0][0]}'" if trigger_id_2 else 'NULL'
+
+        # No quotes around trigger_id_2 in the INSERT because the NULL case is handled above
+        splice_context.execute(f"INSERT INTO {SQL.MLMANAGER_SCHEMA}.MODEL_METADATA "
+                               f"(RUN_UUID, ACTION, TABLEID, TRIGGER_TYPE, TRIGGERID, TRIGGERID_2, DB_ENV, DB_USER, ACTION_DATE) "
+                               f"values ('{run_id}', 'DEPLOYED', '{table_id}', 'INSERT', '{trigger_id_1}', {trigger_id_2}, "
+                               f"'PROD', '{get_user()}', '{create_ts}')")
+
+
+
+def drop_tables_on_failure(splice_context: PySpliceContext,
+                           schema_table_name: str,
+                           run_id: str,
+                           model_already_exists: bool) -> None:
11921245 """
1193- Drop the tables if the db deployment fails
1246+ Due to some limitations DB-7726 we can't use fully utilize a single consistent JDBC connection using NSDS
1247+ So we will try to rollback on failure using basic logic.
1248+
1249+ If the model was already in the models table (ie it had been deployed before), we will leave it. Otherwise, delete
1250+ Leave the tables.
11941251 """
1195- splice_context .execute (f'DROP TABLE IF EXISTS { schema_table_name } ' )
1196- splice_context .execute (f'DROP TABLE IF EXISTS { schema_table_name } _preds' )
1197- splice_context .execute (f'DELETE FROM { SQL .MLMANAGER_SCHEMA } .MODELS WHERE RUN_UUID=\' { run_id } \' ' )
1252+
1253+ # splice_context.execute(f'DROP TABLE IF EXISTS {schema_table_name}')
1254+ if not model_already_exists :
1255+ splice_context .execute (f'DELETE FROM { SQL .MLMANAGER_SCHEMA } .MODELS WHERE RUN_UUID=\' { run_id } \' ' )
1256+
+ModelUtils = {
+    DBLibraries.MLeap: SparkUtils,
+    DBLibraries.H2OMOJO: H2OUtils,
+    DBLibraries.Keras: KerasUtils,
+    DBLibraries.SKLearn: SKUtils,
+}
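+# Dispatch sketch (assumed usage, not shown in this diff): with the signatures
+# unified above, a caller can route by library and receive the new flag:
+#   lib = get_model_library(model)
+#   model_type, classes, model_already_exists = ModelUtils[lib].prep_model_for_deployment(
+#       splice_context, model, classes, run_id, df, pred_threshold, sklearn_args)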