Skip to content
This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit e3295c1

Browse files
author
Epstein
authored
Dbaas 3681 (#50)
* removing file not dir * db deployment: redesigning for h2o support * classes defined for each model type * returning model category for h2o * fixing prediction table and trigger to handle h2o types * missing function call * insert_model call in wrong code block * access prediction column values from mojo * used h2o model not mojo model * VTI for key_value predictions * syntax bug * some more formatting issues * passing in raw data wrong * run id not in quotes * more formatting * formatting * formatting * formatting * finalizing * cleaned up enums * hanging comments
1 parent a3453cd commit e3295c1

File tree

5 files changed

+253
-62
lines changed

5 files changed

+253
-62
lines changed

splicemachine/mlflow_support/constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
from enum import Enum
22

3+
# The MLeap package does not expose a __version__ attribute, so we pin the version here
4+
MLEAP_VERSION = '0.15.0'
5+
# When storing models for in-DB deployment, we save the model with a name.
6+
class DBLibraries():
7+
MLeap = 'mleap'
8+
H2OMOJO = 'h2omojo'
9+
SUPPORTED_LIBRARIES = [MLeap, H2OMOJO]
10+
11+
class H2OModelType(Enum): # Based on https://github.com/h2oai/h2o-3/blob/master/h2o-genmodel/src/main/java/hex/ModelCategory.java
12+
REGRESSION = 0 # Models that return a single Double value (Regression, HGLMRegression)
13+
SINGULAR = 1 # Models that return a single Int value (Clustering)
14+
CLASSIFICATION = 2 # Models that only return N classes with values associated (Binomial, Multinomial, Ordinal)
15+
KEY_VALUE_RETURN = 3 # Models whose output labels are known (AutoEncoder, TargetEncoder, DimReduction, WordEmbedding, AnomalyDetection)
16+
317

418
class SparkModelType(Enum):
519
"""

splicemachine/mlflow_support/mlflow_support.py

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import h2o
1616
import pyspark
1717

18+
from splicemachine.mlflow_support.constants import *
1819
from splicemachine.mlflow_support.utilities import *
1920
from splicemachine.spark.context import PySpliceContext
2021
from splicemachine.spark.constants import CONVERSIONS
@@ -67,7 +68,7 @@ def _check_for_splice_ctx():
6768
spark operations to take place
6869
"""
6970

70-
if not getattr(mlflow, '_splice_context'):
71+
if not hasattr(mlflow, '_splice_context'):
7172
raise SpliceMachineException(
7273
"You must run `mlflow.register_splice_context(py_splice_context) before "
7374
"you can run this mlflow operation!"
@@ -117,8 +118,8 @@ def _lm(key, value):
117118
@_mlflow_patch('log_model')
118119
def _log_model(model, name='model'):
119120
"""
120-
Log a fitted spark pipeline or model
121-
:param model: (PipelineModel or Model) is the fitted Spark Model/Pipeline to store
121+
Log a fitted spark pipeline/model or H2O model
122+
:param model: (PipelineModel or Model) is the fitted Spark Model/Pipeline or H2O model to store
122123
with the current run
123124
:param name: (str) the run relative name to store the model under
124125
"""
@@ -319,7 +320,7 @@ def _load_model(run_id=None, name='model'):
319320
with open('/tmp/model', 'wb') as file:
320321
file.write(model_blob)
321322
model = h2o.load_model('/tmp/model')
322-
rmtree('/tmp/model')
323+
remove('/tmp/model')
323324
return model
324325

325326

@@ -361,7 +362,7 @@ def _initiate_job(payload, endpoint):
361362
:param endpoint: (str) REST endpoint to target
362363
:return: (str) Response text from request
363364
"""
364-
if not getattr(mlflow, '_basic_auth'):
365+
if not hasattr(mlflow, '_basic_auth'):
365366
raise Exception(
366367
"You have not logged into MLManager director."
367368
" Please run mlflow.login_director(username, password)"
@@ -473,7 +474,6 @@ def _deploy_azure(endpoint_name, resource_group, workspace, run_id=None, region=
473474
}
474475
return _initiate_job(request_payload, '/api/rest/initiate')
475476

476-
477477
@_mlflow_patch('deploy_database')
478478
def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
479479
run_id=None, classes=None, verbose=False, replace=False) -> None:
@@ -511,6 +511,7 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
511511
run_id = run_id if run_id else mlflow.active_run().info.run_uuid
512512
db_table_name = db_table_name if db_table_name else f'data_{run_id}'
513513
schema_table_name = f'{db_schema_name}.{db_table_name}' if db_schema_name else db_table_name
514+
assert type(df) is pyspark.sql.dataframe.DataFrame, "Dataframe must be a PySpark dataframe!"
514515

515516
feature_columns = df.columns
516517
# Get the datatype of each column in the dataframe
@@ -519,37 +520,25 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
519520
# Make sure primary_key is valid format
520521
validate_primary_key(primary_key)
521522

522-
# Get model type
523-
modelType = SparkUtils.get_model_type(fittedPipe)
524523

525-
print(f'Deploying model {run_id} to table {schema_table_name}')
526-
527-
if classes:
528-
if modelType not in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB):
529-
print('Prediction labels found but model is not type Classification. Removing labels')
530-
classes = None
531-
else:
532-
# handling spaces in class names
533-
classes = [c.replace(' ', '_') for c in classes]
534-
print(
535-
f'Prediction labels found. Using {classes} as labels for predictions {list(range(0, len(classes)))} respectively')
524+
# library = get_model_library(run_id)
525+
typ = str(type(fittedPipe))
526+
library = 'mleap' if 'pyspark' in typ else 'h2omojo' if 'h2o' in typ else None
527+
if library == DBLibraries.MLeap:
528+
modelType, classes = SparkUtils.prep_model_for_deployment(mlflow._splice_context, fittedPipe, df, classes, run_id)
529+
elif library == DBLibraries.H2OMOJO:
530+
modelType, classes = H2OUtils.prep_model_for_deployment(mlflow._splice_context, fittedPipe, classes, run_id)
536531
else:
537-
if modelType in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB):
538-
# Add a column for each class of the prediction to output the probability of the prediction
539-
classes = [f'C{i}' for i in range(SparkUtils.get_num_classes(fittedPipe))]
540-
541-
# See if the df passed in has already been transformed.
542-
# If not, transform it
543-
if 'prediction' not in df.columns:
544-
df = fittedPipe.transform(df)
545-
# Get the Mleap model and insert it into the MODELS table
546-
mleap_model = get_mleap_model(mlflow._splice_context, fittedPipe, df, run_id)
547-
insert_mleap_model(mlflow._splice_context, run_id, mleap_model)
532+
raise SpliceMachineException('Model type is not supported for in DB Deployment!. '
533+
'Currently, model must be H2O or Spark.')
534+
535+
536+
print(f'Deploying model {run_id} to table {schema_table_name}')
548537

549538
# Create the schema of the table (we use this a few times)
550539
schema_str = ''
551540
for i in feature_columns:
552-
schema_str += f'\t{i} {CONVERSIONS[schema_types[str(i)]]},\n'
541+
schema_str += f'\t{i} {CONVERSIONS[schema_types[str(i)]]},'
553542

554543
try:
555544
# Create table 1: DATA
@@ -562,17 +551,20 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
562551
create_data_preds_table(mlflow._splice_context, run_id, schema_table_name, classes, primary_key, modelType, verbose)
563552
print('Done.')
564553

565-
# Create Trigger 1: (model prediction)
554+
# Create Trigger 1: model prediction
566555
print('Creating model prediction trigger ...', end=' ')
567-
create_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types,
568-
schema_str,
569-
primary_key, modelType, verbose)
556+
if modelType == H2OModelType.KEY_VALUE_RETURN:
557+
create_vti_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types, schema_str, primary_key, classes, verbose)
558+
else:
559+
create_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types,
560+
schema_str, primary_key, modelType, verbose)
570561
print('Done.')
571562

572-
if modelType in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB):
563+
if modelType in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB,
564+
H2OModelType.CLASSIFICATION):
573565
# Create Trigger 2: model parsing
574566
print('Creating parsing trigger ...', end=' ')
575-
create_parsing_trigger(mlflow._splice_context, schema_table_name, primary_key, run_id, classes, verbose)
567+
create_parsing_trigger(mlflow._splice_context, schema_table_name, primary_key, run_id, classes, modelType, verbose)
576568
print('Done.')
577569
except Exception as e:
578570
import traceback
@@ -581,6 +573,7 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
581573
if not verbose:
582574
print('For more insight into the SQL statement that generated this error, rerun with verbose=True')
583575
traceback.print_exc()
576+
raise SpliceMachineException('Model deployment failed.')
584577

585578
print('Model Deployed.')
586579

0 commit comments

Comments
 (0)