Skip to content
This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit 48c0a2a

Browse files
author
Epstein
authored
Release 2.1.0 k8 (#56)
* Dbaas 3689 (#52) * DBAAS-3689: using potential IndexToString to try to get class labels for spark pipeline * more specific check on model * more specific check on model * edge case * escaping labels * more escaping * wrong escaping :/ * incorrect assumption of spark model * code cleanup and pass logic to scala (#53) * function cleanup and pass logic to scala * python to scala list * Dbaas 3804 (#55) * initial sklearn deploy code * more config for sklearn * support pandas df * typing * syntax * returning file_ext but shouldn't * fixing model insert for sklearn * signature object needs parameters * predict_args not predict_params * missing logic for prediction table * sql formatting * edge cases * elif to if * base case * more work around pipelines * more validation of sklearn_args * set comparison * sklearn_args cleanup * fix for pipeline model type function * need file ext
1 parent 22a52fa commit 48c0a2a

File tree

6 files changed

+410
-164
lines changed

6 files changed

+410
-164
lines changed

splicemachine/mlflow_support/constants.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
class DBLibraries():
77
MLeap = 'mleap'
88
H2OMOJO = 'h2omojo'
9+
SKLearn = 'sklearn'
910
SUPPORTED_LIBRARIES = [MLeap, H2OMOJO]
1011

1112
class H2OModelType(Enum): # Based off https://github.com/h2oai/h2o-3/blob/master/h2o-genmodel/src/main/java/hex/ModelCategory.java
@@ -24,6 +25,16 @@ class SparkModelType(Enum):
2425
CLUSTERING_WITH_PROB = 2
2526
CLUSTERING_WO_PROB = 3
2627

28+
class SklearnModelType(Enum):
29+
"""
30+
Model Types for SKLearn models
31+
Sklearn's model categories aren't as well defined, so we classify models by their return values
32+
"""
33+
POINT_PREDICTION_REG = 0
34+
POINT_PREDICTION_CLF = 1
35+
KEY_VALUE = 2
36+
37+
2738
class FileExtensions():
2839
"""
2940
Class containing names for

splicemachine/mlflow_support/mlflow_support.py

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from requests.auth import HTTPBasicAuth
1111
from mleap.pyspark import spark_support
1212
import pyspark
13-
from pyspark.ml.base import Estimator as SparkModel
1413
import sklearn
1514
from sklearn.base import BaseEstimator as ScikitModel
1615
from tensorflow import __version__ as tf_version
@@ -21,6 +20,8 @@
2120
from splicemachine.mlflow_support.utilities import *
2221
from splicemachine.spark.context import PySpliceContext
2322
from splicemachine.spark.constants import CONVERSIONS
23+
from pyspark.sql.dataframe import DataFrame as SparkDF
24+
from pandas.core.frame import DataFrame as PandasDF
2425

2526
_TESTING = env_vars.get("TESTING", False)
2627
_TRACKING_URL = get_pod_uri("mlflow", "5001", _TESTING)
@@ -277,9 +278,9 @@ def _log_model_params(pipeline_or_model):
277278
for param in verbose_parameters:
278279
try:
279280
value = float(verbose_parameters[param])
280-
mlflow.log_param('Hyperparameter- ' + param.split('-')[0], value)
281+
mlflow.log_param(param.split('-')[0], value)
281282
except:
282-
mlflow.log_param('Hyperparameter- ' + param.split('-')[0], verbose_parameters[param])
283+
mlflow.log_param(param.split('-')[0], verbose_parameters[param])
283284

284285

285286
@_mlflow_patch('timer')
@@ -292,12 +293,14 @@ def _timer(timer_name, param=True):
292293
:return:
293294
"""
294295
try:
296+
print(f'Starting Code Block {timer_name}...', end=' ')
295297
t0 = time.time()
296298
yield
297299
finally:
298300
t1 = time.time() - t0
299301
# Syntactic Sugar
300302
(mlflow.log_param if param else mlflow.log_metric)(timer_name, t1)
303+
print('Done.')
301304
print(
302305
f"Code Block {timer_name}:\nRan in {round(t1, 3)} secs\nRan in {round(t1 / 60, 3)} mins"
303306
)
@@ -320,7 +323,7 @@ def _download_artifact(name, local_path, run_id=None):
320323
file_ext = path.splitext(local_path)[1]
321324

322325
run_id = run_id or mlflow.active_run().info.run_uuid
323-
blob_data, f_etx = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)
326+
blob_data, f_ext = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)
324327

325328
if not file_ext: # If the user didn't provide the file (ie entered . as the local_path), fill it in for them
326329
local_path += f'/{name}.{f_etx}'
@@ -508,31 +511,50 @@ def _deploy_azure(endpoint_name, resource_group, workspace, run_id=None, region=
508511
return _initiate_job(request_payload, '/api/rest/initiate')
509512

510513
@_mlflow_patch('deploy_database')
511-
def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
512-
run_id=None, classes=None, verbose=False, replace=False) -> None:
513-
"""
514-
Function to deploy a trained (Spark for now) model to the Database. This creates 2 tables: One with the features of the model, and one with the prediction and metadata.
514+
def _deploy_db(fittedModel,
515+
df,
516+
db_schema_name,
517+
db_table_name,
518+
primary_key,
519+
run_id: str=None,
520+
classes=None,
521+
sklearn_args={},
522+
verbose=False,
523+
replace=False) -> None:
524+
"""
525+
Function to deploy a trained (currently Spark, Sklearn or H2O) model to the Database.
526+
This creates 2 tables: One with the features of the model, and one with the prediction and metadata.
515527
They are linked with a column called MOMENT_ID
516528
517-
:param fittedPipe: (spark pipeline or model) The fitted pipeline to deploy
529+
:param fittedModel: (ML pipeline or model) The fitted pipeline to deploy
518530
:param df: (Spark DF) The dataframe used to train the model
519531
NOTE: this dataframe should NOT be transformed by the model. The columns in this df are the ones
520532
that will be used to create the table.
521533
:param db_schema_name: (str) the schema name to deploy to. If None, the currently set schema will be used.
522534
:param db_table_name: (str) the table name to deploy to. If none, the run_id will be used for the table name(s)
523535
:param primary_key: (List[Tuple[str, str]]) List of column + SQL datatype to use for the primary/composite key
524536
:param run_id: (str) The active run_id
525-
:param classes: List[str] The classes (prediction values) for the model being deployed.
526-
NOTE: If not supplied, the table will have column named c0,c1,c2 etc for each class
527-
:param verbose: bool Whether or not to print out the queries being created. Helpful for debugging
537+
:param classes: (List[str]) The classes (prediction labels) for the model being deployed.
538+
NOTE: If not supplied, the table will have default column names for each class
539+
:param sklearn_args: (dict{str: str}) Prediction options for sklearn models
540+
Available key value options:
541+
'predict_call': 'predict', 'predict_proba', or 'transform'
542+
- Determines the function call for the model
543+
If blank, predict will be used
544+
(or transform if model doesn't have predict)
545+
'predict_args': 'return_std' or 'return_cov' - For Bayesian and Gaussian models
546+
Only one can be specified
547+
If the model does not have the option specified, it will be ignored.
548+
:param verbose: (bool) Whether or not to print out the queries being created. Helpful for debugging
549+
:param replace: (bool) whether or not to replace a currently existing model. This param does not yet work
528550
529551
This function creates the following:
530552
* Table (default called DATA_{run_id}) where run_id is the run_id of the mlflow run associated to that model. This will have a column for each feature in the feature vector as well as a MOMENT_ID as primary key
531553
* Table (default called DATA_{run_id}_PREDS) That will have the columns:
532554
USER which is the current user who made the request
533555
EVAL_TIME which is the CURRENT_TIMESTAMP
534556
MOMENT_ID same as the DATA table to link predictions to rows in the table
535-
PREDICTION. The prediction of the model. If the :classes: param is not filled in, this will be c0,c1,c2 etc for classification models
557+
PREDICTION. The prediction of the model. If the :classes: param is not filled in, this will be default values for classification models
536558
A column for each class of the predictor with the value being the probability/confidence of the model if applicable
537559
* A trigger that runs on (after) insertion to the data table that runs an INSERT into the prediction table,
538560
calling the PREDICT function, passing in the row of data as well as the schema of the dataset, and the run_id of the model to run
@@ -541,10 +563,14 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
541563
"""
542564
_check_for_splice_ctx()
543565
classes = classes if classes else []
566+
544567
run_id = run_id if run_id else mlflow.active_run().info.run_uuid
545568
db_table_name = db_table_name if db_table_name else f'data_{run_id}'
546569
schema_table_name = f'{db_schema_name}.{db_table_name}' if db_schema_name else db_table_name
547-
assert type(df) is pyspark.sql.dataframe.DataFrame, "Dataframe must be a PySpark dataframe!"
570+
assert type(df) in (SparkDF, PandasDF), "Dataframe must be a PySpark or Pandas dataframe!"
571+
572+
if type(df) == PandasDF:
573+
df = mlflow._splice_context.spark_session.createDataFrame(df)
548574

549575
feature_columns = df.columns
550576
# Get the datatype of each column in the dataframe
@@ -553,14 +579,13 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
553579
# Make sure primary_key is valid format
554580
validate_primary_key(primary_key)
555581

556-
557-
# library = get_model_library(run_id)
558-
typ = str(type(fittedPipe))
559-
library = 'mleap' if 'pyspark' in typ else 'h2omojo' if 'h2o' in typ else None
582+
library = get_model_library(fittedModel)
560583
if library == DBLibraries.MLeap:
561-
modelType, classes = SparkUtils.prep_model_for_deployment(mlflow._splice_context, fittedPipe, df, classes, run_id)
584+
model_type, classes = SparkUtils.prep_model_for_deployment(mlflow._splice_context, fittedModel, df, classes, run_id)
562585
elif library == DBLibraries.H2OMOJO:
563-
modelType, classes = H2OUtils.prep_model_for_deployment(mlflow._splice_context, fittedPipe, classes, run_id)
586+
model_type, classes = H2OUtils.prep_model_for_deployment(mlflow._splice_context, fittedModel, classes, run_id)
587+
elif library == DBLibraries.SKLearn:
588+
model_type, classes = SKUtils.prep_model_for_deployment(mlflow._splice_context, fittedModel, classes, run_id, sklearn_args)
564589
else:
565590
raise SpliceMachineException('Model type is not supported for in DB Deployment!. '
566591
'Currently, model must be H2O or Spark.')
@@ -581,23 +606,24 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
581606

582607
# Create table 2: DATA_PREDS
583608
print('Creating prediction table ...', end=' ')
584-
create_data_preds_table(mlflow._splice_context, run_id, schema_table_name, classes, primary_key, modelType, verbose)
609+
create_data_preds_table(mlflow._splice_context, run_id, schema_table_name, classes, primary_key, model_type, verbose)
585610
print('Done.')
586611

587612
# Create Trigger 1: model prediction
588613
print('Creating model prediction trigger ...', end=' ')
589-
if modelType == H2OModelType.KEY_VALUE_RETURN:
590-
create_vti_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types, schema_str, primary_key, classes, verbose)
614+
if model_type in (H2OModelType.KEY_VALUE_RETURN, SklearnModelType.KEY_VALUE):
615+
create_vti_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns,
616+
schema_types, schema_str, primary_key, classes, model_type, sklearn_args, verbose)
591617
else:
592618
create_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types,
593-
schema_str, primary_key, modelType, verbose)
619+
schema_str, primary_key, model_type, verbose)
594620
print('Done.')
595621

596-
if modelType in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB,
622+
if model_type in (SparkModelType.CLASSIFICATION, SparkModelType.CLUSTERING_WITH_PROB,
597623
H2OModelType.CLASSIFICATION):
598624
# Create Trigger 2: model parsing
599625
print('Creating parsing trigger ...', end=' ')
600-
create_parsing_trigger(mlflow._splice_context, schema_table_name, primary_key, run_id, classes, modelType, verbose)
626+
create_parsing_trigger(mlflow._splice_context, schema_table_name, primary_key, run_id, classes, model_type, verbose)
601627
print('Done.')
602628
except Exception as e:
603629
import traceback

0 commit comments

Comments
 (0)