This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit a3453cd

Authored by Epstein
Dbaas 3643 (#49)
* DBAAS-3642: made spark model saving and loading generic
* DBAAS-3643: model class is object, not string
* accounting for new columns
* fix version column
1 parent 4d68e10 commit a3453cd
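
In short, this commit swaps the Spark-only log_spark_model / load_spark_model patches for generic log_model / load_model entry points that accept either a fitted Spark pipeline or an H2O model, and records the model library and version alongside the stored bytes. A rough usage sketch of the patched API (the import style, Spark session, run name, and fitted_pipeline below are assumptions, not taken from this repo):

# Illustrative sketch only; the import style, Spark session, and fitted_pipeline are assumptions.
from splicemachine.spark.context import PySpliceContext
from splicemachine.mlflow_support.mlflow_support import mlflow  # module patched by apply_patches()

splice = PySpliceContext(spark)          # 'spark' is an existing SparkSession (assumed)
mlflow.register_splice_context(splice)   # required before logging or loading models

mlflow.start_run(run_name='dbaas-3643-demo')
mlflow.log_model(fitted_pipeline, name='model')   # accepts a Spark PipelineModel or an H2O model
run_id = mlflow.current_run_id()

reloaded = mlflow.load_model(run_id=run_id, name='model')

Either branch stores the serialized model as a BLOB in the MLMANAGER.ARTIFACTS table with a file extension ('sparkmodel' or 'h2omodel') that load_model later uses to pick the deserialization path.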

File tree: 2 files changed, +88 -55 lines

splicemachine/mlflow_support/mlflow_support.py

Lines changed: 46 additions & 49 deletions
@@ -2,14 +2,18 @@
 from collections import defaultdict
 from contextlib import contextmanager
 from io import BytesIO
-from os import path
+from os import path, remove
+from shutil import rmtree
 from zipfile import ZipFile
+from sys import version as py_version

 import gorilla
 import mlflow
 import requests
 from requests.auth import HTTPBasicAuth
 from mleap.pyspark import spark_support
+import h2o
+import pyspark

 from splicemachine.mlflow_support.utilities import *
 from splicemachine.spark.context import PySpliceContext
@@ -21,7 +25,7 @@
 _CLIENT = mlflow.tracking.MlflowClient(tracking_uri=_TRACKING_URL)

 _GORILLA_SETTINGS = gorilla.Settings(allow_hit=True, store_hit=True)
-
+_PYTHON_VERSION = py_version.split('|')[0].strip()

 def _mlflow_patch(name):
     """
@@ -110,8 +114,8 @@ def _lm(key, value):
     mlflow.log_metric(key, value)


-@_mlflow_patch('log_spark_model')
-def _log_spark_model(model, name='model'):
+@_mlflow_patch('log_model')
+def _log_model(model, name='model'):
     """
     Log a fitted spark pipeline or model
     :param model: (PipelineModel or Model) is the fitted Spark Model/Pipeline to store
@@ -120,26 +124,27 @@ def _log_spark_model(model, name='model'):
     """
     _check_for_splice_ctx()
     if _get_current_run_data().tags.get('splice.model_name'):  # this function has already run
-        raise Exception("Only one model is permitted per run.")
+        raise SpliceMachineException("Only one model is permitted per run.")

     mlflow.set_tag('splice.model_name', name)  # read in backend for deployment
-
-    jvm = mlflow._splice_context.jvm
-    java_import(jvm, "java.io.{BinaryOutputStream, ObjectOutputStream, ByteArrayInputStream}")
-
-    if not SparkUtils.is_spark_pipeline(model):
-        model = PipelineModel(
-            stages=[model]
-        )  # create a pipeline with only the model if a model is passed in
-
-    baos = jvm.java.io.ByteArrayOutputStream()  # serialize the PipelineModel to a byte array
-    oos = jvm.java.io.ObjectOutputStream(baos)
-    oos.writeObject(model._to_java())
-    oos.flush()
-    oos.close()
-    insert_artifact(mlflow._splice_context, name, baos.toByteArray(), mlflow.active_run().info.run_uuid,
-                    file_ext='sparkmodel')  # write the byte stream to the db as a BLOB
-
+    model_class = str(model.__class__)
+    mlflow.set_tag('splice.model_type', model_class)
+    mlflow.set_tag('splice.model_py_version', _PYTHON_VERSION)
+
+    run_id = mlflow.active_run().info.run_uuid
+    if 'h2o' in model_class.lower():
+        mlflow.set_tag('splice.h2o_version', h2o.__version__)
+        model_path = h2o.save_model(model=model, path='/tmp/model', force=True)
+        with open(model_path, 'rb') as artifact:
+            byte_stream = bytearray(bytes(artifact.read()))
+        insert_artifact(mlflow._splice_context, name, byte_stream, run_id, file_ext='h2omodel')
+        rmtree('/tmp/model')
+
+    elif 'spark' in model_class.lower():
+        mlflow.set_tag('splice.spark_version', pyspark.__version__)
+        SparkUtils.log_spark_model(mlflow._splice_context, model, name, run_id=run_id)
+    else:
+        raise SpliceMachineException('Currently we only support logging Spark and H2O models.')

 @_mlflow_patch('start_run')
 def _start_run(run_id=None, tags=None, experiment_id=None, run_name=None, nested=False):
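
A side note on the dispatch above (an observation, not part of the commit): the branch is chosen by substring-matching the lower-cased class path of the model, so any class living under the h2o or pyspark packages routes to the right serializer. A small self-contained illustration:

# Not from the commit -- illustrates why the lower-cased substring check on str(model.__class__) works.
def _route(model_class: str) -> str:
    if 'h2o' in model_class.lower():
        return 'h2o branch'
    elif 'spark' in model_class.lower():
        return 'spark branch'
    return 'unsupported'

# A Spark pipeline reports "<class 'pyspark.ml.pipeline.PipelineModel'>"
print(_route("<class 'pyspark.ml.pipeline.PipelineModel'>"))        # -> spark branch
# H2O estimators live under the h2o package, e.g. H2OGradientBoostingEstimator
print(_route("<class 'h2o.estimators.gbm.H2OGradientBoostingEstimator'>"))  # -> h2o branch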
@@ -284,21 +289,18 @@ def _download_artifact(name, local_path, run_id=None):
     _check_for_splice_ctx()
     file_ext = path.splitext(local_path)[1]

-    if not file_ext:
-        raise ValueError('local_path variable must contain the file extension!')
-
     run_id = run_id or mlflow.active_run().info.run_uuid
-    blob_data = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)
-    if file_ext == '.zip':
-        zip_file = ZipFile(BytesIO(blob_data))
-        zip_file.extractall()
-    else:
-        with open(local_path, 'wb') as artifact_file:
+    blob_data, f_etx = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)
+
+    if not file_ext:  # If the user didn't provide the file (ie entered . as the local_path), fill it in for them
+        local_path += f'/{name}.{f_etx}'
+
+    with open(local_path, 'wb') as artifact_file:
         artifact_file.write(blob_data)


-@_mlflow_patch('load_spark_model')
-def _load_spark_model(run_id=None, name='model'):
+@_mlflow_patch('load_model')
+def _load_model(run_id=None, name='model'):
     """
     Download a model from database
     and load it into Spark
@@ -308,17 +310,17 @@ def _load_spark_model(run_id=None, name='model'):
     """
     _check_for_splice_ctx()
     run_id = run_id or mlflow.active_run().info.run_uuid
-    spark_pipeline_blob = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)
-    bis = mlflow._splice_context.jvm.java.io.ByteArrayInputStream(spark_pipeline_blob)
-    ois = mlflow._splice_context.jvm.java.io.ObjectInputStream(bis)
-    pipeline = PipelineModel._from_java(ois.readObject())  # convert object from Java
-    # PipelineModel to Python PipelineModel
-    ois.close()
+    model_blob, file_ext = SparkUtils.retrieve_artifact_stream(mlflow._splice_context, run_id, name)

-    if len(pipeline.stages) == 1 and SparkUtils.is_spark_pipeline(pipeline.stages[0]):
-        pipeline = pipeline.stages[0]
+    if file_ext == 'sparkmodel':
+        model = SparkUtils.load_spark_model(mlflow._splice_context, model_blob)

-    return pipeline
+    elif file_ext == 'h2omodel':
+        with open('/tmp/model', 'wb') as file:
+            file.write(model_blob)
+        model = h2o.load_model('/tmp/model')
+        rmtree('/tmp/model')
+    return model


 @_mlflow_patch('log_artifact')
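
For orientation (not part of the diff): both patched functions above now lean on the stored file_extension, which fills in a default file name for downloads and selects the deserialization path for loads. A hedged usage sketch, with the run id and paths as placeholders:

# Sketch only; the run id and paths are placeholders, not values from this commit.
run_id = mlflow.current_run_id()

# Passing '.' (no extension) lets the stored file_extension complete the name,
# e.g. './model.sparkmodel' or './model.h2omodel'.
mlflow.download_artifact('model', '.', run_id=run_id)

# load_model dispatches on that same extension: 'sparkmodel' -> JVM deserialization,
# 'h2omodel' -> h2o.load_model on a temp file.
model = mlflow.load_model(run_id=run_id, name='model')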
@@ -510,11 +512,6 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,
     db_table_name = db_table_name if db_table_name else f'data_{run_id}'
     schema_table_name = f'{db_schema_name}.{db_table_name}' if db_schema_name else db_table_name

-    # Get the VectorAssembler so we can get the features of the model
-    # FIXME: this might not be correct. If transformations are made before hitting the VectorAssembler, they
-    # FIXME: Also need to be included in the columns of the table. We need the df columns + VectorAssembler inputCols
-    # FIXME: We can do something similar to the log_feature_transformations function to get necessary columns
-    # FIXME: Or, this may just be df.columns ...
     feature_columns = df.columns
     # Get the datatype of each column in the dataframe
     schema_types = {str(i.name): re.sub("[0-9,()]", "", str(i.dataType)) for i in df.schema}
@@ -562,7 +559,7 @@ def _deploy_db(fittedPipe, df, db_schema_name, db_table_name, primary_key,

     # Create table 2: DATA_PREDS
     print('Creating prediction table ...', end=' ')
-    create_data_preds_table(mlflow._splice_context, schema_table_name, classes, primary_key, modelType, verbose)
+    create_data_preds_table(mlflow._splice_context, run_id, schema_table_name, classes, primary_key, modelType, verbose)
     print('Done.')

     # Create Trigger 1: (model prediction)
@@ -594,7 +591,7 @@ def apply_patches():
     ALL GORILLA PATCHES SHOULD BE PREFIXED WITH "_" BEFORE THEIR DESTINATION IN MLFLOW
     """
     targets = [_register_splice_context, _lp, _lm, _timer, _log_artifact, _log_feature_transformations,
-               _log_model_params, _log_pipeline_stages, _log_spark_model, _load_spark_model, _download_artifact,
+               _log_model_params, _log_pipeline_stages, _log_model, _load_model, _download_artifact,
               _start_run, _current_run_id, _current_exp_id, _deploy_aws, _deploy_azure, _deploy_db, _login_director]

     for target in targets:

splicemachine/mlflow_support/utilities.py

Lines changed: 42 additions & 6 deletions
@@ -1,4 +1,4 @@
-from os import environ as env_vars, popen as rbash, system as bash
+from os import environ as env_vars, popen as rbash, system as bash, remove
 from sys import getsizeof
 import re

@@ -9,7 +9,9 @@
 from splicemachine.spark.constants import SQL_TYPES
 from splicemachine.mlflow_support.constants import SparkModelType
 from mleap.pyspark.spark_support import SimpleSparkSerializer
+import h2o

+from pyspark.ml.pipeline import PipelineModel

 class SpliceMachineException(Exception):
     pass
@@ -18,9 +20,9 @@ class SpliceMachineException(Exception):
 class SQL:
     MLMANAGER_SCHEMA = 'MLMANAGER'
     ARTIFACT_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.ARTIFACTS (run_uuid, name, "size", "binary", file_extension) VALUES (?, ?, ?, ?, ?)'
-    ARTIFACT_RETRIEVAL_SQL = 'SELECT "binary" FROM ' + f'{MLMANAGER_SCHEMA}.' + 'ARTIFACTS WHERE name=\'{name}\' ' \
+    ARTIFACT_RETRIEVAL_SQL = 'SELECT "binary", file_extension FROM ' + f'{MLMANAGER_SCHEMA}.' + 'ARTIFACTS WHERE name=\'{name}\' ' \
                              'AND run_uuid=\'{runid}\''
-    MODEL_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.MODELS(RUN_UUID, MODEL) VALUES (?, ?)'
+    MODEL_INSERT_SQL = f'INSERT INTO {MLMANAGER_SCHEMA}.MODELS(RUN_UUID, MODEL, LIBRARY, "version") VALUES (?, ?, ?, ?)'
     MODEL_RETRIEVAL_SQL = 'SELECT MODEL FROM {MLMANAGER_SCHEMA}.MODELS WHERE RUN_UUID=\'{run_uuid}\''

@@ -137,7 +139,7 @@ def retrieve_artifact_stream(splice_context, run_id, name):
        try:
            return splice_context.df(
                SQL.ARTIFACT_RETRIEVAL_SQL.format(name=name, runid=run_id)
-            ).collect()[0][0]
+            ).collect()[0]
        except IndexError as e:
            raise Exception(f"Unable to find the artifact with the given run id {run_id} and name {name}")

@@ -185,7 +187,37 @@ def get_model_type(pipeline_or_model):
            m_type = SparkModelType.CLUSTERING_WO_PROB

        return m_type
+    @staticmethod
+    def log_spark_model(splice_ctx, model, name, run_id):
+        jvm = splice_ctx.jvm
+        java_import(jvm, "java.io.{BinaryOutputStream, ObjectOutputStream, ByteArrayInputStream}")
+
+        if not SparkUtils.is_spark_pipeline(model):
+            model = PipelineModel(
+                stages=[model]
+            )  # create a pipeline with only the model if a model is passed in
+
+        baos = jvm.java.io.ByteArrayOutputStream()  # serialize the PipelineModel to a byte array
+        oos = jvm.java.io.ObjectOutputStream(baos)
+        oos.writeObject(model._to_java())
+        oos.flush()
+        oos.close()
+        insert_artifact(splice_ctx, name, baos.toByteArray(), run_id,
+                        file_ext='sparkmodel')  # write the byte stream to the db as a BLOB

+    @staticmethod
+    def load_spark_model(splice_ctx, spark_pipeline_blob):
+        jvm = splice_ctx.jvm
+        bis = jvm.java.io.ByteArrayInputStream(spark_pipeline_blob)
+        ois = jvm.java.io.ObjectInputStream(bis)
+        pipeline = PipelineModel._from_java(ois.readObject())  # convert object from Java
+        # PipelineModel to Python PipelineModel
+        ois.close()
+
+        if len(pipeline.stages) == 1 and not SparkUtils.is_spark_pipeline(pipeline.stages[0]):
+            pipeline = pipeline.stages[0]
+
+        return pipeline

 def find_inputs_by_output(dictionary, value):
     """
@@ -231,6 +263,9 @@ def insert_model(splice_context, run_id, byte_array):
     prepared_statement = db_connection.prepareStatement(SQL.MODEL_INSERT_SQL)
     prepared_statement.setString(1, run_id)
     prepared_statement.setBinaryStream(2, binary_input_stream)
+    # FIXME: Dynamically set this per model type (only mleap for now)
+    prepared_statement.setString(3, 'MLEAP')
+    prepared_statement.setString(4, '0.15.0')

     prepared_statement.execute()
     prepared_statement.close()
@@ -368,12 +403,13 @@ def create_data_table(splice_context, schema_table_name, schema_str, primary_key
     splice_context.execute(SQL_TABLE)


-def create_data_preds_table(splice_context, schema_table_name, classes, primary_key,
+def create_data_preds_table(splice_context, run_id, schema_table_name, classes, primary_key,
                             modelType, verbose):
     """
     Creates the data prediction table that holds the prediction for the rows of the data table
     :param splice_context: pysplicectx
     :param schema_table_name: (str) the schema.table to create the table under
+    :param run_id: (str) the run_id for this model
     :param classes: (List[str]) the labels of the model (if they exist)
     :param primary_key: List[Tuple[str,str]] column name, SQL datatype for the primary key(s) of the table
     :param modelType: (ModelType) Whether the model is a Regression, Classification or Clustering (with/without probabilities)
@@ -386,8 +422,8 @@ def create_data_preds_table(splice_context, schema_table_name, classes, primary_
     SQL_PRED_TABLE = f'''CREATE TABLE {schema_table_name}_PREDS (
     \tCUR_USER VARCHAR(50) DEFAULT CURRENT_USER,
     \tEVAL_TIME TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    \tRUN_ID VARCHAR(50) DEFAULT \'{run_id}\',
     '''
-    # FIXME: Add the run_id as a column with constant default value to always be the run_id
     pk_cols = ''
     for i in primary_key:
         SQL_PRED_TABLE += f'\t{i[0]} {i[1]},\n'
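
To make the new column concrete, the snippet below (illustration only; the schema, table, and key names are invented) mirrors how create_data_preds_table now builds the DDL, with the run id baked in as a constant DEFAULT so every prediction row is stamped with the model that produced it:

# Illustration only: schema/table/primary key names are invented for this example.
run_id = 'a3453cdexample'
schema_table_name = 'MYSCHEMA.IRIS'
sql_pred_table = f'''CREATE TABLE {schema_table_name}_PREDS (
\tCUR_USER VARCHAR(50) DEFAULT CURRENT_USER,
\tEVAL_TIME TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
\tRUN_ID VARCHAR(50) DEFAULT '{run_id}',
'''
sql_pred_table += '\tID INT,\n'   # primary key column(s) are appended in the loop
print(sql_pred_table)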
