This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit 7b24863
Author: Ben Epstein
Dbaas 4183 (#76)

* model cols fix + better model checking for spark
* fix for issues 70 and 73
* fix in case user doesn't pass in model_cols
* py4j version
* better exception for serializeToBundle

1 parent 9faa3d1, commit 7b24863

File tree

4 files changed: +36 -12 lines changed

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-py4j==0.10.8.1
+py4j==0.10.7
 pytest==5.1.3
 mlflow==1.6.0
 mleap==0.15.0
@@ -11,7 +11,7 @@ numpy==1.18.2
 pandas==1.0.3
 scipy==1.4.1
 tensorflow==2.2.0
-pyspark==2.4.0
+pyspark
 h2o-pysparkling-2.4==3.28.1.2-1
 sphinx-tabs
 IPython
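PySpark 2.4.x bundles py4j 0.10.7 (the 0.10.8.1 line targets Spark 3.x), which is presumably why the pin moves down here while pyspark itself is left unpinned. A minimal environment sanity check, not part of this commit, assuming only that both packages are importable:

```python
# Verify that the installed py4j matches the version PySpark 2.4.x was built
# against; a mismatched py4j typically fails at JVM-gateway startup.
import pyspark
from py4j.version import __version__ as py4j_version

print(f'pyspark {pyspark.__version__} / py4j {py4j_version}')
if pyspark.__version__.startswith('2.4') and py4j_version != '0.10.7':
    raise RuntimeError('py4j version mismatch: pin py4j==0.10.7 for PySpark 2.4.x')
```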

splicemachine/mlflow_support/constants.py

Lines changed: 1 addition & 0 deletions
@@ -71,3 +71,4 @@ class ModelStatuses():
     deployed: str = 'DEPLOYED'
     deleted: str = 'DELETED'
     SUPPORTED_STATUSES = [deployed, deleted]
+

splicemachine/mlflow_support/mlflow_support.py

Lines changed: 13 additions & 5 deletions
@@ -641,8 +641,10 @@ def _deploy_db(db_schema_name,
                        Will ONLY be used if the table does not exist and a dataframe is passed in
     :param model_cols: (List[str]) The columns from the table to use for the model. If None, all columns in the table
                        will be passed to the model. If specified, the columns will be passed to the model
-                       IN THAT ORDER. The columns passed here must exist in the table.
-    :param classes: (List[str]) The classes (prediction labels) for the model being deployed.\n
+                       IN THAT ORDER. The columns passed here must exist in the table. If creating the
+                       table from a dataframe, the table will be created from the columns in the DF, not
+                       model_cols. model_cols is only used at prediction time
+    :param classes: (List[str]) The classes (prediction labels) for the model being deployed.
         NOTE: If not supplied, the table will have default column names for each class
     :param sklearn_args: (dict{str: str}) Prediction options for sklearn models: \n
         * Available key value options: \n
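A hypothetical call illustrating the clarified model_cols semantics. Keyword names are taken from the docstring above; `mlflow.deploy_db` assumes this module's convention of exposing `_deploy_db` on the mlflow object, and the exact public signature may differ:

```python
# The table MYSCHEMA.MYTABLE is created from ALL columns of training_df, while
# only AGE and INCOME are passed to the model, in that order, at prediction time.
mlflow.deploy_db('MYSCHEMA', 'MYTABLE',
                 run_id=run_id,                  # run_id: the MLflow run holding the model
                 df=training_df,                 # training_df: a Spark or Pandas dataframe
                 create_model_table=True,
                 primary_key={'ID': 'INTEGER'},
                 model_cols=['AGE', 'INCOME'])
```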
@@ -703,6 +705,8 @@ def _deploy_db(db_schema_name,

     schema_table_name = f'{db_schema_name}.{db_table_name}'

+    # Feature columns are all of the columns of the table, model_cols are the subset of feature columns that are used
+    # in predictions. schema_types contains all columns from feature_columns
     feature_columns, schema_types = get_feature_columns_and_types(mlflow._splice_context, df, create_model_table,
                                                                   model_cols, schema_table_name)

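To make the new comment concrete, a toy illustration of how the three pieces relate (values are made up):

```python
feature_columns = ['ID', 'AGE', 'INCOME']       # every column of the table
model_cols = ['AGE', 'INCOME']                  # subset actually fed to the model
schema_types = {'ID': 'IntegerType', 'AGE': 'IntegerType', 'INCOME': 'DoubleType'}
# model_cols is a subset of feature_columns; schema_types covers all of them
assert set(model_cols) <= set(feature_columns) == set(schema_types)
```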
@@ -725,7 +729,9 @@ def _deploy_db(db_schema_name,
     # Create the schema of the table (we use this a few times)
     schema_str = ''
     for i in feature_columns:
-        schema_str += f'\t{i} {CONVERSIONS[schema_types[str(i)]]},'
+        spark_data_type = schema_types[str(i)]
+        assert spark_data_type in CONVERSIONS, f'Type {spark_data_type} not supported for table creation. Remove column and try again'
+        schema_str += f'\t{i} {CONVERSIONS[spark_data_type]},'

     try:
         # Create/Alter table 1: DATA
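The new assert turns an unsupported Spark type into a readable failure instead of a bare KeyError. A self-contained sketch of the same guard, using a stand-in CONVERSIONS map (the real mapping lives in the library):

```python
# Stand-in CONVERSIONS map; the real one is defined by the library.
CONVERSIONS = {'IntegerType': 'INTEGER', 'DoubleType': 'DOUBLE', 'StringType': 'VARCHAR(5000)'}
schema_types = {'AGE': 'IntegerType', 'INCOME': 'DoubleType', 'PHOTO': 'BinaryType'}

schema_str = ''
for col in ['AGE', 'INCOME', 'PHOTO']:
    spark_data_type = schema_types[col]
    # Fail fast with a clear message rather than a KeyError inside the f-string
    assert spark_data_type in CONVERSIONS, \
        f'Type {spark_data_type} not supported for table creation. Remove column and try again'
    schema_str += f'\t{col} {CONVERSIONS[spark_data_type]},'
# Raises AssertionError at PHOTO (BinaryType) before any DDL is built.
```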
@@ -739,11 +745,13 @@ def _deploy_db(db_schema_name,

     # Create Trigger 1: model prediction
     print('Creating model prediction trigger ...', end=' ')
+    # If model_cols were passed in, we'll use them here. Otherwise, use all of the columns (stored in feature_columns)
+    model_cols = model_cols or feature_columns
     if model_type in (H2OModelType.KEY_VALUE, SklearnModelType.KEY_VALUE, KerasModelType.KEY_VALUE):
-        create_vti_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types,
+        create_vti_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, model_cols, schema_types,
                                       schema_str, primary_key, classes, model_type, sklearn_args, pred_threshold, verbose)
     else:
-        create_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, feature_columns, schema_types,
+        create_prediction_trigger(mlflow._splice_context, schema_table_name, run_id, model_cols, schema_types,
                                   schema_str, primary_key, model_type, verbose)
     print('Done.')

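The `model_cols = model_cols or feature_columns` line is the entire fix for a caller who never passes model_cols; in isolation:

```python
# `or` falls back to all table columns when the caller passes None (or []).
feature_columns = ['ID', 'AGE', 'INCOME']
model_cols = None
model_cols = model_cols or feature_columns
assert model_cols == ['ID', 'AGE', 'INCOME']
```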
splicemachine/mlflow_support/utilities.py

Lines changed: 20 additions & 5 deletions
@@ -13,6 +13,8 @@
 from pyspark.ml.base import Model as SparkModel
 from pyspark.ml.feature import IndexToString
 from pyspark.ml.wrapper import JavaModel
+from pyspark.ml import classification as spark_classification, regression as spark_regression, \
+    clustering as spark_clustering, recommendation as spark_recommendation
 from pyspark.sql.types import StructType
 from pyspark.sql.dataframe import DataFrame as SparkDF
 from pandas.core.frame import DataFrame as PandasDF
@@ -452,6 +454,8 @@ def prep_model_for_deployment(splice_context: PySpliceContext,


 class SparkUtils:
+    MODEL_MODULES = [spark_classification.__name__, spark_recommendation.__name__, spark_clustering.__name__,
+                     spark_regression.__name__]
     @staticmethod
     def get_stages(pipeline: PipelineModel):
         """
@@ -531,9 +535,10 @@ def get_model_stage(pipeline: PipelineModel) -> SparkModel:
         for i in SparkUtils.get_stages(pipeline):
             # StandardScaler is also implemented as a base Model and JavaModel for some reason but that's not a model
             # So we need to make sure the stage isn't a feature
-            if isinstance(i, SparkModel) and isinstance(i, JavaModel) and 'feature' not in i.__module__:
+            if getattr(i, '__module__', None) in SparkUtils.MODEL_MODULES:
                 return i
-        raise AttributeError('Could not find model stage in Pipeline! Is this a fitted spark Pipeline?')
+        raise AttributeError("It looks like you're trying to deploy a pipeline without a supported Spark Model. Supported Spark models "
+                             "are listed here: https://mleap-docs.combust.ml/core-concepts/transformers/support.html")

     @staticmethod
     def try_get_class_labels(pipeline: PipelineModel):
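A standalone sketch of the module-membership test this hunk switches to (`looks_like_model` is an illustrative name, not the library's). A fitted LogisticRegressionModel lives in pyspark.ml.classification and matches; a StandardScalerModel lives in pyspark.ml.feature and is skipped, which is exactly the case the old isinstance chain had to special-case:

```python
from pyspark.ml import classification, clustering, recommendation, regression

MODEL_MODULES = [classification.__name__, recommendation.__name__,
                 clustering.__name__, regression.__name__]

def looks_like_model(stage) -> bool:
    # getattr guards against stages that have no __module__ attribute
    return getattr(stage, '__module__', None) in MODEL_MODULES
```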
@@ -707,6 +712,12 @@ def prep_model_for_deployment(splice_context: PySpliceContext,
     :param classes:
     :return:
     """
+
+    # Check if the model is not a pipeline. This occurs when the user logs a single Spark model rather than a Pipeline
+    if not SparkUtils.is_spark_pipeline(fittedPipe):
+        print('You are deploying a singular Spark Model. It will be deployed as a Pipeline with 1 stage. This will '
+              'not affect expected behavior or outcomes.')
+        fittedPipe = PipelineModel(stages=[fittedPipe])
     # Get model type
     model_type = SparkUtils.get_model_type(fittedPipe)
     # See if the labels are in an IndexToString stage. Will either return List[str] or empty []
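An illustration (assumed names) of the new single-model path: a bare fitted model is wrapped in a one-stage PipelineModel so the rest of the deployment code can treat everything uniformly as a pipeline.

```python
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression

lr_model = LogisticRegression().fit(training_df)   # training_df: a labeled Spark DataFrame
if not isinstance(lr_model, PipelineModel):        # stand-in for SparkUtils.is_spark_pipeline
    lr_model = PipelineModel(stages=[lr_model])
```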
@@ -892,7 +903,13 @@ def get_mleap_model(splice_context: PySpliceContext,
     # Serialize the Spark model into Mleap format
     if f'{run_id}.zip' in rbash('ls /tmp').read():
         remove(f'/tmp/{run_id}.zip')
-    fittedPipe.serializeToBundle(f"jar:file:///tmp/{run_id}.zip", df)
+
+    try:
+        fittedPipe.serializeToBundle(f"jar:file:///tmp/{run_id}.zip", df)
+    except:
+        m = getattr(fittedPipe, '__class__', 'UnknownModel')
+        raise SpliceMachineException(f'It looks like your model type {m} is not supported. Supported models are listed '
+                                     f'here: https://mleap-docs.combust.ml/core-concepts/transformers/support.html') from None

     jvm = splice_context.jvm
     java_import(jvm, "com.splicemachine.mlrunner.FileRetriever")
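A generic sketch of the pattern this hunk adds: translate an arbitrary serialization failure into a domain exception with a pointer to the docs, and use `from None` to suppress the noisy underlying traceback. SpliceMachineException is the library's own error type; here it's a stand-in class, and `serialize_or_explain` is an illustrative name:

```python
class SpliceMachineException(Exception):
    pass

def serialize_or_explain(fitted_pipe, serialize):
    try:
        serialize(fitted_pipe)
    except Exception:
        m = getattr(fitted_pipe, '__class__', 'UnknownModel')
        # `from None` hides the raw MLeap stack trace from the end user
        raise SpliceMachineException(
            f'It looks like your model type {m} is not supported. Supported models '
            f'are listed here: https://mleap-docs.combust.ml/core-concepts/transformers/support.html'
        ) from None
```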
@@ -1292,8 +1309,6 @@ def get_feature_columns_and_types(splice_ctx: PySpliceContext,
     assert type(df) in (SparkDF, PandasDF), "Dataframe must be a PySpark or Pandas dataframe!"
     if type(df) == PandasDF:
         df = splice_ctx.spark_session.createDataFrame(df)
-    if model_cols:
-        df = df.select(*model_cols)
     feature_columns = df.columns
     # Get the datatype of each column in the dataframe
     schema_types = {str(i.name): re.sub("[0-9,()]", "", str(i.dataType)) for i in df.schema}
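Why the two deleted lines matter, with illustrative data: previously, passing model_cols narrowed the dataframe before the table schema was derived, so non-model columns were silently dropped from the created table. After this commit the schema always reflects the full dataframe, and model_cols only selects features at prediction time. Assumes an active SparkSession named `spark`:

```python
import re

df = spark.createDataFrame([(1, 25, 0.5)], ['ID', 'AGE', 'INCOME'])
model_cols = ['AGE']

feature_columns = df.columns   # ['ID', 'AGE', 'INCOME'] -- no longer filtered to model_cols
schema_types = {str(f.name): re.sub("[0-9,()]", "", str(f.dataType)) for f in df.schema}
# e.g. {'ID': 'LongType', 'AGE': 'LongType', 'INCOME': 'DoubleType'}
```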
