This repository was archived by the owner on Apr 15, 2022. It is now read-only.

Commit 55dda8d

Merge pull request #101 from splicemachine/DBAAS-4984
Authored by Ben Epstein
2 parents 90f874f + 0c59d72

File tree

9 files changed: +285 −12 lines

docs/conf.py

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@
     'private-members':True,
     'inherited-members':True,
     'undoc-members': False,
-    'exclude-members': '_validate_feature_vector_keys,_process_features,__prune_features_for_elimination,_register_metadata,_register_metadata,__update_deployment_status,__log_mlflow_results,__get_feature_importance,__get_pipeline,_validate_training_view,_validate_feature_set,_validate_feature,__validate_feature_data_type,_check_for_splice_ctx,_dropTableIfExists, _generateDBSchema,_getCreateTableSchema,_jstructtype,_spliceSparkPackagesName,_splicemachineContext,apply_patches, main'
+    'exclude-members': '_retrieve_model_data_sets,_retrieve_training_set_metadata_from_deployement,_validate_feature_vector_keys,_process_features,__prune_features_for_elimination,_register_metadata,_register_metadata,__update_deployment_status,__log_mlflow_results,__get_feature_importance,__get_pipeline,_validate_training_view,_validate_feature_set,_validate_feature,__validate_feature_data_type,_check_for_splice_ctx,_dropTableIfExists, _generateDBSchema,_getCreateTableSchema,_jstructtype,_spliceSparkPackagesName,_splicemachineContext,apply_patches, main'
 }
 
 # Add any paths that contain templates here, relative to this directory.
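These keys belong to Sphinx's autodoc configuration; the change adds the two new private FeatureStore helpers to the excluded names so they stay out of the generated API reference. A minimal sketch of the surrounding block, assuming it is the standard autodoc_default_options dict (the dict name is inferred from the keys, not visible in this hunk, and the list is truncated to the two names added here):

autodoc_default_options = {
    'private-members': True,    # document single-underscore members...
    'inherited-members': True,
    'undoc-members': False,
    # ...except any name listed here, which autodoc omits from the docs:
    'exclude-members': '_retrieve_model_data_sets,_retrieve_training_set_metadata_from_deployement',
}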

splicemachine/features/constants.py

Lines changed: 20 additions & 0 deletions

@@ -191,6 +191,26 @@ class SQL:
     SELECT {feature_names} FROM {feature_sets} WHERE
     """
 
+    get_deployment_metadata = f"""
+    SELECT tv.name, d.training_set_start_ts, d.training_set_end_ts,
+           string_agg(f.name,',') features
+    FROM featurestore.deployment d
+    INNER JOIN {FEATURE_STORE_SCHEMA}.training_set ts ON d.training_set_id=ts.training_set_id
+    INNER JOIN {FEATURE_STORE_SCHEMA}.training_set_feature tsf ON tsf.training_set_id=d.training_set_id
+    LEFT OUTER JOIN {FEATURE_STORE_SCHEMA}.training_view tv ON tv.view_id = ts.view_id
+    INNER JOIN {FEATURE_STORE_SCHEMA}.feature f ON tsf.feature_id=f.feature_id
+    WHERE d.model_schema_name = '{{schema_name}}'
+      AND d.model_table_name = '{{table_name}}'
+    GROUP BY 1,2,3
+    """
+
+    get_model_predictions = """
+    SELECT EVAL_TIME,
+           PREDICTION
+    FROM {schema_name}.{table_name} WHERE EVAL_TIME>='{start_time}' AND EVAL_TIME<'{end_time}'
+    ORDER BY EVAL_TIME
+    """
+
 class Columns:
     feature = ['feature_id', 'feature_set_id', 'name', 'description', 'feature_data_type', 'feature_type',
                'tags', 'compliance_level', 'last_update_ts', 'last_update_username']
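Note the two-stage templating in get_deployment_metadata: it is an f-string, so {FEATURE_STORE_SCHEMA} is interpolated once when the class is defined, while the doubled braces {{schema_name}} and {{table_name}} survive as single-brace placeholders for a later .format() call (which feature_store.py performs below). A hypothetical illustration, with invented schema and table names:

from splicemachine.features.constants import SQL

# Fill in the placeholders left behind by the f-string:
sql = SQL.get_deployment_metadata.format(schema_name='RETAIL', table_name='SPEND_MODEL')
# The WHERE clause of the resulting query reads:
#   WHERE d.model_schema_name = 'RETAIL'
#     AND d.model_table_name = 'SPEND_MODEL'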

splicemachine/features/feature.py

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 class Feature:
     def __init__(self, *, name, description, feature_data_type, feature_type, tags, feature_set_id=None, feature_id=None, **kwargs):
-        self.name = name
+        self.name = name.upper()
         self.description = description
         self.feature_data_type = feature_data_type
         self.feature_type = feature_type
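The effect is that feature names are normalized to upper case at construction time, matching how the database stores object names (see the matching .upper() calls elsewhere in this commit). A tiny hypothetical sketch, with all argument values invented:

from splicemachine.features import Feature

f = Feature(name='weekly_spend', description='7-day customer spend',
            feature_data_type='DOUBLE', feature_type='C', tags=['spend'])
assert f.name == 'WEEKLY_SPEND'   # normalized regardless of input case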

splicemachine/features/feature_set.py

Lines changed: 3 additions & 2 deletions

@@ -12,8 +12,8 @@ def __init__(self, *, splice_ctx: PySpliceContext, table_name, schema_name, desc
                  primary_keys: Dict[str, str], feature_set_id=None, deployed: bool = False, **kwargs):
         self.splice_ctx = splice_ctx
 
-        self.table_name = table_name
-        self.schema_name = schema_name
+        self.table_name = table_name.upper()
+        self.schema_name = schema_name.upper()
         self.description = description
         self.primary_keys = primary_keys
         self.feature_set_id = feature_set_id

@@ -91,6 +91,7 @@ def __update_deployment_status(self, status: bool):
         """
         self.splice_ctx.execute(SQL.update_fset_deployment_status.format(status=int(status),
                                                                          feature_set_id=self.feature_set_id))
+        self.deployed = True
 
 
     def deploy(self, verbose=False):
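FeatureSet applies the same upper-case normalization to its schema and table names, and __update_deployment_status now keeps the in-memory deployed flag in sync with the database row it updates. A hypothetical sketch of the constructor behavior (splice is assumed to be an existing PySpliceContext; all other values invented):

fset = FeatureSet(splice_ctx=splice, schema_name='retail', table_name='spend_fset',
                  description='customer spend features',
                  primary_keys={'CUSTOMER_ID': 'INTEGER'})
assert (fset.schema_name, fset.table_name) == ('RETAIL', 'SPEND_FSET')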

splicemachine/features/feature_store.py

Lines changed: 132 additions & 6 deletions

@@ -7,6 +7,7 @@
 from pandas import DataFrame as PandasDF
 
 from pyspark.sql.dataframe import DataFrame as SparkDF
+import pyspark.sql.functions as psf
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier
 from pyspark.ml.regression import RandomForestRegressor

@@ -16,8 +17,9 @@
 from splicemachine.spark import PySpliceContext
 from splicemachine.features import Feature, FeatureSet
 from .training_set import TrainingSet
-from .utils import (dict_to_lower, _generate_training_set_history_sql,
-                    _generate_training_set_sql, _create_temp_training_view)
+from .utils.drift_utils import (add_feature_plot, remove_outliers, datetime_range_split, build_feature_drift_plot, build_model_drift_plot)
+from .utils.training_utils import (dict_to_lower, _generate_training_set_history_sql,
+                                   _generate_training_set_sql, _create_temp_training_view)
 from .constants import SQL, FeatureType
 from .training_view import TrainingView

@@ -317,7 +319,6 @@ def get_training_set(self, features: Union[List[Feature], List[str]], current_va
         temp_vw = _create_temp_training_view(features, fsets)
         sql = _generate_training_set_history_sql(temp_vw, features, fsets, start_time=start_time, end_time=end_time)
 
-
         # Here we create a null training view and pass it into the training set. We do this because this special kind
         # of training set isn't standard. It's not based on a training view, on primary key columns, a label column,
         # or a timestamp column. This is simply a joined set of features from different feature sets.

@@ -335,7 +336,7 @@ def get_training_set(self, features: Union[List[Feature], List[str]], current_va
         ts.start_time = ts.end_time
 
         if self.mlflow_ctx and not return_sql:
-            self.mlflow_ctx._active_training_set: TrainingSet = ts
+            self.mlflow_ctx._active_training_set = ts
             ts._register_metadata(self.mlflow_ctx)
         return sql if return_sql else self.splice_ctx.df(sql)

@@ -412,13 +413,17 @@ def list_training_sets(self) -> Dict[str, Optional[str]]:
         """
         raise NotImplementedError("To see available training views, run fs.describe_training_views()")
 
-    def _validate_feature_set(self, schema_name, table_name):
+    def _validate_feature_set(self, schema_name: str, table_name: str):
         """
         Asserts a feature set doesn't already exist in the database
         :param schema_name: schema name of the feature set
         :param table_name: table name of the feature set
         :return: None
         """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
         str = f'Feature Set {schema_name}.{table_name} already exists. Use a different schema and/or table name.'
         # Validate Table
         assert not self.splice_ctx.tableExists(schema_name, table_name=table_name), str

@@ -436,6 +441,10 @@ def create_feature_set(self, schema_name: str, table_name: str, primary_keys: Di
         :param desc: The (optional) description
         :return: FeatureSet
         """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
         self._validate_feature_set(schema_name, table_name)
         fset = FeatureSet(splice_ctx=self.splice_ctx, schema_name=schema_name, table_name=table_name,
                           primary_keys=primary_keys,

@@ -494,6 +503,10 @@ def create_feature(self, schema_name: str, table_name: str, name: str, feature_d
         :return: Feature created
         """
         self.__validate_feature_data_type(feature_data_type)
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
         if self.splice_ctx.tableExists(schema_name, table_name):
             raise SpliceMachineException(f"Feature Set {schema_name}.{table_name} is already deployed. You cannot "
                                          f"add features to a deployed feature set.")

@@ -604,7 +617,7 @@ def _process_features(self, features: List[Union[Feature, str]]) -> List[Feature
             " a feature name (string) or a Feature object"
         return all_features
 
-    def deploy_feature_set(self, schema_name, table_name):
+    def deploy_feature_set(self, schema_name: str, table_name: str):
         """
         Deploys a feature set to the database. This persists the feature store's existence.
         As of now, once deployed you cannot delete the feature set or add/delete features.

@@ -614,6 +627,9 @@ def deploy_feature_set(self, schema_name, table_name):
         :param table_name: The table of the created feature set
         """
         try:
+            # database stores object names in upper case
+            schema_name = schema_name.upper()
+            table_name = table_name.upper()
             fset = self.get_feature_sets(_filter={'schema_name': schema_name, 'table_name': table_name})[0]
         except:
             raise SpliceMachineException(

@@ -642,6 +658,10 @@ def describe_feature_set(self, schema_name: str, table_name: str) -> None:
         :param table_name: feature set table name
         :return: None
         """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
         fset = self.get_feature_sets(_filter={'schema_name': schema_name, 'table_name': table_name})
         if not fset: raise SpliceMachineException(
             f"Feature Set {schema_name}.{table_name} not found. Check name and try again.")

@@ -689,6 +709,111 @@ def describe_training_view(self, training_view: str) -> None:
     def set_feature_description(self):
         raise NotImplementedError
 
+    def get_training_set_from_deployment(self, schema_name: str, table_name: str):
+        """
+        Reads Feature Store metadata to rebuild the original training data set used for the given deployed model.
+        :param schema_name: model schema name
+        :param table_name: model table name
+        :return:
+        """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
+        metadata = self._retrieve_training_set_metadata_from_deployement(schema_name, table_name)
+        features = metadata['FEATURES'].split(',')
+        tv_name = metadata['NAME']
+        start_time = metadata['TRAINING_SET_START_TS']
+        end_time = metadata['TRAINING_SET_END_TS']
+        if tv_name:
+            training_set_df = self.get_training_set_from_view(training_view=tv_name, start_time=start_time,
+                                                              end_time=end_time, features=features)
+        else:
+            training_set_df = self.get_training_set(features=features, start_time=start_time, end_time=end_time)
+        return training_set_df
+
+    def _retrieve_model_data_sets(self, schema_name: str, table_name: str):
+        """
+        Returns the training set dataframe and model table dataframe for a given deployed model.
+        :param schema_name: model schema name
+        :param table_name: model table name
+        :return:
+        """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
+        training_set_df = self.get_training_set_from_deployment(schema_name, table_name)
+        model_table_df = self.splice_ctx.df(f'SELECT * FROM {schema_name}.{table_name}')
+        return training_set_df, model_table_df
+
+    def _retrieve_training_set_metadata_from_deployement(self, schema_name: str, table_name: str):
+        """
+        Reads Feature Store metadata to retrieve the definition of the training set used to train the specified model.
+        :param schema_name: model schema name
+        :param table_name: model table name
+        :return:
+        """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
+        sql = SQL.get_deployment_metadata.format(schema_name=schema_name, table_name=table_name)
+        deploy_df = self.splice_ctx.df(sql).collect()
+        cnt = len(deploy_df)
+        if cnt == 1:
+            return deploy_df[0]
+
+    def display_model_feature_drift(self, schema_name: str, table_name: str):
+        """
+        Displays a feature-by-feature comparison between the training set of the deployed model and the input feature
+        values used with the model since deployment.
+        :param schema_name: name of database schema where model table is deployed
+        :param table_name: name of the model table
+        :return: None
+        """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
+        metadata = self._retrieve_training_set_metadata_from_deployement(schema_name, table_name)
+        if not metadata:
+            raise SpliceMachineException(f"Could not find deployment for model table {schema_name}.{table_name}") from None
+        training_set_df, model_table_df = self._retrieve_model_data_sets(schema_name, table_name)
+        features = metadata['FEATURES'].split(',')
+        build_feature_drift_plot(features, training_set_df, model_table_df)
+
+
+    def display_model_drift(self, schema_name: str, table_name: str, time_intervals: int,
+                            start_time: datetime = None, end_time: datetime = None):
+        """
+        Displays as many as 'time_intervals' plots showing the distribution of the model prediction within each time
+        period. Time periods are equal periods of time where predictions are present in the model table
+        'schema_name'.'table_name'. Model predictions are first filtered to only those occurring after 'start_time' if
+        specified and before 'end_time' if specified.
+        :param schema_name: schema where the model table resides
+        :param table_name: name of the model table
+        :param time_intervals: number of time intervals to plot
+        :param start_time: if specified, filters to only show predictions occurring after this date/time
+        :param end_time: if specified, filters to only show predictions occurring before this date/time
+        :return: None
+        """
+        # database stores object names in upper case
+        schema_name = schema_name.upper()
+        table_name = table_name.upper()
+
+        # set default timeframe if not specified
+        if not start_time:
+            start_time = datetime(1900, 1, 1, 0, 0, 0)
+        if not end_time:
+            end_time = datetime.now()
+        # retrieve predictions the model has made over time
+        sql = SQL.get_model_predictions.format(schema_name=schema_name, table_name=table_name,
+                                               start_time=start_time, end_time=end_time)
+        model_table_df = self.splice_ctx.df(sql)
+        build_model_drift_plot(model_table_df, time_intervals)
+
+
     def __get_pipeline(self, df, features, label, model_type):
         """
         Creates a Pipeline with preprocessing steps (StringIndexer, VectorAssembler) for each feature depending

@@ -736,6 +861,7 @@ def __log_mlflow_results(self, name, rounds, mlflow_results):
         :param name: MLflow run name
         :param rounds: Number of rounds of feature elimination that were run
         :param mlflow_results: The params / metrics to log
+        :return:
         """
         try:
             if self.mlflow_ctx.active_run():
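Taken together, the new methods form a small drift-monitoring workflow: rebuild the training set recorded for a deployment, pull the model table's post-deployment inputs and predictions, and plot the two against each other. A hypothetical end-to-end sketch; the FeatureStore constructor call, the active SparkSession named spark, and the retail.spend_model deployment are all assumptions, not part of this commit:

from datetime import datetime
from splicemachine.spark import PySpliceContext
from splicemachine.features import FeatureStore

splice = PySpliceContext(spark)   # assumes an existing SparkSession named spark
fs = FeatureStore(splice)         # constructor signature assumed

# Rebuild the exact training set used for the deployed model
# (names are upper-cased internally, so case does not matter here)
train_df = fs.get_training_set_from_deployment('retail', 'spend_model')

# Feature-by-feature comparison of training vs. post-deployment inputs
fs.display_model_feature_drift('retail', 'spend_model')

# Prediction distribution across 10 equal time windows in Q1 2021
fs.display_model_drift('retail', 'spend_model', time_intervals=10,
                       start_time=datetime(2021, 1, 1),
                       end_time=datetime(2021, 4, 1))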

splicemachine/features/utils/__init__.py

Whitespace-only changes.

0 commit comments
