77from pandas import DataFrame as PandasDF
88
99from pyspark .sql .dataframe import DataFrame as SparkDF
10+ import pyspark .sql .functions as psf
1011from pyspark .ml import Pipeline
1112from pyspark .ml .classification import RandomForestClassifier
1213from pyspark .ml .regression import RandomForestRegressor
1617from splicemachine .spark import PySpliceContext
1718from splicemachine .features import Feature , FeatureSet
1819from .training_set import TrainingSet
19- from .utils import (dict_to_lower , _generate_training_set_history_sql ,
20- _generate_training_set_sql , _create_temp_training_view )
20+ from .utils .drift_utils import (add_feature_plot , remove_outliers , datetime_range_split , build_feature_drift_plot , build_model_drift_plot )
21+ from .utils .training_utils import (dict_to_lower , _generate_training_set_history_sql ,
22+ _generate_training_set_sql , _create_temp_training_view )
2123from .constants import SQL , FeatureType
2224from .training_view import TrainingView
2325
@@ -317,7 +319,6 @@ def get_training_set(self, features: Union[List[Feature], List[str]], current_va
317319 temp_vw = _create_temp_training_view (features , fsets )
318320 sql = _generate_training_set_history_sql (temp_vw , features , fsets , start_time = start_time , end_time = end_time )
319321
320-
321322 # Here we create a null training view and pass it into the training set. We do this because this special kind
322323 # of training set isn't standard. It's not based on a training view, on primary key columns, a label column,
323324 # or a timestamp column . This is simply a joined set of features from different feature sets.
@@ -335,7 +336,7 @@ def get_training_set(self, features: Union[List[Feature], List[str]], current_va
335336 ts .start_time = ts .end_time
336337
337338 if self .mlflow_ctx and not return_sql :
338- self .mlflow_ctx ._active_training_set : TrainingSet = ts
339+ self .mlflow_ctx ._active_training_set = ts
339340 ts ._register_metadata (self .mlflow_ctx )
340341 return sql if return_sql else self .splice_ctx .df (sql )
341342
@@ -412,13 +413,17 @@ def list_training_sets(self) -> Dict[str, Optional[str]]:
412413 """
413414 raise NotImplementedError ("To see available training views, run fs.describe_training_views()" )
414415
415- def _validate_feature_set (self , schema_name , table_name ):
416+ def _validate_feature_set (self , schema_name : str , table_name : str ):
416417 """
417418 Asserts a feature set doesn't already exist in the database
418419 :param schema_name: schema name of the feature set
419420 :param table_name: table name of the feature set
420421 :return: None
421422 """
423+ # database stores object names in upper case
424+ schema_name = schema_name .upper ()
425+ table_name = table_name .upper ()
426+
422427 str = f'Feature Set { schema_name } .{ table_name } already exists. Use a different schema and/or table name.'
423428 # Validate Table
424429 assert not self .splice_ctx .tableExists (schema_name , table_name = table_name ), str
@@ -436,6 +441,10 @@ def create_feature_set(self, schema_name: str, table_name: str, primary_keys: Di
436441 :param desc: The (optional) description
437442 :return: FeatureSet
438443 """
444+ # database stores object names in upper case
445+ schema_name = schema_name .upper ()
446+ table_name = table_name .upper ()
447+
439448 self ._validate_feature_set (schema_name , table_name )
440449 fset = FeatureSet (splice_ctx = self .splice_ctx , schema_name = schema_name , table_name = table_name ,
441450 primary_keys = primary_keys ,
@@ -494,6 +503,10 @@ def create_feature(self, schema_name: str, table_name: str, name: str, feature_d
494503 :return: Feature created
495504 """
496505 self .__validate_feature_data_type (feature_data_type )
506+ # database stores object names in upper case
507+ schema_name = schema_name .upper ()
508+ table_name = table_name .upper ()
509+
497510 if self .splice_ctx .tableExists (schema_name , table_name ):
498511 raise SpliceMachineException (f"Feature Set { schema_name } .{ table_name } is already deployed. You cannot "
499512 f"add features to a deployed feature set." )
@@ -604,7 +617,7 @@ def _process_features(self, features: List[Union[Feature, str]]) -> List[Feature
604617 " a feature name (string) or a Feature object"
605618 return all_features
606619
607- def deploy_feature_set (self , schema_name , table_name ):
620+ def deploy_feature_set (self , schema_name : str , table_name : str ):
608621 """
609622 Deploys a feature set to the database. This persists the feature stores existence.
610623 As of now, once deployed you cannot delete the feature set or add/delete features.
@@ -614,6 +627,9 @@ def deploy_feature_set(self, schema_name, table_name):
614627 :param table_name: The table of the created feature set
615628 """
616629 try :
630+ # database stores object names in upper case
631+ schema_name = schema_name .upper ()
632+ table_name = table_name .upper ()
617633 fset = self .get_feature_sets (_filter = {'schema_name' : schema_name , 'table_name' : table_name })[0 ]
618634 except :
619635 raise SpliceMachineException (
@@ -642,6 +658,10 @@ def describe_feature_set(self, schema_name: str, table_name: str) -> None:
642658 :param table_name: feature set table name
643659 :return: None
644660 """
661+ # database stores object names in upper case
662+ schema_name = schema_name .upper ()
663+ table_name = table_name .upper ()
664+
645665 fset = self .get_feature_sets (_filter = {'schema_name' : schema_name , 'table_name' : table_name })
646666 if not fset : raise SpliceMachineException (
647667 f"Feature Set { schema_name } .{ table_name } not found. Check name and try again." )
@@ -689,6 +709,111 @@ def describe_training_view(self, training_view: str) -> None:
def set_feature_description(self):
    """
    Sets the description of a feature. Not yet implemented.

    :raise NotImplementedError: always — this API is a placeholder
    """
    raise NotImplementedError
691711
def get_training_set_from_deployment(self, schema_name: str, table_name: str):
    """
    Reads Feature Store metadata to rebuild the original training data set used for the given deployed model.

    :param schema_name: model schema name
    :param table_name: model table name
    :raise SpliceMachineException: if no deployment is recorded for the given model table
    :return: the training set (DataFrame, or SQL depending on the underlying getter)
    """
    # database stores object names in upper case
    schema_name = schema_name.upper()
    table_name = table_name.upper()

    metadata = self._retrieve_training_set_metadata_from_deployement(schema_name, table_name)
    # Guard against a missing deployment record: without this, metadata is None and the
    # subscript below raises an opaque TypeError. Mirrors display_model_feature_drift.
    if not metadata:
        raise SpliceMachineException(
            f"Could not find deployment for model table {schema_name}.{table_name}") from None
    features = metadata['FEATURES'].split(',')
    tv_name = metadata['NAME']
    start_time = metadata['TRAINING_SET_START_TS']
    end_time = metadata['TRAINING_SET_END_TS']
    if tv_name:
        # the deployment was trained from a named training view: rebuild through the view
        training_set_df = self.get_training_set_from_view(training_view=tv_name, start_time=start_time,
                                                          end_time=end_time, features=features)
    else:
        # no training view recorded: the training set was a plain join of features
        training_set_df = self.get_training_set(features=features, start_time=start_time, end_time=end_time)
    return training_set_df
734+
735+ def _retrieve_model_data_sets (self , schema_name : str , table_name : str ):
736+ """
737+ Returns the training set dataframe and model table dataframe for a given deployed model.
738+ :param schema_name: model schema name
739+ :param table_name: model table name
740+ :return:
741+ """
742+ # database stores object names in upper case
743+ schema_name = schema_name .upper ()
744+ table_name = table_name .upper ()
745+
746+ training_set_df = self .get_training_set_from_deployment (schema_name , table_name )
747+ model_table_df = self .splice_ctx .df (f'SELECT * FROM { schema_name } .{ table_name } ' )
748+ return training_set_df , model_table_df
749+
def _retrieve_training_set_metadata_from_deployement(self, schema_name: str, table_name: str):
    """
    Reads Feature Store metadata to retrieve the definition of the training set used
    to train the specified model.

    :param schema_name: model schema name
    :param table_name: model table name
    :return: the single deployment metadata row, or None unless exactly one row matches
    """
    # database stores object names in upper case
    schema_name, table_name = schema_name.upper(), table_name.upper()

    query = SQL.get_deployment_metadata.format(schema_name=schema_name, table_name=table_name)
    rows = self.splice_ctx.df(query).collect()
    # exactly one deployment record is expected; any other count yields None
    return rows[0] if len(rows) == 1 else None
766+
def display_model_feature_drift(self, schema_name: str, table_name: str):
    """
    Displays a feature-by-feature comparison between the training set of the deployed
    model and the input feature values used with the model since deployment.

    :param schema_name: name of database schema where model table is deployed
    :param table_name: name of the model table
    :raise SpliceMachineException: if the model table has no recorded deployment
    :return: None
    """
    # database stores object names in upper case
    schema_name, table_name = schema_name.upper(), table_name.upper()

    metadata = self._retrieve_training_set_metadata_from_deployement(schema_name, table_name)
    if not metadata:
        raise SpliceMachineException(
            f"Could not find deployment for model table {schema_name}.{table_name}") from None
    training_df, deployed_df = self._retrieve_model_data_sets(schema_name, table_name)
    build_feature_drift_plot(metadata['FEATURES'].split(','), training_df, deployed_df)
785+
786+
def display_model_drift(self, schema_name: str, table_name: str, time_intervals: int,
                        start_time: datetime = None, end_time: datetime = None):
    """
    Displays as many as 'time_intervals' plots showing the distribution of the model
    prediction within each time period. Time periods are equal periods of time where
    predictions are present in the model table 'schema_name'.'table_name'. Model
    predictions are first filtered to only those occurring after 'start_time' if
    specified and before 'end_time' if specified.

    :param schema_name: schema where the model table resides
    :param table_name: name of the model table
    :param time_intervals: number of time intervals to plot
    :param start_time: if specified, filters to only show predictions occurring after this date/time
    :param end_time: if specified, filters to only show predictions occurring before this date/time
    :return: None
    """
    # database stores object names in upper case
    schema_name, table_name = schema_name.upper(), table_name.upper()

    # default to an effectively unbounded window when no timeframe is specified
    start_time = start_time or datetime(1900, 1, 1, 0, 0, 0)
    end_time = end_time or datetime.now()

    # retrieve the predictions the model has made over time, then plot their drift
    query = SQL.get_model_predictions.format(schema_name=schema_name, table_name=table_name,
                                             start_time=start_time, end_time=end_time)
    build_model_drift_plot(self.splice_ctx.df(query), time_intervals)
815+
816+
692817 def __get_pipeline (self , df , features , label , model_type ):
693818 """
694819 Creates a Pipeline with preprocessing steps (StringIndexer, VectorAssembler) for each feature depending
@@ -736,6 +861,7 @@ def __log_mlflow_results(self, name, rounds, mlflow_results):
736861 :param name: MLflow run name
737862 :param rounds: Number of rounds of feature elimination that were run
738863 :param mlflow_results: The params / metrics to log
864+ :return:
739865 """
740866 try :
741867 if self .mlflow_ctx .active_run ():
0 commit comments