# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.
"""Module for assessing causal feature importance."""
import warnings
from collections import OrderedDict, namedtuple
import joblib
import lightgbm as lgb
import numpy as np
from numpy import iterable
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import _tree
from sklearn.utils.validation import column_or_1d
from ...cate_interpreter import SingleTreeCateInterpreter, SingleTreePolicyInterpreter
from ...dml import LinearDML, CausalForestDML
from ...inference import NormalInferenceResults
from ...sklearn_extensions.linear_model import WeightedLasso
from ...sklearn_extensions.model_selection import GridSearchCVList
from ...utilities import _RegressionWrapper, get_feature_names_or_default, one_hot_encoder
# TODO: this utility is documented but internal; reimplement?
from sklearn.utils import _safe_indexing
# TODO: this utility is even less public...
from packaging.version import parse
import sklearn
if parse(sklearn.__version__) < parse("1.5"):
from sklearn.utils import _get_column_indices
else:
from sklearn.utils._indexing import _get_column_indices
class _CausalInsightsConstants:
RawFeatureNameKey = 'raw_name'
EngineeredNameKey = 'name'
CategoricalColumnKey = 'cat'
TypeKey = 'type'
PointEstimateKey = 'point'
StandardErrorKey = 'stderr'
ZStatKey = 'zstat'
ConfidenceIntervalLowerKey = 'ci_lower'
ConfidenceIntervalUpperKey = 'ci_upper'
PValueKey = 'p_value'
Version = 'version'
CausalComputationTypeKey = 'causal_computation_type'
ConfoundingIntervalKey = 'confounding_interval'
ViewKey = 'view'
InitArgsKey = 'init_args'
RowData = 'row_data' # NOTE: RowData is mutually exclusive with the other data columns
ALL = [RawFeatureNameKey,
EngineeredNameKey,
CategoricalColumnKey,
TypeKey,
PointEstimateKey,
StandardErrorKey,
ZStatKey,
ConfidenceIntervalLowerKey,
ConfidenceIntervalUpperKey,
PValueKey,
Version,
CausalComputationTypeKey,
ConfoundingIntervalKey,
ViewKey,
InitArgsKey,
RowData]
def _get_default_shared_insights_output():
"""
Get dictionary elements shared among all analyses.
    In case of breaking changes to this dictionary's output, its major version should be
    updated; in case of a non-breaking change, the minor version should be updated.
"""
return {
_CausalInsightsConstants.RawFeatureNameKey: [],
_CausalInsightsConstants.EngineeredNameKey: [],
_CausalInsightsConstants.CategoricalColumnKey: [],
_CausalInsightsConstants.TypeKey: [],
_CausalInsightsConstants.Version: '1.0',
_CausalInsightsConstants.CausalComputationTypeKey: "simple",
_CausalInsightsConstants.ConfoundingIntervalKey: None,
_CausalInsightsConstants.InitArgsKey: {}
}
def _get_default_specific_insights(view):
# keys should be mutually exclusive with shared keys, so that the dictionaries can be cleanly merged
return {
_CausalInsightsConstants.PointEstimateKey: [],
_CausalInsightsConstants.StandardErrorKey: [],
_CausalInsightsConstants.ZStatKey: [],
_CausalInsightsConstants.ConfidenceIntervalLowerKey: [],
_CausalInsightsConstants.ConfidenceIntervalUpperKey: [],
_CausalInsightsConstants.PValueKey: [],
_CausalInsightsConstants.ViewKey: view
}
def _get_metadata_causal_insights_keys():
return [_CausalInsightsConstants.Version,
_CausalInsightsConstants.CausalComputationTypeKey,
_CausalInsightsConstants.ConfoundingIntervalKey,
_CausalInsightsConstants.ViewKey]
def _get_column_causal_insights_keys():
return [_CausalInsightsConstants.RawFeatureNameKey,
_CausalInsightsConstants.EngineeredNameKey,
_CausalInsightsConstants.CategoricalColumnKey,
_CausalInsightsConstants.TypeKey]
def _get_data_causal_insights_keys():
return [_CausalInsightsConstants.PointEstimateKey,
_CausalInsightsConstants.StandardErrorKey,
_CausalInsightsConstants.ZStatKey,
_CausalInsightsConstants.ConfidenceIntervalLowerKey,
_CausalInsightsConstants.ConfidenceIntervalUpperKey,
_CausalInsightsConstants.PValueKey]
def _first_stage_reg(X, y, *, automl=True, random_state=None, verbose=0):
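    # NOTE (presumed rationale): when a CV-based estimator wins the search, we return a plain
    # estimator with the selected hyperparameters fixed, so that subsequent refits during
    # cross-fitting don't redo the internal CV search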
if automl:
model = GridSearchCVList([LassoCV(random_state=random_state),
RandomForestRegressor(
n_estimators=100, random_state=random_state, min_samples_leaf=10),
lgb.LGBMRegressor(num_leaves=32, random_state=random_state)],
param_grid_list=[{},
{'min_weight_fraction_leaf':
[.001, .01, .1]},
{'learning_rate': [0.1, 0.3], 'max_depth': [3, 5]}],
cv=3,
scoring='r2',
verbose=verbose)
best_est = model.fit(X, y).best_estimator_
if isinstance(best_est, LassoCV):
return Lasso(alpha=best_est.alpha_, random_state=random_state)
else:
return best_est
else:
model = LassoCV(cv=5, random_state=random_state).fit(X, y)
return Lasso(alpha=model.alpha_, random_state=random_state)
def _first_stage_clf(X, y, *, make_regressor=False, automl=True, min_count=None, random_state=None, verbose=0):
# use same Cs as would be used by default by LogisticRegressionCV
cs = np.logspace(-4, 4, 10)
if min_count is None:
min_count = _CAT_LIMIT # we have at least this many instances
if automl:
# NOTE: we don't use LogisticRegressionCV inside the grid search because of the nested stratification
# which could affect how many times each distinct Y value needs to be present in the data
model = GridSearchCVList([LogisticRegression(max_iter=1000,
random_state=random_state),
RandomForestClassifier(n_estimators=100, min_samples_leaf=10,
random_state=random_state),
lgb.LGBMClassifier(num_leaves=32, random_state=random_state)],
param_grid_list=[{'C': cs},
{'max_depth': [3, None],
'min_weight_fraction_leaf': [.001, .01, .1]},
{'learning_rate': [0.1, 0.3], 'max_depth': [3, 5]}],
cv=min(3, min_count),
scoring='neg_log_loss',
verbose=verbose)
est = model.fit(X, y).best_estimator_
else:
model = LogisticRegressionCV(
cv=min(5, min_count), max_iter=1000, Cs=cs, random_state=random_state).fit(X, y)
est = LogisticRegression(C=model.C_[0], max_iter=1000, random_state=random_state)
if make_regressor:
return _RegressionWrapper(est)
else:
return est
def _final_stage(*, random_state=None, verbose=0):
return GridSearchCVList([WeightedLasso(random_state=random_state),
RandomForestRegressor(n_estimators=100, random_state=random_state, verbose=verbose)],
param_grid_list=[{'alpha': [.001, .01, .1, 1, 10]},
{'max_depth': [3, 5],
'min_samples_leaf': [10, 50]}],
cv=3,
scoring='neg_mean_squared_error',
verbose=verbose)
# simplification of sklearn's ColumnTransformer that encodes categoricals and passes through selected other columns
# but also supports get_feature_names with expected signature
class _ColumnTransformer(TransformerMixin):
def __init__(self, categorical, passthrough):
self.categorical = categorical
self.passthrough = passthrough
def fit(self, X):
cat_cols = _safe_indexing(X, self.categorical, axis=1)
if cat_cols.shape[1] > 0:
self.has_cats = True
# NOTE: set handle_unknown to 'ignore' so that we don't throw at runtime if given a novel value
self.one_hot_encoder = one_hot_encoder(handle_unknown='ignore').fit(cat_cols)
else:
self.has_cats = False
self.d_x = X.shape[1]
return self
def transform(self, X):
rest = _safe_indexing(X, self.passthrough, axis=1)
if self.has_cats:
cats = self.one_hot_encoder.transform(_safe_indexing(X, self.categorical, axis=1))
# NOTE: we rely on the passthrough columns coming first in the concatenated X;W
# when we pipeline scaling with our first stage models later, so the order here is important
return np.hstack((rest, cats))
else:
return rest
# TODO: remove once older sklearn support is no longer needed
def get_feature_names(self, names=None):
return self.get_feature_names_out(names)
def get_feature_names_out(self, names=None):
if names is None:
names = [f"x{i}" for i in range(self.d_x)]
rest = _safe_indexing(names, self.passthrough, axis=0)
if self.has_cats:
cats = get_feature_names_or_default(self.one_hot_encoder, _safe_indexing(names, self.categorical, axis=0))
return np.concatenate((rest, cats))
else:
return rest
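# Illustrative example: for _ColumnTransformer(categorical=[1], passthrough=[0]) fit on data with
# input names ['age', 'city'], get_feature_names_out returns ['age'] followed by the one-hot names
# derived from 'city' (the category names are data-dependent), matching the column order of transform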
# Wrapper to make sure that we get a deep copy of the contents instead of clone returning an untrained copy
class _Wrapper:
def __init__(self, item):
self.item = item
class _FrozenTransformer(TransformerMixin, BaseEstimator):
def __init__(self, wrapper):
self.wrapper = wrapper
def fit(self, X, y):
return self
def transform(self, X):
return self.wrapper.item.transform(X)
def _freeze(transformer):
return _FrozenTransformer(_Wrapper(transformer))
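# Illustrative sketch (comments only, not executed): freezing lets a pre-fitted transformer
# survive sklearn's clone, which would otherwise return an unfitted copy:
#
#     scaler = StandardScaler().fit(X_train)
#     pipe = Pipeline([('scale', _freeze(scaler)), ('model', Lasso())])
#     clone(pipe).fit(X_train, y_train)  # 'scale' still applies the original fitted scaling,
#                                        # because _Wrapper is deep-copied rather than re-cloned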
# Convert python objects to (possibly nested) types that can easily be represented as literals
def _sanitize(obj):
if obj is None or isinstance(obj, (bool, int, str, float)):
return obj
elif isinstance(obj, dict):
return {_sanitize(key): _sanitize(obj[key]) for key in obj}
else:
try:
return [_sanitize(item) for item in obj]
except Exception:
raise ValueError(f"Could not sanitize input {obj}")
# Convert SingleTreeInterpreter to a python dictionary
def _tree_interpreter_to_dict(interp, features, leaf_data=lambda t, n: {}):
tree = interp.tree_model_.tree_
node_dict = interp.node_dict_
def recurse(node_id):
if tree.children_left[node_id] == _tree.TREE_LEAF:
return {'leaf': True, 'n_samples': tree.n_node_samples[node_id], **leaf_data(tree, node_id, node_dict)}
else:
return {'leaf': False, 'feature': features[tree.feature[node_id]], 'threshold': tree.threshold[node_id],
'left': recurse(tree.children_left[node_id]),
'right': recurse(tree.children_right[node_id])}
return recurse(0)
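# The resulting dictionary nests recursively; schematically:
#     {'leaf': False, 'feature': 'age', 'threshold': 40.5,
#      'left': {'leaf': True, 'n_samples': 120, ...},
#      'right': {'leaf': True, 'n_samples': 80, ...}}
# where any extra leaf entries come from the leaf_data callback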
class _PolicyOutput:
"""
A type encapsulating various information related to a learned policy.
Attributes
----------
tree_dictionary:dict
The policy tree represented as a dictionary,
policy_value:float
The average value of applying the recommended policy (over using the control),
always_treat:dict of str to float
A dictionary mapping each non-control treatment to the value of always treating with it (over control),
control_name:str
The name of the control treatment
"""
def __init__(self, tree_dictionary, policy_value, always_treat, control_name):
self.tree_dictionary = tree_dictionary
self.policy_value = policy_value
self.always_treat = always_treat
self.control_name = control_name
# named tuple type for storing results inside CausalAnalysis class;
# must be lifted to module level to enable pickling
_result = namedtuple("_result", field_names=[
"feature_index", "feature_name", "feature_baseline", "feature_levels", "hinds",
"X_transformer", "W_transformer", "estimator", "global_inference", "treatment_value"])
def _process_feature(name, feat_ind, verbose, categorical_inds, categories, heterogeneity_inds, min_counts, y, X,
nuisance_models, h_model, random_state, model_y, cv, mc_iters):
try:
if verbose > 0:
print(f"CausalAnalysis: Feature {name}")
discrete_treatment = feat_ind in categorical_inds
if discrete_treatment:
cats = categories[categorical_inds.index(feat_ind)]
else:
cats = 'auto' # just leave the setting at the default otherwise
# the transformation logic here is somewhat tricky; we always need to encode the categorical columns,
# whether they end up in X or in W. However, for the continuous columns, we want to scale them all
# when running the first stage models, but don't want to scale the X columns when running the final model,
# since then our coefficients will have odd units and our trees will also have decisions using those units.
#
# we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)
hinds = heterogeneity_inds[feat_ind]
WX_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind]),
('drop', 'drop', feat_ind)],
remainder=StandardScaler())
W_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind and ind not in hinds]),
('drop', 'drop', hinds),
('drop_feat', 'drop', feat_ind)],
remainder=StandardScaler())
X_cont_inds = [ind for ind in hinds
if ind != feat_ind and ind not in categorical_inds]
# Use _ColumnTransformer instead of ColumnTransformer so we can get feature names
X_transformer = _ColumnTransformer([ind for ind in categorical_inds
if ind != feat_ind and ind in hinds],
X_cont_inds)
# Controls are all other columns of X
WX = WX_transformer.fit_transform(X)
# can't use X[:, feat_ind] when X is a DataFrame
T = _safe_indexing(X, feat_ind, axis=1)
# TODO: we can't currently handle unseen values of the feature column when getting the effect;
# we might want to modify OrthoLearner (and other discrete treatment classes)
# so that the user can opt-in to allowing unseen treatment values
# (and return NaN or something in that case)
W = W_transformer.fit_transform(X)
X_xf = X_transformer.fit_transform(X)
# HACK: this is slightly ugly because we rely on the fact that DML passes [X;W] to the first stage models
# and so we can just peel the first columns off of that combined array for rescaling in the pipeline
        # TODO: consider adding an API to DML that allows for better understanding of how the nuisance inputs are
# built, such as model_y_feature_names, model_t_feature_names, model_y_transformer, etc., so that this
# becomes a valid approach to handling this
X_scaler = ColumnTransformer([('scale', StandardScaler(),
list(range(len(X_cont_inds))))],
remainder='passthrough').fit(np.hstack([X_xf, W])).named_transformers_['scale']
X_scaler_fixed = ColumnTransformer([('scale', _freeze(X_scaler),
list(range(len(X_cont_inds))))],
remainder='passthrough')
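        # e.g. if X_cont_inds has two entries, the combined first-stage input [X;W] is laid out as
        # [x_cont_0, x_cont_1, x_onehot..., w...], so scaling just the first len(X_cont_inds) columns
        # rescales exactly the unscaled continuous X columns and passes everything else through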
if W.shape[1] == 0:
# array checking routines don't accept 0-width arrays
W = None
if X_xf.shape[1] == 0:
X_xf = None
if verbose > 0:
print("CausalAnalysis: performing model selection on T model")
# perform model selection
model_t = (_first_stage_clf(WX, T, automl=nuisance_models == 'automl',
min_count=min_counts.get(feat_ind, None),
random_state=random_state, verbose=verbose)
if discrete_treatment else _first_stage_reg(WX, T, automl=nuisance_models == 'automl',
random_state=random_state,
verbose=verbose))
pipelined_model_t = Pipeline([('scale', X_scaler_fixed),
('model', model_t)])
pipelined_model_y = Pipeline([('scale', X_scaler_fixed),
('model', model_y)])
if X_xf is None and h_model == 'forest':
warnings.warn(f"Using a linear model instead of a forest model for feature '{name}' "
"because forests don't support models with no heterogeneity indices")
h_model = 'linear'
if h_model == 'linear':
est = LinearDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
fit_cate_intercept=True,
categories=cats,
random_state=random_state,
cv=cv,
mc_iters=mc_iters)
elif h_model == 'forest':
est = CausalForestDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
n_estimators=4000,
min_var_leaf_on_val=True,
categories=cats,
random_state=random_state,
verbose=verbose,
cv=cv,
mc_iters=mc_iters)
if verbose > 0:
print("CausalAnalysis: tuning forest")
est.tune(y, T, X=X_xf, W=W)
if verbose > 0:
print("CausalAnalysis: training causal model")
est.fit(y, T, X=X_xf, W=W, cache_values=True)
# Prefer ate__inference to const_marginal_ate_inference(X) because it is doubly-robust and not conservative
if h_model == 'forest' and discrete_treatment:
global_inference = est.ate__inference()
else:
# convert to NormalInferenceResults for consistency
inf = est.const_marginal_ate_inference(X=X_xf)
global_inference = NormalInferenceResults(d_t=inf.d_t, d_y=inf.d_y,
pred=inf.mean_point,
pred_stderr=inf.stderr_mean,
mean_pred_stderr=None,
inf_type='ate')
# Set the dictionary values shared between local and global summaries
if discrete_treatment:
cats = est.transformer.categories_[0]
baseline = cats[est.transformer.drop_idx_[0]]
cats = cats[np.setdiff1d(np.arange(len(cats)),
est.transformer.drop_idx_[0])]
d_t = len(cats)
insights = {
_CausalInsightsConstants.TypeKey: ['cat'] * d_t,
_CausalInsightsConstants.RawFeatureNameKey: [name] * d_t,
_CausalInsightsConstants.CategoricalColumnKey: cats.tolist(),
_CausalInsightsConstants.EngineeredNameKey: [
f"{name} (base={baseline}): {c}" for c in cats]
}
treatment_value = 1
else:
d_t = 1
cats = ["num"]
baseline = None
insights = {
_CausalInsightsConstants.TypeKey: ["num"],
_CausalInsightsConstants.RawFeatureNameKey: [name],
_CausalInsightsConstants.CategoricalColumnKey: [name],
_CausalInsightsConstants.EngineeredNameKey: [name]
}
# calculate a "typical" treatment value, using the mean of the absolute value of non-zero treatments
treatment_value = np.mean(np.abs(T[T != 0]))
result = _result(feature_index=feat_ind,
feature_name=name,
feature_baseline=baseline,
feature_levels=cats,
hinds=hinds,
X_transformer=X_transformer,
W_transformer=W_transformer,
estimator=est,
global_inference=global_inference,
treatment_value=treatment_value)
return insights, result
except Exception as e:
warnings.warn(f"Exception caught when training model for feature {name}: {e}")
return e
# Unless we're opting into minimal cross-fitting, this is the minimum number of instances of each category
# required to fit a discrete DML model
_CAT_LIMIT = 10
# TODO: Add other nuisance model options, such as {'azure_automl', 'forests', 'boosting'} that will use particular
# sub-cases of models or also integrate with azure autoML. (post-MVP)
# TODO: Add other heterogeneity model options, such as {'automl'} for performing
# model selection for the causal effect, or {'sparse_linear'} for using a debiased lasso. (post-MVP)
# TODO: Enable multi-class classification (post-MVP)
class CausalAnalysis:
"""
Note: this class is experimental and the API may evolve over our next few releases.
Gets causal importance of features.
Parameters
----------
feature_inds: array_like of int, str, or bool
The features for which to estimate causal effects, expressed as either column indices,
column names, or boolean flags indicating which columns to pick
categorical: array_like of int, str, or bool
The features which are categorical in nature, expressed as either column indices,
column names, or boolean flags indicating which columns to pick
heterogeneity_inds: array_like of int, str, or bool, or None or list of array_like elements or None, default None
        If a 1d array, then whenever a heterogeneous (local) treatment effect model is
        estimated, only the features in this array will be used for heterogeneity. If a 2d
        array, then its first dimension should be len(feature_inds), and when estimating
        a local causal effect for target feature feature_inds[i], only the features in
        heterogeneity_inds[i] will be used for heterogeneity. If heterogeneity_inds[i]=None, then all
        features are used for heterogeneity when estimating the local causal effect for feature_inds[i],
        and likewise heterogeneity_inds[i]=[] means that no features will be used. If
        heterogeneity_inds=None, then all features are used for heterogeneity for all target features,
        and if heterogeneity_inds=[] then no features will be.
feature_names: list of str, optional
The names for all of the features in the data. Not necessary if the input will be a dataframe.
        If None and the input is a plain numpy array, generated feature names will be ['x0', 'x1', ...].
upper_bound_on_cat_expansion: int, default 5
The maximum number of categorical values allowed, because they are expanded via one-hot encoding. If a
        feature has more than this many values, then a causal effect model is not fitted for that target feature
        and a warning is raised. The remainder of the models are fitted.
classification: bool, default False
Whether this is a classification (as opposed to regression) task
nuisance_models: one of {'linear', 'automl'}, default 'linear'
The model class to use for nuisance estimation. Separate nuisance models are trained to predict the
outcome and also each individual feature column from all of the other columns in the dataset as a prerequisite
step before computing the actual causal effect for that feature column. If 'linear', then
:class:`~sklearn.linear_model.LassoCV` (for regression) or :class:`~sklearn.linear_model.LogisticRegressionCV`
(for classification) is used for these models. If 'automl', then model selection picks the best-performing
among several different model classes for each model being trained using k-fold cross-validation, which
requires additional computation.
heterogeneity_model: one of {'linear', 'forest'}, default 'linear'
        The type of model to use for the final heterogeneous treatment effect model. 'linear' means that
        the estimated treatment effect for a column will be a linear function of the heterogeneity features for that
column, while 'forest' means that a forest model will be trained to compute the effect from those heterogeneity
features instead.
categories: 'auto' or list of ('auto' or list of values), default 'auto'
What categories to use for the categorical columns. If 'auto', then the categories will be inferred for
all categorical columns; otherwise this argument should have as many entries as there are categorical columns,
and each entry should be either 'auto' to infer the values for that column or the list of values for the
column. If explicit values are provided, the first value is treated as the "control" value for that column
against which other values are compared.
n_jobs: int, default -1
Degree of parallelism to use when training models via joblib.Parallel
verbose : int, default 0
Controls the verbosity when fitting and predicting.
cv: int, cross-validation generator or an iterable, default 5
Determines the strategy for cross-fitting used when training causal models for each feature.
Possible inputs for cv are:
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
mc_iters: int, default 3
The number of times to rerun the first stage models to reduce the variance of the causal model nuisances.
skip_cat_limit_checks: bool, default False
By default, categorical features need to have several instances of each category in order for a model to be
fit robustly. Setting this to True will skip these checks (although at least 2 instances will always be
required for linear heterogeneity models, and 4 for forest heterogeneity models even in that case).
    random_state : int, RandomState instance, or None, default None
        Controls the randomness of the estimator. Several steps of model training are
        randomized (e.g. cross-fitting splits, model selection, and forest construction),
        so results can vary across runs even on the same data. To obtain deterministic
        behavior during fitting, ``random_state`` has to be fixed to an integer.
Attributes
----------
nuisance_models_: str
The nuisance models setting used for the most recent call to fit
heterogeneity_model: str
The heterogeneity model setting used for the most recent call to fit
feature_names_: list of str
The list of feature names from the data in the most recent call to fit
trained_feature_indices_: list of int
The list of feature indices where models were trained successfully
untrained_feature_indices_: list of tuple of (int, str or Exception)
        The list of indices that were requested but not able to be trained successfully,
        along with either a reason or caught Exception for each
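
    Examples
    --------
    A minimal usage sketch on synthetic data (the data, names, and alpha level here are
    purely illustrative)::

        import numpy as np
        import pandas as pd
        from econml.solutions.causal_analysis import CausalAnalysis

        rng = np.random.default_rng(0)
        X = pd.DataFrame({'a': rng.normal(size=500),
                          'b': rng.choice(['u', 'v'], size=500)})
        y = 2 * X['a'] + rng.normal(size=500)

        analysis = CausalAnalysis(feature_inds=['a', 'b'], categorical=['b'])
        analysis.fit(X, y)
        # summary methods such as this one are defined later in this module
        effects = analysis.global_causal_effect(alpha=0.05)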
"""
def __init__(self, feature_inds, categorical, heterogeneity_inds=None, feature_names=None, classification=False,
upper_bound_on_cat_expansion=5, nuisance_models='linear', heterogeneity_model='linear', *,
categories='auto', n_jobs=-1, verbose=0, cv=5, mc_iters=3, skip_cat_limit_checks=False,
random_state=None):
self.feature_inds = feature_inds
self.categorical = categorical
self.heterogeneity_inds = heterogeneity_inds
self.feature_names = feature_names
self.classification = classification
self.upper_bound_on_cat_expansion = upper_bound_on_cat_expansion
self.nuisance_models = nuisance_models
self.heterogeneity_model = heterogeneity_model
self.categories = categories
self.n_jobs = n_jobs
self.verbose = verbose
self.cv = cv
self.mc_iters = mc_iters
self.skip_cat_limit_checks = skip_cat_limit_checks
self.random_state = random_state
def fit(self, X, y, warm_start=False):
"""
Fits global and local causal effect models for each feature in feature_inds on the data.
Parameters
----------
X : array_like
Feature data
y : array_like of shape (n,) or (n,1)
            Outcome. If classification=True, then y should take exactly two values; otherwise
            an error is raised, since only binary classification is implemented for now.
            TODO: enable multi-class classification for y (post-MVP)
warm_start : bool, default False
If False, train models for each feature in `feature_inds`.
            If True, train only models for features in `feature_inds` that had not already been trained by
            the previous call to `fit`, and for which neither the corresponding heterogeneity_inds nor the
            automl flag has changed. If heterogeneity_inds have changed, then the final stage model of those
            features will be refit. If the automl flag has changed, then the whole model is refit, despite
            the warm start flag.
"""
# Validate inputs
assert self.nuisance_models in ['automl', 'linear'], (
"The only supported nuisance models are 'linear' and 'automl', "
f"but was given {self.nuisance_models}")
assert self.heterogeneity_model in ['linear', 'forest'], (
"The only supported heterogeneity models are 'linear' and 'forest' but received "
f"{self.heterogeneity_model}")
assert np.ndim(X) == 2, f"X must be a 2-dimensional array, but here had shape {np.shape(X)}"
assert iterable(self.feature_inds), f"feature_inds should be array_like, but got {self.feature_inds}"
assert iterable(self.categorical), f"categorical should be array_like, but got {self.categorical}"
assert self.heterogeneity_inds is None or iterable(self.heterogeneity_inds), (
f"heterogeneity_inds should be None or array_like, but got {self.heterogeneity_inds}")
assert self.feature_names is None or iterable(self.feature_names), (
f"feature_names should be None or array_like, but got {self.feature_names}")
assert self.categories == 'auto' or iterable(self.categories), (
f"categories should be 'auto' or array_like, but got {self.categories}")
# TODO: check compatibility of X and Y lengths
if warm_start:
if not hasattr(self, "_results"):
# no previous fit, cancel warm start
warm_start = False
elif self._d_x != X.shape[1]:
raise ValueError(
f"Can't warm start: previous X had {self._d_x} columns, new X has {X.shape[1]} columns")
# work with numeric feature indices, so that we can easily compare with categorical ones
train_inds = _get_column_indices(X, self.feature_inds)
if len(train_inds) == 0:
raise ValueError(
"No features specified. At least one feature index must be specified so that a model can be trained.")
heterogeneity_inds = self.heterogeneity_inds
if heterogeneity_inds is None:
heterogeneity_inds = [None for ind in train_inds]
# if heterogeneity_inds is 1D, repeat it
if heterogeneity_inds == [] or isinstance(heterogeneity_inds[0], (int, str, bool)):
heterogeneity_inds = [heterogeneity_inds for _ in train_inds]
# heterogeneity inds should be a 2D list of length same as train_inds
elif heterogeneity_inds is not None and len(heterogeneity_inds) != len(train_inds):
raise ValueError("Heterogeneity indexes should have the same number of entries, but here "
f" there were {len(heterogeneity_inds)} heterogeneity entries but "
f" {len(train_inds)} feature indices.")
# replace None elements of heterogeneity_inds and ensure indices are numeric
heterogeneity_inds = {ind: list(range(X.shape[1])) if hinds is None else _get_column_indices(X, hinds)
for ind, hinds in zip(train_inds, heterogeneity_inds)}
if warm_start:
train_y_model = False
if self.nuisance_models != self.nuisance_models_:
warnings.warn("warm_start will be ignored since the nuisance models have changed "
f"from {self.nuisance_models_} to {self.nuisance_models} since the previous call to fit")
warm_start = False
train_y_model = True
if self.heterogeneity_model != self.heterogeneity_model_:
warnings.warn("warm_start will be ignored since the heterogeneity model has changed "
f"from {self.heterogeneity_model_} to {self.heterogeneity_model} "
"since the previous call to fit")
warm_start = False
# TODO: bail out also if categorical columns, classification, random_state changed?
else:
train_y_model = True
# TODO: should we also train a new model_y under any circumstances when warm_start is True?
if warm_start:
new_inds = [ind for ind in train_inds if (ind not in self._cache or
heterogeneity_inds[ind] != self._cache[ind][1].hinds)]
else:
new_inds = list(train_inds)
self._cache = {} # store mapping from feature to insights, results
# train the Y model
if train_y_model:
# perform model selection for the Y model using all X, not on a per-column basis
allX = ColumnTransformer([('encode',
one_hot_encoder(drop='first'),
self.categorical)],
remainder=StandardScaler()).fit_transform(X)
if self.verbose > 0:
print("CausalAnalysis: performing model selection on overall Y model")
if self.classification:
self._model_y = _first_stage_clf(allX, y, automl=self.nuisance_models == 'automl',
make_regressor=True,
random_state=self.random_state, verbose=self.verbose)
else:
self._model_y = _first_stage_reg(allX, y, automl=self.nuisance_models == 'automl',
random_state=self.random_state, verbose=self.verbose)
if self.classification:
# now that we've trained the classifier and wrapped it, ensure that y is transformed to
# work with the regression wrapper
# we use column_or_1d to treat pd.Series and pd.DataFrame objects the same way as arrays
y = column_or_1d(y).reshape(-1, 1)
# note that this needs to happen after wrapping to generalize to the multi-class case,
# since otherwise we'll have too many columns to be able to train a classifier
y = one_hot_encoder(drop='first').fit_transform(y)
assert y.ndim == 1 or y.shape[1] == 1, ("Multiclass classification isn't supported" if self.classification
else "Only a single outcome is supported")
self._vec_y = y.ndim == 1
self._d_x = X.shape[1]
# start with empty results and default shared insights
self._results = []
self._shared = _get_default_shared_insights_output()
self._shared[_CausalInsightsConstants.InitArgsKey] = {
'feature_inds': _sanitize(self.feature_inds),
'categorical': _sanitize(self.categorical),
'heterogeneity_inds': _sanitize(self.heterogeneity_inds),
'feature_names': _sanitize(self.feature_names),
'classification': _sanitize(self.classification),
'upper_bound_on_cat_expansion': _sanitize(self.upper_bound_on_cat_expansion),
'nuisance_models': _sanitize(self.nuisance_models),
'heterogeneity_model': _sanitize(self.heterogeneity_model),
'categories': _sanitize(self.categories),
'n_jobs': _sanitize(self.n_jobs),
'verbose': _sanitize(self.verbose),
'random_state': _sanitize(self.random_state)
}
# convert categorical indicators to numeric indices
categorical_inds = _get_column_indices(X, self.categorical)
categories = self.categories
if categories == 'auto':
categories = ['auto' for _ in categorical_inds]
else:
assert len(categories) == len(categorical_inds), (
"If categories is not 'auto', it must contain one entry per categorical column. Instead, categories"
f"has length {len(categories)} while there are {len(categorical_inds)} categorical columns.")
# check for indices over the categorical expansion bound
invalid_inds = getattr(self, 'untrained_feature_indices_', [])
# assume we'll be able to train former failures this time; we'll add them back if not
invalid_inds = [(ind, reason) for (ind, reason) in invalid_inds if ind not in new_inds]
self._has_column_names = True
if self.feature_names is None:
if hasattr(X, "iloc"):
feature_names = X.columns
else:
self._has_column_names = False
feature_names = [f"x{i}" for i in range(X.shape[1])]
else:
feature_names = self.feature_names
self.feature_names_ = feature_names
min_counts = {}
for ind in new_inds:
column_text = self._format_col(ind)
if ind in categorical_inds:
cats, counts = np.unique(_safe_indexing(X, ind, axis=1), return_counts=True)
min_ind = np.argmin(counts)
n_cat = len(cats)
if n_cat > self.upper_bound_on_cat_expansion:
warnings.warn(f"{column_text} has more than {self.upper_bound_on_cat_expansion} "
f"values (found {n_cat}) so no heterogeneity model will be fit for it; "
"increase 'upper_bound_on_cat_expansion' to change this behavior.")
# can't remove in place while iterating over new_inds, so store in separate list
invalid_inds.append((ind, 'upper_bound_on_cat_expansion'))
elif counts[min_ind] < _CAT_LIMIT:
if self.skip_cat_limit_checks and (counts[min_ind] >= 5 or
(counts[min_ind] >= 2 and
self.heterogeneity_model != 'forest')):
# train the model, but warn
warnings.warn(f"{column_text}'s value {cats[min_ind]} has only {counts[min_ind]} instances in "
f"the training dataset, which is less than the lower limit ({_CAT_LIMIT}). "
"A model will still be fit because 'skip_cat_limit_checks' is True, "
"but this model may not be robust.")
min_counts[ind] = counts[min_ind]
elif counts[min_ind] < 2 or (counts[min_ind] < 5 and self.heterogeneity_model == 'forest'):
# no model can be trained in this case since we need more folds
warnings.warn(f"{column_text}'s value {cats[min_ind]} has only {counts[min_ind]} instances in "
"the training dataset, but linear heterogeneity models need at least 2 and "
"forest heterogeneity models need at least 5 instances, so no model will be fit "
"for this column")
invalid_inds.append((ind, 'cat_limit'))
else:
                        # don't train a model, but suggest a workaround, since there are enough
                        # instances of the least populated class
warnings.warn(f"{column_text}'s value {cats[min_ind]} has only {counts[min_ind]} instances in "
f"the training dataset, which is less than the lower limit ({_CAT_LIMIT}), "
"so no heterogeneity model will be fit for it. This check can be turned off by "
"setting 'skip_cat_limit_checks' to True, but that may result in an inaccurate "
"model for this feature.")
invalid_inds.append((ind, 'cat_limit'))
for (ind, _) in invalid_inds:
new_inds.remove(ind)
# also remove from train_inds so we don't try to access the result later
train_inds.remove(ind)
if len(train_inds) == 0:
raise ValueError("No features remain; increase the upper_bound_on_cat_expansion and ensure that there "
"are several instances of each categorical value so that at least "
"one feature model can be trained.")
# extract subset of names matching new columns
new_feat_names = _safe_indexing(feature_names, new_inds)
cache_updates = dict(zip(new_inds,
joblib.Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose
)(joblib.delayed(_process_feature)(
feat_name, feat_ind,
self.verbose, categorical_inds, categories, heterogeneity_inds, min_counts, y, X,
self.nuisance_models, self.heterogeneity_model, self.random_state, self._model_y,
self.cv, self.mc_iters)
for feat_name, feat_ind in zip(new_feat_names, new_inds))))
# track indices where an exception was thrown, since we can't remove from dictionary while iterating
inds_to_remove = []
for ind, value in cache_updates.items():
if isinstance(value, Exception):
# don't want to cache this failed result
inds_to_remove.append(ind)
train_inds.remove(ind)
invalid_inds.append((ind, value))
for ind in inds_to_remove:
del cache_updates[ind]
self._cache.update(cache_updates)
for ind in train_inds:
dict_update, result = self._cache[ind]
self._results.append(result)
for k in dict_update:
self._shared[k] += dict_update[k]
invalid_inds.sort()
self.untrained_feature_indices_ = invalid_inds
self.trained_feature_indices_ = train_inds
self.nuisance_models_ = self.nuisance_models
self.heterogeneity_model_ = self.heterogeneity_model
return self
def _format_col(self, ind):
if self._has_column_names:
return f"Column {ind} ({self.feature_names_[ind]})"
else:
return f"Column {ind}"
# properties to return from effect InferenceResults
@staticmethod
def _point_props(alpha):
return [(_CausalInsightsConstants.PointEstimateKey, 'point_estimate'),
(_CausalInsightsConstants.StandardErrorKey, 'stderr'),
(_CausalInsightsConstants.ZStatKey, 'zstat'),
(_CausalInsightsConstants.PValueKey, 'pvalue'),
(_CausalInsightsConstants.ConfidenceIntervalLowerKey, lambda inf: inf.conf_int(alpha=alpha)[0]),
(_CausalInsightsConstants.ConfidenceIntervalUpperKey, lambda inf: inf.conf_int(alpha=alpha)[1])]
# properties to return from PopulationSummaryResults
@staticmethod
def _summary_props(alpha):
return [(_CausalInsightsConstants.PointEstimateKey, 'mean_point'),
(_CausalInsightsConstants.StandardErrorKey, 'stderr_mean'),
(_CausalInsightsConstants.ZStatKey, 'zstat'),
(_CausalInsightsConstants.PValueKey, 'pvalue'),
(_CausalInsightsConstants.ConfidenceIntervalLowerKey, lambda inf: inf.conf_int_mean(alpha=alpha)[0]),
(_CausalInsightsConstants.ConfidenceIntervalUpperKey, lambda inf: inf.conf_int_mean(alpha=alpha)[1])]
# Converts strings to property lookups or method calls as a convenience so that the
# _point_props and _summary_props above can be applied to an inference object
@staticmethod
def _make_accessor(attr):
if isinstance(attr, str):
s = attr
def attr(o):
val = getattr(o, s)
if callable(val):
return val()
else:
return val
return attr
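    # e.g. _make_accessor('stderr')(inf) returns inf.stderr, calling it first if it is a method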
# Create a summary combining all results into a single output; this is used
# by the various causal_effect and causal_effect_dict methods to generate either a dataframe
# or a dictionary, respectively, based on the summary function passed into this method
def _summarize(self, *, summary, get_inference, props, expand_arr, drop_sample):
assert hasattr(self, "_results"), "This object has not been fit, so cannot get results"
# ensure array has shape (m,y,t)
def ensure_proper_dims(arr):
if expand_arr:
# population summary is missing sample dimension; add it for consistency
arr = np.expand_dims(arr, 0)
if self._vec_y:
# outcome dimension is missing; add it for consistency
arr = np.expand_dims(arr, axis=1)
assert 2 <= arr.ndim <= 3
# add singleton treatment dimension if missing
return arr if arr.ndim == 3 else np.expand_dims(arr, axis=2)
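        # e.g. ensure_proper_dims turns an attribute of shape (m, y) into (m, y, 1)
        # by adding a singleton treatment axis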
# store set of inference results so we don't need to recompute per-attribute below in summary/coalesce
infs = [get_inference(res) for res in self._results]
# each attr has dimension (m,y) or (m,y,t)
def coalesce(attr):
"""Join together the arrays for each feature."""
attr = self._make_accessor(attr)
# concatenate along treatment dimension
arr = np.concatenate([ensure_proper_dims(attr(inf))
for inf in infs], axis=2)
# for dictionary representation, want to remove unneeded sample dimension
# in cohort and global results
if drop_sample:
arr = np.squeeze(arr, 0)
return arr
return summary([(key, coalesce(val)) for key, val in props])
def _pandas_summary(self, get_inference, *, props, n,
expand_arr=False, keep_all_levels=False):
"""
Summarizes results into a dataframe.
Parameters
----------
        get_inference : callable
            Method to get the relevant inference results from each result object
        props : list of (str, str or callable)
            Set of column names and ways to get the corresponding values from the inference object
n : int
The number of samples in the dataset