-
Notifications
You must be signed in to change notification settings - Fork 61
/
Copy pathdemo15.py
109 lines (87 loc) · 3.58 KB
/
demo15.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lightautoml.automl.base import AutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import CategoryRole
from lightautoml.dataset.roles import NumericRole
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector
from lightautoml.pipelines.selection.importance_based import (
ModelBasedImportanceEstimator,
)
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
################################
# Features:
# - group_by transformer
################################
N_FOLDS = 3 # number of folds for cross-validation inside AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
N_THREADS = 4 # threads cnt for lgbm and linear models
TIMEOUT = 100
USED_COLS = ["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL", "DAYS_BIRTH"]
TARGET = "TARGET"
# load data
data = pd.read_csv("./data/sampled_app_train.csv")
data = data[USED_COLS]
train, test = train_test_split(data, test_size=2000, random_state=42)
# Using TabularAutoML preset
task = Task("binary")
roles = {
"target": TARGET,
CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "CODE_GENDER"],
NumericRole(np.float32): ["AMT_INCOME_TOTAL"],
}
# specify groupby triplets: [("group_col", "feature", "transform_type"),]
groupby_triplets = [
("CODE_GENDER", "AMT_INCOME_TOTAL", "max"),
("NAME_CONTRACT_TYPE", "CODE_GENDER", "mode"),
("NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "delta_mean"),
]
print(f"Try TabularAutoML with the following groupby_triplets:\n{groupby_triplets}")
automl = TabularAutoML(
task=task,
timeout=TIMEOUT,
cpu_limit=N_THREADS,
reader_params={"n_jobs": N_THREADS, "cv": N_FOLDS, "random_state": RANDOM_STATE},
general_params={"use_algos": [["lgb"]]},
gbm_pipeline_params={"use_groupby": True, "groupby_triplets": groupby_triplets},
)
automl.fit_predict(train, roles=roles)
feature_scores = automl.levels[0][0].ml_algos[0].get_features_score()
print(f"Feature importances of BoostLGBM model. Pay attention to groupby features:\n{feature_scores}")
# Custom pipeline with groupby features defined by importance
print("\nTry custom pipeline with groupby features defined by importance:\n")
task = Task("binary")
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)
model0 = BoostLGBM(default_params={"learning_rate": 0.1, "num_leaves": 64, "seed": 42, "num_threads": N_THREADS})
pie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-9999)
pipe = LGBAdvancedPipeline(
use_groupby=True, pre_selector=selector, groupby_types=["delta_median", "std"], groupby_top_based_on="importance"
)
model = BoostLGBM(
default_params={
"learning_rate": 0.05,
"num_leaves": 128,
"seed": 1,
"num_threads": N_THREADS,
}
)
pipeline = MLPipeline([model], pre_selection=selector, features_pipeline=pipe, post_selection=None)
automl = AutoML(
reader,
[[pipeline]],
skip_conn=False,
)
oof_pred = automl.fit_predict(
train,
roles={"target": TARGET},
)
print(f"Feature used by BoostLGBM model. Pay attention to groupby features:\n{pipe.output_features}")