Skip to content

Commit 10561c7

Browse files
committed
Stable Version with Catboost
1 parent 79620e7 commit 10561c7

11 files changed

+5097
-5092
lines changed

.gitignore

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1-
.ipynb_checkpoints/
2-
__pycache__/
3-
.idea/
4-
dist/
5-
autoviz.egg-info/
6-
autoviml.egg-info/
7-
build/
1+
.ipynb_checkpoints/
2+
__pycache__/
3+
.idea/
4+
dist/
5+
autoviz.egg-info/
6+
autoviml.egg-info/
7+
build/
88
diagnosis/

Auto_ViML_Demo.ipynb

Lines changed: 571 additions & 571 deletions
Large diffs are not rendered by default.

LICENSE

Lines changed: 201 additions & 201 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 128 additions & 127 deletions
Large diffs are not rendered by default.

autoviml/Auto_ViML.py

Lines changed: 3512 additions & 3508 deletions
Large diffs are not rendered by default.

autoviml/QuickML_Ensembling.py

Lines changed: 184 additions & 184 deletions
Large diffs are not rendered by default.

autoviml/QuickML_Stacking.py

Lines changed: 182 additions & 182 deletions
Large diffs are not rendered by default.

autoviml/Transform_KM_Features.py

Lines changed: 141 additions & 141 deletions
Original file line number | Diff line number | Diff line change
@@ -1,141 +1,141 @@
1-
import numpy as np
2-
import pandas as pd
3-
from sklearn.cluster import KMeans
4-
import sklearn
5-
import scipy
6-
7-
class KMeansFeaturizer:
    """Transform numeric data into k-means cluster memberships.

    From the Feature Engineering Book by Alice Zheng
    https://github.com/alicezheng/feature-engineering-book/blob/master/kmeans_featurizer.py

    This transformer runs k-means on the input data and converts each data
    point into the id of the closest cluster. If a target variable is
    present, it is scaled and included as input to k-means in order to derive
    clusters that obey the classification boundary as well as group similar
    points together.

    Parameters
    ----------
    k : integer, optional, default 100
        The number of clusters to group data into.
    target_scale : float, [0, infty], optional, default 5.0
        The scaling factor for the target variable. Set this to zero to
        ignore the target. For classification problems, larger
        `target_scale` values will produce clusters that better respect the
        class boundary.
    random_state : integer or numpy.RandomState, optional
        This is passed to k-means as the generator used to initialize the
        kmeans centers. If an integer is given, it fixes the seed. Defaults
        to the global numpy random number generator.

    Attributes
    ----------
    cluster_centers_ : array, [k, n_features]
        Coordinates of cluster centers in the space of `X`; the target
        column is not included because the stored model is fitted on `X`
        alone.
    km_model_ : KMeans
        The fitted k-means model that `transform` uses for prediction.
    km_model : KMeans
        Alias of `km_model_`, kept for backward compatibility.
    """

    def __init__(self, k=100, target_scale=5.0, random_state=None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def _store_model(self, km_model):
        """Record the fitted k-means model and its centroids; return self."""
        self.km_model_ = km_model
        # Both names are set on every fit: previously the unsupervised branch
        # wrote `km_model_` while `transform` read `km_model`, so
        # fit(X).transform(X) raised AttributeError.
        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def fit(self, X, y=None):
        """Run k-means on the input data and find centroids.

        If no target is given (`y` is None) then run vanilla k-means on input
        `X`.

        If target `y` is given, then include the target (weighted by
        `target_scale`) as an extra dimension for k-means clustering. In this
        case, run k-means twice, first with the target, then an extra
        iteration without.

        After fitting, the attribute `cluster_centers_` is set to the k-means
        centroids in the input space represented by `X`.

        Parameters
        ----------
        X : array-like, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            If provided, will be weighted with `target_scale` and included in
            k-means clustering as hint. May be a numpy array, a list or a
            pandas Series.

        Returns
        -------
        self : KMeansFeaturizer
        """
        n_features = X.shape[1]
        if y is None:
            # No target variable, just do plain k-means
            km_model = KMeans(n_clusters=self.k,
                              n_init=20,
                              random_state=self.random_state)
            km_model.fit(X)
            return self._store_model(km_model)

        # There is target information. Apply appropriate scaling and include
        # into input data to k-means. The target is converted to an ndarray
        # first: `y[:, np.newaxis]` is not supported on a pandas Series in
        # current pandas releases.
        target_column = np.asarray(y).reshape(-1, 1) * self.target_scale
        data_with_target = np.hstack((X, target_column))

        # Build a pre-training k-means model on data and target
        km_model_pretrain = KMeans(n_clusters=self.k,
                                   n_init=20,
                                   random_state=self.random_state)
        km_model_pretrain.fit(data_with_target)

        # Run k-means a second time to get the clusters in the original space
        # without target info. Initialize using centroids found in
        # pre-training (target coordinate dropped). Go through a single
        # iteration of cluster assignment and centroid recomputation.
        km_model = KMeans(n_clusters=self.k,
                          init=km_model_pretrain.cluster_centers_[:, :n_features],
                          n_init=1,
                          max_iter=1)
        km_model.fit(X)
        return self._store_model(km_model)

    def transform(self, X, y=None):
        """Output the closest cluster id for each input data point.

        Parameters
        ----------
        X : array-like, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            Target vector is ignored even if provided.

        Returns
        -------
        cluster_ids : array, shape[n_data_points,1]
        """
        clusters = self.km_model_.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y=None):
        """Run fit followed by transform."""
        self.fit(X, y)
        return self.transform(X, y)
110-
111-
112-
from collections import defaultdict
113-
import operator
114-
import pdb
115-
import copy
116-
from sklearn.model_selection import train_test_split
117-
def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
    """Append a k-means cluster-id column to the train and test data.

    A KMeansFeaturizer is fitted on ``training_data`` and the id of the
    closest cluster is added as a ``'cluster'`` column to both data sets.
    The featurizer is created with ``target_scale=0``, so the labels only
    contribute a constant zero column to the pre-training k-means run.

    Parameters
    ----------
    training_data : DataFrame-like
        Training predictors; its column names and index are reused for the
        returned training frame.
    training_labels : Series-like
        Training target; its ``name`` becomes the target column name.
    test_data : DataFrame-like
        Test predictors; its index is reused for the returned test frame.
        NOTE(review): presumably it has the same columns as
        ``training_data`` -- confirm against callers.
    km_max : int, optional, default 0
        Number of clusters. When 0 it is derived from the number of training
        rows as ``int(log10(n_rows) + 0.49)``. Values of 2 or less are
        raised to 2.

    Returns
    -------
    train_with_cluster_df : pandas.DataFrame
        Predictor columns, then the target column, then ``'cluster'``.
    test_with_cluster_df : pandas.DataFrame
        Predictor columns, then ``'cluster'``.
    """
    seed = 99
    preds = list(training_data)
    target = training_labels.name

    if km_max == 0:
        km_max = int(np.log10(training_data.shape[0]) + 0.49)
    # k-means needs at least two clusters to produce a useful feature.
    k_max = max(2, km_max)

    # Hand the featurizer a plain ndarray: it indexes the target with
    # `y[:, np.newaxis]`, which a pandas Series does not support in current
    # pandas releases.
    label_values = np.asarray(training_labels)
    kmf = KMeansFeaturizer(k=k_max, target_scale=0, random_state=seed)
    kmf.fit(training_data, label_values)

    training_cluster_features = kmf.transform(training_data)
    test_cluster_features = kmf.transform(test_data)

    # Column-stack predictors, target and cluster id into one array per set.
    training_with_cluster = np.c_[training_data, label_values,
                                  training_cluster_features]
    test_with_cluster = np.c_[test_data, test_cluster_features]

    train_with_cluster_df = pd.DataFrame(training_with_cluster,
                                         index=training_data.index,
                                         columns=preds + [target, 'cluster'])
    test_with_cluster_df = pd.DataFrame(test_with_cluster,
                                        index=test_data.index,
                                        columns=preds + ['cluster'])
    return train_with_cluster_df, test_with_cluster_df
1+
import numpy as np
2+
import pandas as pd
3+
from sklearn.cluster import KMeans
4+
import sklearn
5+
import scipy
6+
7+
class KMeansFeaturizer:
    """Transform numeric data into k-means cluster memberships.

    From the Feature Engineering Book by Alice Zheng
    https://github.com/alicezheng/feature-engineering-book/blob/master/kmeans_featurizer.py

    This transformer runs k-means on the input data and converts each data
    point into the id of the closest cluster. If a target variable is
    present, it is scaled and included as input to k-means in order to derive
    clusters that obey the classification boundary as well as group similar
    points together.

    Parameters
    ----------
    k : integer, optional, default 100
        The number of clusters to group data into.
    target_scale : float, [0, infty], optional, default 5.0
        The scaling factor for the target variable. Set this to zero to
        ignore the target. For classification problems, larger
        `target_scale` values will produce clusters that better respect the
        class boundary.
    random_state : integer or numpy.RandomState, optional
        This is passed to k-means as the generator used to initialize the
        kmeans centers. If an integer is given, it fixes the seed. Defaults
        to the global numpy random number generator.

    Attributes
    ----------
    cluster_centers_ : array, [k, n_features]
        Coordinates of cluster centers in the space of `X`; the target
        column is not included because the stored model is fitted on `X`
        alone.
    km_model_ : KMeans
        The fitted k-means model that `transform` uses for prediction.
    km_model : KMeans
        Alias of `km_model_`, kept for backward compatibility.
    """

    def __init__(self, k=100, target_scale=5.0, random_state=None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def _store_model(self, km_model):
        """Record the fitted k-means model and its centroids; return self."""
        self.km_model_ = km_model
        # Both names are set on every fit: previously the unsupervised branch
        # wrote `km_model_` while `transform` read `km_model`, so
        # fit(X).transform(X) raised AttributeError.
        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def fit(self, X, y=None):
        """Run k-means on the input data and find centroids.

        If no target is given (`y` is None) then run vanilla k-means on input
        `X`.

        If target `y` is given, then include the target (weighted by
        `target_scale`) as an extra dimension for k-means clustering. In this
        case, run k-means twice, first with the target, then an extra
        iteration without.

        After fitting, the attribute `cluster_centers_` is set to the k-means
        centroids in the input space represented by `X`.

        Parameters
        ----------
        X : array-like, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            If provided, will be weighted with `target_scale` and included in
            k-means clustering as hint. May be a numpy array, a list or a
            pandas Series.

        Returns
        -------
        self : KMeansFeaturizer
        """
        n_features = X.shape[1]
        if y is None:
            # No target variable, just do plain k-means
            km_model = KMeans(n_clusters=self.k,
                              n_init=20,
                              random_state=self.random_state)
            km_model.fit(X)
            return self._store_model(km_model)

        # There is target information. Apply appropriate scaling and include
        # into input data to k-means. The target is converted to an ndarray
        # first: `y[:, np.newaxis]` is not supported on a pandas Series in
        # current pandas releases.
        target_column = np.asarray(y).reshape(-1, 1) * self.target_scale
        data_with_target = np.hstack((X, target_column))

        # Build a pre-training k-means model on data and target
        km_model_pretrain = KMeans(n_clusters=self.k,
                                   n_init=20,
                                   random_state=self.random_state)
        km_model_pretrain.fit(data_with_target)

        # Run k-means a second time to get the clusters in the original space
        # without target info. Initialize using centroids found in
        # pre-training (target coordinate dropped). Go through a single
        # iteration of cluster assignment and centroid recomputation.
        km_model = KMeans(n_clusters=self.k,
                          init=km_model_pretrain.cluster_centers_[:, :n_features],
                          n_init=1,
                          max_iter=1)
        km_model.fit(X)
        return self._store_model(km_model)

    def transform(self, X, y=None):
        """Output the closest cluster id for each input data point.

        Parameters
        ----------
        X : array-like, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            Target vector is ignored even if provided.

        Returns
        -------
        cluster_ids : array, shape[n_data_points,1]
        """
        clusters = self.km_model_.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y=None):
        """Run fit followed by transform."""
        self.fit(X, y)
        return self.transform(X, y)
110+
111+
112+
from collections import defaultdict
113+
import operator
114+
import pdb
115+
import copy
116+
from sklearn.model_selection import train_test_split
117+
def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
    """Append a k-means cluster-id column to the train and test data.

    A KMeansFeaturizer is fitted on ``training_data`` and the id of the
    closest cluster is added as a ``'cluster'`` column to both data sets.
    The featurizer is created with ``target_scale=0``, so the labels only
    contribute a constant zero column to the pre-training k-means run.

    Parameters
    ----------
    training_data : DataFrame-like
        Training predictors; its column names and index are reused for the
        returned training frame.
    training_labels : Series-like
        Training target; its ``name`` becomes the target column name.
    test_data : DataFrame-like
        Test predictors; its index is reused for the returned test frame.
        NOTE(review): presumably it has the same columns as
        ``training_data`` -- confirm against callers.
    km_max : int, optional, default 0
        Number of clusters. When 0 it is derived from the number of training
        rows as ``int(log10(n_rows) + 0.49)``. Values of 2 or less are
        raised to 2.

    Returns
    -------
    train_with_cluster_df : pandas.DataFrame
        Predictor columns, then the target column, then ``'cluster'``.
    test_with_cluster_df : pandas.DataFrame
        Predictor columns, then ``'cluster'``.
    """
    seed = 99
    preds = list(training_data)
    target = training_labels.name

    if km_max == 0:
        km_max = int(np.log10(training_data.shape[0]) + 0.49)
    # k-means needs at least two clusters to produce a useful feature.
    k_max = max(2, km_max)

    # Hand the featurizer a plain ndarray: it indexes the target with
    # `y[:, np.newaxis]`, which a pandas Series does not support in current
    # pandas releases.
    label_values = np.asarray(training_labels)
    kmf = KMeansFeaturizer(k=k_max, target_scale=0, random_state=seed)
    kmf.fit(training_data, label_values)

    training_cluster_features = kmf.transform(training_data)
    test_cluster_features = kmf.transform(test_data)

    # Column-stack predictors, target and cluster id into one array per set.
    training_with_cluster = np.c_[training_data, label_values,
                                  training_cluster_features]
    test_with_cluster = np.c_[test_data, test_cluster_features]

    train_with_cluster_df = pd.DataFrame(training_with_cluster,
                                         index=training_data.index,
                                         columns=preds + [target, 'cluster'])
    test_with_cluster_df = pd.DataFrame(test_with_cluster,
                                        index=test_data.index,
                                        columns=preds + ['cluster'])
    return train_with_cluster_df, test_with_cluster_df

0 commit comments

Comments
 (0)