import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

class KMeansFeaturizer:
    """Transforms numeric data into k-means cluster memberships.

    From "Feature Engineering for Machine Learning" by Alice Zheng:
    https://github.com/alicezheng/feature-engineering-book/blob/master/kmeans_featurizer.py

    This transformer runs k-means on the input data and converts each data
    point into the id of the closest cluster. If a target variable is
    present, it is scaled and included as input to k-means in order to derive
    clusters that obey the classification boundary as well as group similar
    points together.

    Parameters
    ----------
    k : integer, optional, default 100
        The number of clusters to group data into.
    target_scale : float, [0, infty], optional, default 5.0
        The scaling factor for the target variable. Set this to zero to
        ignore the target. For classification problems, larger `target_scale`
        values will produce clusters that better respect the class boundary.
    random_state : integer or numpy.RandomState, optional
        Passed to k-means as the generator used to initialize the k-means
        centers. If an integer is given, it fixes the seed. Defaults to the
        global numpy random number generator.

    Attributes
    ----------
    cluster_centers_ : array, [k, n_features]
        Coordinates of cluster centers. n_features does not count the target
        column, since the final centroids live in the original input space.
    """

    def __init__(self, k=100, target_scale=5.0, random_state=None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def fit(self, X, y=None):
        """Runs k-means on the input data and finds the centroids.

        If no target is given (`y` is None), run vanilla k-means on the input
        `X`. If a target `y` is given, include it (weighted by `target_scale`)
        as an extra dimension for k-means clustering. In this case, k-means
        runs twice: first with the target, then one extra iteration without.

        After fitting, the attribute `cluster_centers_` is set to the k-means
        centroids in the input space represented by `X`.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            If provided, will be weighted by `target_scale` and included in
            the k-means clustering as a hint.
        """
        n_features = X.shape[1]
        if y is None:
            # No target variable: just do plain k-means.
            km_model = KMeans(n_clusters=self.k,
                              n_init=20,
                              random_state=self.random_state)
            km_model.fit(X)

            self.km_model = km_model
            self.cluster_centers_ = km_model.cluster_centers_
            return self

        # There is target information. Apply the appropriate scaling and
        # append it as an extra column of the k-means input data. Coerce `y`
        # to an ndarray so that pandas Series targets work as well.
        y_scaled = np.asarray(y)[:, np.newaxis] * self.target_scale
        data_with_target = np.hstack((X, y_scaled))

        # Build a pre-training k-means model on the data and target.
        km_model_pretrain = KMeans(n_clusters=self.k,
                                   n_init=20,
                                   random_state=self.random_state)
        km_model_pretrain.fit(data_with_target)

        # Run k-means a second time to get the clusters in the original space
        # without target info. Initialize using the centroids found during
        # pre-training (with the target column dropped), and go through a
        # single iteration of cluster assignment and centroid recomputation.
        km_model = KMeans(n_clusters=self.k,
                          init=km_model_pretrain.cluster_centers_[:, :n_features],
                          n_init=1,
                          max_iter=1,
                          random_state=self.random_state)
        km_model.fit(X)

        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def transform(self, X, y=None):
        """Outputs the closest cluster id for each input data point.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            The target vector is ignored even if provided.

        Returns
        -------
        cluster_ids : array, shape=(n_data_points, 1)
        """
        clusters = self.km_model.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y=None):
        """Runs fit followed by transform."""
        self.fit(X, y)
        return self.transform(X, y)

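# Usage sketch for KMeansFeaturizer (illustrative only; the synthetic data
# and variable names below are assumptions, not part of the original):
#
#     from sklearn.datasets import make_classification
#     X, y = make_classification(n_samples=500, n_features=5,
#                                n_informative=3, random_state=0)
#     kmf = KMeansFeaturizer(k=10, target_scale=5.0, random_state=0)
#     cluster_ids = kmf.fit_transform(X, y)   # array of shape (500, 1)
#
# The returned column of cluster ids can then be appended to X as a single
# categorical feature (or one-hot encoded) for a downstream classifier.
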
import copy

def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
    seed = 99
    preds = list(training_data)          # predictor column names
    target = training_labels.name        # target column name
    train_index = training_data.index
    test_index = test_data.index
    if km_max == 0:
        # Heuristic default: roughly log10(n_rows) clusters, rounded.
        km_max = int(np.log10(training_data.shape[0]) + 0.49)
    if km_max <= 2:
        k_max = 2
    else:
        k_max = copy.deepcopy(km_max)

    # Note: target_scale=0 means the target hint carries no weight here
    # (see the KMeansFeaturizer docstring).
    kmf = KMeansFeaturizer(k=k_max, target_scale=0, random_state=seed)
    kmf_hint = kmf.fit(training_data, training_labels)

    training_cluster_features = kmf_hint.transform(training_data)
    test_cluster_features = kmf_hint.transform(test_data)

    # Append the target and cluster-id columns, then rebuild DataFrames
    # with the original row indices.
    npx = np.c_[training_data, training_labels.values]
    training_with_cluster = np.c_[npx, training_cluster_features]
    test_with_cluster = np.c_[test_data, test_cluster_features]
    train_with_cluster_df = pd.DataFrame(training_with_cluster,
                                         index=train_index,
                                         columns=preds + [target, 'cluster'])
    test_with_cluster_df = pd.DataFrame(test_with_cluster,
                                        index=test_index,
                                        columns=preds + ['cluster'])
    return train_with_cluster_df, test_with_cluster_df
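

# Minimal end-to-end demo (illustrative only: the synthetic data, column
# names, and split below are assumptions, not part of the original code):
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=500, n_features=4,
                               n_informative=2, n_redundant=2,
                               random_state=0)
    df = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])
    labels = pd.Series(y, name='target')

    train_df, test_df = df.iloc[:400], df.iloc[400:]
    train_y = labels.iloc[:400]

    # km_max=0 triggers the log10 heuristic above (k_max = 3 for 400 rows).
    train_out, test_out = Transform_KM_Features(train_df, train_y, test_df)
    print(train_out.columns.tolist())  # ['f0', 'f1', 'f2', 'f3', 'target', 'cluster']
    print(test_out.shape)              # (100, 5)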