Commit 94969f8

added features

1 parent 40dec97 commit 94969f8

1 file changed: +34 -7 lines changed

sberbank/sberbank_kernel.py (+34 -7)
@@ -52,17 +52,23 @@ def main():
     train_df.loc[train_df['build_year'] == 3, 'build_year'] = np.nan
     train_df.loc[train_df['build_year'] == 71, 'build_year'] = np.nan
 
-    #missing data imputation, merging and label encoding
+    # truncate the extreme values in price_doc
+    ulimit = np.percentile(train_df.price_doc.values, 99)
+    llimit = np.percentile(train_df.price_doc.values, 1)
+    train_df['price_doc'].loc[train_df['price_doc']>ulimit] = ulimit
+    train_df['price_doc'].loc[train_df['price_doc']<llimit] = llimit
+
+    #missing data imputation, merging, feature engineering and label encoding
     train_df['env'] = 'train'
     test_df['env'] = 'test'
     test_idx = test_df['id']
 
     train_df.drop(['id'], axis=1, inplace=True)
     test_df.drop(['id'], axis=1, inplace=True)
     test_df['price_doc'] = 0
-
-    train_df = train_df.dropna() #drop training rows
-    macro_df = macro_df.dropna()
+
+    #train_df = train_df.dropna() #drop training rows
+    #macro_df = macro_df.dropna()
     tdf_med = test_df.median()
     tdf_product_mode = stats.mode(test_df['product_type'])[0][0]
     test_df = test_df.fillna(tdf_med) #fill-in test rows
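
The new lines above winsorize the training target: price_doc is capped at its 1st and 99th percentiles before modelling, and the earlier dropna() calls on train_df and macro_df are commented out in favour of the median/mode imputation that follows. Below is a minimal standalone sketch of the same truncation; toy_df is a hypothetical frame used only for illustration, and Series.clip replaces the chained .loc assignment, which pandas may flag with SettingWithCopyWarning.

# Sketch: cap a target column at its 1st/99th percentiles (winsorizing).
# toy_df is illustrative only; the kernel operates on train_df['price_doc'].
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'price_doc': [1.0e6, 2.0e6, 3.0e6, 4.0e6, 1.0e9]})
ulimit = np.percentile(toy_df['price_doc'].values, 99)
llimit = np.percentile(toy_df['price_doc'].values, 1)
# clip() applies both bounds in one step, avoiding chained indexing
toy_df['price_doc'] = toy_df['price_doc'].clip(lower=llimit, upper=ulimit)
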
@@ -75,8 +81,29 @@ def main():
     print "num nans: ", all_df.isnull().sum().sum()
 
     #add month and day of week
+    all_df["year"] = all_df.timestamp.dt.year
     all_df['month'] = all_df.timestamp.dt.month
     all_df['dow'] = all_df.timestamp.dt.dayofweek
+
+    #add month-year
+    month_year = (all_df.timestamp.dt.month + all_df.timestamp.dt.year * 100)
+    month_year_cnt_map = month_year.value_counts().to_dict()
+    all_df['month_year_cnt'] = month_year.map(month_year_cnt_map)
+
+    #add week-year count
+    week_year = (all_df.timestamp.dt.weekofyear + all_df.timestamp.dt.year * 100)
+    week_year_cnt_map = week_year.value_counts().to_dict()
+    all_df['week_year_cnt'] = week_year.map(week_year_cnt_map)
+
+    # num of floor from top
+    all_df["floor_from_top"] = all_df["max_floor"] - all_df["floor"]
+
+    # difference between full area and living area
+    all_df["extra_sq"] = all_df["full_sq"] - all_df["life_sq"]
+
+    # age of building
+    all_df["age_of_building"] = all_df["build_year"] - all_df["year"]
+
 
     for f in all_df.columns:
         if all_df[f].dtype == 'object' and f is not 'env':
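
The month_year_cnt and week_year_cnt columns added above are frequency encodings of the sale date: each row receives the number of transactions sharing its (year, month) or (year, week) key, built by mapping value_counts() back onto the key series. A small self-contained sketch, using a hypothetical toy_df, follows; note that in recent pandas versions dt.weekofyear is deprecated in favour of dt.isocalendar().week.

# Sketch: per-month transaction counts as a feature (frequency encoding).
# toy_df is illustrative only; the kernel builds the same map on all_df.
import pandas as pd

toy_df = pd.DataFrame({'timestamp': pd.to_datetime(
    ['2014-01-05', '2014-01-20', '2014-02-03'])})
month_year = toy_df.timestamp.dt.month + toy_df.timestamp.dt.year * 100
# 201401 appears twice and 201402 once, so the feature becomes [2, 2, 1]
toy_df['month_year_cnt'] = month_year.map(month_year.value_counts().to_dict())
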
@@ -212,13 +239,13 @@ def main():
     print "running MLP..."
     model = Sequential()
 
-    model.add(Dense(40, input_dim = X_train.shape[1], init = 'he_normal'))
+    model.add(Dense(64, input_dim = X_train.shape[1], init = 'he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dense(40, init = 'he_normal'))
+    model.add(Dense(32, init = 'he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dense(20, init = 'he_normal'))
+    model.add(Dense(16, init = 'he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
     model.add(Dense(1, init = 'he_normal'))
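
This hunk widens the hidden layers from 40-40-20 to 64-32-16 units. The init= keyword is the Keras 1 API; Keras 2 renamed it to kernel_initializer=. Below is a sketch of the same stack in Keras 2 syntax; the input dimension and compile settings are placeholders, since the rest of the kernel is not shown in this diff.

# Sketch: the 64-32-16-1 MLP in Keras 2 syntax ('init' -> 'kernel_initializer').
# input_dim, loss and optimizer are placeholders, not taken from the kernel.
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization

model = Sequential()
model.add(Dense(64, input_dim=100, kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dense(32, kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dense(16, kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dense(1, kernel_initializer='he_normal'))
model.compile(loss='mse', optimizer='adam')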
