@@ -52,17 +52,23 @@ def main():
     train_df.loc[train_df['build_year'] == 3, 'build_year'] = np.nan
     train_df.loc[train_df['build_year'] == 71, 'build_year'] = np.nan
 
-    #missing data imputation, merging and label encoding
+    # truncate the extreme values in price_doc
+    ulimit = np.percentile(train_df.price_doc.values, 99)
+    llimit = np.percentile(train_df.price_doc.values, 1)
+    train_df['price_doc'].loc[train_df['price_doc'] > ulimit] = ulimit
+    train_df['price_doc'].loc[train_df['price_doc'] < llimit] = llimit
+
+    #missing data imputation, merging, feature engineering and label encoding
     train_df['env'] = 'train'
     test_df['env'] = 'test'
     test_idx = test_df['id']
 
     train_df.drop(['id'], axis=1, inplace=True)
     test_df.drop(['id'], axis=1, inplace=True)
     test_df['price_doc'] = 0
-
-    train_df = train_df.dropna() #drop training rows
-    macro_df = macro_df.dropna()
+
+    # train_df = train_df.dropna() #drop training rows
+    # macro_df = macro_df.dropna()
     tdf_med = test_df.median()
     tdf_product_mode = stats.mode(test_df['product_type'])[0][0]
     test_df = test_df.fillna(tdf_med) #fill-in test rows
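Editor's note: the added clipping lines assign through chained indexing (train_df['price_doc'].loc[...] = ...), which pandas can flag with a SettingWithCopyWarning. Below is a minimal standalone sketch of the same idea, percentile clipping plus median imputation, written with Series.clip on a toy DataFrame; the column name and values are illustrative and not taken from the project.

# Sketch only, not project code: clip a column to its 1st/99th percentiles, then median-fill NaNs.
import numpy as np
import pandas as pd

df = pd.DataFrame({'price_doc': [1.0, 2.0, 3.0, 1000.0, 5.0, np.nan]})

# percentiles computed on the non-null values
llimit, ulimit = np.nanpercentile(df['price_doc'].values, [1, 99])
df['price_doc'] = df['price_doc'].clip(lower=llimit, upper=ulimit)

# median imputation, analogous to the fillna(tdf_med) step above
df = df.fillna(df.median())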
@@ -75,8 +81,29 @@ def main():
     print "num nans: ", all_df.isnull().sum().sum()
 
     #add month and day of week
+    all_df["year"] = all_df.timestamp.dt.year
     all_df['month'] = all_df.timestamp.dt.month
     all_df['dow'] = all_df.timestamp.dt.dayofweek
+
+    #add month-year
+    month_year = (all_df.timestamp.dt.month + all_df.timestamp.dt.year * 100)
+    month_year_cnt_map = month_year.value_counts().to_dict()
+    all_df['month_year_cnt'] = month_year.map(month_year_cnt_map)
+
+    #add week-year count
+    week_year = (all_df.timestamp.dt.weekofyear + all_df.timestamp.dt.year * 100)
+    week_year_cnt_map = week_year.value_counts().to_dict()
+    all_df['week_year_cnt'] = week_year.map(week_year_cnt_map)
+
+    # num of floor from top
+    all_df["floor_from_top"] = all_df["max_floor"] - all_df["floor"]
+
+    # difference between full area and living area
+    all_df["extra_sq"] = all_df["full_sq"] - all_df["life_sq"]
+
+    # age of building
+    all_df["age_of_building"] = all_df["build_year"] - all_df["year"]
+
 
     for f in all_df.columns:
         if all_df[f].dtype == 'object' and f is not 'env':
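Editor's note: the hunk ends mid-loop, so the label-encoding body is not visible here. For context only, the sketch below shows the common way such a loop is completed with scikit-learn's LabelEncoder; it is an illustration of the pattern, not the commit's actual code. Also note that comparing strings with "is not" relies on interning; "!=" is the safe comparison.

# Illustrative only; the real loop body lies outside this hunk.
from sklearn.preprocessing import LabelEncoder

for f in all_df.columns:
    if all_df[f].dtype == 'object' and f != 'env':  # '!=' rather than 'is not' for string comparison
        lbl = LabelEncoder()
        all_df[f] = lbl.fit_transform(all_df[f].astype(str))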
@@ -212,13 +239,13 @@ def main():
     print "running MLP..."
     model = Sequential()
 
-    model.add(Dense(40, input_dim=X_train.shape[1], init='he_normal'))
+    model.add(Dense(64, input_dim=X_train.shape[1], init='he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dense(40, init='he_normal'))
+    model.add(Dense(32, init='he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dense(20, init='he_normal'))
+    model.add(Dense(16, init='he_normal'))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
     model.add(Dense(1, init='he_normal'))
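Editor's note: the diff stops at the output layer, so the compile/fit step is not shown. The sketch below is a minimal, self-contained example of how a regression MLP in this style is typically trained with the Keras 1.x API used here (the init= and nb_epoch= spellings are Keras 1; Keras 2 renamed them to kernel_initializer= and epochs=). The loss, optimizer, epoch count, and stand-in data are assumptions, not settings from the commit.

# Sketch with assumed hyperparameters (mse loss, adam, 20 epochs); not the commit's configuration.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.normalization import BatchNormalization

X_train = np.random.rand(256, 10)   # stand-in data so the example runs end to end
y_train = np.random.rand(256)

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], init='he_normal'))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dense(1, init='he_normal'))

model.compile(loss='mse', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=20, batch_size=32, verbose=0)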