|
13 | 13 | "source": [ |
14 | 14 | "Based on the conclusions of the Exploratory Data Analysis, we fill in some missing data under the following assumptions:\n", |
15 | 15 | "\n", |
16 | | - "- Fill in the Credit_History with 1. \n", |
| 16 | + "- Fill in the CoapplicantIncome with 0.\n", |
17 | 17 | "- Fill in the Loan_Amount_Term with 360 for clients who have this value missing.\n", |
18 | | - "- Fill in the CoapplicantIncome with 0." |
| 18 | + "- Fill in the Self_Employed with 'No'." |
19 | 19 | ] |
20 | 20 | }, |
21 | 21 | { |
22 | 22 | "cell_type": "code", |
23 | | - "execution_count": 2, |
| 23 | + "execution_count": 13, |
24 | 24 | "metadata": {}, |
25 | 25 | "outputs": [], |
26 | 26 | "source": [ |
|
40 | 40 | }, |
41 | 41 | { |
42 | 42 | "cell_type": "code", |
43 | | - "execution_count": 3, |
| 43 | + "execution_count": 14, |
44 | 44 | "metadata": {}, |
45 | 45 | "outputs": [ |
46 | 46 | { |
47 | 47 | "data": { |
48 | 48 | "text/plain": [ |
49 | | - "(614, 14)" |
| 49 | + "(614, 15)" |
50 | 50 | ] |
51 | 51 | }, |
52 | | - "execution_count": 3, |
| 52 | + "execution_count": 14, |
53 | 53 | "metadata": {}, |
54 | 54 | "output_type": "execute_result" |
55 | 55 | } |
|
69 | 69 | }, |
70 | 70 | { |
71 | 71 | "cell_type": "code", |
72 | | - "execution_count": 4, |
| 72 | + "execution_count": 15, |
73 | 73 | "metadata": {}, |
74 | 74 | "outputs": [], |
75 | 75 | "source": [ |
76 | 76 | "df_fill = df_import.copy()\n", |
77 | | - "df_fill.loc[df_fill['Credit_History'].isnull(),'Credit_History'] = 1\n", |
78 | 77 | "df_fill.loc[df_fill['Loan_Amount_Term'].isnull(),'Loan_Amount_Term'] = 360\n", |
79 | | - "# df_fill.loc[df_fill['Self_Employed'].isnull(),'Self_Employed'] = 'No'\n", |
| 78 | + "df_fill.loc[df_fill['Self_Employed'].isnull(),'Self_Employed'] = 'No'\n", |
80 | 79 | "df_fill.loc[df_fill['CoapplicantIncome'].isnull(),'CoapplicantIncome'] = 0" |
81 | 80 | ] |
82 | 81 | }, |
|
89 | 88 | }, |
90 | 89 | { |
91 | 90 | "cell_type": "code", |
92 | | - "execution_count": 5, |
| 91 | + "execution_count": 4, |
93 | 92 | "metadata": {}, |
94 | 93 | "outputs": [], |
95 | 94 | "source": [ |
|
98 | 97 | "total_income = df_fill['ApplicantIncome'] + df_fill['CoapplicantIncome']\n", |
99 | 98 | "remaining_income = (total_income - base_loan_installment) / total_income\n", |
100 | 99 | "\n", |
| 100 | + "married = df_import['Married'].copy()\n", |
| 101 | + "married[married == 'Yes'] = 1\n", |
| 102 | + "married[married == 'No'] = 0\n", |
| 103 | + "\n", |
| 104 | + "dependents = df_import['Dependents'].copy()\n", |
| 105 | + "dependents[dependents == '0'] = 0\n", |
| 106 | + "dependents[dependents == '1'] = 1\n", |
| 107 | + "dependents[dependents == '2'] = 2\n", |
| 108 | + "dependents[dependents == '3+'] = 3\n", |
| 109 | + "\n", |
| 110 | + "\n", |
| 111 | + "people_in_house = dependents + married + 1\n", |
| 112 | + "household_percapita = (total_income / people_in_house).astype(float)\n", |
| 113 | + "household_percapita.describe()\n", |
| 114 | + "\n", |
| 115 | + "\n", |
101 | 116 | "df_fill['Base_Loan_Installment'] = base_loan_installment\n", |
102 | 117 | "df_fill['Remaining_Income'] = remaining_income" |
103 | 118 | ] |
|
106 | 121 | "cell_type": "markdown", |
107 | 122 | "metadata": {}, |
108 | 123 | "source": [ |
109 | | - "## Removing NaNs and Checking Valid Values Again" |
| 124 | + "## Removing NaNs and Outliers" |
110 | 125 | ] |
111 | 126 | }, |
112 | 127 | { |
113 | 128 | "cell_type": "code", |
114 | | - "execution_count": 7, |
| 129 | + "execution_count": 5, |
115 | 130 | "metadata": {}, |
116 | 131 | "outputs": [ |
117 | 132 | { |
118 | 133 | "data": { |
119 | 134 | "text/plain": [ |
120 | | - "Gender 535\n", |
121 | | - "Married 535\n", |
122 | | - "Dependents 535\n", |
123 | | - "Education 535\n", |
124 | | - "Self_Employed 535\n", |
125 | | - "ApplicantIncome 535\n", |
126 | | - "CoapplicantIncome 535\n", |
127 | | - "LoanAmount 535\n", |
128 | | - "Loan_Amount_Term 535\n", |
129 | | - "Credit_History 535\n", |
130 | | - "Property_Area 535\n", |
131 | | - "Loan_Status 535\n", |
132 | | - "Base_Loan_Installment 535\n", |
133 | | - "Remaining_Income 535\n", |
| 135 | + "Gender 518\n", |
| 136 | + "Married 518\n", |
| 137 | + "Dependents 518\n", |
| 138 | + "Education 518\n", |
| 139 | + "Self_Employed 518\n", |
| 140 | + "ApplicantIncome 518\n", |
| 141 | + "CoapplicantIncome 518\n", |
| 142 | + "LoanAmount 518\n", |
| 143 | + "Loan_Amount_Term 518\n", |
| 144 | + "Credit_History 518\n", |
| 145 | + "Property_Area 518\n", |
| 146 | + "Loan_Status 518\n", |
| 147 | + "Base_Loan_Installment 518\n", |
| 148 | + "Remaining_Income 518\n", |
134 | 149 | "dtype: int64" |
135 | 150 | ] |
136 | 151 | }, |
137 | | - "execution_count": 7, |
| 152 | + "execution_count": 5, |
138 | 153 | "metadata": {}, |
139 | 154 | "output_type": "execute_result" |
140 | 155 | } |
|
144 | 159 | "df_fill.count()" |
145 | 160 | ] |
146 | 161 | }, |
| 162 | + { |
| 163 | + "cell_type": "code", |
| 164 | + "execution_count": 6, |
| 165 | + "metadata": {}, |
| 166 | + "outputs": [ |
| 167 | + { |
| 168 | + "name": "stdout", |
| 169 | + "output_type": "stream", |
| 170 | + "text": [ |
| 171 | + "(491, 14)\n" |
| 172 | + ] |
| 173 | + } |
| 174 | + ], |
| 175 | + "source": [ |
| 176 | + "numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Base_Loan_Installment','Remaining_Income']\n", |
| 177 | + "df_fill_no_outlier = preprocess_utils.remove_outliers(df_fill,numerical_columns,threshold = 3)" |
| 178 | + ] |
| 179 | + }, |
147 | 180 | { |
148 | 181 | "cell_type": "markdown", |
149 | 182 | "metadata": {}, |
|
153 | 186 | }, |
154 | 187 | { |
155 | 188 | "cell_type": "code", |
156 | | - "execution_count": 8, |
| 189 | + "execution_count": 7, |
157 | 190 | "metadata": {}, |
158 | 191 | "outputs": [], |
159 | 192 | "source": [ |
|
170 | 203 | }, |
171 | 204 | { |
172 | 205 | "cell_type": "code", |
173 | | - "execution_count": 9, |
| 206 | + "execution_count": 8, |
174 | 207 | "metadata": {}, |
175 | 208 | "outputs": [ |
176 | 209 | { |
|
189 | 222 | ], |
190 | 223 | "source": [ |
191 | 224 | "categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']\n", |
192 | | - "df_encoded = df_fill.copy()\n", |
193 | | - "df_encoded[categorical_columns],ordinal_encoder = encode_labels(df_fill[categorical_columns])" |
| 225 | + "df_encoded = df_fill_no_outlier.copy()\n", |
| 226 | + "df_encoded[categorical_columns],ordinal_encoder = encode_labels(df_fill_no_outlier[categorical_columns])" |
194 | 227 | ] |
195 | 228 | }, |
196 | 229 | { |
|
202 | 235 | }, |
203 | 236 | { |
204 | 237 | "cell_type": "code", |
205 | | - "execution_count": 10, |
| 238 | + "execution_count": 9, |
206 | 239 | "metadata": {}, |
207 | 240 | "outputs": [], |
208 | 241 | "source": [ |
|
211 | 244 | }, |
212 | 245 | { |
213 | 246 | "cell_type": "code", |
214 | | - "execution_count": 11, |
| 247 | + "execution_count": 10, |
215 | 248 | "metadata": {}, |
216 | 249 | "outputs": [ |
217 | 250 | { |
|
392 | 425 | " <th>8</th>\n", |
393 | 426 | " <td>1.0</td>\n", |
394 | 427 | " <td>1.0</td>\n", |
395 | | - " <td>1.0</td>\n", |
| 428 | + " <td>2.0</td>\n", |
396 | 429 | " <td>0.0</td>\n", |
397 | 430 | " <td>0.0</td>\n", |
398 | | - " <td>12841</td>\n", |
399 | | - " <td>10968.0</td>\n", |
400 | | - " <td>349.0</td>\n", |
| 431 | + " <td>3200</td>\n", |
| 432 | + " <td>700.0</td>\n", |
| 433 | + " <td>70.0</td>\n", |
401 | 434 | " <td>360.0</td>\n", |
402 | 435 | " <td>1.0</td>\n", |
| 436 | + " <td>2.0</td>\n", |
403 | 437 | " <td>1.0</td>\n", |
404 | | - " <td>0.0</td>\n", |
405 | | - " <td>969.444444</td>\n", |
406 | | - " <td>0.959282</td>\n", |
| 438 | + " <td>194.444444</td>\n", |
| 439 | + " <td>0.950142</td>\n", |
407 | 440 | " </tr>\n", |
408 | 441 | " <tr>\n", |
409 | 442 | " <th>9</th>\n", |
|
412 | 445 | " <td>2.0</td>\n", |
413 | 446 | " <td>0.0</td>\n", |
414 | 447 | " <td>0.0</td>\n", |
415 | | - " <td>3200</td>\n", |
416 | | - " <td>700.0</td>\n", |
417 | | - " <td>70.0</td>\n", |
| 448 | + " <td>2500</td>\n", |
| 449 | + " <td>1840.0</td>\n", |
| 450 | + " <td>109.0</td>\n", |
418 | 451 | " <td>360.0</td>\n", |
419 | 452 | " <td>1.0</td>\n", |
420 | 453 | " <td>2.0</td>\n", |
421 | 454 | " <td>1.0</td>\n", |
422 | | - " <td>194.444444</td>\n", |
423 | | - " <td>0.950142</td>\n", |
| 455 | + " <td>302.777778</td>\n", |
| 456 | + " <td>0.930236</td>\n", |
424 | 457 | " </tr>\n", |
425 | 458 | " </tbody>\n", |
426 | 459 | "</table>\n", |
|
436 | 469 | "5 1.0 1.0 0.0 1.0 0.0 2333 \n", |
437 | 470 | "6 1.0 1.0 3.0 0.0 0.0 3036 \n", |
438 | 471 | "7 1.0 1.0 2.0 0.0 0.0 4006 \n", |
439 | | - "8 1.0 1.0 1.0 0.0 0.0 12841 \n", |
440 | | - "9 1.0 1.0 2.0 0.0 0.0 3200 \n", |
| 472 | + "8 1.0 1.0 2.0 0.0 0.0 3200 \n", |
| 473 | + "9 1.0 1.0 2.0 0.0 0.0 2500 \n", |
441 | 474 | "\n", |
442 | 475 | " CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History \\\n", |
443 | 476 | "0 1508.0 128.0 360.0 1.0 \n", |
|
448 | 481 | "5 1516.0 95.0 360.0 1.0 \n", |
449 | 482 | "6 2504.0 158.0 360.0 0.0 \n", |
450 | 483 | "7 1526.0 168.0 360.0 1.0 \n", |
451 | | - "8 10968.0 349.0 360.0 1.0 \n", |
452 | | - "9 700.0 70.0 360.0 1.0 \n", |
| 484 | + "8 700.0 70.0 360.0 1.0 \n", |
| 485 | + "9 1840.0 109.0 360.0 1.0 \n", |
453 | 486 | "\n", |
454 | 487 | " Property_Area Loan_Status Base_Loan_Installment Remaining_Income \n", |
455 | 488 | "0 0.0 0.0 355.555556 0.941626 \n", |
|
460 | 493 | "5 2.0 1.0 263.888889 0.931440 \n", |
461 | 494 | "6 1.0 0.0 438.888889 0.920778 \n", |
462 | 495 | "7 2.0 1.0 466.666667 0.915642 \n", |
463 | | - "8 1.0 0.0 969.444444 0.959282 \n", |
464 | | - "9 2.0 1.0 194.444444 0.950142 " |
| 496 | + "8 2.0 1.0 194.444444 0.950142 \n", |
| 497 | + "9 2.0 1.0 302.777778 0.930236 " |
465 | 498 | ] |
466 | 499 | }, |
467 | | - "execution_count": 11, |
| 500 | + "execution_count": 10, |
468 | 501 | "metadata": {}, |
469 | 502 | "output_type": "execute_result" |
470 | 503 | } |
|
502 | 535 | "name": "python", |
503 | 536 | "nbconvert_exporter": "python", |
504 | 537 | "pygments_lexer": "ipython3", |
505 | | - "version": "3.5.4" |
| 538 | + "version": "3.6.9" |
506 | 539 | } |
507 | 540 | }, |
508 | 541 | "nbformat": 4, |
|
0 commit comments