Skip to content

Commit 3f1b4a1

Browse files
committed
Finished project and submitted first model results
1 parent d506b2a commit 3f1b4a1

34 files changed

+6832
-2928
lines changed

01_exploratory_analysis_pre_cleaning.ipynb

Lines changed: 0 additions & 982 deletions
This file was deleted.

01_exploratory_analysis_pre_treatment.ipynb

Lines changed: 1039 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 86 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
"source": [
1414
"Based on the conclusion of the Exploratory Data Analysis, we fill in some missing data assuming the following hypotheses:\n",
1515
"\n",
16-
"- Fill in the Credit_History with 1. \n",
16+
"- Fill in the CoapplicantIncome with 0.\n",
1717
"- Fill in the Loan_Amount_Term with 360 for clients who have this value missing.\n",
18-
"- Fill in the CoapplicantIncome with 0."
18+
"- Fill in the Self_Employed with 'No'."
1919
]
2020
},
2121
{
2222
"cell_type": "code",
23-
"execution_count": 2,
23+
"execution_count": 13,
2424
"metadata": {},
2525
"outputs": [],
2626
"source": [
@@ -40,16 +40,16 @@
4040
},
4141
{
4242
"cell_type": "code",
43-
"execution_count": 3,
43+
"execution_count": 14,
4444
"metadata": {},
4545
"outputs": [
4646
{
4747
"data": {
4848
"text/plain": [
49-
"(614, 14)"
49+
"(614, 15)"
5050
]
5151
},
52-
"execution_count": 3,
52+
"execution_count": 14,
5353
"metadata": {},
5454
"output_type": "execute_result"
5555
}
@@ -69,14 +69,13 @@
6969
},
7070
{
7171
"cell_type": "code",
72-
"execution_count": 4,
72+
"execution_count": 15,
7373
"metadata": {},
7474
"outputs": [],
7575
"source": [
7676
"df_fill = df_import.copy()\n",
77-
"df_fill.loc[df_fill['Credit_History'].isnull(),'Credit_History'] = 1\n",
7877
"df_fill.loc[df_fill['Loan_Amount_Term'].isnull(),'Loan_Amount_Term'] = 360\n",
79-
"# df_fill.loc[df_fill['Self_Employed'].isnull(),'Self_Employed'] = 'No'\n",
78+
"df_fill.loc[df_fill['Self_Employed'].isnull(),'Self_Employed'] = 'No'\n",
8079
"df_fill.loc[df_fill['CoapplicantIncome'].isnull(),'CoapplicantIncome'] = 0"
8180
]
8281
},
@@ -89,7 +88,7 @@
8988
},
9089
{
9190
"cell_type": "code",
92-
"execution_count": 5,
91+
"execution_count": 4,
9392
"metadata": {},
9493
"outputs": [],
9594
"source": [
@@ -98,6 +97,22 @@
9897
"total_income = df_fill['ApplicantIncome'] + df_fill['CoapplicantIncome']\n",
9998
"remaining_income = (total_income - base_loan_installment) / total_income\n",
10099
"\n",
100+
"married = df_import['Married'].copy()\n",
101+
"married[married == 'Yes'] = 1\n",
102+
"married[married == 'No'] = 0\n",
103+
"\n",
104+
"dependents = df_import['Dependents'].copy()\n",
105+
"dependents[dependents == '0'] = 0\n",
106+
"dependents[dependents == '1'] = 1\n",
107+
"dependents[dependents == '2'] = 2\n",
108+
"dependents[dependents == '3+'] = 3\n",
109+
"\n",
110+
"\n",
111+
"people_in_house = dependents + married + 1\n",
112+
"household_percapita = (total_income / people_in_house).astype(float)\n",
113+
"household_percapita.describe()\n",
114+
"\n",
115+
"\n",
101116
"df_fill['Base_Loan_Installment'] = base_loan_installment\n",
102117
"df_fill['Remaining_Income'] = remaining_income"
103118
]
@@ -106,35 +121,35 @@
106121
"cell_type": "markdown",
107122
"metadata": {},
108123
"source": [
109-
"## Removing NaNs and Checking Valid Values Again"
124+
"## Removing NaNs and Outliers"
110125
]
111126
},
112127
{
113128
"cell_type": "code",
114-
"execution_count": 7,
129+
"execution_count": 5,
115130
"metadata": {},
116131
"outputs": [
117132
{
118133
"data": {
119134
"text/plain": [
120-
"Gender 535\n",
121-
"Married 535\n",
122-
"Dependents 535\n",
123-
"Education 535\n",
124-
"Self_Employed 535\n",
125-
"ApplicantIncome 535\n",
126-
"CoapplicantIncome 535\n",
127-
"LoanAmount 535\n",
128-
"Loan_Amount_Term 535\n",
129-
"Credit_History 535\n",
130-
"Property_Area 535\n",
131-
"Loan_Status 535\n",
132-
"Base_Loan_Installment 535\n",
133-
"Remaining_Income 535\n",
135+
"Gender 518\n",
136+
"Married 518\n",
137+
"Dependents 518\n",
138+
"Education 518\n",
139+
"Self_Employed 518\n",
140+
"ApplicantIncome 518\n",
141+
"CoapplicantIncome 518\n",
142+
"LoanAmount 518\n",
143+
"Loan_Amount_Term 518\n",
144+
"Credit_History 518\n",
145+
"Property_Area 518\n",
146+
"Loan_Status 518\n",
147+
"Base_Loan_Installment 518\n",
148+
"Remaining_Income 518\n",
134149
"dtype: int64"
135150
]
136151
},
137-
"execution_count": 7,
152+
"execution_count": 5,
138153
"metadata": {},
139154
"output_type": "execute_result"
140155
}
@@ -144,6 +159,24 @@
144159
"df_fill.count()"
145160
]
146161
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": 6,
165+
"metadata": {},
166+
"outputs": [
167+
{
168+
"name": "stdout",
169+
"output_type": "stream",
170+
"text": [
171+
"(491, 14)\n"
172+
]
173+
}
174+
],
175+
"source": [
176+
"numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Base_Loan_Installment','Remaining_Income']\n",
177+
"df_fill_no_outlier = preprocess_utils.remove_outliers(df_fill,numerical_columns,threshold = 3)"
178+
]
179+
},
147180
{
148181
"cell_type": "markdown",
149182
"metadata": {},
@@ -153,7 +186,7 @@
153186
},
154187
{
155188
"cell_type": "code",
156-
"execution_count": 8,
189+
"execution_count": 7,
157190
"metadata": {},
158191
"outputs": [],
159192
"source": [
@@ -170,7 +203,7 @@
170203
},
171204
{
172205
"cell_type": "code",
173-
"execution_count": 9,
206+
"execution_count": 8,
174207
"metadata": {},
175208
"outputs": [
176209
{
@@ -189,8 +222,8 @@
189222
],
190223
"source": [
191224
"categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']\n",
192-
"df_encoded = df_fill.copy()\n",
193-
"df_encoded[categorical_columns],ordinal_encoder = encode_labels(df_fill[categorical_columns])"
225+
"df_encoded = df_fill_no_outlier.copy()\n",
226+
"df_encoded[categorical_columns],ordinal_encoder = encode_labels(df_fill_no_outlier[categorical_columns])"
194227
]
195228
},
196229
{
@@ -202,7 +235,7 @@
202235
},
203236
{
204237
"cell_type": "code",
205-
"execution_count": 10,
238+
"execution_count": 9,
206239
"metadata": {},
207240
"outputs": [],
208241
"source": [
@@ -211,7 +244,7 @@
211244
},
212245
{
213246
"cell_type": "code",
214-
"execution_count": 11,
247+
"execution_count": 10,
215248
"metadata": {},
216249
"outputs": [
217250
{
@@ -392,18 +425,18 @@
392425
" <th>8</th>\n",
393426
" <td>1.0</td>\n",
394427
" <td>1.0</td>\n",
395-
" <td>1.0</td>\n",
428+
" <td>2.0</td>\n",
396429
" <td>0.0</td>\n",
397430
" <td>0.0</td>\n",
398-
" <td>12841</td>\n",
399-
" <td>10968.0</td>\n",
400-
" <td>349.0</td>\n",
431+
" <td>3200</td>\n",
432+
" <td>700.0</td>\n",
433+
" <td>70.0</td>\n",
401434
" <td>360.0</td>\n",
402435
" <td>1.0</td>\n",
436+
" <td>2.0</td>\n",
403437
" <td>1.0</td>\n",
404-
" <td>0.0</td>\n",
405-
" <td>969.444444</td>\n",
406-
" <td>0.959282</td>\n",
438+
" <td>194.444444</td>\n",
439+
" <td>0.950142</td>\n",
407440
" </tr>\n",
408441
" <tr>\n",
409442
" <th>9</th>\n",
@@ -412,15 +445,15 @@
412445
" <td>2.0</td>\n",
413446
" <td>0.0</td>\n",
414447
" <td>0.0</td>\n",
415-
" <td>3200</td>\n",
416-
" <td>700.0</td>\n",
417-
" <td>70.0</td>\n",
448+
" <td>2500</td>\n",
449+
" <td>1840.0</td>\n",
450+
" <td>109.0</td>\n",
418451
" <td>360.0</td>\n",
419452
" <td>1.0</td>\n",
420453
" <td>2.0</td>\n",
421454
" <td>1.0</td>\n",
422-
" <td>194.444444</td>\n",
423-
" <td>0.950142</td>\n",
455+
" <td>302.777778</td>\n",
456+
" <td>0.930236</td>\n",
424457
" </tr>\n",
425458
" </tbody>\n",
426459
"</table>\n",
@@ -436,8 +469,8 @@
436469
"5 1.0 1.0 0.0 1.0 0.0 2333 \n",
437470
"6 1.0 1.0 3.0 0.0 0.0 3036 \n",
438471
"7 1.0 1.0 2.0 0.0 0.0 4006 \n",
439-
"8 1.0 1.0 1.0 0.0 0.0 12841 \n",
440-
"9 1.0 1.0 2.0 0.0 0.0 3200 \n",
472+
"8 1.0 1.0 2.0 0.0 0.0 3200 \n",
473+
"9 1.0 1.0 2.0 0.0 0.0 2500 \n",
441474
"\n",
442475
" CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History \\\n",
443476
"0 1508.0 128.0 360.0 1.0 \n",
@@ -448,8 +481,8 @@
448481
"5 1516.0 95.0 360.0 1.0 \n",
449482
"6 2504.0 158.0 360.0 0.0 \n",
450483
"7 1526.0 168.0 360.0 1.0 \n",
451-
"8 10968.0 349.0 360.0 1.0 \n",
452-
"9 700.0 70.0 360.0 1.0 \n",
484+
"8 700.0 70.0 360.0 1.0 \n",
485+
"9 1840.0 109.0 360.0 1.0 \n",
453486
"\n",
454487
" Property_Area Loan_Status Base_Loan_Installment Remaining_Income \n",
455488
"0 0.0 0.0 355.555556 0.941626 \n",
@@ -460,11 +493,11 @@
460493
"5 2.0 1.0 263.888889 0.931440 \n",
461494
"6 1.0 0.0 438.888889 0.920778 \n",
462495
"7 2.0 1.0 466.666667 0.915642 \n",
463-
"8 1.0 0.0 969.444444 0.959282 \n",
464-
"9 2.0 1.0 194.444444 0.950142 "
496+
"8 2.0 1.0 194.444444 0.950142 \n",
497+
"9 2.0 1.0 302.777778 0.930236 "
465498
]
466499
},
467-
"execution_count": 11,
500+
"execution_count": 10,
468501
"metadata": {},
469502
"output_type": "execute_result"
470503
}
@@ -502,7 +535,7 @@
502535
"name": "python",
503536
"nbconvert_exporter": "python",
504537
"pygments_lexer": "ipython3",
505-
"version": "3.5.4"
538+
"version": "3.6.9"
506539
}
507540
},
508541
"nbformat": 4,

0 commit comments

Comments
 (0)