-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdashboard.py
124 lines (108 loc) · 5.92 KB
/
dashboard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.data_cleaning import *
from src.data_eda import *
# endregion ========= Import Libraries =========
# region ========= Load Data =========
raw_data = pd.read_csv('./data/raw_loan_data.csv')
metadata_path = './data/metadata.txt'
metadata = pd.read_csv(metadata_path, sep='\t', header=None, names=['Feature', 'Description', 'Type'])
num_features = raw_data.select_dtypes(exclude='object').columns.tolist()
cat_features = raw_data.select_dtypes(include='object').columns.tolist()
all_features = num_features + cat_features
# endregion ========= Load Metadata =========
# region ========= EDA =========
# Make a dashboard
import matplotlib.pyplot as plt
import matplotlib.style
matplotlib.style.use('seaborn-v0_8-whitegrid') # Set global theme style
# Calculate descriptive statistics
desc_stats = raw_data.describe().T[['min', 'mean', 'std', 'max']].sort_values(by='max', ascending=False).applymap(lambda x: f'{x:,.2f}')
# Create a mosaic plot layout dynamically from all features
num_rows = (len(all_features) + 2) // 3 # Calculate number of rows needed
mosaic = []
for i in range(num_rows):
row = all_features[i*3:(i+1)*3]
while len(row) < 3:
row.append('reserve') # Fill the row with 'reserve' if less than 3 features
mosaic.append(row)
mosaic_final = [
['info', 'person_gender', 'person_education', 'person_home_ownership'],
['person_age', 'person_income', 'loan_amnt', 'credit_score'],
['person_emp_exp','loan_percent_income', 'loan_int_rate', 'loan_status'],
['cb_person_cred_hist_length','loan_intent', 'previous_loan_defaults_on_file', 'reserve_1']
]
fig, axs = plt.subplot_mosaic(mosaic_final, figsize=(24, 18), layout='constrained')
fig.patch.set_facecolor('#f0f0f0') # Set background color for the figure
def plot_features(data, num_features, cat_features, axs, metadata):
""""Docstring for plot_features
:param data: The dataset containing the features
:type data: pd.DataFrame
:param num_features: List of numerical features
:type num_features: list
:param cat_features: List of categorical features
:type cat_features: list
:param axs: Axes for plotting
:type axs: dict
:param metadata: Metadata containing feature descriptions
:type metadata: pd.DataFrame
"""
# Plot loan_status as count bar plot
loan_status_counts = data['loan_status'].value_counts().sort_values(ascending=False)
axs['loan_status'].bar(loan_status_counts.index, loan_status_counts.values, color='#4CAF50')
axs['loan_status'].set_title('Bar Plot of Loan Status', fontweight='bold', fontsize=14)
axs['loan_status'].set_xlabel('Loan Status', fontsize=12)
axs['loan_status'].set_ylabel('Count', fontsize=12)
axs['loan_status'].set_xticks([0, 1]) # Set x-axis ticks to only 0 and 1
axs['loan_status'].set_xticklabels(['Approve', 'Reject'], fontsize=10) # Set tick labels to Approve and Reject
# Add count numbers in the middle of the bars with thousand separator
for i, count in enumerate(loan_status_counts.values):
axs['loan_status'].text(i, count / 2, f'{count:,}', ha='center', va='center', color='black', fontsize=14)
# Plot histogram for credit_score
axs['credit_score'].hist(data['credit_score'], bins=30, color='#2196F3', edgecolor='black')
axs['credit_score'].set_title('Histogram of Credit Score', fontweight='bold', fontsize=14)
axs['credit_score'].set_xlabel('Credit Score', fontsize=12)
axs['credit_score'].set_ylabel('Frequency', fontsize=12)
for feature in cat_features:
counts = data[feature].value_counts().sort_values(ascending=False)
description = metadata.loc[metadata['Feature'] == feature, 'Description'].values[0]
axs[feature].bar(counts.index, counts.values, color='#FF5722')
axs[feature].set_title(f'Bar Plot of {description}', fontweight='bold', fontsize=14)
axs[feature].set_ylabel('Count', fontsize=12)
axs[feature].set_xlabel('', fontsize=12) # Hide x-axis label
for feature in num_features:
if feature not in ['loan_status', 'credit_score']:
description = metadata.loc[metadata['Feature'] == feature, 'Description'].values[0]
axs[feature].boxplot(data[feature], vert=False, patch_artist=True, boxprops=dict(facecolor='#FFC107'))
axs[feature].set_title(f'Box Plot of {description}', fontweight='bold', fontsize=14)
axs[feature].set_xlabel('', fontsize=12) # Hide x-axis label
axs[feature].yaxis.set_visible(False) # Hide y-axis
# Set x-axis labels for 'loan_intent' to be more readable
axs['loan_intent'].set_xticklabels(axs['loan_intent'].get_xticklabels(), rotation=45, ha='right', fontsize=10)
# Add conclusion text
conclusion_text = (
f"Exploratory Data Analysis Insights:\n"
f"1. Presence of outliers in 'Age' and 'Years of Experience'.\n"
f"2. Significant scale differences in 'Income' and 'Age'.\n"
f"3. Imbalance observed in 'Loan Status'.\n"
f"4. 'Credit Score' exhibits outliers.\n"
f"5. There is no missing values in the dataset."
)
axs['reserve_1'].text(0.01, 0.99, conclusion_text, ha='left', va='top', fontsize=14, wrap=True, linespacing=2)
axs['reserve_1'].set_axis_off() # Hide the axis
# Add info text
info_text = (
f"**Dataset Information:**\n"
f"**Number of Samples:** {data.shape[0]:,}\n"
f"**Number of Features:** {data.shape[1]:,}\n"
f"**Numerical Features:** {len(num_features):,}\n"
f"**Categorical Features:** {len(cat_features):,}\n"
)
axs['info'].text(0.01, 0.99, info_text, ha='left', va='top', fontsize=14, wrap=True, linespacing=2)
axs['info'].set_axis_off() # Hide the axis
plot_features(raw_data, num_features, cat_features, axs, metadata)
# Save the plot
plt.savefig('/mnt/d/Beta/plots/eda_dashboard.png', dpi=384, bbox_inches='tight')
plt.show()
# endregion ========= EDA =========