# data_analytics&_model_(1).py

# -*- coding: utf-8 -*-
"""data_analytics&_model (1).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oIE9E4iA2iUAU-jQ7kVlSncKzXBYGcgc

## Data cleaning
"""


import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import feature_selection, metrics
from pandas.plotting import scatter_matrix
from seaborn import pairplot
from sklearn import feature_selection, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import copy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Work from the coursework folder so that the relative CSV path below resolves.
os.chdir('/content/drive/MyDrive/big_data_pred/cw/')

dbdata = pd.read_csv('diabetic_data.csv')

# Shape of the diabetes dataframe.
dbdata.shape
# In the original run this showed 101,766 rows (observations) and 50 columns (attributes).

dbdata.head()

dbdata.info()
# Some of the columns which are meant to be categorical are numerical in the
# data, hence they need to be converted to categorical.

dbdata.describe()

dbdata.hist(figsize=(20, 20))
plt.show()

dbdata.isnull().sum()
# Although the dataframe has many missing entries, isnull() reports none,
# because they are encoded as '?' or as blank strings rather than NaN.
# Convert both encodings to NaN so that they can be tracked.

dbdata = dbdata.replace('?', np.nan)  # replace ? with nan
# Raw string keeps "\s" a regex class instead of an invalid string escape.
# Anchoring with $ restricts the match to cells that are blank from start to
# end; an unanchored '^\s+' would also blank out real values that merely
# begin with a space.
dbdata = dbdata.replace(r'^\s*$', np.nan, regex=True)
print(dbdata.isna().sum())
print(dbdata.shape[0])

# Drop columns with more than 50% missing values: thresh is the minimum number
# of non-missing entries a column needs in order to be kept.
dbdata.dropna(thresh=len(dbdata.index)/2, axis=1, inplace=True)
dbdata.shape


# Drop every column in which a single value accounts for at least 95% of all
# rows. The columns are collected first and removed in one call.
dominant_cols = [
    col
    for col in dbdata.columns
    if (dbdata[col].value_counts() / dbdata.shape[0]).max() >= 0.95
]
dbdata.drop(columns=dominant_cols, inplace=True)
print(dbdata.shape)

dbdata['age'].value_counts()


def age(df):
    """Replace age-bracket labels in ``df['age']`` with the bracket midpoint.

    Labels such as ``'[70-80)'`` become ``75``. Any value that is not one of
    the ten known bracket labels is left untouched. The lookup is done per
    value rather than per row label, so the frame does not need a default
    0..n-1 index.

    Args:
        df: DataFrame with an ``'age'`` column; the column is rewritten on
            this same object.

    Returns:
        None.
    """
    # '[0-10)' -> 5, '[10-20)' -> 15, ..., '[90-100)' -> 95
    midpoints = {f'[{low}-{low + 10})': low + 5 for low in range(0, 100, 10)}
    df['age'] = df['age'].map(lambda label: midpoints.get(label, label))

# Turn the age-bracket labels into midpoints, then make the column numeric.
# Anything that could not be converted becomes NaN (errors='coerce').
age(dbdata)
dbdata = dbdata.assign(age=pd.to_numeric(dbdata['age'], errors='coerce'))

# Missing diagnosis codes are replaced with 0 so that these rows survive the
# dropna() below.
diagcols = ['diag_1', 'diag_2', 'diag_3']
dbdata[diagcols] = dbdata[diagcols].fillna(0)

# Discard every row that still has a missing value in any column.
dbdata = dbdata.dropna()

# Snapshots of the integer and the string (object) columns.
numeric_cols = dbdata.select_dtypes(include=['int64']).copy()
categorica_cols = dbdata.select_dtypes(include=['object']).copy()

# Boxplot of every integer column before outlier removal.
for col_name in numeric_cols.columns:
    sns.boxplot(dbdata[col_name])
    plt.show()

# Removing outliers above 3 std from the mean of the numeric columns.
upperlimit = numeric_cols.mean() + 3*numeric_cols.std()
rmoutlier = numeric_cols.where(numeric_cols < upperlimit)

# Filter one column at a time; the mean and std are recomputed on the rows
# that survived the previous columns' filters.
for col_name in numeric_cols.columns:
    cutoff = dbdata[col_name].mean() + 3*dbdata[col_name].std()
    dbdata = dbdata[dbdata[col_name] <= cutoff]

dbdata.shape

# Keep only the first remaining encounter of each patient.
dbdata = dbdata.drop_duplicates(subset=['patient_nbr'])
dbdata.shape

# Boxplots again, after outlier removal.
for col_name in numeric_cols.columns:
    sns.boxplot(dbdata[col_name])
    plt.show()

dbdata

# Number of encounters per readmission outcome, largest group first.
readmitted_count = (
    dbdata
    .groupby(['readmitted'])
    .size()
    .sort_values(ascending=False)
)
readmitted_count
# Providing the information that 37.25% is the mean average rate of readmittence

"""## Data Exploration"""

# Commented out IPython magic to ensure Python compatibility.
# Move to the main install cell at the top

# %matplotlib inline
plt.style.use('ggplot')

"""#### Organising Data"""

dbdata.head()

# Attach the ICD description to each encounter by matching the primary
# diagnosis code (diag_1) against ICD_Code.
icd_codes = pd.read_csv('/content/drive/MyDrive/big_data_pred/cw/icd_codes.csv')
icd_diabetes = dbdata.merge(icd_codes, left_on='diag_1', right_on='ICD_Code')
icd_diabetes.columns

icd_diabetes.head()

# New table holding only the columns needed for the exploration, with the
# description and outcome columns renamed.
exploration_data = icd_diabetes[
    ['age', 'race', 'gender', 'Description', 'readmitted', 'time_in_hospital']
].rename(columns={'Description': 'diag_desc', 'readmitted': 'Readmitted'})

exploration_data.head()

"""#### Exploring Age Data"""

# Table of the number of occurrences for each age and readmission outcome:
# one row per age value, one column per outcome.
age_data = pd.crosstab(exploration_data['age'], exploration_data['Readmitted'])
age_data

# Produces a barchart showing the frequencies of readmissions by age
age_data.plot(kind = 'bar')
plt.title('Frequency of Readmissions By Age Group')
plt.xlabel('Age')
plt.ylabel('Number of Readmissions')
#plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Frequency_of_Readmissions_By_Age_Group.png', dpi=300)
plt.show()

# Converts the counts into row percentages (each age group sums to 100).
# Computed into a separate frame so that age_data keeps the raw counts.
age_data_perc = age_data.div(age_data.sum(axis=1), axis=0) * 100
age_data_perc

# Produces a graph showing the percentages of readmissions by age
age_data_perc.plot(kind = 'bar')
plt.title('Percentage of Readmissions By Age Group')
plt.xlabel('Age')
plt.ylabel('Percentage of readmissions')
# plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Percentage_of_Readmissions_By_Age_Group.png', dpi=300)
plt.show()

"""#### Exploring Race Data"""

# Table of the number of occurrences for each race and readmission outcome.
race_data = pd.crosstab(exploration_data['race'], exploration_data['Readmitted'])
race_data

# Produces a barchart showing the frequencies of readmissions by race
race_data.plot(kind = 'bar')
plt.title('Frequency of Readmissions By Race')
plt.xlabel('Race')
plt.ylabel('Number of Readmissions')
# plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Frequency_of_Readmissions_By_Race.png', dpi=300)
plt.show()

# Shows the number of rows for each race value in the dataset
exploration_data['race'].value_counts()

# Converts the counts into row percentages (each race sums to 100).
# Computed into a separate frame so that race_data keeps the raw counts.
race_data_perc = race_data.div(race_data.sum(axis=1), axis=0) * 100
race_data_perc

# Produces a graph showing the percentages of readmissions by race
race_data_perc.plot(kind = 'bar')
plt.title('Percentage of Readmissions By Race')
plt.xlabel('Race')
plt.ylabel('Percentage of readmissions')
# plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Percentage_of_Readmissions_By_Race.png', dpi=300)
plt.show()

"""#### Exploring Gender Data"""

# Table of the number of occurrences for each gender and readmission outcome.
gender_data = pd.crosstab(exploration_data['gender'], exploration_data['Readmitted'])
gender_data

# Produces a barchart showing the frequencies of readmissions by gender
gender_data.plot(kind = 'bar')
plt.title('Frequency of Readmissions By Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Readmissions')
# plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Frequency_of_Readmissions_By_Gender.png', dpi=300)
plt.show()

exploration_data['gender'].value_counts()

# Converts the counts into row percentages (each gender sums to 100).
# Computed into a separate frame so that gender_data keeps the raw counts.
gender_data_perc = gender_data.div(gender_data.sum(axis=1), axis=0) * 100
gender_data_perc

# Produces a graph showing the percentages of readmissions by gender
gender_data_perc.plot(kind = 'bar')
plt.title('Percentage of Readmissions By Gender')
plt.xlabel('Gender')
plt.ylabel('Percentage of readmissions')
plt.show()

"""#### Exploring Diagnosis Data"""

# Count table of diagnosis description against readmission outcome, with row
# and column totals (margins). Shown for reference only; rebuilt below.
diagnosis_data = pd.crosstab(exploration_data['diag_desc'], exploration_data['Readmitted'], margins = True)
diagnosis_data

# Work on an explicit copy so that the recoding below does not write into a
# slice of exploration_data.
diagnosis_data = exploration_data[['diag_desc', 'Readmitted']].copy()
# Converts readmitted ('<30' or '>30') to 1 and not readmitted ('NO') to 0
diagnosis_data['Readmitted'] = diagnosis_data['Readmitted'].map({'<30': 1, '>30': 1, 'NO': 0})
# Number of encounters for each (diagnosis, readmitted flag) pair, largest
# first, as a one-column frame named 'count'.
diagnosis_data = (
    diagnosis_data.groupby(['diag_desc', 'Readmitted'])
    .size()
    .sort_values(ascending=False)
    .to_frame('count')
)
diagnosis_data

# Table with just the readmitted counts. To be merged later.
readmitted_diag = diagnosis_data.query('Readmitted == 1').rename(columns={'count': 'readmitted_count'})
readmitted_diag

# Table with the counts of patients not readmitted. To be merged later.
not_readmitted_diag = diagnosis_data.query('Readmitted == 0').rename(columns={'count': 'not_readmitted_count'})
not_readmitted_diag

# Outer join of the two tables on the diagnosis description, so that a
# diagnosis that appears with only one of the two outcomes is kept.
merged_diag = not_readmitted_diag.merge(readmitted_diag, on='diag_desc', how='outer')

# A count missing after the outer join means the diagnosis never occurred with
# that outcome, i.e. the count is 0.
merged_diag = merged_diag.fillna(0)

# Keep only diagnoses with more than 30 cases in total; percentages based on
# very few cases would be extreme outliers.
total_cases = merged_diag['not_readmitted_count'] + merged_diag['readmitted_count']
merged_diag = merged_diag[total_cases > 30].copy()

# New column: percentage of cases that were readmitted, per diagnosis type.
merged_diag['perc_of_readmitted'] = (
    merged_diag['readmitted_count']
    / (merged_diag['readmitted_count'] + merged_diag['not_readmitted_count'])
    * 100
).round(2)
merged_diag = merged_diag.sort_values(by='perc_of_readmitted', ascending=False)
merged_diag

# Overall readmission rate: every encounter whose outcome is not 'NO' counts
# as a readmission. The 'NO' group is looked up by label, so the result does
# not depend on the sort order of the counts or on positional Series access.
readmitted_count = dbdata.groupby(['readmitted']).size().sort_values(ascending=False)
total_encounters = readmitted_count.sum()
perc_readmitted = (total_encounters - readmitted_count.get('NO', 0)) / total_encounters * 100
print("The mean average of percentage of readmissions was ", round(perc_readmitted, 2), "%", sep = "")
# The original run reported 37.25% as the mean average rate of readmittence

# Produces a graph to display the above data
# Centers the data around the mean rate of readmissions: red for diagnoses
# above the mean rate, green for the rest.
mean_rate = round(perc_readmitted, 2)
merged_diag['colors'] = ['red' if x > mean_rate else 'green' for x in merged_diag['perc_of_readmitted']]
# Increases the resolution and size so the output is usable
plt.figure(figsize=(14,30), dpi= 200)
# One horizontal bar per diagnosis, running from the computed mean rate to the
# diagnosis' own rate (the same reference value the colours are based on).
plt.hlines(y=merged_diag.index, xmin=mean_rate, xmax=merged_diag.perc_of_readmitted, color=merged_diag.colors, alpha=0.4, linewidth=5)
plt.xlabel('Readmissions as a percentage of the total amount of cases')
plt.title('Number of readmissions per diagnosis type, as a percentage of the amount of cases.\nRelative to the mean amount of readmissions.', loc='left')
# plt.savefig('/content/drive/MyDrive/big_data_pred/cw/graphs_out/Number_of_readmissions_per_diagnosis_type.png',figsize=(20,25), dpi=200)
plt.show()

"""#### Exploring Length of Stay Data"""

# Table of the length of stay in hospital (columns) against age (rows).
stay_length_data = pd.crosstab(exploration_data['age'], exploration_data['time_in_hospital'])
stay_length_data

# Converts the counts for stays of 1 to 13 days into row percentages. The
# percentages are also written back over the counts in stay_length_data.
stay_days = list(range(1, 14))
stay_length_data_perc = stay_length_data[stay_days].apply(lambda row: row/row.sum() * 100, axis=1)
stay_length_data[stay_days] = stay_length_data_perc
stay_length_data_perc

# Removes some extreme values which detract from the overall analysis:
# the three longest stays (columns) and the three youngest age groups (rows).
stay_length_data_perc.drop(columns=[13, 12, 11], index=[5, 15, 25], inplace=True)
stay_length_data_perc

# Line graph with age groups on the x axis and one line per length of stay.
ax = stay_length_data_perc.plot(kind = 'line')
ax.set_title('Percentage of lengths of stays, per age group')
ax.set_xlabel('Age groups')
ax.set_ylabel('Percentage of length of stay')
ax.legend(loc=(1.04,0.3))
plt.show()

"""
### Data exploration Analysis 
"""

# Boxplot of every column whose dtype is not object.
non_object_mask = (dbdata.dtypes != "object").to_numpy()
for col_name in dbdata.columns[non_object_mask]:
    sns.boxplot(dbdata[col_name])
    plt.show()

# Frequency of each code in the three diagnosis columns.
dbdata['diag_1'].value_counts()
dbdata['diag_2'].value_counts()
dbdata['diag_3'].value_counts()

# Re-read the ICD lookup table (relative to the working directory) and attach
# it to the encounters through the primary diagnosis code.
icd_codes = pd.read_csv('icd_codes.csv')
icd_diabetes = dbdata.merge(icd_codes, left_on='diag_1', right_on='ICD_Code')
icd_diabetes.columns

# Four stacked panels in ONE figure, each plotting a feature (x axis) against
# the readmission outcome (y axis). The axis labels follow the plotted data,
# and plt.show() is called once at the end so that all four panels are drawn
# together.
fig, axes = plt.subplots(4, 1, figsize=(8, 12))
fig.subplots_adjust(hspace=.3)

# (column in icd_diabetes, label used for the x axis and title, marker style)
panels = [
    ('age', 'Age', 'rs'),
    ('race', 'Race', 'bo'),
    ('gender', 'Gender', 'go'),
    ('diag_1', 'Primary Diagnosis', 'go'),
]
for ax, (column, label, marker) in zip(axes, panels):
    ax.plot(icd_diabetes[column], icd_diabetes['readmitted'], marker)
    ax.set_xlabel(label)
    ax.set_ylabel('Readmitted')
    ax.set_title(f'{label} vs Readmitted')
    ax.grid(True)
plt.show()


# Pairwise scatter-plot matrix of the merged frame, with kernel-density
# estimates on the diagonal.
scatter_matrix(icd_diabetes, diagonal='kde')
plt.show()

# Seaborn pair plot of the same frame.
pairplot(icd_diabetes)
plt.show()

## Columns handed to auto_pairs below.
## NOTE(review): race, gender and diag_1 look categorical rather than numeric;
## confirm that the scatter matrix actually draws them.
plot_cols = ["age",
              "race",
              "gender",
              "diag_1"]

## Create a scatter plot matric --- a pair-wise scatter plots
def auto_pairs(plot_cols, df):
    """Draw a scatter-plot matrix of ``df[plot_cols]`` and show it.

    Figure number 1 is (re)used at 12x12 inches and cleared first; the
    diagonal cells show histograms.

    Args:
        plot_cols: column names to select from ``df``.
        df: DataFrame that holds the columns.

    Returns:
        The string ``'Done'``.
    """
    canvas = plt.figure(1, figsize=(12, 12))
    canvas.clf()
    scatter_matrix(df[plot_cols], diagonal='hist', ax=canvas.gca())

    plt.show()
    return 'Done'


auto_pairs(plot_cols, icd_diabetes)

## Create conditional scatter plot
def auto_scatter(df, plot_cols, y_col="price", hue_col="fuel-type", palette=None):
    """Show one scatter plot per column in ``plot_cols``, coloured by a category.

    Each plot puts the column on the x axis and ``y_col`` on the y axis, with
    one colour per value of ``hue_col``.

    Args:
        df: DataFrame holding every referenced column.
        plot_cols: column names to put on the x axis, one figure each.
        y_col: column for the y axis. Defaults to the value that used to be
            hard-coded.
        hue_col: categorical column used for colouring. Defaults to the value
            that used to be hard-coded.
        palette: optional mapping of ``hue_col`` value to colour. When omitted,
            the former diesel/gas colours are used for the default
            ``hue_col`` and seaborn's default palette otherwise.

    Returns:
        None.
    """
    if palette is None and hue_col == "fuel-type":
        palette = {"diesel": "red", "gas": "blue"}

    for col in plot_cols:
        g = sns.FacetGrid(df, margin_titles=True, hue=hue_col, palette=palette, height=8)
        g.map(plt.scatter, col, y_col)
        g.add_legend()
        plt.show()


# The calls below previously referenced frames that are never defined in this
# script (df_cln, icd_d, data, data_norm) and columns of unrelated car and
# property datasets, so they failed with NameError. They now use icd_diabetes
# and columns that this script itself relies on.
plot_cols3 = ["num_lab_procedures",
              "num_medications",
              "number_diagnoses",
              "age"]
auto_scatter(icd_diabetes, plot_cols3, y_col="time_in_hospital", hue_col="readmitted")

# Relationship between the length of stay and the number of medications
icd_diabetes.plot.scatter(x='time_in_hospital', y='num_medications', c='DarkBlue')
plt.xlabel('time_in_hospital')
plt.ylabel('num_medications')
plt.show()

# The same pair of columns before and after standardisation.
scale_cols = ["num_lab_procedures", "num_medications"]
data = icd_diabetes[scale_cols]
data_norm = pd.DataFrame(StandardScaler().fit_transform(data), index=data.index, columns=scale_cols)

f, ax = plt.subplots(2)
data.plot.scatter(x="num_lab_procedures", y="num_medications", ax=ax[0], title="Original data")
data_norm.plot.scatter(x="num_lab_procedures", y="num_medications", ax=ax[1], title="Normalized data")
f.subplots_adjust(hspace=1)
plt.show()

"""# Part 3"""

# Columns used for the readmission model; 'readmitted' is the target.
# NOTE(review): encounter_id looks like a record identifier rather than a
# clinical measurement - confirm it is meant to be a predictor.
subset_list = ['num_medications', 'number_outpatient', 'number_emergency', 'time_in_hospital',
               'number_inpatient', 'encounter_id', 'age', 'num_lab_procedures', 'number_diagnoses',
               'num_procedures', 'readmitted']

logsubset = dbdata[subset_list]
# Numeric columns of the subset; these are treated as the independent variables.
indp_col = logsubset.select_dtypes(include=[np.number])
indp_col
# age was already converted to numeric bracket midpoints during cleaning, so
# the string-valued target 'readmitted' is not part of this selection.

logsubset

readmittion_test = logsubset.copy()

# Rows with any readmission ('<30' or '>30').
readmittion_test[readmittion_test['readmitted'] != 'NO']

# Binary target: 1 = readmitted ('<30' or '>30'), 0 = not readmitted ('NO').
# This is assigned to the column directly; chained indexing of the form
# .loc[:, col][mask] = value may write to a temporary copy and leave the frame
# unchanged. The earlier chained version also encoded 'NO' as 1, the opposite
# of what the column name says.
# The column is kept as object dtype on purpose: later steps pick predictors
# with select_dtypes(include=[np.number]) and must not pick up the target.
readmittion_test['readmitted'] = (
    readmittion_test['readmitted'].ne('NO').astype(int).astype(object)
)

# Rows flagged as readmitted.
readmittion_test[readmittion_test['readmitted'] == 1]

dbdata.head()

#Showing the percentage of the responses
print((readmittion_test['readmitted'].value_counts())/readmittion_test['readmitted'].size)

readmittion_test['readmitted'].hist(figsize = (10,10))
plt.show()
# The original notebook recorded a split of roughly 62% to 38% between the two
# classes; read the printed shares above to see which class is the majority.

# Mean of every numeric column within each class of the target.
readmittion_test.groupby('readmitted').mean()
#the average number of medications taken by someone who was admitted is 14.801811	 ........

def readmitted_hist(df, plots_var, grid_cols):
    """Show, for each variable, one histogram per value of ``grid_cols``.

    Args:
        df: DataFrame holding the variables and the ``grid_cols`` column.
        plots_var: iterable of column names; one figure is shown per name.
        grid_cols: column whose values split each figure into side-by-side
            panels.

    Returns:
        None.
    """
    for variable in plots_var:
        sns.FacetGrid(df, col=grid_cols, margin_titles=True).map(plt.hist, variable)
        plt.show()


readmitted_hist(readmittion_test, indp_col, 'readmitted')

def readmittion_boxplot(df):
    """Show a boxplot of every numeric column of ``df``, grouped by 'readmitted'.

    One 6x6 inch figure is created and shown per numeric column.

    Args:
        df: DataFrame with numeric columns and a ``'readmitted'`` column.

    Returns:
        The string ``'Done'``.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        canvas = plt.figure(figsize=(6, 6))
        canvas.clf()
        df.boxplot(column=[name], ax=canvas.gca(), by=['readmitted'])
        plt.show()
    return 'Done'


readmittion_boxplot(readmittion_test)

"""From the boxplots, only the variables [number_inpatient, number_emergency,
number_outpatient] seem to have no effect on patient readmission, which sounds
logical. The histograms give a similar result."""

readmittion_test.head()

# Predictors: every numeric column of the modelling table, standardised to
# zero mean and unit variance. The result is wrapped back into a DataFrame so
# that row labels and column names are kept.
# NOTE(review): this picks up encounter_id if it is numeric - confirm that an
# identifier is wanted as a predictor.
X = readmittion_test.select_dtypes(include=[np.number]).copy()
scaler = StandardScaler()

X0 = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
Y0 = readmittion_test['readmitted'].astype('int')

# statsmodels does not add an intercept by itself. With standardised
# predictors and no constant, the model is forced to predict 0.5 at the mean
# of the data, which distorts the coefficients and p-values. The constant is
# added only to the design matrix given to Logit; X0 itself is left unchanged.
logit_model = sm.Logit(Y0, sm.add_constant(X0))
result = logit_model.fit()
print(result.summary())

"""The result from the p-values shows that most of variable are significant in the model."""

# Hold out 30% of the rows for testing. random_state is fixed (as it is for
# the second split further down) so that the scores and the confusion matrix
# are reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X0, Y0, test_size=0.3, random_state=0)
lg = linear_model.LogisticRegression()
lg.fit(X_train, Y_train)

print("Score from training data: {}".format(lg.score(X_train, Y_train)))
print("Score from test data: {}".format(lg.score(X_test, Y_test)))
print("Intercept:\n {}".format(lg.intercept_))
print("Coefficients:\n")
for feat, coef in zip(X_train.columns, lg.coef_[0]):
    print(" {:>20}: {}".format(feat, coef))

y_pred = lg.predict(X_test)
print(y_pred)

# The result is stored under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array and break every later
# call to it.
conf_matrix = metrics.confusion_matrix(Y_test, y_pred)
print(conf_matrix)

# NOTE: the counts quoted below come from one earlier run of the notebook and
# will not match the matrix printed above.
"""The result shows that 420 and 3017 were correctly predicted while 271 
and 1573 were wrongly predicted."""


print(classification_report(Y_test, y_pred))

# Seven features are kept because the boxplots suggested that seven variables
# have an impact on readmission. Recursive feature elimination removes one
# feature per step until seven remain.
selected = feature_selection.RFE(lg, n_features_to_select=7, verbose=0, step=1).fit(X0, Y0)
r_features = X0.loc[:, selected.support_]
print(f"R features are:\n{','.join(r_features)}")

X0.loc[:, selected.support_]

# Second model: logistic regression on the features kept by the recursive
# feature elimination.
X2 = r_features
Y = Y0
trainX, testX, trainY, testY = train_test_split(X2, Y, test_size=0.3, random_state=0)
lg2 = linear_model.LogisticRegression()
lg2.fit(trainX, trainY)
print("Score from training data: {}".format(lg2.score(trainX, trainY)))
print("Score from test data: {}".format(lg2.score(testX, testY)))
print("Intercept:\n {}".format(lg2.intercept_))
print("Coefficients:\n")
for feat2, coef2 in zip(X2, lg2.coef_[0]):
    print(" {:>20}: {}".format(feat2, coef2))

# Share of test rows whose predicted class equals the true class.
predicted = lg2.predict(testX)
print("Mean hits: {}".format(np.mean(predicted==testY)))

# 8-fold cross-validated accuracy on the selected features.
scores = cross_val_score(linear_model.LogisticRegression(), X2, Y, scoring='accuracy', cv=8)
scores
print("Mean scores: {}".format(scores.mean()))


# The function is called through the metrics module because the bare name
# `confusion_matrix` is rebound to an array earlier in this script, which
# would make a direct call fail with "'numpy.ndarray' object is not callable".
confusion_matrix2 = metrics.confusion_matrix(testY, predicted)
print(confusion_matrix2)


# ROC curve of the second model. Column 1 of predict_proba is the predicted
# probability of class 1, so class 1 is given as the positive label. The
# labels are used as they are: shifting testY in place (testY += 1 with
# pos_label=2) gives the same curve on the first run but shifts the labels
# again every time the cell is re-run.
prob = lg2.predict_proba(testX)[:, 1]
fpr, sensitivity, _ = metrics.roc_curve(testY, prob, pos_label=1)
print("AUC = {}".format(metrics.auc(fpr, sensitivity)))
# Blue squares: the diagonal (chance level). Red circles: the ROC curve.
plt.scatter(fpr, fpr, c='b', marker='s')
plt.scatter(fpr, sensitivity, c='r', marker='o')
plt.title('AUC of Linear Model')
plt.xlabel('False positive Rate')
plt.ylabel('True positive Rate')
plt.show()