-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogistic-regression.py
117 lines (93 loc) · 3.52 KB
/
logistic-regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Encoding: utf-8
"""
written by: Lawrence McDaniel
https://lawrencemcdaniel.com
date: jun-2023
usage: minimalist implementation of Logistic Regression model.
"""
import os
import warnings
# ------------------------------------------------------------------------------
# IMPORTANT: DON'T FORGET TO INSTALL THESE LIBRARIES WITH pip
# ------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Module initializations: these statements run once, when the module is loaded.
# Apply seaborn's default styling to the plots drawn later in this file.
sns.set()
# Absolute path of the directory that contains this file; data files are
# resolved relative to it rather than to the current working directory.
HERE = os.path.abspath(os.path.dirname(__file__))
# Silence every warning for the rest of the process.
# NOTE(review): this presumably also hides library warnings such as
# scikit-learn convergence warnings -- confirm that this is intended.
warnings.filterwarnings("ignore")
def metrics_score(actual, predicted):
    """
    Print a classification report and plot the confusion matrix.

    Common helper for measuring the accuracy of both the train as well as
    the test data.

    Args:
        actual: the true class labels.
        predicted: the predicted class labels, aligned with ``actual``.

    Returns:
        None. The report is printed to stdout and the confusion matrix is
        shown as a heatmap via ``plt.show()``.
    """
    print("Metrics Score.")
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)

    # Tick labels are given in the same order for both axes: the first
    # row/column of the matrix is labelled "Not Cancelled", the second
    # "Cancelled".
    class_names = ["Not Cancelled", "Cancelled"]

    plt.figure(figsize=(8, 5))
    sns.heatmap(
        cm,
        annot=True,
        # Confusion-matrix cells are integer sample counts, so annotate them
        # as integers instead of as floats with two decimals ("123.00").
        fmt="d",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()
def prepare_data():
    """
    Load the reservations data set and return train / test splits.

    Steps:
      - read ``data/reservations-db.csv`` from the directory of this file
      - drop the ``Booking_ID`` column, which carries no predictive
        information
      - recode ``booking_status`` as 1 for "Canceled" and 0 for anything else
      - one-hot encode the categorical features, dropping the first level
        of each
      - split into 70% training / 30% test data, stratified on the target

    Returns:
        The result of ``train_test_split``: the training features, test
        features, training target and test target, in that order.
    """
    csv_path = os.path.join(HERE, "data", "reservations-db.csv")
    raw = pd.read_csv(csv_path)

    # drop() hands back a new frame, so the raw data read from disk is never
    # modified by the transformations below.
    bookings = raw.drop(columns=["Booking_ID"])

    # Separate the dependent variable from the features, then recast it as
    # a 0 / 1 indicator.
    status = bookings.pop("booking_status")
    y = status.apply(lambda value: int(value == "Canceled"))

    # Whatever is left in the frame is the set of independent variables;
    # encode all categorical features as dummy columns.
    x = pd.get_dummies(bookings, drop_first=True)

    return train_test_split(
        x,
        y,
        test_size=0.30,
        stratify=y,
        random_state=1,
    )
def logistic_regression(optimal_threshold=0.42):
    """
    Train a Logistic Regression model and report its quality.

    - create training and test data sets
    - create a Logistic Regression model
    - train the model
    - generate confusion matrix and f-score for the training set
    - generate confusion matrix and f-score for the test set

    Args:
        optimal_threshold: probability cut-off; a row is predicted as
            class 1 when its class-1 probability is strictly greater than
            this value. Defaults to 0.42 (refer to the Jupyter Notebook to
            see how we arrived at 0.42).

    Returns:
        None. Progress messages and metrics are printed, and the confusion
        matrices are plotted by ``metrics_score``.

    Raises:
        ValueError: if ``optimal_threshold`` is not within [0, 1].
    """
    # Probabilities live in [0, 1]; fail fast on a meaningless cut-off
    # (this also rejects NaN) before any data is loaded or a model is trained.
    if not 0.0 <= optimal_threshold <= 1.0:
        raise ValueError(
            f"optimal_threshold must be between 0 and 1, got {optimal_threshold!r}"
        )

    print("Prepare data")
    x_train, x_test, y_train, y_test = prepare_data()

    print("train model")
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Score the training set first, then the test set, using the same
    # thresholding logic for both.
    evaluations = (
        ("model training data and measure results", x_train, y_train),
        ("model test data and measure results", x_test, y_test),
    )
    for banner, features, targets in evaluations:
        print(banner)
        # The second column of predict_proba is compared against the
        # threshold to obtain boolean predictions.
        probabilities = model.predict_proba(features)
        metrics_score(targets, probabilities[:, 1] > optimal_threshold)
# Run the end-to-end workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    logistic_regression()