# Encoding: utf-8
"""
written by: Lawrence McDaniel
            https://lawrencemcdaniel.com

date: jun-2023

usage: minimalist implementation of a Decision Tree classification model.
"""
import os
import warnings

# ------------------------------------------------------------------------------
# IMPORTANT: DON'T FORGET TO INSTALL THESE LIBRARIES WITH pip
# ------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the Machine Learning models we require from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, recall_score

# module initializations
sns.set()
HERE = os.path.abspath(os.path.dirname(__file__))

# ignore warnings raised from within library function calls
warnings.filterwarnings("ignore")

def metrics_score(actual, predicted):
    """
    Common helper for scoring a set of predictions: print the
    classification report and plot a confusion-matrix heatmap.
    Used for both the training data and the test data.
    """
    print("Metrics Score.")
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8, 5))

    sns.heatmap(
        cm,
        annot=True,
        fmt=".2f",
        xticklabels=["Not Cancelled", "Cancelled"],
        yticklabels=["Not Cancelled", "Cancelled"],
    )
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()

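# Illustrative sketch (not called anywhere in this script): how metrics_score
# is meant to be used. The two label sequences below are hypothetical values
# chosen only to show the expected input format (0 = Not Cancelled,
# 1 = Cancelled).
def _example_metrics_score():
    example_actual = [0, 0, 1, 1, 1, 0]
    example_predicted = [0, 1, 1, 1, 0, 0]

    # prints a classification report and draws the confusion-matrix heatmap
    metrics_score(example_actual, example_predicted)
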
def prepare_data():
    """
    Raw data set transformations:
      - clean the data
      - remove columns that don't contain any information
      - recast data types as necessary
      - convert categorical data into series of dummy columns
      - split dependent / independent variables
      - split training / test data sets
    """
    print("Preparing data sets")
    original_db = pd.read_csv(os.path.join(HERE, "data", "reservations-db.csv"))

    # need to be careful to only work with a **COPY** of the original
    # source data, lest we accidentally permanently modify any of this
    # raw data.
    data = original_db.copy()

    # remove the ID column from the data set, since it contains
    # no predictive information.
    data = data.drop(["Booking_ID"], axis=1)

    # recast the dependent variable as a 0/1 integer
    data["booking_status"] = data["booking_status"].apply(
        lambda x: 1 if x == "Canceled" else 0
    )

    # hive off the dependent variable, "booking_status"
    x = data.drop(["booking_status"], axis=1)
    y = data["booking_status"]

    # encode all categorical features
    x = pd.get_dummies(x, drop_first=True)

    # Split data into train and test sets
    return train_test_split(x, y, test_size=0.30, stratify=y, random_state=1)

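# Illustrative sketch (not called anywhere in this script): what
# pd.get_dummies(..., drop_first=True) does to a categorical column, and why
# stratify= keeps the class mix identical across the train and test splits.
# The toy column names and values below are hypothetical.
def _example_encoding_and_split():
    toy = pd.DataFrame(
        {
            "room_type": ["standard", "deluxe", "suite", "standard"],
            "lead_time": [10, 45, 3, 22],
        }
    )

    # one indicator column per category, minus the first category,
    # which becomes the implicit baseline
    encoded = pd.get_dummies(toy, drop_first=True)
    print(encoded)

    # with stratify, both halves keep the same 50/50 class mix
    toy_target = pd.Series([0, 1, 0, 1])
    x_tr, x_te, y_tr, y_te = train_test_split(
        encoded, toy_target, test_size=0.5, stratify=toy_target, random_state=1
    )
    print(y_tr.mean(), y_te.mean())
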
def decision_tree():
    """
    - create training and test data sets
    - create a Decision Tree classification model
    - train the model
    - generate confusion matrix and f1-score for the training set
    - generate confusion matrix and f1-score for the test set
    """
    print("Decision Tree")
    x_train, x_test, y_train, y_test = prepare_data()

    # cost-sensitive tree: weight the "Cancelled" class (1) much more
    # heavily than the "Not Cancelled" class (0)
    print("- training")
    model_dt = DecisionTreeClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)
    model_dt.fit(x_train, y_train)
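    # Note: with no depth or leaf-size constraints, the tree keeps splitting
    # until the training data is almost perfectly memorized, so the training
    # metrics below will look far better than the test metrics; that gap is
    # the overfitting addressed by the grid search further down.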

    print("- modeling on training data")
    pred_train_dt = model_dt.predict(x_train)
    metrics_score(y_train, pred_train_dt)

    print("- modeling on test data")
    pred_test_dt = model_dt.predict(x_test)
    metrics_score(y_test, pred_test_dt)

    # Hyperparameter tuning: search for a smaller tree that generalizes
    # better than the fully grown one evaluated above.

    # Choose the type of classifier.
    estimator = DecisionTreeClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)

    # Grid of parameters to choose from
    parameters = {
        "max_depth": np.arange(2, 7, 2),
        "max_leaf_nodes": [50, 75, 150, 250],
        "min_samples_split": [10, 30, 50, 70],
    }
    scorer = metrics.make_scorer(recall_score, pos_label=1)
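    # The scorer above optimizes recall on the positive ("Cancelled") class:
    # the working assumption is that a missed cancellation (false negative)
    # costs the business more than a false alarm, so the grid search favors
    # parameter combinations that catch as many cancellations as possible.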

    # Run the grid search: 10-fold cross-validation over every combination
    # in the parameter grid, scored with the recall-based scorer.
    grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=10)
    grid_obj = grid_obj.fit(x_train, y_train)

    # Set the estimator to the best combination of parameters
    estimator = grid_obj.best_estimator_
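    # show which combination from the parameter grid the search selected
    # (best_params_ is a standard GridSearchCV attribute)
    print("- best parameters found:", grid_obj.best_params_)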

    # Fit the best algorithm to the data.
    estimator.fit(x_train, y_train)

    # Checking performance of the tuned model on the training dataset
    print("- tuned model on training data")
    dt_tuned = estimator.predict(x_train)
    metrics_score(y_train, dt_tuned)

    # Checking performance of the tuned model on the test dataset
    print("- tuned model on test data")
    y_pred_tuned = estimator.predict(x_test)
    metrics_score(y_test, y_pred_tuned)

    # visualization of the decision tree
    feature_names = list(x_train.columns)
    plt.figure(figsize=(20, 10))
    out = tree.plot_tree(
        estimator,
        max_depth=3,
        feature_names=feature_names,
        filled=True,
        fontsize=9,
        node_ids=False,
        class_names=None,
    )
    # the code below adds arrows to the decision tree splits if they are missing
    for o in out:
        arrow = o.arrow_patch
        if arrow is not None:
            arrow.set_edgecolor("black")
            arrow.set_linewidth(1)
    plt.show()

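# Illustrative sketch (not called from the main flow): a common companion to
# the tree plot is a bar chart of feature_importances_, which ranks the
# one-hot-encoded columns by how much they reduce impurity in the fitted
# tree. The helper below is hypothetical and assumes it is handed an
# already-fitted DecisionTreeClassifier plus the matching training frame.
def plot_feature_importances(fitted_tree, x_train):
    importances = pd.Series(fitted_tree.feature_importances_, index=x_train.columns)
    importances.sort_values(ascending=True).plot(kind="barh", figsize=(10, 8))
    plt.xlabel("Relative importance")
    plt.show()
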
if __name__ == "__main__":
    decision_tree()