diff --git a/jupyter-notebook/Random_Forest_Notebook.html b/jupyter-notebook/Random_Forest_Notebook.html new file mode 100644 index 0000000..be56094 --- /dev/null +++ b/jupyter-notebook/Random_Forest_Notebook.html @@ -0,0 +1,18095 @@ + + +
+ + +A significant number of hotel bookings are called off due to cancellations or no-shows. Typical reasons for cancellations include change of plans, scheduling conflicts, etc. This is often made easier by the option to do so free of charge or preferably at a low cost. This may be beneficial to hotel guests, but it is a less desirable and possibly revenue-diminishing factor for hotels to deal with. Such losses are particularly high on last-minute cancellations.
+The new technologies involving online booking channels have dramatically changed customers’ booking possibilities and behavior. This adds a further dimension to the challenge of how hotels handle cancellations, which are no longer limited to traditional booking and guest characteristics.
+This pattern of cancellations of bookings impacts a hotel on various fronts:
+This increasing number of cancellations calls for a Machine Learning based solution that can help in predicting which booking is likely to be canceled. INN Hotels Group has a chain of hotels in Portugal - they are facing problems with this high number of booking cancellations and have reached out to your firm for data-driven solutions. You, as a Data Scientist, have to analyze the data provided to find which factors have a high influence on booking cancellations, build a predictive model that can predict which booking is going to be canceled in advance, and help in formulating profitable policies for cancellations and refunds.
+The data contains the different attributes of customers' booking details. The detailed data dictionary is given below:
+Data Dictionary
+# Importing the basic libraries we will require for the project
+
+!pip install pandas
+!pip install numpy
+!pip install matplotlib
+!pip install seaborn
+!pip install scikit-learn
+
+
+import warnings
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import tree
+from sklearn.ensemble import RandomForestClassifier
+from sklearn import metrics
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
+from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,precision_recall_curve,roc_curve,make_scorer
+from sklearn.metrics import ConfusionMatrixDisplay
+
+sns.set()
+warnings.filterwarnings('ignore')
+
Requirement already satisfied: pandas in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (1.5.3) +Requirement already satisfied: python-dateutil>=2.8.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from pandas) (2.8.2) +Requirement already satisfied: pytz>=2020.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from pandas) (2022.7) +Requirement already satisfied: numpy>=1.21.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from pandas) (1.23.5) +Requirement already satisfied: six>=1.5 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0) +Requirement already satisfied: numpy in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (1.23.5) +Requirement already satisfied: matplotlib in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (3.7.0) +Requirement already satisfied: pillow>=6.2.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (9.4.0) +Requirement already satisfied: numpy>=1.20 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.23.5) +Requirement already satisfied: fonttools>=4.22.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (4.25.0) +Requirement already satisfied: cycler>=0.10 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (0.11.0) +Requirement already satisfied: contourpy>=1.0.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.0.5) +Requirement already satisfied: packaging>=20.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (22.0) +Requirement already satisfied: python-dateutil>=2.7 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (2.8.2) +Requirement already satisfied: pyparsing>=2.3.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (3.0.9) +Requirement already satisfied: kiwisolver>=1.0.1 in 
/Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.4.4) +Requirement already satisfied: six>=1.5 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) +Requirement already satisfied: seaborn in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (0.12.2) +Requirement already satisfied: numpy!=1.24.0,>=1.17 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from seaborn) (1.23.5) +Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from seaborn) (3.7.0) +Requirement already satisfied: pandas>=0.25 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from seaborn) (1.5.3) +Requirement already satisfied: kiwisolver>=1.0.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4) +Requirement already satisfied: pillow>=6.2.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0) +Requirement already satisfied: pyparsing>=2.3.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9) +Requirement already satisfied: packaging>=20.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (22.0) +Requirement already satisfied: contourpy>=1.0.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.5) +Requirement already satisfied: python-dateutil>=2.7 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2) +Requirement already satisfied: cycler>=0.10 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0) +Requirement already satisfied: fonttools>=4.22.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.25.0) +Requirement already 
satisfied: pytz>=2020.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from pandas>=0.25->seaborn) (2022.7) +Requirement already satisfied: six>=1.5 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0) +Requirement already satisfied: scikit-learn in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (1.2.1) +Requirement already satisfied: scipy>=1.3.2 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from scikit-learn) (1.10.0) +Requirement already satisfied: joblib>=1.1.1 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from scikit-learn) (1.1.1) +Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from scikit-learn) (2.2.0) +Requirement already satisfied: numpy>=1.17.3 in /Users/mcdaniel/anaconda3/lib/python3.10/site-packages (from scikit-learn) (1.23.5) ++
hotel = pd.read_csv("../data/reservations-db.csv")
+
# Copying data to another variable to avoid any changes to original data
+data = hotel.copy()
+
Let's view the first few rows and last few rows of the dataset in order to understand its structure a little better.
+We will use the head() and tail() methods from Pandas to do this.
+ +data.head()
+
+ | Booking_ID | +no_of_adults | +no_of_children | +no_of_weekend_nights | +no_of_week_nights | +type_of_meal_plan | +required_car_parking_space | +room_type_reserved | +lead_time | +arrival_year | +arrival_month | +arrival_date | +market_segment_type | +repeated_guest | +no_of_previous_cancellations | +no_of_previous_bookings_not_canceled | +avg_price_per_room | +no_of_special_requests | +booking_status | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +INN00001 | +2 | +0 | +1 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +224 | +2017 | +10 | +2 | +Offline | +0 | +0 | +0 | +65.00 | +0 | +Not_Canceled | +
1 | +INN00002 | +2 | +0 | +2 | +3 | +Not Selected | +0 | +Room_Type 1 | +5 | +2018 | +11 | +6 | +Online | +0 | +0 | +0 | +106.68 | +1 | +Not_Canceled | +
2 | +INN00003 | +1 | +0 | +2 | +1 | +Meal Plan 1 | +0 | +Room_Type 1 | +1 | +2018 | +2 | +28 | +Online | +0 | +0 | +0 | +60.00 | +0 | +Canceled | +
3 | +INN00004 | +2 | +0 | +0 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +211 | +2018 | +5 | +20 | +Online | +0 | +0 | +0 | +100.00 | +0 | +Canceled | +
4 | +INN00005 | +2 | +0 | +1 | +1 | +Not Selected | +0 | +Room_Type 1 | +48 | +2018 | +4 | +11 | +Online | +0 | +0 | +0 | +94.50 | +0 | +Canceled | +
data.tail()
+
+ | Booking_ID | +no_of_adults | +no_of_children | +no_of_weekend_nights | +no_of_week_nights | +type_of_meal_plan | +required_car_parking_space | +room_type_reserved | +lead_time | +arrival_year | +arrival_month | +arrival_date | +market_segment_type | +repeated_guest | +no_of_previous_cancellations | +no_of_previous_bookings_not_canceled | +avg_price_per_room | +no_of_special_requests | +booking_status | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36270 | +INN36271 | +3 | +0 | +2 | +6 | +Meal Plan 1 | +0 | +Room_Type 4 | +85 | +2018 | +8 | +3 | +Online | +0 | +0 | +0 | +167.80 | +1 | +Not_Canceled | +
36271 | +INN36272 | +2 | +0 | +1 | +3 | +Meal Plan 1 | +0 | +Room_Type 1 | +228 | +2018 | +10 | +17 | +Online | +0 | +0 | +0 | +90.95 | +2 | +Canceled | +
36272 | +INN36273 | +2 | +0 | +2 | +6 | +Meal Plan 1 | +0 | +Room_Type 1 | +148 | +2018 | +7 | +1 | +Online | +0 | +0 | +0 | +98.39 | +2 | +Not_Canceled | +
36273 | +INN36274 | +2 | +0 | +0 | +3 | +Not Selected | +0 | +Room_Type 1 | +63 | +2018 | +4 | +21 | +Online | +0 | +0 | +0 | +94.50 | +0 | +Canceled | +
36274 | +INN36275 | +2 | +0 | +1 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +207 | +2018 | +12 | +30 | +Offline | +0 | +0 | +0 | +161.67 | +0 | +Not_Canceled | +
data.shape
+
(36275, 19)+
data.info()
+
<class 'pandas.core.frame.DataFrame'> +RangeIndex: 36275 entries, 0 to 36274 +Data columns (total 19 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 Booking_ID 36275 non-null object + 1 no_of_adults 36275 non-null int64 + 2 no_of_children 36275 non-null int64 + 3 no_of_weekend_nights 36275 non-null int64 + 4 no_of_week_nights 36275 non-null int64 + 5 type_of_meal_plan 36275 non-null object + 6 required_car_parking_space 36275 non-null int64 + 7 room_type_reserved 36275 non-null object + 8 lead_time 36275 non-null int64 + 9 arrival_year 36275 non-null int64 + 10 arrival_month 36275 non-null int64 + 11 arrival_date 36275 non-null int64 + 12 market_segment_type 36275 non-null object + 13 repeated_guest 36275 non-null int64 + 14 no_of_previous_cancellations 36275 non-null int64 + 15 no_of_previous_bookings_not_canceled 36275 non-null int64 + 16 avg_price_per_room 36275 non-null float64 + 17 no_of_special_requests 36275 non-null int64 + 18 booking_status 36275 non-null object +dtypes: float64(1), int64(13), object(5) +memory usage: 5.3+ MB ++
Booking_ID
, type_of_meal_plan
, room_type_reserved
, market_segment_type
, and booking_status
are of object type, while the rest of the columns are numeric in nature.
There are no null values in the dataset.
+# checking for duplicate values
+data.duplicated().sum()
+
0+
Let's drop the Booking_ID column first before we proceed forward, as a column with unique values will have almost no predictive power for the Machine Learning problem at hand.
+ +data = data.drop(["Booking_ID"], axis=1)
+
data.head()
+
+ | no_of_adults | +no_of_children | +no_of_weekend_nights | +no_of_week_nights | +type_of_meal_plan | +required_car_parking_space | +room_type_reserved | +lead_time | +arrival_year | +arrival_month | +arrival_date | +market_segment_type | +repeated_guest | +no_of_previous_cancellations | +no_of_previous_bookings_not_canceled | +avg_price_per_room | +no_of_special_requests | +booking_status | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +2 | +0 | +1 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +224 | +2017 | +10 | +2 | +Offline | +0 | +0 | +0 | +65.00 | +0 | +Not_Canceled | +
1 | +2 | +0 | +2 | +3 | +Not Selected | +0 | +Room_Type 1 | +5 | +2018 | +11 | +6 | +Online | +0 | +0 | +0 | +106.68 | +1 | +Not_Canceled | +
2 | +1 | +0 | +2 | +1 | +Meal Plan 1 | +0 | +Room_Type 1 | +1 | +2018 | +2 | +28 | +Online | +0 | +0 | +0 | +60.00 | +0 | +Canceled | +
3 | +2 | +0 | +0 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +211 | +2018 | +5 | +20 | +Online | +0 | +0 | +0 | +100.00 | +0 | +Canceled | +
4 | +2 | +0 | +1 | +1 | +Not Selected | +0 | +Room_Type 1 | +48 | +2018 | +4 | +11 | +Online | +0 | +0 | +0 | +94.50 | +0 | +Canceled | +
Let's check the statistical summary of the data.
+ +data.describe()
+
+ | no_of_adults | +no_of_children | +no_of_weekend_nights | +no_of_week_nights | +required_car_parking_space | +lead_time | +arrival_year | +arrival_month | +arrival_date | +repeated_guest | +no_of_previous_cancellations | +no_of_previous_bookings_not_canceled | +avg_price_per_room | +no_of_special_requests | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +36275.000000 | +
mean | +1.844962 | +0.105279 | +0.810724 | +2.204300 | +0.030986 | +85.232557 | +2017.820427 | +7.423653 | +15.596995 | +0.025637 | +0.023349 | +0.153411 | +103.423539 | +0.619655 | +
std | +0.518715 | +0.402648 | +0.870644 | +1.410905 | +0.173281 | +85.930817 | +0.383836 | +3.069894 | +8.740447 | +0.158053 | +0.368331 | +1.754171 | +35.089424 | +0.786236 | +
min | +0.000000 | +0.000000 | +0.000000 | +0.000000 | +0.000000 | +0.000000 | +2017.000000 | +1.000000 | +1.000000 | +0.000000 | +0.000000 | +0.000000 | +0.000000 | +0.000000 | +
25% | +2.000000 | +0.000000 | +0.000000 | +1.000000 | +0.000000 | +17.000000 | +2018.000000 | +5.000000 | +8.000000 | +0.000000 | +0.000000 | +0.000000 | +80.300000 | +0.000000 | +
50% | +2.000000 | +0.000000 | +1.000000 | +2.000000 | +0.000000 | +57.000000 | +2018.000000 | +8.000000 | +16.000000 | +0.000000 | +0.000000 | +0.000000 | +99.450000 | +0.000000 | +
75% | +2.000000 | +0.000000 | +2.000000 | +3.000000 | +0.000000 | +126.000000 | +2018.000000 | +10.000000 | +23.000000 | +0.000000 | +0.000000 | +0.000000 | +120.000000 | +1.000000 | +
max | +4.000000 | +10.000000 | +7.000000 | +17.000000 | +1.000000 | +443.000000 | +2018.000000 | +12.000000 | +31.000000 | +1.000000 | +13.000000 | +58.000000 | +540.000000 | +5.000000 | +
Observations:
+Let's explore these variables in some more depth by observing their distributions.
+ +We will first define a hist_box() function that provides both a boxplot and a histogram in the same visual, with which we can perform univariate analysis on the columns of this dataset.
+ +# Defining the hist_box() function
def hist_box(data, col):
    """Show a boxplot stacked above a histogram of ``data[col]``.

    Both panels share the same x-axis so the quartiles and outliers in the
    boxplot line up with the distribution drawn below it.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    col : name of the numeric column in ``data``.
    """
    f, (ax_box, ax_hist) = plt.subplots(
        2,
        sharex=True,
        gridspec_kw={'height_ratios': (0.15, 0.85)},
        figsize=(12, 6),
    )
    # Pass the values explicitly as x= so the box is drawn horizontally and
    # lines up with the histogram on the shared x-axis.
    sns.boxplot(x=data[col], ax=ax_box, showmeans=True)
    # histplot with a KDE overlay on a density scale replaces the deprecated
    # sns.distplot.
    sns.histplot(x=data[col], kde=True, stat="density", ax=ax_hist)
    plt.show()
+
Lead Time
using the hist_box function provided and write your insights.¶hist_box(data, "lead_time")
+
Observations:
+Most guests book shortly before their arrival date, and the frequency of bookings tapers off as lead time grows, giving a right-skewed distribution (median 57 days, mean about 85 days) rather than a normal one. Nearly all reservations are made within around six months of arrival, and virtually no one books more than 1 year in advance.
+ +Average Price per Room
using the hist_box function provided and write your insights.¶hist_box(data, "avg_price_per_room")
+
Interestingly some rooms have a price equal to 0. Let's check them.
+ +data[data["avg_price_per_room"] == 0]
+
+ | no_of_adults | +no_of_children | +no_of_weekend_nights | +no_of_week_nights | +type_of_meal_plan | +required_car_parking_space | +room_type_reserved | +lead_time | +arrival_year | +arrival_month | +arrival_date | +market_segment_type | +repeated_guest | +no_of_previous_cancellations | +no_of_previous_bookings_not_canceled | +avg_price_per_room | +no_of_special_requests | +booking_status | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
63 | +1 | +0 | +0 | +1 | +Meal Plan 1 | +0 | +Room_Type 1 | +2 | +2017 | +9 | +10 | +Complementary | +0 | +0 | +0 | +0.0 | +1 | +Not_Canceled | +
145 | +1 | +0 | +0 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +13 | +2018 | +6 | +1 | +Complementary | +1 | +3 | +5 | +0.0 | +1 | +Not_Canceled | +
209 | +1 | +0 | +0 | +0 | +Meal Plan 1 | +0 | +Room_Type 1 | +4 | +2018 | +2 | +27 | +Complementary | +0 | +0 | +0 | +0.0 | +1 | +Not_Canceled | +
266 | +1 | +0 | +0 | +2 | +Meal Plan 1 | +0 | +Room_Type 1 | +1 | +2017 | +8 | +12 | +Complementary | +1 | +0 | +1 | +0.0 | +1 | +Not_Canceled | +
267 | +1 | +0 | +2 | +1 | +Meal Plan 1 | +0 | +Room_Type 1 | +4 | +2017 | +8 | +23 | +Complementary | +0 | +0 | +0 | +0.0 | +1 | +Not_Canceled | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
35983 | +1 | +0 | +0 | +1 | +Meal Plan 1 | +0 | +Room_Type 7 | +0 | +2018 | +6 | +7 | +Complementary | +1 | +4 | +17 | +0.0 | +1 | +Not_Canceled | +
36080 | +1 | +0 | +1 | +1 | +Meal Plan 1 | +0 | +Room_Type 7 | +0 | +2018 | +3 | +21 | +Complementary | +1 | +3 | +15 | +0.0 | +1 | +Not_Canceled | +
36114 | +1 | +0 | +0 | +1 | +Meal Plan 1 | +0 | +Room_Type 1 | +1 | +2018 | +3 | +2 | +Online | +0 | +0 | +0 | +0.0 | +0 | +Not_Canceled | +
36217 | +2 | +0 | +2 | +1 | +Meal Plan 1 | +0 | +Room_Type 2 | +3 | +2017 | +8 | +9 | +Online | +0 | +0 | +0 | +0.0 | +2 | +Not_Canceled | +
36250 | +1 | +0 | +0 | +2 | +Meal Plan 2 | +0 | +Room_Type 1 | +6 | +2017 | +12 | +10 | +Online | +0 | +0 | +0 | +0.0 | +0 | +Not_Canceled | +
545 rows × 18 columns
+data.loc[data["avg_price_per_room"] == 0, "market_segment_type"].value_counts()
+
Complementary 354 +Online 191 +Name: market_segment_type, dtype: int64+
# First (25%) and third (75%) quartiles of the average room price
Q1, Q3 = (data["avg_price_per_room"].quantile(q) for q in (0.25, 0.75))

# Interquartile range
IQR = Q3 - Q1

# Upper whisker of the boxplot: 1.5 * IQR above the third quartile
Upper_Whisker = Q3 + 1.5 * IQR
Upper_Whisker
+
179.55+
# capping only the extreme outliers (price >= 500) at the value of the upper whisker
+data.loc[data["avg_price_per_room"] >= 500, "avg_price_per_room"] = Upper_Whisker
+
Number of Children
+ +sns.histplot(data['no_of_children'])
+plt.show()
+
data['no_of_children'].value_counts(normalize=True)
+
0 0.925624 +1 0.044604 +2 0.029166 +3 0.000524 +9 0.000055 +10 0.000028 +Name: no_of_children, dtype: float64+
# replacing the rare values of 9 and 10 children (under 0.01% of bookings) with 3
+data["no_of_children"] = data["no_of_children"].replace([9, 10], 3)
+
Arrival Month
+ +sns.histplot(data["arrival_month"])
+plt.show()
+
data['arrival_month'].value_counts(normalize=True)
+
10 0.146575 +9 0.127112 +8 0.105114 +6 0.088298 +12 0.083280 +11 0.082150 +7 0.080496 +4 0.075424 +5 0.071620 +3 0.065003 +2 0.046975 +1 0.027953 +Name: arrival_month, dtype: float64+
Booking Status
+ +sns.histplot(data["booking_status"])
+plt.show()
+
data['booking_status'].value_counts(normalize=True)
+
Not_Canceled 0.672364 +Canceled 0.327636 +Name: booking_status, dtype: float64+
Let's encode Canceled bookings to 1 and Not_Canceled as 0 for further analysis
# Binary target: 1 for a "Canceled" booking, 0 for anything else
data["booking_status"] = (data["booking_status"] == "Canceled").astype("int64")
+
# Correlation is only defined for numeric columns. Select them explicitly:
# the frame still holds object columns (meal plan, room type, market segment),
# and pandas >= 2.0 raises on them instead of silently dropping them.
cols_list = data.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(12, 7))
sns.heatmap(data[cols_list].corr())
plt.show()
+
Observations:
+relationships of primary interest:
+relationships of secondary interest:
+Hotel rates are dynamic and change according to demand and customer demographics. Let's see how prices vary across different market segments
# Distribution of the average room price within each market segment
plt.figure(figsize=(10, 6))
sns.boxplot(
    x="market_segment_type",
    y="avg_price_per_room",
    data=data,
    palette="gist_rainbow",
)
plt.show()
+
We will define a stacked_barplot() function to help analyse how the target variable varies across predictor categories.
+ +# Defining the stacked_barplot() function
def stacked_barplot(data, predictor, target, figsize=(10, 6)):
    """Plot the percentage split of ``target`` within each ``predictor`` category.

    The cross-tabulation is normalised per row, so every bar sums to 100%.
    """
    shares = pd.crosstab(data[predictor], data[target], normalize='index')
    percentages = shares * 100
    percentages.plot(kind='bar', stacked=True, figsize=figsize)
    plt.legend(loc="lower right")
    plt.ylabel('Percentage Cancellations %')
+
Market Segment Type
against the target variable Booking Status
using the stacked_barplot function provided and write your insights.¶stacked_barplot(data, "market_segment_type", "booking_status")
+
Observations:
+Repeated Guest
against the target variable Booking Status
using the stacked_barplot function provided and write your insights.¶Repeating guests are the guests who stay in the hotel often and are important to brand equity.
+ +stacked_barplot(data, "repeated_guest", "booking_status")
+
Observations:
Let's analyze the customers whose stay included at least one week night and at least one weekend night.
# Keep bookings with at least one week night AND at least one weekend night.
# .copy() makes stay_data an independent frame, so adding a column below does
# not write into a slice of `data` (which triggers SettingWithCopyWarning).
stay_data = data[
    (data["no_of_week_nights"] > 0) & (data["no_of_weekend_nights"] > 0)
].copy()
stay_data["total_days"] = (
    stay_data["no_of_week_nights"] + stay_data["no_of_weekend_nights"]
)

stacked_barplot(stay_data, "total_days", "booking_status", figsize=(15, 6))
+
As hotel room prices are dynamic, let's see how the prices vary across different months.
# Mean room price per arrival month. errorbar=None switches off the
# confidence band; it replaces the `ci=None` argument deprecated in
# seaborn 0.12.
plt.figure(figsize=(10, 5))
sns.lineplot(y=data["avg_price_per_room"], x=data["arrival_month"], errorbar=None)
plt.show()
+
Separating the independent variables (X) and the dependent variable (Y)
# Target vector (Y) and feature matrix (X)
Y = data["booking_status"]
X = data.drop(columns=["booking_status"])

# One-hot encode the categorical features, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)
+
Splitting the data into a 70% train and 30% test set
+Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use the stratified sampling technique to ensure that relative class frequencies are approximately preserved in each train and validation fold.
+ +# Splitting data in train and test sets
+X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,stratify=Y, random_state=1)
+
print("Shape of Training set : ", X_train.shape)
+print("Shape of test set : ", X_test.shape)
+print("Percentage of classes in training set:")
+print(y_train.value_counts(normalize=True))
+print("Percentage of classes in test set:")
+print(y_test.value_counts(normalize=True))
+
Shape of Training set : (25392, 27) +Shape of test set : (10883, 27) +Percentage of classes in training set: +0 0.672377 +1 0.327623 +Name: booking_status, dtype: float64 +Percentage of classes in test set: +0 0.672333 +1 0.327667 +Name: booking_status, dtype: float64 ++
Both the cases are important as:
+If we predict that a booking will not be canceled and the booking gets canceled then the hotel will lose resources and will have to bear additional costs of distribution channels.
+If we predict that a booking will get canceled and the booking doesn't get canceled the hotel might not be able to provide satisfactory services to the customer by assuming that this booking will be canceled. This might damage brand equity.
+F1 Score
to be maximized, the greater the F1 score, the higher the chances of minimizing False Negatives and False Positives. Also, let's create a function to calculate and print the classification report and confusion matrix so that we don't have to rewrite the same code repeatedly for each model.
+ +# Creating metric function
def metrics_score(actual, predicted):
    """Print the classification report and plot the confusion matrix.

    Parameters
    ----------
    actual : true class labels.
    predicted : class labels predicted by the model.

    The tick labels assume the 0 / 1 target encoding used in this notebook
    (0 = not canceled, 1 = canceled).
    """
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8, 5))

    labels = ['Not Cancelled', 'Cancelled']
    # The cells hold integer counts, so annotate them as integers rather than
    # as two-decimal floats.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
+
rf_estimator = RandomForestClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)
+rf_estimator.fit(X_train, y_train)
+
RandomForestClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)
y_pred_train_rf = rf_estimator.predict(X_train)
+metrics_score(y_train, y_pred_train_rf)
+
precision recall f1-score support + + 0 1.00 0.99 0.99 17073 + 1 0.98 1.00 0.99 8319 + + accuracy 0.99 25392 + macro avg 0.99 0.99 0.99 25392 +weighted avg 0.99 0.99 0.99 25392 + ++
Observations:
+y_pred_test_rf = rf_estimator.predict(X_test)
+metrics_score(y_test, y_pred_test_rf)
+
precision recall f1-score support + + 0 0.91 0.95 0.93 7317 + 1 0.88 0.80 0.84 3566 + + accuracy 0.90 10883 + macro avg 0.89 0.87 0.88 10883 +weighted avg 0.90 0.90 0.90 10883 + ++
Observations
+Let's check the feature importance of the Random Forest
# Pair every feature name with its importance in the fitted forest
importances = rf_estimator.feature_importances_
columns = X.columns

# Rank the features from most to least important
importance_df = pd.DataFrame({'Importance': importances}, index=columns)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(13, 13))

sns.barplot(x=importance_df.Importance, y=importance_df.index)
+
<Axes: xlabel='Importance'>+
Observations:
+
+