# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Importing the Machine Learning models we require from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Importing the other functions we may require from Scikit-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder

# To get different metric scores
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,plot_confusion_matrix,precision_recall_curve,roc_curve,make_scorer

# Code to ignore warnings from function usage
import warnings;
import numpy as np
warnings.filterwarnings('ignore')


# Load the bookings file; `hotel` is kept as the untouched reference copy
hotel = pd.read_csv(filepath_or_buffer="INNHotelsGroup.csv")


# Work on a deep copy so that later cleaning steps never modify `hotel`
data = hotel.copy(deep=True)


# First five rows
data.head(n=5)


# Last five rows
data.tail(n=5)


# (number of rows, number of columns)
data.shape

(36275, 19)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date                          36275 non-null  int64  
 12  market_segment_type                   36275 non-null  object 
 13  repeated_guest                        36275 non-null  int64  
 14  no_of_previous_cancellations          36275 non-null  int64  
 15  no_of_previous_bookings_not_canceled  36275 non-null  int64  
 16  avg_price_per_room                    36275 non-null  float64
 17  no_of_special_requests                36275 non-null  int64  
 18  booking_status                        36275 non-null  object 
dtypes: float64(1), int64(13), object(5)
memory usage: 5.3+ MB


# Count rows that are exact duplicates of an earlier row (all columns equal)
data.duplicated().sum()

0


# Booking_ID is removed before any analysis or modelling
data = data.drop(columns=["Booking_ID"])


data.head(n=5)


# Summary statistics of the numeric columns, transposed to one row per column
data.describe().transpose()


# Defining the hist_box() function
def hist_box(data, col):
    """Show a box plot stacked above a histogram for a single column.

    Both panels share the x-axis so the box plot lines up with the
    distribution drawn underneath it.

    Args:
        data: DataFrame that contains the column to plot.
        col: Name of the column in ``data``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    _, (ax_box, ax_hist) = plt.subplots(
        2,
        sharex=True,
        gridspec_kw={'height_ratios': (0.15, 0.85)},
        figsize=(12, 6),
    )
    # The values must be passed as ``x=``: in recent seaborn releases the first
    # positional argument is ``data``, so a positional Series would no longer be
    # drawn horizontally along the shared x-axis.
    sns.boxplot(x=data[col], ax=ax_box, showmeans=True)
    # ``distplot`` is deprecated; ``histplot`` with a KDE overlay on a density
    # scale is the documented replacement for the same kind of picture.
    sns.histplot(x=data[col], kde=True, stat="density", ax=ax_hist)
    plt.show()


# Distribution of lead_time
hist_box(data=data, col='lead_time')


# Distribution of avg_price_per_room
hist_box(data=data, col='avg_price_per_room')


# Bookings recorded with an average room price of exactly zero
data.loc[data["avg_price_per_room"].eq(0)]


# Market segments those zero-price bookings come from
data.loc[data["avg_price_per_room"].eq(0), "market_segment_type"].value_counts()

Complementary    354
Online           191
Name: market_segment_type, dtype: int64


# First and third quartiles of the room price
Q1 = data["avg_price_per_room"].quantile(q=0.25)
Q3 = data["avg_price_per_room"].quantile(q=0.75)

# Inter-quartile range
IQR = Q3 - Q1

# Upper whisker of the box plot: Q3 plus one and a half IQRs
Upper_Whisker = Q3 + IQR * 1.5
Upper_Whisker

179.55


# Cap only the extreme prices (>= 500) at the upper whisker; prices between the
# whisker and 500 are deliberately left unchanged by this filter
data.loc[data["avg_price_per_room"] >= 500, "avg_price_per_room"] = Upper_Whisker


# Pass the column as ``x=``: recent seaborn releases treat the first positional
# argument as ``data``, which would not produce one bar per category
sns.countplot(x=data['no_of_children'])
plt.show()


# Share of bookings per number of children
data['no_of_children'].value_counts(normalize=True)

0     0.925624
1     0.044604
2     0.029166
3     0.000524
9     0.000055
10    0.000028
Name: no_of_children, dtype: float64


# replacing 9 and 10 children with 3 (the largest of the remaining values)
data["no_of_children"] = data["no_of_children"].replace([9, 10], 3)


# Pass the column as ``x=``: recent seaborn releases treat the first positional
# argument as ``data``, which would not produce one bar per month
sns.countplot(x=data["arrival_month"])
plt.show()


# Share of bookings per arrival month
data['arrival_month'].value_counts(normalize=True)

10    0.146575
9     0.127112
8     0.105114
6     0.088298
12    0.083280
11    0.082150
7     0.080496
4     0.075424
5     0.071620
3     0.065003
2     0.046975
1     0.027953
Name: arrival_month, dtype: float64


# Pass the column as ``x=``: recent seaborn releases treat the first positional
# argument as ``data``, which would not produce one bar per booking status
sns.countplot(x=data["booking_status"])
plt.show()


# Class balance of the target
data['booking_status'].value_counts(normalize=True)

Not_Canceled    0.672364
Canceled        0.327636
Name: booking_status, dtype: float64


# Encode the target as integers: 1 = Canceled, 0 = Not_Canceled
data["booking_status"] = data["booking_status"].replace(
    to_replace={"Not_Canceled": 0, "Canceled": 1}
)


# Names of the numeric columns; correlations are only defined for these
cols_list = data.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(12, 7))
# Select the numeric columns explicitly: calling .corr() on a frame that still
# holds string columns raises on recent pandas versions instead of silently
# skipping them
sns.heatmap(data[cols_list].corr(), annot=True)
plt.show()


# Spread of the room price within each market segment
plt.figure(figsize=(10, 6))
sns.boxplot(
    x="market_segment_type",
    y="avg_price_per_room",
    data=data,
    palette="gist_rainbow",
)
plt.show()


# Defining the stacked_barplot() function
def stacked_barplot(data, predictor, target, figsize=(10, 6)):
    """Draw a stacked bar chart of ``target`` shares for each ``predictor`` level.

    Args:
        data: DataFrame containing both columns.
        predictor: Column whose levels become the bars.
        target: Column whose classes are stacked inside each bar.
        figsize: Figure size forwarded to the pandas plot call.
    """
    # Row-normalised cross-tabulation, so every bar adds up to 100 percent
    shares = pd.crosstab(data[predictor], data[target], normalize='index')
    percentages = shares * 100
    percentages.plot(kind='bar', stacked=True, figsize=figsize)
    plt.ylabel('Percentage Cancellations %')
    plt.legend(loc="lower right")


# Cancellation share per market segment
stacked_barplot(data, predictor='market_segment_type', target='booking_status')


# Cancellation share for first-time versus repeated guests
stacked_barplot(data, predictor='repeated_guest', target='booking_status')


# Keep bookings that include at least one week night and one weekend night.
# .copy() makes stay_data an independent frame, so adding a column below does
# not write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
stay_data = data[(data["no_of_week_nights"] > 0) & (data["no_of_weekend_nights"] > 0)].copy()
# Total length of stay in nights
stay_data["total_days"] = (stay_data["no_of_week_nights"] + stay_data["no_of_weekend_nights"])

stacked_barplot(stay_data, "total_days", "booking_status",figsize=(15,6))


# Average room price across arrival months, drawn without a confidence band
plt.figure(figsize=(10, 5))
sns.lineplot(data=data, x="arrival_month", y="avg_price_per_room", ci=None)
plt.show()


# Target vector and feature matrix
Y = data["booking_status"]
X = data.drop(columns=["booking_status"])

# One-hot encode the categorical features, dropping the first level of each
X = pd.get_dummies(data=X, drop_first=True)


# 70/30 split, stratified on the target so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)


print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))

Shape of Training set :  (25392, 27)
Shape of test set :  (10883, 27)
Percentage of classes in training set:
0    0.672377
1    0.327623
Name: booking_status, dtype: float64
Percentage of classes in test set:
0    0.672333
1    0.327667
Name: booking_status, dtype: float64


# Creating metric function 
def metrics_score(actual, predicted):
    """Print a classification report and draw the confusion matrix as a heatmap.

    Args:
        actual: True class labels.
        predicted: Predicted class labels, aligned with ``actual``.
    """
    class_labels = ['Not Cancelled', 'Cancelled']
    print(classification_report(actual, predicted))

    plt.figure(figsize=(8, 5))
    # Rows are the actual classes, columns the predicted ones
    sns.heatmap(
        confusion_matrix(actual, predicted),
        annot=True,
        fmt='.2f',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


# Logistic regression with scikit-learn's default settings, trained on the
# unscaled training features

# Fitting logistic regression model
lg =LogisticRegression()
lg.fit(X_train,y_train)

LogisticRegression()


# Class predictions from predict(), i.e. before any custom threshold is applied

# Checking the performance on the training data
y_pred_train = lg.predict(X_train)
metrics_score(y_train,y_pred_train)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86     17073
           1       0.74      0.58      0.65      8319

    accuracy                           0.80     25392
   macro avg       0.78      0.74      0.75     25392
weighted avg       0.79      0.80      0.79     25392


# Same evaluation on the held-out split

# Checking the performance on the test dataset
y_pred_test = lg.predict(X_test)
metrics_score(y_test,y_pred_test)

              precision    recall  f1-score   support

           0       0.81      0.90      0.85      7317
           1       0.74      0.57      0.65      3566

    accuracy                           0.79     10883
   macro avg       0.77      0.74      0.75     10883
weighted avg       0.79      0.79      0.79     10883


# Class probabilities on the training set; column 1 belongs to class 1 (canceled)
y_scores_lg = lg.predict_proba(X_train)

precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(
    y_train, y_scores_lg[:, 1]
)

# Precision and recall as functions of the decision threshold; both arrays are
# one element longer than the thresholds, hence the [:-1]
plt.figure(figsize=(10, 7))
plt.plot(thresholds_lg, precisions_lg[:-1], color='b', linestyle='--', label='precision')
plt.plot(thresholds_lg, recalls_lg[:-1], color='g', linestyle='--', label='recall')
plt.xlabel('Threshold')
plt.ylim(0, 1)
plt.legend(loc='upper left')
plt.show()


# Setting the optimal threshold
# NOTE(review): hard-coded value, presumably read off the precision-recall plot
# for the logistic regression model -- confirm if the model or data changes
optimal_threshold = 0.42


# Re-evaluate the training set, labelling a booking as canceled when its
# class-1 probability exceeds optimal_threshold

# Creating confusion matrix
y_pred_train = lg.predict_proba(X_train)
metrics_score(y_train,y_pred_train[:,1]>optimal_threshold)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     17073
           1       0.69      0.68      0.69      8319

    accuracy                           0.80     25392
   macro avg       0.77      0.77      0.77     25392
weighted avg       0.79      0.80      0.80     25392


# Test-set performance of the logistic regression with the same custom threshold

y_pred_test = lg.predict_proba(X_test)
metrics_score(y_test,y_pred_test[:,1]>optimal_threshold)

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      7317
           1       0.68      0.67      0.67      3566

    accuracy                           0.79     10883
   macro avg       0.76      0.76      0.76     10883
weighted avg       0.79      0.79      0.79     10883


# Standardise the features; the scaler learns its statistics from the training split only
sc = StandardScaler()

# Fit on the training data and wrap the result in a DataFrame with the feature names
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train), columns=X.columns)

# Apply the train-fitted scaling to the test data
X_test_scaled = pd.DataFrame(sc.transform(X_test), columns=X.columns)


# Linear-kernel SVM trained on the standardised features; probability=True
# enables predict_proba(), which the threshold analysis of this model uses

svm = SVC(kernel='linear',probability=True)# Linear kernel, i.e. a linear decision boundary
model = svm.fit(X= X_train_scaled, y = y_train)  # fit() returns the estimator itself


# Performance of the linear SVM on the training data

y_pred_train_svm = model.predict(X_train_scaled)
metrics_score(y_train,y_pred_train_svm)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86     17073
           1       0.74      0.61      0.67      8319

    accuracy                           0.80     25392
   macro avg       0.79      0.76      0.77     25392
weighted avg       0.80      0.80      0.80     25392


# Performance of the linear SVM on the test data
y_pred_test_svm = model.predict(X_test_scaled)

metrics_score(y_test, y_pred_test_svm)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      7317
           1       0.74      0.61      0.67      3566

    accuracy                           0.80     10883
   macro avg       0.78      0.75      0.76     10883
weighted avg       0.80      0.80      0.80     10883


# Class probabilities of the linear SVM on the training set

y_scores_svm=model.predict_proba(X_train_scaled)

# Column 1 holds the probability of class 1 (canceled)
precisions_svm, recalls_svm, thresholds_svm = precision_recall_curve(y_train, y_scores_svm[:,1])

# Plot values of precisions, recalls, and thresholds
# (precision/recall arrays are one element longer than thresholds, hence [:-1])
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm, precisions_svm[:-1], 'b--', label='precision')
plt.plot(thresholds_svm, recalls_svm[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()


# NOTE(review): hard-coded threshold, presumably read off the plot above -- confirm
optimal_threshold_svm=0.42


# Training-set performance of the linear SVM with the custom threshold

y_pred_train_svm=model.predict_proba(X_train_scaled)
metrics_score(y_train, y_pred_train_svm[:,1]>optimal_threshold_svm)


# Test-set performance of the linear SVM with the custom threshold.
# The probabilities must come from the scaled *test* features and be compared
# with y_test; scoring X_train_scaled against y_train here would only repeat
# the training-set evaluation.

y_pred_test_svm=model.predict_proba(X_test_scaled)
metrics_score(y_test, y_pred_test_svm[:,1]>optimal_threshold_svm)


# RBF-kernel SVM on the standardised features; probability=True enables predict_proba()

svm_rbf=SVC(kernel="rbf",probability=True)
svm_rbf.fit(X_train_scaled,y_train)

SVC(probability=True)


# Performance of the RBF SVM on the training data

y_pred_train_svm_rbf = svm_rbf.predict(X_train_scaled)
metrics_score(y_train,y_pred_train_svm_rbf)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89     17073
           1       0.81      0.69      0.74      8319

    accuracy                           0.85     25392
   macro avg       0.83      0.80      0.82     25392
weighted avg       0.84      0.85      0.84     25392


# Performance of the RBF SVM on the test data
# (reuses the name y_pred_test, replacing the logistic-regression predictions)

y_pred_test = svm_rbf.predict(X_test_scaled)
metrics_score(y_test,y_pred_test)

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      7317
           1       0.80      0.66      0.72      3566

    accuracy                           0.84     10883
   macro avg       0.82      0.79      0.80     10883
weighted avg       0.83      0.84      0.83     10883


# Predict on train data

y_scores_svm_rbf=svm_rbf.predict_proba(X_train_scaled)

# Column 1 holds the probability of class 1 (canceled)
precisions_svm_rbf, recalls_svm_rbf, thresholds_svm_rbf = precision_recall_curve(y_train, y_scores_svm_rbf[:,1])

# Plot values of precisions, recalls, and thresholds
# (precision/recall arrays are one element longer than thresholds, hence [:-1])
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm_rbf, precisions_svm_rbf[:-1], 'b--', label='precision')
plt.plot(thresholds_svm_rbf, recalls_svm_rbf[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()


# NOTE(review): this rebinds optimal_threshold_svm, the same name used for the
# linear SVM threshold; re-running linear-SVM cells afterwards would use 0.38
optimal_threshold_svm=0.38


# Training-set performance of the RBF SVM with the custom threshold

y_pred_train_svm_rbf = svm_rbf.predict_proba(X_train_scaled)
metrics_score(y_train,y_pred_train_svm_rbf[:,1]>optimal_threshold_svm)

              precision    recall  f1-score   support

           0       0.88      0.89      0.88     17073
           1       0.76      0.75      0.76      8319

    accuracy                           0.84     25392
   macro avg       0.82      0.82      0.82     25392
weighted avg       0.84      0.84      0.84     25392


# Test-set performance of the RBF SVM with the custom threshold

y_pred_test = svm_rbf.predict_proba(X_test_scaled)
metrics_score(y_test,y_pred_test[:,1]>optimal_threshold_svm)

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      7317
           1       0.75      0.74      0.74      3566

    accuracy                           0.83     10883
   macro avg       0.81      0.81      0.81     10883
weighted avg       0.83      0.83      0.83     10883


# Decision tree with default hyperparameters, trained on the unscaled features;
# random_state is fixed so the fitted tree is reproducible

model_dt = DecisionTreeClassifier(random_state=1)
model_dt.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)


# Evaluate the default decision tree on the data it was trained on

# Checking performance on the training dataset
pred_train_dt = model_dt.predict(X_train)
metrics_score(y_train,pred_train_dt)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     17073
           1       1.00      0.99      0.99      8319

    accuracy                           0.99     25392
   macro avg       1.00      0.99      0.99     25392
weighted avg       0.99      0.99      0.99     25392


# Checking performance of the default decision tree on the test dataset
pred_test_dt = model_dt.predict(X_test)
metrics_score(y_test,pred_test_dt)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      7317
           1       0.79      0.79      0.79      3566

    accuracy                           0.87     10883
   macro avg       0.85      0.85      0.85     10883
weighted avg       0.87      0.87      0.87     10883


# Hyperparameter tuning of the decision tree with a 5-fold grid search

# Base classifier whose hyperparameters are searched
estimator = DecisionTreeClassifier(random_state=1)

# Candidate values for each hyperparameter
parameters = dict(
    max_depth=np.arange(2, 7, 2),
    max_leaf_nodes=[50, 75, 150, 250],
    min_samples_split=[10, 30, 50, 70],
)


# Exhaustive search over the grid, scored by F1
grid_obj = GridSearchCV(estimator=estimator, param_grid=parameters, scoring='f1', cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Replace the base classifier with the best combination found
estimator = grid_obj.best_estimator_

# Fit the selected model on the full training data
estimator.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, max_leaf_nodes=50, min_samples_split=10,
                       random_state=1)


# Evaluate the tuned decision tree (`estimator`) on the training data

# Checking performance on the training dataset
dt_tuned = estimator.predict(X_train)
metrics_score(y_train,dt_tuned)

              precision    recall  f1-score   support

           0       0.86      0.93      0.89     17073
           1       0.82      0.68      0.75      8319

    accuracy                           0.85     25392
   macro avg       0.84      0.81      0.82     25392
weighted avg       0.85      0.85      0.84     25392


# Evaluate the tuned decision tree (`estimator`) on the held-out data

# Checking performance on the test dataset
y_pred_tuned = estimator.predict(X_test)
metrics_score(y_test,y_pred_tuned)

              precision    recall  f1-score   support

           0       0.85      0.93      0.89      7317
           1       0.82      0.67      0.74      3566

    accuracy                           0.84     10883
   macro avg       0.84      0.80      0.81     10883
weighted avg       0.84      0.84      0.84     10883


# Draw the top levels of the tuned decision tree
feature_names = list(X_train.columns)
plt.figure(figsize=(20, 10))
out = tree.plot_tree(
    estimator,max_depth=3,  # only the first levels are drawn to keep the figure readable
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()


# Feature importances of the tuned decision tree

# Importance of features in the tree building
# `estimator` is the tuned tree selected by the grid search; `model_dt` is the
# untuned tree and would answer a different question than the one asked here

importances = estimator.feature_importances_
# Ascending order, so the most important feature ends up at the top of the chart
indices = np.argsort(importances)

plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()


# Random forest with default hyperparameters and a fixed seed, trained on the
# unscaled features

rf_estimator = RandomForestClassifier( random_state = 1)

rf_estimator.fit(X_train, y_train)

RandomForestClassifier(random_state=1)


# Performance of the random forest on the training data

y_pred_train_rf = rf_estimator.predict(X_train)

metrics_score(y_train,y_pred_train_rf)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     17073
           1       1.00      0.99      0.99      8319

    accuracy                           0.99     25392
   macro avg       0.99      0.99      0.99     25392
weighted avg       0.99      0.99      0.99     25392


# Performance of the random forest on the test data

y_pred_test_rf = rf_estimator.predict(X_test)

metrics_score(y_test,y_pred_test_rf)

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      7317
           1       0.88      0.80      0.84      3566

    accuracy                           0.90     10883
   macro avg       0.90      0.88      0.88     10883
weighted avg       0.90      0.90      0.90     10883


# Feature importances of the random forest, largest first

importances = rf_estimator.feature_importances_

columns = X.columns

importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (13, 13))

# x and y are passed by keyword: recent seaborn releases accept only ``data``
# positionally, so two positional arguments raise a TypeError there
sns.barplot(x=importance_df.Importance, y=importance_df.index)

<AxesSubplot:xlabel='Importance'>

	count	mean	std	min	25%	50%	75%	max
no_of_adults	36275.0	1.844962	0.518715	0.0	2.0	2.00	2.0	4.0
no_of_children	36275.0	0.105279	0.402648	0.0	0.0	0.00	0.0	10.0
no_of_weekend_nights	36275.0	0.810724	0.870644	0.0	0.0	1.00	2.0	7.0
no_of_week_nights	36275.0	2.204300	1.410905	0.0	1.0	2.00	3.0	17.0
required_car_parking_space	36275.0	0.030986	0.173281	0.0	0.0	0.00	0.0	1.0
lead_time	36275.0	85.232557	85.930817	0.0	17.0	57.00	126.0	443.0
arrival_year	36275.0	2017.820427	0.383836	2017.0	2018.0	2018.00	2018.0	2018.0
arrival_month	36275.0	7.423653	3.069894	1.0	5.0	8.00	10.0	12.0
arrival_date	36275.0	15.596995	8.740447	1.0	8.0	16.00	23.0	31.0
repeated_guest	36275.0	0.025637	0.158053	0.0	0.0	0.00	0.0	1.0
no_of_previous_cancellations	36275.0	0.023349	0.368331	0.0	0.0	0.00	0.0	13.0
no_of_previous_bookings_not_canceled	36275.0	0.153411	1.754171	0.0	0.0	0.00	0.0	58.0
avg_price_per_room	36275.0	103.423539	35.089424	0.0	80.3	99.45	120.0	540.0
no_of_special_requests	36275.0	0.619655	0.786236	0.0	0.0	0.00	1.0	5.0

	Booking_ID	no_of_adults	no_of_weekend_nights	no_of_week_nights	type_of_meal_plan	room_type_reserved	lead_time	arrival_year	arrival_month	arrival_date	market_segment_type	avg_price_per_room	no_of_special_requests	booking_status
0	INN00001	2	1	2	Meal Plan 1	Room_Type 1	224	2017	10	2	Offline	65.00	0	Not_Canceled
1	INN00002	2	2	3	Not Selected	Room_Type 1	5	2018	11	6	Online	106.68	1	Not_Canceled
2	INN00003	1	2	1	Meal Plan 1	Room_Type 1	1	2018	2	28	Online	60.00	0	Canceled
3	INN00004	2	0	2	Meal Plan 1	Room_Type 1	211	2018	5	20	Online	100.00	0	Canceled
4	INN00005	2	1	1	Not Selected	Room_Type 1	48	2018	4	11	Online	94.50	0	Canceled

	Booking_ID	no_of_adults	no_of_weekend_nights	no_of_week_nights	type_of_meal_plan	room_type_reserved	lead_time	arrival_year	arrival_month	arrival_date	market_segment_type	avg_price_per_room	no_of_special_requests	booking_status
36270	INN36271	3	2	6	Meal Plan 1	Room_Type 4	85	2018	8	3	Online	167.80	1	Not_Canceled
36271	INN36272	2	1	3	Meal Plan 1	Room_Type 1	228	2018	10	17	Online	90.95	2	Canceled
36272	INN36273	2	2	6	Meal Plan 1	Room_Type 1	148	2018	7	1	Online	98.39	2	Not_Canceled
36273	INN36274	2	0	3	Not Selected	Room_Type 1	63	2018	4	21	Online	94.50	0	Canceled
36274	INN36275	2	1	2	Meal Plan 1	Room_Type 1	207	2018	12	30	Offline	161.67	0	Not_Canceled

Project - Classification and Hypothesis Testing: Hotel Booking Cancellation Prediction¶

Marks: 40¶

Problem Statement¶

Context¶

Objective¶

Data Description¶

Importing the libraries required¶

Loading the dataset¶

Overview of the dataset¶

View the first and last 5 rows of the dataset¶

Understand the shape of the dataset¶

Check the data types of the columns for the dataset¶

Checking for duplicate values¶

Dropping the unique values column¶

Question 1: Check the summary statistics of the dataset and write your observations (2 Marks)¶

Exploratory Data Analysis¶

Question 2: Univariate Analysis¶

Question 2.1: Plot the histogram and box plot for the variable Lead Time using the hist_box function provided and write your insights. (1 Mark)¶

Question 2.2: Plot the histogram and box plot for the variable Average Price per Room using the hist_box function provided and write your insights. (1 Mark)¶

Let's understand the distribution of the categorical variables¶

Question 3: Bivariate Analysis¶

Question 3.1: Find and visualize the correlation matrix using a heatmap and write your observations from the plot. (2 Marks)¶

Question 3.2: Plot the stacked barplot for the variable Market Segment Type against the target variable Booking Status using the stacked_barplot function provided and write your insights. (1 Mark)¶

Question 3.3: Plot the stacked barplot for the variable Repeated Guest against the target variable Booking Status using the stacked_barplot function provided and write your insights. (1 Mark)¶

Data Preparation for Modeling¶

Model Evaluation Criterion¶

Model can make wrong predictions as:¶

Which case is more important?¶

How to reduce the losses?¶

Building the model¶

Question 4: Logistic Regression (6 Marks)¶

Question 4.1: Build a Logistic Regression model (Use the sklearn library) (1 Mark)¶

Question 4.2: Check the performance of the model on train and test data (2 Marks)¶

Question 4.3: Find the optimal threshold for the model using the Precision-Recall Curve. (1 Mark)¶

Question 4.4: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶

Question 5: Support Vector Machines (11 Marks)¶

Question 5.1: Build a Support Vector Machine model using a linear kernel (1 Mark)¶

Question 5.2: Check the performance of the model on train and test data (2 Marks)¶

Question 5.3: Find the optimal threshold for the model using the Precision-Recall Curve. (1 Mark)¶

Question 5.4: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶

Question 5.5: Build a Support Vector Machines model using an RBF kernel (1 Mark)¶

Question 5.6: Check the performance of the model on train and test data (2 Marks)¶

Checking model performance on test set¶

Question 5.7: Check the performance of the model on train and test data using the optimal threshold. (2 Marks)¶

Question 6: Decision Trees (7 Marks)¶

Question 6.1: Build a Decision Tree Model (1 Mark)¶

Question 6.2: Check the performance of the model on train and test data (2 Marks)¶

Checking model performance on test set¶

Question 6.3: Perform hyperparameter tuning for the decision tree model using GridSearch CV (1 Mark)¶

Question 6.4: Check the performance of the model on the train and test data using the tuned model (2 Marks)¶

Checking performance on the training set¶

Visualizing the Decision Tree¶

Question 6.5: What are some important features based on the tuned decision tree? (1 Mark)¶

Question 7: Random Forest (4 Marks)¶

Question 7.1: Build a Random Forest Model (1 Mark)¶

Question 7.2: Check the performance of the model on the train and test data (2 Marks)¶

Question 7.3: What are some important features based on the Random Forest? (1 Mark)¶

Question 8: Conclude ANY FOUR key takeaways for business recommendations (4 Marks)¶

Happy Learning!¶

Question 2.1: Plot the histogram and box plot for the variable `Lead Time` using the hist_box function provided and write your insights. (1 Mark)¶

Question 2.2: Plot the histogram and box plot for the variable `Average Price per Room` using the hist_box function provided and write your insights. (1 Mark)¶

Question 3.2: Plot the stacked barplot for the variable `Market Segment Type` against the target variable `Booking Status` using the stacked_barplot function provided and write your insights. (1 Mark)¶

Question 3.3: Plot the stacked barplot for the variable `Repeated Guest` against the target variable `Booking Status` using the stacked_barplot function provided and write your insights. (1 Mark)¶