#To install xgboost library use - 
!pip install xgboost
### IMPORT: ------------------------------------
import scipy.stats as stats 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

import statsmodels.api as sm
#--Sklearn library--
from sklearn.model_selection import train_test_split # Sklearn package's randomized data splitting function

from sklearn import metrics

from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

# Libtune to tune model, get different metric scores

from sklearn.metrics import  classification_report, accuracy_score, precision_score, recall_score,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,plot_confusion_matrix #to plot confusion matric

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth',400)
# To supress numerical display in scientific notations
pd.set_option('display.float_format', lambda x: '%.5f' % x) 
warnings.filterwarnings('ignore') # To supress warnings
 # set the background for the graphs
plt.style.use('ggplot')
# For pandas profiling
from pandas_profiling import ProfileReport
print('Load Libraries-Done')
#!pip install xlrd
#!!pip install openpyxl

Requirement already satisfied: xgboost in c:\users\yogit\anaconda3\lib\site-packages (1.4.2)
Requirement already satisfied: scipy in c:\users\yogit\anaconda3\lib\site-packages (from xgboost) (1.6.2)
Requirement already satisfied: numpy in c:\users\yogit\anaconda3\lib\site-packages (from xgboost) (1.19.5)
Load Libraries-Done


#Reading the Excel file  used tourism.xlsx 
data_path='travel.csv'

#Loading the dataset - sheet_name parameter is used if there are tabs in the excel file
#df=pd.read_excel(data_path,sheet_name='Tourism')
df=pd.read_csv(data_path)

df_tour=df.copy()
print(f'There are {df_tour.shape[0]} rows and {df_tour.shape[1]} columns') # fstring

There are 4888 rows and 20 columns


df_tour.head()


df_tour.tail()


#### Understand the  dataset.
#get the size of dataframe
print ("Rows     : " , df_tour.shape[0])  #get number of rows/observations
print ("Columns  : " , df_tour.shape[1]) #get number of columns
print ("#"*40,"\n","Features : \n\n", df_tour.columns.tolist()) #get name of columns/features
print ("#"*40,"\nMissing values :\n\n", df_tour.isnull().sum().sort_values(ascending=False))
print( "#"*40,"\nPercent of missing :\n\n", round(df_tour.isna().sum() / df_tour.isna().count() * 100, 2).sort_values(ascending=False)) # looking at columns with most Missing Values

Rows     :  4888
Columns  :  20
######################################## 
 Features : 

 ['CustomerID', 'ProdTaken', 'Age', 'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched', 'PreferredPropertyStar', 'MaritalStatus', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'NumberOfChildrenVisiting', 'Designation', 'MonthlyIncome']
######################################## 
Missing values :

 DurationOfPitch             251
MonthlyIncome               233
Age                         226
NumberOfTrips               140
NumberOfChildrenVisiting     66
NumberOfFollowups            45
PreferredPropertyStar        26
TypeofContact                25
Designation                   0
OwnCar                        0
PitchSatisfactionScore        0
Passport                      0
CustomerID                    0
MaritalStatus                 0
ProdTaken                     0
NumberOfPersonVisiting        0
Gender                        0
Occupation                    0
CityTier                      0
ProductPitched                0
dtype: int64
######################################## 
Percent of missing :

 DurationOfPitch            5.14000
MonthlyIncome              4.77000
Age                        4.62000
NumberOfTrips              2.86000
NumberOfChildrenVisiting   1.35000
NumberOfFollowups          0.92000
PreferredPropertyStar      0.53000
TypeofContact              0.51000
Designation                0.00000
OwnCar                     0.00000
PitchSatisfactionScore     0.00000
Passport                   0.00000
CustomerID                 0.00000
MaritalStatus              0.00000
ProdTaken                  0.00000
NumberOfPersonVisiting     0.00000
Gender                     0.00000
Occupation                 0.00000
CityTier                   0.00000
ProductPitched             0.00000
dtype: float64


#### Check the data types of the columns for the dataset.
df_tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport                  4888 non-null   int64  
 15  PitchSatisfactionScore    4888 non-null   int64  
 16  OwnCar                    4888 non-null   int64  
 17  NumberOfChildrenVisiting  4822 non-null   float64
 18  Designation               4888 non-null   object 
 19  MonthlyIncome             4655 non-null   float64
dtypes: float64(7), int64(7), object(6)
memory usage: 763.9+ KB


#### Summary of the dataset.
df_tour.describe().T


#### Droping customer id
df_tour.drop(['CustomerID'],axis=1,inplace=True)


cat_cols = ['Designation','ProdTaken', 'OwnCar', 'Passport',
            'CityTier','MaritalStatus',
            'ProductPitched','Gender','Occupation','TypeofContact'
            ]


for i in cat_cols:
    print('Unique values in',i, 'are :')
    print(df_tour[i].value_counts())
    print('*'*50)

Unique values in Designation are :
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: Designation, dtype: int64
**************************************************
Unique values in ProdTaken are :
0    3968
1     920
Name: ProdTaken, dtype: int64
**************************************************
Unique values in OwnCar are :
1    3032
0    1856
Name: OwnCar, dtype: int64
**************************************************
Unique values in Passport are :
0    3466
1    1422
Name: Passport, dtype: int64
**************************************************
Unique values in CityTier are :
1    3190
3    1500
2     198
Name: CityTier, dtype: int64
**************************************************
Unique values in MaritalStatus are :
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: MaritalStatus, dtype: int64
**************************************************
Unique values in ProductPitched are :
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: ProductPitched, dtype: int64
**************************************************
Unique values in Gender are :
Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64
**************************************************
Unique values in Occupation are :
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: Occupation, dtype: int64
**************************************************
Unique values in TypeofContact are :
Self Enquiry       3444
Company Invited    1419
Name: TypeofContact, dtype: int64
**************************************************


df_tour.Gender.value_counts()

Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64


df_tour['Gender'] = df_tour['Gender'].apply(lambda x: 'Female' if x == 'Fe Male' else x)


df_tour.Gender.value_counts()

Male      2916
Female    1972
Name: Gender, dtype: int64


## Converting the data type of categorical features to 'category'

cat_cols = ['Designation','ProdTaken', 'OwnCar', 'Passport',
            'CityTier','MaritalStatus',
            'ProductPitched','Gender','Occupation','TypeofContact'
            ]
df_tour[cat_cols] = df_tour[cat_cols].astype('category')
df_tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   ProdTaken                 4888 non-null   category
 1   Age                       4662 non-null   float64 
 2   TypeofContact             4863 non-null   category
 3   CityTier                  4888 non-null   category
 4   DurationOfPitch           4637 non-null   float64 
 5   Occupation                4888 non-null   category
 6   Gender                    4888 non-null   category
 7   NumberOfPersonVisiting    4888 non-null   int64   
 8   NumberOfFollowups         4843 non-null   float64 
 9   ProductPitched            4888 non-null   category
 10  PreferredPropertyStar     4862 non-null   float64 
 11  MaritalStatus             4888 non-null   category
 12  NumberOfTrips             4748 non-null   float64 
 13  Passport                  4888 non-null   category
 14  PitchSatisfactionScore    4888 non-null   int64   
 15  OwnCar                    4888 non-null   category
 16  NumberOfChildrenVisiting  4822 non-null   float64 
 17  Designation               4888 non-null   category
 18  MonthlyIncome             4655 non-null   float64 
dtypes: category(10), float64(7), int64(2)
memory usage: 393.1 KB


df_tour.describe(include=['category']).T


df_tour.Age.describe()

count   4662.00000
mean      37.62227
std        9.31639
min       18.00000
25%       31.00000
50%       36.00000
75%       44.00000
max       61.00000
Name: Age, dtype: float64


df_tour['Agebin'] = pd.cut(df_tour['Age'], bins = [18,25, 31, 40, 50, 65], labels = ['18-25','26-30', '31-40', '41-50', '51-65'])


df_tour.Agebin.value_counts()

31-40    1722
41-50    1073
26-30     971
51-65     549
18-25     333
Name: Agebin, dtype: int64


df_tour.MonthlyIncome.describe()

count    4655.00000
mean    23619.85349
std      5380.69836
min      1000.00000
25%     20346.00000
50%     22347.00000
75%     25571.00000
max     98678.00000
Name: MonthlyIncome, dtype: float64


df_tour['Incomebin'] = pd.cut(df_tour['MonthlyIncome'], bins = [0,15000,20000, 25000, 30000,35000,40000,45000,50000,100000], labels = ['<15000', '<20000', '<25000', '<30000','<35000','<40000','<45000','<50000','<100000'])


df_tour.Incomebin.value_counts()

<25000     2257
<20000     1038
<30000      768
<35000      382
<40000      206
<15000        2
<100000       2
<45000        0
<50000        0
Name: Incomebin, dtype: int64


def dist_box(data):
 # function plots a combined graph for univariate analysis of continous variable 
 #to check spread, central tendency , dispersion and outliers  
    Name=data.name.upper()
    fig,(ax_box,ax_dis)  =plt.subplots(nrows=2,sharex=True,gridspec_kw = {"height_ratios": (.25, .75)},figsize=(8, 5))
    mean=data.mean()
    median=data.median()
    mode=data.mode().tolist()[0]
    sns.set_theme(style="white")
    fig.suptitle("SPREAD OF DATA FOR "+ Name  , fontsize=18, fontweight='bold')
    sns.boxplot(x=data,showmeans=True, orient='h',color="teal",ax=ax_box)
    ax_box.set(xlabel='')
     # just trying to make visualisation better. This will set background to white
    sns.despine(top=True,right=True,left=True) # to remove side line from graph
    sns.distplot(data,kde=False,color='purple',ax=ax_dis)
    ax_dis.axvline(mean, color='r', linestyle='--',linewidth=2)
    ax_dis.axvline(median, color='g', linestyle='-',linewidth=2)
    ax_dis.axvline(mode, color='y', linestyle='-',linewidth=2)
    plt.legend({'Mean':mean,'Median':median,'Mode':mode})


#select all quantitative columns for checking the spread
list_col=  df_tour.select_dtypes(include='number').columns.to_list()
for i in range(len(list_col)):
    dist_box(df_tour[list_col[i]])


# Making a list of all categorical variables

plt.figure(figsize=(14,20))

sns.set_theme(style="white") 
cols=['TypeofContact', 'CityTier', 'Occupation', 'Gender',
        'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched', 
        'PreferredPropertyStar', 'MaritalStatus', 'NumberOfTrips',
        'Passport', 'PitchSatisfactionScore',
        'OwnCar', 'NumberOfChildrenVisiting', 
        'Designation','Agebin','Incomebin']
for i, variable in enumerate(cols):
                     plt.subplot(9,2,i+1)
                     order = df_tour[variable].value_counts(ascending=False).index   
                     #sns.set_palette(list_palette[i]) # to set the palette
                     sns.set_palette('Set2')
                     ax=sns.countplot(x=df_tour[variable], data=df_tour )
                     sns.despine(top=True,right=True,left=True) # to remove side line from graph
                     for p in ax.patches:
                           percentage = '{:.1f}%'.format(100 * p.get_height()/len(df_tour[variable]))
                           x = p.get_x() + p.get_width() / 2 - 0.05
                           y = p.get_y() + p.get_height()
                           plt.annotate(percentage, (x, y),ha='center')
                     plt.tight_layout()
                     plt.title(cols[i].upper())


sns.set_palette(sns.color_palette("Set2", 8))
plt.figure(figsize=(15,10))
sns.heatmap(df_tour.corr(),annot=True)
plt.show()


sns.set_palette(sns.color_palette("Set1", 8))
sns.pairplot(df_tour, hue="ProdTaken",corner=True)
plt.show()


### Function to plot distributions and Boxplots of customers
def plot(x,target='ProdTaken'):
    fig,axs = plt.subplots(2,2,figsize=(12,10))
    axs[0, 0].set_title(f'Distribution of {x} \n of a customer who had Not taken Product',fontsize=12,fontweight='bold')
    sns.distplot(df_tour[(df_tour[target] == 0)][x],ax=axs[0,0],color='teal')
    axs[0, 1].set_title(f"Distribution of {x}\n of a customer who had taken Product",fontsize=12,fontweight='bold')
    sns.distplot(df_tour[(df_tour[target] == 1)][x],ax=axs[0,1],color='orange')
    axs[1,0].set_title(f'Boxplot of {x} w.r.t Product taken',fontsize=12,fontweight='bold')
    
    line = plt.Line2D((.1,.9),(.5,.5), color='grey', linewidth=1.5,linestyle='--')
    fig.add_artist(line)
   
    sns.boxplot(df_tour[target],df_tour[x],ax=axs[1,0],palette='gist_rainbow',showmeans=True)
    axs[1,1].set_title(f'Boxplot of {x} w.r.t Product Taken - Without outliers',fontsize=12,fontweight='bold')
    sns.boxplot(df_tour[target],df_tour[x],ax=axs[1,1],showfliers=False,palette='gist_rainbow',showmeans=True) #turning off outliers from boxplot
    sns.despine(top=True,right=True,left=True) # to remove side line from graph
    plt.tight_layout(pad=4)
    plt.show()


#select all quantitative columns for checking the spread
#list_col=  ['Age','DurationOfPitch','MonthlyIncome']
list_col=df_tour.select_dtypes(include='number').columns.to_list()
#print(list_col)
#plt.figure(figsize=(14,23))
for j in range(len(list_col)):
   plot(list_col[j])


sns.set_palette(sns.color_palette("nipy_spectral", 8))

sns.swarmplot(y='MonthlyIncome',x='Occupation',hue='ProdTaken',data=df_tour)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.title('Monthly Income vs Occupation')

Text(0.5, 1.0, 'Monthly Income vs Occupation')


sns.set_palette(sns.color_palette("nipy_spectral", 8))
sns.scatterplot(y='MonthlyIncome',x='Age',hue='ProdTaken',data=df_tour)
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.title('Monthly Income vs Age')

Text(0.5, 1.0, 'Monthly Income vs Age')


df_tour.isnull().sum()

ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
Agebin                      240
Incomebin                   233
dtype: int64


df_tour['TypeofContact'].value_counts()

Self Enquiry       3444
Company Invited    1419
Name: TypeofContact, dtype: int64


df_tour['TypeofContact'].mode()

df_tour['TypeofContact']=df_tour['TypeofContact'].fillna('Self Enquiry')


#Verify if there are null values
df_tour["TypeofContact"].isnull().sum()

0


df_tour.groupby(["Gender",'ProductPitched'])["NumberOfFollowups"].median()

df_tour['NumberOfFollowups']=df_tour.groupby(["Gender",'ProductPitched'])["NumberOfFollowups"].apply(lambda x:x.fillna(x.median()))


#Look at few rows where values  is missing
df_tour[df_tour["PreferredPropertyStar"].isnull()]


df_tour['PreferredPropertyStar']=df_tour.groupby(["Designation"])["PreferredPropertyStar"].apply(lambda x:x.fillna(x.median()))

df_tour[df_tour["PreferredPropertyStar"].isnull()]


df_tour.groupby(['ProductPitched','NumberOfFollowups'], as_index=False)["DurationOfPitch"].median()

# Impute missing Duration of pitch with median value
df_tour["DurationOfPitch"] = df_tour.groupby(["ProductPitched",'NumberOfFollowups'])["DurationOfPitch"].apply(
    lambda x: x.fillna(x.median())
)


df_tour[df_tour["DurationOfPitch"].isnull()]

df_tour.isnull().sum()

ProdTaken                     0
Age                         226
TypeofContact                 0
CityTier                      0
DurationOfPitch               0
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups             0
ProductPitched                0
PreferredPropertyStar         0
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
Agebin                      240
Incomebin                   233
dtype: int64


df_tour.groupby(['MaritalStatus'])["NumberOfTrips"].median()

MaritalStatus
Divorced    3.00000
Married     3.00000
Single      2.00000
Unmarried   3.00000
Name: NumberOfTrips, dtype: float64


# Impute missing NumberOfTrips of pitch with median value
df_tour["NumberOfTrips"] = df_tour.groupby(["MaritalStatus"])["NumberOfTrips"].apply(
    lambda x: x.fillna(x.median())
)


df_tour[df_tour["NumberOfChildrenVisiting"].isnull()].head(10)


# Impute missing NumberOfChildrenVisited with 0
df_tour['NumberOfChildrenVisiting']=df_tour['NumberOfChildrenVisiting'].fillna(0)


df_tour[df_tour["Age"].isnull()].head(10)


df_tour.groupby(["Designation", "Gender","MaritalStatus"])["Age"].median()

# Impute missing Age with median value
df_tour["Age"] = df_tour.groupby(["Designation", "Gender","MaritalStatus"])["Age"].apply(
    lambda x: x.fillna(x.median())
)


df_tour.groupby(["Occupation",'Designation','Gender'])["MonthlyIncome"].median()

Occupation      Designation     Gender
Free Lancer     AVP             Female           NaN
                                Male             NaN
                Executive       Female           NaN
                                Male     18929.00000
                Manager         Female           NaN
                                Male             NaN
                Senior Manager  Female           NaN
                                Male             NaN
                VP              Female           NaN
                                Male             NaN
Large Business  AVP             Female   31872.00000
                                Male     29959.00000
                Executive       Female   20748.00000
                                Male     20743.50000
                Manager         Female   22198.50000
                                Male     21997.00000
                Senior Manager  Female   28266.00000
                                Male     26838.50000
                VP              Female   36583.00000
                                Male     36071.00000
Salaried        AVP             Female   31890.00000
                                Male     32505.50000
                Executive       Female   20748.00000
                                Male     20371.50000
                Manager         Female   22826.00000
                                Male     22873.50000
                Senior Manager  Female   26713.00000
                                Male     26052.50000
                VP              Female   35846.00000
                                Male     35971.00000
Small Business  AVP             Female   32171.50000
                                Male     32386.00000
                Executive       Female   20685.00000
                                Male     20582.00000
                Manager         Female   23071.00000
                                Male     22984.00000
                Senior Manager  Female   26383.50000
                                Male     26297.00000
                VP              Female   34845.00000
                                Male     35868.50000
Name: MonthlyIncome, dtype: float64


df_tour["MonthlyIncome"]=df_tour.groupby(["Occupation",'Designation','Gender'])["MonthlyIncome"].apply(
    lambda x: x.fillna(x.median())
)


df_tour.isnull().sum() #verify if all missing values have been treated

ProdTaken                     0
Age                           0
TypeofContact                 0
CityTier                      0
DurationOfPitch               0
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups             0
ProductPitched                0
PreferredPropertyStar         0
MaritalStatus                 0
NumberOfTrips                 0
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting      0
Designation                   0
MonthlyIncome                 0
Agebin                      240
Incomebin                   233
dtype: int64


df_tour.Age.describe()

count   4888.00000
mean      37.41356
std        9.16280
min       18.00000
25%       31.00000
50%       36.00000
75%       43.00000
max       61.00000
Name: Age, dtype: float64


df_tour['Agebin'] = pd.cut(df_tour['Age'], bins = [15,25, 31, 40, 50, 70], labels = ['15-25','26-30', '31-40', '41-50', '51-70'])


df_tour.Agebin.value_counts()

31-40    1894
41-50    1073
26-30    1025
51-70     549
15-25     347
Name: Agebin, dtype: int64


df_tour.MonthlyIncome.describe()

count    4888.00000
mean    23543.85577
std      5267.65145
min      1000.00000
25%     20438.25000
50%     22393.00000
75%     25424.75000
max     98678.00000
Name: MonthlyIncome, dtype: float64


df_tour['Incomebin'] = pd.cut(df_tour['MonthlyIncome'], bins = [0,15000,20000, 25000, 30000,35000,40000,45000,50000,100000], labels = ['<15000', '<20000', '<25000', '<30000','<35000','<40000','<45000','<50000','<100000'])


df_tour.Incomebin.value_counts()

<25000     2490
<20000     1038
<30000      768
<35000      382
<40000      206
<15000        2
<100000       2
<45000        0
<50000        0
Name: Incomebin, dtype: int64


df_tour.isnull().sum()

ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
Agebin                      0
Incomebin                   0
dtype: int64


cust_prof=df_tour[df_tour['ProdTaken']==1]


#plt.figure(figsize= [12,12])

sns.countplot(x="ProductPitched", data=cust_prof)
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x='Agebin',hue='ProductPitched',data=cust_prof).set_title('Agebin Product wise')
#sns.catplot(x='Agebin',hue='ProductPitched',col='ProdTaken',data=df_tour,kind='count')
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x='Incomebin',hue='ProductPitched',data=cust_prof).set_title('Incomebin Product Pitched')
#sns.catplot(x='Incomebin',hue='ProductPitched',col='ProdTaken',data=df_tour,kind='count',aspect=1.5)
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.barplot(y='Age',x='ProductPitched',data=cust_prof).set_title('Age vs Product Pitched')
#sns.catplot(y='Age',x='ProductPitched',col='ProdTaken',data=df_tour,kind='bar',aspect=1.5)
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x="ProductPitched", data=cust_prof,  hue="Occupation")
#sns.catplot(hue='Occupation',x='ProductPitched',col='ProdTaken',data=df_tour,kind='count',aspect=1.5)
sns.despine(top=True,right=True,left=True) # to remove side line from graph


#sns.catplot(hue='Gender',x='ProductPitched',col='ProdTaken',data=df_tour,kind='count',aspect=1.5)
sns.countplot(x="ProductPitched", data=cust_prof,  hue="Gender")
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x="ProductPitched", data=cust_prof,  hue="MaritalStatus")
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.barplot(y='MonthlyIncome',x='ProductPitched',data=cust_prof).set_title('Monthly Income vs Product Pitched')
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.barplot(x='Designation',y='MonthlyIncome',data=cust_prof,hue='ProductPitched').set_title('Designation vs Income')
sns.despine(top=True,right=True,left=True) # to remove side line from graph
plt.legend(bbox_to_anchor=(1, 1))

<matplotlib.legend.Legend at 0x7f7831a23b50>


sns.countplot(x="ProductPitched", data=cust_prof,  hue="PreferredPropertyStar")
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x="ProductPitched", data=cust_prof,  hue="OwnCar")
sns.despine(top=True,right=True,left=True) # to remove side line from graph


sns.countplot(x="ProductPitched", data=cust_prof,  hue="CityTier")
sns.despine(top=True,right=True,left=True) # to remove side line from graph


cust_prof.groupby(['ProductPitched']).agg({'MonthlyIncome':{'mean','min','max'},'Age':{'mean','min','max'}})


cust_prof[cust_prof['ProductPitched']=='Basic'].describe(include='all').T


cust_prof[cust_prof['ProductPitched']=='Deluxe'].describe(include='all').T


cust_prof[cust_prof['ProductPitched']=='King'].describe(include='all').T


cust_prof[cust_prof['ProductPitched']=='Standard'].describe(include='all').T


cust_prof[cust_prof['ProductPitched']=='Super Deluxe'].describe(include='all').T


df_tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   ProdTaken                 4888 non-null   category
 1   Age                       4888 non-null   float64 
 2   TypeofContact             4888 non-null   category
 3   CityTier                  4888 non-null   category
 4   DurationOfPitch           4888 non-null   float64 
 5   Occupation                4888 non-null   category
 6   Gender                    4888 non-null   category
 7   NumberOfPersonVisiting    4888 non-null   int64   
 8   NumberOfFollowups         4888 non-null   float64 
 9   ProductPitched            4888 non-null   category
 10  PreferredPropertyStar     4888 non-null   float64 
 11  MaritalStatus             4888 non-null   category
 12  NumberOfTrips             4888 non-null   float64 
 13  Passport                  4888 non-null   category
 14  PitchSatisfactionScore    4888 non-null   int64   
 15  OwnCar                    4888 non-null   category
 16  NumberOfChildrenVisiting  4888 non-null   float64 
 17  Designation               4888 non-null   category
 18  MonthlyIncome             4888 non-null   float64 
 19  Agebin                    4888 non-null   category
 20  Incomebin                 4888 non-null   category
dtypes: category(12), float64(7), int64(2)
memory usage: 403.2 KB


## Function to plot stacked bar chart
def stacked_plot(x):
    sns.set_palette(sns.color_palette("nipy_spectral", 8))
    tab1 = pd.crosstab(x,df_tour['ProdTaken'],margins=True)
    print(tab1)
    print('-'*120)
    tab = pd.crosstab(x,df_tour['ProdTaken'],normalize='index')
    tab.plot(kind='bar',stacked=True,figsize=(7,4))
    plt.xticks(rotation=360)
    labels=["No","Yes"]
    plt.legend(loc='lower left', frameon=False,)
    plt.legend(loc="upper left", labels=labels,title="Product Taken",bbox_to_anchor=(1,1))
    sns.despine(top=True,right=True,left=True) # to remove side line from graph
    #plt.legend(labels)
    plt.show()


list_columns=['TypeofContact','CityTier','Occupation','Gender','NumberOfPersonVisiting',
              'ProductPitched','PreferredPropertyStar','NumberOfTrips','MaritalStatus','Passport','OwnCar','Designation','Agebin','Incomebin','PitchSatisfactionScore','NumberOfFollowups']
for i, variable in enumerate(list_columns):
       stacked_plot(df_tour[variable])

ProdTaken           0    1   All
TypeofContact                   
Company Invited  1109  310  1419
Self Enquiry     2859  610  3469
All              3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken     0    1   All
CityTier                  
1          2670  520  3190
2           152   46   198
3          1146  354  1500
All        3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken          0    1   All
Occupation                     
Free Lancer        0    2     2
Large Business   314  120   434
Salaried        1954  414  2368
Small Business  1700  384  2084
All             3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken     0    1   All
Gender                    
Female     1630  342  1972
Male       2338  578  2916
All        3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken                  0    1   All
NumberOfPersonVisiting                 
1                         39    0    39
2                       1151  267  1418
3                       1942  460  2402
4                        833  193  1026
5                          3    0     3
All                     3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken          0    1   All
ProductPitched                 
Basic           1290  552  1842
Deluxe          1528  204  1732
King             210   20   230
Standard         618  124   742
Super Deluxe     322   20   342
All             3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken                 0    1   All
PreferredPropertyStar                 
3.0                    2531  488  3019
4.0                     731  182   913
5.0                     706  250   956
All                    3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken         0    1   All
NumberOfTrips                 
1.0             508  112   620
2.0            1186  305  1491
3.0             969  223  1192
4.0             417   61   478
5.0             396   62   458
6.0             258   64   322
7.0             156   62   218
8.0              76   29   105
19.0              0    1     1
20.0              0    1     1
21.0              1    0     1
22.0              1    0     1
All            3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken         0    1   All
MaritalStatus                 
Divorced        826  124   950
Married        2014  326  2340
Single          612  304   916
Unmarried       516  166   682
All            3968  920  4888
------------------------------------------------------------------------------------------------------------------------

ProdTaken     0    1   All
Passport                  
0          3040  426  3466
1           928  494  1422
All        3968  920  4888
------------------------------------------------------------------------------------------------------------------------


Q1 = df_tour.quantile(0.25)             #To find the 25th percentile and 75th percentile.
Q3 = df_tour.quantile(0.75)

IQR = Q3 - Q1                           #Inter Quantile Range (75th perentile - 25th percentile)

lower=Q1-1.5*IQR                        #Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper=Q3+1.5*IQR


((df_tour.select_dtypes(include=['float64','int64'])<lower) | (df_tour.select_dtypes(include=['float64','int64'])>upper)).sum()/len(df_tour)*100

Age                        0.00000
DurationOfPitch            2.29133
NumberOfPersonVisiting     0.06137
NumberOfFollowups          6.38298
PreferredPropertyStar      0.00000
NumberOfTrips              2.22995
PitchSatisfactionScore     0.00000
NumberOfChildrenVisiting   0.00000
MonthlyIncome              7.52864
dtype: float64


numeric_columns = ['MonthlyIncome','NumberOfTrips']
# outlier detection using boxplot
plt.figure(figsize=(20,30))

for i, variable in enumerate(numeric_columns):
                     plt.subplot(4,4,i+1)
                     plt.boxplot(df_tour[variable],whis=1.5)
                     plt.tight_layout()
                     plt.title(variable)

plt.show()


# Check MonthlyIncome extreme values
df_tour.sort_values(by=["MonthlyIncome"],ascending = False).head(5)


# Check NumberOfTrips extreme values
df_tour.sort_values(by=["NumberOfTrips"],ascending = False).head(5)


df_tour[(df_tour.MonthlyIncome>40000) | (df_tour.MonthlyIncome<12000)]


#Dropping observaions with duration of pitch greater than 40. There are just 2 such observations
df_tour.drop(index=df_tour[df_tour.DurationOfPitch>37].index,inplace=True)

#Dropping observation with monthly income less than 12000 or greater than 40000. There are just 4 such observations
df_tour.drop(index=df_tour[(df_tour.MonthlyIncome>40000) | (df_tour.MonthlyIncome<12000)].index,inplace=True)

#Dropping observations with number of trips greater than 8. There are just 4 such observations
df_tour.drop(index=df_tour[df_tour.NumberOfTrips>10].index,inplace=True)


df_tour.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4878 entries, 0 to 4887
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   ProdTaken                 4878 non-null   category
 1   Age                       4878 non-null   float64 
 2   TypeofContact             4878 non-null   category
 3   CityTier                  4878 non-null   category
 4   DurationOfPitch           4878 non-null   float64 
 5   Occupation                4878 non-null   category
 6   Gender                    4878 non-null   category
 7   NumberOfPersonVisiting    4878 non-null   int64   
 8   NumberOfFollowups         4878 non-null   float64 
 9   ProductPitched            4878 non-null   category
 10  PreferredPropertyStar     4878 non-null   float64 
 11  MaritalStatus             4878 non-null   category
 12  NumberOfTrips             4878 non-null   float64 
 13  Passport                  4878 non-null   category
 14  PitchSatisfactionScore    4878 non-null   int64   
 15  OwnCar                    4878 non-null   category
 16  NumberOfChildrenVisiting  4878 non-null   float64 
 17  Designation               4878 non-null   category
 18  MonthlyIncome             4878 non-null   float64 
 19  Agebin                    4878 non-null   category
 20  Incomebin                 4878 non-null   category
dtypes: category(12), float64(7), int64(2)
memory usage: 440.4 KB


# Separating target column
X = df_tour.drop(['ProdTaken','PitchSatisfactionScore','ProductPitched','NumberOfFollowups','DurationOfPitch','Agebin','Incomebin'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df_tour['ProdTaken']


# Splitting the data into train and test sets in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1,stratify=y)
X_train.shape, X_test.shape

((3414, 22), (1464, 22))


def make_confusion_matrix(y_actual,y_predict,title):
    '''Plot confusion matrix'''
    fig, ax = plt.subplots(1, 1)
    
    cm = confusion_matrix(y_actual, y_predict, labels=[0,1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=["No","Yes"])
    disp.plot(cmap='Greens',colorbar=True,ax=ax)
    
    ax.set_title(title)
    plt.tick_params(axis=u'both', which=u'both',length=0)
    plt.grid(b=None,axis='both',which='both',visible=False)
    plt.show()


def get_metrics_score(model,X_train_df,X_test_df,y_train_pass,y_test_pass,flag=True):
    '''
    Function to calculate different metric scores of the model - Accuracy, Recall, Precision, and F1 score
    model: classifier to predict values of X
    train, test: Independent features
    train_y,test_y: Dependent variable
    threshold: thresold for classifiying the observation as 1
    flag: If the flag is set to True then only the print statements showing different will be displayed. The default value is set to True.
    roc: If the roc is set to True then only roc score will be displayed. The default value is set to False.
    '''
    # defining an empty list to store train and test results
    score_list=[] 
    pred_train = model.predict(X_train_df)
    pred_test = model.predict(X_test_df)
    pred_train = np.round(pred_train)
    pred_test = np.round(pred_test)
    train_acc = accuracy_score(y_train_pass,pred_train)
    test_acc = accuracy_score(y_test_pass,pred_test)
    train_recall = recall_score(y_train_pass,pred_train)
    test_recall = recall_score(y_test_pass,pred_test)
    train_precision = precision_score(y_train_pass,pred_train)
    test_precision = precision_score(y_test_pass,pred_test)
    train_f1 = f1_score(y_train_pass,pred_train)
    test_f1 = f1_score(y_test_pass,pred_test)
    score_list.extend((train_acc,test_acc,train_recall,test_recall,train_precision,test_precision,train_f1,test_f1))
    if flag == True: 
          metric_names = ['Train_Accuracy', 'Test_Accuracy', 'Train_Recall', 'Test_Recall','Train_Precision',
                          'Test_Precision', 'Train_F1-Score', 'Test_F1-Score']
          cols = ['Metric', 'Score']
          records = [(name, score) for name, score in zip(metric_names, score_list)]
          display(pd.DataFrame.from_records(records, columns=cols, index='Metric').T)
          make_confusion_matrix(y_train_pass,pred_train,"Confusion Matrix for Train")     
          make_confusion_matrix(y_test_pass,pred_test,"Confusion Matrix for Test") 
    return score_list # returning the list with train and test scores


# # defining empty lists to add train and test results 
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []
f1_train = []
f1_test = []

def add_score_model(score):
     '''add score of modelto list'''
     acc_train.append(score[0])
     acc_test.append(score[1])
     recall_train.append(score[2])
     recall_test.append(score[3])
     precision_train.append(score[4])
     precision_test.append(score[5])
     f1_train.append(score[6])
     f1_test.append(score[7])


dtree=DecisionTreeClassifier(random_state=1, class_weight={0:0.20, 1:0.80})
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight={0: 0.2, 1: 0.8}, random_state=1)


dtree_score=get_metrics_score(dtree,X_train,X_test,y_train,y_test)
add_score_model(dtree_score)


#Fitting the model
bagging_classifier = BaggingClassifier(random_state=1, verbose=1)
bagging_classifier.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished

BaggingClassifier(random_state=1, verbose=1)


bagging_score=get_metrics_score(bagging_classifier,X_train,X_test,y_train,y_test)
add_score_model(bagging_score)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


rf_estimator = RandomForestClassifier(random_state=1)
rf_estimator.fit(X_train,y_train)

RandomForestClassifier(random_state=1)


score_list_rf=get_metrics_score(rf_estimator,X_train,X_test,y_train,y_test)
add_score_model(score_list_rf)


comparison_frame = pd.DataFrame({'Model':['Decision Tree',
                                          'Bagging Classifier',
                                          'Random Forest',
                                          ], 
                                          'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
                                          'Train_Recall':recall_train,'Test_Recall':recall_test,
                                          'Train_Precision':precision_train,'Test_Precision':precision_test,
                                          'Train_F1':f1_train,
                                          'Test_F1':f1_test  }) 

#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)


#Choose the type of classifier. 
dtree_tuned = DecisionTreeClassifier(class_weight={0:0.20,1:0.80},random_state=1)

# Grid of parameters to choose from
parameters = {'max_depth': [1,4,7,15], 
              'min_samples_leaf': [2,3,5],
              'max_leaf_nodes' : [ 5,7,10,15]
              
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(dtree_tuned, parameters, scoring=scorer,n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
dtree_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
dtree_tuned.fit(X_train, y_train)

DecisionTreeClassifier(class_weight={0: 0.2, 1: 0.8}, max_depth=7,
                       max_leaf_nodes=15, min_samples_leaf=2, random_state=1)


score_tune_dt=get_metrics_score(dtree_tuned,X_train,X_test,y_train,y_test)
add_score_model(score_tune_dt) # add score to dataframe


plt.figure(figsize=(15,10))
feature_names = X_train.columns
out = tree.plot_tree(dtree_tuned,feature_names=feature_names,filled=True,fontsize=9,class_names=['1','0'])
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor('black')
        arrow.set_linewidth(1)
plt.show()


feature_names = X_train.columns
importances = dtree_tuned.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(dtree_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
Passport_1                 0.31385
Designation_Executive      0.23321
CityTier_3                 0.13094
PreferredPropertyStar      0.08738
MaritalStatus_Single       0.05706
NumberOfTrips              0.05188
MonthlyIncome              0.04156
MaritalStatus_Married      0.03142
Designation_Senior Manager 0.02659
Age                        0.02611
Designation_Manager        0.00000
OwnCar_1                   0.00000
MaritalStatus_Unmarried    0.00000
Occupation_Small Business  0.00000
Gender_Male                0.00000
NumberOfPersonVisiting     0.00000
Occupation_Salaried        0.00000
Occupation_Large Business  0.00000
CityTier_2                 0.00000
TypeofContact_Self Enquiry 0.00000
NumberOfChildrenVisiting   0.00000
Designation_VP             0.00000


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


# Choose the type of classifier. 
rf_tuned = RandomForestClassifier(class_weight={0:0.20,1:0.80},random_state=1)

parameters = { "max_depth":[5,9,15],
               "n_estimators": [150,200,250,500],
               "min_samples_leaf": np.arange(5, 10),
                "max_features": ['auto'],
                "max_samples": np.arange(0.3,0.5, 0.7)
              }
# parameters = {"n_estimators": [50,80,150], 
#               "max_depth": [1,2,3], 
#               "min_samples_split": [3,4,6,7],"max_features": ['auto'],
#              }
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(rf_tuned, parameters, scoring=scorer,cv=5,n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
rf_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
rf_tuned.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 0.2, 1: 0.8}, max_depth=5,
                       max_samples=0.3, min_samples_leaf=9, n_estimators=500,
                       random_state=1)


score_tune_rt=get_metrics_score(rf_tuned,X_train,X_test,y_train,y_test)
add_score_model(score_tune_rt)


feature_names = X_train.columns
importances = rf_tuned.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(rf_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
Passport_1                 0.19115
MonthlyIncome              0.12971
Age                        0.12861
Designation_Executive      0.12658
MaritalStatus_Single       0.06428
CityTier_3                 0.05120
NumberOfTrips              0.04778
PreferredPropertyStar      0.04145
Designation_Manager        0.04131
MaritalStatus_Married      0.03720
NumberOfChildrenVisiting   0.01648
NumberOfPersonVisiting     0.01608
MaritalStatus_Unmarried    0.01472
Gender_Male                0.01430
TypeofContact_Self Enquiry 0.01405
Designation_Senior Manager 0.01300
Occupation_Small Business  0.01155
Occupation_Salaried        0.01133
OwnCar_1                   0.00947
Occupation_Large Business  0.00897
Designation_VP             0.00679
CityTier_2                 0.00400


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


# Choose the type of classifier. 

bagging_estimator_tuned = BaggingClassifier(DecisionTreeClassifier(class_weight={0:0.20,1:0.80},random_state=1),random_state=1)

# Grid of parameters to choose from
parameters = {'max_samples': [0.7,0.8,0.9,1], 
              'max_features': [0.7,0.8,0.9,1],
              'n_estimators' : [10,20,30,40,50],
             }
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(bagging_estimator_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
bagging_estimator_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data.
bagging_estimator_tuned.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 0.2,
                                                                      1: 0.8},
                                                        random_state=1),
                  max_features=0.9, max_samples=0.9, n_estimators=30,
                  random_state=1)


bagging_tuned=get_metrics_score(bagging_estimator_tuned,X_train,X_test,y_train,y_test)
add_score_model(bagging_tuned)


comparison_frame = pd.DataFrame({'Model':['Decision Tree',
                                          'Bagging Classifier',
                                          'Random Forest',
                                          'Tuned Decision Tree',
                                          'Tuned Random Forest',
                                          'Tuned Bagging Classifier'], 
                                          'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
                                          'Train_Recall':recall_train,'Test_Recall':recall_test,
                                          'Train_Precision':precision_train,'Test_Precision':precision_test,
                                          'Train_F1':f1_train,
                                          'Test_F1':f1_test  }) 

#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)


adaboost = AdaBoostClassifier(random_state=1)
adaboost.fit(X_train,y_train)

AdaBoostClassifier(random_state=1)


adaboost_score=get_metrics_score(adaboost,X_train,X_test,y_train,y_test)
add_score_model(adaboost_score)


gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)

GradientBoostingClassifier(random_state=1)


gbc_score=get_metrics_score(gbc,X_train,X_test,y_train,y_test)
add_score_model(gbc_score)


xgb = XGBClassifier(random_state=1,eval_metric='logloss')
xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)


xgb_score=get_metrics_score(xgb,X_train,X_test,y_train,y_test)
add_score_model(xgb_score)


feature_names = X_train.columns
importances = xgb.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(xgb.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
Designation_Executive      0.19272
Passport_1                 0.10137
CityTier_3                 0.06134
MaritalStatus_Unmarried    0.05306
MaritalStatus_Single       0.05169
TypeofContact_Self Enquiry 0.04979
Occupation_Large Business  0.04821
PreferredPropertyStar      0.04246
NumberOfTrips              0.03685
Designation_Senior Manager 0.03555
MaritalStatus_Married      0.03540
Age                        0.03488
Gender_Male                0.03235
CityTier_2                 0.03195
MonthlyIncome              0.02822
Occupation_Small Business  0.02735
NumberOfPersonVisiting     0.02626
Designation_Manager        0.02608
NumberOfChildrenVisiting   0.02314
Occupation_Salaried        0.02313
OwnCar_1                   0.02285
Designation_VP             0.01536


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


comparison_frame = pd.DataFrame({'Model':['Decision Tree',
                                          'Random Forest',
                                          'Bagging Classifier',
                                          'Tuned Decision Tree',
                                          'Tuned Random Forest',
                                          'Tuned Bagging Classifier',
                                          'AdaBoost',
                                          'Gradient Boost',
                                          'XGboost',
                                          ], 
                                          'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
                                          'Train_Recall':recall_train,'Test_Recall':recall_test,
                                          'Train_Precision':precision_train,'Test_Precision':precision_test,
                                           'Train_F1':f1_train,
                                          'Test_F1':f1_test  }) 

#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)


# Choose the type of classifier. 
abc_tuned = AdaBoostClassifier(random_state=1)

# Grid of parameters to choose from
## add from article
parameters = {
    #Let's try different max_depth for base_estimator
    "base_estimator":[DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2)],
    "n_estimators": np.arange(10,50,100),
    "learning_rate":np.arange(0.1,1,0.1)
}

# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(abc_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
abc_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data.
abc_tuned.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.9, n_estimators=10, random_state=1)


abc_tuned_score=get_metrics_score(abc_tuned,X_train,X_test,y_train,y_test)
add_score_model(abc_tuned_score)


feature_names = X_train.columns
importances = abc_tuned.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(abc_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
Age                        0.18765
NumberOfTrips              0.11939
MonthlyIncome              0.08500
Designation_Executive      0.08263
Passport_1                 0.07928
Designation_Senior Manager 0.07538
CityTier_3                 0.07196
MaritalStatus_Single       0.05326
PreferredPropertyStar      0.04754
Occupation_Salaried        0.03610
MaritalStatus_Unmarried    0.03589
TypeofContact_Self Enquiry 0.03406
NumberOfPersonVisiting     0.03153
Gender_Male                0.03048
Occupation_Small Business  0.02984
MaritalStatus_Married      0.00000
Occupation_Large Business  0.00000
OwnCar_1                   0.00000
CityTier_2                 0.00000
Designation_Manager        0.00000
NumberOfChildrenVisiting   0.00000
Designation_VP             0.00000


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


# Choose the type of classifier. 
gbc_tuned = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)

# Grid of parameters to choose from
## add from article
parameters = {
    "n_estimators": [100,150,200,250],
    "subsample":[0.8,0.9,1],
    "max_features":[0.7,0.8,0.9,1]
}

# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(gbc_tuned, parameters, scoring=acc_scorer,cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
gbc_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data.
gbc_tuned.fit(X_train, y_train)

GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
                           max_features=0.9, n_estimators=250, random_state=1,
                           subsample=0.8)


gbc_tuned_score=get_metrics_score(gbc_tuned,X_train,X_test,y_train,y_test)
add_score_model(gbc_tuned_score)


feature_names = X_train.columns
importances = gbc_tuned.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(gbc_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
MonthlyIncome              0.29331
Age                        0.14855
Passport_1                 0.12045
Designation_Executive      0.09364
NumberOfTrips              0.05815
PreferredPropertyStar      0.04761
CityTier_3                 0.04426
MaritalStatus_Single       0.04385
MaritalStatus_Unmarried    0.02096
Occupation_Large Business  0.01820
Designation_Senior Manager 0.01778
Designation_Manager        0.01606
MaritalStatus_Married      0.01191
Gender_Male                0.01187
TypeofContact_Self Enquiry 0.01175
CityTier_2                 0.00896
NumberOfPersonVisiting     0.00842
NumberOfChildrenVisiting   0.00761
Occupation_Salaried        0.00663
Occupation_Small Business  0.00568
OwnCar_1                   0.00311
Designation_VP             0.00123


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


# Choose the type of classifier. 
xgb_tuned = XGBClassifier(random_state=1,eval_metric='logloss')

# Grid of parameters to choose from
## add from

parameters = {
    "n_estimators": np.arange(10,100,20),
    "scale_pos_weight":[0,5],
    "colsample_bylevel":[0.5,1],
    "learning_rate":[0.001,0.01,0.1,0.5]
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search
grid_obj = GridSearchCV(xgb_tuned, parameters,scoring=acc_scorer,cv=5,n_jobs=-1)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
xgb_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data.
xgb_tuned.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=90, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=5, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)


xgb_tuned_score=get_metrics_score(xgb_tuned,X_train,X_test,y_train,y_test)
add_score_model(xgb_tuned_score)


feature_names = X_train.columns
importances = xgb_tuned.feature_importances_
indices = np.argsort(importances)
print (pd.DataFrame(xgb_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))

                               Imp
Passport_1                 0.16037
Designation_Executive      0.12216
CityTier_3                 0.06087
MaritalStatus_Single       0.05643
Designation_Senior Manager 0.05249
Designation_Manager        0.05182
PreferredPropertyStar      0.04584
MaritalStatus_Unmarried    0.04319
Age                        0.03845
MaritalStatus_Married      0.03805
MonthlyIncome              0.03623
CityTier_2                 0.03598
NumberOfTrips              0.03371
Occupation_Large Business  0.02990
TypeofContact_Self Enquiry 0.02968
Occupation_Small Business  0.02870
Designation_VP             0.02722
Occupation_Salaried        0.02520
Gender_Male                0.02504
OwnCar_1                   0.02117
NumberOfPersonVisiting     0.01927
NumberOfChildrenVisiting   0.01822


plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


estimators=[('Decision Tree', dtree_tuned),('Random Forest', rf_tuned),('AdaBoost',abc_tuned),
           ('Gradient Boosting', gbc_tuned)]
final_estimator=XGBClassifier(random_state=1,eval_metric='logloss')


stacking_estimator=StackingClassifier(estimators=estimators, final_estimator=final_estimator,cv=5,n_jobs=-1)
stacking_estimator.fit(X_train,y_train)

StackingClassifier(cv=5,
                   estimators=[('Decision Tree',
                                DecisionTreeClassifier(class_weight={0: 0.2,
                                                                     1: 0.8},
                                                       max_depth=7,
                                                       max_leaf_nodes=15,
                                                       min_samples_leaf=2,
                                                       random_state=1)),
                               ('Random Forest',
                                RandomForestClassifier(class_weight={0: 0.2,
                                                                     1: 0.8},
                                                       max_depth=5,
                                                       max_samples=0.3,
                                                       min_samples_leaf=9,
                                                       n_estimators=500,
                                                       random_state=1)),
                               ('AdaBoost',
                                AdaBoostClassifier(base...
                                                 importance_type='gain',
                                                 interaction_constraints=None,
                                                 learning_rate=None,
                                                 max_delta_step=None,
                                                 max_depth=None,
                                                 min_child_weight=None,
                                                 missing=nan,
                                                 monotone_constraints=None,
                                                 n_estimators=100, n_jobs=None,
                                                 num_parallel_tree=None,
                                                 random_state=1, reg_alpha=None,
                                                 reg_lambda=None,
                                                 scale_pos_weight=None,
                                                 subsample=None,
                                                 tree_method=None,
                                                 validate_parameters=None,
                                                 verbosity=None),
                   n_jobs=-1)


stacking_estimator=get_metrics_score(stacking_estimator,X_train,X_test,y_train,y_test)


add_score_model(stacking_estimator)


comparison_frame = pd.DataFrame({'Model':['Decision Tree',
                                          'Random Forest',
                                          'Bagging Classifier',
                                          'Tuned Decision Tree',
                                          'Tuned Random Forest',
                                          'Tuned Bagging Classifier',
                                          'AdaBoost',
                                          'Gradient Boost',
                                          'XGboost',
                                          'Tuned AdaBoost',
                                          'Tuned Gradient Boost',
                                          'Tuned XGboost','Stacking'
                                          ], 
                                          'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
                                          'Train_Recall':recall_train,'Test_Recall':recall_test,
                                          'Train_Precision':precision_train,'Test_Precision':precision_test,
                                           'Train_F1':f1_train,
                                          'Test_F1':f1_test  }) 

#Sorting models in decreasing order of test recall
comparison_frame.sort_values(by='Test_Recall',ascending=False)

	MonthlyIncome			Age
	min	max	mean	min	max	mean
ProductPitched
Basic	16009.00000	37868.00000	20179.10870	18.00000	59.00000	31.26449
Deluxe	17086.00000	38525.00000	23093.33333	21.00000	59.00000	37.59069
King	17517.00000	38537.00000	34672.10000	27.00000	59.00000	48.90000
Standard	17372.00000	38395.00000	26035.41935	19.00000	60.00000	40.99194
Super Deluxe	21151.00000	37502.00000	29823.80000	39.00000	56.00000	43.50000

	Model	Train_Accuracy	Test_Accuracy	Train_Recall	Test_Recall	Train_Precision	Test_Precision	Train_F1	Test_F1
3	Tuned Decision Tree	0.78559	0.76639	0.68069	0.66304	0.45332	0.42361	0.54421	0.51695
4	Tuned Random Forest	0.82484	0.81216	0.63707	0.62319	0.52842	0.50146	0.57768	0.55574
0	Decision Tree	1.00000	0.85246	1.00000	0.58696	1.00000	0.61364	1.00000	0.60000
1	Bagging Classifier	0.99033	0.88661	0.95327	0.51087	0.99512	0.81977	0.97375	0.62946
2	Random Forest	1.00000	0.88730	1.00000	0.48551	1.00000	0.85350	1.00000	0.61894
5	Tuned Bagging Classifier	0.99824	0.87910	0.99065	0.46014	1.00000	0.81935	0.99531	0.58933

	Model	Train_Accuracy	Test_Accuracy	Train_Recall	Test_Recall	Train_Precision	Test_Precision	Train_F1	Test_F1
3	Tuned Decision Tree	0.78559	0.76639	0.68069	0.66304	0.45332	0.42361	0.54421	0.51695
4	Tuned Random Forest	0.82484	0.81216	0.63707	0.62319	0.52842	0.50146	0.57768	0.55574
0	Decision Tree	1.00000	0.85246	1.00000	0.58696	1.00000	0.61364	1.00000	0.60000
8	XGboost	0.99531	0.89003	0.97508	0.55797	1.00000	0.79793	0.98738	0.65672
1	Random Forest	0.99033	0.88661	0.95327	0.51087	0.99512	0.81977	0.97375	0.62946
2	Bagging Classifier	1.00000	0.88730	1.00000	0.48551	1.00000	0.85350	1.00000	0.61894
5	Tuned Bagging Classifier	0.99824	0.87910	0.99065	0.46014	1.00000	0.81935	0.99531	0.58933
7	Gradient Boost	0.87961	0.86134	0.45016	0.38406	0.83285	0.76259	0.58443	0.51084
6	AdaBoost	0.84593	0.84563	0.30374	0.28623	0.71168	0.73148	0.42576	0.41146

	Model	Train_Accuracy	Test_Accuracy	Train_Recall	Test_Recall	Train_Precision	Test_Precision	Train_F1	Test_F1
11	Tuned XGboost	0.92794	0.84426	0.95639	0.72101	0.73798	0.56857	0.83311	0.63578
3	Tuned Decision Tree	0.78559	0.76639	0.68069	0.66304	0.45332	0.42361	0.54421	0.51695
4	Tuned Random Forest	0.82484	0.81216	0.63707	0.62319	0.52842	0.50146	0.57768	0.55574
0	Decision Tree	1.00000	0.85246	1.00000	0.58696	1.00000	0.61364	1.00000	0.60000
8	XGboost	0.99531	0.89003	0.97508	0.55797	1.00000	0.79793	0.98738	0.65672
1	Random Forest	0.99033	0.88661	0.95327	0.51087	0.99512	0.81977	0.97375	0.62946
12	Stacking	0.90920	0.84973	0.66822	0.50362	0.81559	0.62613	0.73459	0.55823
2	Bagging Classifier	1.00000	0.88730	1.00000	0.48551	1.00000	0.85350	1.00000	0.61894
5	Tuned Bagging Classifier	0.99824	0.87910	0.99065	0.46014	1.00000	0.81935	0.99531	0.58933
10	Tuned Gradient Boost	0.91418	0.85724	0.60125	0.43478	0.91253	0.69364	0.72488	0.53452
7	Gradient Boost	0.87961	0.86134	0.45016	0.38406	0.83285	0.76259	0.58443	0.51084
9	Tuned AdaBoost	0.84827	0.84085	0.34112	0.31884	0.69745	0.66165	0.45816	0.43032
6	AdaBoost	0.84593	0.84563	0.30374	0.28623	0.71168	0.73148	0.42576	0.41146

	CustomerID	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
0	200000	1	41.00000	Self Enquiry	3	6.00000	Salaried	Female	3	3.00000	Deluxe	3.00000	Single	1.00000	1	2	1	0.00000	Manager	20993.00000
1	200001	0	49.00000	Company Invited	1	14.00000	Salaried	Male	3	4.00000	Deluxe	4.00000	Divorced	2.00000	0	3	1	2.00000	Manager	20130.00000
2	200002	1	37.00000	Self Enquiry	1	8.00000	Free Lancer	Male	3	4.00000	Basic	3.00000	Single	7.00000	1	3	0	0.00000	Executive	17090.00000
3	200003	0	33.00000	Company Invited	1	9.00000	Salaried	Female	2	3.00000	Basic	3.00000	Divorced	2.00000	1	5	1	1.00000	Executive	17909.00000
4	200004	0	NaN	Self Enquiry	1	8.00000	Small Business	Male	2	3.00000	Basic	4.00000	Divorced	1.00000	0	5	1	0.00000	Executive	18468.00000

	CustomerID	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	PreferredPropertyStar	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome
4883	204883	1	49.00000	Self Enquiry	3	9.00000	Small Business	Male	3	5.00000	Deluxe	4.00000	Unmarried	2.00000	1	1	1	1.00000	Manager	26576.00000
4884	204884	1	28.00000	Company Invited	1	31.00000	Salaried	Male	4	5.00000	Basic	3.00000	Single	3.00000	1	3	1	2.00000	Executive	21212.00000
4885	204885	1	52.00000	Self Enquiry	3	17.00000	Salaried	Female	4	4.00000	Standard	4.00000	Married	7.00000	0	1	1	3.00000	Senior Manager	31820.00000
4886	204886	1	19.00000	Self Enquiry	3	16.00000	Small Business	Male	3	4.00000	Basic	3.00000	Single	3.00000	0	5	0	2.00000	Executive	20289.00000
4887	204887	1	36.00000	Self Enquiry	1	14.00000	Salaried	Male	4	4.00000	Basic	4.00000	Unmarried	3.00000	1	3	1	2.00000	Executive	24041.00000

	count	mean	std	min	25%	50%	75%	max
CustomerID	4888.00000	202443.50000	1411.18839	200000.00000	201221.75000	202443.50000	203665.25000	204887.00000
ProdTaken	4888.00000	0.18822	0.39092	0.00000	0.00000	0.00000	0.00000	1.00000
Age	4662.00000	37.62227	9.31639	18.00000	31.00000	36.00000	44.00000	61.00000
CityTier	4888.00000	1.65426	0.91658	1.00000	1.00000	1.00000	3.00000	3.00000
DurationOfPitch	4637.00000	15.49083	8.51964	5.00000	9.00000	13.00000	20.00000	127.00000
NumberOfPersonVisiting	4888.00000	2.90507	0.72489	1.00000	2.00000	3.00000	3.00000	5.00000
NumberOfFollowups	4843.00000	3.70845	1.00251	1.00000	3.00000	4.00000	4.00000	6.00000
PreferredPropertyStar	4862.00000	3.58104	0.79801	3.00000	3.00000	3.00000	4.00000	5.00000
NumberOfTrips	4748.00000	3.23652	1.84902	1.00000	2.00000	3.00000	4.00000	22.00000
Passport	4888.00000	0.29092	0.45423	0.00000	0.00000	0.00000	1.00000	1.00000
PitchSatisfactionScore	4888.00000	3.07815	1.36579	1.00000	2.00000	3.00000	4.00000	5.00000
OwnCar	4888.00000	0.62029	0.48536	0.00000	0.00000	1.00000	1.00000	1.00000
NumberOfChildrenVisiting	4822.00000	1.18727	0.85786	0.00000	1.00000	1.00000	2.00000	3.00000
MonthlyIncome	4655.00000	23619.85349	5380.69836	1000.00000	20346.00000	22347.00000	25571.00000	98678.00000

	count	unique	top	freq
ProdTaken	4888	2	0	3968
TypeofContact	4863	2	Self Enquiry	3444
CityTier	4888	3	1	3190
Occupation	4888	4	Salaried	2368
Gender	4888	2	Male	2916
ProductPitched	4888	5	Basic	1842
MaritalStatus	4888	4	Married	2340
Passport	4888	2	0	3466
OwnCar	4888	2	1	3032
Designation	4888	5	Executive	1842

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	...	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome	Agebin	Incomebin
38	0	36.00000	Self Enquiry	1	11.00000	Salaried	Female	2	4.00000	Basic	...	Divorced	1.00000	1	2	1	0.00000	Executive	95000.00000	31-40	<100000
2609	0	51.00000	Self Enquiry	1	18.00000	Salaried	Female	3	4.00000	King	...	Single	5.00000	0	5	1	1.00000	VP	38604.00000	51-65	<40000
2634	0	53.00000	Self Enquiry	1	7.00000	Salaried	Male	4	5.00000	King	...	Divorced	2.00000	0	2	1	2.00000	VP	38677.00000	51-65	<40000
3012	1	56.00000	Self Enquiry	1	9.00000	Small Business	Male	4	4.00000	King	...	Divorced	7.00000	1	2	1	3.00000	VP	38537.00000	51-65	<40000
3190	0	42.00000	Company Invited	1	14.00000	Salaried	Female	3	6.00000	King	...	Married	3.00000	0	4	1	1.00000	VP	38651.00000	41-50	<40000
3193	1	53.00000	Self Enquiry	3	9.00000	Small Business	Female	3	6.00000	King	...	Divorced	3.00000	0	3	1	1.00000	VP	38523.00000	51-65	<40000
3214	0	47.00000	Self Enquiry	1	7.00000	Small Business	Male	3	4.00000	King	...	Married	2.00000	0	5	1	2.00000	VP	38305.00000	41-50	<40000
3295	0	57.00000	Self Enquiry	1	11.00000	Large Business	Female	4	4.00000	King	...	Married	6.00000	0	4	0	3.00000	VP	38621.00000	51-65	<40000
3342	0	44.00000	Self Enquiry	1	10.00000	Salaried	Male	4	6.00000	King	...	Divorced	5.00000	0	5	1	3.00000	VP	38418.00000	41-50	<40000
3362	0	52.00000	Company Invited	3	16.00000	Salaried	Male	3	4.00000	King	...	Married	6.00000	1	4	1	2.00000	VP	38525.00000	51-65	<40000
3400	0	57.00000	Self Enquiry	2	15.00000	Salaried	Male	3	4.00000	King	...	Single	8.00000	0	4	1	1.00000	VP	38395.00000	51-65	<40000
3453	0	59.00000	Self Enquiry	1	7.00000	Small Business	Female	4	4.00000	King	...	Divorced	5.00000	1	1	1	3.00000	VP	38379.00000	51-65	<40000
3598	0	48.00000	Self Enquiry	2	33.00000	Salaried	Female	4	4.00000	King	...	Married	5.00000	0	4	1	2.00000	VP	38336.00000	41-50	<40000
3686	0	41.00000	Self Enquiry	3	14.00000	Small Business	Male	3	4.00000	King	...	Single	3.00000	0	4	1	2.00000	VP	38511.00000	41-50	<40000
3775	0	49.00000	Self Enquiry	1	17.00000	Salaried	Male	4	5.00000	King	...	Married	6.00000	0	3	1	2.00000	VP	38343.00000	41-50	<40000
3845	0	56.00000	Self Enquiry	2	33.00000	Salaried	Male	4	2.00000	King	...	Married	6.00000	1	5	1	3.00000	VP	38314.00000	51-65	<40000
4079	0	51.00000	Self Enquiry	1	18.00000	Salaried	Female	3	4.00000	King	...	Single	5.00000	0	5	0	2.00000	VP	38604.00000	51-65	<40000
4104	0	53.00000	Self Enquiry	1	7.00000	Salaried	Male	4	5.00000	King	...	Married	2.00000	0	1	1	3.00000	VP	38677.00000	51-65	<40000
4482	1	56.00000	Self Enquiry	1	9.00000	Small Business	Male	4	4.00000	King	...	Married	7.00000	1	1	1	1.00000	VP	38537.00000	51-65	<40000
4660	0	42.00000	Company Invited	1	14.00000	Salaried	Female	3	6.00000	King	...	Married	3.00000	0	4	1	2.00000	VP	38651.00000	41-50	<40000
4663	1	53.00000	Self Enquiry	3	9.00000	Small Business	Female	3	6.00000	King	...	Married	3.00000	0	3	1	2.00000	VP	38523.00000	51-65	<40000
4684	0	47.00000	Self Enquiry	1	7.00000	Small Business	Male	3	4.00000	King	...	Married	2.00000	0	5	1	1.00000	VP	38305.00000	41-50	<40000
4765	0	57.00000	Self Enquiry	1	11.00000	Large Business	Female	4	4.00000	King	...	Married	6.00000	0	4	1	2.00000	VP	38621.00000	51-65	<40000
4812	0	44.00000	Self Enquiry	1	10.00000	Salaried	Male	4	6.00000	King	...	Married	5.00000	0	5	1	1.00000	VP	38418.00000	41-50	<40000
4832	1	52.00000	Company Invited	1	35.00000	Salaried	Male	4	5.00000	Deluxe	...	Single	5.00000	0	3	0	1.00000	Manager	38525.00000	51-65	<40000
4870	1	57.00000	Self Enquiry	3	23.00000	Salaried	Female	4	4.00000	Standard	...	Single	4.00000	1	5	1	3.00000	Senior Manager	38395.00000	51-65	<40000

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	...	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome	Agebin	Incomebin
165	0	50.00000	Self Enquiry	1	17.00000	Salaried	Female	2	3.00000	King	...	Single	4.00000	0	5	1	NaN	VP	34926.00000	41-50	<35000
190	0	52.00000	Self Enquiry	1	6.00000	Salaried	Male	3	4.00000	King	...	Divorced	1.00000	0	2	1	NaN	VP	34999.00000	51-65	<35000
568	1	55.00000	Self Enquiry	1	8.00000	Small Business	Male	3	3.00000	King	...	Divorced	6.00000	1	2	1	NaN	VP	34859.00000	51-65	<35000
746	0	41.00000	Company Invited	1	13.00000	Salaried	Female	2	5.00000	King	...	Married	2.00000	0	4	1	NaN	VP	34973.00000	41-50	<35000
749	1	52.00000	Self Enquiry	3	8.00000	Small Business	Female	2	5.00000	King	...	Divorced	2.00000	0	3	1	NaN	VP	34845.00000	51-65	<35000
851	0	56.00000	Self Enquiry	1	10.00000	Large Business	Female	3	3.00000	King	...	Married	5.00000	0	4	0	NaN	VP	34943.00000	51-65	<35000
898	0	43.00000	Self Enquiry	1	9.00000	Salaried	Male	3	5.00000	King	...	Divorced	4.00000	0	5	1	NaN	VP	34740.00000	41-50	<35000
918	0	51.00000	Company Invited	3	15.00000	Salaried	Male	2	3.00000	King	...	Married	5.00000	1	4	1	NaN	VP	34847.00000	51-65	<35000
956	0	56.00000	Self Enquiry	2	14.00000	Salaried	Male	2	3.00000	King	...	Single	7.00000	0	4	1	NaN	VP	34717.00000	51-65	<35000
1009	0	58.00000	Self Enquiry	1	6.00000	Small Business	Female	3	3.00000	King	...	Divorced	4.00000	1	1	1	NaN	VP	34701.00000	51-65	<35000

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
ProdTaken	552.0	1.0	1.0	552.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Age	552.0	NaN	NaN	NaN	31.26449	8.7659	18.0	26.0	30.0	34.0	59.0
TypeofContact	552	2	Self Enquiry	358	NaN	NaN	NaN	NaN	NaN	NaN	NaN
CityTier	552.0	3.0	1.0	392.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
DurationOfPitch	552.0	NaN	NaN	NaN	15.69293	7.78024	6.0	9.0	14.0	21.0	36.0
Occupation	552	4	Salaried	260	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Gender	552	2	Male	344	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfPersonVisiting	552.0	NaN	NaN	NaN	2.9058	0.70144	2.0	2.0	3.0	3.0	4.0
NumberOfFollowups	552.0	NaN	NaN	NaN	3.95109	0.96366	1.0	3.0	4.0	5.0	6.0
ProductPitched	552	1	Basic	552	NaN	NaN	NaN	NaN	NaN	NaN	NaN
PreferredPropertyStar	552.0	NaN	NaN	NaN	3.77174	0.86181	3.0	3.0	3.0	5.0	5.0
MaritalStatus	552	4	Single	230	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfTrips	552.0	NaN	NaN	NaN	3.22101	2.07353	1.0	2.0	3.0	3.25	20.0
Passport	552.0	2.0	1.0	322.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
PitchSatisfactionScore	552.0	NaN	NaN	NaN	3.21377	1.35307	1.0	2.0	3.0	4.0	5.0
OwnCar	552.0	2.0	1.0	316.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfChildrenVisiting	552.0	NaN	NaN	NaN	1.2192	0.86731	0.0	1.0	1.0	2.0	3.0
Designation	552	1	Executive	552	NaN	NaN	NaN	NaN	NaN	NaN	NaN
MonthlyIncome	552.0	NaN	NaN	NaN	20179.1087	3244.62663	16009.0	17581.25	20582.0	21365.75	37868.0
Agebin	552	5	26-30	204	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Incomebin	552	5	<25000	283	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
ProdTaken	204.0	1.0	1.0	204.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Age	204.0	NaN	NaN	NaN	37.59069	8.35041	21.0	32.0	35.75	43.25	59.0
TypeofContact	204	2	Self Enquiry	136	NaN	NaN	NaN	NaN	NaN	NaN	NaN
CityTier	204.0	2.0	3.0	144.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
DurationOfPitch	204.0	NaN	NaN	NaN	18.46324	8.84165	6.0	12.0	15.0	26.0	36.0
Occupation	204	3	Small Business	108	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Gender	204	2	Male	134	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfPersonVisiting	204.0	NaN	NaN	NaN	2.95098	0.70714	2.0	2.0	3.0	3.0	4.0
NumberOfFollowups	204.0	NaN	NaN	NaN	3.97059	1.04061	1.0	3.0	4.0	5.0	6.0
ProductPitched	204	1	Deluxe	204	NaN	NaN	NaN	NaN	NaN	NaN	NaN
PreferredPropertyStar	204.0	NaN	NaN	NaN	3.69608	0.85718	3.0	3.0	3.0	5.0	5.0
MaritalStatus	204	4	Married	68	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfTrips	204.0	NaN	NaN	NaN	3.69608	2.01369	1.0	2.0	3.0	5.0	8.0
Passport	204.0	2.0	0.0	104.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
PitchSatisfactionScore	204.0	NaN	NaN	NaN	3.03922	1.27825	1.0	2.0	3.0	4.0	5.0
OwnCar	204.0	2.0	1.0	124.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
NumberOfChildrenVisiting	204.0	NaN	NaN	NaN	1.16667	0.84321	0.0	1.0	1.0	2.0	3.0
Designation	204	1	Manager	204	NaN	NaN	NaN	NaN	NaN	NaN	NaN
MonthlyIncome	204.0	NaN	NaN	NaN	23093.33333	3512.7719	17086.0	20764.0	22964.0	24455.0	38525.0
Agebin	204	5	31-40	97	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Incomebin	204	5	<25000	137	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	...	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome	Agebin	Incomebin
2482	37.00000	Self Enquiry	1	12.00000	Salaried	Female	3	5.00000	Basic	...	Divorced	2.00000	1	2	1	1.00000	Executive	98678.00000	31-40	<100000
38	36.00000	Self Enquiry	1	11.00000	Salaried	Female	2	4.00000	Basic	...	Divorced	1.00000	1	2	1	0.00000	Executive	95000.00000	31-40	<100000
2634	53.00000	Self Enquiry	1	7.00000	Salaried	Male	4	5.00000	King	...	Divorced	2.00000	0	2	1	2.00000	VP	38677.00000	51-70	<40000
4104	53.00000	Self Enquiry	1	7.00000	Salaried	Male	4	5.00000	King	...	Married	2.00000	0	1	1	3.00000	VP	38677.00000	51-70	<40000
3190	42.00000	Company Invited	1	14.00000	Salaried	Female	3	6.00000	King	...	Married	3.00000	0	4	1	1.00000	VP	38651.00000	41-50	<40000

	ProdTaken	Age	TypeofContact	CityTier	DurationOfPitch	Occupation	Gender	NumberOfPersonVisiting	NumberOfFollowups	ProductPitched	...	MaritalStatus	NumberOfTrips	Passport	PitchSatisfactionScore	OwnCar	NumberOfChildrenVisiting	Designation	MonthlyIncome	Agebin	Incomebin
3260	0	40.00000	Company Invited	1	16.00000	Salaried	Male	4	4.00000	Deluxe	...	Unmarried	22.00000	0	2	1	1.00000	Manager	25460.00000	31-40	<30000
816	0	39.00000	Company Invited	1	15.00000	Salaried	Male	3	3.00000	Deluxe	...	Unmarried	21.00000	0	2	1	0.00000	Manager	21782.00000	31-40	<25000
2829	1	31.00000	Company Invited	1	11.00000	Large Business	Male	3	4.00000	Basic	...	Single	20.00000	1	4	1	2.00000	Executive	20963.00000	26-30	<25000
385	1	30.00000	Company Invited	1	10.00000	Large Business	Male	2	3.00000	Basic	...	Single	19.00000	1	4	1	1.00000	Executive	17285.00000	26-30	<20000
3155	1	30.00000	Self Enquiry	1	17.00000	Salaried	Female	4	5.00000	Basic	...	Single	8.00000	1	5	1	2.00000	Executive	21082.00000	26-30	<25000

Travel Package Purchase Prediction

Table of Contents

Context

Data Dictionary

Problem

Libraries

Read and Understand Data

View the first and last 5 rows of the dataset.¶

Data Preprocessing

Processing Gender status.¶

Age¶

Income¶

Exploratory Data Analysis

Univariate Analysis¶

Bivariate & Multivariate Analysis¶

Numberof follow up and number of person vsiting has very weak correlation.¶

Missing value Detection and Treatment

Missing value Treatment Type of contact¶

Missing value Treatment number of followup.¶

Missing value Treatment PreferredPropertyStar¶

Missing value Treatment Duration of pitch¶

Missing value Treatment for NumberOfTrips¶

Missing value Treatment NumberOfChildrenVisiting¶

Missing value Treatment Age¶

Missing value Treatment MonthlyIncome¶

Age¶

Income¶

Customer Profile by Product Type¶

Customer profile according to product pitched and product purchased¶

Insights based on EDA

Outlier Detection

Split the dataset¶

Model Building

Decision Tree¶

Bagging classifier¶

Random Forest¶

Model Performance Evaluation and Improvement-Bagging

Tuning Decision Tree¶

Tuning Random Forest¶

Tuning Bagging Classifier¶

Model Building Boosting

Adaboost¶

Gradient Boost¶

XGBoost¶

Model Performance Evaluation and Improvement-Boosting

Tuned AdaBoost Classifier¶

Tuned Gradient Boosting Classifier¶

Tuned XGBoost Classifier¶

Stacking Classifier

Comparing all models¶

Conclusion

Business Recommendations & Insights

Decision Tree ¶