#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # To supress warnings
sns.set(style="whitegrid") # set the background for the graphs


#Reading the csv file cardiogoodfitness.csv in variable 
data=pd.read_csv("../input/cardiogoodfitness/CardioGoodFitness.csv")
dffitness=data.copy()


#Examine the data by looking at the first 5 rows of the data
dffitness.head()


dffitness.tail()


#get the size of dataframe
dffitness.shape

(180, 9)


#get the type of data in dataframe
dffitness.dtypes

Product          object
Age               int64
Gender           object
Education         int64
MaritalStatus    object
Usage             int64
Fitness           int64
Income            int64
Miles             int64
dtype: object


dffitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product        180 non-null    object
 1   Age            180 non-null    int64 
 2   Gender         180 non-null    object
 3   Education      180 non-null    int64 
 4   MaritalStatus  180 non-null    object
 5   Usage          180 non-null    int64 
 6   Fitness        180 non-null    int64 
 7   Income         180 non-null    int64 
 8   Miles          180 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 12.8+ KB


#changing it to  object dtype to category  to save memory
dffitness.Product=dffitness["Product"].astype("category")
dffitness.Gender=dffitness["Gender"].astype("category")
dffitness.MaritalStatus=dffitness["MaritalStatus"].astype("category")


dffitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Product        180 non-null    category
 1   Age            180 non-null    int64   
 2   Gender         180 non-null    category
 3   Education      180 non-null    int64   
 4   MaritalStatus  180 non-null    category
 5   Usage          180 non-null    int64   
 6   Fitness        180 non-null    int64   
 7   Income         180 non-null    int64   
 8   Miles          180 non-null    int64   
dtypes: category(3), int64(6)
memory usage: 9.5 KB


#What are the different column name?
dffitness.columns

Index(['Product', 'Age', 'Gender', 'Education', 'MaritalStatus', 'Usage',
       'Fitness', 'Income', 'Miles'],
      dtype='object')


# Are there any missing values?
dffitness.isnull().sum()

Product          0
Age              0
Gender           0
Education        0
MaritalStatus    0
Usage            0
Fitness          0
Income           0
Miles            0
dtype: int64


#Are there any duplicate values?
dffitness.duplicated().sum()

0


#This is to look at what all unique values have . Just trying to use python
list_col=['Product','MaritalStatus','Usage','Fitness','Education','Age']
#How many models we have?
#what is Martial status of customers?
# how many days people expect to use treadmill?
# what is self rated fitness of customers buying treadmill?
# what is eductaion of customer buying treadmill?
# what is eductaion of customer buying treadmill?
for col in list_col: 
    print('{} :{} ' . format(col.upper(),dffitness[col].unique()))

PRODUCT :['TM195', 'TM498', 'TM798']
Categories (3, object): ['TM195', 'TM498', 'TM798'] 
MARITALSTATUS :['Single', 'Partnered']
Categories (2, object): ['Single', 'Partnered'] 
USAGE :[3 2 4 5 6 7] 
FITNESS :[4 3 2 1 5] 
EDUCATION :[14 15 12 13 16 18 20 21] 
AGE :[18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 43 44 46 47 50 45 48 42]


dffitness.describe()


#Which is most sold Model?
dffitness.Product.value_counts()

TM195    80
TM498    60
TM798    40
Name: Product, dtype: int64


#Are Male customers buying treadmill more than female customers?
dffitness.Gender.value_counts()

Male      104
Female     76
Name: Gender, dtype: int64


#Are married customer buying Treadmill more than Single customers?
dffitness.MaritalStatus.value_counts()

Partnered    107
Single        73
Name: MaritalStatus, dtype: int64


dffitness[dffitness['Product'] == 'TM195'].describe().T


dffitness[dffitness['Product'] == 'TM498'].describe().T


dffitness[dffitness['Product'] == 'TM798'].describe().T


def dist_box_violin(data):
 # function plots a combined graph for univariate analysis of continous variable 
 #to check spread, central tendency , dispersion and outliers  
    Name=data.name.upper()
    fig, axes =plt.subplots(1,3,figsize=(17, 7))
    fig.suptitle("SPREAD OF DATA FOR "+ Name  , fontsize=18, fontweight='bold')
    sns.distplot(data,kde=False,color='Blue',ax=axes[0])
    axes[0].axvline(data.mean(), color='y', linestyle='--',linewidth=2)
    axes[0].axvline(data.median(), color='r', linestyle='dashed', linewidth=2)
    axes[0].axvline(data.mode()[0],color='g',linestyle='solid',linewidth=2)
    axes[0].legend({'Mean':data.mean(),'Median':data.median(),'Mode':data.mode()})
    sns.boxplot(x=data,showmeans=True, orient='h',color="purple",ax=axes[1])
    #just exploring violin plot
    sns.violinplot(data,ax=axes[2],showmeans=True)


dist_box_violin(dffitness.Income)


dist_box_violin(dffitness.Age)


dist_box_violin(dffitness.Miles)


dist_box_violin(dffitness.Fitness)


dist_box_violin(dffitness.Education)


dist_box_violin(dffitness.Usage)


#Univariate Analysis
#categorical variables 
plt.figure(figsize=(14,7))
dffitness['Product'].value_counts().plot.pie(autopct='%1.1f%%',figsize=(8,8))
plt.title("Pie chart of Product Sales")
plt.show()


# Function to create barplots that indicate percentage for each category.
def bar_perc(plot, feature):
    '''
    plot
    feature: 1-d categorical feature array
    '''
    total = len(feature) # length of the column
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total) # percentage of each class of the category
        x = p.get_x() + p.get_width() / 2 - 0.05 # width of the plot
        y = p.get_y() + p.get_height()           # hieght of the plot
        plot.annotate(percentage, (x, y), size = 12) # annotate the percentage


fig1, axes1 =plt.subplots(1,3,figsize=(14, 7))
list_col=['Product','Gender','MaritalStatus']
j=0
for i in range(len(list_col)):
    order = dffitness[list_col[i]].value_counts(ascending=False).index # to display bar in ascending order
    axis=sns.countplot(x=list_col[i], data=dffitness , order=order,ax=axes1[i],palette='plasma').set(title=list_col[i].upper() + ' WISE SALES')
    bar_perc(axes1[i],dffitness[list_col[i]])


#Average age of customer buying each model
dffitness.groupby('Product')['Age'].mean()

Product
TM195    28.55
TM498    28.90
TM798    29.10
Name: Age, dtype: float64


#Average Income of customer buying each model
dffitness.groupby('Product')['Income'].mean()

Product
TM195    46418.025
TM498    48973.650
TM798    75441.575
Name: Income, dtype: float64


#Average Income of customer buying each model
dffitness.groupby('Product')['Miles'].mean()

Product
TM195     82.787500
TM498     87.933333
TM798    166.900000
Name: Miles, dtype: float64


plt.figure(figsize=(10,10))
prd_gender=pd.crosstab(dffitness['Product'],dffitness['Gender'] )
print(prd_gender)

ax=prd_gender.plot(kind='bar')

plt.title("PRODUCT BY GENDER")

Gender   Female  Male
Product              
TM195        40    40
TM498        29    31
TM798         7    33

Text(0.5, 1.0, 'PRODUCT BY GENDER')

<Figure size 720x720 with 0 Axes>


prd_mar_status=pd.crosstab(dffitness['Product'],dffitness['MaritalStatus'] )
print(prd_mar_status)
prd_mar_status.plot(kind='bar')
plt.title("PRODUCT BY MARTIAL STATUS")

MaritalStatus  Partnered  Single
Product                         
TM195                 48      32
TM498                 36      24
TM798                 23      17

Text(0.5, 1.0, 'PRODUCT BY MARTIAL STATUS')


plt.figure(figsize=(15,7))
sns.heatmap(dffitness.corr(), annot=True)

<AxesSubplot:>


corr_pairs = dffitness.corr().unstack() # give pairs of correlation
print( corr_pairs[abs(corr_pairs)>0.5]) # Gives us correlated data

Age        Age          1.000000
           Income       0.513414
Education  Education    1.000000
           Income       0.625827
Usage      Usage        1.000000
           Fitness      0.668606
           Income       0.519537
           Miles        0.759130
Fitness    Usage        0.668606
           Fitness      1.000000
           Income       0.535005
           Miles        0.785702
Income     Age          0.513414
           Education    0.625827
           Usage        0.519537
           Fitness      0.535005
           Income       1.000000
           Miles        0.543473
Miles      Usage        0.759130
           Fitness      0.785702
           Income       0.543473
           Miles        1.000000
dtype: float64


plt.figure(figsize=(15,7))
sns.pairplot(data=dffitness,corner=True)

<seaborn.axisgrid.PairGrid at 0x7f9fec72e990>

<Figure size 1080x504 with 0 Axes>


#Bi Varaite Analysis for 
#1.Product & Age
#2.Product & Income
#3.Product & Education
#4.Product & Usage
#5.Product & Fitness
#6.Product & Miles
fig1, axes1 =plt.subplots(3,2,figsize=(14, 19))
list1_col=['Age','Income','Education','Usage','Fitness','Miles']
#instead of writing  boxplot 6 times using for loop
for i in range(len(list1_col)):
    row=i//2
    col=i%2
    ax=axes1[row,col]
    
    sns.boxplot(dffitness[list1_col[i]],dffitness['Product'],ax=ax).set(title='PRODUCT BY ' + list1_col[i].upper()  )


#Bi Varaite Analysis for 
#1.Gender & Age
#2.Gender & Income
#3.Gender & Education
#4.Gender & Usage
#5.Gender & Fitness
#6.Gender & Miles

fig1, axes1 =plt.subplots(3,2,figsize=(14, 19))
list1_col=['Age','Income','Education','Usage','Fitness','Miles']
# to plot graph side by side.
for i in range(len(list1_col)):
    row=i//2
    col=i%2
    ax=axes1[row,col]
    sns.boxplot(dffitness[list1_col[i]],dffitness['Gender'],ax=ax).set(title='GENDER BY ' + list1_col[i].upper())


#Bi Varaite Analysis for 
#1.Martial Status & Age
#2.Martial Status & Income
#3.Martial Status & Education
#4.Martial Status & Usage
#5.Martial Status & Fitness
#6.Martial Status & Miles
plt.figure(figsize=(7,7))
fig1, axes1 =plt.subplots(3,2,figsize=(18, 19))
list1_col=['Age','Income','Education','Usage','Fitness','Miles']
for i in range(len(list1_col)):
    row=i//2
    col=i%2
    ax=axes1[row,col]
    sns.boxplot(dffitness[list1_col[i]],dffitness['MaritalStatus'],ax=ax).set(title='MARTIAL STATUS BY ' + list1_col[i].upper())

<Figure size 504x504 with 0 Axes>


plt.figure(figsize=(7,7))
sns.countplot(dffitness['Gender'],hue=dffitness["MaritalStatus"]).set(title='MARTIAL STATUS BY GENDER')

[Text(0.5, 1.0, 'MARTIAL STATUS BY GENDER')]


#Bivariate Analysis Age & Education
sns.jointplot(x = 'Age',y = 'Education',data = dffitness,color="red",kind='hex')

<seaborn.axisgrid.JointGrid at 0x7f9fe2d80c90>


plt.figure(figsize=(12,7))
sns.pointplot(x=dffitness["Education"],y=dffitness["Income"],hue=dffitness['Product']).set(title='EDUCATION  BY INCOME ')

[Text(0.5, 1.0, 'EDUCATION  BY INCOME ')]


plt.figure(figsize=(12,7))
sns.catplot(x='Usage', y='Income', col='Gender',hue='Product' ,kind="bar", data=dffitness)

<seaborn.axisgrid.FacetGrid at 0x7f9fe2cb7150>

<Figure size 864x504 with 0 Axes>


prd_mar_gen= pd.crosstab(index=dffitness["Product"], 
                             columns=[dffitness["MaritalStatus"],
                                      dffitness["Gender"]] ,
                             )  
prd_mar_gen


prd_mar_gen.plot(kind='bar',figsize=(10,7))

<AxesSubplot:xlabel='Product'>


# Income by gender by product and by marital status
sns.catplot(x='Gender',y='Income', hue='Product', col='MaritalStatus', data=dffitness,kind='bar');


prod_gen_fit=pd.crosstab(index=dffitness['Product'],columns=[dffitness['Gender'],dffitness['Fitness']])
prod_gen_fit


#scatter plot between income, age ,product and usage
sns.relplot(x="Age", y="Income", hue="Product", size="Usage",
            sizes=(40, 400), alpha=.5, palette="plasma",
            height=6, data=data).set(title='INCOME BY AGE ,PRODUCT AND USAGE');

	Age	Education	Usage	Fitness	Income	Miles
count	180.000000	180.000000	180.000000	180.000000	180.000000	180.000000
mean	28.788889	15.572222	3.455556	3.311111	53719.577778	103.194444
std	6.943498	1.617055	1.084797	0.958869	16506.684226	51.863605
min	18.000000	12.000000	2.000000	1.000000	29562.000000	21.000000
25%	24.000000	14.000000	3.000000	3.000000	44058.750000	66.000000
50%	26.000000	16.000000	3.000000	3.000000	50596.500000	94.000000
75%	33.000000	16.000000	4.000000	4.000000	58668.000000	114.750000
max	50.000000	21.000000	7.000000	5.000000	104581.000000	360.000000

	count	mean	std	min	25%	50%	75%	max
Age	80.0	28.5500	7.221452	18.0	23.0	26.0	33.0	50.0
Education	80.0	15.0375	1.216383	12.0	14.0	16.0	16.0	18.0
Usage	80.0	3.0875	0.782624	2.0	3.0	3.0	4.0	5.0
Fitness	80.0	2.9625	0.664540	1.0	3.0	3.0	3.0	5.0
Income	80.0	46418.0250	9075.783190	29562.0	38658.0	46617.0	53439.0	68220.0
Miles	80.0	82.7875	28.874102	38.0	66.0	85.0	94.0	188.0

	count	mean	std	min	25%	50%	75%	max
Age	60.0	28.900000	6.645248	19.0	24.0	26.0	33.25	48.0
Education	60.0	15.116667	1.222552	12.0	14.0	16.0	16.00	18.0
Usage	60.0	3.066667	0.799717	2.0	3.0	3.0	3.25	5.0
Fitness	60.0	2.900000	0.629770	1.0	3.0	3.0	3.00	4.0
Income	60.0	48973.650000	8653.989388	31836.0	44911.5	49459.5	53439.00	67083.0
Miles	60.0	87.933333	33.263135	21.0	64.0	85.0	106.00	212.0

	count	mean	std	min	25%	50%	75%	max
Age	40.0	29.100	6.971738	22.0	24.75	27.0	30.25	48.0
Education	40.0	17.325	1.639066	14.0	16.00	18.0	18.00	21.0
Usage	40.0	4.775	0.946993	3.0	4.00	5.0	5.00	7.0
Fitness	40.0	4.625	0.667467	3.0	4.00	5.0	5.00	5.0
Income	40.0	75441.575	18505.836720	48556.0	58204.75	76568.5	90886.00	104581.0
Miles	40.0	166.900	60.066544	80.0	120.00	160.0	200.00	360.0

Cardio Good Fitness

Table of Contents

Context

Data Dictionary

Libraries

Read and Understand Data

Data Preprocessing

Exploratory Data Analysis

Observation:¶

Bi variate Analysis¶

Multivariate Analysis¶

Conclusion

Important Observations¶

Customer Profiles of Models¶

¶

Business Recommendations & Insights

	Product	Age	Gender	Education	MaritalStatus	Usage	Fitness	Income	Miles
0	TM195	18	Male	14	Single	3	4	29562	112
1	TM195	19	Male	15	Single	2	3	31836	75
2	TM195	19	Female	14	Partnered	4	3	30699	66
3	TM195	19	Male	12	Single	3	3	32973	85
4	TM195	20	Male	13	Partnered	4	2	35247	47

	Product	Age	Gender	Education	MaritalStatus	Usage	Fitness	Income	Miles
175	TM798	40	Male	21	Single	6	5	83416	200
176	TM798	42	Male	18	Single	5	4	89641	200
177	TM798	45	Male	16	Single	5	5	90886	160
178	TM798	47	Male	18	Partnered	4	5	104581	120
179	TM798	48	Male	18	Partnered	4	5	95508	180

MaritalStatus	Partnered		Single
Gender	Female	Male	Female	Male
Product
TM195	27	21	13	19
TM498	15	21	14	10
TM798	4	19	3	14