In [33]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [34]:
# Read the customer churn records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="Customer Churn.csv")
In [35]:
# Preview the first five rows of the raw data
df.head(n=5)
Out[35]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [36]:
# Print each column's name, non-null count and dtype
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
In [37]:
# TotalCharges is loaded with dtype object, but it should be float.
# Blank entries are replaced with 0 before converting. Values are stripped first,
# so an empty string or any run of whitespace counts as blank (not only a single " ").
# Genuinely malformed values still raise in the float conversion instead of being hidden.
total_charges_text = df["TotalCharges"].astype(str).str.strip().replace("", "0")
# changing the data type to float
df["TotalCharges"] = total_charges_text.astype("float")
In [38]:
# Print the column dtypes again to confirm TotalCharges is now float64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   float64
 20  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
In [39]:
# Count the missing values in every column
df.isna().sum()
Out[39]:
0
customerID 0
gender 0
SeniorCitizen 0
Partner 0
Dependents 0
tenure 0
PhoneService 0
MultipleLines 0
InternetService 0
OnlineSecurity 0
OnlineBackup 0
DeviceProtection 0
TechSupport 0
StreamingTV 0
StreamingMovies 0
Contract 0
PaperlessBilling 0
PaymentMethod 0
MonthlyCharges 0
TotalCharges 0
Churn 0

In [40]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the data
df.describe()
Out[40]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 7043.000000 7043.000000 7043.000000 7043.000000
mean 0.162147 32.371149 64.761692 2279.734304
std 0.368612 24.559481 30.090047 2266.794470
min 0.000000 0.000000 18.250000 0.000000
25% 0.000000 9.000000 35.500000 398.550000
50% 0.000000 29.000000 70.350000 1394.550000
75% 0.000000 55.000000 89.850000 3786.600000
max 1.000000 72.000000 118.750000 8684.800000
In [41]:
# Number of rows whose customerID repeats an earlier row (0 means every ID is unique)
df.duplicated(subset="customerID").sum()
Out[41]:
np.int64(0)
In [42]:
# Convert the 0/1 values of the SeniorCitizen column to "Yes"/"No" to make them easier to understand.
# For this we create a small helper function.
def conv(value):
  """Map a SeniorCitizen flag to a "Yes"/"No" label.

  Args:
    value: The raw flag (1 or 0) or an already converted label ("Yes"/"No").

  Returns:
    "Yes" when ``value`` is 1 or already "Yes", otherwise "No".

  Accepting the "Yes" label makes the conversion idempotent: re-running the
  cell on an already converted column no longer turns every "Yes" into "No".
  """
  if value in (1, "Yes"):
    return "Yes"
  return "No"
# Relabel every SeniorCitizen entry element by element with conv
df["SeniorCitizen"] = df["SeniorCitizen"].map(conv)
In [43]:
# Preview the first five rows to check the relabelled SeniorCitizen column
df.head(n=5)
Out[43]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female No Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male No No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.50 No
2 3668-QPYBK Male No No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male No No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female No No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [44]:
# Bar chart of the number of customers in each Churn class, with the count written on each bar
ax = sns.countplot(data=df, x="Churn")
churn_bars = ax.containers[0]
ax.bar_label(churn_bars)
plt.show()
No description has been provided for this image
In [45]:
# Pie chart of the churn percentage: one wedge per Churn class, sized by its row count
gb = df.groupby("Churn").agg({"Churn": "count"})
plt.figure(figsize=(3, 4))
plt.pie(gb["Churn"], labels=gb.index, autopct="%1.2f%%")
plt.show()
No description has been provided for this image
In [46]:
# From the pie chart above we can conclude that 26.54% of our customers have churned. Now let's explore the reasons behind it.
In [46]:

In [47]:
# Customer counts per gender, split into churned and retained customers
gender_ax = sns.countplot(data=df, x="gender", hue="Churn")
gender_ax.set_title("Churn by Gender")
plt.show()
No description has been provided for this image
In [48]:
# Here, male and female customers are both churning at about the same rate.
In [49]:
# Churn by senior-citizen status, expressed as a percentage within each group
# Cross-tabulate the counts, then divide each row by its own total
ct = pd.crosstab(index=df["SeniorCitizen"], columns=df["Churn"])
row_totals = ct.sum(axis=1)
ct_pct = ct.div(row_totals, axis=0) * 100

# Stacked bar chart: one bar per SeniorCitizen value, one segment per Churn class
ax = ct_pct.plot(kind="bar", stacked=True)
ax.set_title("Churn Percentage by Senior Citizen")
ax.set_xlabel("Senior Citizen")
ax.set_ylabel("Percentage")
ax.legend(title="Churn")

# Write each segment's percentage in the middle of the segment
for segment_bars in ax.containers:
    ax.bar_label(segment_bars, fmt="%.1f%%", label_type="center")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [50]:
# We can see from the graph above that a greater percentage of customers in the senior citizen category have churned.
In [51]:
# Histogram of customer tenure, with churned and retained customers drawn in separate colours
tenure_fig, tenure_ax = plt.subplots(figsize=(9, 4))
sns.histplot(data=df, x="tenure", hue="Churn", ax=tenure_ax)
plt.show()
No description has been provided for this image
In [52]:
# As we can see from the chart above, most of the customers who churned had used the service for only 1 or 2 months.
In [53]:
# Count of customers by Contract, split by Churn
ax = sns.countplot(x="Contract", data=df, hue="Churn")
# Label the bars of every Churn group; labelling only containers[0] would leave
# the second hue group without counts.
for container in ax.containers:
    ax.bar_label(container)
plt.title("Count of Customers by Contract")
plt.show()
No description has been provided for this image
In [54]:
# We can see from the chart above that most of the churned customers are on month-to-month contracts, compared with those on one-year or two-year contracts.
In [55]:
# We will create subplots for the remaining columns.
In [56]:
# Service-related columns to plot, one count plot per column
columns = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 3
n_rows = -(-len(columns) // n_cols)

# Build the grid and flatten it so the panels can be walked in reading order
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.flatten()

# One count plot per service column, split by Churn
for panel, col in zip(axes, columns):
    sns.countplot(x=col, data=df, ax=panel, hue=df["Churn"])
    panel.set(title=f'Count Plot of {col}', xlabel=col, ylabel='Count')

# Drop any panels left over after the last plotted column
for spare_panel in axes[len(columns):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [57]:
# The majority of customers who do not churn tend to have services like PhoneService, InternetService (particularly DSL), and OnlineSecurity enabled. For services like OnlineBackup, TechSupport, and StreamingTV, churn rates are noticeably higher when these services are not used or are unavailable.
In [58]:
# Customer counts per payment method, split by Churn, with counts written on both hue groups
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=df, x="PaymentMethod", hue="Churn")
for hue_index in (0, 1):
    ax.bar_label(ax.containers[hue_index])
plt.title("Churned Customers by Payment Method")
# Tilt the tick labels so the long payment-method names do not overlap
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [59]:
# A customer is most likely to churn when paying by electronic check.