Skip to main content

Sham Sui Po, Hong Kong

Telco Customer Churn Cohort Study

Github Repository

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
AdaBoostClassifier
)
from sklearn.metrics import (
classification_report,
confusion_matrix,
ConfusionMatrixDisplay,
)
from sklearn.model_selection import (
train_test_split,
GridSearchCV
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
!wget https://github.com/mpolinowski/datasets/raw/master/telco_customer_churn.csv -P dataset
telco_df = pd.read_csv('dataset/telco_customer_churn.csv')
telco_df.head(5).transpose()
01234
genderFemaleMaleMaleMaleFemale
SeniorCitizen00000
PartnerYesNoNoNoNo
DependentsNoNoNoNoNo
tenure1342452
PhoneServiceNoYesYesNoYes
MultipleLinesNo phone serviceNoNoNo phone serviceNo
InternetServiceDSLDSLDSLDSLFiber optic
OnlineSecurityNoYesYesYesNo
OnlineBackupYesNoYesNoNo
DeviceProtectionNoYesNoYesNo
TechSupportNoNoNoYesNo
StreamingTVNoNoNoNoNo
StreamingMoviesNoNoNoNoNo
ContractMonth-to-monthOne yearMonth-to-monthOne yearMonth-to-month
PaperlessBillingYesNoYesNoYes
PaymentMethodElectronic checkMailed checkMailed checkBank transfer (automatic)Electronic check
MonthlyCharges29.8556.9553.8542.370.7
TotalCharges29.851889.5108.151840.75151.65
ChurnNoNoYesNoYes

Feature Exploration

# check for missing data or wrong dtype
telco_df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 7043 entries, 0 to 7042 Data columns (total 20 columns):

#ColumnNon-Null CountDtype
0gender7043 non-nullobject
1SeniorCitizen7043 non-nullint64
2Partner7043 non-nullobject
3Dependents7043 non-nullobject
4tenure7043 non-nullint64
5PhoneService7043 non-nullobject
6MultipleLines7043 non-nullobject
7InternetService7043 non-nullobject
8OnlineSecurity7043 non-nullobject
9OnlineBackup7043 non-nullobject
10DeviceProtection7043 non-nullobject
11TechSupport7043 non-nullobject
12StreamingTV7043 non-nullobject
13StreamingMovies7043 non-nullobject
14Contract7043 non-nullobject
15PaperlessBilling7043 non-nullobject
16PaymentMethod7043 non-nullobject
17MonthlyCharges7043 non-nullfloat64
18TotalCharges7043 non-nullobject
19Churn7043 non-nullobject
dtypes: float64(1), int64(2), object(17)
memory usage: 1.1+ MB
# TotalCharges is type string because it contains SPACE's
# how many rows have a SPACE instad of a float value?
(telco_df['TotalCharges'] == ' ').sum()
# 11
# drop those 11 rows
telco_df = telco_df[telco_df['TotalCharges'] != ' ']
# now turn TotalCharges column into dtype=float
telco_df['TotalCharges'] = telco_df['TotalCharges'].apply(lambda string: float(string))
# Name: TotalCharges, Length: 7032, dtype: float64
# check for missing values
telco_df.isnull().sum()
gender0
SeniorCitizen0
Partner0
Dependents0
tenure0
PhoneService0
MultipleLines0
InternetService0
OnlineSecurity0
OnlineBackup0
DeviceProtection0
TechSupport0
StreamingTV0
StreamingMovies0
Contract0
PaperlessBilling0
PaymentMethod0
MonthlyCharges0
TotalCharges0
Churn0
dtype: int64
# check label distribution
telco_df['Churn'].value_counts()

# No 5174
# Yes 1869
# Name: Churn, dtype: int64
plt.figure(figsize=(10,4))
plt.title('Churn Count Distribution')
sns.countplot(data=telco_df, x='Churn', palette='seismic')
plt.savefig('assets/telco-customer-churn_01.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 5))
plt.title('Total Charges by Churn Count')

sns.boxenplot(
data=telco_df,
x='Churn',
y='TotalCharges',
hue='gender',
palette='seismic_r'
)
plt.savefig('assets/telco-customer-churn_02.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 5))
plt.title('Total Charges by Contract Type')

sns.violinplot(
data=telco_df,
x='Contract',
y='TotalCharges',
hue='Churn',
palette='seismic_r',
orient='v',
inner='quartile',
bw=0.3,
split=True
)
plt.legend(loc='lower right', title='Churn')
plt.savefig('assets/telco-customer-churn_03.webp', bbox_inches='tight')

Telco Customer Churn Dataset

# make values numeric to show correlation
telco_df_numeric = pd.get_dummies(telco_df[
[
'gender',
'SeniorCitizen',
'Partner',
'Dependents',
'PhoneService',
'MultipleLines',
'InternetService',
'OnlineSecurity',
'OnlineBackup',
'DeviceProtection',
'TechSupport',
'StreamingTV',
'StreamingMovies',
'Contract',
'PaperlessBilling',
'PaymentMethod',
'Churn'
]
], drop_first=True)

telco_df_numeric.head(5).transpose()
01234
SeniorCitizen00000
gender_Male01110
Partner_Yes10000
Dependents_Yes00000
PhoneService_Yes01101
MultipleLines_No phone service10010
MultipleLines_Yes00000
InternetService_Fiber optic00001
InternetService_No00000
OnlineSecurity_No internet service00000
OnlineSecurity_Yes01110
OnlineBackup_No internet service00000
OnlineBackup_Yes10100
DeviceProtection_No internet service00000
DeviceProtection_Yes01010
TechSupport_No internet service00000
TechSupport_Yes00010
StreamingTV_No internet service00000
StreamingTV_Yes00000
StreamingMovies_No internet service00000
StreamingMovies_Yes00000
Contract_One year01010
Contract_Two year00000
PaperlessBilling_Yes10101
PaymentMethod_Credit card (automatic)00000
PaymentMethod_Electronic check10001
PaymentMethod_Mailed check01100
Churn_Yes00101
plt.figure(figsize=(25,25))
plt.title('Dataset Correlation Matrix')
sns.heatmap(telco_df_numeric.corr(), annot=True, cmap='coolwarm')
plt.savefig('assets/telco-customer-churn_04.webp', bbox_inches='tight')

Telco Customer Churn Dataset

# correlation to label class
telco_df_numeric.corr(numeric_only=True)['Churn_Yes']
SeniorCitizen0.150541
gender_Male-0.008545
Partner_Yes-0.149982
Dependents_Yes-0.163128
PhoneService_Yes0.011691
MultipleLines_No phone service-
MultipleLines_Yes0.040033
InternetService_Fiber optic0.307463
InternetService_No-0.227578
OnlineSecurity_No internet service-0.227578
OnlineSecurity_Yes-0.171270
OnlineBackup_No internet service-0.227578
OnlineBackup_Yes-0.082307
DeviceProtection_No internet service-0.227578
DeviceProtection_Yes-0.066193
TechSupport_No internet service-0.227578
TechSupport_Yes-0.164716
StreamingTV_No internet service-0.227578
StreamingTV_Yes0.063254
StreamingMovies_No internet service-0.227578
StreamingMovies_Yes0.060860
Contract_One year-0.178225
Contract_Two year-0.301552
PaperlessBilling_Yes0.191454
PaymentMethod_Credit card (automatic-0.134687
PaymentMethod_Electronic check0.301455
PaymentMethod_Mailed check-0.090773
Churn_Yes1.000000
Name: Churn_Yes, dtype: float64
# create correlation dataframe
telco_df_numeric_corr = telco_df_numeric.corr(numeric_only=True)['Churn_Yes'][:-2].sort_values()
plt.figure(figsize=(10, 10))
plt.title('Festure Correlation to Customer Churn')

sns.barplot(
y=telco_df_numeric_corr.index,
x=telco_df_numeric_corr.values,
estimator='mean',
errorbar=None,
palette='coolwarm',
orient='h'
)

plt.savefig('assets/telco-customer-churn_05.webp', bbox_inches='tight')

Telco Customer Churn Dataset

Churn Analaysis

# segmenting customers based on their tenure
telco_df['Contract'].value_counts()

# Month-to-month 3875
# Two year 1685
# One year 1472
# Name: Contract, dtype: int64
plt.figure(figsize=(10, 5))
plt.title('Churn Distribution by Contract Type')

sns.histplot(
data=telco_df,
x='Contract',
hue='Churn',
element='step',
palette='seismic',
kde=True
)

plt.savefig('assets/telco-customer-churn_06.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 5))
plt.title('Churn Distribution by Tenure in Months')

sns.histplot(
data=telco_df,
x='tenure',
hue='Churn',
element='step',
palette='seismic',
kde=True
)

plt.savefig('assets/telco-customer-churn_07.webp', bbox_inches='tight')

Telco Customer Churn Dataset

grid = sns.catplot(
data=telco_df,
x='tenure',
kind='count',
hue='Churn',
palette='seismic',
col='Contract',
row='Churn'
)

grid.set(
xticks=np.arange(0,72,6),
xlabel='Tenure [Months]',
ylabel='Customer Count'
)

grid.fig.subplots_adjust(top=0.9)
grid.fig.suptitle('Tenure [months] by Contract Type and Churn')

plt.savefig('assets/telco-customer-churn_08.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 6))

plot = sns.scatterplot(
x='MonthlyCharges',
y='TotalCharges',
data=telco_df,
hue='Churn',
palette='seismic',
alpha=0.5
)

plot.set_title('Churn by Monthly and Total Charges')

plt.savefig('assets/telco-customer-churn_09.webp', bbox_inches='tight')

Telco Customer Churn Dataset

Tenure Cohort Study

# group customers by tenure and calculate average monthly charges
telco_df_tenure = telco_df.groupby('tenure').mean(numeric_only=True)['MonthlyCharges']
tenureMonthlyCharges
150.485808
257.206303
358.015000
457.432670
561.003759
...
6873.321000
6970.823158
7076.378992
7173.735588
7280.695856
Name: MonthlyCharges, Length: 72, dtype: float64
plt.figure(figsize=(10, 5))
plt.title('Average Monthly Charges by Tenure')
plt.xlabel('Tenure [Months]')
plt.ylabel('Mean Monthly Charges [$]')

sns.scatterplot(
x=telco_df_tenure.index,
y=telco_df_tenure.values,
color='#3b70f7'
)

plt.savefig('assets/telco-customer-churn_10.webp', bbox_inches='tight')

Telco Customer Churn Dataset

# adding missing columns
telco_df_numeric['Tenure'] = telco_df['tenure']
telco_df_numeric['Monthly Charges'] = telco_df['MonthlyCharges']
telco_df_numeric['Total Charges'] = telco_df['TotalCharges']
telco_df_numeric['Contract'] = telco_df['Contract']
telco_df_numeric[['Tenure', 'Churn_Yes']]
TenureChurn_Yes
010
1340
221
3450
421
...
7038240
7039720
7040110
704141
7042660
7032 rows × 2 columns
# group customers by tenure and calculate average churn rate
telco_df_tenure_Churn = telco_df_numeric.groupby('Tenure').mean(numeric_only=True)['Churn_Yes']
TenureAverage Churn Rate
10.619902
20.516807
30.470000
40.471591
50.481203
...
680.090000
690.084211
700.092437
710.035294
720.016575
Name: Churn_Yes, Length: 72, dtype: float64
fig, ax = plt.subplots(
figsize=(10,5)
)
ax.bar(
x=telco_df_tenure_Churn.index,
height=telco_df_tenure_Churn.values.round(4)*100,
color='#3b70f7'
)
ax.set_title('Churn Rate by Tenure in Months')
ax.set(
xlabel='Tenure [Months]',
ylabel='Churn Rate [%]',
xticks=np.arange(0,78,6)
)

fig.savefig('assets/telco-customer-churn_11.webp', bbox_inches='tight')

Telco Customer Churn Dataset

# group customers in chorts 0-12,12-24,24-48 and above 48 months tenure
def tenure_cohort(tenure):
if tenure < 13:
return '0-12 Months'
elif tenure < 25:
return '12-24 Months'
elif tenure < 49:
return '24-48 Months'
else:
return 'Over 48 Months'

telco_df_numeric['Tenure Cohort'] = telco_df_numeric['Tenure'].apply(tenure_cohort)
telco_df_numeric[['Tenure','Tenure Cohort']]
TenureTenure Cohort
010-12 Months
13424-48 Months
220-12 Months
34524-48 Months
420-12 Months
...
70382412-24 Months
703972Over 48 Months
7040110-12 Months
704140-12 Months
704266Over 48 Months
7032 rows × 2 columns
plt.figure(figsize=(10, 6))

plot = sns.scatterplot(
x='Monthly Charges',
y='Total Charges',
data=telco_df_numeric,
hue='Tenure Cohort',
palette='seismic_r',
alpha=0.5
)

plot.set_title('Monthly and Total Charges by Tenure Cohort')

plt.savefig('assets/telco-customer-churn_12.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(12, 5))
plt.title('Monthly Charges for Tenure Cohorts')

sns.boxenplot(
data=telco_df_numeric,
x='Tenure Cohort',
y='Monthly Charges',
hue='Churn_Yes',
palette='seismic_r'
)
plt.savefig('assets/telco-customer-churn_13.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 5))
plt.title('Total Charges for Tenure Cohorts')

sns.boxenplot(
data=telco_df_numeric,
x='Tenure Cohort',
y='Total Charges',
hue='Churn_Yes',
palette='seismic_r'
)
plt.savefig('assets/telco-customer-churn_14.webp', bbox_inches='tight')

Telco Customer Churn Dataset

plt.figure(figsize=(10, 5))
plt.title('Churn by Tenure Cohorts')

sns.countplot(
data=telco_df_numeric,
x='Tenure Cohort',
hue='Churn_Yes',
palette='seismic_r'
)
plt.savefig('assets/telco-customer-churn_15.webp', bbox_inches='tight')

Telco Customer Churn Dataset

grid = sns.catplot(
data=telco_df_numeric,
x='Tenure Cohort',
kind='count',
hue='Churn_Yes',
palette='seismic',
col='Contract'
)

grid.set(
# xticks=np.arange(0,72,6),
xlabel='Tenure Cohort',
ylabel='Customer Count'
)

grid.fig.subplots_adjust(top=0.9)
grid.fig.suptitle('Contract Type by Tenure Cohort and Churn')

plt.savefig('assets/telco-customer-churn_16.webp', bbox_inches='tight')

Telco Customer Churn Dataset

Predictive Modeling

Single Decision Tree

Baseline single tree classifier with default hyper parameters.

# remove target feature and non-numeric columns from training set
X_churn = telco_df_numeric.drop(['Churn_Yes','Tenure Cohort', 'Contract'], axis=1)
y_churn = telco_df_numeric['Churn_Yes']
# train/test split
X_churn_train, X_churn_test, y_churn_train, y_churn_test = train_test_split(
X_churn,
y_churn,
test_size=0.2
)
# fit the model
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_churn_train, y_churn_train)
# run predictions
y_tree_preds = tree_classifier.predict(X_churn_test)
# evaluate predictions
conf_mtx_tree = confusion_matrix(y_churn_test, y_tree_preds)

conf_mtx_tree_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_tree,
display_labels=[False,True]
)

conf_mtx_tree_plot.plot(cmap='winter')

plt.savefig('assets/telco-customer-churn_17.webp', bbox_inches='tight')

Telco Customer Churn Dataset

report_tree = classification_report(
y_churn_test, y_tree_preds
)
print(report_tree)
precisionrecallf1-scoresupport
00.800.810.811012
10.510.500.50395
accuracy0.721407
macro avg0.660.650.651407
weighted avg0.720.720.721407
# feature importance for classification
tree_index = ['importance']
tree_data_columns = pd.Series(X_churn.columns)
tree_importance_array = tree_classifier.feature_importances_
tree_importance_df = pd.DataFrame(tree_importance_array, tree_data_columns, tree_index)
tree_importance_df
importance
SeniorCitizen0.019747
gender_Male0.022160
Partner_Yes0.018070
Dependents_Yes0.016844
PhoneService_Yes0.002513
MultipleLines_No phone service0.001701
MultipleLines_Yes0.015637
InternetService_Fiber optic0.106278
InternetService_No0.000000
OnlineSecurity_No internet service0.000000
OnlineSecurity_Yes0.015796
OnlineBackup_No internet service0.000000
OnlineBackup_Yes0.015036
DeviceProtection_No internet service0.000000
DeviceProtection_Yes0.013911
TechSupport_No internet service0.004314
TechSupport_Yes0.023056
StreamingTV_No internet service0.006463
StreamingTV_Yes0.012094
StreamingMovies_No internet service0.000000
StreamingMovies_Yes0.010126
Contract_One year0.008873
Contract_Two year0.007519
PaperlessBilling_Yes0.023079
PaymentMethod_Credit card (automatic)0.010816
PaymentMethod_Electronic check0.022332
PaymentMethod_Mailed check0.009718
Tenure0.192801
MonthlyCharges0.085796
TotalCharges0.121363
Monthly Charges0.114721
Total Charges0.099236
tree_importance_df.sort_values(
by='importance',
ascending=True
).plot(
kind='barh',
title='Feature Importance for Churn Classification',
figsize=(10,10)
)

plt.savefig('assets/telco-customer-churn_18.webp', bbox_inches='tight')

Telco Customer Churn Dataset

Random Forest

Random forest with grid search for optimal hyper parameter.

rdnfor_classifier = RandomForestClassifier()

param_grid = {
'n_estimators': [5, 25, 50, 75,100, 125],
'min_samples_split': [1,2,3],
'min_samples_leaf': [1,2,3],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_features' : ['sqrt', 'log2']
}

grid_search_rdnfor = GridSearchCV(
estimator = rdnfor_classifier,
param_grid = param_grid
)

grid_search_rdnfor.fit(X_churn_train, y_churn_train)
print('Best Parameter: ', grid_search_rdnfor.best_params_)
# Best Parameter: {'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 125}
# run predictions
y_rdnfor_preds = grid_search_rdnfor.predict(X_churn_test)
# evaluate predictions
conf_mtx_rdnfor = confusion_matrix(y_churn_test, y_rdnfor_preds)

conf_mtx_rdnfor_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_rdnfor,
display_labels=[False,True]
)

conf_mtx_rdnfor_plot.plot(cmap='winter')

plt.savefig('assets/telco-customer-churn_19.webp', bbox_inches='tight')

Telco Customer Churn Dataset

report_rdnfor = classification_report(
y_churn_test, y_rdnfor_preds
)
print(report_rdnfor)
precisionrecallf1-scoresupport
00.840.910.871012
10.700.550.61395
accuracy0.811407
macro avg0.770.730.741407
weighted avg0.800.810.801407

Adaptive Boosting

Baseline AdaBoost classifier with default hyper parameters.

abc_classifier = AdaBoostClassifier(
estimator=None,
n_estimators=50,
learning_rate=1.0
)

abc_classifier.fit(X_churn_train, y_churn_train)

abc_classifier_preds = abc_classifier.predict(X_churn_test)
# run predictions
y_abc_preds = abc_classifier.predict(X_churn_test)
# evaluate predictions
conf_mtx_abc = confusion_matrix(y_churn_test, y_abc_preds)

conf_mtx_abc_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_abc,
display_labels=[False,True]
)

conf_mtx_abc_plot.plot(cmap='winter')

plt.savefig('assets/telco-customer-churn_21.webp', bbox_inches='tight')

Telco Customer Churn Dataset

report_abc = classification_report(
y_churn_test, y_abc_preds
)
print(report_abc)
precisionrecallf1-scoresupport
00.850.910.881012
10.720.580.64395
accuracy0.821407
macro avg0.780.750.761407
weighted avg0.810.820.811407
# feature importance for classification
abc_index = ['importance']
abc_data_columns = pd.Series(X_churn.columns)
abc_importance_array = abc_classifier.feature_importances_
abc_importance_df = pd.DataFrame(abc_importance_array, abc_data_columns, abc_index)
abc_importance_df
abc_importance_df.sort_values(
by='importance',
ascending=True
).plot(
kind='barh',
title='Feature Importance for Churn Classification',
figsize=(10,10)
)

plt.savefig('assets/telco-customer-churn_22.webp', bbox_inches='tight')

Telco Customer Churn Dataset

Gradient Boosting

GradientBoost Classifier with grid search for optimal hyper parameter.

gb_classifier = GradientBoostingClassifier()
param_grid = {
'n_estimators': [50, 100, 150],
'learning_rate': [0.05,0.1,0.2],
'max_depth': [2,3,4,5]
}
grid_search_gb = GridSearchCV(gb_classifier, param_grid)
grid_search_gb.fit(X_churn_train, y_churn_train)
grid_search_gb.best_params_
# {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
# run predictions
y_gb_preds = grid_search_gb.predict(X_churn_test)
# evaluate predictions
conf_mtx_gb = confusion_matrix(y_churn_test, y_gb_preds)

conf_mtx_gb_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_gb,
display_labels=[False,True]
)

conf_mtx_gb_plot.plot(cmap='winter')

plt.savefig('assets/telco-customer-churn_23.webp', bbox_inches='tight')

Telco Customer Churn Dataset

report_gb = classification_report(
y_churn_test, y_gb_preds
)
print(report_gb)
precisionrecallf1-scoresupport
00.840.920.881012
10.730.560.63395
accuracy0.821407
macro avg0.790.740.751407
weighted avg0.810.820.811407

Support Vector Classifier

# normalize feature values
sc_churn = StandardScaler()
X_churn_train_scaled=sc_churn.fit_transform(X_churn_train)
X_churn_test_scaled=sc_churn.transform(X_churn_test)
# SVM classifier fitting
svc_classifier = svm.SVC()
svc_classifier.fit(X_churn_train_scaled, y_churn_train)
# Predictions
y_svc_pred = svc_classifier.predict(X_churn_test_scaled)
# evaluate predictions
conf_mtx_svc = confusion_matrix(y_churn_test, y_svc_pred)

conf_mtx_svc_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_svc,
display_labels=[False,True]
)

conf_mtx_svc_plot.plot(cmap='winter')

plt.savefig('assets/telco-customer-churn_24.webp', bbox_inches='tight')

Telco Customer Churn Dataset

report_svc = classification_report(
y_churn_test, y_svc_pred
)
print(report_svc)
precisionrecallf1-scoresupport
00.830.920.871012
10.720.500.59395
accuracy0.811407
macro avg0.770.710.731407
weighted avg0.800.810.791407