# Load the required packages
# ggplot2 draws the EDA charts; caret provides train(), trainControl() and
# confusionMatrix(); Hmisc provides rcorr.cens(); ROCR provides
# performance(); class provides the k-NN classifier; e1071 provides
# naiveBayes() and tune().
library(ggplot2)
library(caret)
library(Hmisc)
library(ROCR)
library(class)
library(e1071)

# Folder that holds the three input CSV files. The files are read through
# file.path() so the script does not have to change the session's working
# directory.
data_dir <- "F:/IIITB - Upgrad/Course 3 - Predictive Analysis/Case Study"

# Checkpoint 1
# Load the given files. Text columns are kept as character vectors.
churn_data <- read.csv(
  file.path(data_dir, "churn_data.csv"),
  stringsAsFactors = FALSE
)
customer_data <- read.csv(
  file.path(data_dir, "customer_data.csv"),
  stringsAsFactors = FALSE
)
internet_data <- read.csv(
  file.path(data_dir, "internet_data.csv"),
  stringsAsFactors = FALSE
)

# Collate the 3 files in a single file by joining on the customer identifier.
merge1 <- merge(churn_data, customer_data, by = "customerID")
churn <- merge(merge1, internet_data, by = "customerID")
# Understand the structure of the collated file.
str(churn)

# Checkpoint 2 - EDA
# Make bar charts to find interesting relationships between variables.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), which also gives clean axis and legend
# titles.

# Distribution of monthly charges along with churn
ggplot(churn, aes(x = MonthlyCharges, fill = Churn)) +
  geom_histogram()

# Distribution of tenure along with churn
ggplot(churn, aes(x = tenure, fill = Churn)) +
  geom_histogram()

# Gender wise churn
ggplot(churn, aes(x = Churn, fill = gender)) +
  geom_bar()

# Internet service wise churn
ggplot(churn, aes(x = InternetService, fill = Churn)) +
  geom_bar()

# Payment method wise churn
ggplot(churn, aes(x = PaymentMethod, fill = Churn)) +
  geom_bar()

# Tech support wise churn
ggplot(churn, aes(x = TechSupport, fill = Churn)) +
  geom_bar()
# Checkpoint 3 - Data Preparation
# Compute the box-plot statistics of the numeric variables to look for
# outliers; the `out` element of each result lists the outlying values.
boxplot.stats(churn$tenure) # No Outlier
boxplot.stats(churn$MonthlyCharges) # No Outlier
boxplot.stats(churn$TotalCharges) # No Outlier

# Perform De-Duplication if required
# duplicated() already returns a logical vector, so which() gives the row
# numbers of repeated rows directly.
which(duplicated(churn)) # No Duplicates

# Impute the missing values, and perform the outlier treatment.
# Number of missing values in every column.
vapply(churn, function(x) sum(is.na(x)), integer(1))

# Replace the missing TotalCharges entries with the mean of the observed
# values.
churn$TotalCharges[is.na(churn$TotalCharges)] <-
  mean(churn$TotalCharges, na.rm = TRUE)
# CHECKPOINT 4: Modeling
# Model 1: Logistic Regression
# Creating object for modeling: start from the three numeric variables.
churn_df <- data.frame(
  tenure = churn$tenure,
  MonthlyCharges = churn$MonthlyCharges,
  TotalCharges = churn$TotalCharges
)

# Bring the variables in the correct format
# Every categorical variable is expanded with model.matrix(~ var - 1), which
# yields one 0/1 indicator column per level. as.data.frame() keeps the
# generated column names unchanged (e.g. "ContractMonth-to-month"), which is
# how the model formulas further down refer to them.

# Two-level variables: the first indicator column is dropped and the
# remaining one is stored under a dummy_* name.
dummy_PhoneService <-
  as.data.frame(model.matrix(~ PhoneService - 1, data = churn))
churn_df$dummy_PhoneService <- dummy_PhoneService[, -1]

dummy_PaperlessBilling <-
  as.data.frame(model.matrix(~ PaperlessBilling - 1, data = churn))
churn_df$dummy_PaperlessBilling <- dummy_PaperlessBilling[, -1]

# dummy_Churn is the response of the logistic regression and must stay the
# 6th column of churn_df.
dummy_Churn <- as.data.frame(model.matrix(~ Churn - 1, data = churn))
churn_df$dummy_Churn <- dummy_Churn[, -1]

dummy_gender <- as.data.frame(model.matrix(~ gender - 1, data = churn))
churn_df$dummy_gender <- dummy_gender[, -1]

# Senior Citizen data is already in 0-1 format, changing it to numeric
churn_df$dummy_SeniorCitizen <- as.numeric(churn$SeniorCitizen)

dummy_Partner <- as.data.frame(model.matrix(~ Partner - 1, data = churn))
churn_df$dummy_Partner <- dummy_Partner[, -1]

dummy_Dependents <-
  as.data.frame(model.matrix(~ Dependents - 1, data = churn))
churn_df$dummy_Dependents <- dummy_Dependents[, -1]

# Multi-level variables: one indicator column is dropped as the reference
# level and the rest are appended under their generated names.
dummy_contract <- as.data.frame(model.matrix(~ Contract - 1, data = churn))
churn_df <- cbind(churn_df, dummy_contract[, -3])

dummy_PaymentMethod <-
  as.data.frame(model.matrix(~ PaymentMethod - 1, data = churn))
churn_df <- cbind(churn_df, dummy_PaymentMethod[, -4])

dummy_InternetService <-
  as.data.frame(model.matrix(~ InternetService - 1, data = churn))
churn_df <- cbind(churn_df, dummy_InternetService[, -3])

dummy_OnlineSecurity <-
  as.data.frame(model.matrix(~ OnlineSecurity - 1, data = churn))
churn_df <- cbind(churn_df, dummy_OnlineSecurity[, -3])

dummy_OnlineBackup <-
  as.data.frame(model.matrix(~ OnlineBackup - 1, data = churn))
churn_df <- cbind(churn_df, dummy_OnlineBackup[, -3])

dummy_multiplelines <-
  as.data.frame(model.matrix(~ MultipleLines - 1, data = churn))
churn_df <- cbind(churn_df, dummy_multiplelines[, -3])

dummy_DeviceProtection <-
  as.data.frame(model.matrix(~ DeviceProtection - 1, data = churn))
churn_df <- cbind(churn_df, dummy_DeviceProtection[, -3])

dummy_TechSupport <-
  as.data.frame(model.matrix(~ TechSupport - 1, data = churn))
churn_df <- cbind(churn_df, dummy_TechSupport[, -3])

dummy_StreamingTV <-
  as.data.frame(model.matrix(~ StreamingTV - 1, data = churn))
churn_df <- cbind(churn_df, dummy_StreamingTV[, -3])

dummy_StreamingMovies <-
  as.data.frame(model.matrix(~ StreamingMovies - 1, data = churn))
churn_df <- cbind(churn_df, dummy_StreamingMovies[, -3])
# Split data set: a random 70% of the rows is used for training, the
# remaining rows for testing.
# NOTE(review): no seed is set, so the split (and every figure derived from
# it) changes between runs.
train.indices <- sample(seq_len(nrow(churn_df)), 0.7 * nrow(churn_df))
train_data <- churn_df[train.indices, ]
test_data <- churn_df[-train.indices, ]

# Initial Model with all variables
initial_model <- glm(dummy_Churn ~ ., data = train_data, family = "binomial")

# Stepwise selection
step_model <- step(initial_model, direction = "both")
# Formula recorded after the stepwise search (presumably the model it
# selected on the original run); model1 below is this formula without tenure:
#   dummy_Churn ~ tenure + MonthlyCharges + TotalCharges +
#     dummy_PhoneService + dummy_PaperlessBilling + dummy_SeniorCitizen +
#     `ContractMonth-to-month` + `ContractOne year` +
#     `PaymentMethodElectronic check` + InternetServiceDSL +
#     OnlineSecurityNo + OnlineBackupNo + TechSupportNo

# Remove Tenure (High VIF)
model1 <- glm(
  dummy_Churn ~ MonthlyCharges + TotalCharges + dummy_PhoneService +
    dummy_PaperlessBilling + dummy_SeniorCitizen +
    `ContractMonth-to-month` + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + OnlineBackupNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

# Remove ContractMonth-to-month (High VIF)
model2 <- glm(
  dummy_Churn ~ MonthlyCharges + TotalCharges + dummy_PhoneService +
    dummy_PaperlessBilling + dummy_SeniorCitizen + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + OnlineBackupNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

# Remove OnlineBackupNo (Insignificant)
model3 <- glm(
  dummy_Churn ~ MonthlyCharges + TotalCharges + dummy_PhoneService +
    dummy_PaperlessBilling + dummy_SeniorCitizen + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

# Checking correlation between MonthlyCharges & TotalCharges
cor(train_data$MonthlyCharges, train_data$TotalCharges)

# Since strong correlation, removing MonthlyCharges
model4 <- glm(
  dummy_Churn ~ TotalCharges + dummy_PhoneService +
    dummy_PaperlessBilling + dummy_SeniorCitizen + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

# Removing dummy_PhoneService (low significance)
model5 <- glm(
  dummy_Churn ~ TotalCharges + dummy_PaperlessBilling +
    dummy_SeniorCitizen + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

# Removing dummy_SeniorCitizen (low significance)
model6 <- glm(
  dummy_Churn ~ TotalCharges + dummy_PaperlessBilling + `ContractOne year` +
    `PaymentMethodElectronic check` + InternetServiceDSL +
    OnlineSecurityNo + TechSupportNo,
  data = train_data,
  family = "binomial"
)

best_model_log <- model6

## C-statistic
# Without `newdata`, predict() returns the fitted probabilities of the rows
# the model was trained on.
train_data$predicted_prob <- predict(best_model_log, type = "response")
rcorr.cens(train_data$predicted_prob, train_data$dummy_Churn)

test_data$predicted_prob <-
  predict(best_model_log, newdata = test_data, type = "response")
rcorr.cens(test_data$predicted_prob, test_data$dummy_Churn)

## KS-statistic: largest gap between true-positive and false-positive rate
# Train Data
model_score <- prediction(train_data$predicted_prob, train_data$dummy_Churn)
model_perf <- performance(model_score, "tpr", "fpr")
ks_table <-
  attr(model_perf, "y.values")[[1]] - attr(model_perf, "x.values")[[1]]
ks <- max(ks_table)
which(ks_table == ks)

# Test Data
model_score_test <-
  prediction(test_data$predicted_prob, test_data$dummy_Churn)
model_perf_test <- performance(model_score_test, "tpr", "fpr")
ks_table_test <-
  attr(model_perf_test, "y.values")[[1]] -
  attr(model_perf_test, "x.values")[[1]]
ks_test <- max(ks_table_test)
which(ks_table_test == ks_test)

# Selecting threshold value
# ROC curve
plot(model_perf, col = "red", lab = c(10, 10, 10))
plot(model_perf_test, col = "red", lab = c(10, 10, 10))

# Confusion matrix of the 0/1 response against the predicted probability cut
# at `cutoff`. Both vectors are turned into factors with levels 0 and 1
# because confusionMatrix() expects factors that share the same levels; the
# explicit levels keep the table 2 x 2 even when one class is never
# predicted.
threshold_confusion <- function(scored_data, cutoff) {
  predicted_class <- factor(
    as.numeric(scored_data$predicted_prob > cutoff),
    levels = c(0, 1)
  )
  actual_class <- factor(scored_data$dummy_Churn, levels = c(0, 1))
  confusionMatrix(predicted_class, actual_class, positive = "1")
}

# confusion matrix 1 (Threshold Value = 0.5)
threshold_confusion(train_data, 0.5)
threshold_confusion(test_data, 0.5)

# confusion matrix 2 (Threshold Value = 0.3)
threshold_confusion(train_data, 0.3)
threshold_confusion(test_data, 0.3)

# confusion matrix 3 (Threshold Value = 0.7)
threshold_confusion(train_data, 0.7)
threshold_confusion(test_data, 0.7)
# Model 2
# K-NN Model:
# Creating "churn_knn" dataframe from "churn_df": the 0/1 response column is
# removed and the original Churn labels are attached as a factor instead.
churn_knn <- churn_df[, names(churn_df) != "dummy_Churn"]
churn_knn$Churn <- as.factor(churn$Churn)

# Bring the data in the correct format to implement K-NN model.
# scale() returns a one-column matrix; as.numeric() stores it as an ordinary
# numeric column.
# NOTE(review): tenure is left unscaled, so it weighs more heavily in the
# distance than the standardised charges - confirm this is intended.
churn_knn$MonthlyCharges <- as.numeric(scale(churn_knn$MonthlyCharges))
churn_knn$TotalCharges <- as.numeric(scale(churn_knn$TotalCharges))

# Implement the K-NN model for optimal K.
s1 <- sample(seq_len(nrow(churn_knn)), 0.7 * nrow(churn_knn))
churn_knn_train <- churn_knn[s1, ]
churn_knn_test <- churn_knn[-s1, ]

# Class labels of the training rows
cl <- churn_knn_train$Churn

# Removing the class label "Churn" from the train and test data set. The
# column is selected by name rather than by position so the code keeps
# working if the number of predictors changes.
churn_knn_train1 <- churn_knn_train[, names(churn_knn_train) != "Churn"]
churn_knn_test1 <- churn_knn_test[, names(churn_knn_test) != "Churn"]

# Using the train() command to find the best K: every k from 1 to 50 is
# scored by accuracy under 10-fold cross-validation repeated 10 times.
model <- train(
  Churn ~ .,
  data = churn_knn_train,
  method = "knn",
  tuneGrid = expand.grid(k = 1:50),
  metric = "Accuracy",
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 10
  )
)

# Generating the plot of the model
# Looking at graph we can see that optimum K = 19 (Post which not much
# difference in accuracy)
plot(model)

# Creating the model with the optimum K
impknn2 <- knn(
  churn_knn_train1,
  churn_knn_test1,
  cl,
  k = 19,
  prob = TRUE
)
table(impknn2, churn_knn_test$Churn)
confusionMatrix(impknn2, churn_knn_test$Churn, positive = "Yes")

# KNN Model Solution
# Accuracy = 79.32%
# Sensitivity = 50.98%
# Specificity = 89.51%
# Model 3
# Naive Bayes Model:
# Using the original data set
# Bring the data in the correct format to implement Naive Bayes algorithm.
# Every categorical column is converted to a factor built from its own
# values.
# NOTE(review): customerID is a per-row identifier; it is converted here but
# probably should not be used as a predictor - confirm.
factor_columns <- c(
  "customerID", "PhoneService", "Contract", "PaperlessBilling",
  "PaymentMethod", "Churn", "gender", "Partner", "Dependents",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "TechSupport", "StreamingTV", "StreamingMovies", "DeviceProtection"
)
churn[factor_columns] <- lapply(churn[factor_columns], factor)

# Random 70/30 split into training and test rows
s <- sample(seq_len(nrow(churn)), 0.7 * nrow(churn))
churn_NB_train <- churn[s, ]
churn_NB_test <- churn[-s, ]

# Test predictors only: the Churn label is removed by name.
churn_NB_test1 <- churn_NB_test[, names(churn_NB_test) != "Churn"]

# Implement the Naive Bayes algorithm.
# The response is written as the bare column name so that `.` expands to
# every other column and Churn itself is not used as a predictor.
model <- naiveBayes(Churn ~ ., data = churn_NB_train)
pred <- predict(model, churn_NB_test1)
table(pred, churn_NB_test$Churn)
confusionMatrix(pred, churn_NB_test$Churn)

# Naive Bayes Solution
# - Accuracy = 71.46%
# - Sensitivity = 68.79%
# - Specificity = 78.89%
# - ROC Curve
# Model 4
# SVM:
# Bring the data in the correct format to implement the SVM algorithm.
# Taking data frame from Logistic Regression model: the 0/1 response column
# is removed and the original Churn labels are attached as a factor.
churn_svm <- churn_df[, names(churn_df) != "dummy_Churn"]
churn_svm$Churn <- as.factor(churn$Churn)

# Random 70/30 split of the SVM data frame into training and test rows
svm.set <- sample(seq_len(nrow(churn_svm)), 0.7 * nrow(churn_svm))
churn_svm_train <- churn_svm[svm.set, ]
churn_svm_test <- churn_svm[-svm.set, ]

# Implement the SVM algorithm using the optimal cost.
# model 0 with cost = 0.1
# model.svm.0 = svm(churn_svm_train$Churn~., data = churn_svm_train, kernel = "linear", cost = 0.1, scale = F)

# finding the optimal value of cost using cross-validation using the tune
# function: a linear-kernel SVM is fitted for every cost in the grid.
tune.svm <- tune(
  svm,
  Churn ~ .,
  data = churn_svm_train,
  kernel = "linear",
  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100))
)
bestmodel_svm <- tune.svm$best.model
bestmodel_svm # Best Performance is for Cost = 0.01

# predicting test classes using the best model and analyzing the table
svm.predict <- predict(bestmodel_svm, churn_svm_test)
table(svm.predict, churn_svm_test$Churn)
confusionMatrix(svm.predict, churn_svm_test$Churn)
# Accuracy: 78.99
# Sensitivity: 88.22
# Specificity: 53.31

# Plotting the SVM predictions against the actual classes
plot(svm.predict, churn_svm_test$Churn)

In the telecom industry, customers can choose from multiple service providers and actively switch from one operator to another. In this highly competitive market, the telecommunications industry experiences an average annual churn rate of 15-25%. Given that it costs 5-10 times more to acquire a new customer than to retain an existing one, customer retention has now become even more important than customer acquisition.

For many incumbent operators, retaining highly profitable customers is the number one business goal.

Background: To reduce customer churn, telecom companies need to predict which customers are at high risk of churn. We have been hired by a telecom industry giant to look at customer level data and identify customers at high risk of churn and identify the main indicators of churn.

Problem Statement: We need to build a predictive model using advanced Machine Learning algorithms in order to predict the customers at high risk of churn along with the key indicators of churn.

Link to the project code

This case study was completed with the help of my teammate Koushal Deshpande. Thanks, Koushal, for your help and your key insights!

