import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import f1_score
# show all columns when displaying a dataframe
pd.set_option("display.max_columns", None)
# cap the number of rows pandas displays
pd.set_option("display.max_rows", 30)
# load the raw telecom churn dataset
df = pd.read_csv('./data/raw/telecom_churn.csv')
# quick look: dimensions (df.shape is a tuple), first/last rows, column dtypes
type(df.shape)
df.shape
df.head()
df.tail()
df.dtypes
# give the abbreviated columns readable names
df.rename(
    columns={
        "Int'l Plan": "International plan",
        "VMail Plan": "Voicemail plan",
        "VMail Message": "Voicemail messages",
        "Eve Mins": "Evening mins",
        "Eve Calls": "Evening calls",
        "Eve Charge": "Evening charge",
        "Intl Mins": "International minutes",
        "Intl Calls": "International calls",
        "Intl Charge": "International charge",
        "CustServ Calls": "Customer service calls",
        "Churn?": "Churn",
    },
    inplace=True,
)
# the raw Churn column holds the strings "False." and "True."; map them to 0/1
mapping_dictionary = {
    "False.": 0,
    "True.": 1
}
df.replace({"Churn": mapping_dictionary}, inplace=True)
# lowercase all column names for easier access
df.columns = df.columns.str.lower()
# class balance: absolute counts and proportions
df['churn'].value_counts()
df['churn'].value_counts(normalize=True)
# compare feature means overall vs. for churned customers only
# (churn is 0/1 after the mapping above, so filter on 1)
df.mean(numeric_only=True)
df[df["churn"] == 1].mean(numeric_only=True)
# distributions of account length and customer service calls, split by churn
sns.displot(x="account length", data=df, hue='churn');
sns.displot(x="customer service calls", data=df, hue='churn');
df["total charge"] = df["day charge"] + df["evening charge"] + df["night charge"]
df.columns
df["cs call rate"] = df["customer service calls"] / df["account length"]
df["cs call rate"].describe()
df.iloc[df["cs call rate"].argmax()]
# missing-value check and summary statistics
df.isnull().sum()
df.describe()
df.dtypes
# keep the numeric features; area code is categorical and churn is the target
df_numerical = df.select_dtypes('number').drop(['area code', 'churn'], axis=1)
df_numerical.columns
df["state"].value_counts()
df.drop("state", axis=1, inplace=True)
df.dtypes
df["phone"]
df.drop("phone", inplace=True, axis=1)
df.dtypes
df.select_dtypes('object')
# LabelEncoder is designed for targets, but it is a convenient way to turn these
# two binary yes/no columns into 0/1
label_encoder = LabelEncoder()
label_encoder.fit_transform(df["international plan"])
# side-by-side check of original vs. encoded values
pd.DataFrame({"Original": df["international plan"],
              "Encoded": label_encoder.fit_transform(df["international plan"])})
df_label_encoded = pd.DataFrame()
df_label_encoded["international plan"] = label_encoder.fit_transform(df["international plan"])
df_label_encoded["voicemail plan"] = label_encoder.fit_transform(df["voicemail plan"])
df_label_encoded
df["area code"].value_counts()
df_one_hot_encoded = pd.get_dummies(df["area code"])
df_one_hot_encoded
df_one_hot_encoded.rename(columns={408: "area_408", 415: "area_415", 510: "area_510"}, inplace=True)
df_one_hot_encoded
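# an equivalent sketch with sklearn's OneHotEncoder (imported above): unlike
# pd.get_dummies, it remembers the fitted categories, which matters once the same
# encoding must be reapplied to new data. Assumes scikit-learn >= 1.2 for the
# sparse_output keyword; older releases use sparse=False instead.
one_hot_encoder = OneHotEncoder(sparse_output=False, dtype=int)
area_code_encoded = one_hot_encoder.fit_transform(df[["area code"]])
pd.DataFrame(area_code_encoded,
             columns=one_hot_encoder.get_feature_names_out(["area code"]),
             index=df.index)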
# assemble the final feature matrix: numeric features plus encoded categoricals
df_encoded = pd.concat([df_label_encoded, df_one_hot_encoded], axis=1)
df_encoded
df_final = pd.concat([df_numerical, df_encoded], axis=1)
df_final
# 80/20 split, stratified on churn so both sets keep the same class ratio
x_train, x_test, y_train, y_test = train_test_split(df_final, df["churn"],
                                                    test_size=0.2, random_state=0,
                                                    stratify=df["churn"])
# verify the stratification
y_train.value_counts(normalize=True)
# fit the scaler on the training set only, then apply it to both sets; passing
# the original index keeps the rows aligned with y_train/y_test
standard_scaler = StandardScaler()
standard_scaler.fit(x_train)
x_train = pd.DataFrame(standard_scaler.transform(x_train),
                       columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(standard_scaler.transform(x_test),
                      columns=x_test.columns, index=x_test.index)
x_train.describe()
# naive baseline: predict "no churn" (0) for everyone; accuracy looks decent on
# an imbalanced set, but F1 and recall are zero, which is why accuracy alone misleads
y_pred = np.zeros(len(y_test))
f1_score(y_test, y_pred)
accuracy_score(y_test, y_pred)
recall_score(y_test, y_pred)
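# full per-class breakdown of the baseline with classification_report (imported
# above); zero_division=0 silences the warning for the never-predicted class
print(classification_report(y_test, y_pred, zero_division=0))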
# logistic regression; class_weight='balanced' reweights samples to offset the
# churn class imbalance
log_reg_model = LogisticRegression(class_weight='balanced', random_state=0)
log_reg_model.fit(x_train, y_train)
# F1 and recall on both splits to compare train vs. test behaviour
f1_score(y_test, log_reg_model.predict(x_test))
f1_score(y_train, log_reg_model.predict(x_train))
recall_score(y_train, log_reg_model.predict(x_train))
recall_score(y_test, log_reg_model.predict(x_test))
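# where the errors land: confusion matrix of the logistic regression on the test
# set, using the ConfusionMatrixDisplay imported above (requires scikit-learn >= 1.0)
ConfusionMatrixDisplay.from_estimator(log_reg_model, x_test, y_test)
plt.show()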
# 5-fold cross-validation on the training set, scored by F1 and by recall
scores_log_reg = cross_val_score(log_reg_model, x_train, y_train, cv=5, scoring='f1')
print("Average F1-score for logistic regression: %.2f, standard deviation: %.2f" %
      (scores_log_reg.mean(), scores_log_reg.std()))
scores_log_reg
scores_log_reg_r = cross_val_score(log_reg_model, x_train, y_train, cv=5, scoring='recall')
print("Average recall score for logistic regression: %.2f, standard deviation: %.2f" %
      (scores_log_reg_r.mean(), scores_log_reg_r.std()))
scores_log_reg_r
# random forest with 10 trees as a second model; train-set F1 alone is
# optimistic, hence the cross-validation below
rf = RandomForestClassifier(random_state=0, n_estimators=10)
rf.fit(x_train, y_train)
f1_score(y_train, rf.predict(x_train))
scores_rf = cross_val_score(rf, x_train, y_train, cv=5, scoring='f1')
scores_rf
scores_rf.mean()
# grid search over the number of trees, optimizing F1
dico_param = {"n_estimators": [20, 90, 100, 105]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=0),
                           dico_param,
                           scoring="f1",
                           cv=5)
grid_search.fit(x_train, y_train)
# best cross-validated F1, then the winning model's F1 on the held-out test set
grid_search.best_score_
f1_score(y_test, grid_search.best_estimator_.predict(x_test))
grid_search.best_params_
grid_search.cv_results_
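# the raw cv_results_ dict is hard to scan; a DataFrame view of the per-candidate
# scores is easier to read (the column names below are standard cv_results_ keys)
pd.DataFrame(grid_search.cv_results_)[["param_n_estimators", "mean_test_score", "std_test_score"]]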
recall_score(y_test, grid_search.best_estimator_.predict(x_test))
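# precision-recall trade-off of the tuned forest on the test set, via the
# PrecisionRecallDisplay imported above (requires scikit-learn >= 1.0)
PrecisionRecallDisplay.from_estimator(grid_search.best_estimator_, x_test, y_test)
plt.show()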
from joblib import dump, load  # joblib persists fitted sklearn objects
dump(standard_scaler, "./models/scaler.joblib")  # save the fitted scaler
dump(grid_search.best_estimator_, './models/my_rf_model.joblib')  # save the tuned model
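# round-trip check with the `load` imported above: reload both artifacts and
# confirm the reloaded model scores the same on the test set
reloaded_scaler = load("./models/scaler.joblib")
reloaded_model = load("./models/my_rf_model.joblib")
f1_score(y_test, reloaded_model.predict(x_test))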