# Standard
import pandas as pd
import numpy as np
# Pycaret
from pycaret.classification import *
# Plots
from plotly.offline import iplot
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import seaborn as sns
# Sklearn tools
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")
# Datapath and Setup
data_path = "/datasets/telco-customer-churn/"
random_seed = 142
# Helper functions for structured data
## Get info about the dataset
def dataset_info(dataset, dataset_name: str):
    """Print a quick structural summary of *dataset*.

    Reports sample/column counts, per-column dtypes, columns with missing
    values (if any), and approximate memory footprint.

    Args:
        dataset: pandas DataFrame to inspect.
        dataset_name: human-readable name used in the report header.
    """
    print(f"Dataset Name: {dataset_name} | Number of Samples: {dataset.shape[0]} | Number of Columns: {dataset.shape[1]}")
    print(30*"=")
    print("Column Data Type")
    print(dataset.dtypes)
    print(30*"=")
    missing_data = dataset.isnull().sum()
    if missing_data.sum() > 0:
        # Only show the columns that actually have missing entries.
        print(missing_data[missing_data.values > 0])
    else:
        print("No Missing Data on this Dataset!")
    print(30*"=")
    # 1e6 bytes per MB (the original wrote this as the equivalent 10e5).
    print(f"Memory Usage: {np.round(dataset.memory_usage(index=True).sum() / 1e6, 3)} MB")
## Dataset Sampling
def data_sampling(dataset, frac: float, random_seed: int):
    """Randomly split *dataset* into two disjoint, re-indexed frames.

    Args:
        dataset: pandas DataFrame to split.
        frac: fraction of rows placed in the first returned frame.
        random_seed: seed passed to pandas for reproducible sampling.

    Returns:
        Tuple (sampled, remainder) — the sampled fraction and everything
        else, each with a fresh RangeIndex.
    """
    sampled = dataset.sample(frac=frac, random_state=random_seed)
    remainder = dataset.drop(sampled.index).reset_index(drop=True)
    return sampled.reset_index(drop=True), remainder
## Bar Plot
def bar_plot(data, plot_title: str, x_axis: str, y_axis: str):
    """Draw an offline Plotly bar chart from a pandas Series.

    Bar labels come from the Series index, heights from its values; the
    first bar is highlighted orange, the rest are blue.
    """
    # One color per bar; emphasize the leading (first) category.
    bar_colors = ["#0080ff"] * len(data)
    bar_colors[0] = "#ff8000"
    figure = go.Figure(
        data=go.Bar(
            y=data.values,
            x=data.index,
            text=data.values,
            marker_color=bar_colors,
        ),
        layout=go.Layout(
            autosize=False,
            height=600,
            title={
                "text": plot_title,
                "y": 0.9,
                "x": 0.5,
                "xanchor": "center",
                "yanchor": "top",
            },
            xaxis={"title": x_axis},
            yaxis={"title": y_axis},
        ),
    )
    figure.update_layout(template="simple_white")
    figure.update_traces(
        textposition="outside",
        textfont_size=14,
        marker=dict(line=dict(color="#000000", width=2)),
    )
    figure.update_yaxes(automargin=True)
    iplot(figure)
## Plot Pie Chart
def pie_plot(data, plot_title: str):
    """Draw an offline Plotly pie chart from a pandas Series.

    Slice labels come from the Series index, sizes from its values.
    """
    pie_trace = go.Pie(labels=data.index, values=data.values)
    pie_layout = go.Layout(
        autosize=False,
        title={
            "text": plot_title,
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
    )
    figure = go.Figure(data=pie_trace, layout=pie_layout)
    figure.update_traces(
        textfont_size=14,
        marker=dict(line=dict(color="#000000", width=2)),
    )
    figure.update_yaxes(automargin=True)
    iplot(figure)
## Histogram
def histogram_plot(data, plot_title: str, y_axis: str):
    """Draw an offline Plotly histogram of *data*."""
    hist_layout = go.Layout(
        autosize=False,
        title={
            "text": plot_title,
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        yaxis={"title": y_axis},
    )
    figure = go.Figure(data=go.Histogram(x=data), layout=hist_layout)
    figure.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    figure.update_layout(template="simple_white")
    figure.update_yaxes(automargin=True)
    iplot(figure)
# Particular case: Histogram subplot (1, 2)
def histogram_subplot(dataset_a, dataset_b, feature_a: str,
                      feature_b: str, title: str, title_a: str, title_b: str):
    """Draw two histograms side by side in a 1x2 Plotly subplot grid.

    The left panel shows *feature_a* from *dataset_a*, the right panel
    *feature_b* from *dataset_b*; *title* captions the whole figure.
    """
    figure = make_subplots(rows=1, cols=2, subplot_titles=(title_a, title_b))
    panels = ((dataset_a, feature_a), (dataset_b, feature_b))
    for col_idx, (frame, feature) in enumerate(panels, start=1):
        figure.add_trace(
            go.Histogram(x=frame[feature], showlegend=False),
            row=1, col=col_idx,
        )
    figure.update_layout(template="simple_white")
    figure.update_layout(
        autosize=False,
        title={
            "text": title,
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        yaxis={"title": "<i>Frequency</i>"},
    )
    figure.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    figure.update_yaxes(automargin=True)
    iplot(figure)
# Calculate scores with Test/Unseen labeled data
def test_score_report(data_unseen, predict_unseen):
    """Compute classification metrics on the truly unseen hold-out data.

    NOTE: mutates *data_unseen* in place by adding an integer "Label"
    column (label-encoded Churn) — the subsequent conf_mat() call relies
    on that side effect.

    Args:
        data_unseen: DataFrame with the ground-truth "Churn" column.
        predict_unseen: DataFrame with the model's "Label" predictions.

    Returns:
        One-row DataFrame with Accuracy, AUC, Recall, Precision, F1 Score.
    """
    encoder = LabelEncoder()
    data_unseen["Label"] = encoder.fit_transform(data_unseen.Churn.values)
    data_unseen["Label"] = data_unseen["Label"].astype(int)
    y_true = data_unseen["Label"]
    y_pred = predict_unseen["Label"]
    return pd.DataFrame({
        "Accuracy": [accuracy_score(y_true, y_pred)],
        "AUC": [roc_auc_score(y_true, y_pred)],
        "Recall": [recall_score(y_true, y_pred)],
        "Precision": [precision_score(y_true, y_pred)],
        "F1 Score": [f1_score(y_true, y_pred)],
    })
# Confusion Matrix
def conf_mat(data_unseen, predict_unseen):
    """Plot a heatmap confusion matrix for the unseen-data predictions.

    Expects test_score_report() to have run first so that *data_unseen*
    already carries the integer "Label" ground-truth column.

    Args:
        data_unseen: DataFrame with the ground-truth "Label" column.
        predict_unseen: DataFrame with the predicted "Label" column.
    """
    unique_label = data_unseen["Label"].unique()
    cmtx = pd.DataFrame(
        confusion_matrix(data_unseen["Label"], predict_unseen["Label"], labels=unique_label),
        index=['{:}'.format(x) for x in unique_label],
        columns=['{:}'.format(x) for x in unique_label]
    )
    ax = sns.heatmap(cmtx, annot=True, fmt="d", cmap="YlGnBu")
    # confusion_matrix(y_true, y_pred) places true labels on rows and
    # predictions on columns, so the heatmap's y-axis is the target and
    # its x-axis the prediction (the original labels were swapped).
    ax.set_ylabel('Target')
    ax.set_xlabel('Predicted');
    ax.set_title("Predict Unseen Confusion Matrix", size=14);
# Load the raw Telco customer-churn data and take a first look.
dataset = pd.read_csv(data_path+"customers.csv")
dataset.head(3)
# Display fully duplicated rows (an empty frame means none).
dataset[dataset.duplicated()]
dataset_info(dataset, "customers")
Dataset Name: customers | Number of Samples: 7043 | Number of Columns: 21
==============================
Column Data Type
customerID object
gender object
SeniorCitizen int64
Partner object
Dependents object
tenure int64
PhoneService object
MultipleLines object
InternetService object
OnlineSecurity object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges object
Churn object
dtype: object
==============================
No Missing Data on this Dataset!
==============================
Memory Usage: 1.183 MB
# TotalCharges was read as object (see dtypes above); coerce to numeric,
# turning any unparseable entries into NaN.
dataset["TotalCharges"] = pd.to_numeric(dataset["TotalCharges"], errors="coerce")
print(f"The Feature TotalCharges is type {dataset.TotalCharges.dtype} now!")
The Feature TotalCharges is type float64 now!
# Overall churn distribution (closing </b> tag fixed).
pie_plot(dataset["Churn"].value_counts(), plot_title="<b>Client Churn Distribution</b>")
# Contract types among retained clients.
df_aux = dataset.query('Churn == "No"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "<b>Contract Types of not Churned Clients</b>", "<i>Contract</i>", "<i>Count</i>")
# Contract types among churned clients.
df_aux = dataset.query('Churn == "Yes"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "<b>Contract Types of Churned Clients</b>", "<i>Contract</i>", "<i>Count</i>")
# Charge distributions on month-to-month contracts, retained vs churned
# ("Monthtly" typo in the subplot titles fixed).
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "No")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
                  "<b>Charges Distribution for Month-to-month contracts for not Churned Clients</b>",
                  "(a) Monthly Charges Distribution", "(b) Total Charges Distribution")
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
                  "<b>Charges Distribution for Month-to-month contracts for Churned Clients</b>",
                  "(a) Monthly Charges Distribution", "(b) Total Charges Distribution")
# BUG FIX: the original passed ('Contract == Month-to-month') and ('Churn == "Yes"')
# to .query(); Python's `and` on two non-empty strings evaluates to the second
# string, so the Contract filter was silently dropped (and the first operand was
# missing its quotes anyway). Use a single query expression instead.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["PaymentMethod"].value_counts()
bar_plot(df_aux, "<b>Payment Method of Month-to-month contract Churned Clients</b>", "<i>Payment Method</i>", "<i>Count</i>")
# Same corrected filter for the tenure breakdown.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["tenure"].value_counts().head(5)
bar_plot(df_aux, "<b>Tenure of Month-to-month contract for Churned Clients</b>", "<i>Tenure</i>", "<i>Count</i>")
# Hold back 10% of the rows as truly unseen samples for final validation;
# the remaining 90% goes to PyCaret for training/tuning.
data, data_unseen = data_sampling(dataset, 0.9, random_seed)
print(f"There are {data_unseen.shape[0]} samples for Unseen Data.")
There are 704 samples for Unseen Data.
# PyCaret classification experiment: drops customerID, treats SeniorCitizen
# as numeric, and enables normalization, feature selection, outlier and
# multicollinearity removal, power transformation, PCA, and binning of the
# two charge columns.
# NOTE(review): ignore_low_variance / remove_outliers kwargs are
# version-sensitive in pycaret — confirm against the installed version.
exp01 = setup(data=data, target="Churn", session_id=random_seed, ignore_features=["customerID"],
numeric_features=["SeniorCitizen"], normalize=True,
feature_selection=True, remove_outliers=True, remove_multicollinearity=True,
transformation=True, ignore_low_variance=True, pca=True,
bin_numeric_features=["MonthlyCharges", "TotalCharges"])
Setup Succesfully Completed!
# Rank all candidate classifiers by 10-fold cross-validated F1.
compare_models(fold=10, sort="F1")
# Quadratic Discriminant Analysis chosen as the base algorithm.
base_alg = "qda"
base_model = create_model(base_alg)
plot_model(base_model, plot="parameter")
# NOTE(review): tune_model is passed the algorithm string rather than
# base_model — older pycaret accepted a string here, newer versions expect
# the created estimator object. Confirm against the installed version.
tuned_model = tune_model(base_alg, fold=10, optimize="F1")
plot_model(tuned_model, plot="parameter")
# Ensemble variants: bagging of the tuned model, then a soft-voting blend
# and a stack of (tuned, bagged).
bagged_model = ensemble_model(tuned_model)
blended_model = blend_models(estimator_list=[tuned_model, bagged_model], method="soft")
stacked_model = stack_models([tuned_model, bagged_model], method="soft")
# The blend is kept as the final candidate.
best_model = blended_model
plot_model(best_model, plot="auc")
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
# Evaluate on PyCaret's internal hold-out split.
predict_model(best_model);
# Refit the chosen model on the full training data.
final_model = finalize_model(best_model)
# Score against the truly unseen 10% split held out earlier.
predict_unseen = predict_model(final_model, data=data_unseen);
score_unseen = test_score_report(data_unseen, predict_unseen)
print(score_unseen.to_string(index=False))
# Relies on the "Label" column test_score_report added to data_unseen.
conf_mat(data_unseen, predict_unseen);
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
Accuracy AUC Recall Precision F1 Score
0.792614 0.796375 0.803571 0.544355 0.649038
# Persist the experiment and the model pipeline, stamped with today's date.
# NOTE(review): this saves tuned_model, not best_model/final_model that were
# actually evaluated above — confirm which artifact should be persisted.
save_experiment("expQDA_"+date.today().strftime("%m-%d-%Y"))
save_model(tuned_model, "modelQDA_"+date.today().strftime("%m-%d-%Y"))
Experiment Succesfully Saved
Transformation Pipeline and Model Succesfully Saved