import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import f1_score
# show all columns when displaying a dataframe
pd.set_option("display.max_columns", None)
# cap the number of rows pandas displays
pd.set_option("display.max_rows", 30)
# load the raw telecom churn dataset
df = pd.read_csv('./data/raw/telecom_churn.csv')
# quick look: dimensions (df.shape is a tuple), first/last rows, column dtypes
type(df.shape)
df.shape
df.head()
df.tail()
df.dtypes
# give the abbreviated columns readable names
df.rename(
    columns={
        "Int'l Plan": "International plan",
        "VMail Plan": "Voicemail plan",
        "VMail Message": "Voicemail messages",
        "Eve Mins": "Evening mins",
        "Eve Calls": "Evening calls",
        "Eve Charge": "Evening charge",
        "Intl Mins": "International minutes",
        "Intl Calls": "International calls",
        "Intl Charge": "International charge",
        "CustServ Calls": "Customer service calls",
        "Churn?": "Churn",
    },
    inplace=True,
)
# the raw Churn column holds the strings "False." and "True."; map them to 0/1
mapping_dictionary = {
    "False.": 0,
    "True.": 1
}
df.replace({"Churn": mapping_dictionary}, inplace=True)
# lowercase all column names for easier access
df.columns = df.columns.str.lower()
# class balance: absolute counts and proportions
df['churn'].value_counts()
df['churn'].value_counts(normalize=True)
# compare feature means overall vs. for churned customers only
# (churn is 0/1 after the mapping above, so filter on 1)
df.mean(numeric_only=True)
df[df["churn"] == 1].mean(numeric_only=True)
# distributions of account length and customer service calls, split by churn
sns.displot(x="account length", data=df, hue='churn');
sns.displot(x="customer service calls", data=df, hue='churn');
df["total charge"] = df["day charge"] + df["evening charge"] + df["night charge"]
df.columns
df["cs call rate"] = df["customer service calls"] / df["account length"]
df["cs call rate"].describe()
df.iloc[df["cs call rate"].argmax()]
# missing-value check and summary statistics
df.isnull().sum()
df.describe()
df.dtypes
# keep the numeric features; area code is categorical and churn is the target
df_numerical = df.select_dtypes('number').drop(['area code', 'churn'], axis=1)
df_numerical.columns
df["state"].value_counts()
df.drop("state", axis=1, inplace=True)
df.dtypes
df["phone"]
df.drop("phone", inplace=True, axis=1)
df.dtypes
df.select_dtypes('object')
# LabelEncoder is designed for targets, but it is a convenient way to turn these
# two binary yes/no columns into 0/1
label_encoder = LabelEncoder()
label_encoder.fit_transform(df["international plan"])
# side-by-side check of original vs. encoded values
pd.DataFrame({"Original": df["international plan"],
              "Encoded": label_encoder.fit_transform(df["international plan"])})
df_label_encoded = pd.DataFrame()
df_label_encoded["international plan"] = label_encoder.fit_transform(df["international plan"])
df_label_encoded["voicemail plan"] = label_encoder.fit_transform(df["voicemail plan"])
df_label_encoded
df["area code"].value_counts()
df_one_hot_encoded = pd.get_dummies(df["area code"])
df_one_hot_encoded
df_one_hot_encoded.rename(columns={408: "area_408", 415: "area_415", 510: "area_510"}, inplace=True)
df_one_hot_encoded
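# an equivalent sketch with sklearn's OneHotEncoder (imported above): unlike
# pd.get_dummies, it remembers the fitted categories, which matters once the same
# encoding must be reapplied to new data. Assumes scikit-learn >= 1.2 for the
# sparse_output keyword; older releases use sparse=False instead.
one_hot_encoder = OneHotEncoder(sparse_output=False, dtype=int)
area_code_encoded = one_hot_encoder.fit_transform(df[["area code"]])
pd.DataFrame(area_code_encoded,
             columns=one_hot_encoder.get_feature_names_out(["area code"]),
             index=df.index)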
# assemble the final feature matrix: numeric features plus encoded categoricals
df_encoded = pd.concat([df_label_encoded, df_one_hot_encoded], axis=1)
df_encoded
df_final = pd.concat([df_numerical, df_encoded], axis=1)
df_final
# 80/20 split, stratified on churn so both sets keep the same class ratio
x_train, x_test, y_train, y_test = train_test_split(df_final, df["churn"],
                                                    test_size=0.2, random_state=0,
                                                    stratify=df["churn"])
# verify the stratification
y_train.value_counts(normalize=True)
# fit the scaler on the training set only, then apply it to both sets; passing
# the original index keeps the rows aligned with y_train/y_test
standard_scaler = StandardScaler()
standard_scaler.fit(x_train)
x_train = pd.DataFrame(standard_scaler.transform(x_train),
                       columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(standard_scaler.transform(x_test),
                      columns=x_test.columns, index=x_test.index)
x_train.describe()
# naive baseline: predict "no churn" (0) for everyone; accuracy looks decent on
# an imbalanced set, but F1 and recall are zero, which is why accuracy alone misleads
y_pred = np.zeros(len(y_test))
f1_score(y_test, y_pred)
accuracy_score(y_test, y_pred)
recall_score(y_test, y_pred)
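# full per-class breakdown of the baseline with classification_report (imported
# above); zero_division=0 silences the warning for the never-predicted class
print(classification_report(y_test, y_pred, zero_division=0))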
# logistic regression; class_weight='balanced' reweights samples to offset the
# churn class imbalance
log_reg_model = LogisticRegression(class_weight='balanced', random_state=0)
log_reg_model.fit(x_train, y_train)
# F1 and recall on both splits to compare train vs. test behaviour
f1_score(y_test, log_reg_model.predict(x_test))
f1_score(y_train, log_reg_model.predict(x_train))
recall_score(y_train, log_reg_model.predict(x_train))
recall_score(y_test, log_reg_model.predict(x_test))
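# where the errors land: confusion matrix of the logistic regression on the test
# set, using the ConfusionMatrixDisplay imported above (requires scikit-learn >= 1.0)
ConfusionMatrixDisplay.from_estimator(log_reg_model, x_test, y_test)
plt.show()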
# 5-fold cross-validation on the training set, scored by F1 and by recall
scores_log_reg = cross_val_score(log_reg_model, x_train, y_train, cv=5, scoring='f1')
print("Average F1-score for logistic regression: %.2f, standard deviation: %.2f" %
      (scores_log_reg.mean(), scores_log_reg.std()))
scores_log_reg
scores_log_reg_r = cross_val_score(log_reg_model, x_train, y_train, cv=5, scoring='recall')
print("Average recall score for logistic regression: %.2f, standard deviation: %.2f" %
      (scores_log_reg_r.mean(), scores_log_reg_r.std()))
scores_log_reg_r
# random forest with 10 trees as a second model; train-set F1 alone is
# optimistic, hence the cross-validation below
rf = RandomForestClassifier(random_state=0, n_estimators=10)
rf.fit(x_train, y_train)
f1_score(y_train, rf.predict(x_train))
scores_rf = cross_val_score(rf, x_train, y_train, cv=5, scoring='f1')
scores_rf
scores_rf.mean()
# grid search over the number of trees, optimizing F1
dico_param = {"n_estimators": [20, 90, 100, 105]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=0),
                           dico_param,
                           scoring="f1",
                           cv=5)
grid_search.fit(x_train, y_train)
# best cross-validated F1, then the winning model's F1 on the held-out test set
grid_search.best_score_
f1_score(y_test, grid_search.best_estimator_.predict(x_test))
grid_search.best_params_
grid_search.cv_results_
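# the raw cv_results_ dict is hard to scan; a DataFrame view of the per-candidate
# scores is easier to read (the column names below are standard cv_results_ keys)
pd.DataFrame(grid_search.cv_results_)[["param_n_estimators", "mean_test_score", "std_test_score"]]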
recall_score(y_test, grid_search.best_estimator_.predict(x_test))
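# precision-recall trade-off of the tuned forest on the test set, via the
# PrecisionRecallDisplay imported above (requires scikit-learn >= 1.0)
PrecisionRecallDisplay.from_estimator(grid_search.best_estimator_, x_test, y_test)
plt.show()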
from joblib import dump, load  # joblib persists fitted sklearn objects
dump(standard_scaler, "./models/scaler.joblib")  # save the fitted scaler
dump(grid_search.best_estimator_, './models/my_rf_model.joblib')  # save the tuned model
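# round-trip check with the `load` imported above: reload both artifacts and
# confirm the reloaded model scores the same on the test set
reloaded_scaler = load("./models/scaler.joblib")
reloaded_model = load("./models/my_rf_model.joblib")
f1_score(y_test, reloaded_model.predict(x_test))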