import pandas as pd
import numpy as np
# Load the Telco customer-churn CSV and take a first look at it.
df_data = pd.read_csv('/work/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_data.head()
df_data.info()

# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries
# into NaN instead of raising.
df_data['TotalCharges'] = pd.to_numeric(df_data['TotalCharges'], errors='coerce')
df_data.info()

# Count missing values per column, then drop every row that contains one.
df_data.isna().sum()
df_data.dropna(inplace=True)

# Remove the customerID column before any further processing.
df_data.drop(columns='customerID', inplace=True)
df_data.head()
# Encode the target column as integers: "Yes" -> 1, "No" -> 0.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on ``df_data['Churn']``: that chained in-place
# form acts on an intermediate object, warns on recent pandas and leaves
# ``df_data`` unchanged once Copy-on-Write is active.
# Any label other than "Yes"/"No" maps to NaN, so the int cast below fails
# loudly rather than letting an unencoded target slip through.
df_data['Churn'] = df_data['Churn'].map({'Yes': 1, 'No': 0}).astype('int64')
df_data['Churn']
# One-hot encode a copy of the cleaned data with pd.get_dummies, so that
# df_data itself keeps its original columns for the plots further down.
df_data_processing = pd.get_dummies(df_data.copy())
df_data_processing.head()
import matplotlib.pyplot as plt
# Bar chart of each encoded column's correlation with Churn, sorted ascending.
fig = plt.figure(figsize=(15, 9))
churn_correlations = df_data_processing.corr()['Churn']
churn_correlations.sort_values().plot.bar()
plt.show()
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column of the encoded table and wrap the resulting
# array in a DataFrame that carries the original column names.
# NOTE(review): the scaler is fitted on the whole table, before the
# train/test split performed later in this file — confirm this is intended.
scaler = MinMaxScaler()
df_data_processing_scaled = pd.DataFrame(
    scaler.fit_transform(df_data_processing),
    columns=df_data_processing.columns,
)
df_data_processing_scaled.head()
import seaborn as sns
# Customer counts per gender, with one bar per Churn value.
sns.countplot(x='gender', hue='Churn', data=df_data)
plt.show()
def plot_categorical(column, *, hue='Churn', figsize=(7, 7)):
    """Show a count plot of one ``df_data`` column, split by ``hue``.

    Reads the module-level ``df_data`` frame, opens a new figure, draws the
    count plot and displays it.

    Args:
        column: Name of the ``df_data`` column to put on the x-axis.
        hue: Name of the column used to split the bars. Defaults to
            ``'Churn'``, the value that was previously hard-coded.
        figsize: ``(width, height)`` passed to ``plt.figure``. Defaults to
            ``(7, 7)``, the value that was previously hard-coded.
    """
    # Open a dedicated figure so consecutive calls never draw on the same axes.
    plt.figure(figsize=figsize)
    sns.countplot(data=df_data, x=column, hue=hue)
    plt.show()
# Draw one count plot for every object-dtype column of df_data.
column_cat = df_data.select_dtypes(include='object').columns
for categorical_column in column_cat:
    plot_categorical(categorical_column)
# Pairwise relationships between the numeric columns, coloured by Churn.
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: one would only leave an extra, empty figure
# that plt.show() then displays.
sns.pairplot(data=df_data, hue='Churn')
plt.show()
# Feature matrix: every scaled column except the target.
X = df_data_processing_scaled.drop(columns='Churn')
# Target vector as a NumPy array, taken from the same scaled table.
y = df_data_processing_scaled['Churn'].to_numpy()
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with the library's default
# settings, fitted on the training split.
model = LogisticRegression()
result = model.fit(X=X_train, y=y_train)
from sklearn import metrics
# Score the baseline model on the held-out split.
prediction_test = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction_test))

# Notebook-style inspection of the fitted model.
model.predict_log_proba(X_test)
model.coef_
model.feature_names_in_

# Coefficient of each feature, labelled with the feature's column name.
weights = pd.Series(model.coef_[0],
                    index=X.columns.values)

# Plot the 10 largest and the 10 smallest coefficients, each in its own
# figure.  Drawing both charts without opening a new figure puts them on the
# same axes, and wrapping the plot call in print() only emits the Axes repr.
ranked_weights = weights.sort_values(ascending=False)
for weight_subset in (ranked_weights.head(10), ranked_weights.tail(10)):
    plt.figure()
    weight_subset.plot(kind='bar')
    plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of the baseline model on the test split.
# The axes are created explicitly and handed to disp.plot(): without ``ax``
# the display object opens its own figure, leaving the 11x11 one empty and
# the requested size unused.
fig, ax = plt.subplots(figsize=(11, 11))
cm = confusion_matrix(y_test, prediction_test, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(ax=ax, cmap='gray')
plt.show()
# L1-penalised (lasso) logistic regression fitted on the training split.
lasso = LogisticRegression(max_iter=10000, penalty='l1', solver='saga', C=0.5)
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)

# confusion_matrix expects (y_true, y_pred): rows are then the real labels
# and columns the predicted ones, which is what the axis labels below claim.
# Passing the predictions first transposes the matrix and swaps the
# false-positive / false-negative cells.
cm = confusion_matrix(y_test, lasso.predict(X_test), labels=lasso.classes_)
plt.figure()  # fresh figure so the heatmap never lands on an earlier plot
sns.heatmap(
    cm,
    annot=True,
    cmap='gray',
    cbar=False,
    square=True,
    fmt="d",
)
plt.ylabel('Real Label')
plt.xlabel('Predicted Label')
plt.show()

# Coefficients sorted from largest to smallest, drawn as a bar chart.
weights = pd.Series(lasso.coef_[0], index=X.columns.values).sort_values(ascending=False)
fig = plt.figure(figsize=(15, 5))
weights.plot(kind='bar')
plt.show()

# Features whose coefficient is exactly zero in the fitted model.
weights[weights == 0]
# L2-penalised (ridge) logistic regression fitted on the training split.
ridge = LogisticRegression(max_iter=10000, penalty='l2', solver='saga', C=0.5)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)

# confusion_matrix expects (y_true, y_pred): rows are then the real labels
# and columns the predicted ones, which is what the axis labels below claim.
# Passing the predictions first transposes the matrix and swaps the
# false-positive / false-negative cells.
cm = confusion_matrix(y_test, ridge.predict(X_test), labels=ridge.classes_)
# Start a fresh figure: when the file runs as a plain script, the heatmap
# would otherwise be drawn on whatever axes are still current.
plt.figure()
sns.heatmap(
    cm,
    annot=True,
    cmap='gray',
    cbar=False,
    square=True,
    fmt="d",
)
plt.ylabel('Real Label')
plt.xlabel('Predicted Label')
plt.show()

# Coefficients sorted from largest to smallest, drawn as a bar chart.
weights = pd.Series(ridge.coef_[0], index=X.columns.values).sort_values(ascending=False)
fig = plt.figure(figsize=(15, 5))
weights.plot(kind='bar')
plt.show()