import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pickle
from pathlib import Path
from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease
ModuleNotFoundError: No module named 'jcopml'
# Load the telco dataset directly from the remote CSV file
df = pd.read_csv(
    filepath_or_buffer="https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/dqlab_telco_final.csv"
)
# Show the dataset dimensions as (rows, columns)
print(df.shape)
# Show the first 5 rows
print(df.head(n=5))
# Show the number of distinct customer IDs
print(df["customerID"].nunique())
(6950, 13)
UpdatedAt customerID gender SeniorCitizen Partner tenure PhoneService \
0 202006 45759018157 Female No Yes 1 No
1 202006 45315483266 Male No Yes 60 Yes
2 202006 45236961615 Male No No 5 Yes
3 202006 45929827382 Female No Yes 72 Yes
4 202006 45305082233 Female No Yes 56 Yes
StreamingTV InternetService PaperlessBilling MonthlyCharges TotalCharges \
0 No Yes Yes 29.85 29.85
1 No No Yes 20.50 1198.80
2 Yes Yes No 104.10 541.90
3 Yes Yes Yes 115.50 8312.75
4 Yes Yes No 81.25 4620.40
Churn
0 No
1 No
2 Yes
3 No
4 No
6950
# Pie chart showing the share of each Churn class
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
churn = df.Churn.value_counts()
# value_counts() sorts by frequency, not by label, so the wedge labels are
# taken from its index. A hard-coded ['No', 'Yes'] would mislabel the wedges
# whenever 'Yes' happens to be the more frequent class.
labels = churn.index.tolist()
ax.pie(churn, labels=labels, autopct='%.0f%%')
plt.title("Presentase Churn", fontsize=20)
plt.show()
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig, ax = plt.subplots(1, 3, figsize=(15, 6))
# Overlay two semi-transparent histograms per numerical feature on the same
# axes: blue for Churn == 'No' first, then orange for Churn == 'Yes'.
for churn_value, colour in (('No', 'blue'), ('Yes', 'orange')):
    subset = df.loc[df.Churn == churn_value, numerical_features]
    subset.hist(bins=20, color=colour, alpha=0.5, ax=ax)
plt.show()
sns.set(style='darkgrid')
fig, ax = plt.subplots(3, 3, figsize=(14, 12))
# (column, grid row, grid column) for each count plot, in drawing order.
# Only seven of the nine grid cells are used; the others are left empty.
panel_layout = [
    ('gender', 0, 0),
    ('Partner', 0, 1),
    ('SeniorCitizen', 0, 2),
    ('PhoneService', 1, 0),
    ('StreamingTV', 1, 1),
    ('InternetService', 1, 2),
    ('PaperlessBilling', 2, 1),
]
# One count plot per categorical feature, with bars split by Churn
for feature, row, col in panel_layout:
    sns.countplot(data=df, x=feature, hue='Churn', ax=ax[row][col])
plt.tight_layout()
plt.show()
# Build a working copy of the data without the customerID and UpdatedAt
# columns (presumably not useful as model features -- confirm with the author).
columns_to_drop = ["customerID", "UpdatedAt"]
cleaned_df = df.drop(columns=columns_to_drop)
print(cleaned_df)
gender SeniorCitizen Partner tenure PhoneService StreamingTV \
0 Female No Yes 1 No No
1 Male No Yes 60 Yes No
2 Male No No 5 Yes Yes
3 Female No Yes 72 Yes Yes
4 Female No Yes 56 Yes Yes
... ... ... ... ... ... ...
6945 Male No No 1 Yes No
6946 Female Yes No 1 Yes No
6947 Female No No 1 Yes Yes
6948 Female No Yes 72 Yes Yes
6949 Male No No 64 Yes No
InternetService PaperlessBilling MonthlyCharges TotalCharges Churn
0 Yes Yes 29.85 29.85 No
1 No Yes 20.50 1198.80 No
2 Yes No 104.10 541.90 Yes
3 Yes Yes 115.50 8312.75 No
4 Yes No 81.25 4620.40 No
... ... ... ... ... ...
6945 Yes Yes 44.75 44.75 No
6946 Yes Yes 70.15 70.15 Yes
6947 Yes Yes 85.55 85.55 Yes
6948 Yes Yes 117.15 8529.50 No
6949 Yes No 99.25 6549.45 No
[6950 rows x 11 columns]
from sklearn.preprocessing import LabelEncoder
# Convert every non-numeric column to integer codes with LabelEncoder
for column in cleaned_df.columns:
    # Skip columns that are already numeric (integer or float).
    # `dtype == np.number` is not a reliable check for this: it never matches
    # integer columns and depends on a deprecated NumPy conversion, so numeric
    # columns could end up being label-encoded by mistake.
    if pd.api.types.is_numeric_dtype(cleaned_df[column]):
        continue
    # Replace the column's values with their integer class codes
    cleaned_df[column] = LabelEncoder().fit_transform(cleaned_df[column])
# Summary statistics of the fully numeric frame
print(cleaned_df.describe())
gender SeniorCitizen Partner tenure PhoneService \
count 6950.000000 6950.000000 6950.000000 6950.000000 6950.000000
mean 0.504317 0.162302 0.483309 32.415827 0.903741
std 0.500017 0.368754 0.499757 24.561336 0.294967
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 9.000000 1.000000
50% 1.000000 0.000000 0.000000 29.000000 1.000000
75% 1.000000 0.000000 1.000000 55.000000 1.000000
max 1.000000 1.000000 1.000000 73.000000 1.000000
StreamingTV InternetService PaperlessBilling MonthlyCharges \
count 6950.000000 6950.000000 6950.000000 6950.000000
mean 0.384317 0.783453 0.591942 64.992201
std 0.486468 0.411921 0.491509 30.032040
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 1.000000 0.000000 36.462500
50% 0.000000 1.000000 1.000000 70.450000
75% 1.000000 1.000000 1.000000 89.850000
max 1.000000 1.000000 1.000000 169.931250
TotalCharges Churn
count 6950.000000 6950.000000
mean 2286.058750 0.264173
std 2265.702553 0.440923
min 19.000000 0.000000
25% 406.975000 0.000000
50% 1400.850000 0.000000
75% 3799.837500 1.000000
max 8889.131250 1.000000