# Compare the average metrics for customers who churned and those who did not
df_group = df.groupby('Churn').mean()
df_group.head()
# Plot each feature of the dataset against the target variable
for col in df.drop('Churn', axis=1).columns:
    sns.scatterplot(x=df['Churn'], y=df[col])
    plt.show()
# Split the data into train and test sets
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Train a logistic regression model
model_lineal = LogisticRegression()
model_lineal.fit(X_train, y_train)
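# Optional sketch, not part of the original flow: logistic regression can raise
# convergence warnings on unscaled features, so one common variant is to wrap the
# estimator in a scaling pipeline. The name model_lineal_scaled is hypothetical.
from sklearn.pipeline import make_pipeline
model_lineal_scaled = make_pipeline(StandardScaler(), LogisticRegression())
model_lineal_scaled.fit(X_train, y_train)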
# Predictions
y_proba_model_lineal = model_lineal.predict_proba(X_test)
y_predict_model_lineal = model_lineal.predict(X_test)
# Results
print('Accuracy Score for linear model:', accuracy_score(y_test, y_predict_model_lineal))
print('Precision Score for linear model:', precision_score(y_test, y_predict_model_lineal))
print('Recall Score for linear model:', recall_score(y_test, y_predict_model_lineal))
# Now we proceed with the random forest model
model_tree = RandomForestClassifier(n_estimators=100)
model_tree.fit(X_train, y_train)
y_proba_tree = model_tree.predict_proba(X_test)
y_predict_tree = model_tree.predict(X_test)
# Results
print('Accuracy Score for random forest model:', accuracy_score(y_test, y_predict_tree))
print('Precision Score for random forest model:', precision_score(y_test, y_predict_tree))
print('Recall Score for random forest model:', recall_score(y_test, y_predict_tree))
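# The probability arrays above (y_proba_model_lineal, y_proba_tree) are not used by the
# accuracy/precision/recall metrics; a minimal sketch of how they could be used, assuming
# roc_auc_score is available from sklearn.metrics:
from sklearn.metrics import roc_auc_score
print('ROC AUC for linear model:', roc_auc_score(y_test, y_proba_model_lineal[:, 1]))
print('ROC AUC for random forest model:', roc_auc_score(y_test, y_proba_tree[:, 1]))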
# Standardize the features
sc = StandardScaler()
x_sc = sc.fit_transform(df.drop(columns=['Churn']))
# Use the linkage function to build a distance matrix from the standardized feature matrix
linked = linkage(x_sc, method='ward')
plt.figure(figsize=(12, 15))
dendrogram(linked, orientation='top')
plt.title('Gym customer clusters')
plt.show()
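# Optional sketch: the dendrogram suggests a cluster count visually; to get flat labels
# from the same linkage matrix one could cut the tree with fcluster (here at 5 clusters,
# matching the K-means step below). labels_hier is a hypothetical name.
from scipy.cluster.hierarchy import fcluster
labels_hier = fcluster(linked, t=5, criterion='maxclust')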
# Train the K-means model
km = KMeans(n_clusters=5)
# Fit on the standardized features (x_sc) so the clustering uses the same scale as the dendrogram above
labels = km.fit_predict(x_sc)
df['cluster_km'] = labels
# Inspect the clusters: counts, sums, and averages per cluster
df.groupby('cluster_km').count()
df.groupby('cluster_km').sum()
df.groupby('cluster_km').mean()
# Plot the distribution of each feature per cluster
def plot_all_clusters(df, cluster_name):
    # Keep only numeric columns, excluding the cluster label
    numeric_cols = df.drop(columns=[cluster_name]).select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            y=df[col],
            x=df[cluster_name],
            hue=df[cluster_name],
            palette='Paired',
            alpha=0.7
        )
        plt.title(f"{col} vs {cluster_name}")
        plt.xlabel(cluster_name)
        plt.ylabel(col)
        plt.legend(title="Cluster")
        plt.show()
plot_all_clusters(df,'cluster_km')
def show_clusters_on_plot(df, x_name, y_name, cluster_name):
    plt.figure(figsize=(10, 10))
    sns.scatterplot(x=df[x_name], y=df[y_name],
                    hue=df[cluster_name], palette='Paired'
                    )
    plt.title('{} vs {}'.format(x_name, y_name))
    plt.show()
show_clusters_on_plot(df,'Avg_class_frequency_total','Avg_additional_charges_total','cluster_km')
# Compute churn rates per cluster
df.groupby('cluster_km')['Churn'].mean()
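# A quick visual of the churn rates per cluster (a sketch using pandas' built-in plotting;
# churn_by_cluster is a hypothetical name):
churn_by_cluster = df.groupby('cluster_km')['Churn'].mean()
churn_by_cluster.plot(kind='bar')
plt.title('Churn rate by cluster')
plt.ylabel('Churn rate')
plt.show()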