# Compare the average metrics for customers who churned and those who did not
df_group = df.groupby('Churn').mean()
df_group.head()
# Plot each feature of the dataset against the target variable
for col in df.drop('Churn', axis=1).columns:
    sns.scatterplot(x=df['Churn'], y=df[col])
    plt.show()
# Split the data into train and test sets
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Train a logistic regression model
model_lineal = LogisticRegression()
model_lineal.fit(X_train, y_train)
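# Optional sketch, not part of the original flow: logistic regression can raise
# convergence warnings on unscaled features, so one common variant is to wrap the
# estimator in a scaling pipeline. The name model_lineal_scaled is hypothetical.
from sklearn.pipeline import make_pipeline
model_lineal_scaled = make_pipeline(StandardScaler(), LogisticRegression())
model_lineal_scaled.fit(X_train, y_train)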
# Predictions
y_proba_model_lineal = model_lineal.predict_proba(X_test)
y_predict_model_lineal = model_lineal.predict(X_test)
# Results
print('Accuracy Score for linear model:', accuracy_score(y_test, y_predict_model_lineal))
print('Precision Score for linear model:', precision_score(y_test, y_predict_model_lineal))
print('Recall Score for linear model:', recall_score(y_test, y_predict_model_lineal))
# Now we proceed with the random forest model
model_tree = RandomForestClassifier(n_estimators=100)
model_tree.fit(X_train, y_train)
y_proba_tree = model_tree.predict_proba(X_test)
y_predict_tree = model_tree.predict(X_test)
# Results
print('Accuracy Score for random forest model:', accuracy_score(y_test, y_predict_tree))
print('Precision Score for random forest model:', precision_score(y_test, y_predict_tree))
print('Recall Score for random forest model:', recall_score(y_test, y_predict_tree))
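# The probability arrays above (y_proba_model_lineal, y_proba_tree) are not used by the
# accuracy/precision/recall metrics; a minimal sketch of how they could be used, assuming
# roc_auc_score is available from sklearn.metrics:
from sklearn.metrics import roc_auc_score
print('ROC AUC for linear model:', roc_auc_score(y_test, y_proba_model_lineal[:, 1]))
print('ROC AUC for random forest model:', roc_auc_score(y_test, y_proba_tree[:, 1]))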
# Standardize the features
sc = StandardScaler()
x_sc = sc.fit_transform(df.drop(columns=['Churn']))
# Use the linkage function to build a distance matrix from the standardized feature matrix
linked = linkage(x_sc, method='ward')
plt.figure(figsize=(12, 15))
dendrogram(linked, orientation='top')
plt.title('Gym customer clusters')
plt.show()
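# Optional sketch: the dendrogram suggests a cluster count visually; to get flat labels
# from the same linkage matrix one could cut the tree with fcluster (here at 5 clusters,
# matching the K-means step below). labels_hier is a hypothetical name.
from scipy.cluster.hierarchy import fcluster
labels_hier = fcluster(linked, t=5, criterion='maxclust')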
# Train the K-means model
km = KMeans(n_clusters=5)
# Fit on the standardized features (x_sc) so the clustering uses the same scale as the dendrogram above
labels = km.fit_predict(x_sc)
df['cluster_km'] = labels
# Inspect the clusters: counts, sums, and averages per cluster
df.groupby('cluster_km').count()
df.groupby('cluster_km').sum()
df.groupby('cluster_km').mean()
# Plot the distribution of each feature per cluster
def plot_all_clusters(df, cluster_name):
    # Keep only numeric columns, excluding the cluster label
    numeric_cols = df.drop(columns=[cluster_name]).select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            y=df[col],
            x=df[cluster_name],
            hue=df[cluster_name],
            palette='Paired',
            alpha=0.7
        )
        plt.title(f"{col} vs {cluster_name}")
        plt.xlabel(cluster_name)
        plt.ylabel(col)
        plt.legend(title="Cluster")
        plt.show()
plot_all_clusters(df,'cluster_km')
def show_clusters_on_plot(df, x_name, y_name, cluster_name):
    plt.figure(figsize=(10, 10))
    sns.scatterplot(x=df[x_name], y=df[y_name],
                    hue=df[cluster_name], palette='Paired'
                    )
    plt.title('{} vs {}'.format(x_name, y_name))
    plt.show()
show_clusters_on_plot(df,'Avg_class_frequency_total','Avg_additional_charges_total','cluster_km')
# Compute churn rates per cluster
df.groupby('cluster_km')['Churn'].mean()
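# A quick visual of the churn rates per cluster (a sketch using pandas' built-in plotting;
# churn_by_cluster is a hypothetical name):
churn_by_cluster = df.groupby('cluster_km')['Churn'].mean()
churn_by_cluster.plot(kind='bar')
plt.title('Churn rate by cluster')
plt.ylabel('Churn rate')
plt.show()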