Model_Evaluate

import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split

headers = pd.read_csv('./Data/handles-data.csv') tweets = pd.read_csv('./Data/tweets-data.csv') pd.options.display.float_format = '{:.2f}'.format

tweets["nretweets"].isnull().sum()

tweets["nretweets"].quantile([0.65,0.70,0.8,0.9,0.95,0.99])

target = "is_popular"

# unpopular:0 popular:1 tweets[target] = pd.qcut(tweets["nretweets"], [0.65,0.70,1], labels=["0","1"] )

tweets[target][tweets[target].isnull()] = "0"

tweets[target].head(8)

tweets[target].value_counts(normalize=True, dropna=False) * 100

tweets[target].value_counts()

print(tweets[tweets[target].isnull()][target].count())

sns.countplot(data=tweets,x=tweets[target])

union = pd.merge(headers, tweets, left_on='username', right_on='username' ) union.describe() union.to_csv("./Data/union_categorical.csv",sep=";", index=False)

union = pd.read_csv("./Data/union_categorical.csv", sep=";")

headers["followers"].quantile([0.4,0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.99])

union["followers_categorical"] = pd.qcut(union["followers"], [0.60,0.70,1], labels=["0", "1"])

union["followers_categorical"]

union["followers_categorical"][union["followers_categorical"].isnull()] = "0"

union[union["followers_categorical"].isnull()]["followers_categorical"].count()

sns.countplot(data=union,x=union["followers_categorical"])

union["have_urls"] = union["urls"] != "[]"

union.loc[union["have_urls"] == True, 'have_urls'] = "1"

union.loc[union["have_urls"] == False, 'have_urls'] = "0"

union["have_urls"]

(union[union["have_urls"]== "0"]["have_urls"].count() / union["have_urls"].count()) * 100

(union[union["have_urls"]== "1"]["have_urls"].count() / union["have_urls"].count()) * 100

union["have_urls"].value_counts(sort=True)

sns.countplot(data=union,x=union["have_urls"])

union["hashtags"].head(15)

union["have_hashtags"] = union["hashtags"] != "[]"

union.loc[union["have_hashtags"] == True, 'have_hashtags'] = "1" union.loc[union["have_hashtags"] == False, 'have_hashtags'] = "0"

union["have_hashtags"].value_counts(sort=True)

sns.countplot(data=union,x=union["have_hashtags"])

union["join_date"] = pd.to_datetime(union["join_date"]) union["join_date"] = union["join_date"].dt.year

union["join_date"]

variables = union[["followers_categorical", "have_urls", "have_hashtags", "join_date"]]

variables.head(10)

variables.groupby(["join_date","followers_categorical","have_urls"]).count()

target = union[[target]]

target.shape

variables.shape

target.head(8)

variables.to_csv("./Data/variables_modelo.csv", index=False)

target.to_csv("./Data/target_modelo.csv", index=False)

variables = pd.read_csv("./Data/variables_modelo.csv") target = pd.read_csv("./Data/target_modelo.csv")

# Stratify nos ayuda a mantener la proporcion de datos en el conjunto de entrenamiento y test X_train, X_test, y_train, y_test = train_test_split(variables, target, test_size=0.3, random_state=25, stratify=target)

y_test.value_counts()

y_train.head(8)

X_train.astype(int)

from sklearn.linear_model import LogisticRegression

model_regression = LogisticRegression() model_regression.fit(X_train, y_train)

predicciones_regresion = model_regression.predict(X_test) probabilidades_regresion = model_regression.predict_proba(X_test)

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, predicciones_regresion)

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42)

rf_clf.fit(X_train,y_train)

predicciones_forest = rf_clf.predict(X_test) probabilidades_forest = rf_clf.predict_proba(X_test)

print(rf_clf.predict_proba([[1,0,0,2010]]))

[(variable, importancia) for variable, importancia in zip(X_train, rf_clf.feature_importances_)]

X_train_random_mejorado = X_train[["followers_categorical", "have_urls", "join_date"]]

rf_clf_mejorado = RandomForestClassifier(random_state=42)

rf_clf_mejorado.fit(X_train_random_mejorado,y_train)

predicciones_forest_mejorado = rf_clf_mejorado.predict(X_test[["followers_categorical", "have_urls", "join_date"]]) probabilidades_forest_mejorado = rf_clf_mejorado.predict_proba(X_test[["followers_categorical", "have_urls", "join_date"]])

import matplotlib.pyplot as plt from sklearn.metrics import RocCurveDisplay import matplotlib.pyplot as plt from sklearn.metrics import ConfusionMatrixDisplay from sklearn import tree

from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score,log_loss

def evaluacion_modelo(modelo,nombre,X_test,y_test,predicciones,probabilidades): # Creamos un dataframe para almacenar el resultado predicción-probabilidad # Obtenemos las predicciones que nos indican que es popular probabilidades = probabilidades[:,1] data = { "prediccion": predicciones, "probabilidad": probabilidades, "real": y_test["is_popular"] } df_proba = pd.DataFrame(data) df_proba['predicciones'] = predicciones df_proba['probabilidades'] = probabilidades df_proba['Correct'] = (df_proba["real"] == df_proba.predicciones) # Poniendo treshold treshold = 0.4 predicciones_treshold = (probabilidades >= treshold).astype('int') print("MODELO: ", nombre) print("Log Loss:",round(log_loss(y_test,predicciones),3)) print("F1 Score:",round(f1_score(y_test,predicciones),3)) print("Accuracy:",round(accuracy_score(y_test,predicciones),3)) print("Accuracy Treshold ",treshold, ":",round(accuracy_score(y_test,predicciones_treshold),3)) print("Precision:",round(precision_score(y_test,predicciones),3)) print("Precision Treshold ",treshold, ":",round(precision_score(y_test,predicciones_treshold),3)) print("Recall (sensibilidad):",round(recall_score(y_test,predicciones),3)) print("Recall Treshold:",round(recall_score(y_test,predicciones_treshold),3)) print("---- Correctos ----") print(df_proba[['real', 'predicciones', 'Correct']].describe()) # Plotting Confusion Matrix ConfusionMatrixDisplay.from_estimator(modelo, X_test, y_test, normalize='true') # Curva ROC figura = plt.figure(figsize=(14,14)) axes = figura.add_subplot(2,1,1) axes.plot([0,1],[0,1], linestyle="--") RocCurveDisplay.from_estimator(modelo, X_test, y_test, ax=axes) feature_names = X_test.columns if nombre == "Random Forest": # Plot tree random forest axes_tree = figura.add_subplot(2,1,2) tree.plot_tree(rf_clf.estimators_[0], feature_names=feature_names, fontsize=12, class_names=['No es popular', 'Es popular'], max_depth=4, impurity=False, proportion=True, precision=2)

evaluacion_modelo(rf_clf,"Random Forest",X_test,y_test, predicciones_forest,probabilidades_forest)

evaluacion_modelo(model_regression,"Regresión Logística",X_test,y_test, predicciones_regresion,probabilidades_regresion)

evaluacion_modelo(rf_clf_mejorado,"Random Forest Mejorado",X_test[["followers_categorical", "have_urls", "join_date"]],y_test, predicciones_forest_mejorado,probabilidades_forest_mejorado)

from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from xgboost import XGBClassifier

lr = LogisticRegression() print(lr.get_params().keys())

svc = SVC() print(svc.get_params().keys())

forest = RandomForestClassifier() print(forest.get_params().keys())

XGB = XGBClassifier() print(XGB.get_params().keys())

param_grid_forest = { 'n_estimators': [10,15,20,40,60], 'max_depth': [None, 5,10,20,30], 'min_samples_leaf': [1,5,10], 'random_state': [42,41,40] } param_grid_svc = { 'C': [0.001,0.01, 0.1], 'kernel': ['linear', 'rbf'], 'random_state': [42,41,40] } param_grid_logistic = { 'C': [0.001,0.01], 'solver': ['saga','lbfgs'], 'max_iter': [2000], 'random_state': [42,41,40,39,38] } param_grid_XGB = { 'n_estimators': [10,15,20,40,60], 'max_depth': [None, 5,10,20,30], 'learning_rate': [0.1,0.01,0.001], 'random_state': [42,41,40] } grids = [param_grid_forest, param_grid_svc, param_grid_logistic,param_grid_XGB]

modelos = ( RandomForestClassifier(random_state=42), SVC(random_state=42), LogisticRegression(random_state=42), XGBClassifier(random_state=42) )

nombres = ('Random Forest', 'SVC', 'Logistic Regression','XGBoostClassifier')

def optimizar(): for index, modelo in enumerate(modelos): print(nombres[index]) optimized_pipeline = GridSearchCV(modelo,grids[index], verbose=0,cv=2) optimized_pipeline.fit(X_train, y_train.values.ravel()) print("Best Params", nombres[index] + ":",optimized_pipeline.best_params_) print("Best Score ", nombres[index] + ":",optimized_pipeline.best_score_)

optimizar()

rf_eval = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_leaf=1, random_state=41)

rf_eval.fit(X_train,y_train)

predicciones_rf_eval = rf_eval.predict(X_test) probabilidades_rf_eval = rf_eval.predict_proba(X_test)

evaluacion_modelo(rf_eval,"Random Forest Ev. Final",X_test,y_test, predicciones_rf_eval,probabilidades_rf_eval)

rl_eval = LogisticRegression(C=0.01,solver='lbfgs',max_iter=2000,random_state=42) rl_eval.fit(X_train,y_train) predicciones_rl_eval = rl_eval.predict(X_test) probabilidades_rl_eval = rl_eval.predict_proba(X_test) evaluacion_modelo(rl_eval,"Logistic Regression Ev. Final",X_test,y_test, predicciones_rl_eval,probabilidades_rl_eval)

xgb_model = XGBClassifier(random_state=42,n_estimators=40,max_depth=5, learning_rate=0.1)

xgb_model.fit(X_train,y_train)

predicciones_xgb = xgb_model.predict(X_test) probabilidades_xgb = xgb_model.predict_proba(X_test)

evaluacion_modelo(xgb_model,"XGBoost",X_test,y_test, predicciones_xgb,probabilidades_xgb)

from sklearn.decomposition import PCA

pca = PCA(n_components=0.95) X_reduced = pca.fit_transform(X_train) X_reduced.shape