import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
headers = pd.read_csv('./Data/handles-data.csv')
tweets = pd.read_csv('./Data/tweets-data.csv')
pd.options.display.float_format = '{:.2f}'.format
tweets["nretweets"].isnull().sum()
tweets["nretweets"].quantile([0.65,0.70,0.8,0.9,0.95,0.99])
target = "is_popular"
# Binary target: 0 = unpopular, 1 = popular
# qcut with edges [0.65, 0.70, 1] labels the 65th-70th percentile band "0" and the top 30% "1";
# tweets below the 65th percentile fall outside the bins and come back as NaN
tweets[target] = pd.qcut(tweets["nretweets"], [0.65, 0.70, 1], labels=["0", "1"])
# Fill those NaNs with "0" via fillna (chained indexed assignment may not write through)
tweets[target] = tweets[target].fillna("0")
tweets[target].head(8)
tweets[target].value_counts(normalize=True, dropna=False) * 100
tweets[target].value_counts()
print(tweets[target].isnull().sum())  # should print 0 after the fill
sns.countplot(data=tweets, x=target)
union = pd.merge(headers, tweets, on='username')
union.describe()
union.to_csv("./Data/union_categorical.csv",sep=";", index=False)
union = pd.read_csv("./Data/union_categorical.csv", sep=";")
headers["followers"].quantile([0.4,0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.99])
union["followers_categorical"] = pd.qcut(union["followers"], [0.60,0.70,1], labels=["0", "1"])
union["followers_categorical"]
union["followers_categorical"][union["followers_categorical"].isnull()] = "0"
union[union["followers_categorical"].isnull()]["followers_categorical"].count()
sns.countplot(data=union,x=union["followers_categorical"])
union["have_urls"] = union["urls"] != "[]"
union.loc[union["have_urls"] == True, 'have_urls'] = "1"
union.loc[union["have_urls"] == False, 'have_urls'] = "0"
union["have_urls"]
(union[union["have_urls"]== "0"]["have_urls"].count() / union["have_urls"].count()) * 100
(union[union["have_urls"]== "1"]["have_urls"].count() / union["have_urls"].count()) * 100
union["have_urls"].value_counts(sort=True)
sns.countplot(data=union,x=union["have_urls"])
union["hashtags"].head(15)
union["have_hashtags"] = union["hashtags"] != "[]"
union.loc[union["have_hashtags"] == True, 'have_hashtags'] = "1"
union.loc[union["have_hashtags"] == False, 'have_hashtags'] = "0"
union["have_hashtags"].value_counts(sort=True)
sns.countplot(data=union,x=union["have_hashtags"])
union["join_date"] = pd.to_datetime(union["join_date"])
union["join_date"] = union["join_date"].dt.year
union["join_date"]
variables = union[["followers_categorical", "have_urls", "have_hashtags", "join_date"]]
variables.head(10)
variables.groupby(["join_date","followers_categorical","have_urls"]).count()
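# An illustrative cross-tab (assumption: union still carries the is_popular column at this
# point): how one engineered feature distributes across the target before modeling.
print(pd.crosstab(union["have_urls"], union["is_popular"], normalize="index"))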
# Reuse the name: target changes from the column-name string to a one-column DataFrame
target = union[[target]]
target.shape
variables.shape
target.head(8)
variables.to_csv("./Data/variables_modelo.csv", index=False)
target.to_csv("./Data/target_modelo.csv", index=False)
variables = pd.read_csv("./Data/variables_modelo.csv")
target = pd.read_csv("./Data/target_modelo.csv")
# stratify keeps the class proportions identical in the train and test splits
X_train, X_test, y_train, y_test = train_test_split(variables, target, test_size=0.3, random_state=25, stratify=target)
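# A quick sanity check (illustrative, not in the original flow): with stratify=target,
# the class proportions in train and test should match almost exactly.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))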
y_test.value_counts()
y_train.head(8)
# astype returns a copy, so assign it back; cast both splits for consistency
X_train = X_train.astype(int)
X_test = X_test.astype(int)
from sklearn.linear_model import LogisticRegression
model_regression = LogisticRegression()
# Pass the target as a 1-D array to avoid sklearn's DataConversionWarning
model_regression.fit(X_train, y_train.values.ravel())
predicciones_regresion = model_regression.predict(X_test)
probabilidades_regresion = model_regression.predict_proba(X_test)
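# A brief look (illustrative) at predict_proba output: column 0 is P(class 0) and
# column 1 is P(class 1), in the order given by model_regression.classes_.
print(model_regression.classes_)
print(probabilidades_regresion[:5])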
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicciones_regresion)
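# Unpacking the matrix (sketch): for binary labels, ravel() yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_test, predicciones_regresion).ravel()
print("TN:", tn, "FP:", fp, "FN:", fn, "TP:", tp)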
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train.values.ravel())
predicciones_forest = rf_clf.predict(X_test)
probabilidades_forest = rf_clf.predict_proba(X_test)
# Score one hypothetical account; build a DataFrame so feature names match the training data
muestra = pd.DataFrame([[1, 0, 0, 2010]], columns=X_train.columns)
print(rf_clf.predict_proba(muestra))
[(variable, importancia) for variable, importancia in zip(X_train, rf_clf.feature_importances_)]
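# The same importances as a sorted Series (illustrative), easier to read than the raw list
importancias = pd.Series(rf_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importancias)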
X_train_random_mejorado = X_train[["followers_categorical", "have_urls", "join_date"]]
rf_clf_mejorado = RandomForestClassifier(random_state=42)
rf_clf_mejorado.fit(X_train_random_mejorado, y_train.values.ravel())
predicciones_forest_mejorado = rf_clf_mejorado.predict(X_test[["followers_categorical", "have_urls", "join_date"]])
probabilidades_forest_mejorado = rf_clf_mejorado.predict_proba(X_test[["followers_categorical", "have_urls", "join_date"]])
from sklearn import tree
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, log_loss
def evaluacion_modelo(modelo, nombre, X_test, y_test, predicciones, probabilidades):
    # Build a dataframe pairing each prediction with its probability and the true label
    # Keep only the probability of the positive class ("popular")
    probabilidades = probabilidades[:, 1]
    df_proba = pd.DataFrame({
        "prediccion": predicciones,
        "probabilidad": probabilidades,
        "real": y_test["is_popular"],
    })
    df_proba["Correct"] = df_proba["real"] == df_proba["prediccion"]
    # Re-threshold the positive-class probability at 0.4 instead of the default 0.5
    threshold = 0.4
    predicciones_threshold = (probabilidades >= threshold).astype('int')
    print("MODEL: ", nombre)
    # Log loss is defined on probabilities, not on hard class predictions
    print("Log Loss:", round(log_loss(y_test, probabilidades), 3))
    print("F1 Score:", round(f1_score(y_test, predicciones), 3))
    print("Accuracy:", round(accuracy_score(y_test, predicciones), 3))
    print("Accuracy Threshold ", threshold, ":", round(accuracy_score(y_test, predicciones_threshold), 3))
    print("Precision:", round(precision_score(y_test, predicciones), 3))
    print("Precision Threshold ", threshold, ":", round(precision_score(y_test, predicciones_threshold), 3))
    print("Recall (sensitivity):", round(recall_score(y_test, predicciones), 3))
    print("Recall Threshold:", round(recall_score(y_test, predicciones_threshold), 3))
    print("---- Correct predictions ----")
    print(df_proba[['real', 'prediccion', 'Correct']].describe())
    # Confusion matrix, row-normalized
    ConfusionMatrixDisplay.from_estimator(modelo, X_test, y_test, normalize='true')
    # ROC curve, with the random-classifier diagonal as reference
    figura = plt.figure(figsize=(14, 14))
    axes = figura.add_subplot(2, 1, 1)
    axes.plot([0, 1], [0, 1], linestyle="--")
    RocCurveDisplay.from_estimator(modelo, X_test, y_test, ax=axes)
    feature_names = X_test.columns
    if nombre == "Random Forest":
        # Plot the first tree of the forest (use the model passed in, not the global rf_clf)
        axes_tree = figura.add_subplot(2, 1, 2)
        tree.plot_tree(modelo.estimators_[0], feature_names=feature_names, fontsize=12,
                       class_names=['Not popular', 'Popular'], max_depth=4,
                       impurity=False, proportion=True, precision=2)
evaluacion_modelo(rf_clf,"Random Forest",X_test,y_test, predicciones_forest,probabilidades_forest)
evaluacion_modelo(model_regression,"Regresión Logística",X_test,y_test, predicciones_regresion,probabilidades_regresion)
evaluacion_modelo(rf_clf_mejorado,"Random Forest Mejorado",X_test[["followers_categorical", "have_urls", "join_date"]],y_test, predicciones_forest_mejorado,probabilidades_forest_mejorado)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
lr = LogisticRegression()
print(lr.get_params().keys())
svc = SVC()
print(svc.get_params().keys())
forest = RandomForestClassifier()
print(forest.get_params().keys())
XGB = XGBClassifier()
print(XGB.get_params().keys())
param_grid_forest = {
    'n_estimators': [10, 15, 20, 40, 60],
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_leaf': [1, 5, 10],
    'random_state': [42, 41, 40]
}
param_grid_svc = {
    'C': [0.001, 0.01, 0.1],
    'kernel': ['linear', 'rbf'],
    'random_state': [42, 41, 40]
}
param_grid_logistic = {
    'C': [0.001, 0.01],
    'solver': ['saga', 'lbfgs'],
    'max_iter': [2000],
    'random_state': [42, 41, 40, 39, 38]
}
param_grid_XGB = {
    'n_estimators': [10, 15, 20, 40, 60],
    'max_depth': [None, 5, 10, 20, 30],
    'learning_rate': [0.1, 0.01, 0.001],
    'random_state': [42, 41, 40]
}
grids = [param_grid_forest, param_grid_svc, param_grid_logistic,param_grid_XGB]
modelos = (
RandomForestClassifier(random_state=42),
SVC(random_state=42),
LogisticRegression(random_state=42),
XGBClassifier(random_state=42)
)
nombres = ('Random Forest', 'SVC', 'Logistic Regression','XGBoostClassifier')
def optimizar():
    # Grid-search each model with 2-fold CV and report its best configuration
    for index, modelo in enumerate(modelos):
        print(nombres[index])
        optimized_pipeline = GridSearchCV(modelo, grids[index], verbose=0, cv=2)
        optimized_pipeline.fit(X_train, y_train.values.ravel())
        print("Best Params", nombres[index] + ":", optimized_pipeline.best_params_)
        print("Best Score ", nombres[index] + ":", optimized_pipeline.best_score_)
optimizar()
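# A variant sketch (hypothetical helper, not in the original): GridSearchCV also exposes
# best_estimator_, so the tuned models can be kept and reused instead of re-fitting by hand.
def optimizar_con_modelos():
    mejores = {}
    for index, modelo in enumerate(modelos):
        gs = GridSearchCV(modelo, grids[index], verbose=0, cv=2)
        gs.fit(X_train, y_train.values.ravel())
        mejores[nombres[index]] = gs.best_estimator_  # already refit on the full training set
    return mejores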
rf_eval = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_leaf=1, random_state=41)
rf_eval.fit(X_train, y_train.values.ravel())
predicciones_rf_eval = rf_eval.predict(X_test)
probabilidades_rf_eval = rf_eval.predict_proba(X_test)
evaluacion_modelo(rf_eval, "Random Forest Final Eval", X_test, y_test, predicciones_rf_eval, probabilidades_rf_eval)
rl_eval = LogisticRegression(C=0.01,solver='lbfgs',max_iter=2000,random_state=42)
rl_eval.fit(X_train, y_train.values.ravel())
predicciones_rl_eval = rl_eval.predict(X_test)
probabilidades_rl_eval = rl_eval.predict_proba(X_test)
evaluacion_modelo(rl_eval, "Logistic Regression Final Eval", X_test, y_test, predicciones_rl_eval, probabilidades_rl_eval)
xgb_model = XGBClassifier(random_state=42,n_estimators=40,max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train, y_train.values.ravel())
predicciones_xgb = xgb_model.predict(X_test)
probabilidades_xgb = xgb_model.predict_proba(X_test)
evaluacion_modelo(xgb_model,"XGBoost",X_test,y_test, predicciones_xgb,probabilidades_xgb)
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)
X_reduced.shape
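# A quick check (illustrative): n_components=0.95 keeps the smallest number of principal
# components whose cumulative explained variance reaches 95%; the ratios show how much
# each retained component contributes.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())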