import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# Load the scraped Twitter datasets: one row per account (handles) and one per tweet.
headers = pd.read_csv('./Data/handles-data.csv')
tweets = pd.read_csv('./Data/tweets-data.csv')
# Show floats with two decimals in notebook output.
pd.options.display.float_format = '{:.2f}'.format
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3457: DtypeWarning: Columns (5) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
# Inspect missing values and the retweet distribution before binarising.
tweets["nretweets"].isnull().sum()
tweets["nretweets"].quantile([0.65,0.70,0.8,0.9,0.95,0.99])
target = "is_popular"
# Binary target: unpopular:0 popular:1.
# qcut bins (65th-70th pct -> "0", 70th-100th -> "1"); everything below the
# 65th percentile falls outside the bins and comes back as NaN.
tweets[target] = pd.qcut(tweets["nretweets"], [0.65,0.70,1], labels=["0","1"] )
# BUG FIX: the original chained assignment
#   tweets[target][tweets[target].isnull()] = "0"
# triggers SettingWithCopyWarning and may silently write to a copy;
# fillna on the categorical column ("0" is an existing category) is safe.
tweets[target] = tweets[target].fillna("0")
tweets[target].head(8)
tweets[target].value_counts(normalize=True, dropna=False) * 100
tweets[target].value_counts()
# Sanity check: no NaNs should remain in the target column (expect 0).
print(tweets[tweets[target].isnull()][target].count())
0
# Class balance of the new binary target.
sns.countplot(data=tweets,x=tweets[target])
# Join account-level metadata onto each tweet by username.
union = pd.merge(headers, tweets, left_on='username', right_on='username' )
union.describe()
# Persist and reload the merged dataset (round-trip through CSV).
union.to_csv("./Data/union_categorical.csv",sep=";", index=False)
union = pd.read_csv("./Data/union_categorical.csv", sep=";")
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3457: DtypeWarning: Columns (23) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
# Follower-count distribution of the accounts.
headers["followers"].quantile([0.4,0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.99])
# Binarise followers: 60th-70th pct -> "0", above 70th -> "1";
# values below the 60th percentile fall outside the bins and become NaN.
union["followers_categorical"] = pd.qcut(union["followers"], [0.60,0.70,1], labels=["0", "1"])
union["followers_categorical"]
# BUG FIX: replace the chained assignment
#   union["followers_categorical"][union["followers_categorical"].isnull()] = "0"
# (SettingWithCopyWarning-prone) with fillna; "0" is already a category.
union["followers_categorical"] = union["followers_categorical"].fillna("0")
# Sanity check: should be 0 remaining NaNs.
union[union["followers_categorical"].isnull()]["followers_categorical"].count()
sns.countplot(data=union,x=union["followers_categorical"])
# Flag tweets that carry at least one URL ("[]" means an empty url list),
# encoded as "1"/"0" strings like the other categorical features.
union["have_urls"] = (union["urls"] != "[]").map({True: "1", False: "0"})
union["have_urls"]
# Share of tweets without / with URLs, in percent.
(union[union["have_urls"] == "0"]["have_urls"].count() / union["have_urls"].count()) * 100
(union[union["have_urls"] == "1"]["have_urls"].count() / union["have_urls"].count()) * 100
union["have_urls"].value_counts(sort=True)
sns.countplot(data=union, x=union["have_urls"])
# Peek at the raw hashtags column, then flag tweets that carry hashtags
# ("[]" means none), encoded as "1"/"0" strings.
union["hashtags"].head(15)
union["have_hashtags"] = (union["hashtags"] != "[]").map({True: "1", False: "0"})
union["have_hashtags"].value_counts(sort=True)
sns.countplot(data=union, x=union["have_hashtags"])
# Reduce join_date to the join year (numeric feature for the models).
union["join_date"] = pd.to_datetime(union["join_date"])
union["join_date"] = union["join_date"].dt.year
union["join_date"]
# Final feature matrix used by the models.
variables = union[["followers_categorical", "have_urls", "have_hashtags", "join_date"]]
variables.head(10)
variables.groupby(["join_date","followers_categorical","have_urls"]).count()
# NOTE(review): up to here `target` was the column-name string; it is rebound
# to a one-column DataFrame now — confusing but intentional.
target = union[[target]]
target.shape
variables.shape
target.head(8)
# Persist features/target and reload so CSV parsing re-infers numeric dtypes.
variables.to_csv("./Data/variables_modelo.csv", index=False)
target.to_csv("./Data/target_modelo.csv", index=False)
variables = pd.read_csv("./Data/variables_modelo.csv")
target = pd.read_csv("./Data/target_modelo.csv")
# Stratify keeps the class proportions identical in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(variables, target, test_size=0.3, random_state=25, stratify=target)
y_test.value_counts()
y_train.head(8)
# BUG FIX: astype() returns a NEW object; the original `X_train.astype(int)`
# discarded the result. Assign it, and convert X_test too so both splits
# feed the models consistent integer features.
X_train = X_train.astype(int)
X_test = X_test.astype(int)
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression on the categorical features.
model_regression = LogisticRegression()
# ravel() gives the 1-D y scikit-learn expects and silences the
# DataConversionWarning raised when fitting with a one-column DataFrame.
model_regression.fit(X_train, y_train.values.ravel())
/home/julianmelero/twitter_ai_popular/venv/lib/python3.8/site-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
# Hard class predictions and per-class probabilities on the held-out set.
predicciones_regresion = model_regression.predict(X_test)
probabilidades_regresion = model_regression.predict_proba(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicciones_regresion)
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with a fixed seed for reproducibility.
rf_clf = RandomForestClassifier(random_state=42)
# ravel() avoids the column-vector DataConversionWarning.
rf_clf.fit(X_train, y_train.values.ravel())
/tmp/ipykernel_733/1435717670.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
rf_clf.fit(X_train,y_train)
# Forest predictions and probabilities on the held-out set.
predicciones_forest = rf_clf.predict(X_test)
probabilidades_forest = rf_clf.predict_proba(X_test)
# FIX: wrap the ad-hoc sample in a DataFrame with the training column names;
# a bare list triggers "X does not have valid feature names" warnings.
print(rf_clf.predict_proba(pd.DataFrame([[1, 0, 0, 2010]], columns=X_train.columns)))
[[0.09467756 0.90532244]]
/home/julianmelero/twitter_ai_popular/venv/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
warnings.warn(
# Pair each feature name with its importance in the fitted forest.
[(variable, importancia) for variable, importancia in zip(X_train, rf_clf.feature_importances_)]
# Drop the least informative feature (have_hashtags) and refit the forest.
X_train_random_mejorado = X_train[["followers_categorical", "have_urls", "join_date"]]
rf_clf_mejorado = RandomForestClassifier(random_state=42)
# ravel() avoids the column-vector DataConversionWarning.
rf_clf_mejorado.fit(X_train_random_mejorado, y_train.values.ravel())
/tmp/ipykernel_733/3197789431.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
rf_clf_mejorado.fit(X_train_random_mejorado,y_train)
# Predictions of the reduced-feature forest on the matching test columns.
predicciones_forest_mejorado = rf_clf_mejorado.predict(X_test[["followers_categorical", "have_urls", "join_date"]])
probabilidades_forest_mejorado = rf_clf_mejorado.predict_proba(X_test[["followers_categorical", "have_urls", "join_date"]])
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import tree
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score,log_loss
def evaluacion_modelo(modelo, nombre, X_test, y_test, predicciones, probabilidades):
    """Print evaluation metrics and plot diagnostics for a fitted classifier.

    Parameters
    ----------
    modelo : fitted classifier, used for the estimator-based plots.
    nombre : display name; when it equals "Random Forest" one of the
        ensemble's trees is additionally plotted.
    X_test, y_test : evaluation data; y_test must contain an "is_popular" column.
    predicciones : hard class predictions for X_test.
    probabilidades : predict_proba output for X_test (one column per class).
    """
    # Keep the full probability matrix for log-loss; column 1 is P(popular).
    probabilidades_completas = probabilidades
    probabilidades = probabilidades[:, 1]
    # DataFrame pairing prediction, probability and the ground truth.
    data = {
        "prediccion": predicciones,
        "probabilidad": probabilidades,
        "real": y_test["is_popular"]
    }
    df_proba = pd.DataFrame(data)
    df_proba['predicciones'] = predicciones
    df_proba['probabilidades'] = probabilidades
    df_proba['Correct'] = (df_proba["real"] == df_proba.predicciones)
    # Alternative decision threshold applied to P(popular).
    treshold = 0.4
    predicciones_treshold = (probabilidades >= treshold).astype('int')
    print("MODELO: ", nombre)
    # BUG FIX: log_loss must be computed from predicted probabilities, not
    # from the hard 0/1 predictions (which degenerates it into a heavily
    # penalised error-rate and produced the ~8-10 values seen in the output).
    print("Log Loss:", round(log_loss(y_test, probabilidades_completas), 3))
    print("F1 Score:", round(f1_score(y_test, predicciones), 3))
    print("Accuracy:", round(accuracy_score(y_test, predicciones), 3))
    print("Accuracy Treshold ", treshold, ":", round(accuracy_score(y_test, predicciones_treshold), 3))
    print("Precision:", round(precision_score(y_test, predicciones), 3))
    print("Precision Treshold ", treshold, ":", round(precision_score(y_test, predicciones_treshold), 3))
    print("Recall (sensibilidad):", round(recall_score(y_test, predicciones), 3))
    print("Recall Treshold:", round(recall_score(y_test, predicciones_treshold), 3))
    print("---- Correctos ----")
    print(df_proba[['real', 'predicciones', 'Correct']].describe())
    # Confusion matrix normalised over the true labels.
    ConfusionMatrixDisplay.from_estimator(modelo, X_test, y_test, normalize='true')
    # ROC curve, with the random-classifier diagonal for reference.
    figura = plt.figure(figsize=(14, 14))
    axes = figura.add_subplot(2, 1, 1)
    axes.plot([0, 1], [0, 1], linestyle="--")
    RocCurveDisplay.from_estimator(modelo, X_test, y_test, ax=axes)
    feature_names = X_test.columns
    if nombre == "Random Forest":
        # BUG FIX: plot a tree of the model under evaluation; the original
        # referenced the global rf_clf regardless of which model was passed.
        axes_tree = figura.add_subplot(2, 1, 2)
        tree.plot_tree(modelo.estimators_[0], feature_names=feature_names, fontsize=12,
                       class_names=['No es popular', 'Es popular'], max_depth=4,
                       impurity=False, proportion=True, precision=2)
# Evaluate the baseline random forest.
evaluacion_modelo(rf_clf,"Random Forest",X_test,y_test, predicciones_forest,probabilidades_forest)
MODELO: Random Forest
Log Loss: 8.626
F1 Score: 0.452
Accuracy: 0.75
Accuracy Treshold 0.4 : 0.749
Precision: 0.679
Precision Treshold 0.4 : 0.668
Recall (sensibilidad): 0.339
Recall Treshold: 0.35
---- Correctos ----
real predicciones
count 12261.000000 12261.000000
mean 0.303972 0.151619
std 0.459989 0.358666
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 1.000000 1.000000
# Evaluate the baseline logistic regression.
evaluacion_modelo(model_regression,"Regresión Logística",X_test,y_test, predicciones_regresion,probabilidades_regresion)
MODELO: Regresión Logística
Log Loss: 10.059
F1 Score: 0.311
Accuracy: 0.709
Accuracy Treshold 0.4 : 0.709
Precision: 0.554
Precision Treshold 0.4 : 0.554
Recall (sensibilidad): 0.216
Recall Treshold: 0.216
---- Correctos ----
real predicciones
count 12261.000000 12261.000000
mean 0.303972 0.118751
std 0.459989 0.323508
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 1.000000 1.000000
# Evaluate the reduced-feature random forest on the matching test columns.
evaluacion_modelo(rf_clf_mejorado,"Random Forest Mejorado",X_test[["followers_categorical", "have_urls", "join_date"]],y_test, predicciones_forest_mejorado,probabilidades_forest_mejorado)
MODELO: Random Forest Mejorado
Log Loss: 8.673
F1 Score: 0.446
Accuracy: 0.749
Accuracy Treshold 0.4 : 0.743
Precision: 0.677
Precision Treshold 0.4 : 0.641
Recall (sensibilidad): 0.332
Recall Treshold: 0.353
---- Correctos ----
real predicciones
count 12261.000000 12261.000000
mean 0.303972 0.148927
std 0.459989 0.356032
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 1.000000 1.000000
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
# Inspect the tunable hyper-parameter names of each candidate model.
lr = LogisticRegression()
print(lr.get_params().keys())
dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])
# Tunable hyper-parameters of SVC.
svc = SVC()
print(svc.get_params().keys())
dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])
# Tunable hyper-parameters of RandomForestClassifier.
forest = RandomForestClassifier()
print(forest.get_params().keys())
dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])
# Tunable hyper-parameters of XGBClassifier.
XGB = XGBClassifier()
print(XGB.get_params().keys())
dict_keys(['objective', 'use_label_encoder', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'gamma', 'gpu_id', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'predictor', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])
# Hyper-parameter search spaces for each candidate model.
param_grid_forest = {
'n_estimators': [10,15,20,40,60],
'max_depth': [None, 5,10,20,30],
'min_samples_leaf': [1,5,10],
'random_state': [42,41,40]
}
param_grid_svc = {
'C': [0.001,0.01, 0.1],
'kernel': ['linear', 'rbf'],
'random_state': [42,41,40]
}
param_grid_logistic = {
'C': [0.001,0.01],
'solver': ['saga','lbfgs'],
# High iteration cap so saga/lbfgs have a chance to converge.
'max_iter': [2000],
'random_state': [42,41,40,39,38]
}
param_grid_XGB = {
'n_estimators': [10,15,20,40,60],
'max_depth': [None, 5,10,20,30],
'learning_rate': [0.1,0.01,0.001],
'random_state': [42,41,40]
}
# Grids, models and display names are kept index-aligned.
grids = [param_grid_forest, param_grid_svc, param_grid_logistic,param_grid_XGB]
modelos = (
RandomForestClassifier(random_state=42),
SVC(random_state=42),
LogisticRegression(random_state=42),
XGBClassifier(random_state=42)
)
nombres = ('Random Forest', 'SVC', 'Logistic Regression','XGBoostClassifier')
def optimizar():
    """Grid-search every candidate model and report its best params/score."""
    for nombre, modelo, grid in zip(nombres, modelos, grids):
        print(nombre)
        # 2-fold CV keeps the search fast on the full training set.
        busqueda = GridSearchCV(modelo, grid, verbose=0, cv=2)
        busqueda.fit(X_train, y_train.values.ravel())
        print("Best Params", nombre + ":", busqueda.best_params_)
        print("Best Score ", nombre + ":", busqueda.best_score_)

optimizar()
Random Forest
Best Params Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 20, 'random_state': 41}
Best Score Random Forest: 0.7456827239040761
SVC
Best Params SVC: {'C': 0.1, 'kernel': 'linear', 'random_state': 42}
Best Score SVC: 0.7006222470810319
Logistic Regression
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
Best Params Logistic Regression: {'C': 0.01, 'max_iter': 2000, 'random_state': 42, 'solver': 'lbfgs'}
Best Score Logistic Regression: 0.7074389988114382
XGBoostClassifier
Best Params XGBoostClassifier: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 40, 'random_state': 42}
Best Score XGBoostClassifier: 0.7455778508005313
# Refit the random forest with the best grid-search hyper-parameters.
rf_eval = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_leaf=1, random_state=41)
# ravel() avoids the column-vector DataConversionWarning.
rf_eval.fit(X_train, y_train.values.ravel())
/tmp/ipykernel_21031/2302440801.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
rf_eval.fit(X_train,y_train)
# Final evaluation of the tuned random forest.
predicciones_rf_eval = rf_eval.predict(X_test)
probabilidades_rf_eval = rf_eval.predict_proba(X_test)
evaluacion_modelo(rf_eval,"Random Forest Ev. Final",X_test,y_test, predicciones_rf_eval,probabilidades_rf_eval)
MODELO: Random Forest Ev. Final
Log Loss: 8.626
F1 Score: 0.452
Accuracy: 0.75
Accuracy Treshold 0.4 : 0.749
Precision: 0.679
Precision Treshold 0.4 : 0.668
Recall (sensibilidad): 0.339
Recall Treshold: 0.35
---- Correctos ----
real predicciones
count 12261.000000 12261.000000
mean 0.303972 0.151619
std 0.459989 0.358666
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 1.000000 1.000000
# Refit the logistic regression with the best grid-search hyper-parameters.
rl_eval = LogisticRegression(C=0.01, solver='lbfgs', max_iter=2000, random_state=42)
# ravel() avoids the column-vector DataConversionWarning seen in the output.
rl_eval.fit(X_train, y_train.values.ravel())
predicciones_rl_eval = rl_eval.predict(X_test)
probabilidades_rl_eval = rl_eval.predict_proba(X_test)
evaluacion_modelo(rl_eval,"Logistic Regression Ev. Final",X_test,y_test, predicciones_rl_eval,probabilidades_rl_eval)
/home/julianmelero/twitter_ai_complete/venv/lib/python3.8/site-packages/sklearn/utils/validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
MODELO: Logistic Regression Ev. Final
Log Loss: 10.059
F1 Score: 0.311
Accuracy: 0.709
Accuracy Treshold 0.4 : 0.709
Precision: 0.554
Precision Treshold 0.4 : 0.554
Recall (sensibilidad): 0.216
Recall Treshold: 0.216
---- Correctos ----
real predicciones
count 12261.000000 12261.000000
mean 0.303972 0.118751
std 0.459989 0.323508
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 1.000000 0.000000
max 1.000000 1.000000
# Refit XGBoost with the best grid-search hyper-parameters and evaluate.
xgb_model = XGBClassifier(random_state=42,n_estimators=40,max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train,y_train)
predicciones_xgb = xgb_model.predict(X_test)
probabilidades_xgb = xgb_model.predict_proba(X_test)
evaluacion_modelo(xgb_model,"XGBoost",X_test,y_test, predicciones_xgb,probabilidades_xgb)
MODELO: XGBoost
Log Loss: 8.631
F1 Score: 0.451
Accuracy: 0.75
Accuracy Treshold 0.4 : 0.744
Precision: 0.679
Precision Treshold 0.4 : 0.639
Recall (sensibilidad): 0.338
Recall Treshold: 0.361
---- Correctos ----
real predicciones
count 12261.000000 12261.00000
mean 0.303972 0.15113
std 0.459989 0.35819
min 0.000000 0.00000
25% 0.000000 0.00000
50% 0.000000 0.00000
75% 1.000000 0.00000
max 1.000000 1.00000
from sklearn.decomposition import PCA
# Keep enough principal components to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)
X_reduced.shape