!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
# Miscellaneous
from __future__ import print_function
from IPython.display import display
# Importing required libraries
import pandas as pd
import numpy as np
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from ydata_profiling import ProfileReport
from yellowbrick.classifier import ClassPredictionError, ConfusionMatrix, ROCAUC
from tqdm.notebook import trange, tqdm
# Sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, LearningCurveDisplay, ShuffleSplit
## Algorithms for Binary Classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
## Visualization of Decision Tree
from sklearn.tree import plot_tree, export_text
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb" # Export function pandas_title()
%run "pandas-missing-extension.ipynb" # Export missing functions from the API of Pandas library
%%time
# Read the raw CSV once and persist it as a Feather file; the %%time cell magic above
# reports how long this takes.
df_data = pd.read_csv('megaGymDataset.csv')
df_data.to_feather("megaGymDataset.feather")
%%time
# Reload the dataset from the Feather copy written above (timed with %%time as well).
df_data = pd.read_feather("megaGymDataset.feather")
# Summary statistics formatted to five decimals. `pandas_title` and `above_zero` are not
# defined in this file; presumably they come from the notebooks executed with %run — TODO confirm.
pandas_title(df_data.describe().style.format('{:.5f}').applymap(above_zero), 'Describe Data: First middle')
# Per-column memory footprint converted from bytes to MB, followed by a 'total' row.
memory_usage = df_data.memory_usage(deep=True) / 1024 ** 2
memory_usage.loc['total'] = memory_usage.sum()
memory_usage = memory_usage.to_frame(name="memory usage of variable (MB)")
# Show the table with an in-cell bar for each value and six-decimal MB formatting.
pandas_title(memory_usage, 'memory usage of features', True).bar(subset=["memory usage of variable (MB)",], color='#ee1f5f', axis=0).format("{:.6f} MB")
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int8/16/32/64 or float16/32/64 are
    considered; every other column is left untouched. The frame is modified
    in place and the same object is returned.

    Args:
        df: DataFrame to shrink.
        verbose: When true, print the final memory usage (MB) and the
            percentage reduction.

    Returns:
        The same ``df`` object, with downcast columns.

    Note:
        Float columns are narrowed based on their min/max only, so values may
        lose precision when stored as float16/float32.
    """
    numerics = {"int8", "int16", "int32", "int64", "float16", "float32", "float64"}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = str(df[col].dtypes)
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.startswith("int"):
            # Bounds are inclusive: a value equal to a dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df
# Downcast numeric columns in place and report the memory saving.
df_data = reduce_memory_usage(df_data, verbose=True)
# NOTE(review): DataFrame.info() prints its report and returns None, so the pd.DataFrame(...)
# wrapper here is built from None; the following line prints the report again without
# memory_usage="deep".
pd.DataFrame(df_data.info(memory_usage="deep"))
df_data.info()
# Drop the 'Unnamed: 0' column (presumably the CSV's saved index — TODO confirm).
df_data.drop(['Unnamed: 0'], axis=1, inplace=True)
# Work on a copy so df_data keeps the original text labels.
df_data_processing = df_data.copy()
# Encode the Level labels as ordered integers: Beginner=0, Intermediate=1, Expert=2.
df_data_processing.Level = df_data.Level.replace(["Beginner", "Intermediate", "Expert"], [0, 1, 2])
df_data_processing.Level.value_counts()
df_data_processing.Type.value_counts()
categorical_columns = ['Type', 'BodyPart', 'Equipment']
# The `.missing` accessor is not defined in this file; presumably it is registered by
# pandas-missing-extension.ipynb — TODO confirm.
pandas_title(df_data_processing.missing.missing_variable_summary().set_index('variable').T.style.format('{:.0f}'), 'Quantity and percentage of Missing Values')
# Count duplicated rows, then remove them in place.
df_data_processing.duplicated().value_counts()
df_data_processing.drop_duplicates(inplace=True)
# NOTE(review): df_imputeknn is not defined anywhere in this file; unless one of the %run
# notebooks defines it, this line raises NameError — confirm (df_data_processing may be intended).
pandas_title(df_imputeknn.nunique().to_frame(name="Quantity ").T, 'Unique values')
# Minimal profiling report, built from df_data (not from df_data_processing).
profile = ProfileReport(
df_data, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}},
minimal=True
)
profile
df_data
# Load the regular expression library
import re
# Remove punctuation
#papers['paper_text_processed'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))
def limpiar_variable(list_variables, sufix, df=None):
    """Create cleaned-text copies of the given columns.

    For every column name in ``list_variables`` a new column named
    ``<column><sufix>`` is written to the frame. The cleaning steps run in a
    fixed order:

    1. missing values become the string ``'none'`` and text is lower-cased;
    2. punctuation (every character of ``string.punctuation``) is removed;
    3. digits are removed;
    4. runs of whitespace collapse to a single space and the ends are trimmed.

    Whitespace is normalised last so that gaps left by removed punctuation or
    digits never glue two words together.

    Args:
        list_variables: Names of the text columns to clean.
        sufix: Suffix appended to each column name to build the output column.
        df: DataFrame to modify in place. Defaults to the module-level
            ``df_data``.

    Returns:
        None. The frame is modified in place.
    """
    import string  # local import: only this function needs it

    target = df_data if df is None else df
    # Patterns are compiled once instead of once per row.
    punctuation = re.compile(f"[{re.escape(string.punctuation)}]")
    digits = re.compile(r"\d+")
    whitespace = re.compile(r"\s+")

    for variable in list_variables:
        target[f'{variable}{sufix}'] = (
            target[variable]
            .fillna('none')
            .str.lower()
            .str.replace(punctuation, '', regex=True)
            .str.replace(digits, '', regex=True)
            .str.replace(whitespace, ' ', regex=True)
            .str.strip()
        )
# Add the cleaned '<column>_Processed' text columns to df_data and show the result.
limpiar_variable(['Title', 'Desc'], '_Processed')
df_data
#!pip install wordcloud==1.8.2.2 scipy==1.9.3 gensim==4.3.0 pyLDAvis==3.4.0
# Import the wordcloud library
from wordcloud import WordCloud
import os
from PIL import Image
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

colums_text_analysis = ['Title_Processed', 'Desc_Processed']

# Load the silhouette used as the cloud mask; the context manager closes the file handle
# once the pixels have been copied into the array.
with Image.open(os.path.join('images/', "silhouette-fitness-preview.png")) as mask_image:
    image_mask = np.array(mask_image)

# The stop-word list does not depend on the column, so build it once. The set gives
# constant-time membership checks; 'none' is the placeholder written for missing text.
stop_words = stopwords.words('english')
stop_words.extend(['none'])
stop_word_set = set(stop_words)

for column in colums_text_analysis:
    # Join all rows in one pass instead of growing a string row by row.
    text_list = ' '.join(df_data[column].dropna())
    # Keep words longer than two characters that are not stop words.
    text_list = ' '.join(
        [x for x in text_list.split(' ') if x not in stop_word_set and len(x) > 2]
    )
    # Create a WordCloud object and generate the cloud
    wc = WordCloud(background_color='black', mask=image_mask, max_words=2000, random_state=42)
    wc.generate(text_list)
    # Optional: write the cloud to disk
    # wc.to_file(f"text_analysis_{column}.png")
    plt.axis("off")
    plt.imshow(wc, interpolation='bilinear')
    plt.title(f"Text analysis variable {column}", fontsize=20, fontdict={"weight": "bold"}, pad=30)
    plt.show()
import gensim
import nltk
from gensim.utils import simple_preprocess

nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words plus 'none', the placeholder written for missing text.
stop_words = [*stopwords.words('english'), 'none']
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` before tokenizing. One token list
    is yielded per input item, in order. ``deacc=True`` is forwarded to
    ``simple_preprocess`` (per the gensim docs it strips accent marks from
    tokens).
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
def remove_stopwords(texts):
    """Return the tokens of each document with stop words removed.

    Each item of ``texts`` is converted with ``str()`` and tokenized with
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    are dropped.

    Args:
        texts: Iterable of documents (strings or token lists).

    Returns:
        A list with one list of kept tokens per document, in input order.
    """
    # Snapshot the stop words into a set once: membership is then O(1) per
    # token instead of a scan of the whole list.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
from gensim import corpora

# Tokenize the cleaned descriptions, then strip stop words from every document.
data = df_data['Desc_Processed'].values.tolist()
data_words = remove_stopwords(list(sent_to_words(data)))
# Preview: first 30 tokens of the first document.
print(data_words[0][:30])

# Map every distinct token to an integer id.
id2word = corpora.Dictionary(data_words)
texts = data_words
# Bag-of-words representation of each document.
corpus = list(map(id2word.doc2bow, texts))
# Preview: first 30 bag-of-words entries of the first document.
print(corpus[0][:30])
from pprint import pprint

# Number of topics the model is asked to find.
num_topics = 10
# Build the LDA model. random_state is fixed so the topics are reproducible between
# runs, matching the seed used for the other stochastic steps in this file.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)
# Print the keywords of every topic.
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]
import pickle
import pyLDAvis
# pyLDAvis 3.3 renamed its gensim adapter to `gensim_models`; fall back to the old
# module name so both old and new releases work.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
# Create the output folder if it does not exist yet, otherwise the writes below fail.
os.makedirs('./results', exist_ok=True)
LDAvis_data_filepath = './results/ldavis_prepared_' + str(num_topics)

# Preparing the visualization is slow; set this flag to False to reuse the pickle
# written by a previous run.
REBUILD_LDAVIS = True
if REBUILD_LDAVIS:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the prepared pyLDAvis data from disk. pickle is only safe here because the
# file is produced by this notebook; never unpickle a file from an untrusted source.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared
# Split the frame into target (Level) and features.
# NOTE(review): df_imputeknn is not defined anywhere in this file; presumably it comes
# from one of the %run notebooks — confirm before running.
y = df_imputeknn['Level']
X = df_imputeknn.drop(columns=['Level'])
# Oversample with SMOTE using a fixed seed.
# NOTE(review): this runs before train_test_split below, so resampled rows can end up
# in the test split — confirm this is intended.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)

import warnings
from yellowbrick.style import set_palette

# Silence every warning from here on and select the plotting palette.
warnings.filterwarnings("ignore")
set_palette('sns_bright')

# Cross-validation scheme: 5 random splits, 20% held out in each.
cv = ShuffleSplit(test_size=0.2, n_splits=5, random_state=42)
# Classifiers to tune, keyed by the name that `params` uses for the matching grid.
# Insertion order matters: the training loop walks this dict in order and fills in the
# three None placeholders from earlier results before it reaches them
# (BaggingClassifier and BoostingDecisionTree after DecisionTreeClassifier;
# VotingClassifier after LogisticRegression, SupportVectorMachine and GaussianNaiveBayes).
reg_classifier = {
'LogisticRegression': LogisticRegression(),
'RandomForestClassifier': RandomForestClassifier(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'KNeighborsClassifier': KNeighborsClassifier(),
'LGBMClassifier': LGBMClassifier(),
# CatBoost is disabled here and in `params`.
# 'CatBoostClassifier': CatBoostClassifier(),
'AdaBoostClassifier': AdaBoostClassifier(),
'GradientBoostingClassifier': GradientBoostingClassifier(),
'SupportVectorMachine': SVC(),
'GaussianNaiveBayes': GaussianNB(),
# Placeholders replaced inside the training loop (see note above).
'BaggingClassifier': None,
'BoostingDecisionTree': None,
'VotingClassifier': None
}
# Hyper-parameter grids for GridSearchCV, keyed by the same names as `reg_classifier`.
params = {
    # NOTE(review): 'l1' is only supported by the 'liblinear' and 'saga' solvers, so the
    # other 'l1' combinations in this grid fail to fit and are scored as NaN.
    'LogisticRegression': {
        "C": np.logspace(-4, 4, 4),
        "penalty": ['l1', 'l2'],
        "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'max_iter': [100],
        'random_state': [42],
        'n_jobs': [-1],
    },
    'RandomForestClassifier': {
        "criterion": ['entropy', 'gini'],
        "n_estimators": [96, 128, 160],
        "max_depth": [6, 8, 10],
        'random_state': [42],
        'n_jobs': [-1],
    },
    'DecisionTreeClassifier': {
        "criterion": ['entropy', 'gini'],
        "max_depth": [6, 8, 10],
        'random_state': [42],
    },
    'KNeighborsClassifier': {
        'n_neighbors': [1, 3, 5, 7, 9, 12, 15, 19, 24],
        'weights': ['uniform', 'distance'],
        'n_jobs': [-1],
        'p': [1, 2, 3],
    },
    'LGBMClassifier': {
        'num_leaves': [15, 20, 25],
        'min_child_samples': [12, 15, 18],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1],
        'reg_alpha': [0.025, 0.03, 0.035],
        'n_jobs': [-1],
        'random_state': [42],
    },
    # CatBoost is disabled here and in `reg_classifier`.
    # 'CatBoostClassifier': {
    #     'max_depth': [4, 8, 16],
    #     'learning_rate': [0.1],
    #     'iterations': [10, 20, 30],
    #     'random_state': [42],
    # },
    'AdaBoostClassifier': {
        "n_estimators": [25, 30, 35],
        "learning_rate": [0.1],
        'random_state': [42],
    },
    'GradientBoostingClassifier': {
        "n_estimators": [15, 20, 25],
        "learning_rate": [0.1],
        'random_state': [42],
        "loss": ["log_loss"],
        "min_samples_split": [0.05, 0.1, 0.15],
        "min_samples_leaf": [0.1, 0.2, 0.3],
        "max_depth": [3, 4, 5],
        "max_features": ["log2", "sqrt"],
        # "mae" is not an accepted criterion in the scikit-learn releases that provide
        # loss="log_loss"; "squared_error" is the other valid option.
        "criterion": ["friedman_mse", "squared_error"],
        "subsample": [0.85, 0.9, 0.95],
    },
    'SupportVectorMachine': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'random_state': [42],
        # Needed so the fitted SVC exposes predict_proba (soft voting uses it).
        'probability': [True],
    },
    # Nothing to tune: the search only evaluates the default estimator.
    'GaussianNaiveBayes': {},
    'BaggingClassifier': {
        'max_samples': [0.5],
        'max_features': [1.0],
        'n_estimators': [10, 20, 30],
        'n_jobs': [-1],
        'random_state': [42],
    },
    'BoostingDecisionTree': {
        'n_estimators': [10, 20, 30],
        'learning_rate': [0.1],
        'random_state': [42],
    },
    'VotingClassifier': {
        'voting': ['soft'],
        'n_jobs': [-1],
    },
}
# Keyword arguments shared by every LearningCurveDisplay.from_estimator call.
common_params = dict(
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=cv,
    score_type="both",
    n_jobs=-1,
    line_kw={"marker": "o"},
    std_display_style="fill_between",
    score_name="Accuracy",
    random_state=42,
)

# Containers filled by the training loop: best estimator per model, plus parallel
# lists of model names and test/train accuracy.
models_best = dict()
models_names, test_scores, train_scores = [], [], []

# Hold out 20% of the resampled X / y for the final evaluation of each model.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
###################################### Models deployment ###########################################
# For each classifier: grid-search its hyper-parameters, refit the best estimator on the
# training split, record train/test accuracy and draw the diagnostic plots.
# `pandas_title`, `MyROCAUC` and `LINE_COLOR` are not defined in this file; presumably
# they come from the notebooks executed with %run — TODO confirm.

# Integer level codes -> display names, shared by the yellowbrick visualizers and the tree plot.
level_names = {0: 'Beginner', 1: 'Intermediate', 2: 'Expert'}

for name, reg in tqdm(reg_classifier.items()):
    # NOTE(review): the search is fitted on the full X / y, which includes the rows held
    # out as X_test above; consider fitting on X_train / y_train only.
    grid_reg = GridSearchCV(reg, params.get(name), n_jobs=-1, cv=cv).fit(X, y)

    # Report the best estimator and its cross-validated score.
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──{grid_reg.best_estimator_ = }')
    print(f'\t├──{grid_reg.best_score_ = } \n\n')

    # Keep the best model for the comparative ROC curves.
    models_best[name] = grid_reg.best_estimator_

    # Save the tuned parameters needed to build the composite models. The placeholder
    # entries of reg_classifier are replaced here, before the loop reaches them.
    if name == 'LogisticRegression':
        best_params_logistic_regression = grid_reg.best_estimator_.get_params()
    if name == 'DecisionTreeClassifier':
        best_params_decision_tree_classifier = grid_reg.best_estimator_.get_params()
        reg_classifier['BaggingClassifier'] = BaggingClassifier(
            DecisionTreeClassifier(**best_params_decision_tree_classifier))
        reg_classifier['BoostingDecisionTree'] = AdaBoostClassifier(
            DecisionTreeClassifier(**best_params_decision_tree_classifier))
    if name == 'SupportVectorMachine':
        best_params_support_vector_machine = grid_reg.best_estimator_.get_params()
    if name == 'GaussianNaiveBayes':
        best_params_gaussian_naive_bayes = grid_reg.best_estimator_.get_params()
        reg_classifier['VotingClassifier'] = VotingClassifier(estimators=[
            ('gnb', GaussianNB(**best_params_gaussian_naive_bayes)),
            ('lr', LogisticRegression(**best_params_logistic_regression)),
            ('svm', SVC(**best_params_support_vector_machine)),
        ])

    # Refit the best estimator on the training split and predict the held-out rows.
    model = grid_reg.best_estimator_.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Classification report as a styled table.
    df = pd.DataFrame(classification_report(y_test,
                                            prediction,
                                            digits=2,
                                            output_dict=True)).T
    df['support'] = df.support.apply(int)
    pandas_title(
        df.style.background_gradient(cmap='coolwarm',
                                     subset=pd.IndexSlice['0':'3', :'f1-score']),
        f'Report metrics of best estimator {name}')

    # Accumulate scores (in %) for the comparison chart drawn after the loop.
    models_names.append(name)
    test_scores.append(model.score(X_test, y_test) * 100)
    train_scores.append(model.score(X_train, y_train) * 100)

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(6, 6))
    visualizer = ConfusionMatrix(grid_reg.best_estimator_,
                                 percent=True,
                                 cmap="Blues",
                                 fontsize=13,
                                 ax=ax,
                                 encoder=level_names)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Class prediction error
    fig, ax = plt.subplots(figsize=(7, 7))
    visualizer = ClassPredictionError(grid_reg.best_estimator_,
                                      ax=ax,
                                      encoder=level_names)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Learning curve
    fig, ax = plt.subplots(figsize=(6, 6))
    LearningCurveDisplay.from_estimator(grid_reg.best_estimator_,
                                        **common_params, ax=ax)
    handles, label = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
    plt.title(f"Learning Curve for {name}")
    plt.show()

    # ROC / AUC curves
    fig, ax = plt.subplots(figsize=(7, 7))
    visualizer = MyROCAUC(grid_reg.best_estimator_,
                          encoder=level_names,
                          ax=ax,
                          line_color=LINE_COLOR)
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)   # Evaluate the model on the test data
    visualizer.show()                  # Finalize and render the figure

    # Plot the top levels of the tuned decision tree.
    if name == 'DecisionTreeClassifier':
        mpl.rcParams['text.color'] = 'black'
        fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        plot_tree(grid_reg.best_estimator_,
                  feature_names=list(X.columns),
                  # Class names go in `class_names`, in ascending class order;
                  # `label` only accepts 'all', 'root' or 'none'.
                  class_names=[level_names.get(c, str(c)) for c in np.unique(y)],
                  filled=True, fontsize=12,
                  max_depth=2,
                  ax=ax)
        plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
        plt.show()
        # Re-apply the plotting theme after overriding rcParams for the tree figure.
        sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
        plt.style.use('dark_background')
        set_palette('sns_bright')

    # Visual separator between models.
    print('\n' * 3, '_' * 85, '\n' * 3)
# Collect the per-model accuracy (in %) gathered by the training loop.
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)

# Grouped bar chart comparing test and train scores of the best estimator per model.
fig = go.Figure()
fig.add_trace(go.Bar(
    y=df['Test Score'],
    x=df['Model'],
    name='Test Scores',
    marker_color='indianred',
))
fig.add_trace(go.Bar(
    y=df['Train Score'],
    x=df['Model'],
    name='Train Scores',
    marker_color='lightsalmon',
))
# Rotate the x tick labels so the long model names stay readable.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title="Bar comparative scores of test and train for best classifier",)
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_yaxes(title_text="Scores %")
# The x axis lists every classifier that was trained, not only logistic regression.
fig.update_xaxes(title_text="Classification models implemented")
fig.show()
# !pip install -q pipreqsnb
# !pipreqsnb '.' --force
# Edit to requirements.txt with any additional libraries you require