Classification of the difficulty level of all gym exercises

!python -m pip install -q --upgrade pip !pip install -q -r requirements.txt

# Miscellaneous from __future__ import print_function from IPython.display import display #Importing Requierd Libraries import pandas as pd import numpy as np # For interactive graphics import plotly.express as px import plotly.graph_objects as go import plotly.figure_factory as ff from plotly.subplots import make_subplots import matplotlib.pyplot as plt import matplotlib as mpl import seaborn as sns from ydata_profiling import ProfileReport from yellowbrick.classifier import ClassPredictionError, ConfusionMatrix, ROCAUC from tqdm.notebook import trange, tqdm # Sklearn from sklearn.preprocessing import PowerTransformer from sklearn.datasets import make_blobs from imblearn.over_sampling import SMOTE from sklearn.model_selection import GridSearchCV from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score from sklearn.model_selection import train_test_split, LearningCurveDisplay, ShuffleSplit ## Algorithms for Binary Classification from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier from lightgbm import LGBMClassifier from catboost import CatBoostClassifier ## Visualization of Decision Tree from sklearn.tree import plot_tree, export_text # Magics Funtions %load_ext autoreload %autoreload 2 %run "template_visualitation.ipynb" # Export function pandas_title() %run "pandas-missing-extension.ipynb" # Export missing functions from the API of Pandas library

%%time
# Read the raw dataset from CSV; the %%time cell magic reports how long the read takes.
df_data = pd.read_csv('megaGymDataset.csv')

# Write the same frame to a Feather file next to the CSV.
df_data.to_feather("megaGymDataset.feather")

%%time
# Reload the data from the Feather file, timed the same way as the CSV read.
df_data = pd.read_feather("megaGymDataset.feather")

# Summary statistics of the numeric columns, shown with five decimals and
# styled cell by cell through `above_zero`.
pandas_title(
    df_data.describe().style.format('{:.5f}').applymap(above_zero),
    'Describe Data: First middle',
)

# Memory footprint of every column in MB (deep=True also counts the contents
# of object columns), with an extra 'total' row holding the sum.
memory_usage = df_data.memory_usage(deep=True).div(1024 ** 2)
memory_usage['total'] = memory_usage.sum()
memory_usage = memory_usage.to_frame(name="memory usage of variable (MB)")
(
    pandas_title(memory_usage, 'memory usage of features', True)
    .bar(subset=["memory usage of variable (MB)"], color='#ee1f5f', axis=0)
    .format("{:.6f} MB")
)

def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of *df* in place to smaller dtypes.

    Integer columns are narrowed to the smallest signed integer type whose
    range contains every value; the bounds are inclusive, so a column whose
    minimum is exactly -128 still fits in int8. Float columns are narrowed to
    float32 when their range allows it. float16 is never chosen: selecting it
    on range alone silently rounds values to roughly three significant
    decimal digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, so the call can be used in an assignment.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024 ** 2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column yields NaN bounds: no candidate matches and the
            # column keeps its dtype.
        else:
            limits = np.finfo(np.float32)
            # All-NaN columns produce NaN bounds, fail the comparison and are
            # stored as float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df


df_data = reduce_memory_usage(df_data, verbose=True)

# DataFrame.info() prints its report and returns None, so it is called
# directly instead of being wrapped in pd.DataFrame (which only produced an
# empty frame).
df_data.info(memory_usage="deep")

df_data.info()

# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — verify).
df_data.drop(['Unnamed: 0'], axis=1, inplace=True)

# Work on a copy so df_data keeps the original text labels.
df_data_processing = df_data.copy()

# Encode the target as ordered integers: Beginner=0, Intermediate=1, Expert=2.
df_data_processing.Level = df_data.Level.replace(["Beginner", "Intermediate", "Expert"], [0, 1, 2])

# Class balance of the encoded target.
df_data_processing.Level.value_counts()

# Distribution of the exercise types.
df_data_processing.Type.value_counts()

# Names of the categorical feature columns (not referenced again in the cells visible here).
categorical_columns = ['Type', 'BodyPart', 'Equipment']

# One-row-per-metric table of missing values per variable.
# NOTE(review): the `.missing` accessor is presumably registered by
# pandas-missing-extension.ipynb — confirm.
pandas_title(df_data_processing.missing.missing_variable_summary().set_index('variable').T.style.format('{:.0f}'), 'Quantity and percentage of Missing Values')

# Count fully duplicated rows (True) versus non-duplicated rows (False).
df_data_processing.duplicated().value_counts()

# Remove the duplicated rows from the working frame in place.
df_data_processing.drop_duplicates(inplace=True)

# Number of distinct values per column of the working frame.
# `df_imputeknn` is not defined by any cell up to this point, so the
# de-duplicated working frame is summarised instead.
pandas_title(df_data_processing.nunique().to_frame(name="Quantity ").T, 'Unique values')

# Build a profiling report of df_data in minimal mode with a custom primary colour;
# the bare `profile` expression renders it as the cell output.
profile = ProfileReport(
    df_data, title="Pandas Profiling Report", html={"style": {"primary_color": "#FA0087"}}, minimal=True
)
profile

# Show the frame itself.
df_data

# Regular expressions and the canonical ASCII punctuation list
import re
import string

# Character class matching every character of string.punctuation. It is built
# with re.escape so that no mark (parentheses included) is missed or mangled.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")


def limpiar_variable(list_variables, sufix, df=None):
    """Clean text columns and store each result in a new `<column><sufix>` column.

    For every column the following steps are applied in order:

    1. missing values become the string ``'none'`` and text is lower-cased;
    2. every ``string.punctuation`` character is removed;
    3. digits are removed;
    4. runs of whitespace collapse to one space and the ends are trimmed.

    Whitespace is normalised last, so the gaps left by removed digits and
    punctuation are collapsed too and adjacent words are never fused.

    Parameters
    ----------
    list_variables : list of str
        Names of the text columns to clean.
    sufix : str
        Suffix appended to each column name to form the output column name.
    df : pandas.DataFrame, optional
        Frame to read from and write to. Defaults to the notebook-level
        ``df_data``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = df_data if df is None else df
    for variable in list_variables:
        frame[f'{variable}{sufix}'] = (
            frame[f'{variable}']
            .fillna('none')
            .str.lower()
            .str.replace(_PUNCTUATION_RE, '', regex=True)
            .str.replace(_DIGITS_RE, '', regex=True)
            .str.replace(_WHITESPACE_RE, ' ', regex=True)
            .str.strip()
        )


limpiar_variable(['Title', 'Desc'], '_Processed')
df_data

#!pip install wordcloud==1.8.2.2 scipy==1.9.3 gensim==4.3.0 pyLDAvis==3.4.0

# Import the wordcloud library
from wordcloud import WordCloud
import os
from PIL import Image
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

colums_text_analysis = ['Title_Processed', 'Desc_Processed']
image_mask = np.array(Image.open(os.path.join('images/', "silhouette-fitness-preview.png")))

# The stop-word list does not depend on the column, so it is built once.
# 'none' is the placeholder the cleaning step writes for missing text.
stop_words = stopwords.words('english')
stop_words.extend(['none'])
stop_words_lookup = set(stop_words)  # O(1) membership tests

for column in colums_text_analysis:
    # Join all rows in one pass instead of growing a string row by row.
    text_list = ' '.join(df_data[column].dropna())

    # Keep only words longer than two characters that are not stop words.
    text_list = ' '.join(
        x for x in text_list.split(' ') if x not in stop_words_lookup and len(x) > 2
    )

    # Create a WordCloud object shaped by the silhouette mask
    wc = WordCloud(background_color='black', mask=image_mask, max_words=2000, random_state=42)

    # Generate a word cloud
    wc.generate(text_list)

    # Visualize the word cloud
    plt.axis("off")
    plt.imshow(wc, interpolation='bilinear')
    plt.title(f"Text analysis variable {column}", fontsize=20, fontdict={"weight": "bold"}, pad=30)
    plt.show()

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['none'])


def sent_to_words(sentences):
    """Yield one list of tokens per sentence.

    Each sentence is converted to ``str`` and tokenised with gensim's
    ``simple_preprocess``; ``deacc=True`` strips accent marks from the tokens.
    """
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)


def remove_stopwords(texts):
    """Return the documents of *texts* with the stop words removed.

    Documents that are already token lists (or tuples) are filtered as they
    are, rather than being turned into their ``repr`` and tokenised a second
    time. Any other document is tokenised first with ``simple_preprocess``.
    The stop words are read from the notebook-level ``stop_words`` list when
    the function is called.
    """
    excluded = set(stop_words)  # O(1) membership tests
    cleaned = []
    for doc in texts:
        tokens = doc if isinstance(doc, (list, tuple)) else simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in excluded])
    return cleaned


data = df_data['Desc_Processed'].values.tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)

# Preview: first 30 tokens of the first document.
print(data_words[:1][0][:30])

import gensim.corpora as corpora

# Dictionary built from the tokenised documents.
id2word = corpora.Dictionary(data_words)

# The tokenised documents are used directly as the corpus texts.
texts = data_words

# Bag-of-words representation: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))

# Preview: first 30 entries of the first document.
print(corpus[0][:30])

from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model. The seed matches the random_state=42 used by every other
# stochastic step in this notebook, so the topics do not change arbitrarily
# between runs.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

# Print the keywords of each topic
pprint(lda_model.print_topics())

# Topic distribution of every document in the corpus
doc_lda = lda_model[corpus]

import os
import pickle

import pyLDAvis
try:
    # pyLDAvis >= 3.3 renamed the gensim helper module.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older pyLDAvis releases still ship it under the previous name.
    import pyLDAvis.gensim as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()

# The pickle and HTML outputs are written under ./results; create it if needed.
os.makedirs('./results', exist_ok=True)
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

# Preparing the visualization is a bit time consuming. Set this flag to False
# to reuse the data pickled by a previous run.
RECOMPUTE_LDAVIS = True
if RECOMPUTE_LDAVIS:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this notebook.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')

LDAvis_prepared

# Feature matrix: every column except the target 'Level'; target vector: 'Level'.
# NOTE(review): `df_imputeknn` is not defined in any cell visible in this
# file — confirm where it is created.
X = df_imputeknn.drop(['Level'], axis=1)
y = df_imputeknn['Level']

# Resample X and y with SMOTE (fixed seed) and rebind both names to the result.
# NOTE(review): the resampling runs before the train/test split performed
# further down, so the test split is drawn from resampled data — consider
# resampling the training split only.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)

import warnings
warnings.filterwarnings("ignore")

from IPython.display import display
from yellowbrick.style import set_palette
set_palette('sns_bright')

# Cross-validation scheme shared by the grid searches and the learning curves
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Display names of the encoded difficulty levels, shared by all visualizers
level_encoder = {0: 'Beginner', 1: 'Intermediate', 2: 'Expert'}

# Candidate classifiers. The three ensemble entries start as None and are
# filled in inside the loop, once the tuned base models they wrap are known.
reg_classifier = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LGBMClassifier': LGBMClassifier(),
    # 'CatBoostClassifier': CatBoostClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SupportVectorMachine': SVC(),
    'GaussianNaiveBayes': GaussianNB(),
    'BaggingClassifier': None,
    'BoostingDecisionTree': None,
    'VotingClassifier': None,
}

# Settings common to both LogisticRegression sub-grids
logistic_common = {
    "C": np.logspace(-4, 4, 4),
    'max_iter': [100],
    'random_state': [42],
    'n_jobs': [-1],
}

# Hyper-parameter grids, keyed like reg_classifier
params = {
    # Split by penalty so that only valid penalty/solver pairs are searched:
    # 'l1' is supported by 'liblinear' and 'saga' only. The invalid pairs
    # previously failed silently because warnings are suppressed.
    'LogisticRegression': [
        {**logistic_common, "penalty": ['l1'], "solver": ['liblinear', 'saga']},
        {**logistic_common, "penalty": ['l2'],
         "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']},
    ],
    'RandomForestClassifier': {
        "criterion": ['entropy', 'gini'],
        "n_estimators": [96, 128, 160],
        "max_depth": [6, 8, 10],
        'random_state': [42],
        'n_jobs': [-1],
    },
    'DecisionTreeClassifier': {
        "criterion": ['entropy', 'gini'],
        "max_depth": [6, 8, 10],
        'random_state': [42],
    },
    'KNeighborsClassifier': {
        'n_neighbors': [1, 3, 5, 7, 9, 12, 15, 19, 24],
        'weights': ['uniform', 'distance'],
        'n_jobs': [-1],
        'p': [1, 2, 3],
    },
    'LGBMClassifier': {
        'num_leaves': [15, 20, 25],
        'min_child_samples': [12, 15, 18],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1],
        'reg_alpha': [0.025, 0.03, 0.035],
        'n_jobs': [-1],
        'random_state': [42],
    },
    # 'CatBoostClassifier':
    #     {
    #         'max_depth': [4, 8, 16],
    #         'learning_rate': [0.1],
    #         'iterations': [10, 20, 30],
    #         'random_state': [42]
    #     },
    'AdaBoostClassifier': {
        "n_estimators": [25, 30, 35],
        "learning_rate": [0.1],
        'random_state': [42],
    },
    'GradientBoostingClassifier': {
        "n_estimators": [15, 20, 25],
        "learning_rate": [0.1],
        'random_state': [42],
        "loss": ["log_loss"],
        "min_samples_split": [0.05, 0.1, 0.15],
        "min_samples_leaf": [0.1, 0.2, 0.3],
        "max_depth": [3, 4, 5],
        "max_features": ["log2", "sqrt"],
        # 'mae' is no longer an accepted criterion in the scikit-learn
        # releases that provide loss="log_loss"; 'squared_error' is the
        # remaining alternative to 'friedman_mse'.
        "criterion": ["friedman_mse", "squared_error"],
        "subsample": [0.85, 0.9, 0.95],
    },
    'SupportVectorMachine': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'random_state': [42],
        'probability': [True],
    },
    'GaussianNaiveBayes': {},
    'BaggingClassifier': {
        'max_samples': [0.5],
        'max_features': [1.0],
        'n_estimators': [10, 20, 30],
        'n_jobs': [-1],
        'random_state': [42],
    },
    'BoostingDecisionTree': {
        'n_estimators': [10, 20, 30],
        'learning_rate': [0.1],
        'random_state': [42],
    },
    'VotingClassifier': {
        'voting': ['soft'],
        'n_jobs': [-1],
    },
}

# Grid for LearningCurveDisplay
common_params = {
    "X": X,
    "y": y,
    "train_sizes": np.linspace(0.2, 1.0, 5),
    "cv": cv,
    "score_type": "both",
    "n_jobs": -1,
    "line_kw": {"marker": "o"},
    "std_display_style": "fill_between",
    "score_name": "Accuracy",
    "random_state": 42,
}

# Lists and dict collecting the results of every model
models_best = {}
models_names = []
test_scores = []
train_scores = []

# Hold-out split of the balanced data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

###################################### Models deployment ###########################################
for name, reg in tqdm(reg_classifier.items()):

    # Tune on the training split only: searching on the full X, y would let
    # the held-out rows influence model selection and inflate the test scores.
    grid_reg = GridSearchCV(reg, params.get(name), n_jobs=-1, cv=cv).fit(X_train, y_train)
    score = np.abs(grid_reg.best_score_)

    # Getting the best estimators, score & parameters GridSearchCV
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──{grid_reg.best_estimator_ = }')
    print(f'\t├──{grid_reg.best_score_ = } \n\n')

    # Best model for comparative Roc Curves
    models_best[name] = grid_reg.best_estimator_

    # Save the best parameters for use in the BaggingClassifier,
    # BoostingDecisionTree and VotingClassifier. Replacing the value of an
    # existing key is safe while iterating over the dict.
    if name == 'LogisticRegression':
        best_params_logistic_regression = grid_reg.best_estimator_.get_params()

    if name == 'DecisionTreeClassifier':
        best_params_decision_tree_classifier = grid_reg.best_estimator_.get_params()
        reg_classifier['BaggingClassifier'] = BaggingClassifier(
            DecisionTreeClassifier(**best_params_decision_tree_classifier))
        reg_classifier['BoostingDecisionTree'] = AdaBoostClassifier(
            DecisionTreeClassifier(**best_params_decision_tree_classifier))

    if name == 'SupportVectorMachine':
        best_params_support_vector_machine = grid_reg.best_estimator_.get_params()

    if name == 'GaussianNaiveBayes':
        best_params_gaussian_naive_bayes = grid_reg.best_estimator_.get_params()
        reg_classifier['VotingClassifier'] = VotingClassifier(estimators=[
            ('gnb', GaussianNB(**best_params_gaussian_naive_bayes)),
            ('lr', LogisticRegression(**best_params_logistic_regression)),
            ('svm', SVC(**best_params_support_vector_machine)),
        ])

    # GridSearchCV has already refitted the best estimator on the training
    # split, so it can predict directly.
    model = grid_reg.best_estimator_
    prediction = model.predict(X_test)

    # Per-class report. An expression inside a loop is not rendered
    # automatically, so the styled table is passed to display().
    df = pd.DataFrame(classification_report(y_test, prediction, digits=2, output_dict=True)).T
    df['support'] = df.support.apply(int)
    display(pandas_title(
        df.style.background_gradient(cmap='coolwarm', subset=pd.IndexSlice['0':'3', :'f1-score']),
        f'Report metrics of best estimator {name}'))

    # Collect Test_Score and Train_Score of every model for the final comparison
    models_names.append(name)
    test_scores.append(model.score(X_test, y_test)*100)
    train_scores.append(model.score(X_train, y_train)*100)

    # Raw confusion matrix (the plot below is drawn by yellowbrick)
    cm = confusion_matrix(y_test, prediction, labels=model.classes_)

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(6, 6))
    visualizer = ConfusionMatrix(grid_reg.best_estimator_, percent=True, cmap="Blues", fontsize=13,
                                 ax=ax, encoder=level_encoder)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show();

    # Class prediction error of the current model
    fig, ax = plt.subplots(figsize=(7, 7))
    visualizer = ClassPredictionError(grid_reg.best_estimator_, encoder=level_encoder)
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)   # Evaluate the model on the test data
    visualizer.show()                  # Draw visualization

    # Learning Curve visualization
    fig, ax = plt.subplots(figsize=(6, 6))
    LearningCurveDisplay.from_estimator(grid_reg.best_estimator_, **common_params, ax=ax)
    handles, label = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
    plt.title(f"Learning Curve for {name}")
    plt.show()

    # ROC / AUC curves
    fig, ax = plt.subplots(figsize=(7, 7))
    visualizer = MyROCAUC(grid_reg.best_estimator_, encoder=level_encoder, ax=ax, line_color=LINE_COLOR)
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)   # Evaluate the model on the test data
    visualizer.show()                  # Finalize and render the figure

    # Plot a decision tree
    if name == 'DecisionTreeClassifier':
        mpl.rcParams['text.color'] = 'black'
        fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        # The class labels belong in `class_names`; `label` only accepts
        # 'all', 'root' or 'none'.
        tree_class_names = [level_encoder.get(c, str(c)) for c in grid_reg.best_estimator_.classes_]
        plot_tree(grid_reg.best_estimator_,
                  feature_names=list(X.columns),
                  class_names=tree_class_names,
                  filled=True,
                  fontsize=12,
                  max_depth=2,
                  ax=ax)
        plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
        plt.show()

        # Restore the dark plotting style after the black-text tree figure
        sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
        plt.style.use('dark_background')
        set_palette('sns_bright')

    print('\n' * 3, '_' * 85, '\n' * 3)

# Table with the test and train accuracy (in %) of the best estimator of each model
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)

# Grouped bar chart comparing test and train scores for every classifier
fig = go.Figure()
fig.add_trace(go.Bar(
    y=df['Test Score'],
    x=df['Model'],
    name='Test Scores',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    y=df['Train Score'],
    x=df['Model'],
    name='Train Scores',
    marker_color='lightsalmon'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title="Bar comparative scores of test and train for best classifier",)
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height',
                 title_text="Scores %")
# The chart covers every classifier that was tuned, not only logistic
# regression, so the axis title says so.
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height',
                 title_text="Classification models implemented")

fig.show()

# !pip install -q pipreqsnb # !pipreqsnb '.' --force # Edit to requirements.txt with any additional libraries you require