!jt -l
Available Themes:
chesterish
grade3
gruvboxd
gruvboxl
monokai
oceans16
onedork
solarizedd
solarizedl
# Setting theme: apply 'gruvboxd', one of the themes reported by `jt -l`, to this notebook
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
# For data cleaning/feature engineering
import numpy as np
import pandas as pd
# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
#For predicting missing values
from sklearn.tree import DecisionTreeClassifier
#For statistical tests
import pingouin as pg
import statsmodels as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import levene
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import TukeyHSDResults
from statsmodels.stats.diagnostic import kstest_normal
from scipy.stats import shapiro
# Global plotting and display configuration for the notebook
sns.set(style = 'darkgrid', palette = 'bright')
# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Default figure size as (width, height)
plt.rcParams['figure.figsize'] = (15, 10)
# NOTE(review): this style sheet is applied after sns.set(...) and the rcParams tweak
# above, so it may override parts of them - confirm the resulting look is intended
plt.style.use('ggplot')
%matplotlib inline
# Load the fighter data set from a path relative to the notebook
df = pd.read_csv('fighter_stance.csv')
def reduce_mem_usage(df, n_unique_threshold):
    """Downcast the columns of ``df`` in place and report the memory saving.

    Signed integer, unsigned integer and float columns are converted to the
    narrowest NumPy dtype of the same kind whose range contains the column's
    minimum and maximum. Text columns (``object`` or pandas string dtype) with
    at most ``n_unique_threshold`` distinct values are converted to
    ``category``. Every other dtype (bool, datetime, category, nullable
    extension types, ...) is left untouched, so the function can safely be
    called more than once on the same frame.

    Note that the float downcast only checks the value *range*: converting to
    ``float16``/``float32`` can reduce the precision of the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    n_unique_threshold : int
        Largest number of distinct values a text column may have and still be
        converted to ``category``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage before: {:.2f} MB'.format(start_mem))

    # For each NumPy dtype kind: how to look up the limits, and the candidate
    # dtypes ordered from narrowest to widest.
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, (np.float16, np.float32, np.float64)),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            if df[col].nunique() <= n_unique_threshold:
                df[col] = df[col].astype('category')
            continue

        # Dispatch on the dtype kind instead of the dtype's string prefix, so
        # unsigned integers and booleans are never mistaken for floats and
        # non-numeric or extension dtypes are skipped instead of crashing.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits_of, candidates = candidates_by_kind[col_type.kind]
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value sitting exactly on a dtype's limit still fits.
            # All-NaN or infinite columns match no candidate and stay unchanged.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage now : {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame that reports zero memory cannot divide by zero
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage decreased by {:.1f}%'.format(saved_pct))
    return df
# reduce_mem_usage reassigns the columns of df in place, so the returned frame does not
# need to be captured; 10 is the n_unique_threshold used for the category conversion
reduce_mem_usage(df, 10)
Memory usage before: 0.20 MB
Memory usage now : 0.09 MB
Memory usage decreased by 57.0%
# DataFrame.info() prints its report itself and returns None, so it is called on its
# own instead of being handed to display(), where it would only render as "None".
df.info()
# Last rows of the data plus the SLpM summary statistics per stance
display(df.tail(), df.groupby('stance')['SLpM'].describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3741 entries, 0 to 3740
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 3737 non-null object
1 last_name 3741 non-null object
2 stance 2933 non-null category
3 wins 3741 non-null int16
4 losses 3741 non-null int8
5 SLpM 3741 non-null float16
6 fights 3741 non-null int16
dtypes: category(1), float16(1), int16(2), int8(1), object(2)
memory usage: 88.0+ KB
# Sample size and mean SLpM of every stance category
df.groupby('stance')['SLpM'].agg(['count', 'mean'])
# Drop the stance categories that have very few values; the tests are conducted on the
# remaining 3. Rows with a missing stance are not in the list, so they are kept.
rare_stances = ['Open Stance', 'Sideways']
df = df[~df['stance'].isin(rare_stances)]
print(f'''Number of rows left: {df.shape[0]}''')
Number of rows left: 3731
# Boolean masks shared by the exploratory checks below (computed once instead of
# re-filtering the frame for every ratio)
stance_missing = df['stance'].isna()
zero_slpm = df['SLpM'] == 0
few_fights = df['fights'] <= 5

# Number of records with a missing stance
stance_missing.sum()
# Share of the missing-stance records that have 0 SLpM (around 60%)
zero_slpm[stance_missing].mean()
# Number of records with 0 SLpM
zero_slpm.sum()
# Records that have 0 SLpM although the stance is known
df[zero_slpm & ~stance_missing]
# How many fights the fighters with 0 SLpM have fought
df[zero_slpm]['fights'].value_counts()
df[few_fights & zero_slpm]
# Share of the fighters with 5 or fewer fights that have a 0 SLpM record (around 63%)
zero_slpm[few_fights].mean()
# Share of the missing-stance records that belong to fighters with 5 or fewer fights
over = df.loc[stance_missing & few_fights, 'fights'].tolist()
few_fights[stance_missing].mean()
# Mean SLpM per stance for fighters with at most 5 fights versus more than 5 fights.
# The second group uses > 5 (not >= 5) so the two groups do not overlap and match the
# "more than 5 fights" filter applied to df afterwards.
df[few_fights].groupby('stance')['SLpM'].mean()
df[df['fights'] > 5].groupby('stance')['SLpM'].mean()
# Here, we're filtering out all the cases that don't meet our predefined minimum
# (more than 5 fights). However, this shouldn't always be done, since it throws
# away some useful information.
df = df[df['fights']>5]
# Most 0 SLpM values come with a null stance; those records are removed as well
df[(df['SLpM']==0.00) & (df['stance'].isna())]
df = df[(df['SLpM']!=0.00) | (df['stance'].notna())]
# The filter above ensures that no remaining record has both 0 SLpM and a null stance
print(f'''Number of zero SLpM records now: {df[df['SLpM']==0.00].shape[0]}''')
print(f'''Number of null stance records now: {df[df['stance'].isna()].shape[0]}''')
Number of zero SLpM records now: 154
Number of null stance records now: 287
# Box plot comparing SLpM across the three stances.
# This is a visual comparison only and is not conclusive on its own.
stance_order = ["Orthodox", "Southpaw", "Switch"]
plt.figure(figsize=(15, 8))
sns.set_palette("Reds", 4)
sns.boxplot(data=df, x="stance", y="SLpM", order=stance_order)
# Optional overlay of the raw points:
# sns.stripplot(x="stance", y="SLpM", data=df, order=stance_order, jitter=0.4, color="0.3")
plt.title("UFC Fighter's significant strikes per minute grouped by Fighting Stance", fontsize=16)
plt.xlabel("")
plt.xticks(list(range(len(stance_order))), stance_order, fontsize=13)
plt.ylabel("Significant Strikes per Minute", fontsize=13)
# Save the figure as stance_boxplot.png (path relative to the working directory)
plt.savefig("stance_boxplot.png")
plt.show()
# Number of fighters with 0 SLpM in each stance
df.loc[df['SLpM'] == 0].groupby('stance')['name'].count()
# We've got fighters here who've fought numerous times but have 0 SLpM.
# Per the author's check on the site, these particular fighters have very few fights
# in the UFC itself (excluding the prelims), so they are excluded from the tests.
has_strikes = df['SLpM'] != 0
df = df[has_strikes]
# Lastly, it's time to deal with the missing stance records before running the tests
df.isna().sum()
# Rechecking the group sizes and central tendencies
df.groupby('stance')['SLpM'].agg(['count', 'mean', 'median'])
# Keep the target plus the numeric predictors; name, last_name and fights are dropped.
# Columns are selected by name rather than by position so a reordered input cannot
# silently change the feature set, and the reset index is actually assigned back.
df = df[['stance', 'wins', 'losses', 'SLpM']].reset_index(drop=True)

# Predicting the missing stance values (max_depth=6 set).
# The stance names are used directly as class labels, so no '0'/'1'/'2' encoding and
# decoding through chained `df['stance'].replace(..., inplace=True)` calls is needed;
# that pattern is not guaranteed to update df itself.
# NOTE(review): SLpM is one of the predictors here and is also the variable compared
# across stances later on, so the imputed labels depend on it - confirm this is acceptable.
stance_known = df['stance'].notna()
features = df.drop('stance', axis=1)
X_train = features[stance_known].reset_index(drop=True)
X_test = features[~stance_known].reset_index(drop=True)
y_train = df.loc[stance_known, 'stance'].astype(object).reset_index(drop=True)

# A fixed random_state makes the fitted tree, and so the imputed labels, reproducible
dtree_model = DecisionTreeClassifier(max_depth=6, random_state=0).fit(X_train, y_train)
# predict() rejects an empty input, so skip it when no stance is missing
y_test = dtree_model.predict(X_test) if len(X_test) else np.array([], dtype=object)

Xy_test = X_test.assign(stance=y_test)
Xy_train = X_train.assign(stance=y_train)
y_test
# ignore_index avoids duplicate index labels in the recombined frame
df = pd.concat([Xy_train, Xy_test], axis=0, ignore_index=True)
# Histogram of SLpM for each stance, drawn side by side in a single figure so that the
# saved image contains every stance (saving inside the loop overwrote the same file and
# kept only the last histogram).
# Per the original analysis the distributions are all roughly mound-shaped, with quite
# a few outliers.
hist_stances = df['stance'].unique()[0:3]
fig, axes = plt.subplots(1, len(hist_stances), figsize=(15, 5), squeeze=False)
for ax, f in zip(axes[0], hist_stances):
    ax.hist(df[df['stance']==f]['SLpM'], color='blue', bins=30)
    ax.set_title(f'Distribution of SLpM values {f} stance')
    # The x axis carries the SLpM values; the y axis is the number of fighters per bin
    ax.set_xlabel('SLpM')
    ax.set_ylabel('Number of fighters')
fig.tight_layout()
fig.savefig('Stance_normality.png')
plt.show()
# Conducting the Kolmogorov-Smirnov normality test (statsmodels' kstest_normal) per stance.
# The p-values are all below 0.05, which suggests a non-normal distribution; however,
# this test is sensitive to outliers and we're only looking for enough normality to
# conduct ANOVA.
for stance_name in df['stance'].unique()[0:3]:
    stance_slpm = df.loc[df['stance'] == stance_name, 'SLpM']
    normality_test = kstest_normal(stance_slpm)
    print(f'''P value for {stance_name} category: {normality_test[1]}''')
P value for Orthodox category: 0.0009999999999998899
P value for Switch category: 0.0037611734838932497
P value for Southpaw category: 0.0009999999999998899
# QQ plot per stance: compare the theoretical and the sample quantiles to see how much
# they differ
for stance_name in df['stance'].unique():
    stance_slpm = df.loc[df['stance'] == stance_name, 'SLpM']
    fig = qqplot(stance_slpm, line='45', fit=True)
    fig.set_size_inches(15, 8)
    ax = plt.gca()
    ax.set_title("QQPlot of Stance Categories", fontsize=16)
    ax.set_xlabel('Theoretical Quantiles', fontsize=13)
    ax.set_ylabel(f'Sample Quantiles of the {stance_name} stance', fontsize=13)
    plt.show()
# The per-stance QQ plots look even enough.
# Now the Shapiro-Wilk test is conducted per stance for more confident results.
for stance_name in df['stance'].unique():
    shapiro_result = shapiro(df.loc[df['stance'] == stance_name, 'SLpM'])
    print(f'''P-value for {stance_name} stance: {shapiro_result.pvalue}''')
P-value for Orthodox stance: 1.1392201938282288e-30
P-value for Switch stance: 2.7280493668513373e-05
P-value for Southpaw stance: 4.728488781080564e-11
# Fit SLpM against the stance category with OLS and inspect the QQ plot of the residuals.
# The points are considered to lie fairly close to the line; we've opted to be a bit
# less rigid here.
stance_model = ols(formula="SLpM ~ C(stance)", data=df)
stance = stance_model.fit()
residuals = stance.resid
fig = qqplot(residuals, line='45', fit=True)
fig.set_size_inches(15, 8)
ax = plt.gca()
ax.set_title("QQPlot of the Residuals", fontsize=16)
ax.set_xlabel("Theoretical Quantiles", fontsize=13)
ax.set_ylabel("Sample Quantiles", fontsize=13)
# Save the figure as Residuals.png (path relative to the working directory)
plt.savefig('Residuals.png')
plt.show()
# This is where our assumption is being violated.
# The standard deviations of the Orthodox and Southpaw fighters are fairly close;
# unfortunately, that's not the case for the fighters with a Switch stance.
df.groupby('stance')['SLpM'].describe()[['std']]
# Levene's test for equal variances across the three stance groups
slpm_by_stance = [
    df.loc[df['stance'] == stance_name, 'SLpM']
    for stance_name in ('Orthodox', 'Southpaw', 'Switch')
]
homoscedasticity_test = levene(*slpm_by_stance)
print(f'''Levene test p-value: {homoscedasticity_test[1]}''')
Levene test p-value: 0.001083011740234121
# Because the equal-variance assumption does not hold, the more robust Welch ANOVA for
# unequal variances is conducted
pg.welch_anova(data=df, dv='SLpM', between='stance')
# Per the original run the p-value is below 0.05, so the null hypothesis can be rejected.
# Lastly, the pairwise differences are checked with Games-Howell; per the original
# analysis the Switch stance mean differs significantly from the others.
pg.pairwise_gameshowell(data=df, dv='SLpM', between='stance')