# Setting theme: apply the 'gruvboxd' notebook theme via the jupyterthemes package
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
# For data cleaning/feature engineering
import numpy as np
import pandas as pd
# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
#For statistical tests
import pingouin as pg
import statsmodels as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import levene
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import TukeyHSDResults
from statsmodels.stats.diagnostic import kstest_normal
from scipy.stats import shapiro
# Global plotting/display defaults for the notebook.
# NOTE(review): the seaborn theme is set first and matplotlib's 'ggplot' style
# sheet is applied afterwards, so the style sheet presumably wins wherever the
# two touch the same rcParams -- confirm this is the intended look.
sns.set(style = 'darkgrid', palette = 'bright')
# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Default figure size (width, height) in inches
plt.rcParams['figure.figsize'] = (15, 10)
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Load the fighter dataset and discard the leftover 'Unnamed: 0' column
# that came along with the CSV.
df = pd.read_csv('fighter_stance_tko.csv')
df = df.drop(columns='Unnamed: 0')
# Show the memory footprint of every column, followed by the overall total
memory_per_column = df.memory_usage()
display(memory_per_column, memory_per_column.sum())
def reduce_memory_usage(data, pct_threshold=0.4):
    '''Downcast the columns of ``data`` in place to smaller dtypes.

    Can be reapplied after outlier handling and scaling: columns that are
    already categorical (or of any other non-numeric dtype) are skipped.

    Rules applied per column:

    * integer columns  -> the smallest unsigned integer type that holds the
      observed [min, max] range, or the smallest signed type when the column
      contains negative values (bounds are inclusive);
    * float columns    -> the smallest of float16/float32/float64 whose range
      covers the observed values. NOTE: float16/float32 keep fewer significant
      digits, so values may be rounded;
    * object/string columns -> ``category`` when the ratio of unique values to
      rows is below ``pct_threshold``;
    * everything else (bool, datetime, category, nullable extension dtypes)
      is left untouched.

    A cast is only performed when it makes the column strictly smaller.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.
    pct_threshold : float, default 0.4
        Maximum unique/rows ratio for a text column to become ``category``.

    Returns
    -------
    None
        The before/after memory usage is printed.
    '''
    start_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage before: {:.2f} MB'.format(start_mem))

    # Candidate dtypes, ordered from smallest to largest
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)
    n_rows = len(data)

    for col in data.columns:
        col_type = data[col].dtype
        is_numpy = isinstance(col_type, np.dtype)

        # Text columns: convert low-cardinality ones to category
        if (is_numpy and col_type.kind == 'O') or isinstance(col_type, pd.StringDtype):
            # n_rows guard avoids a ZeroDivisionError on an empty frame
            if n_rows and data[col].nunique() / n_rows < pct_threshold:
                data[col] = data[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) and empty frames
        # have nothing that can be safely downcast.
        if not is_numpy or n_rows == 0:
            continue

        if col_type.kind in 'iu':
            c_min, c_max = int(data[col].min()), int(data[col].max())
            candidates = unsigned_types if c_min >= 0 else signed_types
            target = next(
                (t for t in candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
        elif col_type.kind == 'f':
            c_min, c_max = data[col].min(), data[col].max()
            # An all-NaN or infinite column matches no candidate and is kept as is
            target = next(
                (t for t in float_types
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                None,
            )
        else:
            # bool, datetime, timedelta, complex...: converting these to a
            # float type would corrupt or enlarge them.
            continue

        # Never cast to a dtype of the same or a larger size
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            data[col] = data[col].astype(target)

    end_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage now : {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage decreased by {:.1f}%'.format(saved_pct))
# Shrink the frame in place (the helper is named reduce_memory_usage)
reduce_memory_usage(df)
Memory usage before: 0.04 MB
Memory usage now : 0.01 MB
Memory usage decreased by 61.8%
# df.info() prints its report and returns None, so it is called on its own
# instead of being handed to display() (which would render a stray "None").
df.info()
# Last rows of the frame and the per-stance summary of tko_win_ratio
display(df.tail(), df.groupby('stance')['tko_win_ratio'].describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 632 non-null object
1 last_name 632 non-null object
2 stance 632 non-null category
3 wins 632 non-null int8
4 losses 632 non-null int8
5 SLpM 632 non-null float16
6 fights 632 non-null int8
7 tko_win_ratio 632 non-null float16
dtypes: category(1), float16(2), int8(3), object(2)
memory usage: 15.1+ KB
#Removing the stance category that has very few values. We'll conduct tests on the remaining 3
df = df[df['stance']!='Open Stance']
# Share of fighters per stance. The wedge labels are taken from the counts
# themselves so that every label is guaranteed to match its wedge; zero-count
# entries (e.g. an unused categorical level left behind by the filter above)
# are dropped.
stance_counts = df['stance'].value_counts()
stance_counts = stance_counts[stance_counts > 0]
colors = sns.color_palette('bright')[0:5]
# plt.figure(figsize=...) is what actually sizes the figure
plt.figure(figsize=(20, 15))
plt.pie(stance_counts, colors=colors,
        labels=stance_counts.index.tolist(), autopct='%.0f%%')
plt.show()
# Spearman rank correlation between SLpM and tko_win_ratio
df['SLpM'].corr(df['tko_win_ratio'], method='spearman')
#We see that tko_win_ratio has the biggest positive correlation with SLpM
# plt.figure(figsize=...) is what actually sizes the figure
plt.figure(figsize=(20, 25))
# Only numeric columns can be correlated; selecting them explicitly keeps this
# working on pandas versions where DataFrame.corr no longer drops text/category
# columns silently.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(method='spearman'), annot=True)
plt.show()
#No missing values: select every row that contains at least one NaN
rows_with_na = df.isna().any(axis=1)
df.loc[rows_with_na]
#Checking the sample count, mean, spread and median per stance category
summary_stats = ['count', 'mean', 'std', 'median']
df.groupby('stance')['tko_win_ratio'].agg(summary_stats)
'''The samples may be few, but we're conducting tests
only on the fighters that have sufficient UFC (not overall career)
experience'''
print(f'Number of rows left: {len(df)}')
Number of rows left: 630
# Visual comparison of tko_win_ratio across stances: one box per stance with
# the individual fighters overlaid as jittered points.
# This is only a visual aid -- it is not conclusive on its own.
stance_order = ["Orthodox", "Southpaw", "Switch"]
shared_args = dict(x="stance", y="tko_win_ratio", data=df, order=stance_order)

plt.figure(figsize=(15,8))
sns.set_palette("Reds", 4)
sns.boxplot(**shared_args)
sns.stripplot(jitter=0.4, color="0.3", **shared_args)

plt.title("UFC Fighter's Knockout to Win Ratio grouped by Fighting Stance", fontsize=16)
plt.xlabel("")
plt.xticks(list(range(len(stance_order))), stance_order, fontsize=13)
plt.ylabel("Technical Knockout to Win Ratio", fontsize=13)

# Write the figure to disk (relative path) before showing it
plt.savefig("stance_tko-ratio-boxplot.png")
plt.show()
'''We've got fighters here who've fought numerous times but have 0 SLpM.
I've found out that these particular fighters have very few fights in the UFC itself (via the site),
excluding the prelims. So, these fighters are excluded from the tests'''
# Keep only the fighters with a non-zero SLpM
has_strike_data = df['SLpM'] != 0
df = df[has_strike_data]
#They're all mound-shaped, however, we notice quite a few outliers
# One histogram of tko_win_ratio per stance, drawn side by side in a single
# figure so that the saved image contains every stance (saving inside the loop
# under one file name would keep only the last plot).
stances = list(df['stance'].unique())
n_panels = max(len(stances), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 6), squeeze=False)
for ax, f in zip(axes[0], stances):
    ax.hist(df[df['stance']==f]['tko_win_ratio'], color='blue', bins=15)
    ax.set_title(f'Distribution of tko_win_ratio values per {f} stance')
    # A histogram has the measured values on x and the frequencies on y
    ax.set_xlabel('tko_win_ratio')
    ax.set_ylabel('Number of fighters')
fig.tight_layout()
fig.savefig('Stance_tko_normality.png')
plt.show()
'''Conducting the Kolmogorov-Smirnov (Lilliefors) normality test. Two stances have
p-values below 0.05, which does not suggest a normal distribution; however, this test is
sensitive to outliers and we're only looking for enough normality to conduct ANOVA'''
# Run the normality test once per stance and report its p-value
for f in df['stance'].unique():
    stance_sample = df.loc[df['stance'] == f, 'tko_win_ratio']
    normality_test = kstest_normal(stance_sample)
    p_value = normality_test[1]  # result is (statistic, p-value)
    print(f'P value for {f} category: {p_value}')
P value for Switch category: 0.8746371809364659
P value for Orthodox category: 0.0009999999999998899
P value for Southpaw category: 0.006740387567031209
#Q-Q plot per stance: how far do the sample quantiles drift from the theoretical ones?
for f in df['stance'].unique():
    stance_sample = df.loc[df['stance'] == f, 'tko_win_ratio']
    fig = qqplot(stance_sample, line='45', fit=True)
    fig.set_size_inches(15, 8)
    # qqplot drew on a fresh figure, so the current axes are the Q-Q axes
    ax = plt.gca()
    ax.set_title("QQPlot of Stance Categories", fontsize=16)
    ax.set_xlabel('Theoretical Quantiles', fontsize=13)
    ax.set_ylabel(f'Sample Quantiles of the {f} stance', fontsize=13)
    plt.show()
#The Q-Q plots look even enough
#Now let's conduct the Shapiro-Wilk test for more confident results
for f in df['stance'].unique():
    stance_sample = df.loc[df['stance'] == f, 'tko_win_ratio']
    p_value = shapiro(stance_sample).pvalue
    print(f'P-value for {f} stance: {p_value}')
#Reading the result: a p-value above 0.05 means normality is NOT rejected,
#a p-value below 0.05 means it is rejected. In the recorded run only Switch is
#above 0.05; Orthodox and Southpaw are far below it, so normality is rejected
#for those two groups.
P-value for Switch stance: 0.29995888471603394
P-value for Orthodox stance: 2.4719229441139134e-10
P-value for Southpaw stance: 3.15619763568975e-05
#Q-Q plot of the model residuals. The points are considered to lie fairly
#close to the reference line; we've opted to be a bit less rigid here.
model_formula = "tko_win_ratio ~ C(stance)"
stance = ols(model_formula, data=df).fit()
residuals = stance.resid

fig = qqplot(residuals, line='45', fit=True)
fig.set_size_inches(15, 8)
ax = plt.gca()
ax.set_title("QQPlot of the Residuals", fontsize=16)
ax.set_xlabel("Theoretical Quantiles", fontsize=13)
ax.set_ylabel("Sample Quantiles", fontsize=13)

# Save before showing the figure
plt.savefig('Residuals_stance_tko.png')
plt.show()
'''Standard deviation seem pretty close. Let's
make sure with Levene test'''
# Per-stance standard deviation of tko_win_ratio, as a one-column table
stance_summary = df.groupby('stance')['tko_win_ratio'].describe()
stance_summary[['std']]
#A p-value above 0.05 means we fail to reject the null hypothesis of equal variances
tko_by_stance = [
    df.loc[df['stance'] == stance_name, 'tko_win_ratio']
    for stance_name in ('Orthodox', 'Southpaw', 'Switch')
]
homoscedasticity_test = levene(*tko_by_stance)
print(f'Levene test p-value: {homoscedasticity_test[1]}')
Levene test p-value: 0.37038170855806174
'''Although switch fighters have an advantage in
knockout percentage, the test results show that
there isn't enough evidence to conclude that it
is true. Thus, we're unable to reject the null hypothesis. '''
# One-way ANOVA: tko_win_ratio explained by the stance category
anova_model = ols('tko_win_ratio ~ C(stance) ', data=df)
lm = anova_model.fit()
table = anova_lm(lm)
table
#The pairwise (Tukey HSD) comparison shows the same: no pair of stances differs significantly
mc = pairwise_tukeyhsd(df['tko_win_ratio'], df['stance'])
# Use the public summary() accessor rather than the private _results_table attribute
result = mc.summary()
print(result)
# Group labels in the order used by the comparison table
print(mc.groupsunique)
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=======================================================
group1 group2 meandiff p-adj lower upper reject
-------------------------------------------------------
Orthodox Southpaw -0.0166 0.775 -0.0738 0.0407 False
Orthodox Switch 0.0421 0.6862 -0.0775 0.1617 False
Southpaw Switch 0.0587 0.5249 -0.0686 0.186 False
-------------------------------------------------------
['Orthodox' 'Southpaw' 'Switch']