# For web scraping
import requests
from bs4 import BeautifulSoup
# For data cleaning/feature engineering
import numpy as np
import pandas as pd
# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
#For statistical tests
import pingouin as pg
#Miscellaneous
import string
# Seconds to wait on the server before giving up on a request; without a
# timeout a stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Existing fighter/stance table that the scraped data is joined onto later.
df = pd.read_csv('fighter_stance.csv')

# The fighter index is requested once per lowercase letter (the `char` query
# parameter), with `page=all` so each request returns the full listing.
letters = string.ascii_lowercase

# Maps each letter to the list of fighter <a> tags found on its index page.
fighter_ids = {}
# A single Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for letter in letters:
        pages = session.get(
            f'http://ufcstats.com/statistics/fighters?char={letter}&page=all',
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error: parsing an error page would silently
        # yield "no fighters" for this letter.
        pages.raise_for_status()
        soup = BeautifulSoup(pages.text, 'lxml')
        fighter_ids[letter] = soup.find_all(
            'a', attrs={'class': "b-link b-link_style_black"}
        )
# Pull the target URL out of every scraped anchor, walking the letters in
# order, so that only plain link strings remain.
all_hrefs = [
    anchor['href']
    for letter in letters
    for anchor in fighter_ids[letter]
]
# Drop repeated URLs while keeping the order in which they first appeared.
fighter_links = list(dict.fromkeys(all_hrefs))
# Notebook-style bare expression: shows how many distinct links were found.
len(fighter_links)
%%time
# Maps each fighter link to the list of <p> cells of that page's fight table.
fighter_win = {}
# Links whose page could not be fetched, kept so they can be retried by hand.
failed_links = []
# One Session for the whole crawl: connection reuse matters over many requests.
with requests.Session() as session:
    for link in fighter_links:
        try:
            # The timeout keeps one stalled connection from hanging the crawl.
            pages = session.get(link, timeout=30)
            pages.raise_for_status()
        except requests.RequestException as exc:
            # A single bad page must not abort the whole crawl. Store an empty
            # result so the link keeps its position; the parsing steps further
            # down already turn an empty cell list into their placeholder.
            print(f'Could not fetch {link}: {exc}')
            failed_links.append(link)
            fighter_win[link] = []
            continue
        soup = BeautifulSoup(pages.text, 'lxml')
        fighter_win[link] = soup.find_all(
            'p', attrs={'class': "b-fight-details__table-text"}
        )
# Recorded output of the %%time cell above:
#   CPU times: user 1min 30s, sys: 3.48 s, total: 1min 33s
#   Wall time: 19min 32s
# Layout of the scraped fight-table cells, as implied by the index arithmetic
# below (NOTE(review): confirm against the live page markup):
#   - each fight spans 17 consecutive cells, the first one holding the result;
#   - when the very first cell reads 'next', a 6-cell row precedes the fight
#     history (presumably a scheduled bout) and has to be skipped.
CELLS_PER_FIGHT = 17
UPCOMING_ROW_CELLS = 6

# One (initially empty) list of fight results per fighter, indexed by the
# fighter's position in fighter_links.
win_loss = [[] for _ in fighter_win]
for key, value in enumerate(fighter_links):
    try:
        cells = fighter_win[value]
        # The result text sits on the second line of the cell's text.
        has_upcoming = cells[0].text.split('\n')[1] == 'next'
        first = UPCOMING_ROW_CELLS if has_upcoming else 0
        for i in range(first, len(cells), CELLS_PER_FIGHT):
            win_loss[key].append(cells[i].text.split('\n')[1])
    except (IndexError, KeyError):
        # Missing page or a cell without the expected text layout: keep what
        # was parsed so far and close the list with an empty placeholder.
        win_loss[key].append('')
# Layout of the scraped fight-table cells, as implied by the index arithmetic
# below (NOTE(review): confirm against the live page markup):
#   - each fight spans 17 consecutive cells;
#   - the win method is in the cell at offset 13 within a fight;
#   - when the very first cell reads 'next', a 6-cell row precedes the fight
#     history, shifting every offset by 6 (13 + 6 = 19).
CELLS_PER_FIGHT = 17
UPCOMING_ROW_CELLS = 6
METHOD_CELL_OFFSET = 13

# One (initially empty) list of fight-ending methods per fighter, indexed by
# the fighter's position in fighter_links.
method = [[] for _ in fighter_win]
for key, value in enumerate(fighter_links):
    try:
        cells = fighter_win[value]
        has_upcoming = cells[0].text.split('\n')[1] == 'next'
    except (IndexError, KeyError):
        # No usable page for this fighter: record a single placeholder.
        method[key].append('No info')
        continue

    first = METHOD_CELL_OFFSET + (UPCOMING_ROW_CELLS if has_upcoming else 0)
    for i in range(first, len(cells), CELLS_PER_FIGHT):
        try:
            # Fifth line of the cell text with its first 10 characters cut off
            # (presumably leading whitespace in the markup -- TODO confirm).
            method[key].append(cells[i].text.split('\n')[4][10:])
        except IndexError:
            # A malformed cell gets a placeholder instead of aborting the
            # fighter, so positions stay aligned with the per-fight results.
            method[key].append('No info')
# For every fighter, keep only the methods of the fights that were won.
# `win_loss[i]` and `method[i]` are parallel lists: position k in both refers
# to the same fight.
methods = []
for i, fight_methods in enumerate(method):
    won_by = []
    for key, value in enumerate(win_loss[i]):
        if value != 'win':
            continue
        # The method list can be shorter than the result list when parsing of
        # a page was cut short; use the placeholder instead of raising
        # IndexError so the win is still counted.
        if key < len(fight_methods):
            won_by.append(fight_methods[key])
        else:
            won_by.append('No info')
    methods.append(won_by)
# Fighters may have many fights overall but only a few recorded here; anyone
# with fewer than five wins is excluded from the study by replacing their
# list of win methods with the marker string 'Insufficient'.
for idx, won_by in enumerate(methods):
    too_few_wins = len(won_by) < 5
    if too_few_wins:
        methods[idx] = 'Insufficient'
# For every fighter, keep just the wins whose method is exactly 'KO/TKO'.
# Entries that were replaced by the 'Insufficient' marker string are iterated
# character by character, never match, and therefore yield an empty list.
tko = [
    [how for how in won_by if how == 'KO/TKO']
    for won_by in methods
]
# Finally, collect the KO/TKO-to-win ratio of every fighter in a single list.
# Each entry is either the ratio rounded to two decimals or the marker string
# 'Insufficient wins' for fighters excluded from the study.
tko_ratio = []
for f, won_by in enumerate(methods):
    # `not won_by` covers an empty win list explicitly; it used to be handled
    # by catching the resulting ZeroDivisionError with a bare `except`, which
    # would also have hidden any unrelated error.
    if won_by == 'Insufficient' or not won_by:
        tko_ratio.append('Insufficient wins')
    else:
        tko_ratio.append(round(len(tko[f]) / len(won_by), 2))
# Attach the ratios to the original table as a new column. The values are
# wrapped in a one-column frame first; pandas aligns it to df by index label.
# NOTE(review): this relies on df's rows being in the same order as
# fighter_links -- confirm against how fighter_stance.csv was built.
ratio_frame = pd.Series(tko_ratio).to_frame()
df['tko_win_ratio'] = ratio_frame
'''Unfortunately, we'll have to lose most of our data, since most fighters
are preliminary or at least not in the top 20, which means they may have
numerous fights overall but very few in the UFC itself. Our purpose is to
conduct this study on fighters that have at least 5 wins in the UFC.'''
# Show the total number of rows next to the number of rows flagged as having
# too few wins, then keep only the rows that carry an actual ratio.
total_rows = df.shape[0]
flagged_rows = df[df['tko_win_ratio'] == 'Insufficient wins']
display(total_rows, flagged_rows.shape[0])

has_ratio = df['tko_win_ratio'] != 'Insufficient wins'
df = df[has_ratio]
# Notebook-style bare expression: preview the last rows of what is left.
df.tail()
# Persist the filtered table to CSV and hand it to the browser as a download.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import files

output_csv = 'fighter_stance_tko.csv'
df.to_csv(output_csv)
files.download(output_csv)
'''The initial view suggests that Switch fighters yet again have an advantage;
however, this isn't conclusive without the statistical tests, which are to be
conducted in the next notebook.'''
# Summary statistics of the KO/TKO win ratio for each stance.
summary_stats = ['mean', 'count', 'median', 'std']
ratio_by_stance = df.groupby('stance')['tko_win_ratio']
ratio_by_stance.agg(summary_stats)