import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import string
#On a single url (each correspond to a fighter)
df = pd.read_html('http://ufcstats.com/fighter-details/1338e2c7480bdf9e')
#Getting the table format
df[0]
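#pd.read_html returns a list with one DataFrame per <table> element on the page; df[0] above is the fighter's
#fight-history table. A quick sanity check that the columns used later in this post ('W/L' and 'Method') are present:
df[0].columns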
#Acquiring all the fighter links
letters = string.ascii_lowercase
fighter_ids = {}
for letter in letters:
    page = requests.get(f'http://ufcstats.com/statistics/fighters?char={letter}&page=all')
    soup = BeautifulSoup(page.text, 'lxml')
    fighter_ids[letter] = soup.find_all('a', attrs={'class': 'b-link b-link_style_black'})
fighter_links = []
for letter in letters:
    for fighter in fighter_ids[letter]:
        fighter_links.append(fighter['href'])
fighter_links = list(pd.Series(fighter_links).unique())
len(fighter_links)
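#The listing pages can link to the same profile more than once (hence the de-duplication above); a quick
#sanity check comparing the raw link count against the unique count:
sum(len(v) for v in fighter_ids.values()), len(fighter_links)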
#Using requests and Pandas' read_html function together
%%time
fighter_table = [None] * len(fighter_links)
for index, link in enumerate(fighter_links):
    page = requests.get(link)
    fighter_table[index] = pd.read_html(page.text)[0]
CPU times: user 46.6 s, sys: 2.49 s, total: 49.1 s
Wall time: 34min 38s
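#The run above issues one HTTP request per fighter, each over a new connection. A minimal sketch of one
#possible speed-up (not part of the original run): reuse a single requests.Session so connections are
#kept alive between requests.
session = requests.Session()
fighter_table = [None] * len(fighter_links)
for index, link in enumerate(fighter_links):
    page = session.get(link)
    fighter_table[index] = pd.read_html(page.text)[0]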
#Engineering the knockout ratio: the number of wins by KO/TKO divided by the total number of wins for each fighter
knockout_ratio = []
for table in fighter_table:
    try:
        wins = table[table['W/L'] == 'win']
        knockout_ratio.append(len(wins[wins['Method'].str.contains('KO/TKO')]) / len(wins))
    except Exception:
        #Fighters with no recorded wins (division by zero) or an unexpected table layout
        knockout_ratio.append('Insufficient fights')
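#A tiny worked example of the same ratio on made-up data (the method labels here are hypothetical): a fighter
#with 10 wins, 4 of them by KO/TKO, gets a knockout_ratio of 4 / 10 = 0.4.
example = pd.DataFrame({'W/L': ['win'] * 10 + ['loss'] * 2,
                        'Method': ['KO/TKO'] * 4 + ['U-DEC'] * 6 + ['SUB'] * 2})
example_wins = example[example['W/L'] == 'win']
len(example_wins[example_wins['Method'].str.contains('KO/TKO')]) / len(example_wins)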
#knockout_ratio is already one-dimensional; just keep a copy under the name used below
tko_ratio = list(knockout_ratio)
#Loading the fighter stance data from CSV
df = pd.read_csv('fighter_stance.csv')
#Converting that list into a Series
df['knockout_ratio'] = pd.Series(tko_ratio)
df.iloc[70:75,:]
#Counting the fighters flagged as 'Insufficient fights'; these rows are not included in the test
df[df['knockout_ratio']=='Insufficient fights'].shape[0]
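#One possible next step (not shown in the original run): coerce the 'Insufficient fights' placeholder to NaN
#so knockout_ratio becomes a numeric column, then drop those rows before any statistical test.
df['knockout_ratio'] = pd.to_numeric(df['knockout_ratio'], errors='coerce')
df = df.dropna(subset=['knockout_ratio'])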