import pickle
import re
import time
from math import ceil
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import (
AdaBoostClassifier,
BaggingClassifier,
ExtraTreesClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm import tqdm
RANDOM_SEED = 42
# !pip install autopep8
# !pip install cleanipynb
# !pip install seaborn
# !pip install kaggle
# !pip install tqdm
# !mkdir ~/dataset
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 /home/jovyan/.kaggle/kaggle.json
# !cd ~/dataset
# !kaggle datasets download -d devinanzelmo/dota-2-matches
# !unzip \*.zip && rm *.zip
# !find . -name '*.csv' -exec mv '{}' dataset \;
match = pd.read_csv('dataset/match.csv')
match.head(3)
match.info()
def drop_features(df, cols_pattern):
    # Drop every column whose name matches one of the given regex patterns;
    # returns the reduced frame and the list of dropped column names
    dropped_features = [re.findall(reg, ' '.join(
        df.columns.values.tolist())) for reg in cols_pattern]
    dropped_features = list(pd.core.common.flatten(dropped_features))
    df = df.drop(columns=dropped_features)
    return df, dropped_features
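# A minimal illustration of drop_features; the column names here are
# hypothetical, chosen only to show how the regex patterns resolve
demo = pd.DataFrame(columns=['match_id', 'tower_status_radiant',
                             'tower_status_dire', 'duration'])
demo_reduced, demo_dropped = drop_features(demo, [r'[^\s]+_status_[^\s]+'])
print(demo_dropped)                # ['tower_status_radiant', 'tower_status_dire']
print(list(demo_reduced.columns))  # ['match_id', 'duration']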
# Use a custom transformer for data preprocessing
class MatchDataPreprocessor(BaseEstimator, TransformerMixin):
def fit(self, data, y=None):
return self
def transform(self, data, y=None):
        # Map game_mode ids to their names using mods.json
        url = 'https://raw.githubusercontent.com/kronusme/dota2-api/master/data/mods.json'
        r = requests.get(url)
        mods_data = r.json()
        mods = {mod['id']: mod['name'] for mod in mods_data['mods']}
        data['game_mode'] = data['game_mode'].map(mods)
        # Remove Captains Mode matches
        idx = data[data['game_mode'] == 'Captains Mode'].index
        data = data.drop(index=idx)
        # Drop unwanted features (match_id is kept so the transformed match
        # data can later be joined onto the players table)
        dropped_features_regex = ['start_time', 'cluster', 'game_mode',
                                  r'[^\s]+_status_[^\s]+']
        data, _ = drop_features(data, dropped_features_regex)
num_col = [
'duration',
'first_blood_time',
'negative_votes',
'positive_votes'
]
        # Remove rows with extreme values
cols = ['negative_votes', 'positive_votes', 'duration']
thresholds = [30, 30, 16000]
for col, threshold in zip(cols, thresholds):
idx = data[data[col] >= threshold].index
data = data.drop(index=idx, axis=0)
        # Feature scaling; index the result by match_id so later joins
        # align on the match rather than on positional labels
        num_match_arr = StandardScaler().fit_transform(data[num_col])
        num_match = pd.DataFrame(num_match_arr, columns=num_col,
                                 index=data['match_id'].values)
        # Convert the boolean radiant_win feature to 1.0/0.0
        data['radiant_win'] = data['radiant_win'].astype('float64')
        # Concatenate all features together, keyed by match_id
        data = num_match.join(data.set_index('match_id')['radiant_win'])
        return data
match1 = MatchDataPreprocessor().fit_transform(match.copy())
match1.info()
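# Quick check on the transformer output: the standardized columns should
# have roughly zero mean and unit standard deviation
match1[['duration', 'first_blood_time', 'negative_votes',
        'positive_votes']].agg(['mean', 'std']).round(3)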
player = pd.read_csv('dataset/players.csv')
player.head(3)
player.info()
dropped_features_regex = ['account_id', r'item_\d', r'unit_order_[^\s]+']
player1, dropped_features = drop_features(
player.copy(), dropped_features_regex)
print('List of dropped features:')
pd.Series(dropped_features)
player2 = player1.join(match1, on='match_id', how='inner')
player2.info()
def create_subplots(cols, num_cols_per_row, fig_w, fig_h):
    num_rows = ceil(len(cols) / num_cols_per_row)
    # squeeze=False always yields a 2D axes array; flatten it so callers can
    # index the axes uniformly regardless of the grid shape
    fig, axs = plt.subplots(num_rows, num_cols_per_row, squeeze=False)
    axs = axs.flatten()
    fig.set_size_inches(fig_w, num_rows * fig_h)
    return fig, axs
def plot_boxplot(df, cols, pickle_file_name, num_cols_per_row=4, fig_w=16, fig_h=7):
    fig, axs = create_subplots(cols, num_cols_per_row, fig_w, fig_h)
    with tqdm(total=100) as pbar:
        progress_unit = 100 / (len(cols) + 1)
        for idx, col in enumerate(cols):
            ax = axs[idx]
            sns.boxplot(y=df[col], ax=ax)
            ax.set(title=col, ylabel=None)
pbar.update(progress_unit)
with open(f'pickles/{pickle_file_name}.pickle', 'wb') as file:
pickle.dump(fig, file)
pbar.update(progress_unit)
def plot_hist(df, cols, pickle_file_name, kde=True, num_cols_per_row=3, fig_w=18, fig_h=5):
    fig, axs = create_subplots(cols, num_cols_per_row, fig_w, fig_h)
    with tqdm(total=100) as pbar:
        progress_unit = 100 / (len(cols) + 1)
        for idx, col in enumerate(cols):
            ax = axs[idx]
            data_range = df[col].max() - df[col].min()
            # Cap wide-ranging features at ~50 bins; otherwise let seaborn
            # choose the bin width
            binwidth = data_range / 50 if data_range >= 50 else None
            sns.histplot(data=df, x=col, kde=kde, ax=ax, binwidth=binwidth)
            ax.set(title=col, xlabel=None)
pbar.update(progress_unit)
with open(f'pickles/{pickle_file_name}.pickle', 'wb') as file:
pickle.dump(fig, file)
pbar.update(progress_unit)
def plot_hist_cat(df, cols, pickle_file_name, num_cols_per_row=3, fig_w=18, fig_h=5):
    fig, axs = create_subplots(cols, num_cols_per_row, fig_w, fig_h)
    with tqdm(total=100) as pbar:
        progress_unit = 100 / (len(cols) + 1)
        for idx, col in enumerate(cols):
            ax = axs[idx]
            df[col].value_counts().plot(kind='barh', ax=ax)
            ax.set(title=col)
pbar.update(progress_unit)
with open(f'pickles/{pickle_file_name}.pickle', 'wb') as file:
pickle.dump(fig, file)
pbar.update(progress_unit)
def show_figure(fig):
# create a dummy figure and use its
# manager to display "fig"
dummy = plt.figure()
new_manager = dummy.canvas.manager
new_manager.canvas.figure = fig
fig.set_canvas(new_manager.canvas)
def plot_graphs_fast(pickle_file_name):
with open(f'pickles/{pickle_file_name}.pickle', 'rb') as file:
figure = pickle.load(file)
try:
figure.show()
except AttributeError:
show_figure(figure)
figure.show()
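# Minimal round-trip sketch of the pickled-figure workflow (assumes the
# 'pickles/' directory exists; 'demo' is a hypothetical file name):
# fig, ax = plt.subplots()
# ax.plot([0, 1], [0, 1])
# with open('pickles/demo.pickle', 'wb') as file:
#     pickle.dump(fig, file)
# plot_graphs_fast('demo')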
# plot_graphs_fast('heroes')
attr = ['kills', 'deaths', 'assists', 'denies', 'last_hits',
'hero_damage', 'hero_healing', 'tower_damage', 'level']
player2[attr].info()
player2[attr].describe()
# plot_graphs_fast('p2_attr_1st_hist')
# plot_graphs_fast('p2_attr_1st_boxplt')
def map_id_to_value(df, col, id_value_mapper):
for value in df[col].value_counts().index:
idx = df[col] == value
df.loc[idx, col] = id_value_mapper[value]
leaver_status_mapper = {0: 'NONE',
1: 'DISCONNECTED',
2: 'DISCONNECTED_TOO_LONG',
3: 'ABANDONED',
4: 'AFK',
5: 'NEVER_CONNECTED',
6: 'NEVER_CONNECTED_TOO_LONG'}
map_id_to_value(player2, 'leaver_status', leaver_status_mapper)
player2['leaver_status'].value_counts()
player2['stuns'].value_counts(normalize=True)
index = player2['stuns'] == 'None'
player2.loc[index, 'stuns'] = 0.0
player2['stuns'] = player2['stuns'].astype('float64')
xp_features = re.findall(
    r'xp_[^\s]+', ' '.join(player2.columns.values.tolist()))
player2[xp_features].info()
for feature in xp_features:
print(
f'{feature} contains {len(player2[player2[feature].isnull()])} NaN values.')
# plot_graphs_fast('p2_xp_attr_hist')
# plot_graphs_fast('p2_xp_attr_boxplt')
xp_features.remove('xp_roshan')
xp_features.remove('xp_other')
player2, _ = drop_features(player2, ['xp_roshan', 'xp_other'])
impute_xp_hero = SimpleImputer(missing_values=np.nan, strategy='mean')
impute_xp_creep = SimpleImputer(missing_values=np.nan, strategy='mean')
player2['xp_hero'] = impute_xp_hero.fit_transform(
player2['xp_hero'].values.reshape(-1, 1))
player2['xp_creep'] = impute_xp_creep.fit_transform(
player2['xp_creep'].values.reshape(-1, 1))
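# Verify the imputation: neither column should contain NaNs any more
assert not player2[['xp_hero', 'xp_creep']].isnull().values.any()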
gold_features = re.findall(
    r'gold_[^\s]+', ' '.join(player2.columns.values.tolist()))
player2[gold_features].info()
for feature in gold_features:
print(
f'{feature} contains {len(player2[player2[feature].isnull()])} NaN values.')
# plot_graphs_fast('p2_gold_attr_hist')
# plot_graphs_fast('p2_gold_attr_boxplt')
gold_features.remove('gold_per_min')
player2, dropped_gold_features = drop_features(player2, gold_features)
print('List of dropped features:')
pd.Series(dropped_gold_features)
heroes = pd.read_csv('dataset/hero_names.csv')
heroes.head(3)
# Strip the 'npc_dota_' prefix from the internal hero names
heroes['name'] = heroes['name'].str[9:]
heroes.head(3)
heroes = heroes.rename(
columns={'name': 'hero_name', 'localized_name': 'hero_localized_name'})
heroes.head(3)
player2 = player2.merge(heroes, on='hero_id', how='left')
player2[['match_id', 'player_slot', 'hero_localized_name']].head(5)
player2[['hero_id', 'hero_name', 'hero_localized_name']].info()
index = player2['hero_name'].isnull()
player2.loc[index, 'hero_id'].value_counts()
index = player2[player2['hero_id'] == 0].index
player2 = player2.drop(index=index, axis=0)
player2.shape
player2['player_slot'].value_counts()
# Use a custom transformer for data preprocessing
class DataPreprocessorIntermediate(BaseEstimator, TransformerMixin):
def drop_features(self, data, dropped_feature_regex):
dropped_features = [re.findall(reg, ' '.join(
df.columns.values.tolist())) for reg in cols_pattern]
dropped_features = list(pd.core.common.flatten(dropped_features))
df = df.copy().drop(columns=dropped_features, axis=1)
return data, dropped_indexes
def fit(self, data, y=None):
return self
def transform(self, data, y=None):
with tqdm(total=100) as pbar:
progress_step = 100 / 7
# Step 1: Join with match.csv
match = pd.read_csv('dataset/match.csv')
match = MatchDataPreprocessor().fit_transform(match)
data = data.join(match, on='match_id', how='inner')
pbar.update(progress_step)
# Step 2: Replace missing value in stuns with 0
index = data['stuns'] == 'None'
data.loc[index, 'stuns'] = 0.0
data['stuns'] = data['stuns'].astype('float64')
pbar.update(progress_step)
# Step 3: Impute xp_hero and xp_creep
impute_xp_hero = SimpleImputer(
missing_values=np.nan, strategy='mean')
impute_xp_creep = SimpleImputer(
missing_values=np.nan, strategy='mean')
data['xp_hero'] = impute_xp_hero.fit_transform(
data['xp_hero'].values.reshape(-1, 1))
data['xp_creep'] = impute_xp_creep.fit_transform(
data['xp_creep'].values.reshape(-1, 1))
pbar.update(progress_step)
            # Step 4: Map hero_id to hero names
heroes = pd.read_csv('dataset/hero_names.csv')
data = data.merge(heroes, on='hero_id', how='left')
index = data[data['hero_id'] == 0].index
data = data.drop(index=index, axis=0)
pbar.update(progress_step)
            # Step 5: One-hot encode the hero columns (deferred; this is
            # done later at the team level, outside this transformer)
            pbar.update(progress_step)
            # Step 6: Map leaver_status ids to their names
data['leaver_status'] = data['leaver_status'].astype('object')
leaver_status_col = ['NONE', 'DISCONNECTED', 'DISCONNECTED_TOO_LONG',
'ABANDONED', 'AFK', 'NEVER_CONNECTED',
'NEVER_CONNECTED_TOO_LONG']
for idx, status in enumerate(leaver_status_col):
index = data[data['leaver_status'] == idx].index
data.loc[index, 'leaver_status'] = status
pbar.update(progress_step)
# Step 7: Drop features
            gold_features = re.findall(
                r'gold_[^\s]+', ' '.join(data.columns.values.tolist()))
            gold_features.remove('gold_per_min')
            dropped_features_regex = ['account_id', r'item_\d',
                                      r'unit_order_[^\s]+', 'xp_roshan',
                                      'xp_other', 'hero_id', 'name',
                                      'duration', 'first_blood_time',
                                      'negative_votes', 'positive_votes']
            dropped_features_regex += gold_features
data, _ = drop_features(data, dropped_features_regex)
pbar.update(progress_step)
return data
# match_id and player_slot haven't been dropped yet
player3 = DataPreprocessorIntermediate().fit_transform(player.copy())
player3.head(3)
match_id_counts = player3['match_id'].value_counts()
dropped_match_ids = []
for id_, count in zip(match_id_counts.index, match_id_counts.values):
if count != 10:
dropped_match_ids.append(id_)
print(
    f"There are {len(dropped_match_ids)} matches with incomplete players' records.")
index = player3[player3['match_id'].isin(dropped_match_ids)].index
player3 = player3.drop(index=index, axis=0)
player3.shape
player3['player_slot'].value_counts()
radiant_players_slots = list(range(5))
dire_players_slots = list(range(128, 128 + 5))
# Select each side's players directly so the radiant and dire frames
# actually contain the slots they are named after
radiant_players = player3[player3['player_slot'].isin(radiant_players_slots)]
dire_players = player3[player3['player_slot'].isin(dire_players_slots)]
sum_features = ['gold', 'gold_per_min', 'xp_per_min', 'kills', 'deaths', 'assists', 'denies',
'last_hits', 'hero_damage', 'hero_healing', 'tower_damage', 'level', 'xp_hero', 'xp_creep'
]
feature = ['match_id'] + sum_features
rad_sum = radiant_players[feature].groupby(['match_id']).sum()
dire_sum = dire_players[feature].groupby(['match_id']).sum()
radiant_team = rad_sum.copy()
dire_team = dire_sum.copy()
status_features = ['match_id', 'leaver_status']
def one_hot_status(players):
enc = OneHotEncoder(sparse=False)
status_arr = enc.fit_transform(
players['leaver_status'].values.reshape(-1, 1))
feature_names = enc.get_feature_names(['status'])
status_1_hot = pd.DataFrame(status_arr, columns=feature_names)
status_1_hot = status_1_hot.drop(columns=['status_NONE'], axis=1)
status_1_hot['match_id'] = pd.Series(players['match_id'].values)
return status_1_hot, list(status_1_hot.columns.values)
rad_players_status_encoded, status_features = one_hot_status(radiant_players)
dire_players_status_encoded, status_features = one_hot_status(dire_players)
status_features.remove('match_id')
rad_teams_status = rad_players_status_encoded.groupby('match_id')[
    status_features].sum()
dire_teams_status = dire_players_status_encoded.groupby('match_id')[
    status_features].sum()
rad_teams_status[status_features] = rad_teams_status[status_features].replace(
np.arange(1, 6, 1, dtype='float64'), 1)
dire_teams_status[status_features] = dire_teams_status[status_features].replace(
np.arange(1, 6, 1, dtype='float64'), 1)
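# Sanity check of the flag conversion above: every leaver-status column
# should now hold only 0/1 team-level indicators
assert rad_teams_status[status_features].isin([0.0, 1.0]).values.all()
assert dire_teams_status[status_features].isin([0.0, 1.0]).values.all()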
radiant_team = radiant_team.join(rad_teams_status)
dire_team = dire_team.join(dire_teams_status)
radiant_team.head(5)
dire_team.head(5)
for feature in status_features:
print(feature + ' (radiant)')
print(radiant_team[feature].value_counts())
print()
print(feature + ' (dire)')
print(dire_team[feature].value_counts())
print()
heroes_features = ['match_id', 'localized_name']
def one_hot_heroes(players):
enc = OneHotEncoder(sparse=False)
status_arr = enc.fit_transform(
players['localized_name'].values.reshape(-1, 1))
feature_names = enc.get_feature_names(['hero'])
status_1_hot = pd.DataFrame(status_arr, columns=feature_names)
status_1_hot['match_id'] = pd.Series(players['match_id'].values)
return status_1_hot, list(status_1_hot.columns.values)
rad_players_heroes_encoded, heroes_features = one_hot_heroes(radiant_players)
dire_players_heroes_encoded, heroes_features = one_hot_heroes(dire_players)
heroes_features.remove('match_id')
rad_teams_heroes = rad_players_heroes_encoded.groupby('match_id')[
    heroes_features].sum()
dire_teams_heroes = dire_players_heroes_encoded.groupby('match_id')[
    heroes_features].sum()
radiant_team = radiant_team.join(rad_teams_heroes[heroes_features])
dire_team = dire_team.join(dire_teams_heroes[heroes_features])
print(
    f'Data validation:\nEvery radiant team has exactly 5 heroes: {np.all(radiant_team[heroes_features].sum(axis=1) == 5)}')
print(
    f'Every dire team has exactly 5 heroes: {np.all(dire_team[heroes_features].sum(axis=1) == 5)}')
radiant_team.head(5)
dire_team.head(5)
# The final end-to-end preprocessing (the intermediate steps above plus team
# aggregation and one-hot encoding) is performed step by step below, with a
# null-value check on the target after each step.
player = pd.read_csv('dataset/players.csv')
data = player.copy()
def test(no, df):
    # Sanity check run after each step: report null values in the target
    bool_index = pd.isnull(df['radiant_win'])
    if np.any(bool_index):
        print(
            f'{no} radiant_win contains {len(df.loc[bool_index, "radiant_win"])} null values')
def drop_features(df, dropped_feature_regex):
    dropped_features = [re.findall(reg, ' '.join(
        df.columns.values.tolist())) for reg in dropped_feature_regex]
    dropped_features = list(pd.core.common.flatten(dropped_features))
    df = df.drop(columns=dropped_features)
    return df, dropped_features
with tqdm(total=100) as pbar:
progress_step = 100 / 15
# Step 1: Join with match.csv
match = pd.read_csv('dataset/match.csv')
match = MatchDataPreprocessor().fit_transform(match)
data = data.join(match, on='match_id', how='inner')
test(1, data)
pbar.update(progress_step)
# Step 2: Replace missing value in stuns with 0
index = data['stuns'] == 'None'
data.loc[index, 'stuns'] = 0.0
data['stuns'] = data['stuns'].astype('float64')
test(2, data)
pbar.update(progress_step)
# Step 3: Impute xp_hero and xp_creep
impute_xp_hero = SimpleImputer(missing_values=np.nan, strategy='mean')
impute_xp_creep = SimpleImputer(missing_values=np.nan, strategy='mean')
data['xp_hero'] = impute_xp_hero.fit_transform(
data['xp_hero'].values.reshape(-1, 1))
data['xp_creep'] = impute_xp_creep.fit_transform(
data['xp_creep'].values.reshape(-1, 1))
test(3, data)
pbar.update(progress_step)
    # Step 4: Map hero_id to hero names
heroes = pd.read_csv('dataset/hero_names.csv')
data = data.merge(heroes, on='hero_id', how='left')
index = data[data['hero_id'] == 0].index
data = data.drop(index=index, axis=0)
test(4, data)
pbar.update(progress_step)
    # Step 5: Map leaver_status ids to their names
data['leaver_status'] = data['leaver_status'].astype('object')
leaver_status_col = ['NONE', 'DISCONNECTED', 'DISCONNECTED_TOO_LONG',
'ABANDONED', 'AFK', 'NEVER_CONNECTED',
'NEVER_CONNECTED_TOO_LONG']
for idx, status in enumerate(leaver_status_col):
index = data[data['leaver_status'] == idx].index
data.loc[index, 'leaver_status'] = status
pbar.update(progress_step)
test(5, data)
# Step 6: Drop matches that contain incomplete information
match_id_counts = data['match_id'].value_counts()
dropped_match_ids = []
for id_, count in zip(match_id_counts.index, match_id_counts.values):
if count != 10:
dropped_match_ids.append(id_)
index = data[data['match_id'].isin(dropped_match_ids)].index
data = data.drop(index=index, axis=0)
pbar.update(progress_step)
test(6, data)
    # Step 7: Split into two team dataframes: radiant and dire
    radiant_players_slots = list(range(5))
    dire_players_slots = list(range(128, 128 + 5))
    # Select each side's players directly so each frame holds the slots it
    # is named after
    radiant_players = data[data['player_slot'].isin(radiant_players_slots)]
    dire_players = data[data['player_slot'].isin(dire_players_slots)]
test(7, data)
pbar.update(progress_step)
# Step 8: Sum up all numerical features of players for each team in each match
sum_features = ['gold', 'gold_per_min', 'xp_per_min', 'kills', 'deaths', 'assists', 'denies',
'last_hits', 'hero_damage', 'hero_healing', 'tower_damage', 'level', 'xp_hero', 'xp_creep',
'stuns'
]
feature = ['match_id'] + sum_features
rad_sum = radiant_players[feature].groupby(['match_id']).sum()
dire_sum = dire_players[feature].groupby(['match_id']).sum()
    # The groupby index is match_id itself; turn it back into a regular
    # column (re-assigning per-player values here would misalign the rows)
    radiant_team = rad_sum.reset_index()
    dire_team = dire_sum.reset_index()
pbar.update(progress_step)
test(8, data)
# Step 9: One hot encode leaver_status
status_features = ['match_id', 'leaver_status']
def one_hot_status(players):
enc = OneHotEncoder(sparse=False)
status_arr = enc.fit_transform(
players['leaver_status'].values.reshape(-1, 1))
feature_names = enc.get_feature_names(['status'])
status_1_hot = pd.DataFrame(status_arr, columns=feature_names)
status_1_hot = status_1_hot.drop(columns=['status_NONE'], axis=1)
status_1_hot['match_id'] = pd.Series(players['match_id'].values)
return status_1_hot, list(status_1_hot.columns.values)
rad_players_status_encoded, status_features = one_hot_status(
radiant_players)
dire_players_status_encoded, status_features = one_hot_status(dire_players)
pbar.update(progress_step)
test(9, data)
    # Step 10: Combine each team's leaver statuses into per-match flags for radiant and dire
    status_features.remove('match_id')
    rad_teams_status = rad_players_status_encoded.groupby('match_id')[
        status_features].sum()
    dire_teams_status = dire_players_status_encoded.groupby('match_id')[
        status_features].sum()
rad_teams_status[status_features] = rad_teams_status[status_features].replace(
np.arange(1, 6, 1, dtype='float64'), 1)
dire_teams_status[status_features] = dire_teams_status[status_features].replace(
np.arange(1, 6, 1, dtype='float64'), 1)
    # Assign the index values directly; wrapping them in pd.Series would
    # re-align on a 0..N-1 range index and scramble the match ids
    rad_teams_status['match_id'] = rad_teams_status.index
    dire_teams_status['match_id'] = dire_teams_status.index
    rad_teams_status.index.name = None
    dire_teams_status.index.name = None
radiant_team = radiant_team.merge(rad_teams_status, on='match_id')
dire_team = dire_team.merge(dire_teams_status, on='match_id')
pbar.update(progress_step)
test(10, data)
# Step 11: One hot encode heroes
heroes_features = ['match_id', 'localized_name']
def one_hot_heroes(players):
enc = OneHotEncoder(sparse=False)
status_arr = enc.fit_transform(
players['localized_name'].values.reshape(-1, 1))
feature_names = enc.get_feature_names(['hero'])
status_1_hot = pd.DataFrame(status_arr, columns=feature_names)
status_1_hot['match_id'] = pd.Series(players['match_id'].values)
return status_1_hot, list(status_1_hot.columns.values)
rad_players_heroes_encoded, heroes_features = one_hot_heroes(
radiant_players)
dire_players_heroes_encoded, heroes_features = one_hot_heroes(dire_players)
pbar.update(progress_step)
test(11, data)
    # Step 12: Combine each team's heroes into per-match indicators for radiant and dire
    heroes_features.remove('match_id')
    rad_teams_heroes = rad_players_heroes_encoded.groupby('match_id')[
        heroes_features].sum()
    dire_teams_heroes = dire_players_heroes_encoded.groupby('match_id')[
        heroes_features].sum()
    rad_teams_heroes['match_id'] = rad_teams_heroes.index
    dire_teams_heroes['match_id'] = dire_teams_heroes.index
    rad_teams_heroes.index.name = None
    dire_teams_heroes.index.name = None
radiant_team = radiant_team.merge(rad_teams_heroes, on='match_id')
dire_team = dire_team.merge(dire_teams_heroes, on='match_id')
pbar.update(progress_step)
test(12, data)
    # Step 13: Join the two teams back into a single dataframe; deduplicate
    # the per-player outcome rows so each match contributes exactly one row
    outcome = data[['match_id', 'radiant_win']].drop_duplicates()
    data = dire_team.merge(outcome, on='match_id')
    data = radiant_team.merge(data, suffixes=['_rad', '_dire'], on='match_id')
pbar.update(progress_step)
test(13, data)
# Step 14: Feature Scaling
num_attr = ['gold', 'gold_per_min', 'xp_per_min', 'kills', 'deaths',
'assists', 'denies', 'last_hits', 'stuns', 'hero_damage',
'hero_healing', 'tower_damage', 'level', 'xp_hero', 'xp_creep']
rad_num_attr = [attr + '_rad' for attr in num_attr]
dire_num_attr = [attr + '_dire' for attr in num_attr]
num_attr = rad_num_attr + dire_num_attr
std_scaler = StandardScaler()
data[num_attr] = std_scaler.fit_transform(data[num_attr])
pbar.update(progress_step)
test(14, data)
    # Step 15: Drop the identifier features
    dropped_features_regex = [r'match_id_[^\s]+', 'match_id']
    data, _ = drop_features(data, dropped_features_regex)
    pbar.update(progress_step)

# The fully preprocessed, team-level dataset used for modelling below
final_player = data
list(final_player.columns.values)
X = final_player.drop(columns='radiant_win', axis=1)
y = final_player['radiant_win']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25, random_state=RANDOM_SEED)
print(f'X_train shape: {X_train.shape}\nX_test shape: {X_test.shape}')
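# Inspect the class balance of the target before fitting any models
y_train.value_counts(normalize=True)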
for col in list(data.columns.values):
bool_index = np.isnan(data[col])
if np.any(bool_index):
print(f'{col} contains {len(data.loc[bool_index, col])} null values')
for col in list(final_player.columns.values):
bool_index = np.isnan(final_player[col])
if np.any(bool_index):
print(
f'{col} contains {len(final_player.loc[bool_index, col])} null values')
len(player3.loc[np.isnan(player3['radiant_win'])])
final_player.loc[np.isnan(final_player['radiant_win'])]
short_names = ['log_reg', 'rand_forest', 'extra_tree',
'ada_boost_cf', 'gradient_b_cf', 'bagging_cf']
names = ['Logistic Regression', 'Random forest Classifier',
'Extra Tree Classifier', 'AdaBoost Classifier', 'Gradient Boosting Classifier',
'Bagging Classifier']
functions = [
LogisticRegression(
penalty='none', random_state=RANDOM_SEED, max_iter=1000),
RandomForestClassifier(random_state=RANDOM_SEED),
ExtraTreesClassifier(random_state=RANDOM_SEED),
AdaBoostClassifier(random_state=RANDOM_SEED),
GradientBoostingClassifier(random_state=RANDOM_SEED),
BaggingClassifier(random_state=RANDOM_SEED)
]
classifiers_idx = {}
classifiers = {}
# Zip all classifiers together into dictionaries keyed by index and by short name
for idx, s_name, name, func in zip(range(len(names)), short_names, names, functions):
classifiers_idx[idx] = {'name': name, 'func': func}
classifiers[s_name] = {'name': name, 'func': func}
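# Example lookups (both dictionaries hold the same entries):
# classifiers['log_reg']['func'] -> the LogisticRegression instance
# classifiers_idx[0]['name']     -> 'Logistic Regression'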
def get_models_performance(classifiers, X, y):
train_results = {'classifier_name': [], 'duration': [],
'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}
test_results = {'classifier_name': [], 'duration': [],
'precision': [], 'recall': [], 'f1_score': [], 'auc_score': []}
# Loop through all classifiers
for idx in range(len(classifiers)):
cf_name = classifiers[idx]['name']
print(f'{cf_name} has started...')
# Count time to get the duration of the models
start = time.time()
# cross_validate returns both train_score and test_score by setting return_train_score to True
cv_scores = cross_validate(classifiers[idx]['func'], X, y,
scoring=['f1', 'roc_auc', 'precision', 'recall'], cv=5,
return_train_score=True, n_jobs=-1)
end = time.time()
duration = end - start
print(f'{cf_name} ended in {duration} seconds.\n')
updateRecord(train_results, cv_scores, 'train', cf_name, duration)
updateRecord(test_results, cv_scores, 'test', cf_name, duration)
# Return as DataFrame instead of dictionary
return pd.DataFrame(train_results), pd.DataFrame(test_results)
# Append values to the dictionary based on key_name passed into the function
def updateRecord(df, scores, key_name, classifier_name, duration):
df['classifier_name'].append(classifier_name)
df['duration'].append(duration)
df['f1_score'].append(np.mean(scores[f'{key_name}_f1']))
df['auc_score'].append(np.mean(scores[f'{key_name}_roc_auc']))
df['precision'].append(np.mean(scores[f'{key_name}_precision']))
df['recall'].append(np.mean(scores[f'{key_name}_recall']))
train_results, test_results = get_models_performance(
    classifiers_idx, X_train, y_train)
# Save the cross-validation results so they can be reloaded later
with open('pickles/model_performance.pickle', 'wb') as file:
pickle.dump(train_results, file)
pickle.dump(test_results, file)
# with open('pickles/model_performance.pickle', 'rb') as file:
# train_results = pickle.load(file)
# test_results = pickle.load(file)
train_results
test_results