!ln -s /sportsreference/sportsipy/nba/boxscore.py boxscore.py
from sportsreference.sportsipy.nba.boxscore import Boxscores, Boxscore
# !ln -s /sportsreference/sportsipy/nba/schedule.py schedule.py
# from sportsreference.sportsipy.nba.schedule import Schedule
# !ln -s /sportsreference/sportsipy/nba/roster.py roster.py
# from sportsreference.sportsipy.nba.roster import Player, Roster
# !ln -s /sportsreference/sportsipy/nba/teams.py teams.py
# from sportsreference.sportsipy.nba.teams import Teams
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import normalize, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LinearRegression
# games_2018_2019 = Boxscores(datetime(2018, 10, 16), datetime(2019, 4, 10)) #boxscores dictionary in datetime range
# game_id_list_2018_2019 = []
# for day, games_list in games_2018_2019.games.items():
# if games_list:
# for game in games_list:
# game_id_list_2018_2019.append(game['boxscore'])
# game_id_list_2018_2019
# games_2019_2020 = Boxscores(datetime(2019, 10, 22), datetime(2020, 8, 14)) #boxscores dictionary in datetime range
# game_id_list_2019_2020 = []
# for day, games_list in games_2019_2020.games.items():
# if games_list:
# for game in games_list:
# game_id_list_2019_2020.append(game['boxscore'])
# game_id_list_2019_2020
# Boxscores index for every day from 22 Dec 2020 through 30 Apr 2021.
games_2020_2021 = Boxscores(datetime(2020, 12, 22), datetime(2021, 4, 30))  # boxscores dictionary in datetime range
# Flatten the per-day game lists into one list of boxscore ids; days whose
# list is empty (or otherwise falsy) contribute nothing.
game_id_list_2020_2021 = [
    game['boxscore']
    for games_list in games_2020_2021.games.values()
    for game in (games_list or [])
]
game_id_list_2020_2021
# nba_2018_2019 = pd.DataFrame()
# for game_id in game_id_list_2018_2019:
# game_boxscore_df = Boxscore(game_id).dataframe
# nba_2018_2019 = nba_2018_2019.append(game_boxscore_df)
# nba_2018_2019.to_csv(r'BoxScores_2018_2019.csv')
# nba_2019_2020 = pd.DataFrame()
# for game_id in game_id_list_2019_2020:
# game_boxscore_df = Boxscore(game_id).dataframe
# nba_2019_2020 = nba_2019_2020.append(game_boxscore_df)
# nba_2019_2020
#nba_2019_2020.to_csv(r'BoxScores_2019_2020.csv')
# nba_2020_2021 = pd.DataFrame()
# for game_id in game_id_list_2020_2021:
# game_boxscore_df = Boxscore(game_id).dataframe
# nba_2020_2021 = nba_2020_2021.append(game_boxscore_df)
# nba_2020_2021.to_csv(r'BoxScores_2020_2021_ThreeMonths.csv')
# AWAY: no instances of incorrect minutes played
# HOME: 67 instances of incorrect minutes played -> overtime games
# home losses and home wins are wrong, so they cannot be used for now
# all OT games currently have the home team data messed up, so they are not used for now
# Load the saved box-score CSV for each season.
nba2018 = pd.read_csv(r'BoxScores_2018_2019.csv')
#nba2018
nba2019 = pd.read_csv(r'BoxScores_2019_2020.csv')
#nba2019
nba2020 = pd.read_csv(r'BoxScores_2020_2021_ThreeMonths.csv')
# Load the companion files whose home_wins / home_losses columns are copied
# onto the box-score frames by inputWL.
WL2018 = pd.read_csv(r'boxscores2018HomeWL.csv')
WL2019 = pd.read_csv(r'boxscores2019HomeWL.csv')
WL2020 = pd.read_csv(r'boxscores2020HomeWL.csv')
def inputWL(original, WL):
    """Copy the ``home_wins`` and ``home_losses`` columns of *WL* onto *original*.

    Both columns are read from *WL* first and then assigned, so pandas aligns
    them with *original* by index label. *original* is modified in place and
    the same object is returned.
    """
    record = {name: WL[name] for name in ('home_wins', 'home_losses')}
    for name, values in record.items():
        original[name] = values
    return original
# Set home_wins / home_losses on each season's box scores from its W/L file.
nba2018 = inputWL(nba2018, WL2018)
nba2019 = inputWL(nba2019, WL2019)
nba2020 = inputWL(nba2020, WL2020)
## the whole cleaning step for a raw box-score frame, bundled into one function
def format_nba_df(df):
    """Return a cleaned copy of a raw box-score frame.

    The result has a boolean ``homeWin`` column, the ``'Unnamed: 0'`` column
    renamed to ``gameID``, ``date`` parsed into ``datetime.date`` objects
    (time of day dropped) and ``homeTeam`` / ``awayTeam`` abbreviations
    derived from the winner/loser columns.

    Note that ``homeWin`` is also added to the frame passed in, because it is
    written before ``rename`` creates the copy that is returned.
    """
    date_format = '%I:%M %p, %B %d, %Y'
    df['homeWin'] = df['winner'] == 'Home'
    cleaned = df.rename(columns={'Unnamed: 0': 'gameID'})
    cleaned['date'] = cleaned['date'].map(
        lambda stamp: datetime.strptime(stamp, date_format).date()
    )
    home_won = cleaned['homeWin']
    # The home team is the winner when the home side won, otherwise the loser;
    # the away team is simply the other one.
    cleaned['homeTeam'] = np.where(home_won, cleaned['winning_abbr'], cleaned['losing_abbr'])
    cleaned['awayTeam'] = np.where(home_won, cleaned['losing_abbr'], cleaned['winning_abbr'])
    return cleaned
## converts one MMDD-style integer from an odds file into a 'YYYY-MM-DD' string, year must be specified
def convert_to_datetime(x, year):
    """Turn a month/day number such as ``1017`` or ``410`` into an ISO date string.

    The last two digits are the day and the leading digits the month. Values
    below 1000 (months 1-9) are dated in *year*; values of 1000 and above
    (months 10-12) are dated in ``year - 1``.

    Returns the date formatted as ``'%Y-%m-%d'``.
    """
    digits = str(x)
    season_year = year if x < 1000 else year - 1
    month, day = int(digits[:-2]), int(digits[-2:])
    return datetime(year=season_year, month=month, day=day).strftime("%Y-%m-%d")
# Clean each season's box scores; the bare names display the result in a notebook.
nba2018_formatted = format_nba_df(nba2018)
nba2018_formatted
nba2019_formatted = format_nba_df(nba2019)
nba2019_formatted
nba2020_formatted = format_nba_df(nba2020)
nba2020_formatted
# load the odds files for the 2017-18 through 2020-21 seasons
odds2017 = pd.read_csv(r'nba-odds-2017-18.csv')
odds2018 = pd.read_csv(r'nba-odds-2018-19.csv')
odds2019 = pd.read_csv(r'nba-odds-2019-20.csv')
odds2020 = pd.read_csv(r'nba-odds-2020-21.csv')
print(odds2020.shape)
# Dropping last 28 rows of NaN from odds2018 using drop
odds2018.drop(odds2018.tail(28).index, inplace = True)
odds2018
odds2020
## one game played in london: its two rows are marked 'N' instead of 'V'/'H'
odds2017['VH'].unique()
## we need to change these to home and away so it matches our other df
odds2017.loc[odds2017['VH']=='N']
## change values for home and away team.
## Assign with a single .iloc call on the frame itself: the chained form
## odds2017['VH'].iloc[...] = ... may write to a temporary copy and leave the
## frame unchanged.
vh_col = odds2017.columns.get_loc('VH')
odds2017.iloc[1233, vh_col] = 'H'
odds2017.iloc[1232, vh_col] = 'V'
## check work
#odds2017.iloc[[1232,1233]]
## do the same for 2018 df
odds2018.loc[odds2018['VH']=='N']
## change the value for home and away team
vh_col = odds2018.columns.get_loc('VH')
odds2018.iloc[1329, vh_col] = 'H'
odds2018.iloc[1328, vh_col] = 'V'
## check work
#odds2018.iloc[[1328,1329]]
## A closing line of 'pk' / 'PK' is stored as 0 so the column can be read as
## numbers later. The write goes through .loc on the frame itself; the earlier
## chained form (frame['Close'][ind] = 0) may update a temporary copy instead.
for season_odds in (odds2018, odds2019, odds2020):
    season_odds.loc[season_odds['Close'].isin(['pk', 'PK']), 'Close'] = 0
## convert the Date column to 'YYYY-MM-DD' strings (convert_to_datetime returns strings);
## the year passed is used for months 1-9, the previous year for months 10-12
## NOTE(review): odds2017 passes the raw value while the other seasons cast to int first -
## confirm odds2017['Date'] is already integer-typed
odds2017['date'] = odds2017['Date'].apply(lambda x: convert_to_datetime(x, 2018))
odds2018['date'] = odds2018['Date'].apply(lambda x: convert_to_datetime(int(x), 2019))
odds2019['date'] = odds2019['Date'].apply(lambda x: convert_to_datetime(int(x), 2020))
odds2020['date'] = odds2020['Date'].apply(lambda x: convert_to_datetime(int(x), 2021))
## team name to team abbrev dict for mapping; multi-word names are listed both
## with and without the space (e.g. 'GoldenState' and 'Golden State')
team_dict = {'Atlanta':'ATL', 'Boston':'BOS', 'Brooklyn':'BRK','Charlotte':'CHO','Chicago':'CHI','Cleveland':'CLE','Dallas':'DAL', 'Denver':'DEN','Detroit':'DET',
'GoldenState':'GSW', 'Golden State': 'GSW', 'Houston':'HOU', 'Indiana':'IND','LAClippers':'LAC', 'LA Clippers':'LAC', 'LALakers':'LAL', 'LA Lakers': 'LAL', 'Memphis':'MEM','Miami':'MIA','Milwaukee':'MIL','Minnesota':'MIN',
'NewOrleans':'NOP', 'New Orleans': 'NOP', 'NewYork':'NYK','New York':'NYK','OklahomaCity':'OKC','Oklahoma City':'OKC','Orlando':'ORL','Philadelphia':'PHI','Phoenix':'PHO','Portland':'POR','Sacramento':'SAC',
'SanAntonio':'SAS', 'San Antonio':'SAS','Toronto':'TOR','Utah':'UTA','Washington':'WAS'}
## change team names to abbrevs (the 2017-18 frame is left unmapped)
## NOTE(review): Series.map yields NaN for any name missing from team_dict - verify none are lost
#odds2017['Team'] = odds2017['Team'].map(team_dict)
odds2018['Team'] = odds2018['Team'].map(team_dict)
odds2019['Team'] = odds2019['Team'].map(team_dict)
odds2020['Team'] = odds2020['Team'].map(team_dict)
# Closing values at or above this are not treated as a point spread
# (presumably they are the game's over/under total - TODO confirm).
_SPREAD_CEILING = 150


def _closing_spreads(close):
    """Build the signed 'Spread' values from a season's 'Close' column.

    Rows are consumed two at a time. Within each pair one closing value is
    taken as the spread: the first row's value when it is below
    ``_SPREAD_CEILING``, otherwise the second row's. The row that supplied the
    value gets ``+spread`` and its partner gets ``-spread``.

    Args:
        close: iterable of closing values convertible with ``float``.

    Returns:
        A list with one spread per input row.

    Raises:
        ValueError: if the number of rows is odd, so the rows cannot be paired.
    """
    values = [float(value) for value in close]
    if len(values) % 2:
        raise ValueError(
            'expected an even number of odds rows, got {}'.format(len(values))
        )
    spreads = []
    for first, second in zip(values[::2], values[1::2]):
        if first < _SPREAD_CEILING:
            spreads.extend((first, -first))
        else:
            spreads.extend((-second, second))
    return spreads


odds2018['Spread'] = _closing_spreads(odds2018['Close'])
odds2018
odds2019['Spread'] = _closing_spreads(odds2019['Close'])
odds2019
odds2020['Spread'] = _closing_spreads(odds2020['Close'])
odds2020
### separate into home and away rows and drop playoff games by keeping only the
### first 1230 rows of each side
homeOdds2017 = odds2017.loc[odds2017['VH']=='H'].reset_index(drop=True)[:1230]
awayOdds2017 = odds2017.loc[odds2017['VH']=='V'].reset_index(drop=True)[:1230]
homeOdds2018 = odds2018.loc[odds2018['VH']=='H'].reset_index(drop=True)[:1230]
awayOdds2018 = odds2018.loc[odds2018['VH']=='V'].reset_index(drop=True)[:1230]
homeOdds2019 = odds2019.loc[odds2019['VH']=='H'].reset_index(drop=True)[:1230]
awayOdds2019 = odds2019.loc[odds2019['VH']=='V'].reset_index(drop=True)[:1230]
# The 2020-21 rows must be selected with the 2020-21 frame's own VH column;
# filtering them with the odds2017 mask picked rows by another season's layout.
homeOdds2020 = odds2020.loc[odds2020['VH']=='H'].reset_index(drop=True)[:1230]
awayOdds2020 = odds2020.loc[odds2020['VH']=='V'].reset_index(drop=True)[:1230]
# New names for the team / moneyline / spread columns on each side.
home_column_names = {'Team': 'homeTeam', 'ML': 'homeML', 'Spread': 'homeSpread'}
away_column_names = {'Team': 'awayTeam', 'ML': 'awayML', 'Spread': 'awaySpread'}
# rename home columns
homeOdds2017 = homeOdds2017.rename(columns=home_column_names)
homeOdds2018 = homeOdds2018.rename(columns=home_column_names)
homeOdds2019 = homeOdds2019.rename(columns=home_column_names)
homeOdds2020 = homeOdds2020.rename(columns=home_column_names)
# rename away columns
awayOdds2017 = awayOdds2017.rename(columns=away_column_names)
awayOdds2018 = awayOdds2018.rename(columns=away_column_names)
awayOdds2019 = awayOdds2019.rename(columns=away_column_names)
awayOdds2020 = awayOdds2020.rename(columns=away_column_names)
## create half of the combined df from the home-side columns; .copy() makes each
## one an independent frame so the away columns added below are not written to
## a slice of homeOdds*
#combinedOdds2017 = homeOdds2017[['date','homeTeam','homeML', 'homeSpread']]
combinedOdds2018 = homeOdds2018[['date','homeTeam','homeML', 'homeSpread']].copy()
combinedOdds2019 = homeOdds2019[['date','homeTeam','homeML', 'homeSpread']].copy()
combinedOdds2020 = homeOdds2020[['date','homeTeam','homeML', 'homeSpread']].copy()
## add in away columns (home and away frames were both re-indexed from 0, so
## the rows pair up by index)
#combinedOdds2017['awayOdds'] = awayOdds2017['awayOdds']
#combinedOdds2017['awayTeam'] = awayOdds2017['awayTeam']
combinedOdds2018['awayML'] = awayOdds2018['awayML']
combinedOdds2018['awaySpread'] = awayOdds2018['awaySpread']
combinedOdds2018['awayTeam'] = awayOdds2018['awayTeam']
sortedOdds2018 = combinedOdds2018.sort_values(by=['date','homeTeam']).reset_index(drop=True)
## The box scores are sorted the same way AND renumbered. The odds columns are
## attached further down by index label, so both frames must be numbered in
## sorted order; without the reset the box scores keep their pre-sort labels
## and odds can land on a different game.
sortedNba2018 = nba2018_formatted.sort_values(by=['date','homeTeam']).reset_index(drop=True)
# display(sortedOdds2018[['homeTeam', 'awayTeam']].tail(5))
# display(sortedNba2018[['homeTeam', 'awayTeam']].tail(5))
#sortedOdds2018
combinedOdds2019['awayML'] = awayOdds2019['awayML']
combinedOdds2019['awaySpread'] = awayOdds2019['awaySpread']
combinedOdds2019['awayTeam'] = awayOdds2019['awayTeam']
sortedOdds2019 = combinedOdds2019.sort_values(by=['date','homeTeam']).reset_index(drop=True)
sortedNba2019 = nba2019_formatted.sort_values(by=['date','homeTeam']).reset_index(drop=True)
combinedOdds2020['awayML'] = awayOdds2020['awayML']
combinedOdds2020['awaySpread'] = awayOdds2020['awaySpread']
combinedOdds2020['awayTeam'] = awayOdds2020['awayTeam']
sortedOdds2020 = combinedOdds2020.sort_values(by=['date','homeTeam']).reset_index(drop=True)
sortedNba2020 = nba2020_formatted.sort_values(by=['date','homeTeam']).reset_index(drop=True)
# Copy the four odds columns onto each season's box scores (pandas lines the
# values up by index label).
odds_columns = ('homeML', 'homeSpread', 'awayML', 'awaySpread')
for season_nba, season_odds in (
    (sortedNba2018, sortedOdds2018),
    (sortedNba2019, sortedOdds2019),
    (sortedNba2020, sortedOdds2020),
):
    for odds_column in odds_columns:
        season_nba[odds_column] = season_odds[odds_column]
# Keep only the games whose home_minutes_played is at least 240.
newNba2018 = sortedNba2018[sortedNba2018['home_minutes_played'] >= 240]
newNba2019 = sortedNba2019[sortedNba2019['home_minutes_played'] >= 240]
newNba2020 = sortedNba2020[sortedNba2020['home_minutes_played'] >= 240]
# Box-score statistics present for each side of a game, in the column order
# shared by the three lists below.
_STAT_NAMES = (
    'assist_percentage', 'assists',
    'block_percentage', 'blocks', 'defensive_rating',
    'defensive_rebound_percentage', 'defensive_rebounds',
    'effective_field_goal_percentage', 'field_goal_attempts',
    'field_goal_percentage', 'field_goals',
    'free_throw_attempt_rate', 'free_throw_attempts',
    'free_throw_percentage', 'free_throws',
    'minutes_played', 'offensive_rating',
    'offensive_rebound_percentage', 'offensive_rebounds',
    'personal_fouls', 'points', 'steal_percentage',
    'steals', 'three_point_attempt_rate',
    'three_point_field_goal_attempts',
    'three_point_field_goal_percentage',
    'three_point_field_goals', 'total_rebound_percentage',
    'total_rebounds', 'true_shooting_percentage',
    'turnover_percentage', 'turnovers',
    'two_point_field_goal_attempts',
    'two_point_field_goal_percentage', 'two_point_field_goals',
)
# Same statistics with the away_ / home_ / desired_ prefix.
away_cols = ['away_' + stat for stat in _STAT_NAMES]
home_cols = ['home_' + stat for stat in _STAT_NAMES]
desired_cols = ['desired_' + stat for stat in _STAT_NAMES]
# Alphabetical list of team abbreviations. It is built from newNba2018 because
# final2018 is only read back from disk further down the file, so referring to
# it here raises NameError when the file is run top to bottom.
team_list = sorted(newNba2018['homeTeam'].unique())
newNba2018
# Drop the 2019-20 games that received no home spread.
newNba2019 = newNba2019.loc[newNba2019['homeSpread'].notnull()]
newNba2020
def add_points(df):
    """Duplicate the score columns as ``test_home_points`` / ``test_away_points``.

    ``home_points`` and ``away_points`` are copied under the new names, *df*
    is modified in place and the same object is returned.
    """
    scores = (df['home_points'], df['away_points'])
    df['test_home_points'], df['test_away_points'] = scores
    return df
# Add the test_home_points / test_away_points copies to every season.
newNba2018 = add_points(newNba2018)
newNba2019 = add_points(newNba2019)
newNba2020 = add_points(newNba2020)
def add_spread(df):
    """Add the point margin from each side's perspective.

    ``test_home_spread`` is ``test_home_points - test_away_points`` and
    ``test_away_spread`` is the reverse difference. *df* is modified in place
    and returned.
    """
    home_score, away_score = df['test_home_points'], df['test_away_points']
    df['test_home_spread'] = home_score - away_score
    df['test_away_spread'] = away_score - home_score
    return df
# Add the test_home_spread / test_away_spread margins to every season.
newNba2018 = add_spread(newNba2018)
newNba2019 = add_spread(newNba2019)
newNba2020 = add_spread(newNba2020)
def avg_previous_num_games(df, num_games, min_periods=3):
    """Replace every box-score stat with the team's trailing average.

    For each team in ``team_list`` the ``home_cols`` stats of its home games
    are replaced by the mean over its previous *num_games* home games, and the
    ``away_cols`` stats of its away games by the mean over its previous away
    games. Values are shifted by one game first, so a row never contains its
    own game's stats.

    *df* is modified in place. Values are written with ``df.loc[rows, cols]``;
    the earlier chained form ``df[col].loc[rows] = ...`` may assign to a
    temporary copy and leave *df* untouched.

    Args:
        df: frame with ``homeTeam`` / ``awayTeam`` and the stat columns, in
            chronological order.
        num_games: size of the rolling window.
        min_periods: minimum number of earlier games needed for an average;
            rows with fewer become null. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        A copy of *df* without the rows that contain any null value.
    """
    for side_col, stat_cols in (('homeTeam', home_cols), ('awayTeam', away_cols)):
        # Averages are fractional; cast up front so integer columns can hold them.
        df[stat_cols] = df[stat_cols].astype(float)
        for team in team_list:
            team_rows = df[side_col] == team
            if not team_rows.any():
                continue
            history = df.loc[team_rows, stat_cols]
            df.loc[team_rows, stat_cols] = (
                history.shift(1).rolling(num_games, min_periods=min_periods).mean()
            )
    return df.dropna()
# def avg_previous_num_games_neutral(df, num_games):
# ### This function changes each stat to be the average of the last num_games for each team, and shifts it one so it does not include the current stats and drops the first num_games that become null
# for team in team_list:
# team_games = df.loc[(df['homeTeam']==team) | (df['awayTeam']==team)]
# for i in range(len(desired_cols)):
# team_games[desired_cols[i]] = [0] * len(team_games['away_assists'])
# team_games[desired_cols[i]].loc[team_games['homeTeam']==team] = team_games[home_cols[i]].loc[team_games['homeTeam']==team]
# team_games[desired_cols[i]].loc[team_games['awayTeam']==team] = team_games[away_cols[i]].loc[team_games['awayTeam']==team]
# avg_team_games = team_games.shift(1).rolling(3, min_periods = 3).mean()
# for i in range(len(desired_cols)):
# df[home_cols[i]].loc[team_games['homeTeam']==team] = avg_team_games[desired_cols[i]].loc[team_games['homeTeam']==team]
# df[away_cols[i]].loc[team_games['awayTeam']==team] = avg_team_games[desired_cols[i]].loc[team_games['awayTeam']==team]
# Average each season over the previous 3 games on a copy, save the null-free
# rows, then read the saved files back as the final data sets.
# The saved frame is transformedNba*.dropna(), i.e. this relies on
# avg_previous_num_games having modified the frame it was given.
transformedNba2018 = newNba2018.copy()
averagedNba2018 = avg_previous_num_games(transformedNba2018, 3)
transformedNba2018.dropna().to_csv(r'Final_2018_2019.csv')
averagedNba2018
transformedNba2019 = newNba2019.copy()
averagedNba2019 = avg_previous_num_games(transformedNba2019, 3)
averagedNba2019
transformedNba2019.dropna().to_csv(r'Final_2019_2020.csv')
transformedNba2020 = newNba2020.copy()
averagedNba2020 = avg_previous_num_games(transformedNba2020, 3)
averagedNba2020
transformedNba2020.dropna().to_csv(r'Final_2020_2021_3Months.csv')
# Reload from disk (to_csv also wrote the index, which comes back as an extra column).
final2018 = pd.read_csv(r'Final_2018_2019.csv')
final2018
final2019 = pd.read_csv(r'Final_2019_2020.csv')
final2019
final2020 = pd.read_csv(r'Final_2020_2021_3Months.csv')
final2018
# Correlation of every column with homeWin, strongest positive first.
# NOTE(review): final2018 also holds text columns (team abbreviations); recent
# pandas versions may require corr(numeric_only=True) here - confirm.
winCorr = final2018.corr()['homeWin'].sort_values(ascending=False)
winCorr.head(10)
winCorr.tail(10)
## create list of features: the averaged stats plus the columns that were not averaged
excluded_from_avg = ['away_wins','away_losses', 'home_wins', 'home_losses', 'awayML','homeML', 'awaySpread', 'homeSpread']
features = away_cols + home_cols + excluded_from_avg
# train on the 2018-19 and 2019-20 seasons together
training_set = pd.concat([final2018, final2019])
training_set
## declare features
x = training_set[features]
## declare target variable
y = training_set['homeWin']
## scale the features
scaler = RobustScaler()
x_scaled = scaler.fit_transform(x)
#Fitting the PCA algorithm with our Data
pca = PCA().fit(x_scaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Explained Variance')
plt.show()
## keep 30 components
## NOTE(review): the fitted 30-component PCA object is not kept, only its output
x_pca = PCA(30).fit_transform(x_scaled)
# team_games = newNba2018.loc[(newNba2018['homeTeam'] == 'MIA') | (newNba2018['awayTeam'] == 'MIA')]
# avg_team_games = team_games.shift(1).rolling(3, min_periods = 3).mean()
# display(avg_team_games)
# display(team_games)
# # team_games['away_assists'].loc[team_games['awayTeam'] == 'MIA']
# newNba2018['away_assists'].loc[(newNba2018['homeTeam'] == 'MIA')]
# team_games.shape
# avg_team_games.shape
# home_games = newNba2018.loc[(newNba2018['homeTeam'] == 'MIA')]
# away_games = newNba2018.loc[(newNba2018['awayTeam'] == 'MIA')]
# team_games_raw = newNba2018.loc[(newNba2018['homeTeam'] == 'MIA') | (newNba2018['awayTeam'] == 'MIA')]
# team_games = pd.DataFrame()
# for i in range(len(desired_cols)):
# team_games[desired_cols[i]] = pd.concat([home_games[home_cols[i]], away_games[away_cols[i]]])
# team_games = team_games.sort_index()
# avg_team_games = team_games.shift(1).rolling(3, min_periods = 3).mean()
# display(avg_team_games)
# for index, row in newNba2018.iterrows():
# if index in avg_team_games.index:
newNba2018.shape
## create our model accuracy scorer: weighted-average F1
scorer = make_scorer(f1_score, pos_label=None, average='weighted')
# Each classifier below is scored with cross_val_score on the PCA features and
# the mean of the fold scores is printed.
# Our data is continuous, so we'll use the Gaussian Classifier
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
## print score
print('Average Gaussian NB F1: {}'.format((cross_val_score(gnb, x_pca, y, scoring=scorer)).mean()))
#Decision Tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
## print score
print('Average Decision Tree F1: {}'.format((cross_val_score(clf, x_pca, y, scoring=scorer)).mean()))
#Random forest
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
## print score
print('Average Random Forest F1 score: {}'.format((cross_val_score(rfc, x_pca, y, scoring=scorer)).mean()))
#KNN
from sklearn.neighbors import KNeighborsClassifier
neighbors = KNeighborsClassifier(n_neighbors=3)
## print score
print('Average KNN F1 score: {}'.format((cross_val_score(neighbors, x_pca, y, scoring=scorer)).mean()))
#Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
# Two models, each with a single hidden layer: one of 100 units, one of 1000.
mlp100 = MLPClassifier(hidden_layer_sizes=(100,))
mlp1000 = MLPClassifier(hidden_layer_sizes=(1000,))
## print score
print('Average MLP100 F1 score: {}'.format((cross_val_score(mlp100, x_pca, y, scoring=scorer)).mean()))
print('Average MLP1000 F1 score: {}'.format((cross_val_score(mlp1000, x_pca, y, scoring=scorer)).mean()))
## Hold-out set: the 2020-21 games. drop(..., errors='ignore') removes a stale
## home_spread_predictions column when one is present without failing when it
## is not, and returns a new frame so final2020 itself is left untouched.
test_set = final2020.drop(columns=['home_spread_predictions'], errors='ignore')
test_set
# declare features, then apply the scaler and a PCA fitted on the TRAINING data.
# Refitting either of them on the test games would place the test features in
# a different coordinate system from the one the classifier was trained in.
x_test = test_set[features]
x_test_scaled = scaler.transform(x_test)
train_pca = PCA(30).fit(x_scaled)
x_test_scaled_pca = train_pca.transform(x_test_scaled)
# fit to training data and predict for current season
test_set['predictions'] = gnb.fit(train_pca.transform(x_scaled), y).predict(x_test_scaled_pca)
## correct pct picks
len(test_set[test_set['predictions']==test_set['homeWin']])/len(test_set)
nba_test = test_set.copy()
## Profit of a $100 moneyline bet on the predicted winner of every game,
## computed for all rows at once. This avoids using index labels as .iloc
## positions and writing float payouts into an integer column.
picked_winner = nba_test['predictions'] == nba_test['homeWin']
# Moneyline of the side that actually won - the side a correct pick was on.
winner_ml = np.where(
    nba_test['homeWin'] == True, nba_test['homeML'], nba_test['awayML']
).astype(float)
with np.errstate(divide='ignore', invalid='ignore'):
    # favourite (negative line): a $100 stake wins 100 / (|line| / 100)
    favourite_profit = 100 / (np.abs(winner_ml) / 100)
# underdog (positive line): a $100 stake wins 100 * (line / 100)
underdog_profit = 100 * (winner_ml / 100)
winning_profit = np.select(
    [winner_ml < 0, winner_ml > 0, winner_ml == 0],
    [favourite_profit, underdog_profit, 100.0],  # a line of 0 is even odds
    default=0.0,  # missing line: nothing is recorded for the game
)
## an incorrect pick loses the stake
nba_test['moneyWon'] = np.where(picked_winner, winning_profit, -100.0)
# create cumulative money won column
nba_test['cumTotalWon'] = nba_test['moneyWon'].cumsum()
# check work
nba_test[['predictions','homeWin','homeML','awayML','moneyWon','cumTotalWon']].head(10)
## plot the cumulative money won over time (x tick labels are hidden)
fig, ax = plt.subplots()
plt.plot(nba_test['date'], nba_test['cumTotalWon'])
plt.title('Cumulative Earnings with $100 bets')
plt.ylabel('Dollars Earned')
plt.xlabel('Time')
ax.tick_params(labelbottom=False)
print("The model's low point is {} dollars.".format(nba_test['cumTotalWon'].min()))
print("The model's high point is {} dollars.".format(nba_test['cumTotalWon'].max()))
print("The model has profited {} dollars this season.".format(nba_test['cumTotalWon'].iloc[-1]))
## top 5 wins: the largest payouts among bets that won more than $100
nba_test[['date','homeTeam','awayTeam','predictions','homeWin','homeML','awayML','moneyWon']].loc[nba_test['moneyWon'] > 100].sort_values(by='moneyWon',ascending=False).head()
# start again from a fresh copy of the test set for the regression experiments
nba_test = test_set.copy()
def fixSpread(nba_test):
    """Swap the ``homeSpread`` and ``awaySpread`` columns of *nba_test*.

    The frame is modified in place and returned.
    """
    home_line, away_line = nba_test['homeSpread'].copy(), nba_test['awaySpread'].copy()
    nba_test['homeSpread'], nba_test['awaySpread'] = away_line, home_line
    return nba_test
# Swap the home and away spread columns of the test set.
nba_test = fixSpread(nba_test)
nba_test
def score_linear_regression_test(test_var):
    """Predict *test_var* for the ``nba_test`` games with a linear regression.

    The ``features`` columns of ``training_set`` are scaled with a
    ``RobustScaler`` and reduced to 30 principal components, and a
    ``LinearRegression`` is fitted on them against ``training_set[test_var]``.
    The test games are projected with the SAME fitted scaler and PCA;
    fitting separate ones on the test games would put them in a different
    coordinate system from the one the regression was trained in.

    Args:
        test_var: name of the target column. Predictions are stored in
            ``home_score_predictions`` when the name contains ``'home'``,
            otherwise in ``away_score_predictions`` when it contains
            ``'away'``.

    Returns:
        The module-level ``nba_test`` frame, which is modified in place.
    """
    scaler = RobustScaler()
    pca = PCA(30)
    train_features = pca.fit_transform(scaler.fit_transform(training_set[features]))
    test_features = pca.transform(scaler.transform(nba_test[features]))
    model = LinearRegression().fit(train_features, training_set[test_var])
    y_pred = model.predict(test_features)
    if 'home' in test_var:
        nba_test['home_score_predictions'] = y_pred
    elif 'away' in test_var:
        nba_test['away_score_predictions'] = y_pred
    return nba_test
# Predict both teams' scores; each call stores its own column on nba_test.
score_linear_regression_test('test_home_points')
score_linear_regression_test('test_away_points')
# Add predicted home spread column, total, winner
predicted_home = nba_test['home_score_predictions']
predicted_away = nba_test['away_score_predictions']
nba_test['score_spread_predictions'] = predicted_home - predicted_away
nba_test['total_score_predictions'] = predicted_home + predicted_away
# the home team is the predicted winner when its predicted margin is positive
winners = [margin > 0 for margin in nba_test['score_spread_predictions']]
nba_test['score_homeWin_predictions'] = winners
nba_test
# share of games whose predicted winner matches the actual one
len(nba_test[nba_test['score_homeWin_predictions']==nba_test['homeWin']])/len(nba_test)
## Profit of a $100 moneyline bet on the winner predicted from the two score
## regressions, computed for all rows at once. This avoids using index labels
## as .iloc positions and writing float payouts into an integer column.
picked_winner = nba_test['score_homeWin_predictions'] == nba_test['homeWin']
# Moneyline of the side that actually won - the side a correct pick was on.
winner_ml = np.where(
    nba_test['homeWin'] == True, nba_test['homeML'], nba_test['awayML']
).astype(float)
with np.errstate(divide='ignore', invalid='ignore'):
    # favourite (negative line): a $100 stake wins 100 / (|line| / 100)
    favourite_profit = 100 / (np.abs(winner_ml) / 100)
# underdog (positive line): a $100 stake wins 100 * (line / 100)
underdog_profit = 100 * (winner_ml / 100)
winning_profit = np.select(
    [winner_ml < 0, winner_ml > 0, winner_ml == 0],
    [favourite_profit, underdog_profit, 100.0],  # a line of 0 is even odds
    default=0.0,  # missing line: nothing is recorded for the game
)
## an incorrect pick loses the stake
nba_test['moneyWon'] = np.where(picked_winner, winning_profit, -100.0)
# create cumulative money won column
nba_test['cumTotalWon'] = nba_test['moneyWon'].cumsum()
## plot the cumulative money won over time for the score-based picks (x tick labels are hidden)
fig, ax = plt.subplots()
plt.plot(nba_test['date'], nba_test['cumTotalWon'])
plt.title('Cumulative Earnings with $100 bets')
plt.ylabel('Dollars Earned')
plt.xlabel('Time')
ax.tick_params(labelbottom=False)
def spread_linear_regression_test(y):
    """Predict the home margin of the ``nba_test`` games with a linear regression.

    The ``features`` columns of ``training_set`` are scaled with a
    ``RobustScaler`` and reduced to 30 principal components, and a
    ``LinearRegression`` is fitted on them against *y*. The test games are
    projected with the SAME fitted scaler and PCA; fitting separate ones on
    the test games would put them in a different coordinate system from the
    one the regression was trained in.

    The predictions are stored in ``nba_test['home_spread_predictions']`` and
    the fraction of games whose predicted margin exceeds ``-homeSpread`` is
    printed.

    Args:
        y: target values aligned with the rows of ``training_set``.
    """
    scaler = RobustScaler()
    pca = PCA(30)
    train_features = pca.fit_transform(scaler.fit_transform(training_set[features]))
    test_features = pca.transform(scaler.transform(nba_test[features]))
    y_pred = LinearRegression().fit(train_features, y).predict(test_features)
    nba_test['home_spread_predictions'] = y_pred
    beats_line = y_pred > -nba_test['homeSpread'].to_numpy()
    print(int(beats_line.sum()) / len(y_pred))
# Fit against the actual home margin of the training games; the predictions
# are stored on nba_test.
spread_linear_regression_test(training_set['test_home_spread'])
# print(linear_regression_test(training_set['test_away_spread']))
def spreadStrategy():
    """Score spread bets driven by ``home_spread_predictions``.

    Works on the module-level ``nba_test`` frame. ``moneyWon`` is reset to 0,
    then every game with a non-zero prediction is treated as a bet:

    * ``homeSpread == 0``: the bet wins when the sign of the prediction
      matches ``homeWin``.
    * otherwise: the bet wins when the prediction and the actual margin
      (``test_home_spread``) fall on the same side of ``-homeSpread``.

    A winning bet records 90, a losing bet -100.

    Returns:
        Winning bets divided by the total number of rows in ``nba_test``.
    """
    ## create an empty money won column
    nba_test['moneyWon'] = 0
    hits = 0
    for i in nba_test.index:
        predicted = nba_test['home_spread_predictions'].iloc[i]
        if predicted == 0:
            continue  # a predicted margin of exactly zero means no bet
        line = nba_test['homeSpread'].iloc[i]
        if line == 0:  # even odds: only the winner matters
            won_bet = (
                (predicted > 0 and nba_test['homeWin'].iloc[i] == True)
                or (predicted < 0 and nba_test['homeWin'].iloc[i] == False)
            )
        else:  # either side favored: compare against the line
            threshold = -line
            won_bet = (
                (predicted > threshold and nba_test['test_home_spread'].iloc[i] > threshold)
                or (predicted < threshold and nba_test['test_home_spread'].iloc[i] < threshold)
            )
        if won_bet:
            nba_test.at[i, 'moneyWon'] = 90
            hits += 1
        else:
            nba_test.at[i, 'moneyWon'] = -100
    return hits / len(nba_test)
# Score the spread bets; this fills nba_test['moneyWon'].
spreadStrategy()
# create cumulative money won column
nba_test['cumTotalWon'] = nba_test['moneyWon'].cumsum()
# check work
nba_test[['home_spread_predictions','test_home_spread', 'test_away_spread', 'homeSpread','awaySpread','moneyWon','cumTotalWon']].tail()
## plot the cumulative money won over time (x tick labels are hidden)
fig, ax = plt.subplots()
plt.plot(nba_test['date'], nba_test['cumTotalWon'])
plt.title('Cumulative Earnings with $100 bets')
plt.ylabel('Dollars Earned')
plt.xlabel('Time')
ax.tick_params(labelbottom=False)