import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
db_path = '/work/basketball.sqlite'
con = sqlite3.connect(db_path) # create connection object to database
print("SQL database connected")
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", con)
print(table)
SQL database connected
name
0 Player
1 Team
2 Team_Attributes
3 Team_History
4 Player_Attributes
5 Game_Officials
6 Game_Inactive_Players
7 Team_Salary
8 Player_Salary
9 Draft
10 Draft_Combine
11 Player_Photos
12 Player_Bios
13 Game
14 News
15 News_Missing
game_df = pd.read_sql_query("SELECT * from game;", con)
game_df
GAME_IDobject
00202000030%
00202000050%
62377 others100%
SEASON_IDobject
0
0024600001
21946
1
0024600003
21946
2
0024600004
21946
3
0024600002
21946
4
0024600005
21946
5
0024600006
21946
6
0024600007
21946
7
0024600008
21946
8
0024600009
21946
9
0024600010
21946
year_df = pd.DataFrame(pd.DatetimeIndex(pd.to_datetime(game_df["GAME_DATE"])).year)
year_df = pd.DataFrame(year_df.GAME_DATE.value_counts()).reset_index(level=0).sort_values('index')
year_df.rename(columns={"index": "Year", "GAME_DATE": "Count"}, inplace=True)
year_df.plot("Year", "Count").set_title("Number of NBA Season Games Played vs. Year")
merge_df = game_df.copy()
merge_df["Year"] = pd.DataFrame(pd.DatetimeIndex(pd.to_datetime(game_df["GAME_DATE"])).year)
years = list(pd.DatetimeIndex(pd.to_datetime(game_df["GAME_DATE"])).year.unique())
teams_df = game_df["TEAM_ABBREVIATION_HOME"].unique()
num_teams = []
for year in years:
num_teams.append(len(merge_df[merge_df["Year"] == year]["TEAM_ABBREVIATION_HOME"].unique()))
pd.DataFrame({"Years": years, "Num Teams": num_teams}).plot("Years", "Num Teams").set_title("Number of NBA Teams vs. Year")
game_df["PF_HOME"].mean()
important_columns = ["GAME_ID", "LAST_GAME_HOME_TEAM_POINTS", "LAST_GAME_VISITOR_TEAM_POINTS", "WL_HOME"]
home_columns = ["OREB_HOME", 'OREB_HOME','DREB_HOME', 'REB_HOME', 'AST_HOME', 'STL_HOME', 'BLK_HOME', 'TOV_HOME', 'PF_HOME']
away_columns = [column.replace("HOME", "AWAY") for column in home_columns]
important_columns += home_columns + away_columns
important_columns
extraction_df = game_df[important_columns]
extraction_df.isna().sum()
extraction_df = extraction_df.dropna()
len(extraction_df)
wins = len(extraction_df[extraction_df.WL_HOME == 'W'])
losses = len(extraction_df[extraction_df.WL_HOME == 'L'])
home_won = (wins)/(wins + losses)
home_won
extraction_df.columns
blk_winner_home = extraction_df[extraction_df.WL_HOME == 'W'].BLK_HOME.astype(float).dropna().sum()
blk_winner_away = extraction_df[extraction_df.WL_HOME == 'L'].BLK_AWAY.astype(float).dropna().sum()
blk_winner = (blk_winner_home + blk_winner_away)/len(extraction_df)
print('Winners average blocks:', blk_winner)
Winners average blocks: 5.472943881794609
blk_loser_home = extraction_df[extraction_df.WL_HOME == 'L'].BLK_HOME.astype(float).dropna().sum()
blk_loser_away = extraction_df[extraction_df.WL_HOME == 'W'].BLK_AWAY.astype(float).dropna().sum()
blk_loser = (blk_loser_home + blk_loser_away)/len(extraction_df)
print('Losers average blocks', blk_loser)
Losers average blocks 4.563554968442683
reb_winner_home = extraction_df[extraction_df.WL_HOME == 'W'].REB_HOME.astype(float).dropna().sum()
reb_winner_away = extraction_df[extraction_df.WL_HOME == 'L'].REB_AWAY.astype(float).dropna().sum()
reb_winner = (reb_winner_home + reb_winner_away)/len(extraction_df)
print('Winners average rebounds:', reb_winner)
Winners average rebounds: 44.38035128920202
reb_loser_home = extraction_df[extraction_df.WL_HOME == 'L'].REB_HOME.astype(float).dropna().sum()
reb_loser_away = extraction_df[extraction_df.WL_HOME == 'W'].REB_AWAY.astype(float).dropna().sum()
reb_loser = (reb_loser_home + reb_loser_away)/len(extraction_df)
print('Losers average rebounds:', reb_loser)
Losers average rebounds: 40.91565145065805
tov_winner_home = extraction_df[extraction_df.WL_HOME == 'W'].TOV_HOME.astype(float).dropna().sum()
tov_winner_away = extraction_df[extraction_df.WL_HOME == 'L'].TOV_AWAY.astype(float).dropna().sum()
tov_winner = (tov_winner_home + tov_winner_away)/len(extraction_df)
print('Winners average turnovers:', tov_winner)
Winners average turnovers: 14.484368119771027
tov_loser_home = extraction_df[extraction_df.WL_HOME == 'L'].TOV_HOME.astype(float).dropna().sum()
tov_loser_away = extraction_df[extraction_df.WL_HOME == 'W'].TOV_AWAY.astype(float).dropna().sum()
tov_loser = (tov_loser_home + tov_loser_away)/len(extraction_df)
print('Losers average turnovers:', tov_loser)
Losers average turnovers: 15.435148490630658
stl_winner_home = extraction_df[extraction_df.WL_HOME == 'W'].STL_HOME.astype(float).dropna().sum()
stl_winner_away = extraction_df[extraction_df.WL_HOME == 'L'].STL_AWAY.astype(float).dropna().sum()
stl_winner = (stl_winner_home + stl_winner_away)/len(extraction_df)
print('Winners average steals:', stl_winner)
Winners average steals: 8.367826214589755
stl_loser_home = extraction_df[extraction_df.WL_HOME == 'L'].STL_HOME.astype(float).dropna().sum()
stl_loser_away = extraction_df[extraction_df.WL_HOME == 'W'].STL_AWAY.astype(float).dropna().sum()
stl_loser = (stl_loser_home + stl_loser_away)/len(extraction_df)
print('Losers average steals:', stl_loser)
Losers average steals: 7.499437350163903
pf_winner_home = extraction_df[extraction_df.WL_HOME == 'W'].PF_HOME.astype(float).dropna().sum()
pf_winner_away = extraction_df[extraction_df.WL_HOME == 'L'].PF_AWAY.astype(float).dropna().sum()
pf_winner = (pf_winner_home + pf_winner_away)/len(extraction_df)
print('Winners average personal fouls:', pf_winner)
Winners average personal fouls: 21.115220901218258
pf_loser_home = extraction_df[extraction_df.WL_HOME == 'L'].PF_HOME.astype(float).dropna().sum()
pf_loser_away = extraction_df[extraction_df.WL_HOME == 'W'].PF_AWAY.astype(float).dropna().sum()
pf_loser = (pf_loser_home + pf_loser_away)/len(extraction_df)
print('Losers average personal fouls:', pf_loser)
Losers average personal fouls: 22.492122902294632
blk_dif = round((blk_winner - blk_loser), 1)
reb_dif = round((reb_winner - reb_loser), 1)
tov_dif = round((tov_loser - tov_winner), 1)
stl_dif = round((stl_winner - stl_loser), 1)
pf_dif = round((pf_loser - pf_winner), 1)
print('The winner averages:', blk_dif, 'more blocks,', reb_dif, 'more rebounds, ', tov_dif, 'fewer turnovers, ', stl_dif, 'more steals, ', pf_dif, 'fewer personal fouls.')
The winner averages: 0.9 more blocks, 3.5 more rebounds, 1.0 fewer turnovers, 0.9 more steals, 1.4 fewer personal fouls.