import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
df_batting = pd.read_csv("Batting.csv")
df_salaries = pd.read_csv("Salaries.csv")
df_people = pd.read_csv("People.csv")
df_batting.head()
df_batting.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108789 entries, 0 to 108788
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 playerID 108789 non-null object
1 yearID 108789 non-null int64
2 stint 108789 non-null int64
3 teamID 108789 non-null object
4 lgID 108052 non-null object
5 G 108789 non-null int64
6 AB 108789 non-null int64
7 R 108789 non-null int64
8 H 108789 non-null int64
9 2B 108789 non-null int64
10 3B 108789 non-null int64
11 HR 108789 non-null int64
12 RBI 108033 non-null float64
13 SB 106421 non-null float64
14 CS 85248 non-null float64
15 BB 108789 non-null int64
16 SO 106689 non-null float64
17 IBB 72139 non-null float64
18 HBP 105973 non-null float64
19 SH 102721 non-null float64
20 SF 72686 non-null float64
21 GIDP 83348 non-null float64
dtypes: float64(9), int64(10), object(3)
memory usage: 18.3+ MB
df_salaries.head()
df_salaries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26428 entries, 0 to 26427
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 yearID 26428 non-null int64
1 teamID 26428 non-null object
2 lgID 26428 non-null object
3 playerID 26428 non-null object
4 salary 26428 non-null int64
dtypes: int64(2), object(3)
memory usage: 1.0+ MB
df_bat = df_batting.merge(df_people, on="playerID", how="left")
df_bat.head()
df_bat.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108789 entries, 0 to 108788
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 playerID 108789 non-null object
1 yearID 108789 non-null int64
2 stint 108789 non-null int64
3 teamID 108789 non-null object
4 lgID 108052 non-null object
5 G 108789 non-null int64
6 AB 108789 non-null int64
7 R 108789 non-null int64
8 H 108789 non-null int64
9 2B 108789 non-null int64
10 3B 108789 non-null int64
11 HR 108789 non-null int64
12 RBI 108033 non-null float64
13 SB 106421 non-null float64
14 CS 85248 non-null float64
15 BB 108789 non-null int64
16 SO 106689 non-null float64
17 IBB 72139 non-null float64
18 HBP 105973 non-null float64
19 SH 102721 non-null float64
20 SF 72686 non-null float64
21 GIDP 83348 non-null float64
22 birthYear 108644 non-null float64
23 birthMonth 108286 non-null float64
24 birthDay 107943 non-null float64
25 birthCountry 108725 non-null object
26 birthState 105780 non-null object
27 birthCity 108405 non-null object
28 deathYear 47462 non-null float64
29 deathMonth 47461 non-null float64
30 deathDay 47459 non-null float64
31 deathCountry 47438 non-null object
32 deathState 47198 non-null object
33 deathCity 47419 non-null object
34 nameFirst 108751 non-null object
35 nameLast 108789 non-null object
36 nameGiven 108751 non-null object
37 weight 107663 non-null float64
38 height 107735 non-null float64
39 bats 106877 non-null object
40 throws 107319 non-null object
41 debut 108782 non-null object
42 finalGame 108782 non-null object
43 retroID 108789 non-null object
44 bbrefID 108788 non-null object
dtypes: float64(17), int64(10), object(18)
memory usage: 38.2+ MB
df_bat = df_bat[["yearID", "playerID", "nameFirst", "nameLast", "nameGiven","lgID", "teamID",
"G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB"]]
df_salaries = df_salaries[["playerID", "yearID", "salary"]]
df_bat = df_bat.merge(df_salaries, on=["yearID", "playerID"], how="left")
df_bat.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108964 entries, 0 to 108963
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 yearID 108964 non-null int64
1 playerID 108964 non-null object
2 nameFirst 108926 non-null object
3 nameLast 108964 non-null object
4 nameGiven 108926 non-null object
5 lgID 108227 non-null object
6 teamID 108964 non-null object
7 G 108964 non-null int64
8 AB 108964 non-null int64
9 R 108964 non-null int64
10 H 108964 non-null int64
11 2B 108964 non-null int64
12 3B 108964 non-null int64
13 HR 108964 non-null int64
14 RBI 108208 non-null float64
15 SB 106596 non-null float64
16 CS 85423 non-null float64
17 BB 108964 non-null int64
18 SO 106864 non-null float64
19 IBB 72314 non-null float64
20 salary 28294 non-null float64
dtypes: float64(6), int64(9), object(6)
memory usage: 18.3+ MB
df_bat.corr()
AL_filter = df_bat["lgID"] == "AL"
df_AL = df_bat[AL_filter]
df_AL.corr()
df_AL["yearID"].value_counts()
px.scatter(df_AL, x="R", y="salary", animation_frame="yearID", animation_group="playerID", hover_name="playerID", color="teamID")
sns.scatterplot(data=df_AL, x="R", y="salary", hue="yearID")
sns.histplot(data= df_AL, x="R", kde=True)
sns.histplot(data= df_AL, x="salary", kde=True)
plt.figure(figsize=(25,15))
sns.heatmap(df_AL.corr(), annot=True, vmin=-1, vmax=1, cmap='RdBu_r')
#2016 filter
yr_filter = ((df_AL['yearID'] >= 2006) & (df_AL['yearID'] <= 2016))
df_l10 = df_AL[yr_filter]
plt.figure(figsize=(25,15))
sns.heatmap(df_l10.corr(), annot=True, vmin=-1, vmax=1, cmap='RdBu_r')
df_l10.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7408 entries, 87507 to 103005
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 yearID 7408 non-null int64
1 playerID 7408 non-null object
2 nameFirst 7408 non-null object
3 nameLast 7408 non-null object
4 nameGiven 7408 non-null object
5 lgID 7408 non-null object
6 teamID 7408 non-null object
7 G 7408 non-null int64
8 AB 7408 non-null int64
9 R 7408 non-null int64
10 H 7408 non-null int64
11 2B 7408 non-null int64
12 3B 7408 non-null int64
13 HR 7408 non-null int64
14 RBI 7408 non-null float64
15 SB 7408 non-null float64
16 CS 7408 non-null float64
17 BB 7408 non-null int64
18 SO 7408 non-null float64
19 IBB 7408 non-null float64
20 salary 4682 non-null float64
dtypes: float64(6), int64(9), object(6)
memory usage: 1.2+ MB
df_l10 = df_l10.dropna()
# baseline model
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
X = df_l10[["G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB"]]
y= df_l10['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
fitted_model = reg.fit()
fitted_model.summary()
# revised model
X = df_l10[["G", "H", "2B", "3B", "RBI", "SB", "CS", "BB", "IBB"]]
y= df_l10['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
fitted_model = reg.fit()
fitted_model.summary()
df_ALyr = df_AL.groupby(["yearID"])[["G", "AB", "H", "R", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "salary"]].sum()
df_ALyr = df_ALyr.reset_index()
df_ALyr.head()
X = df_ALyr[["G", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB"]]
y= df_ALyr['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
model = reg.fit()
model.summary()
X = df_ALyr[["2B", "3B", "RBI", "SB", "CS"]]
y= df_ALyr['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
model = reg.fit()
model.summary()