import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the three source tables from the working directory.
df_batting = pd.read_csv("Batting.csv")
df_salaries = pd.read_csv("Salaries.csv")
df_people = pd.read_csv("People.csv")
df_batting.head()
df_batting.info()
df_salaries.head()
df_salaries.info()

# Attach the People columns to every batting row. A left join keeps batting
# rows even when no People record matches the playerID.
df_bat = df_batting.merge(df_people, on="playerID", how="left")
df_bat.head()
df_bat.info()

# Keep only the identifiers, name columns, league/team and batting statistics.
df_bat = df_bat[[
    "yearID", "playerID", "nameFirst", "nameLast", "nameGiven", "lgID", "teamID",
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB",
]]
df_salaries = df_salaries[["playerID", "yearID", "salary"]]

# NOTE(review): the join key is (yearID, playerID) only. If either table holds
# more than one row for the same player and year, the merge repeats the salary
# on each matching batting row -- confirm this is intended before summing it.
df_bat = df_bat.merge(df_salaries, on=["yearID", "playerID"], how="left")
df_bat.info()

# Correlate numeric columns only: the frame still contains string columns
# (IDs, names, league, team), and recent pandas versions raise when corr()
# is asked to process them.
df_bat.select_dtypes(include="number").corr()
# Restrict the analysis to rows whose league ID is "AL".
AL_filter = df_bat["lgID"] == "AL"
df_AL = df_bat[AL_filter]

# Correlate numeric columns only; df_AL still carries string columns that
# recent pandas versions refuse to pass through corr().
df_AL.select_dtypes(include="number").corr()
df_AL["yearID"].value_counts()

# Animated runs-vs-salary scatter, one frame per yearID value.
px.scatter(df_AL, x="R", y="salary", animation_frame="yearID", animation_group="playerID", hover_name="playerID", color="teamID")

# seaborn's axes-level functions draw on the current axes, so each plot gets
# its own figure; otherwise they are layered on top of one another when the
# statements run back to back.
plt.figure()
sns.scatterplot(data=df_AL, x="R", y="salary", hue="yearID")
plt.figure()
sns.histplot(data=df_AL, x="R", kde=True)
plt.figure()
sns.histplot(data=df_AL, x="salary", kde=True)

# Annotated correlation heatmap with the colour scale pinned to [-1, 1].
plt.figure(figsize=(25, 15))
sns.heatmap(
    df_AL.select_dtypes(include="number").corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    cmap='RdBu_r',
)
# Keep the seasons from 2006 through 2016 (both bounds inclusive).
yr_filter = ((df_AL['yearID'] >= 2006) & (df_AL['yearID'] <= 2016))
df_l10 = df_AL[yr_filter]

# Correlation heatmap for the selected seasons. Only numeric columns are
# correlated because the frame still contains string columns, which recent
# pandas versions reject in corr().
plt.figure(figsize=(25, 15))
sns.heatmap(
    df_l10.select_dtypes(include="number").corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    cmap='RdBu_r',
)
df_l10.info()

# Drop a row only when a batting statistic or the salary is missing. A bare
# dropna() would also discard rows whose only gap is in a name column, which
# the numeric analysis never reads.
df_l10 = df_l10.dropna(subset=[
    "G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB",
    "salary",
])
# baseline model
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
# Baseline fit: ordinary least squares of salary on all thirteen batting
# columns of df_l10, with an intercept column added by statsmodels.
y = df_l10["salary"]
X = sm.add_constant(
    df_l10[[
        "G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB",
    ]]
)
reg = sm.OLS(endog=y, exog=X)
fitted_model = reg.fit()
fitted_model.summary()

# Revised fit: same target and data, but without the AB, R, HR and SO
# predictors. X, y, reg and fitted_model are rebound to this second model.
y = df_l10["salary"]
X = sm.add_constant(
    df_l10[["G", "H", "2B", "3B", "RBI", "SB", "CS", "BB", "IBB"]]
)
reg = sm.OLS(endog=y, exog=X)
fitted_model = reg.fit()
fitted_model.summary()
# Season-level totals for the AL rows.
#
# Rows without a salary are removed before aggregating. sum() skips missing
# values, so a season with no salary data at all would otherwise get a total
# salary of 0, and seasons with partial data would add up batting statistics
# from players whose salary is absent from the total. Either case would skew
# the regressions below.
df_ALyr = (
    df_AL.dropna(subset=["salary"])
    .groupby(["yearID"])[
        ["G", "AB", "H", "R", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "salary"]
    ]
    .sum()
)
df_ALyr = df_ALyr.reset_index()
df_ALyr.head()

# First season-level model: total salary regressed on eleven summed batting
# statistics, with an intercept column added by statsmodels.
X = df_ALyr[["G", "H", "2B", "3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB"]]
y = df_ALyr['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
model = reg.fit()
model.summary()

# Reduced season-level model using five predictors; X, y, reg and model are
# rebound to this second fit.
X = df_ALyr[["2B", "3B", "RBI", "SB", "CS"]]
y = df_ALyr['salary']
X = sm.add_constant(X)
reg = sm.OLS(y, X)
model = reg.fit()
model.summary()