import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import eli5
from eli5.sklearn import PermutationImportance
from collections import Counter
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
import plotly
sns.set_style('darkgrid')
# Read the raw player table, then look at its first rows (transposed so the
# many columns read top to bottom), its column labels and its dtype summary.
df = pd.read_csv('../input/data.csv')
df.head().transpose()
df.columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18207 entries, 0 to 18206
Data columns (total 89 columns):
Unnamed: 0 18207 non-null int64
ID 18207 non-null int64
Name 18207 non-null object
Age 18207 non-null int64
Photo 18207 non-null object
Nationality 18207 non-null object
Flag 18207 non-null object
Overall 18207 non-null int64
Potential 18207 non-null int64
Club 17966 non-null object
Club Logo 18207 non-null object
Value 18207 non-null object
Wage 18207 non-null object
Special 18207 non-null int64
Preferred Foot 18159 non-null object
International Reputation 18159 non-null float64
Weak Foot 18159 non-null float64
Skill Moves 18159 non-null float64
Work Rate 18159 non-null object
Body Type 18159 non-null object
Real Face 18159 non-null object
Position 18147 non-null object
Jersey Number 18147 non-null float64
Joined 16654 non-null object
Loaned From 1264 non-null object
Contract Valid Until 17918 non-null object
Height 18159 non-null object
Weight 18159 non-null object
LS 16122 non-null object
ST 16122 non-null object
RS 16122 non-null object
LW 16122 non-null object
LF 16122 non-null object
CF 16122 non-null object
RF 16122 non-null object
RW 16122 non-null object
LAM 16122 non-null object
CAM 16122 non-null object
RAM 16122 non-null object
LM 16122 non-null object
LCM 16122 non-null object
CM 16122 non-null object
RCM 16122 non-null object
RM 16122 non-null object
LWB 16122 non-null object
LDM 16122 non-null object
CDM 16122 non-null object
RDM 16122 non-null object
RWB 16122 non-null object
LB 16122 non-null object
LCB 16122 non-null object
CB 16122 non-null object
RCB 16122 non-null object
RB 16122 non-null object
Crossing 18159 non-null float64
Finishing 18159 non-null float64
HeadingAccuracy 18159 non-null float64
ShortPassing 18159 non-null float64
Volleys 18159 non-null float64
Dribbling 18159 non-null float64
Curve 18159 non-null float64
FKAccuracy 18159 non-null float64
LongPassing 18159 non-null float64
BallControl 18159 non-null float64
Acceleration 18159 non-null float64
SprintSpeed 18159 non-null float64
Agility 18159 non-null float64
Reactions 18159 non-null float64
Balance 18159 non-null float64
ShotPower 18159 non-null float64
Jumping 18159 non-null float64
Stamina 18159 non-null float64
Strength 18159 non-null float64
LongShots 18159 non-null float64
Aggression 18159 non-null float64
Interceptions 18159 non-null float64
Positioning 18159 non-null float64
Vision 18159 non-null float64
Penalties 18159 non-null float64
Composure 18159 non-null float64
Marking 18159 non-null float64
StandingTackle 18159 non-null float64
SlidingTackle 18159 non-null float64
GKDiving 18159 non-null float64
GKHandling 18159 non-null float64
GKKicking 18159 non-null float64
GKPositioning 18159 non-null float64
GKReflexes 18159 non-null float64
Release Clause 16643 non-null object
dtypes: float64(38), int64(6), object(45)
memory usage: 12.4+ MB
# Summary statistics, one row per numeric column.
df.describe().T
# Remove the CSV's unnamed counter column and the Photo / Flag / Club Logo
# columns, none of which are read by the exploration that follows.
df.drop(['Unnamed: 0', 'Photo', 'Flag', 'Club Logo'], axis=1, inplace=True)
# Bar chart of non-null counts per column. The frame is passed as-is rather
# than through df.sample(18207): the hard-coded row count raises as soon as
# the frame has fewer rows, and shuffling rows cannot change per-column counts.
msno.bar(df, figsize=(28, 10), color='red')
df.isnull().sum()
# Gather the index labels of the rows with no Height and of the rows with no
# Weight, then report whether the two lists are identical.
missing_height = df.index[df['Height'].isnull()].tolist()
missing_weight = df.index[df['Weight'].isnull()].tolist()
print('They are same' if missing_height == missing_weight else 'They are different')
They are same
# Remove the rows that have no Height. missing_height already holds index
# *labels*, so it is handed to drop() directly; indexing df.index with those
# labels would treat them as positions, which is only right for a 0..n-1 index.
df.drop(missing_height, inplace=True)
df.isnull().sum()
# These three columns are discarded entirely rather than imputed.
df.drop(['Loaned From', 'Release Clause', 'Joined'], axis=1, inplace=True)
# How many distinct nationalities exist, and which five have the most players.
nationalities = df['Nationality']
print('Total number of countries : {0}'.format(nationalities.nunique()))
print(nationalities.value_counts().head(5))
print('--' * 40)
print("\nEuropean Countries have most players")
Total number of countries : 164
England 1657
Germany 1195
Spain 1071
Argentina 936
France 911
Name: Nationality, dtype: int64
--------------------------------------------------------------------------------
European Countries have most players
# How many distinct clubs exist, and which five have the most players.
clubs = df['Club']
print('Total number of clubs : {0}'.format(clubs.nunique()))
print(clubs.value_counts().head(5))
Total number of clubs : 651
Manchester United 33
Southampton 33
Chelsea 33
Eintracht Frankfurt 33
Manchester City 33
Name: Club, dtype: int64
# Player with maximum Potential and Overall Performance.
# The name is read through its column label instead of positional [1] on the
# row, so the lookup no longer depends on 'Name' being the second column.
print('Maximum Potential : ' + str(df.loc[df['Potential'].idxmax(), 'Name']))
print('Maximum Overall Perforamnce : ' + str(df.loc[df['Overall'].idxmax(), 'Name']))
Maximum Potential : K. Mbappé
Maximum Overall Perforamnce : L. Messi
# Skill-attribute columns for which the top-rated player is reported.
pr_cols = ['Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
           'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
           'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
           'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
           'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
           'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
           'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']
print('BEST IN DIFFERENT ASPECTS :')
print('_________________________\n\n')
# For each attribute print the name on the row holding the maximum value
# (idxmax returns the first such row when several tie). The name is fetched by
# column label rather than by position 1 of the row.
for skill in pr_cols:
    print('Best {0} : {1}'.format(skill, df.loc[df[skill].idxmax(), 'Name']))
BEST IN DIFFERENT ASPECTS :
_________________________
Best Crossing : K. De Bruyne
Best Finishing : L. Messi
Best HeadingAccuracy : Naldo
Best ShortPassing : L. Modrić
Best Volleys : E. Cavani
Best Dribbling : L. Messi
Best Curve : Quaresma
Best FKAccuracy : L. Messi
Best LongPassing : T. Kroos
Best BallControl : L. Messi
Best Acceleration : Douglas Costa
Best SprintSpeed : K. Mbappé
Best Agility : Neymar Jr
Best Reactions : Cristiano Ronaldo
Best Balance : Bernard
Best ShotPower : Cristiano Ronaldo
Best Jumping : Cristiano Ronaldo
Best Stamina : N. Kanté
Best Strength : A. Akinfenwa
Best LongShots : L. Messi
Best Aggression : B. Pearson
Best Interceptions : N. Kanté
Best Positioning : Cristiano Ronaldo
Best Vision : L. Messi
Best Penalties : M. Balotelli
Best Composure : L. Messi
Best Marking : A. Barzagli
Best StandingTackle : G. Chiellini
Best SlidingTackle : Sergio Ramos
Best GKDiving : De Gea
Best GKHandling : J. Oblak
Best GKKicking : M. Neuer
Best GKPositioning : G. Buffon
Best GKReflexes : De Gea
# Cleaning some of the values so that we can interpret them
def value_to_int(df_value):
    """Convert a money string such as '€110.5M' or '€60K' into a number.

    The first character is treated as a currency symbol and discarded. A
    trailing 'M' scales the amount by 1,000,000 and a trailing 'K' by 1,000;
    an amount without either suffix is taken at face value (previously its
    last digit was cut off, so '€500' became 50.0).

    Args:
        df_value: The money string to convert.

    Returns:
        The amount as a float, or the integer 0 when the numeric part cannot
        be parsed.
    """
    multipliers = {'M': 1000000, 'K': 1000}
    amount = df_value[1:]  # strip the leading currency symbol
    suffix = amount[-1:]
    try:
        if suffix in multipliers:
            return float(amount[:-1]) * multipliers[suffix]
        return float(amount)
    except ValueError:
        # Best effort: unparseable amounts count as zero.
        return 0
# Swap the textual Value and Wage columns for their numeric equivalents,
# then preview the result.
for money_column in ('Value', 'Wage'):
    df[money_column] = df[money_column].apply(value_to_int)
df.head().T
# Top earners: names on the rows holding the largest Value and the largest
# Wage. The name is read by column label rather than by position 1 of the row,
# so the lookup does not depend on the column order.
print('Most valued player : ' + str(df.loc[df['Value'].idxmax(), 'Name']))
print('Highest earner : ' + str(df.loc[df['Wage'].idxmax(), 'Name']))
print("--" * 40)
print("\nTop Earners")
Most valued player : Neymar Jr
Highest earner : L. Messi
--------------------------------------------------------------------------------
Top Earners
# Age against Potential, drawn with small translucent red markers.
sns.jointplot(
    x=df['Age'],
    y=df['Potential'],
    joint_kws=dict(alpha=0.1, s=5, color='red'),
    marginal_kws=dict(color='red'),
)
# SprintSpeed against Age with a LOWESS-smoothed trend line.
sns.lmplot(
    x='Age',
    y='SprintSpeed',
    data=df,
    lowess=True,
    scatter_kws=dict(alpha=0.01, s=5, color='green'),
    line_kws=dict(color='red'),
)
# Dribbling against BallControl, one panel per value of Preferred Foot.
sns.lmplot(
    x='BallControl',
    y='Dribbling',
    data=df,
    col='Preferred Foot',
    scatter_kws=dict(alpha=0.1, color='orange'),
    line_kws=dict(color='red'),
)
# Hex-binned joint plot of Dribbling and Crossing.
sns.jointplot(x=df['Dribbling'], y=df['Crossing'], kind="hex", color="#4CB391");
# Age against Potential, coloured by market value in units of 100,000.
value = df['Value']
# NOTE(review): cmap is created here but not passed to the relplot call below.
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
# NOTE(review): `sizes` is given without a `size` variable; presumably it has
# no visible effect - confirm against the seaborn documentation.
sns.relplot(
    data=df,
    x="Age",
    y="Potential",
    hue=value / 100000,
    sizes=(40, 400),
    alpha=.5,
    height=6,
);
# Pairwise correlations of the numeric columns. The numeric subset is selected
# explicitly because df still holds text columns (Name, Club, ...), which
# recent pandas versions refuse to correlate instead of silently skipping.
corr = df.select_dtypes(include=np.number).corr()
# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is drawn only once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 15))
    ax = sns.heatmap(corr, mask=mask, square=True, linewidths=.8, cmap="YlGnBu")
plt.rcParams['figure.figsize'] = (20, 7)
# Newer Matplotlib releases ship the seaborn styles under a 'seaborn-v0_8-'
# prefix; use the original name when it is still available.
style_name = 'seaborn-dark-palette'
if style_name not in plt.style.available:
    style_name = 'seaborn-v0_8-dark-palette'
plt.style.use(style_name)
# Age distribution per Overall score, split by preferred foot. x and y are
# passed by keyword because current seaborn no longer accepts them positionally.
sns.boxenplot(x=df['Overall'], y=df['Age'], hue=df['Preferred Foot'], palette='rocket')
plt.title('Comparison of Overall Scores and age wrt Preferred foot', fontsize=20)
plt.show()
# Pairwise scatter matrix over a handful of attributes, coloured and marked
# by preferred foot.
cols = [
    'Age', 'Overall', 'Potential',
    'Acceleration', 'SprintSpeed', "Agility", "Stamina", 'Strength',
    'Preferred Foot',
]
df_small = df.loc[:, cols]
df_small.head()
sns.pairplot(
    df_small,
    hue='Preferred Foot',
    palette=["black", "red"],
    markers=['^', 'v'],
    plot_kws={'s': 50, 'alpha': 0.8},
)
# Start again from the raw file for the modelling part.
df = pd.read_csv('../input/data.csv')
# DROP UNNECESSARY VALUES
# The per-position rating columns run contiguously from 'LS' to 'RB' in the
# file; selecting them by label replaces the former magic slice columns[28:54].
drop_cols = df.loc[:, 'LS':'RB'].columns
df = df.drop(drop_cols, axis=1)
# Identifiers, image columns, free-text and money/contract columns that are
# not used as model features.
df = df.drop(['Unnamed: 0', 'ID', 'Photo', 'Flag', 'Club Logo', 'Jersey Number',
              'Joined', 'Special', 'Loaned From', 'Body Type', 'Release Clause',
              'Weight', 'Height', 'Contract Valid Until', 'Wage', 'Value',
              'Name', 'Club'], axis=1)
# Keep only rows that are complete in every remaining column.
df = df.dropna()
df.head()
# Turn Real Face into a binary indicator variable
def face_to_num(df):
    """Return 1 when the row's 'Real Face' entry equals 'Yes', otherwise 0.

    Despite its name, `df` is a single row (anything indexable by column
    label), as supplied by DataFrame.apply(..., axis=1).
    """
    return 1 if df['Real Face'] == 'Yes' else 0
# Turn Preferred Foot into a binary indicator variable
def right_footed(df):
    """Return 1 when the row's 'Preferred Foot' entry equals 'Right', otherwise 0.

    Despite its name, `df` is a single row (anything indexable by column
    label), as supplied by DataFrame.apply(..., axis=1).
    """
    return 1 if df['Preferred Foot'] == 'Right' else 0
# Create a simplified position variable to account for all player positions
def simple_position(df):
    """Collapse the row's detailed 'Position' code into a coarse group.

    Groups are 'GK', 'DF', 'DM', 'MF', 'AM' and 'ST'. A code that belongs to
    none of them is returned unchanged. `df` is a single row, as supplied by
    DataFrame.apply(..., axis=1).
    """
    groups = (
        ('GK', ('GK',)),
        ('DF', ('RB', 'LB', 'CB', 'LCB', 'RCB', 'RWB', 'LWB')),
        ('DM', ('LDM', 'CDM', 'RDM')),
        ('MF', ('LM', 'LCM', 'CM', 'RCM', 'RM')),
        ('AM', ('LAM', 'CAM', 'RAM', 'LW', 'RW')),
        ('ST', ('RS', 'ST', 'LS', 'CF', 'LF', 'RF')),
    )
    position = df['Position']
    for group, members in groups:
        if position in members:
            return group
    # Unknown code: hand it back untouched.
    return df.Position
# Count players per nationality and keep, as the "major nations", every
# nationality that has more than 250 players in the dataset.
nat_counts = df['Nationality'].value_counts()
nat_list = nat_counts.index[nat_counts > 250].tolist()
# Replace Nationality with a binary indicator variable for 'Major Nation'
def major_nation(df):
    """Return 1 when the row's Nationality is listed in `nat_list`, otherwise 0.

    `df` is a single row, as supplied by DataFrame.apply(..., axis=1);
    `nat_list` is the module-level list of major nations.
    """
    return int(df.Nationality in nat_list)
# Work on a separate copy so the derived columns are not written into df.
df1 = df.copy()
# Derived columns, each computed row by row with the helper of the same purpose.
derived_columns = (
    ('Real_Face', face_to_num),
    ('Right_Foot', right_footed),
    ('Simple_Position', simple_position),
    ('Major_Nation', major_nation),
)
for new_column, row_function in derived_columns:
    df1[new_column] = df1.apply(row_function, axis=1)
# Split "Work Rate" once on "/ " into its first and second parts.
tempwork = df1["Work Rate"].str.split("/ ", n=1, expand=True)
df1["WorkRate1"], df1["WorkRate2"] = tempwork[0], tempwork[1]
# The source columns have been replaced by the derived ones above.
df1 = df1.drop(
    ['Work Rate', 'Preferred Foot', 'Real Face', 'Position', 'Nationality'],
    axis=1,
)
df1.head()
# Use Overall as the regression target; every other column is a feature.
target = df1.Overall
df2 = df1.drop(['Overall'], axis=1)
# Splitting into test and train (80% / 20%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df2, target, test_size=0.2)
# One Hot Encoding. The test frame is encoded separately and then aligned to
# the training columns: a category absent from one split would otherwise give
# the two frames different columns and break model.predict.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
print(X_test.shape, X_train.shape)
print(y_test.shape, y_train.shape)
(3630, 54) (14517, 54)
(3630,) (14517,)
# Fit an ordinary linear regression on the training split and predict the
# held-out split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Report R^2 and the root of the mean squared error on the held-out split.
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('r2 score: ' + str(r2))
print('RMSE : ' + str(rmse))
r2 score: 0.9271870516548889
RMSE : 1.841233917704027
# Permutation importance of each feature for the fitted model, measured on
# the held-out split and rendered as an eli5 weights table.
perm = PermutationImportance(model, random_state=1)
perm = perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_test.columns))
# Top 3 important features are Potential, Age & Reactions
# Visualising the results: predicted against actual Overall with a fitted line.
plt.figure(figsize=(18, 10))
# x and y are passed by keyword because current seaborn no longer accepts them
# positionally; the marker edge width is a number rather than the string '0.7'.
sns.regplot(
    x=predictions,
    y=y_test,
    scatter_kws={'color': 'red', 'edgecolor': 'blue', 'linewidth': 0.7},
    line_kws={'color': 'black', 'alpha': 0.5},
)
plt.xlabel('Predictions')
plt.ylabel('Overall')
plt.title("Linear Prediction of Player Rating")
plt.show()