import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Run to view results
# Load the 2017 MLS standings table (comma-separated) and rename the 'Club'
# column to 'club' so it shares a key name with the salary data.
MLSTable = pd.read_csv('MLS2017Table.csv', sep=',').rename(columns={'Club': 'club'})
"""
For those unfamiliar with soccer statistics the meanings are as follows:
P: Points
+/-: goal difference
M: matches played
W: wins
L: losses
F: goals scored
A: Goals conceded
"""
# Bare name: shows the frame when run as a notebook cell.
MLSTable
# Run to view results
# Load the salary table from 'mls-salaries-2017.csv' (default comma separator).
# Later cells read its 'club', 'first_name' and 'base_salary' columns.
salaries_df = pd.read_csv('mls-salaries-2017.csv')
# Bare name: shows the frame when run as a notebook cell; a no-op in a plain script.
salaries_df
# Run to view results
# Per-column count of missing entries across the whole salary table.
missing_values = salaries_df.isna().sum()

# Boolean masks marking the rows that lack a club or a first name; they are
# reused below for both the counts and the row selection.
club_is_missing = salaries_df['club'].isna()
first_name_is_missing = salaries_df['first_name'].isna()

missing_club = club_is_missing.sum()
missing_first_name = first_name_is_missing.sum()

# Report how much data needs to be filled in by hand.
print("Missing Values in 'club' column:", missing_club)
print("Missing Values in 'first_name' column:", missing_first_name)

# Show the affected rows so their index labels can be used for corrections.
missing_rows = salaries_df[club_is_missing | first_name_is_missing]
print("Rows with missing values in 'club' or 'first_name':")
print(missing_rows)
# Run to view results
# Manual fixes for rows with a missing 'club' or 'first_name'; the values were
# looked up by hand (simple web searches).
# Keys are row index labels of salaries_df written as strings (they are
# converted with int() before use, so "player_id" below is really a row
# label); each value maps a column name to the replacement for that row.
# NOTE(review): presumably these labels are the rows listed by the
# missing-value check -- confirm against its output.
corrections_dict = {
    '46': {'first_name': 'Vitor'},
    '265': {'first_name': 'Barreto'},
    '401': {'first_name': 'Ricardo'},
    '429': {'first_name': 'Pereira'},
    '614': {'club': 'TOR'},
    '615': {'club': 'ORL'},
}

# Write each correction into salaries_df in place. Looping over the
# column/value pairs handles 'club' and 'first_name' uniformly, so there is
# no need for one hard-coded branch per column.
for player_id, corrections in corrections_dict.items():
    row_label = int(player_id)
    for column, value in corrections.items():
        salaries_df.loc[row_label, column] = value

# Bare name: shows the corrected frame when run as a notebook cell.
salaries_df
# Run to view results
# Lookup table from the club abbreviations used in the salary data to full
# club names.
# NOTE(review): the later merge joins on 'club', so these full names must
# match the standings table's 'club' values exactly -- confirm against
# MLS2017Table.csv.
abbreviation_mapping = {
    'ATL': 'Atlanta United FC',
    'MTL': 'CF Montreal',
    'CHI': 'Chicago Fire',
    'COL': 'Colorado Rapids',
    'CLB': 'Columbus Crew',
    'DC': 'DC United',
    'DAL': 'FC Dallas',
    'HOU': 'Houston Dynamo',
    'LA': 'LA Galaxy',
    'MNUFC': 'Minnesota United',
    'NE': 'New England Revolution',
    'NYCFC': 'New York City FC',
    'NYRB': 'New York Red Bulls',
    'ORL': 'Orlando City',
    'PHI': 'Philadelphia Union',
    'POR': 'Portland Timbers',
    'RSL': 'Real Salt Lake',
    'SJ': 'San Jose Earthquakes',
    'SEA': 'Seattle Sounders FC',
    'KC': 'Sporting Kansas City',
    'TOR': 'Toronto FC',
    'VAN': 'Vancouver Whitecaps',
}

# Swap every abbreviation in 'club' for its full name; values that are not
# keys of the mapping are left unchanged by replace().
salaries_df['club'] = salaries_df['club'].replace(to_replace=abbreviation_mapping)

# Bare name: shows the frame when run as a notebook cell.
salaries_df
# Run to view results
# Mean base salary per club, with 'club' restored as a regular column.
average_salary_by_club = (
    salaries_df.groupby('club')['base_salary']
    .mean()
    .reset_index()
)

# Highest-paying clubs first; the aggregated column is renamed to
# 'average_salary' in the sorted copy only.
average_salary_by_club_sorted = (
    average_salary_by_club
    .sort_values(by='base_salary', ascending=False)
    .rename(columns={'base_salary': 'average_salary'})
)

# Bare name: shows the frame when run as a notebook cell.
average_salary_by_club_sorted
# Run to view results
from scipy.stats import pearsonr
# Join the per-club average salaries with the standings table on 'club'.
# merge() defaults to an inner join, so a club that appears in only one of the
# two frames is dropped from the result.
merged_performance_salary_df = average_salary_by_club_sorted.merge(MLSTable, on='club')

# Bare name: shows the merged frame when run as a notebook cell.
merged_performance_salary_df
# Run to view results
# Pearson correlation between each club's average salary and its points ('P').
salary_series = merged_performance_salary_df['average_salary']
points_series = merged_performance_salary_df['P']
correlation_coefficient, p_value = pearsonr(salary_series, points_series)

# Report the coefficient together with its p-value.
print(f"Pearson Correlation Coefficient between average salary and points: {correlation_coefficient}")
print(f"P-value: {p_value}")

# Scatter plot of the same two columns; scatterplot() returns the Axes it drew
# on, so the title and axis labels are set on that object in one call.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=merged_performance_salary_df, x='average_salary', y='P').set(
    title='Scatter Plot of Average Salary vs. Points',
    xlabel='Average Salary',
    ylabel='Points',
)
plt.show()
# Run to view results
# Pearson correlation between each club's average salary and its wins ('W').
correlation_coefficient, p_value = pearsonr(
    merged_performance_salary_df['average_salary'],
    merged_performance_salary_df['W'],
)

# Report the coefficient together with its p-value.
print(f"Pearson Correlation Coefficient between average salary and wins: {correlation_coefficient}")
print(f"P-value: {p_value}")

# Scatter plot of average salary against wins.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='average_salary', y='W', data=merged_performance_salary_df)
# Title previously read "Woms" (typo for "Wins").
plt.title('Scatter Plot of Average Salary vs. Wins')
plt.xlabel('Average Salary')
plt.ylabel('Wins')
plt.show()
# Run to view results
# Pearson correlation between each club's average salary and its goals
# scored ('F').
correlation_coefficient, p_value = pearsonr(
    merged_performance_salary_df['average_salary'],
    merged_performance_salary_df['F'],
)

# Report the coefficient together with its p-value.
print(f"Pearson Correlation Coefficient between average salary and goals scored: {correlation_coefficient}")
print(f"P-value: {p_value}")

# Scatter plot of average salary against goals scored.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='average_salary', y='F', data=merged_performance_salary_df)
plt.title('Scatter Plot of Average Salary vs. Goals Scored')
plt.xlabel('Average Salary')
# The y-axis shows column 'F' (goals scored); the label previously said
# 'Points', carried over from the points plot.
plt.ylabel('Goals Scored')
plt.show()
# Run to view results