Final Project

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

Run to view results

# Load the 2017 MLS standings table; the file is comma-separated, which is
# the read_csv default, so no explicit separator is needed.
MLSTable = pd.read_csv('MLS2017Table.csv')

# Use a lower-case 'club' column name so the standings share a key with the
# salary data.
MLSTable = MLSTable.rename(columns={'Club': 'club'})

# Column legend for readers unfamiliar with soccer statistics:
#   P   : points
#   +/- : goal difference
#   M   : matches played
#   W   : wins
#   L   : losses
#   F   : goals scored
#   A   : goals conceded

# Bare expression: displays the table when run as a notebook cell.
MLSTable

Run to view results

# Load the 2017 salary records from disk.
salaries_df = pd.read_csv('mls-salaries-2017.csv')

# Bare expression: displays the frame when run as a notebook cell.
salaries_df

Run to view results

# Per-column count of missing entries across the whole salary frame.
missing_values = salaries_df.isnull().sum()

# Boolean masks for the two columns inspected here; built once and reused
# for both the counts and the row filter below.
club_is_missing = salaries_df['club'].isnull()
first_name_is_missing = salaries_df['first_name'].isnull()

missing_club = club_is_missing.sum()
missing_first_name = first_name_is_missing.sum()

# Report how many entries need to be filled in.
print("Missing Values in 'club' column:", missing_club)
print("Missing Values in 'first_name' column:", missing_first_name)

# Show every row that lacks a club or a first name.
missing_rows = salaries_df[club_is_missing | first_name_is_missing]
print("Rows with missing values in 'club' or 'first_name':")
print(missing_rows)

Run to view results

# Manually researched (web search) replacements for the missing entries.
# Keys are row index labels of salaries_df, stored as strings; each value
# maps a column name to the value that should be written into that row.
corrections_dict = {
    '46': {'first_name': 'Vitor'},
    '265': {'first_name': 'Barreto'},
    '401': {'first_name': 'Ricardo'},
    '429': {'first_name': 'Pereira'},
    '614': {'club': 'TOR'},
    '615': {'club': 'ORL'},
}

# Write each correction into the frame. Only the 'club' and 'first_name'
# columns are ever patched, in that order.
for player_id, corrections in corrections_dict.items():
    row_label = int(player_id)  # .loc needs the integer index label
    for column in ('club', 'first_name'):
        if column in corrections:
            salaries_df.loc[row_label, column] = corrections[column]

# Bare expression: displays the patched frame when run as a notebook cell.
salaries_df

Run to view results

# Lookup from the club abbreviations used in the salary data to full club
# names.
abbreviation_mapping = {
    'ATL': 'Atlanta United FC',
    'MTL': 'CF Montreal',
    'CHI': 'Chicago Fire',
    'COL': 'Colorado Rapids',
    'CLB': 'Columbus Crew',
    'DC': 'DC United',
    'DAL': 'FC Dallas',
    'HOU': 'Houston Dynamo',
    'LA': 'LA Galaxy',
    'MNUFC': 'Minnesota United',
    'NE': 'New England Revolution',
    'NYCFC': 'New York City FC',
    'NYRB': 'New York Red Bulls',
    'ORL': 'Orlando City',
    'PHI': 'Philadelphia Union',
    'POR': 'Portland Timbers',
    'RSL': 'Real Salt Lake',
    'SJ': 'San Jose Earthquakes',
    'SEA': 'Seattle Sounders FC',
    'KC': 'Sporting Kansas City',
    'TOR': 'Toronto FC',
    'VAN': 'Vancouver Whitecaps'
}

# Swap abbreviations for full names; values that are not keys of the
# mapping are left as they are by Series.replace.
full_club_names = salaries_df['club'].replace(abbreviation_mapping)
salaries_df['club'] = full_club_names

# Bare expression: displays the frame when run as a notebook cell.
salaries_df

Run to view results

# Mean base salary per club, as a two-column frame ('club', 'base_salary').
average_salary_by_club = (
    salaries_df
    .groupby('club')['base_salary']
    .mean()
    .reset_index()
)

# Highest-paying clubs first, with the salary column renamed to say that it
# holds an average.
average_salary_by_club_sorted = (
    average_salary_by_club
    .sort_values(by='base_salary', ascending=False)
    .rename(columns={'base_salary': 'average_salary'})
)

# Bare expression: displays the ranking when run as a notebook cell.
average_salary_by_club_sorted

Run to view results

from scipy.stats import pearsonr

# Join average salaries with the standings on the shared 'club' column.
# The merge uses pandas' default inner join, so only clubs whose names
# appear in both frames end up in the result.
merged_performance_salary_df = average_salary_by_club_sorted.merge(
    MLSTable,
    on='club',
)

# Bare expression: displays the merged frame when run as a notebook cell.
merged_performance_salary_df

Run to view results

# Pearson correlation (and its p-value) between a club's average salary
# and its points total.
salary_series = merged_performance_salary_df['average_salary']
points_series = merged_performance_salary_df['P']
correlation_coefficient, p_value = pearsonr(salary_series, points_series)

# Report the statistics.
print(f"Pearson Correlation Coefficient between average salary and points: {correlation_coefficient}")
print(f"P-value: {p_value}")

# One point per club: average salary on x, points on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=merged_performance_salary_df, x='average_salary', y='P')
plt.title('Scatter Plot of Average Salary vs. Points')
plt.xlabel('Average Salary')
plt.ylabel('Points')
plt.show()

Run to view results

# Pearson correlation (and its p-value) between a club's average salary
# and its number of wins.
correlation_coefficient, p_value = pearsonr(
    merged_performance_salary_df['average_salary'],
    merged_performance_salary_df['W'],
)

# Report the statistics.
print(f"Pearson Correlation Coefficient between average salary and wins: {correlation_coefficient}")
print(f"P-value: {p_value}")

# One point per club: average salary on x, wins on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='average_salary', y='W', data=merged_performance_salary_df)
# Title previously read "Woms"; corrected to match the plotted 'W' (wins) column.
plt.title('Scatter Plot of Average Salary vs. Wins')
plt.xlabel('Average Salary')
plt.ylabel('Wins')
plt.show()

Run to view results

# Pearson correlation (and its p-value) between a club's average salary
# and its goals scored.
correlation_coefficient, p_value = pearsonr(
    merged_performance_salary_df['average_salary'],
    merged_performance_salary_df['F'],
)

# Report the statistics.
print(f"Pearson Correlation Coefficient between average salary and goals scored: {correlation_coefficient}")
print(f"P-value: {p_value}")

# One point per club: average salary on x, goals scored on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='average_salary', y='F', data=merged_performance_salary_df)
plt.title('Scatter Plot of Average Salary vs. Goals Scored')
plt.xlabel('Average Salary')
# The y-axis shows the 'F' (goals scored) column; it was previously
# mislabelled as "Points".
plt.ylabel('Goals Scored')
plt.show()

Run to view results