"""English Premier League 2020."""

# Dataset downloaded from Kaggle.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# %matplotlib inline  # IPython magic: only valid (and only needed) inside a notebook

# --- Load the raw results and trim them to the columns used below ---------
# Bare expressions such as `df.head()` are notebook-style previews: they are
# displayed in an interactive session and are no-ops when run as a script.
df = pd.read_excel('Premiership 2020 results.xlsx')

df.head()

# The 'Div' column is not used in this analysis.
del df['Div']

df.head()

# Drop the trailing 92 columns ...
n = 92
new_df = df.iloc[:, :-n]

new_df.head()

# ... and then 2 more (94 trailing columns removed in total).
# The explicit copy detaches the working frame from `df`, so the in-place
# column assignments made further down do not act on a slice of `df`
# (which would emit pandas' SettingWithCopyWarning).
n = 2
new_df = new_df.iloc[:, :-n].copy()

new_df.head(5)

# Shorter working name; `data` and `new_df` refer to the same frame.
data = new_df

data

# --- Cardinality of the referee and team columns ---------------------------
data['Referee'].nunique()

data['HomeTeam'].nunique()

data['AwayTeam'].nunique()

data['HomeTeam'].unique()

# The distinct team names cannot be stored as a new column of `data`: the
# array of unique values is shorter than the frame, so
# `data['Teams'] = data['HomeTeam'].unique()` raises a length-mismatch
# ValueError. They are kept in a standalone variable instead. Further analysis
# and visualisation of the teams is to be done in Tableau.
teams = data['HomeTeam'].unique()

import datetime as dt

# --- Missing values, dtypes and summary statistics -------------------------
# Count of missing entries per column.
null_flags = data.isnull()
null_flags.sum()

data.info()

# Convert the 'Date' column to pandas datetimes.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

# Re-inspect the dtypes after the conversion.
data.info()

data.describe()

# Per-column flag: does the column contain any missing value?
data.isnull().any()

# --- Exploratory plots -------------------------------------------------------
# Histograms of the frame's columns.
data.hist(figsize=(20, 14))
plt.show()

# Pairwise correlations. Only numeric columns are passed in: at this point the
# frame can still hold non-numeric columns (object columns are label-encoded
# only further down), and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_data = data.select_dtypes(include='number')
numeric_data.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(numeric_data.corr(), annot=True)
plt.show()

# Each regression plot gets its own figure. Without an explicit figure/show
# pair, successive seaborn calls draw onto the current axes when this runs as
# a plain script, stacking the plots on top of the heatmap.
for x_col, y_col in (('FTHG', 'FTAG'), ('HTHG', 'HTAG'), ('FTHG', 'HTHG')):
    plt.figure()
    sns.regplot(x=x_col, y=y_col, data=data)
    plt.show()

# Bar plot of full-time away goals against full-time home goals.
plt.style.use('default')
plt.figure(figsize=(5, 5))
sns.barplot(x='FTHG', y='FTAG', data=data)
plt.title('Full Time Home Goal vs Full Time Away Goal')
plt.xlabel('Full Time Home Goal')
plt.ylabel('Full Time Away Goal')
plt.show()

from sklearn.preprocessing import LabelEncoder

# --- Label-encode the object-dtype columns ---------------------------------
# All column names, then the subset whose dtype is 'object'.
list_1 = data.columns.tolist()
list_cate = [col for col in list_1 if data[col].dtype == 'object']

list_cate

# A single encoder instance is refitted for every column, so after the loop
# `le` only remembers the classes of the last column it processed.
le = LabelEncoder()
for col in list_cate:
    data[col] = le.fit_transform(data[col])

data

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# --- Train a k-nearest-neighbours classifier for full-time home goals -------
# Target: 'FTHG'. Features: every other column of `data`.
# NOTE(review): the features include the other score columns (e.g. 'FTAG',
# 'HTHG', 'HTAG') -- confirm that using them as predictors is intended.
y = data['FTHG']
x = data.drop('FTHG', axis=1)

# scikit-learn cannot convert datetime64 columns to floats, so fitting on a
# frame that still contains the parsed 'Date' column raises. Replace every
# datetime column with an integer day count (days since 1970-01-01).
# NOTE(review): KNN is distance-based and the features are not scaled, so a
# wide-ranging column such as this day count can dominate -- consider scaling.
for col in x.select_dtypes(include='datetime').columns:
    x[col] = (x[col] - pd.Timestamp('1970-01-01')).dt.days

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.2
)

# Sizes of the four partitions, one per line.
for part in (x_train, x_test, y_train, y_test):
    print(len(part))

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train, y_train)