# Dataset downloaded from Kaggle.
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
# Load the results workbook into a DataFrame.
df = pd.read_excel('Premiership 2020 results.xlsx')
df.head()

# The 'Div' column is not used in this analysis.
del df['Div']
df.head()

# Trim trailing columns in two passes: the last 92 first, then 2 more.
n = 92
new_df = df.iloc[:, :-n]
new_df.head()
n = 2
# .copy() makes the trimmed frame independent of `df`, so the column
# assignments performed later (date parsing, label encoding) write to a real
# frame instead of a slice and do not trigger SettingWithCopyWarning.
new_df = new_df.iloc[:, :-n].copy()
new_df.head(5)

# Shorter alias used for the rest of the analysis.
data = new_df
data
# Number of distinct referees, home teams and away teams.
data['Referee'].nunique()
data['HomeTeam'].nunique()
data['AwayTeam'].nunique()
data['HomeTeam'].unique()

# The distinct team names hold one entry per unique value rather than one per
# row, so they cannot be stored as a column of `data` (pandas raises
# ValueError when the lengths differ). They are kept in a standalone Series
# instead; further analysis and visualization of the teams is left to Tableau.
teams = pd.Series(data['HomeTeam'].unique(), name='Teams')
teams
# Count of missing values in each column.
data.isna().sum()
data.info()
import datetime as dt

# Convert the 'Date' column to pandas datetime values, then re-inspect the
# dtypes and summary statistics.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data.info()
data.describe()

# Per-column flag: True where at least one value is missing.
data.isna().any()
# Histograms of the columns pandas can plot.
data.hist(figsize=(20, 14))
plt.show()

# Correlation is only defined for numeric data. Selecting numeric/boolean
# columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops string and datetime columns silently.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()
corr_matrix
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

# Each regression plot gets its own figure; without that they would all be
# drawn onto the axes that are current at the time (the heatmap's).
for x_col, y_col in (('FTHG', 'FTAG'), ('HTHG', 'HTAG'), ('FTHG', 'HTHG')):
    plt.figure()
    sns.regplot(x=x_col, y=y_col, data=data)
    plt.show()

# Bar chart of 'FTAG' aggregated per 'FTHG' value (seaborn's default
# estimator is used).
plt.style.use('default')
plt.figure(figsize=(5, 5))
sns.barplot(x='FTHG', y='FTAG', data=data)
plt.title('Full Time Home Goal vs Full Time Away Goal')
plt.xlabel('Full Time Home Goal')
plt.ylabel('Full Time Away Goal')
plt.show()
# All column names, then the subset whose dtype is 'object' (the columns that
# need encoding before modelling).
list_1 = data.columns.tolist()
list_cate = [column for column in list_1 if data[column].dtype == 'object']
list_cate
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes. A separate fitted
# encoder is kept per column so the codes can be mapped back to the original
# labels later with encoders[column].inverse_transform(...).
le = LabelEncoder()
encoders = {}
for column in list_cate:
    # A fresh encoder per column; `le` ends up bound to the encoder fitted on
    # the last categorical column.
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    encoders[column] = le
data
# Target is the 'FTHG' column; the features are the remaining columns.
y = data['FTHG']
# Datetime columns are excluded from the features: scikit-learn converts the
# frame to a float matrix, and datetime values cannot be converted that way.
x = data.drop('FTHG', axis=1).select_dtypes(exclude=['datetime', 'datetimetz'])

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.2
)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=7, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train, y_train)