#dataset downloaded from kaggle
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
# Load the season's results and keep only the match-level columns used below.
df = pd.read_excel('Premiership 2020 results.xlsx')
df.head()
del df['Div']
df.head()

# Pick the wanted columns by name instead of counting columns off the
# right-hand side of the sheet: if the spreadsheet layout ever changes this
# fails loudly with a KeyError rather than silently keeping the wrong columns.
MATCH_COLUMNS = [
    'Date', 'Time', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee',
]
# .copy() makes the trimmed frame independent of df, so later column
# assignments (e.g. data['Date'] = ...) write to real data and do not
# trigger pandas' SettingWithCopyWarning.
new_df = df[MATCH_COLUMNS].copy()
new_df.head(5)

# Shorter alias for the trimmed frame; both names refer to the same object.
data = new_df
data
# Number of distinct referees, home teams and away teams in the season.
data['Referee'].nunique()
data['HomeTeam'].nunique()
data['AwayTeam'].nunique()

# Distinct team names on each side of the fixture.
data['HomeTeam'].unique()
data['AwayTeam'].unique()

# The 20 unique team names cannot be stored as a column of `data`: the frame
# has 380 rows, so `data['Teams'] = data['HomeTeam'].unique()` raises
# "ValueError: Length of values (20) does not match length of index (380)".
# Keep them in their own one-column frame instead; further analysis and
# visualisation of the teams is done in Tableau.
teams = pd.DataFrame({'Teams': data['HomeTeam'].unique()})
teams
# Count missing values per column, then print each column's dtype and
# non-null count.
data.isna().sum()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 380 non-null object
1 Time 380 non-null object
2 HomeTeam 380 non-null object
3 AwayTeam 380 non-null object
4 FTHG 380 non-null int64
5 FTAG 380 non-null int64
6 FTR 380 non-null object
7 HTHG 380 non-null int64
8 HTAG 380 non-null int64
9 HTR 380 non-null object
10 Referee 380 non-null object
dtypes: int64(4), object(7)
memory usage: 32.8+ KB
import datetime as dt
# Convert the text dates into real timestamps.
# NOTE(review): dayfirst=True assumes the sheet stores dates as dd/mm/yyyy,
# the usual layout for English league result exports; confirm against the
# spreadsheet. Without it, ambiguous values such as 09/08/2019 are parsed
# month-first, silently turning 9 August into 8 September.
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 380 non-null datetime64[ns]
1 Time 380 non-null object
2 HomeTeam 380 non-null object
3 AwayTeam 380 non-null object
4 FTHG 380 non-null int64
5 FTAG 380 non-null int64
6 FTR 380 non-null object
7 HTHG 380 non-null int64
8 HTAG 380 non-null int64
9 HTR 380 non-null object
10 Referee 380 non-null object
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 32.8+ KB
# Summary statistics, a per-column "any value missing?" check, and the
# histograms produced by DataFrame.hist on a large canvas.
data.describe()
data.isna().any()
data.hist(figsize=(20, 14))
plt.show()
# Correlate the numeric columns only. At this point the frame still holds
# text columns (teams, referee, results), and from pandas 2.0 on
# DataFrame.corr() no longer drops non-numeric columns silently, so calling
# it on the whole frame raises. Selecting the numeric columns explicitly
# works on old and new pandas alike.
numeric_corr = data.select_dtypes(include='number').corr()
numeric_corr

# Reuse the matrix for the annotated heatmap instead of computing it twice.
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=True)
# Scatter plots with a fitted regression line for three pairs of goal counts.
sns.regplot(data=data, x='FTHG', y='FTAG')
sns.regplot(data=data, x='HTHG', y='HTAG')
sns.regplot(data=data, x='FTHG', y='HTHG')

# Bar chart of full-time away goals against full-time home goals, drawn on
# its own small figure with matplotlib's default style.
plt.style.use('default')
plt.figure(figsize=(5, 5))
goal_ax = sns.barplot(data=data, x='FTHG', y='FTAG')
goal_ax.set_title('Full Time Home Goal vs Full Time Away Goal')
goal_ax.set_xlabel('Full Time Home Goal')
goal_ax.set_ylabel('Full Time Away Goal')
plt.show()
# All column names, and the subset whose values are stored as Python objects
# (text); these are the columns that need encoding before modelling.
list_1 = data.columns.tolist()
list_cate = [col for col in list_1 if data[col].dtype == 'object']
list_cate
from sklearn.preprocessing import LabelEncoder

# Replace every text column with integer codes. Each column gets its own
# fitted encoder, kept in `encoders`, so a code can be turned back into the
# original label later with encoders[column].inverse_transform(...).
# Refitting one shared encoder would keep only the last column's mapping.
encoders = {}
for col in list_cate:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le
data
# Target: full-time home goals. Features: every other column.
y = data['FTHG']
x = data.drop('FTHG', axis=1)

# scikit-learn packs the features into a single NumPy array, and a
# datetime64 column cannot share an array with the integer columns; fitting
# on the raw frame fails with "TypeError: The DTypes ... int32 ... and
# ... datetime64 ... do not have a common DType". Represent each date as its
# integer day ordinal so every feature is numeric.
for date_col in x.select_dtypes(include='datetime').columns:
    x[date_col] = x[date_col].map(lambda ts: ts.toordinal())
# NOTE(review): k-NN is distance-based and the features are on very different
# scales (day ordinals vs. goal counts); consider scaling them before fitting.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
# With the 380 matches in the frame this prints 304, 76, 304, 76.
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train, y_train)