import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
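# Load the horse-racing dataset and take a first look at column types and sample rows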
df = pd.read_csv('/work/horses.csv')
df.dtypes
df.head()
# Inspect each non-ID column: how many distinct values it has and what they are
for col in df.columns:
    if 'ID' not in col:
        print(col)
        print(f'number of unique values: {df[col].nunique()}')
        print(df[col].unique())
        print('---')
# Summary statistics for every column, plus a histogram for each non-ID column
for col in df.columns:
    print(col)
    print(df[col].value_counts())
    print(df[[col]].describe())
    if 'ID' not in col:
        fig = px.histogram(df[col], title=f'The distribution of {col}', height=300)
        fig.show()
    print('------------')
# Missing values per column, as a count and as a percentage of all rows
temp = df.isna().sum().sort_values(ascending=False).to_frame(name='count')
temp['percentage %'] = round(temp['count'] / df.shape[0] * 100, 2)
temp
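# How many horses ran in each race, and how many winners are recorded per race?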
df.groupby('RaceID')['HorseID'].count().sort_values(ascending=False)
print(df.groupby('RaceID')['HorseID'].count().to_frame().describe())
px.histogram(x=df.groupby('RaceID')['HorseID'].count().sort_values(ascending=False).tolist(), height=300, title='The distribution of the number of horses in each race')
df[df.Won==1].groupby('RaceID')['HorseID'].count().value_counts()
df.Won.value_counts()
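# Does the HorseCount column agree with the number of unique horses recorded for each race?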
temp = df[['RaceID', 'HorseCount']].copy()
temp['unique_horse_count'] = df.groupby(['RaceID'])['HorseID'].transform('nunique')
temp['include_all_horses'] = temp['HorseCount']>=temp['unique_horse_count']
temp.groupby('include_all_horses')['RaceID'].nunique()
temp[~temp['include_all_horses']]
df[df['RaceID']==993698]
df[df['HorseCount']==0]['RaceID'].value_counts()
# Compare each candidate feature between winners and non-winners:
# per-class summary statistics plus faceted histograms
for col in ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Sex', 'Colour']:
    if col not in ['Sex', 'Colour']:
        print(df.groupby('Won')[col].agg(['mean', 'median', pd.Series.mode]))
    else:
        print(df.groupby('Won')[col].agg([pd.Series.mode]))
    fig = px.histogram(df, x=col, facet_row='Won', barmode='group', height=300)
    fig.update_yaxes(matches=None, showticklabels=True)
    fig.show()
# For each value of a feature, what share of the runners with that value won their race?
for col in ['Cloth', 'Stall', 'WeightValue', 'Age', 'Sex', 'Colour']:
    temp = (df[df['Won'] == 1][col].value_counts() / df[col].value_counts() * 100).round(1)
    temp = temp.rename_axis(col).reset_index(name='won_percentage')
    fig = px.bar(temp, x=col, y='won_percentage', height=300, text_auto=True,
                 title=f'Percentage of runners that won their race, for each value of {col}')
    fig.show()
# Count the price values that do not contain a '-', split by whether the horse won
for col in ['ForecastPrice', 'StartingPrice']:
    print(f'Available {col} values for winners and non-winners')
    print(df[df[col].apply(lambda x: '-' not in str(x))].groupby(['Won', col]).size())
fig = px.box(x=df['JockeyID'].value_counts().values, height=300, title='The distribution of the number of races each jockey participates in')
fig.show()
fig = px.box(x=df['HorseID'].value_counts().values, height=300, title='The distribution of the number of races each horse participates in')
fig.show()
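# Restrict to races with exactly one recorded winner before building the modelling dataset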
raceids = df[df['Won']==1]['RaceID'].value_counts().loc[lambda x: x==1].index
new_df = df[df['RaceID'].isin(raceids)]
new_df.shape
cols = ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Course_Distance', 'Sex', 'Colour']
new_df[cols].isna().sum()
new_df[new_df['Age'].isna()][['Age', 'YearBorn']]
# Value counts for the categorical columns that still contain missing values
for col in ['Age', 'Sex', 'Colour']:
    print(new_df[col].value_counts())
# Handle missing values: drop rows with no Stall, fill LastRunDaysFlat with the median,
# and fill the categorical columns with their mode
final_df = new_df[~new_df['Stall'].isna()].copy()
final_df['LastRunDaysFlat'] = final_df['LastRunDaysFlat'].fillna(final_df['LastRunDaysFlat'].median())
for col in ['Age', 'Sex', 'Colour']:
    final_df[col] = final_df[col].fillna(final_df[col].mode().iloc[0])
cols = ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Course_Distance', 'Sex', 'Colour']
final_df[cols].isna().sum()
final_df[cols].dtypes
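# One-hot encode the categorical columns to build the feature matrix X and the target y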
print(pd.get_dummies(final_df[cols]).columns)
X = pd.get_dummies(final_df[cols])
y = final_df['Won']
print(X.shape, y.shape)
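# 70/30 train-test split, then check the class balance in each split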
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('train set label distribution')
print(y_train.value_counts(normalize=True))
print('test set label distribution')
print(y_test.value_counts(normalize=True))
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Tune max_depth for a decision tree using 10-fold cross-validated precision
precision_ls = []
for max_depth in range(1, 30):
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    precision_ls.append(cross_val_score(clf, X_train, y_train, cv=10, scoring='precision').mean())
px.line(x=list(range(1, 30)), y=precision_ls, title='CV average precision score of different max_depth for the decision tree model')
# Repeat the max_depth search for a random forest (import it before first use)
from sklearn.ensemble import RandomForestClassifier

precision_ls = []
for max_depth in range(1, 30):
    clf = RandomForestClassifier(n_estimators=10, max_depth=max_depth, random_state=42)
    precision_ls.append(cross_val_score(clf, X_train, y_train, cv=10, scoring='precision').mean())
px.line(x=list(range(1, 30)), y=precision_ls, title='CV average precision score of different max_depth for the random forest model')
from sklearn.ensemble import RandomForestClassifier
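# Train the final random forest with the max_depth chosen from cross-validation and evaluate it on the test set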
clf = RandomForestClassifier(n_estimators=10, max_depth=14, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=3))
print('The distribution of winners across stall values')
print((df[df['Won']==1]['Stall'].value_counts(normalize=True)*100).apply(lambda x: str(round(x, 1)) + '%'))
fig = px.histogram(x=df[df['Won']==1]['Stall'], height=300, title='The number of winners for each stall value')
fig.show()
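# Simple baseline: predict that a horse wins whenever it is drawn in stall 1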
y_pred_baseline = X_test['Stall'].apply(lambda x: 1 if x==1 else 0)
print(classification_report(y_test, y_pred_baseline, digits=3))
print('random forest model')
print(classification_report(y_test, y_pred, digits=3))
print('baseline model')
print(classification_report(y_test, y_pred_baseline, digits=3))