import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
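# Load the horse-racing dataset and take a first look at column types and sample rows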
df = pd.read_csv('/work/horses.csv')
df.dtypes
df.head()
# Inspect each non-ID column: how many distinct values it has and what they are
for col in df.columns:
    if 'ID' not in col:
        print(col)
        print(f'number of unique values: {df[col].nunique()}')
        print(df[col].unique())
        print('---')
# Summary statistics for every column, plus a histogram for each non-ID column
for col in df.columns:
    print(col)
    print(df[col].value_counts())
    print(df[[col]].describe())
    if 'ID' not in col:
        fig = px.histogram(df[col], title=f'The distribution of {col}', height=300)
        fig.show()
    print('------------')
# Missing values per column, as a count and as a percentage of all rows
temp = df.isna().sum().sort_values(ascending=False).to_frame(name='count')
temp['percentage %'] = round(temp['count'] / df.shape[0] * 100, 2)
temp
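# How many horses ran in each race, and how many winners are recorded per race?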
df.groupby('RaceID')['HorseID'].count().sort_values(ascending=False)
print(df.groupby('RaceID')['HorseID'].count().to_frame().describe())
px.histogram(x=df.groupby('RaceID')['HorseID'].count().sort_values(ascending=False).tolist(), height=300, title='The distribution of the number of horses in each race')
df[df.Won==1].groupby('RaceID')['HorseID'].count().value_counts()
df.Won.value_counts()
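# Does the HorseCount column agree with the number of unique horses recorded for each race?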
temp = df[['RaceID', 'HorseCount']].copy()
temp['unique_horse_count'] = df.groupby(['RaceID'])['HorseID'].transform('nunique')
temp['include_all_horses'] = temp['HorseCount']>=temp['unique_horse_count']
temp.groupby('include_all_horses')['RaceID'].nunique()
temp[~temp['include_all_horses']]
df[df['RaceID']==993698]
df[df['HorseCount']==0]['RaceID'].value_counts()
# Compare each candidate feature between winners and non-winners:
# per-class summary statistics plus faceted histograms
for col in ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Sex', 'Colour']:
    if col not in ['Sex', 'Colour']:
        print(df.groupby('Won')[col].agg(['mean', 'median', pd.Series.mode]))
    else:
        print(df.groupby('Won')[col].agg([pd.Series.mode]))
    fig = px.histogram(df, x=col, facet_row='Won', barmode='group', height=300)
    fig.update_yaxes(matches=None, showticklabels=True)
    fig.show()
# For each value of a feature, what share of the runners with that value won their race?
for col in ['Cloth', 'Stall', 'WeightValue', 'Age', 'Sex', 'Colour']:
    temp = (df[df['Won'] == 1][col].value_counts() / df[col].value_counts() * 100).round(1)
    temp = temp.rename_axis(col).reset_index(name='won_percentage')
    fig = px.bar(temp, x=col, y='won_percentage', height=300, text_auto=True,
                 title=f'Percentage of runners that won their race, for each value of {col}')
    fig.show()
# Count the price values that do not contain a '-', split by whether the horse won
for col in ['ForecastPrice', 'StartingPrice']:
    print(f'Available {col} values for winners and non-winners')
    print(df[df[col].apply(lambda x: '-' not in str(x))].groupby(['Won', col]).size())
fig = px.box(x=df['JockeyID'].value_counts().values, height=300, title='The distribution of the number of races each jockey participates in')
fig.show()
fig = px.box(x=df['HorseID'].value_counts().values, height=300, title='The distribution of the number of races each horse participates in')
fig.show()
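# Restrict to races with exactly one recorded winner before building the modelling dataset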
raceids = df[df['Won']==1]['RaceID'].value_counts().loc[lambda x: x==1].index
new_df = df[df['RaceID'].isin(raceids)]
new_df.shape
cols = ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Course_Distance', 'Sex', 'Colour']
new_df[cols].isna().sum()
new_df[new_df['Age'].isna()][['Age', 'YearBorn']]
# Value counts for the categorical columns that still contain missing values
for col in ['Age', 'Sex', 'Colour']:
    print(new_df[col].value_counts())
# Handle missing values: drop rows with no Stall, fill LastRunDaysFlat with the median,
# and fill the categorical columns with their mode
final_df = new_df[~new_df['Stall'].isna()].copy()
final_df['LastRunDaysFlat'] = final_df['LastRunDaysFlat'].fillna(final_df['LastRunDaysFlat'].median())
for col in ['Age', 'Sex', 'Colour']:
    final_df[col] = final_df[col].fillna(final_df[col].mode().iloc[0])
cols = ['Cloth', 'Stall', 'WeightValue', 'LastRunDaysFlat', 'Age', 'Course_Distance', 'Sex', 'Colour']
final_df[cols].isna().sum()
final_df[cols].dtypes
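# One-hot encode the categorical columns to build the feature matrix X and the target y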
print(pd.get_dummies(final_df[cols]).columns)
X = pd.get_dummies(final_df[cols])
y = final_df['Won']
print(X.shape, y.shape)
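# 70/30 train-test split, then check the class balance in each split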
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print('train set label distribution')
print(y_train.value_counts(normalize=True))
print('test set label distribution')
print(y_test.value_counts(normalize=True))
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Tune max_depth for a decision tree using 10-fold cross-validated precision
precision_ls = []
for max_depth in range(1, 30):
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    precision_ls.append(cross_val_score(clf, X_train, y_train, cv=10, scoring='precision').mean())
px.line(x=list(range(1, 30)), y=precision_ls, title='CV average precision score of different max_depth for the decision tree model')
# Repeat the max_depth search for a random forest (import it before first use)
from sklearn.ensemble import RandomForestClassifier

precision_ls = []
for max_depth in range(1, 30):
    clf = RandomForestClassifier(n_estimators=10, max_depth=max_depth, random_state=42)
    precision_ls.append(cross_val_score(clf, X_train, y_train, cv=10, scoring='precision').mean())
px.line(x=list(range(1, 30)), y=precision_ls, title='CV average precision score of different max_depth for the random forest model')
from sklearn.ensemble import RandomForestClassifier
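# Train the final random forest with the max_depth chosen from cross-validation and evaluate it on the test set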
clf = RandomForestClassifier(n_estimators=10, max_depth=14, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=3))
print('The distribution of winners across stall values')
print((df[df['Won']==1]['Stall'].value_counts(normalize=True)*100).apply(lambda x: str(round(x, 1)) + '%'))
fig = px.histogram(x=df[df['Won']==1]['Stall'], height=300, title='The number of winners for each stall value')
fig.show()
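# Simple baseline: predict that a horse wins whenever it is drawn in stall 1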
y_pred_baseline = X_test['Stall'].apply(lambda x: 1 if x==1 else 0)
print(classification_report(y_test, y_pred_baseline, digits=3))
print('random forest model')
print(classification_report(y_test, y_pred, digits=3))
print('baseline model')
print(classification_report(y_test, y_pred_baseline, digits=3))