import pandas as pd
import numpy as np
import datetime as dt
from tqdm.notebook import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
# Suppress all warnings (e.g. pandas/sklearn deprecation notices) to keep
# notebook output readable; remove when debugging unexpected behaviour.
warnings.filterwarnings('ignore')
# Read the competition CSVs into dataframes
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
ss = pd.read_csv('Data/SampleSubmission.csv')

# Glance at the first rows of each file
train.head()
test.head()
ss.head()

# Report how many rows/columns each split contains
print(f'The shape of the train set is: {train.shape}\nThe shape of the test set is: {test.shape}')
# Visualise missing values per column in the train set: bar length is the
# NaN count, the annotation shows the share of rows missing for that column.
ax = train.isna().sum().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Percentage of Missing Values Per Column in Train Set', fontdict={'size': 15})
for p in ax.patches:
    # One decimal place — consistent with the equivalent test-set plot
    # (the original used '{:,.0f}%' here but '{:,.1f}%' for the test set)
    percentage = '{:,.1f}%'.format((p.get_width() / train.shape[0]) * 100)
    width, height = p.get_width(), p.get_height()
    x = p.get_x() + width + 0.02   # place the label just right of the bar
    y = p.get_y() + height / 2     # vertically centred on the bar
    ax.annotate(percentage, (x, y))
# Same missing-value overview, this time for the test set
ax = test.isna().sum().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Percentage of Missing Values Per Column in Test Set', fontdict={'size': 15})
for bar in ax.patches:
    # Annotate each bar with the missing share of test rows, to one decimal
    pct_label = '{:,.1f}%'.format(bar.get_width() / test.shape[0] * 100)
    label_x = bar.get_x() + bar.get_width() + 0.02
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(pct_label, (label_x, label_y))
# Stack train on top of test so that preprocessing is applied once to both
ntrain = train.shape[0]  # row count, used later to re-split the combined frame
all_data = pd.concat((train, test)).reset_index(drop=True)
print(f'The shape of the combined dataframe is: {all_data.shape}')

# Inspect the last rows (test portion) and the column names/dtypes
all_data.tail()
all_data.info()
# Cast every column to an appropriate dtype:
#   * columns with 'Date' in the name          -> datetime
#   * Age / No_Pol                             -> numeric (left as-is)
#   * everything else except ID and target     -> categorical
date_cols = [col for col in all_data.columns if 'Date' in col]
num_cols = ['Age', 'No_Pol']
cat_cols = [col for col in all_data.columns
            if col not in date_cols + num_cols + ['ID', 'target']]

for name in all_data.columns:
    if name in date_cols:
        all_data[name] = pd.to_datetime(all_data[name])
    elif name in cat_cols:
        all_data[name] = all_data[name].astype('category')

# Verify the dtype changes took effect
all_data.info()
# Visualise the class balance of the target variable
sns.countplot(train['target'])
plt.title('Target Distribution', fontdict={'size':14});
# Plot the gender distribution, annotating each bar with its share of rows
ax = all_data['Gender'].value_counts().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Gender Distribution', fontdict={'size': 15})
for bar in ax.patches:
    share = '{:,.1f}%'.format(bar.get_width() / all_data.shape[0] * 100)
    ax.annotate(share, (bar.get_x() + bar.get_width() + 0.02,
                        bar.get_y() + bar.get_height() / 2))

# Collapse the rare / placeholder gender labels into a single 'Other' bucket
mapper = {'Entity': 'Other', 'Joint Gender': 'Other', 'NOT STATED': 'Other',
          'NO GENDER': 'Other', 'SEX': 'Other'}
all_data['Gender'] = all_data['Gender'].replace(mapper)

# Confirm the re-mapping worked
all_data['Gender'].value_counts()
# List the distinct values held by every categorical column
for column in cat_cols:
    print(column)
    print(all_data[column].unique(), '\n')
# Impute missing values:
#   * date & categorical columns -> column mode
#   * numeric columns            -> sentinel value 9999
for col in all_data.columns:
    if col in date_cols + cat_cols:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    elif col in num_cols:
        # The original nested fillna twice
        # (fillna(all_data[col].fillna(9999))); a single fillna(9999)
        # produces the same result directly.
        all_data[col] = all_data[col].fillna(9999)

# Every column except target (absent for the test rows) should now be complete
all_data[all_data.columns.difference(['target'])].isna().sum()
# Derive year/month/day features from every datetime column
# (new column names are the original name with the part appended, e.g. 'Dateyear')
for date_col in date_cols:
    all_data[date_col + 'year'] = all_data[date_col].dt.year
    all_data[date_col + 'month'] = all_data[date_col].dt.month
    all_data[date_col + 'day'] = all_data[date_col].dt.day

all_data.head()
cat_cols
# One-hot encode the categorical columns
all_data = pd.get_dummies(data=all_data, columns=cat_cols)
all_data.head()

# Feature columns for training: everything except the raw date columns,
# the row identifier and the label
main_cols = all_data.columns.difference(date_cols + ['ID', 'target'])
# Re-split the combined dataframe back into its train and test parts
# (the first ntrain rows are train; the index was reset after concat)
train_df = all_data.iloc[:ntrain]
test_df = all_data.iloc[ntrain:]

# Sanity-check the shapes and the available columns
train_df.shape, test_df.shape
train_df.columns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, RandomOverSampler, SMOTE

# Training matrix / labels restricted to the selected feature columns
X = train_df[main_cols].values
y = train_df.target

# Rebalance the classes by randomly over-sampling the minority class to a
# 0.95 minority/majority ratio.
# Fixes vs original: the sampler was named 'os' (shadowing the stdlib os
# module), an unused RandomUnderSampler(0.2) was constructed, and
# sampling_strategy was passed positionally (deprecated in recent imblearn).
over_sampler = RandomOverSampler(sampling_strategy=0.95, random_state=10)
print(X.shape, y.shape)
X, y = over_sampler.fit_resample(X, y)
print(X.shape, y.shape)

# Hold out 30% of the (resampled) data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Base learners for the ensemble. Only CatBoost and LightGBM feed the voting
# classifier; the unused RandomForest / GaussianNB / XGBoost definitions from
# earlier experiments have been removed as dead code.
cat_model = CatBoostClassifier(iterations=220, od_type='Iter', l2_leaf_reg=5,
                               learning_rate=0.95, verbose=0, depth=10)
lgbm_model = LGBMClassifier(learning_rate=0.5, n_estimators=1000, num_leaves=120,
                            n_jobs=4, min_child_samples=14, min_child_weight=10)

# Hard voting: the final label is the majority vote of the two base models
model = VotingClassifier(estimators=[('cat', cat_model), ('lgbm', lgbm_model)],
                         voting='hard')
model.fit(X_train, y_train)

# Evaluate on the held-out validation split
y_pred = model.predict(X_test)
print(f'F1 score on the X_test is: {f1_score(y_test, y_pred)}')
# Predict on the real test set using the same feature columns as training
test_df = test_df[main_cols]
predictions = model.predict(test_df.values)

# Build the submission file.
# BUG FIX: the original used attribute assignment (sub_file.predictions = ...);
# since ss has no 'predictions' column, that sets a plain Python attribute
# instead of creating a DataFrame column. Column-style assignment is required.
sub_file = ss.copy()
sub_file['predictions'] = predictions

# Check the distribution of the predicted labels
sns.countplot(sub_file['predictions']);

# Write the final submission
ss["target"] = predictions.astype("int")
ss.to_csv("submission.csv", index=False)
ss.head()