import pandas as pd
import numpy as np
import datetime as dt
from tqdm.notebook import tqdm_notebook
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
# Suppress all warnings (e.g. pandas/sklearn deprecation notices) to keep
# notebook output readable; remove when debugging unexpected behaviour.
warnings.filterwarnings('ignore')
# Read the competition CSVs into dataframes
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
ss = pd.read_csv('Data/SampleSubmission.csv')

# Glance at the first rows of each file
train.head()
test.head()
ss.head()

# Report how many rows/columns each split contains
print(f'The shape of the train set is: {train.shape}\nThe shape of the test set is: {test.shape}')
# Visualise missing values per column in the train set: bar length is the
# NaN count, the annotation shows the share of rows missing for that column.
ax = train.isna().sum().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Percentage of Missing Values Per Column in Train Set', fontdict={'size': 15})
for p in ax.patches:
    # One decimal place — consistent with the equivalent test-set plot
    # (the original used '{:,.0f}%' here but '{:,.1f}%' for the test set)
    percentage = '{:,.1f}%'.format((p.get_width() / train.shape[0]) * 100)
    width, height = p.get_width(), p.get_height()
    x = p.get_x() + width + 0.02   # place the label just right of the bar
    y = p.get_y() + height / 2     # vertically centred on the bar
    ax.annotate(percentage, (x, y))
# Same missing-value overview, this time for the test set
ax = test.isna().sum().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Percentage of Missing Values Per Column in Test Set', fontdict={'size': 15})
for bar in ax.patches:
    # Annotate each bar with the missing share of test rows, to one decimal
    pct_label = '{:,.1f}%'.format(bar.get_width() / test.shape[0] * 100)
    label_x = bar.get_x() + bar.get_width() + 0.02
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(pct_label, (label_x, label_y))
# Stack train on top of test so that preprocessing is applied once to both
ntrain = train.shape[0]  # row count, used later to re-split the combined frame
all_data = pd.concat((train, test)).reset_index(drop=True)
print(f'The shape of the combined dataframe is: {all_data.shape}')

# Inspect the last rows (test portion) and the column names/dtypes
all_data.tail()
all_data.info()
# Cast every column to an appropriate dtype:
#   * columns with 'Date' in the name          -> datetime
#   * Age / No_Pol                             -> numeric (left as-is)
#   * everything else except ID and target     -> categorical
date_cols = [col for col in all_data.columns if 'Date' in col]
num_cols = ['Age', 'No_Pol']
cat_cols = [col for col in all_data.columns
            if col not in date_cols + num_cols + ['ID', 'target']]

for name in all_data.columns:
    if name in date_cols:
        all_data[name] = pd.to_datetime(all_data[name])
    elif name in cat_cols:
        all_data[name] = all_data[name].astype('category')

# Verify the dtype changes took effect
all_data.info()
# Visualise the class balance of the target variable
sns.countplot(train['target'])
plt.title('Target Distribution', fontdict={'size':14});
# Plot the gender distribution, annotating each bar with its share of rows
ax = all_data['Gender'].value_counts().sort_values().plot(kind='barh', figsize=(10, 7))
plt.title('Gender Distribution', fontdict={'size': 15})
for bar in ax.patches:
    share = '{:,.1f}%'.format(bar.get_width() / all_data.shape[0] * 100)
    ax.annotate(share, (bar.get_x() + bar.get_width() + 0.02,
                        bar.get_y() + bar.get_height() / 2))

# Collapse the rare / placeholder gender labels into a single 'Other' bucket
mapper = {'Entity': 'Other', 'Joint Gender': 'Other', 'NOT STATED': 'Other',
          'NO GENDER': 'Other', 'SEX': 'Other'}
all_data['Gender'] = all_data['Gender'].replace(mapper)

# Confirm the re-mapping worked
all_data['Gender'].value_counts()
# List the distinct values held by every categorical column
for column in cat_cols:
    print(column)
    print(all_data[column].unique(), '\n')
# Impute missing values:
#   * date & categorical columns -> column mode
#   * numeric columns            -> sentinel value 9999
for col in all_data.columns:
    if col in date_cols + cat_cols:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    elif col in num_cols:
        # The original nested fillna twice
        # (fillna(all_data[col].fillna(9999))); a single fillna(9999)
        # produces the same result directly.
        all_data[col] = all_data[col].fillna(9999)

# Every column except target (absent for the test rows) should now be complete
all_data[all_data.columns.difference(['target'])].isna().sum()
# Derive year/month/day features from every datetime column
# (new column names are the original name with the part appended, e.g. 'Dateyear')
for date_col in date_cols:
    all_data[date_col + 'year'] = all_data[date_col].dt.year
    all_data[date_col + 'month'] = all_data[date_col].dt.month
    all_data[date_col + 'day'] = all_data[date_col].dt.day

all_data.head()
cat_cols
# One-hot encode the categorical columns
all_data = pd.get_dummies(data=all_data, columns=cat_cols)
all_data.head()

# Feature columns for training: everything except the raw date columns,
# the row identifier and the label
main_cols = all_data.columns.difference(date_cols + ['ID', 'target'])
# Re-split the combined dataframe back into its train and test parts
# (the first ntrain rows are train; the index was reset after concat)
train_df = all_data.iloc[:ntrain]
test_df = all_data.iloc[ntrain:]

# Sanity-check the shapes and the available columns
train_df.shape, test_df.shape
train_df.columns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, RandomOverSampler, SMOTE

# Training matrix / labels restricted to the selected feature columns
X = train_df[main_cols].values
y = train_df.target

# Rebalance the classes by randomly over-sampling the minority class to a
# 0.95 minority/majority ratio.
# Fixes vs original: the sampler was named 'os' (shadowing the stdlib os
# module), an unused RandomUnderSampler(0.2) was constructed, and
# sampling_strategy was passed positionally (deprecated in recent imblearn).
over_sampler = RandomOverSampler(sampling_strategy=0.95, random_state=10)
print(X.shape, y.shape)
X, y = over_sampler.fit_resample(X, y)
print(X.shape, y.shape)

# Hold out 30% of the (resampled) data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Base learners for the ensemble. Only CatBoost and LightGBM feed the voting
# classifier; the unused RandomForest / GaussianNB / XGBoost definitions from
# earlier experiments have been removed as dead code.
cat_model = CatBoostClassifier(iterations=220, od_type='Iter', l2_leaf_reg=5,
                               learning_rate=0.95, verbose=0, depth=10)
lgbm_model = LGBMClassifier(learning_rate=0.5, n_estimators=1000, num_leaves=120,
                            n_jobs=4, min_child_samples=14, min_child_weight=10)

# Hard voting: the final label is the majority vote of the two base models
model = VotingClassifier(estimators=[('cat', cat_model), ('lgbm', lgbm_model)],
                         voting='hard')
model.fit(X_train, y_train)

# Evaluate on the held-out validation split
y_pred = model.predict(X_test)
print(f'F1 score on the X_test is: {f1_score(y_test, y_pred)}')
# Predict on the real test set using the same feature columns as training
test_df = test_df[main_cols]
predictions = model.predict(test_df.values)

# Build the submission file.
# BUG FIX: the original used attribute assignment (sub_file.predictions = ...);
# since ss has no 'predictions' column, that sets a plain Python attribute
# instead of creating a DataFrame column. Column-style assignment is required.
sub_file = ss.copy()
sub_file['predictions'] = predictions

# Check the distribution of the predicted labels
sns.countplot(sub_file['predictions']);

# Write the final submission
ss["target"] = predictions.astype("int")
ss.to_csv("submission.csv", index=False)
ss.head()