import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings(action = 'ignore')
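# Load the training and test splits (file names assumed to match the local copies of the loan prediction dataset)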
train=pd.read_csv('train_loan.csv')
test=pd.read_csv('test_loan.csv')
train.head(5)
train.dtypes
train.isnull().sum()
test.isnull().sum()
train.columns
test.columns
train.shape,test.shape
train['Loan_Status'].value_counts()
train['Loan_Status'].value_counts(normalize=True)
train['Loan_Status'].value_counts(normalize=True).plot(kind='bar')
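# Univariate analysis: distribution of the categorical predictors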
cat_var=['Gender','Married','Self_Employed','Credit_History']
def plot_categorical(cat_var):
    plt.figure(figsize=(20, 10))
    for j, i in enumerate(cat_var):
        plt.subplot(2, 2, j + 1)
        train[i].value_counts(normalize=True).plot.bar(title=i)

plot_categorical(cat_var)
ordinal_var=['Education','Dependents','Property_Area']
def plot_ordinal(ordinal_var):
    plt.figure(figsize=(20, 10))
    for j, i in enumerate(ordinal_var):
        plt.subplot(2, 2, j + 1)
        train[i].value_counts(normalize=True).plot.bar(title=i)

plot_ordinal(ordinal_var)
The following inferences can be made from the bar plots above:
- Most of the applicants have no dependents.
- Around 80% of the applicants are graduates.
- Most of the applicants are from semiurban areas.
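These proportions can be verified directly from the normalized value counts; a quick check using the same train dataframe:
# Share of each category, rounded to two decimals
print(train['Dependents'].value_counts(normalize=True).round(2))
print(train['Education'].value_counts(normalize=True).round(2))
print(train['Property_Area'].value_counts(normalize=True).round(2))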
plt.figure(figsize=(20,10))
plt.subplot(1,2,1)
sns.histplot(train['ApplicantIncome'], kde=True)  # histogram + KDE (sns.distplot is deprecated in recent seaborn)
plt.subplot(1,2,2)
plt.boxplot(train['ApplicantIncome'])
plt.show()
train.boxplot(column='ApplicantIncome', by='Education')
plt.suptitle("")
plt.figure(1)
plt.subplot(121)
sns.histplot(train['CoapplicantIncome'], kde=True)
plt.subplot(122)
train['CoapplicantIncome'].plot.box(figsize=(16,5))
plt.figure(1)
df=train.dropna()
plt.subplot(121)
sns.histplot(df['LoanAmount'], kde=True)
plt.subplot(122)
df['LoanAmount'].plot.box(figsize=(10,7))
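# Bivariate analysis: approval rate by each categorical variable (normalized, stacked bar charts)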
Gender= pd.crosstab(train['Gender'],train['Loan_Status'])
Gender.div(Gender.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True,figsize=(4,4))
Married=pd.crosstab(train['Married'],train['Loan_Status'])
Dependents=pd.crosstab(train['Dependents'],train['Loan_Status'])
Education=pd.crosstab(train['Education'],train['Loan_Status'])
Self_Employed=pd.crosstab(train['Self_Employed'],train['Loan_Status'])
Married.div(Married.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))
plt.show()
Dependents.div(Dependents.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
plt.show()
Education.div(Education.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))
plt.show()
Self_Employed.div(Self_Employed.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))
plt.show()
Credit_History=pd.crosstab(train['Credit_History'],train['Loan_Status'])
Property_Area=pd.crosstab(train['Property_Area'],train['Loan_Status'])
Credit_History.div(Credit_History.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))
plt.show()
Property_Area.div(Property_Area.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
plt.show()
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot(kind='bar', title='Mean applicant income by loan status')
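# Bin CoapplicantIncome to compare approval rates across income ranges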
bins=[0,1000,3000,42000]
group=['low','average','high']
train['CoapplicantIncome_bin']=pd.cut(train['CoapplicantIncome'],bins,labels=group)
CoapplicantIncome_bin = pd.crosstab(train['CoapplicantIncome_bin'],train['Loan_Status'])
CoapplicantIncome_bin.div(CoapplicantIncome_bin.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True,figsize=(4,4))
# New variable that combines the applicant's and coapplicant's incomes
train['Total_Income']=train['ApplicantIncome']+train['CoapplicantIncome']
bins=[0,2500,4000,6000,81000]
group=['Low','Average','High', 'Very high']
train['Total_Income_bin']=pd.cut(train['Total_Income'],bins,labels=group)
Total_Income_bin=pd.crosstab(train['Total_Income_bin'],train['Loan_Status'])
Total_Income_bin.div(Total_Income_bin.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
plt.xlabel('Total_Income')
P = plt.ylabel('Percentage')
bins=[0,100,200,700]
group=['Low','Average','High']
train['LoanAmount_bin']=pd.cut(train['LoanAmount'],bins,labels=group)
LoanAmount_bin=pd.crosstab(train['LoanAmount_bin'],train['Loan_Status'])
LoanAmount_bin.div(LoanAmount_bin.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
plt.xlabel('LoanAmount')
P = plt.ylabel('Percentage')
train.head(5)
train=train.drop(['Total_Income_bin','LoanAmount_bin','CoapplicantIncome_bin'],axis=1)
train.head(5)
train['Dependents'].replace('3+',3,inplace=True)
test['Dependents'].replace('3+',3,inplace=True)  # apply the same replacement to the test set so the dummy columns match
train['Loan_Status'].replace('Y',1,inplace=True)
train['Loan_Status'].replace('N',0,inplace=True)
train['Loan_Status']=train['Loan_Status'].astype('int64')
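# Correlation heatmap of the numeric variables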
matrix=train.corr(numeric_only=True)  # numeric_only avoids errors from the remaining object columns in newer pandas
f,ax=plt.subplots(figsize=(9,6))
sns.heatmap(matrix,vmax=.8,square=True,cmap='BuPu')
train.isnull().sum()
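# Impute missing values: mode for the categorical/discrete columns, median for LoanAmount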
train['Gender'].fillna(train['Gender'].mode()[0],inplace=True)
train['Married'].fillna(train['Married'].mode()[0],inplace=True)
train['Dependents'].fillna(train['Dependents'].mode()[0],inplace=True)
train['Self_Employed'].fillna(train['Self_Employed'].mode()[0],inplace=True)
train['Credit_History'].fillna(train['Credit_History'].mode()[0],inplace=True)
train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0],inplace=True)
train['LoanAmount'].fillna(train['LoanAmount'].median(),inplace=True)
train.info()
test['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
test['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
test['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)
test['Credit_History'].fillna(train['Credit_History'].mode()[0], inplace=True)
test['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0], inplace=True)
test['LoanAmount'].fillna(train['LoanAmount'].median(), inplace=True)
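# Log-transform LoanAmount to reduce its right skew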
train['LoanAmountLog']=np.log(train['LoanAmount'])
test['LoanAmountLog']=np.log(test['LoanAmount'])
train['LoanAmountLog'].hist(bins=20)
train.head(5)
test['Total_Income'] =test['ApplicantIncome']+test['CoapplicantIncome']
test.head(5)
sns.histplot(train['Total_Income'], kde=True)
train['Total_Income_log']=np.log(train['Total_Income'])
sns.histplot(train['Total_Income_log'], kde=True)
test['Total_Income_log']=np.log(test['Total_Income'])  # use the test set's own Total_Income, not the train values
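# EMI: monthly repayment, approximated as loan amount divided by the loan term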
train['EMI']=train['LoanAmount']/train['Loan_Amount_Term']
test['EMI']=test['LoanAmount']/test['Loan_Amount_Term']
sns.histplot(train['EMI'], kde=True)
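# Income left after paying the EMI (LoanAmount is recorded in thousands, hence the factor of 1000)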
train['Balance_Income']=train['Total_Income']-(train['EMI']*1000)
test['Balance_Income']=test['Total_Income']-(test['EMI']*1000)
sns.histplot(train['Balance_Income'], kde=True)
train=train.drop(['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'],axis=1)
test=test.drop(['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'],axis=1)
train.head(5)
test.head(5)
train=train.drop('Loan_ID',axis=1)
test_loan_id=test['Loan_ID']
test=test.drop('Loan_ID',axis=1)
train.head(5)
test.head(5)
y=train.Loan_Status
X=train.drop(['Loan_Status'],axis=1)
X.head(2)
y.head(2)
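# One-hot encode the remaining categorical features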
X=pd.get_dummies(X)
train=pd.get_dummies(train)
test=pd.get_dummies(test)
train.shape,test.shape
X.head(5)
from sklearn import tree
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as cm
train['Loan_Status'].value_counts()
from sklearn.ensemble import RandomForestClassifier as rfc
i=1
kf=StratifiedKFold(n_splits=10,random_state=6,shuffle=True)
accuracy_list=[]
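# Stratified 10-fold cross-validation of a random forest classifier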
for train_index, test_index in kf.split(X, y):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = rfc(n_estimators=100, criterion='gini', max_depth=4)
    model.fit(xtr, ytr)            # fit on the k-1 training folds only
    pred_val = model.predict(xvl)  # evaluate on the held-out fold
    score = accuracy_score(yvl, pred_val)
    accuracy_list.append(score)
    print('accuracy_score', score)
    i += 1
mean_accuracy_rfc=sum(accuracy_list)/len(accuracy_list)
mean_accuracy_rfc
original_test_predictions=model.predict(test)
submissions=pd.read_csv('sample_submission.csv')
submissions['Loan_Status']=original_test_predictions
submissions['Loan_Status'].value_counts()
submissions['Loan_Status'].replace(0, 'N',inplace=True)
submissions['Loan_Status'].replace(1, 'Y',inplace=True)
pd.DataFrame(submissions, columns=['Loan_ID','Loan_Status']).to_csv('randomforest.csv',index=False)
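# XGBoost classifier tuned with a small grid search over the learning rate (eta) and gamma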
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
xgb_model = xgb.XGBClassifier(objective="binary:logistic")
params = {
    'eta': np.arange(0.1, 0.26, 0.05),
    'gamma': [5],
}
skf = StratifiedKFold(n_splits=20, shuffle=True)
grid = GridSearchCV(xgb_model,
                    param_grid=params,
                    scoring='accuracy',
                    n_jobs=-1,
                    cv=skf.split(X, y),
                    refit=True)
grid.fit(X,y)
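# Optionally inspect the best hyperparameters and the corresponding cross-validated accuracy
print(grid.best_params_, grid.best_score_)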
xgboost_test_predictions=grid.predict(test)
xgboost_test_predictions
submissions['Loan_Status']=xgboost_test_predictions
submissions['Loan_Status'].value_counts()
submissions['Loan_Status'].replace(0, 'N',inplace=True)
submissions['Loan_Status'].replace(1, 'Y',inplace=True)
pd.DataFrame(submissions, columns=['Loan_ID','Loan_Status']).to_csv('xgboostmodel.csv',index=False)
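# Gradient boosting evaluated on a 70/30 train/validation split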
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=102)
from sklearn.ensemble import GradientBoostingClassifier
gbk=GradientBoostingClassifier()
gbk.fit(x_train,y_train)
pred_gbc=gbk.predict(x_test)
acc_gbc=accuracy_score(y_test,pred_gbc)*100
acc_gbc
gbc_test_predictions=gbk.predict(test)
submissions['Loan_Status']=gbc_test_predictions
submissions['Loan_Status'].value_counts()
submissions['Loan_Status'].replace(0, 'N',inplace=True)
submissions['Loan_Status'].replace(1, 'Y',inplace=True)
pd.DataFrame(submissions, columns=['Loan_ID','Loan_Status']).to_csv('gradientboosting.csv',index=False)