import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve
from xgboost import XGBClassifier, XGBRFClassifier
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
''' reading data '''
# Load the Social Network Ads dataset from the current working directory.
df = pd.read_csv('Social_Network_Ads.csv')
''' displaying first 5 rows '''
# The bare expressions below are notebook-style display statements; they
# only render output in an interactive (Jupyter/IPython) session.
df.head()
''' shape of data '''
df.shape
''' checking null values '''
df.isnull().sum()
''' describing data '''
df.describe()
''' value count of purchased '''
# Class balance of the target column — presumably 0 = not purchased,
# 1 = purchased; confirm against the CSV.
purch = df.Purchased.value_counts()
''' barplot '''
# Bar chart of the target's class counts.
plt.figure(figsize=(8, 5))
# seaborn >= 0.12 removed positional x/y arguments; pass them by keyword.
sns.barplot(x=purch.index, y=purch.values);
plt.xlabel('Purchased', fontsize=15)  # fixed 'Purchassed' typo in the label
plt.ylabel('Count', fontsize=15);
''' distribution plot '''
# sns.distplot is deprecated (removed in seaborn 0.14). histplot with a
# KDE overlay and density normalisation reproduces the same view.
plt.figure(figsize=(10, 5))
sns.histplot(df['Age'], kde=True, stat='density')
plt.xlabel('Age', fontsize=15)
plt.ylabel('Density', fontsize=15);
''' distribution plot of EstimatedSalary '''
# sns.distplot is deprecated (removed in seaborn 0.14); use histplot with
# a KDE overlay and density normalisation instead.
plt.figure(figsize=(10, 5))
sns.histplot(df['EstimatedSalary'], kde=True, stat='density')
plt.xlabel('EstimatedSalary', fontsize=15)
plt.ylabel('Density', fontsize=15);
''' scatter plot '''
# Age vs. salary, coloured by purchase outcome.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x='Age', y='EstimatedSalary', hue='Purchased', s=100)
# Set title/labels AFTER plotting: seaborn overwrites axis labels set
# beforehand, so the original fontsize-15 labels were being discarded.
plt.title('Plot of Estimated Salary vs Age vs Purchased', fontsize=20)
plt.xlabel('Age', fontsize=15)
plt.ylabel('EstimatedSalary', fontsize=15);
''' correlation matrix '''
# Heatmap of pairwise feature correlations.
plt.figure(figsize=(10, 10))
# numeric_only=True keeps this working under pandas >= 2.0 if the CSV
# carries non-numeric columns (e.g. Gender in some versions of this dataset).
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.title('Correlation Matrix', fontsize=20);  # was mislabelled 'Confusion Matrix'
''' independent and dependent variables '''
# Features are every column except the last; the target is the final column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X.head()
y.head()
''' train test split '''
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)
''' scaling '''
# Standardise features: fit the scaler's statistics on the training split
# only, then apply the same transform to the test split (avoids leakage).
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)
def scores(m, X_train, X_test, y_train, y_test):
    """Fit each candidate model and tabulate its test-set score.

    Parameters
    ----------
    m : dict
        Mapping of display name -> unfitted estimator exposing fit/score.
    X_train, X_test, y_train, y_test : array-like
        Pre-split (and pre-scaled) data.

    Returns
    -------
    pandas.DataFrame
        One row per model with a single 'Score' column, sorted ascending.
    """
    np.random.seed(42)  # pin RNG so stochastic models give repeatable scores
    accuracy = {}
    for label, estimator in m.items():
        estimator.fit(X_train, y_train)
        accuracy[label] = estimator.score(X_test, y_test)
    table = pd.Series(accuracy, name='Score').to_frame()
    return table.sort_values('Score')
# Baseline zoo of classifiers, keyed by their class name so the results
# table reads naturally. Insertion order matches the original comparison.
candidates = [
    LogisticRegression(max_iter=10000),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(eval_metric='mlogloss'),
    XGBRFClassifier(eval_metric='mlogloss'),
]
models = {type(clf).__name__: clf for clf in candidates}
results = scores(models, X_train, X_test, y_train, y_test)
results
''' barplot of results '''
# Compare the baseline models' test scores visually.
plt.figure(figsize=(10, 5))
# seaborn >= 0.12 removed positional x/y arguments; pass them by keyword.
sns.barplot(x=results.index, y=results.Score.values)
# Label AFTER plotting so seaborn does not overwrite the custom labels.
plt.xticks(rotation=90)
plt.xlabel('Model', fontsize=15)
plt.ylabel('Score', fontsize=15)
plt.title("Models and its scores", fontsize=20);
''' hyperparameter tunning '''
def rs_cv_sc(m, p, X_train, X_test, y_train, y_test):
    """Run a randomised hyperparameter search for every model in *m*.

    Parameters
    ----------
    m : dict
        Mapping of model name -> unfitted estimator.
    p : dict
        Mapping of model name -> param_distributions for that estimator.
    X_train, X_test, y_train, y_test : array-like
        Pre-split (and pre-scaled) data.

    Returns
    -------
    tuple[dict, dict]
        (test scores per model, best hyperparameters per model).
    """
    np.random.seed(42)  # pin RNG so the random search is reproducible
    test_scores = {}
    best_params = {}
    for name, estimator in m.items():
        search = RandomizedSearchCV(estimator, param_distributions=p[name],
                                    cv=5, n_iter=20, verbose=1)
        search.fit(X_train, y_train)
        test_scores[name] = search.score(X_test, y_test)
        best_params[name] = search.best_params_
    return test_scores, best_params
''' RandomSearch CV 1 '''
# Coarse first-pass search grids. Key order is preserved from the original:
# RandomizedSearchCV sampling depends on it, so reordering would change results.
lr_grid = np.linspace(0, 1, 20)  # shared learning-rate candidates
params = {
    'KNeighborsClassifier': {'n_neighbors': np.arange(1, 100, 10)},
    'XGBClassifier': {
        'learning_rate': lr_grid,
        'gamma': [0, 2, 4, 10, 20, 50],
        'max_depth': [2, 3, 6, 10, 20],
        'lambda': [0, 1],
        'alpha': [0, 0.1, 0.2, 0.5, 0.8, 1],
    },
    'XGBRFClassifier': {
        'learning_rate': lr_grid,
        'max_depth': [2, 3, 6, 10, 20],
    },
    'SVC': {
        'C': [0.1, 0.5, 1, 10, 100, 500],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4, 5, 6, 7],
    },
}
# Fresh, unfitted instances of the four tunable models, keyed by class name.
search_candidates = [
    KNeighborsClassifier(),
    XGBClassifier(eval_metric='mlogloss'),
    XGBRFClassifier(eval_metric='mlogloss'),
    SVC(),
]
models = {type(clf).__name__: clf for clf in search_candidates}
mrs, mrbs = rs_cv_sc(models, params, X_train, X_test, y_train, y_test)
mrs
mrbs
''' RandomSearch CV 2 '''
# Refined second-pass grids, narrowed around the best values from round 1.
# Key order is preserved: RandomizedSearchCV sampling depends on it.
params = {
    'KNeighborsClassifier': {'n_neighbors': np.arange(1, 15)},
    'XGBClassifier': {
        'learning_rate': [0.631578947368421],  # best value found in round 1
        'gamma': [15, 20, 25],
        'max_depth': [2],
        'lambda': [0, 1],
        'alpha': [0],
    },
    'XGBRFClassifier': {
        'learning_rate': [0.47368421052631576],  # best value found in round 1
        'max_depth': [2],
    },
    'SVC': {
        'C': [0.3, 0.4, 0.5, 0.6, 0.7],
        'kernel': ['rbf'],
        'gamma': ['scale'],
        'degree': [7, 8, 9, 10, 15, 20],
    },
}
# Fresh, unfitted instances again so round 1's fits do not leak into round 2.
refine_candidates = [
    KNeighborsClassifier(),
    XGBClassifier(eval_metric='mlogloss'),
    XGBRFClassifier(eval_metric='mlogloss'),
    SVC(),
]
models = {type(clf).__name__: clf for clf in refine_candidates}
mrs1, mrbs1 = rs_cv_sc(models, params, X_train, X_test, y_train, y_test)
mrs1
mrbs1
''' KNN '''
# Final model: KNN with the sklearn default n_neighbors=5.
# NOTE(review): the tuned n_neighbors from mrbs1 is never applied here —
# confirm whether using the default instead of the searched value is intended.
knn = KNeighborsClassifier()
''' fit on data '''
knn.fit(X_train, y_train)
''' prediction '''
preds = knn.predict(X_test)
print("Classification Report: \n", classification_report(y_test, preds))
''' confusion matrix '''
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, preds), annot=True)
plt.title('Confusion Matrix', fontsize=20);
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is the supported replacement. Imported locally so this block is self-contained.
from sklearn.metrics import RocCurveDisplay
plt.figure(figsize=(10, 5))
RocCurveDisplay.from_estimator(knn, X_test, y_test, ax=plt.gca());