# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 12 features and 1 label
# 6497 rows (wines)
df = pd.read_csv('wine_fraud.csv')
df.shape
df.describe()
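# quick integrity check (assumption: this wine fraud file has no missing values;
# this simply verifies the dtypes and null counts before we go further)
df.info()
df.isnull().sum()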
# 'quality' holds string labels ('Legit'/'Fraud'), so we will map it to a numeric column before checking correlations
df['quality'].unique()
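# how imbalanced are the labels? value_counts quantifies what the countplot below shows
df['quality'].value_counts()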
# first, there are more white wines than red wines in the data set
# second, fraudulent wines make up only a small fraction of the whole data set
plt.figure(figsize=(10,8))
sns.countplot(x='type',data=df,hue='quality');
# Number of red wines
len(df[df['type']=='red'])
# Number of white wines
len(df[df['type']=='white'])
# percentage of fraudulent red wines
# (building the mask on the filtered frame avoids the index-alignment warning
# that comes from applying a full-frame boolean mask to a subset)
red = df[df['type']=='red']
len(red[red['quality']=='Fraud'])/len(red)*100
# percentage of fraudulent white wines
white = df[df['type']=='white']
len(white[white['quality']=='Fraud'])/len(white)*100
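# the same two percentages in one line; normalize=True turns counts into fractions
df.groupby('type')['quality'].value_counts(normalize=True)*100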
# first we need a numerical target column
df['fraud'] = df['quality'].map({'Legit':0,'Fraud':1}) # 1 if fraud
# now we can look at correlation easily
# heatmap of correlations
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(numeric_only=True),cmap='mako',annot=True); # numeric_only avoids errors from the string columns in newer pandas
# barplot of correlations
corr = df.corr(numeric_only=True)['fraud'].sort_values(ascending=True)[:-1] # drop 'fraud' itself (correlation 1.0)
plt.figure(figsize=(12,8))
sns.barplot(x=corr.index,y=corr)
plt.xticks(rotation=90);
# we need to prepare the data before splitting and scaling
# first, drop the helper 'fraud' column, since it duplicates the 'quality' target
df = df.drop('fraud',axis=1)
# then one-hot encode 'type'; drop_first leaves a single 0/1 'white' indicator
df['type'] = pd.get_dummies(df['type'],drop_first=True)['white'].astype(int)
# set X and y
X = df.drop('quality',axis=1)
y = df['quality']
X.shape, y.shape
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
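# sanity check: after StandardScaler the training features should have
# (approximately) zero mean and unit variance; the test set is only transformed,
# never fit, to avoid data leakage
scaled_X_train.mean(axis=0).round(2), scaled_X_train.std(axis=0).round(2)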
# base SVC
# we set class_weight = 'balanced' since we observed far more legit wines than fraud wines
from sklearn.svm import SVC
svc = SVC(class_weight='balanced',random_state=42)
# svc with grid search
from sklearn.model_selection import GridSearchCV
# parameter grid to explore different hyperparameters
param_grid = {'C':[0.01,0.1,0.5,1],'gamma':['scale','auto'],'kernel':['linear','rbf']}
# grid model
grid = GridSearchCV(estimator=svc,param_grid=param_grid)
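# note: GridSearchCV scores with plain accuracy by default; with so few fraud
# cases a recall- or F1-oriented scorer may be preferable, e.g. (untested alternative):
# grid = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1_macro')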
# fit the model
grid.fit(scaled_X_train,y_train)
# what are the best parameters?
# this run returns C=1, gamma='auto', kernel='rbf'
grid.best_estimator_.get_params()
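# best_params_ lists only the tuned hyperparameters, which is easier to read
# than the full get_params() dump above
grid.best_params_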
# now create a final model with the best parameters, keeping class_weight='balanced'
# (the grid searched a balanced SVC, so the final model should stay balanced too)
final_model = SVC(C=1,gamma='auto',kernel='rbf',class_weight='balanced',random_state=42)
# fit to train data
final_model.fit(scaled_X_train,y_train)
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix, classification_report
# model prediction
svc_pred = final_model.predict(scaled_X_test)
# confusion matrix
cm = confusion_matrix(y_test,svc_pred)
ConfusionMatrixDisplay(cm,display_labels=final_model.classes_).plot();
# classification report
print(classification_report(y_test,svc_pred))
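# with imbalanced classes, recall on the fraud class is more informative than raw
# accuracy; a small supplementary check (pos_label must match the string label 'Fraud')
from sklearn.metrics import recall_score
recall_score(y_test, svc_pred, pos_label='Fraud')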
# class balance across the whole data set: legit wines dwarf fraud wines
sns.countplot(x='quality',data=df);
print(f"Percentage of Fraud Wine is {len(df[df['quality']=='Fraud'])/len(df)*100:.2f}%")