# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 12 features and 1 label
# 6497 rows (wines)
df = pd.read_csv('wine_fraud.csv')
df.shape
df.describe()
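# quick integrity check (assumption: this wine fraud file has no missing values;
# this simply verifies the dtypes and null counts before we go further)
df.info()
df.isnull().sum()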
# 'quality' holds string labels ('Legit'/'Fraud'), so we will map it to a numeric column before checking correlations
df['quality'].unique()
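# how imbalanced are the labels? value_counts quantifies what the countplot below shows
df['quality'].value_counts()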
# first, there are more white wines than red wines in the data set
# second, fraudulent wines make up only a small fraction of the whole data set
plt.figure(figsize=(10,8))
sns.countplot(x='type',data=df,hue='quality');
# Number of red wines
len(df[df['type']=='red'])
# Number of white wines
len(df[df['type']=='white'])
# percentage of fraudulent red wines
# (building the mask on the filtered frame avoids the index-alignment warning
# that comes from applying a full-frame boolean mask to a subset)
red = df[df['type']=='red']
len(red[red['quality']=='Fraud'])/len(red)*100
# percentage of fraudulent white wines
white = df[df['type']=='white']
len(white[white['quality']=='Fraud'])/len(white)*100
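# the same two percentages in one line; normalize=True turns counts into fractions
df.groupby('type')['quality'].value_counts(normalize=True)*100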
# first we need a numerical target column
df['fraud'] = df['quality'].map({'Legit':0,'Fraud':1}) # 1 if fraud
# now we can look at correlation easily
# heatmap of correlations
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(numeric_only=True),cmap='mako',annot=True); # numeric_only avoids errors from the string columns in newer pandas
# barplot of correlations
corr = df.corr(numeric_only=True)['fraud'].sort_values(ascending=True)[:-1] # drop 'fraud' itself (correlation 1.0)
plt.figure(figsize=(12,8))
sns.barplot(x=corr.index,y=corr)
plt.xticks(rotation=90);
# we need to prepare the data before splitting and scaling
# first, drop the helper 'fraud' column, since it duplicates the 'quality' target
df = df.drop('fraud',axis=1)
# then one-hot encode 'type'; drop_first leaves a single 0/1 'white' indicator
df['type'] = pd.get_dummies(df['type'],drop_first=True)['white'].astype(int)
# set X and y
X = df.drop('quality',axis=1)
y = df['quality']
X.shape, y.shape
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
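# sanity check: after StandardScaler the training features should have
# (approximately) zero mean and unit variance; the test set is only transformed,
# never fit, to avoid data leakage
scaled_X_train.mean(axis=0).round(2), scaled_X_train.std(axis=0).round(2)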
# base SVC
# we set class_weight = 'balanced' since we observed far more legit wines than fraud wines
from sklearn.svm import SVC
svc = SVC(class_weight='balanced',random_state=42)
# svc with grid search
from sklearn.model_selection import GridSearchCV
# parameter grid to explore different hyperparameters
param_grid = {'C':[0.01,0.1,0.5,1],'gamma':['scale','auto'],'kernel':['linear','rbf']}
# grid model
grid = GridSearchCV(estimator=svc,param_grid=param_grid)
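# note: GridSearchCV scores with plain accuracy by default; with so few fraud
# cases a recall- or F1-oriented scorer may be preferable, e.g. (untested alternative):
# grid = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1_macro')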
# fit the model
grid.fit(scaled_X_train,y_train)
# what are the best parameters?
# this run returns C=1, gamma='auto', kernel='rbf'
grid.best_estimator_.get_params()
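# best_params_ lists only the tuned hyperparameters, which is easier to read
# than the full get_params() dump above
grid.best_params_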
# now create a final model with the best parameters, keeping class_weight='balanced'
# (the grid searched a balanced SVC, so the final model should stay balanced too)
final_model = SVC(C=1,gamma='auto',kernel='rbf',class_weight='balanced',random_state=42)
# fit to train data
final_model.fit(scaled_X_train,y_train)
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix, classification_report
# model prediction
svc_pred = final_model.predict(scaled_X_test)
# confusion matrix
cm = confusion_matrix(y_test,svc_pred)
ConfusionMatrixDisplay(cm,display_labels=final_model.classes_).plot();
# classification report
print(classification_report(y_test,svc_pred))
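# with imbalanced classes, recall on the fraud class is more informative than raw
# accuracy; a small supplementary check (pos_label must match the string label 'Fraud')
from sklearn.metrics import recall_score
recall_score(y_test, svc_pred, pos_label='Fraud')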
# class balance across the whole data set: legit wines dwarf fraud wines
sns.countplot(x='quality',data=df);
print(f"Percentage of Fraud Wine is {len(df[df['quality']=='Fraud'])/len(df)*100:.2f}%")