import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('store.csv')
df.head()
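# Optional first look at the raw data: shape and per-column missing counts.
# In the Black Friday data, only Product_Category_2 and Product_Category_3 are
# expected to contain missing values (an assumption about this particular CSV).
print(df.shape)
df.isna().sum()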
# Dropping User_ID and Product_ID
data = df.drop(['User_ID','Product_ID'],axis=1)
# Converting the variable data types to category and encoding them for modelling
data['Gender'] = data['Gender'].astype('category')
data['gender_cat'] = data['Gender'].cat.codes
data['Age'] = data['Age'].astype('category')
data['Age_cat'] = data['Age'].cat.codes
data['City_Category'] = data['City_Category'].astype('category')
data['City_Category_cat'] = data['City_Category'].cat.codes
data['Stay_In_Current_City_Years'] = data['Stay_In_Current_City_Years'].astype('category')
data['Stay_In_Current_City_Years_cat'] = data['Stay_In_Current_City_Years'].cat.codes
# Dropping all the Columns with Strings and using only their encoded versions
data = data.drop(['Gender','Age','City_Category','Stay_In_Current_City_Years'],axis=1)
data.head()
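# Alternative sketch: the same label encoding written as a loop over a fresh copy
# of df, shown only for comparison with the column-by-column cells above.
encoded = df.drop(['User_ID', 'Product_ID'], axis=1)
for col in ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']:
    encoded[col + '_cat'] = encoded[col].astype('category').cat.codes
encoded = encoded.drop(['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years'], axis=1)
encoded.head()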
# Product_Category_3 is dropped because it cannot be used to impute Product_Category_2, which is the target variable here
data_1 = data.drop(['Product_Category_3'],axis=1)
# Separating the rows with missing Product_Category_2 so they can be predicted later
test_1 = data_1[data_1.isna().any(axis=1)]
data_1 = data_1.dropna()
data_1.head()
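# Quick sanity check (optional): data_1 and test_1 should partition the frame,
# and test_1 should contain exactly the rows where Product_Category_2 is missing.
print(data_1.shape, test_1.shape)
print(test_1['Product_Category_2'].isna().all())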
# Scaling the values, excluding the target variable
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data_1.drop(['Product_Category_2'],axis=1))
scaled_features = scaler.transform(data_1.drop(['Product_Category_2'],axis=1))
# Creating dataframe with the scaled features
scaled_feat_col = ['Occupation','Marital_Status','Product_Category_1','Purchase','gender_cat','Age_cat','City_Category_cat','Stay_In_Current_City_Years_cat']
df_feat = pd.DataFrame(scaled_features,columns=scaled_feat_col)
df_feat.head()
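# Optional check: after StandardScaler every feature should have mean ~0 and std ~1.
df_feat.describe().loc[['mean', 'std']].round(3)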
# Creating a KNN model with df_feat
X = df_feat
y = data_1['Product_Category_2']
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=99)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_valid)
pred
# Calculating the accuracy score
from sklearn.metrics import accuracy_score as acc
print('Accuracy of the model is ',acc(y_valid,pred))
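# Optional, hedged sketch: a k-fold estimate is usually more stable than a single
# train/validation split. It is run on a random subsample here (the subsample size
# is an assumption), since KNN cross-validation on the full Black Friday data is slow.
from sklearn.model_selection import cross_val_score
rng = np.random.RandomState(99)
idx = rng.choice(len(X), size=min(20000, len(X)), replace=False)
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=1),
                            X.iloc[idx], y.iloc[idx], cv=5, n_jobs=-1)
print('Cross-validated accuracy (subsample): ', cv_scores.mean())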
# Elbow curve for the classifier
def Elbow(K):
    # list to collect the validation error for each value of K
    test_error = []
    # training a model for every value of K
    for i in K:
        # instance of KNN
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        # appending the validation error (1 - accuracy) calculated from the predictions
        preds = knn.predict(X_valid)
        error = 1 - acc(y_valid, preds)
        test_error.append(error)
    return test_error
# Defining the range of K values
k = range(10, 50, 5)
# Calling the function defined above
elbow_curve = Elbow(k)
# Plotting the curve
plt.plot(k, elbow_curve)
plt.xlabel('K Neighbors')
plt.ylabel('Validation error')
plt.title('Elbow Curve for Product_Category_2')
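# Alternative sketch to the hand-rolled Elbow() above: scikit-learn's validation_curve
# computes cross-validated error over the same K range. Shown as an optional cross-check
# on a subsample (the subsample size is an assumption), since KNN is slow on the full set.
from sklearn.model_selection import validation_curve
rng = np.random.RandomState(99)
idx = rng.choice(len(X_train), size=min(20000, len(X_train)), replace=False)
train_scores, cv_scores = validation_curve(
    KNeighborsClassifier(), X_train.iloc[idx], y_train.iloc[idx],
    param_name='n_neighbors', param_range=list(k), cv=3, n_jobs=-1)
plt.plot(list(k), 1 - cv_scores.mean(axis=1))
plt.xlabel('K Neighbors')
plt.ylabel('Cross-validated error')
plt.title('Validation curve (optional cross-check)')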
# Creating a KNN instance with K chosen from the elbow curve
knn = KNeighborsClassifier(n_neighbors=25)
# Fitting the model
knn.fit(X_train, y_train)
# Predicting on the validation set and calculating the accuracy score
valid_predict = knn.predict(X_valid)
valid_acc = acc(y_valid, valid_predict)
print('Validation accuracy score: ', valid_acc)
# Scaling the rows with missing Product_Category_2, reusing the scaler fitted on the training data
scaled_features = scaler.transform(test_1.drop(['Product_Category_2'], axis=1))
X_test = pd.DataFrame(scaled_features,columns=scaled_feat_col)
pred1 = knn.predict(X_test)
pred1 = pd.DataFrame(pred1,columns=['Product_Category_2'])
# Replacing the missing values in Product_Category_2 with the predicted values
gen_alt = (alt for alt in pred1['Product_Category_2'])
for count, ele in enumerate(data.Product_Category_2):
    if not pd.isna(ele):
        continue
    try:
        # .loc avoids chained-assignment issues when writing back into data
        data.loc[count, 'Product_Category_2'] = next(gen_alt)
    except StopIteration:
        break
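# Equivalent vectorized form of the fill above, shown as an alternative sketch.
# It assumes pred1 keeps the row order of the missing rows in test_1; the guard
# makes it a no-op once the loop above has already filled everything.
mask = data['Product_Category_2'].isna()
if mask.sum() == len(pred1):
    data.loc[mask, 'Product_Category_2'] = pred1['Product_Category_2'].values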
data
data.dtypes
test_2 = data[data.isna().any(axis=1)]
data_2 = data.dropna()
# Scaling the Independent Variables
scaler = StandardScaler()
scaler.fit(data_2.drop(['Product_Category_3'],axis=1))
scaled_features = scaler.transform(data_2.drop(['Product_Category_3'],axis=1))
# Creating dataframe with the scaled features
scaled_feat_col = ['Occupation','Marital_Status','Product_Category_1','Product_Category_2','Purchase','gender_cat','Age_cat','City_Category_cat','Stay_In_Current_City_Years_cat']
df_feat = pd.DataFrame(scaled_features,columns=scaled_feat_col)
df_feat.head()
# Creating a KNN model with df_feat
X = df_feat
y = data_2['Product_Category_3']
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=99)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_valid)
pred
# Calculating the accuracy score
from sklearn.metrics import accuracy_score as acc
print('Accuracy of the model is ',acc(y_valid,pred))
# Elbow curve for the classifier
def Elbow(K):
    # list to collect the validation error for each value of K
    test_error = []
    # training a model for every value of K
    for i in K:
        # instance of KNN
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        # appending the validation error (1 - accuracy) calculated from the predictions
        preds = knn.predict(X_valid)
        error = 1 - acc(y_valid, preds)
        test_error.append(error)
    return test_error
# Defining the range of K values
k = range(15, 50, 5)
# Calling the function defined above
elbow_curve = Elbow(k)
# Plotting the curve
plt.plot(k, elbow_curve)
plt.xlabel('K Neighbors')
plt.ylabel('Validation error')
plt.title('Elbow Curve for Product_Category_3')
# Creating a KNN instance with K chosen from the elbow curve
knn = KNeighborsClassifier(n_neighbors=40)
# Fitting the model
knn.fit(X_train, y_train)
# Predicting on the validation set and calculating the accuracy score
valid_predict = knn.predict(X_valid)
valid_acc = acc(y_valid, valid_predict)
print('Validation accuracy score: ', valid_acc)
# Scaling the rows with missing Product_Category_3, reusing the scaler fitted on the training data
scaled_features = scaler.transform(test_2.drop(['Product_Category_3'], axis=1))
X_test = pd.DataFrame(scaled_features,columns=scaled_feat_col)
pred2 = knn.predict(X_test)
pred2 = pd.DataFrame(pred2,columns=['Product_Category_3'])
# Replacing the missing values in Product_Category_3 with the predicted values
gen_alt = (alt for alt in pred2['Product_Category_3'])
for count, ele in enumerate(data.Product_Category_3):
    if not pd.isna(ele):
        continue
    try:
        # .loc avoids chained-assignment issues when writing back into data
        data.loc[count, 'Product_Category_3'] = next(gen_alt)
    except StopIteration:
        break
data.isna().sum()
data.Product_Category_2.value_counts().sort_index()
df.Product_Category_2.value_counts().sort_index()
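# Optional visual check: compare the Product_Category_2 distribution before and
# after imputation (original df vs. the imputed data frame).
compare = pd.DataFrame({
    'before imputation': df['Product_Category_2'].value_counts().sort_index(),
    'after imputation': data['Product_Category_2'].value_counts().sort_index(),
})
compare.plot(kind='bar', figsize=(10, 4))
plt.xlabel('Product_Category_2')
plt.ylabel('Count')
plt.title('Product_Category_2 before vs. after imputation')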
# Saving the cleaned data
data.to_csv("data_clean.csv", index=False)
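# Optional: read the saved file back to confirm no missing values remain
# (assumes the 'data_clean.csv' file name used in the cell above).
pd.read_csv('data_clean.csv').isna().sum()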