reset -fs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import *
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Load the raw CSV once. `data` is kept unmodified because some of its text
# columns are copied back into the working frame after the numeric cleanup.
csv_path = "summer-products-with-rating-and-performance_2020-08.csv"
data = pd.read_csv(csv_path)

# All cleaning below happens on this copy.
df = data.copy()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1573 entries, 0 to 1572
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 1573 non-null object
1 title_orig 1573 non-null object
2 price 1573 non-null float64
3 retail_price 1573 non-null int64
4 currency_buyer 1573 non-null object
5 units_sold 1573 non-null int64
6 uses_ad_boosts 1573 non-null int64
7 rating 1573 non-null float64
8 rating_count 1573 non-null int64
9 rating_five_count 1528 non-null float64
10 rating_four_count 1528 non-null float64
11 rating_three_count 1528 non-null float64
12 rating_two_count 1528 non-null float64
13 rating_one_count 1528 non-null float64
14 badges_count 1573 non-null int64
15 badge_local_product 1573 non-null int64
16 badge_product_quality 1573 non-null int64
17 badge_fast_shipping 1573 non-null int64
18 tags 1573 non-null object
19 product_color 1532 non-null object
20 product_variation_size_id 1559 non-null object
21 product_variation_inventory 1573 non-null int64
22 shipping_option_name 1573 non-null object
23 shipping_option_price 1573 non-null int64
24 shipping_is_express 1573 non-null int64
25 countries_shipped_to 1573 non-null int64
26 inventory_total 1573 non-null int64
27 has_urgency_banner 473 non-null float64
28 urgency_text 473 non-null object
29 origin_country 1556 non-null object
30 merchant_title 1573 non-null object
31 merchant_name 1569 non-null object
32 merchant_info_subtitle 1572 non-null object
33 merchant_rating_count 1573 non-null int64
34 merchant_rating 1573 non-null float64
35 merchant_id 1573 non-null object
36 merchant_has_profile_picture 1573 non-null int64
37 merchant_profile_picture 226 non-null object
38 product_url 1573 non-null object
39 product_picture 1573 non-null object
40 product_id 1573 non-null object
41 theme 1573 non-null object
42 crawl_month 1573 non-null object
dtypes: float64(9), int64(15), object(19)
memory usage: 528.6+ KB
df.dtypes

# Remove object columns (all of them in one drop).
object_columns = list(df.select_dtypes('object').columns)
df = df.drop(columns=object_columns)
df.head(3)

# Columns that still contain missing values, with their counts.
null_counts = df.isnull().sum()
null_counts[null_counts != 0]

# Zero-fill only the columns that have more than 40 missing values.
for column in df.columns:
    if df[column].isnull().sum() > 40:
        df[column] = df[column].fillna(0)

# Bring back the three categorical columns that are encoded later on.
for restored in ("product_variation_size_id", "origin_country", "product_color"):
    df[restored] = data[restored]
df.head(3)
# Inspect which raw size labels occur at least twice.
count = df['product_variation_size_id'].value_counts()
count[count >= 2]

# Map the spelling variants seen in the data onto canonical letter sizes.
# No canonical value is itself a key, so a single dict-based replace gives the
# same result as applying the substitutions one after another.
size_aliases = {
    'S.': 'S',
    'Size S': 'S',
    'XS.': 'XS',
    's': 'S',
    'M.': 'M',
    '2XL': 'XXL',
    'Size XS': 'XS',
    'Size-XS': 'XS',
    '4XL': 'XXXXL',
    'SIZE XS': 'XS',
    'SizeL': 'L',
    'Size-S': 'S',
    '5XL': 'XXXXXL',
    '3XL': 'XXXL',
    'S(bust 88cm)': 'S',
    'Size4XL': 'XXXXL',
    'Size -XXS': 'XXS',
    'SIZE-XXS': 'XXS',
    'Size M': 'M',
    'size S': 'S',
    'S Pink': 'S',
    'Size S.': 'S',
    'Suit-S': 'S',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form on `df[col]` is deprecated and does not
# update `df` once pandas Copy-on-Write is enabled.
df['product_variation_size_id'] = df['product_variation_size_id'].replace(size_aliases)
def size_name(size):
    """Return ``size`` if it is a standard letter size, otherwise "Others"."""
    standard_sizes = (
        "XXXS", "XXS", "XS", "S", "M", "L",
        "XL", "XXL", "XXXL", "XXXXL", "XXXXXL",
    )
    return size if size in standard_sizes else "Others"
# Missing sizes become "Others", then every non-standard label is bucketed
# into "Others" by size_name. The result is assigned back explicitly because
# replace(..., inplace=True) on a column selection is deprecated and leaves
# `df` unchanged under pandas Copy-on-Write.
df['product_variation_size_id'] = (
    df['product_variation_size_id'].fillna('Others').apply(size_name)
)
df['origin_country'].value_counts()
def origin(country):
    """Collapse the country codes VE, SG, GB and AT into "Others".

    Any other value is returned unchanged.
    """
    grouped_codes = ("VE", "SG", "GB", "AT")
    return "Others" if country in grouped_codes else country
# Missing origins become "Others", then the country codes listed in origin()
# are merged into the same bucket. The result is assigned back explicitly
# because replace(..., inplace=True) on a column selection is deprecated and
# leaves `df` unchanged under pandas Copy-on-Write.
df['origin_country'] = df['origin_country'].fillna("Others").apply(origin)
df.nunique()
# Raw colour labels grouped under the base colour they are reported as.
# NOTE(review): "navy" is grouped under green here, exactly as in the original
# chain of comparisons - confirm whether blue was intended.
_COLOR_GROUPS = {
    "green": (
        "armygreen", "khaki", "camouflage", "mintgreen", "lightgreen",
        "lightkhaki", "Army green", "army green", "darkgreen", "Green",
        "fluorescentgreen", "applegreen", "navy",
    ),
    "black": (
        "Black", "black & white", "black & blue", "coolblack",
        "black & green", "black & yellow",
    ),
    "blue": (
        "navyblue", "lightblue", "skyblue", "Blue", "darkblue",
        "navy blue", "navyblue & white", "lakeblue",
    ),
    "yellow": ("Yellow", "lightyellow", "star"),
    "white": (
        "offwhite", "White", "whitefloral", "white & black", "white & green",
    ),
    "pink": (
        "rosered", "rose", "Pink", "Rose", "pink & grey", "floral",
        "lightpink", "pink & white", "pink & black", "pink & blue",
        "dustypink",
    ),
    "red": (
        "Red", "rouge", "lightred", "coralred", "watermelonred", "Rouge",
    ),
    "orange": ("Orange", "orange-red", "apricot"),
    "brown": ("coffee",),
    "grey": ("lightgrey", "gray", "Grey", "grey"),
}

# Labels that are already a base colour and are returned as they are.
_BASE_COLORS = (
    "white", "black", "yellow", "pink", "red", "green",
    "orange", "grey", "brown", "purple", "blue", "beige",
)

# Flat label -> base colour table built once from the two definitions above.
_COLOR_LOOKUP = {
    label: base
    for base, labels in _COLOR_GROUPS.items()
    for label in labels
}
_COLOR_LOOKUP.update((base, base) for base in _BASE_COLORS)


def encoding_prod_color(name):
    """Map a raw product colour label to one of a small set of base colours.

    Matching is exact and case-sensitive. Labels that are not listed
    (including any non-string value) are reported as "other".
    """
    if not isinstance(name, str):
        return "other"
    return _COLOR_LOOKUP.get(name, "other")
# Missing colours are labelled "Other" (which encoding_prod_color reports as
# "other"), then every label is reduced to its base colour.
df['product_color'] = df['product_color'].fillna("Other").apply(encoding_prod_color)
df['product_color'].value_counts().head(50)

# One-hot encode the three categorical columns one after another, dropping the
# first level of each. The prefixes end in "_" and get_dummies inserts its own
# separator, so the resulting columns look like "Color__black".
dummy_specs = (
    ('product_color', "Color_"),
    ("product_variation_size_id", "Size_"),
    ("origin_country", "Origin_"),
)
for column, prefix in dummy_specs:
    df = pd.get_dummies(df, columns=[column], prefix=prefix, drop_first=True)
df.head()
# Splitting: `units_sold` is the regression target, every other column is a feature.
target_column = "units_sold"
y = df[target_column]
X = df.drop(columns=[target_column])
print("Shape of X is {}".format(X.shape))
print("Shape of y is {}".format(y.shape))

# Hold out 40% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)
print('Shape of training set ', X_train.shape)
print('Shape of test set ', X_test.shape)
Shape of X is (1573, 48)
Shape of y is (1573,)
Shape of training set (943, 48)
Shape of test set (630, 48)
def select_features(X_train, y_train, X_test, k='all', score_func=None):
    """Fit a SelectKBest selector on the training data and apply it to both sets.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit the selector.
    X_test : test features, transformed with the selector fitted on the
        training data only.
    k : number of top-scoring features to keep. The default ``'all'`` keeps
        every feature, which is what the original hard-coded value did.
    score_func : scoring function handed to SelectKBest. ``None`` (default)
        means ``mutual_info_regression``, the original hard-coded choice.

    Returns
    -------
    (X_train_fs, X_test_fs, fs) : the transformed training set, the
        transformed test set and the fitted selector.
    """
    if score_func is None:
        score_func = mutual_info_regression
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
# Fit the selector on the training split and transform both splits with it.
# select_features builds the selector with k='all', so no column is dropped
# and the printed shape has the same number of columns as X_train.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
print('Shape of Training set with the best features: ', X_train_fs.shape)
Shape of Training set with the best features: (943, 48)
# Positions of the selected columns within the matrix the selector was fitted on.
cols = fs.get_support(indices=True)
print('Best features in our model\n')
# Resolve the positions against X_train, the frame the selector saw. `df`
# still contains the target `units_sold`, so indexing df.columns shifts the
# names by one, lists the target as a feature and omits the last feature.
for i in cols:
    print(X_train.columns[i])
Best features in our model
price
retail_price
units_sold
uses_ad_boosts
rating
rating_count
rating_five_count
rating_four_count
rating_three_count
rating_two_count
rating_one_count
badges_count
badge_local_product
badge_product_quality
badge_fast_shipping
product_variation_inventory
shipping_option_price
shipping_is_express
countries_shipped_to
inventory_total
has_urgency_banner
merchant_rating_count
merchant_rating
merchant_has_profile_picture
Color__black
Color__blue
Color__brown
Color__green
Color__grey
Color__orange
Color__other
Color__pink
Color__purple
Color__red
Color__white
Color__yellow
Size__M
Size__Others
Size__S
Size__XL
Size__XS
Size__XXL
Size__XXS
Size__XXXL
Size__XXXS
Size__XXXXL
Size__XXXXXL
Origin__Others
# Baseline models that are compared on the same train/test split.
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=1),
    RandomForestRegressor(n_estimators=10, random_state=1),
]

metric_columns = [
    'Name', 'Train Score', 'Test Score', 'Mean Absolute Error',
    'Mean Squared Error', 'Cross Validation Score (Mean Accuracy)',
    'R2 Score',
]

# Collect one row per model and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0.
metric_rows = []
for regressor in regressors:
    regressor.fit(X_train_fs, y_train)
    y_pred = regressor.predict(X_test_fs)

    # 10-fold cross-validation on the training split, using the estimator's
    # default scorer; the mean is reported as a percentage.
    cv_scores = cross_val_score(estimator=regressor, X=X_train_fs, y=y_train, cv=10)

    metric_rows.append({
        'Name': type(regressor).__name__,
        'Train Score': regressor.score(X_train_fs, y_train),
        'Test Score': regressor.score(X_test_fs, y_test),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Cross Validation Score (Mean Accuracy)': cv_scores.mean() * 100,
        'R2 Score': r2_score(y_test, y_pred),
    })

predict_df = pd.DataFrame(metric_rows, columns=metric_columns)
predict_df
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and tree depth for the random forest,
# scored with 5-fold cross-validation on the training split.
reg = RandomForestRegressor(random_state=1)
param_grid = {
    'n_estimators': np.arange(4, 30, 2),  # 4, 6, ..., 28
    'max_depth': list(range(4, 9)),  # 4 through 8
}
CV_reg = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=5,
)
CV_reg.fit(X_train_fs, y_train)
CV_reg.best_params_
# Random forest refitted with fixed hyper-parameters.
# NOTE(review): 28 / 8 are presumably the values reported by
# CV_reg.best_params_ in an earlier run - confirm they still match.
regressor = RandomForestRegressor(n_estimators=28, random_state=1, max_depth=8)
regressor.fit(X_train_fs, y_train)
y_pred = regressor.predict(X_test_fs)

tuned_row = {
    'Name': type(regressor).__name__ + ' (after GridSearchCV)',
    'Train Score': regressor.score(X_train_fs, y_train),
    'Test Score': regressor.score(X_test_fs, y_test),
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mean_squared_error(y_test, y_pred),
    'Cross Validation Score (Mean Accuracy)': cross_val_score(
        estimator=regressor, X=X_train_fs, y=y_train, cv=10
    ).mean() * 100,
    'R2 Score': r2_score(y_test, y_pred),
}

# The metrics belong in the results table `predict_df`. The original appended
# this row to `df`, the feature frame, which corrupted it and left the tuned
# model out of the comparison. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
predict_df = pd.concat([predict_df, pd.DataFrame([tuned_row])], ignore_index=True)
from sklearn.ensemble import VotingRegressor

# Ensemble that combines a linear model with a shallow random forest.
regressor = VotingRegressor([
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=18, random_state=1, max_depth=4)),
])
regressor.fit(X_train_fs, y_train)

# Predicting test values
y_pred = regressor.predict(X_test_fs)

voting_row = {
    'Name': type(regressor).__name__,
    'Train Score': regressor.score(X_train_fs, y_train),
    'Test Score': regressor.score(X_test_fs, y_test),
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mean_squared_error(y_test, y_pred),
    'Cross Validation Score (Mean Accuracy)': cross_val_score(
        estimator=regressor, X=X_train_fs, y=y_train, cv=10
    ).mean() * 100,
    'R2 Score': r2_score(y_test, y_pred),
}

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
predict_df = pd.concat([predict_df, pd.DataFrame([voting_row])], ignore_index=True)
predict_df