import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly as pl
import lightgbm as lgb
db=pd.read_csv('Data/GP Orders - 4.csv',
parse_dates=['Order Date'])
db.head()
Order Numberint64
Order Statusobject
0
70978
Completed
1
70962
Completed
2
70960
Completed
3
70959
Completed
4
70958
Completed
print('Checking null values in each Columns')
db.isnull().sum()
Checking null values in each Columns
print("Bestseller :",db['Book Name'].value_counts().nlargest(1,keep='all').to_frame().index.values[0]); # bestseller
print("Top City :",db['City (Billing)'].value_counts().nlargest(1,keep='all').to_frame().index.values[0]); # top city for orders
db['Book Name'] = db['Book Name'].fillna('انٹرنیٹ سے پیسہ کمائیں') # filling null values with most sold book
db['City (Billing)'] = db['City (Billing)'].fillna('Karachi') # filling null valuse with top city
display("Checking null values in each Columns")
db.isnull().sum()
Bestseller : انٹرنیٹ سے پیسہ کمائیں
Top City : Karachi
db['Date'] = [d.date() for d in db['Order Date']]
db['Time'] = [d.time() for d in db['Order Date']]
db['Month'] = [d.month for d in db['Order Date']]
db['Month_Year'] = [d.to_period("M") for d in db['Order Date']]
db['Year'] = [d.year for d in db['Order Date']]
db['Weekday'] = [d.week for d in db['Order Date']]
db.head()
Order Numberint64
Order Statusobject
0
70978
Completed
1
70962
Completed
2
70960
Completed
3
70959
Completed
4
70958
Completed
cities=pd.read_csv('Data/pk.csv')
cities.head()
cityobject
latfloat64
0
Karachi
24.86
1
Lahore
31.5497
2
Sialkot City
32.5
3
Faisalabad
31.418
4
Rawalpindi
33.6007
cities['city'] = cities['city'].replace(['Sialkot City'],'Sialkot') # Replacing names for merggin DB
geo=db['City (Billing)'].value_counts().rename_axis('City').reset_index(name='counts')
clean_geo=geo.merge(cities,how='inner',left_on='City', right_on='city')
clean_geo=clean_geo[["City","lat","lng","admin_name",
"population_proper","counts"]]
clean_geo=clean_geo.rename(columns={"lat":"Latitude","lng":"Longitude",
"population_proper":"Population",
"admin_name":"Province","counts":"Total Orders"})
clean_geo.head()
Cityobject
Latitudefloat64
0
Karachi
24.86
1
Lahore
31.5497
2
Islamabad
33.6989
3
Rawalpindi
33.6007
4
Faisalabad
31.418
import plotly.graph_objects as go
clean_geo['text'] = clean_geo['City'] + '<br> Book Sold ' + (clean_geo['Total Orders']).astype(str)
# limits = [(0,99),(100,299),(300,499),(500,999),(1000,3000)]
limits = [(0,3),(3,9),(9,19),(19,49),(49,3000)]
colors = ["royalblue","crimson","lightseagreen","orange","lightgrey"]
scale = 5000
fig = go.Figure()
for i in range(len(limits)):
lim = limits[i]
df_sub = clean_geo[lim[0]:lim[1]]
fig.add_trace(go.Scattergeo(
lon = df_sub['Longitude'],
lat = df_sub['Latitude'],
text = df_sub['text'],
marker = dict(
size = df_sub['Total Orders'],
color = colors[i],
line_color='rgb(40,40,40)',
line_width=0.5,
sizemode = 'area'
),
name = 'Top {0} - {1}'.format(lim[0]+1,lim[1])))
fig.update_layout(
title_text = 'Total Books Sold by Gufhtugu per City',
showlegend = True,
legend_title="Top Books",
legend_title_font_size=14,
geo = dict(
scope = 'asia',
landcolor = 'rgb(217, 217, 217)',
lonaxis = dict(range = [60.578993, 82.65129]),
lataxis = dict(range = [24.407138, 36.885931]),
),
)
fig.show()
month=db[['City (Billing)','Month']].value_counts().rename_axis(['City','Month']).reset_index(name='counts')
month_geo=month.merge(clean_geo,how='inner',left_on='City', right_on='City').sort_values(['Month'])
month_geo.head()
Cityobject
Monthint64
0
Karachi
1
318
Dera Ismail Khan
1
327
Nowshera
1
340
Lodhran
1
346
Sanghar
1
import plotly.express as px
# df = px.data.gapminder()
px.scatter(month_geo, x="Total Orders", y="counts", animation_frame="Month", animation_group="City",
size="counts", color="Province", hover_name="City",title='Number of Books bought by cities over month',
log_x=True, size_max=40, range_x=[1,3000], range_y=[0,500])
# px.update_xaxes(autorange=True)
weekday=db[['City (Billing)','Weekday']].value_counts().rename_axis(['City','Week']).reset_index(name='counts')
weekday_geo=weekday.merge(clean_geo,how='inner',left_on='City', right_on='City').sort_values(['Week'])
px.scatter(weekday_geo, x="Total Orders", y="counts", animation_frame="Week", animation_group="City",
size="counts", color="Province", hover_name="City",title='Number of Books bought by cities over Week',
log_x=True, size_max=40, range_x=[1,3000], range_y=[0,200])
# sns.set_theme(style="whitegrid", palette="muted")
total_month=db['Month'].value_counts().rename_axis(['Month']).reset_index(name='counts')
sns.set(rc={'figure.figsize':(12,8)},style="whitegrid", palette="muted")
# Draw a categorical scatterplot to show each observation
ax = sns.barplot(data=total_month, x="Month", y="counts",palette='CMRmap')
ax.set(ylabel="",title="Number of Books sold per month",);
# ax.legend(loc='upper right', bbox_to_anchor=(0.3, 1), ncol=1);
db=db.merge(cities,how='left',left_on='City (Billing)', right_on='city').set_index('Order Number')
total=db[["Order Status","Book Name","Date","Time","City (Billing)",
"lat","lng","population_proper","admin_name","Month"]]
total=total.rename(columns={"Order Status":"Status","Book Name":"Book","City (Billing)":"City",
"lat":"Latitude","lng":"Longitude","population_proper":"Population","admin_name":"Province"})
total.head()
Statusobject
Bookobject
70978
Completed
Column Nigari Kese Sekhain
70962
Completed
انٹرنیٹ سے پیسہ کمائیں
70960
Completed
Justju ka safar-1/Waqfa e Pareshani
70959
Completed
R ka Taaruf
70958
Completed
انٹرنیٹ سے پیسہ کمائیں
import matplotlib as mpl
ax = sns.histplot(data=total, x="Month", hue="Province",
multiple="stack",
palette="rocket",
edgecolor=".3",binwidth=1,kde=True,
linewidth=.5)
ax.set(ylabel="",title="Number of Books sold to province per month")
ax.set_xticklabels([ "",'Feb', 'Apr','Jun','Aug','Oct','Dec']);
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:8: UserWarning:
FixedFormatter should only be used together with FixedLocator
status=db[['Order Status','Month']].value_counts().rename_axis(['Status','Month']).reset_index(name='counts')
pal = dict(Completed="#6495ED", Returned="#F08080",Canceled="#90ee90")
# Show the survival probability as a function of age and sex
g = sns.lmplot(x="Month", y="counts", col="Status", hue="Status", data=status,
palette=pal, y_jitter=.02, logistic=False, truncate=True,);
status_pro=total[['Status','Month','Province']].value_counts().rename_axis(['Status','Month','Province']).reset_index(name='counts')
g = sns.relplot(
data=status_pro,
x="Month", y="counts",
hue="Status", size="Province",
palette=pal, sizes=(10, 200),alpha=0.9,height=8,aspect=1.2
)
g.set( yscale="log")
g.set(ylabel="",title="Status of Books sold to Province per month")
g.despine(left=True, bottom=True);
from catboost import CatBoostRegressor,Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
train=db[['Order Status','Year','admin_name','City (Billing)','Weekday']].value_counts().rename_axis(['Status','Year','Province','City','Week']).reset_index(name='counts')
train[['Year','Week']]=train[['Year','Week']].astype('int')
DataX = train.drop(columns=['counts'])
Datay = train['counts'].values
x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3)
train_pool = Pool(x_train,
y_train,
cat_features=['Status','Province','City'])
test_pool = Pool(x_val,
y_val,
cat_features=['Status','Province','City'])
model=CatBoostRegressor(iterations=1000,
learning_rate=0.001,
depth=12,
eval_metric='RMSE',
random_seed = 23,
od_type='Iter',
metric_period = 100,
od_wait=100) # catboost is used for avoiding over fitting
model.fit(train_pool,
eval_set=test_pool,
use_best_model=True,
verbose=False);
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
predict=model.predict(test_pool)
res = model.calc_feature_statistics(train_pool,
feature=1)
import shap
shap.initjs()
/shared-libs/python3.7/py/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning:
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_train)
shap.dependence_plot("Week", shap_values, x_train)
shap.summary_plot(shap_values, x_train)
x=range(len(predict))
y=y_val
ax=sns.lineplot(x=x,y=y,label='Train',color='#9FD1FF')
ax=sns.lineplot(x=x,y=predict,label='Predicted',color='#FF8B8B')
ax.set(title='Trainig and predicted by Weekday data over the time');
Status ='Completed'
Year = 2021
Province = 'Punjab'
City = 'Lahore'
Week= 20
data1 = [Status,Year,Province,City,Week]
test1 = model.predict(data1)
print(int(test1),"Books Order",Status,"in week",Week,"-",Year," from ",City,",",Province,"." )
9 Books Order Completed in week 20 - 2021 from Lahore , Punjab .
Status =['Completed','Completed','Completed','Returned']
Year = [2021,2021,2022,2021]
Province = ['Punjab','Punjab','Sindh','Punjab']
City = ['Lahore','Bahawalpur','Karachi','Lahore']
Week= [20,10,13,14]
data2 = pd.DataFrame(np.transpose([Status,Year,Province,City,Week]),
columns=['Status','Year','Province','City','Week'])
test2 = model.predict(data2).astype('int')
for i in range(len(test2)):
print(i+1,')',test2[i],"Books Order",data2.Status[i],"in","in week",data2.Week[i],"-",data2.Year[i],
" from ",data2.City[i],",",data2.Province[i],".\n" )
1 ) 9 Books Order Completed in in week 20 - 2021 from Lahore , Punjab .
2 ) 3 Books Order Completed in in week 10 - 2021 from Bahawalpur , Punjab .
3 ) 14 Books Order Completed in in week 13 - 2022 from Karachi , Sindh .
4 ) 4 Books Order Returned in in week 14 - 2021 from Lahore , Punjab .
Week=DataX
Week['Year']=Week['Year'].replace([2019,2020],[2021,2022])
ypred=model.predict(Week)
Week['counts']=ypred.astype('int')
Totaldf=pd.concat([train, Week], axis=0)
Totaldf.shape
ax=sns.lineplot(x='Week',y='counts',hue='Year',data=Totaldf,palette='Set2')
ax.set( yscale="log")
ax.set(title='Trainig and predicted by Weekday data over the time');
train1=db[['Order Status','Month','Year','admin_name','City (Billing)']].value_counts().rename_axis(['Status','Month','Year','Province','City']).reset_index(name='counts')
train1[['Year','Month']]=train1[['Year','Month']].astype('int')
DataX = train1.drop(columns=['counts'])
Datay = train1['counts'].values
x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3)
train_pool = Pool(x_train,
y_train,
cat_features=['Status','Province','City'])
test_pool = Pool(x_val,
y_val,
cat_features=['Status','Province','City'])
model=CatBoostRegressor(iterations=10000,
learning_rate=0.001,
depth=12,
eval_metric='RMSE',
random_seed = 23,
od_type='Iter',
metric_period = 100,
od_wait=100)
model.fit(train_pool,
eval_set=test_pool,
use_best_model=True,
verbose=False);
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
Execution error
KernelInterrupted: Execution interrupted by the Jupyter kernel.
predict=model.predict(test_pool)
res = model.calc_feature_statistics(train_pool,
feature=1,
)
x=range(len(predict))
y=y_val
ax=sns.lineplot(x=x,y=y,label='Train',color='#9FD1FF')
ax=sns.lineplot(x=x,y=predict,label='Predicted',color='#FF8B8B')
ax.set(title='Trainig and predicted by Month data over the time');
Make = train1.drop(columns=['counts'])
Week=Make
Week['Year']=Week['Year'].replace([2019,2020],[2021,2022])
ypred=model.predict(Week)
Week['counts']=ypred.astype('int')
Total=pd.concat([train1, Week], axis=0)
ax=sns.lineplot(x='Month',y='counts',hue='Year',data=Total,palette='Set2')
ax.set( yscale="log")
ax.set(title='Trainig and predicted by Month data over the time');
from sklearn.linear_model import LinearRegression
ReadDb=pd.read_csv('Data/GP Orders - 5.csv',
parse_dates=['Order Date & Time'])
ReadDb['Month'] = [d.month for d in ReadDb['Order Date & Time']]
ReadDb['Month_Year'] = [d.to_period("M") for d in ReadDb['Order Date & Time']]
ReadDb['Year'] = [d.year for d in ReadDb['Order Date & Time']]
ReadDb['Weekday'] = [d.week for d in ReadDb['Order Date & Time']]
Revenue=ReadDb[['Month','Year']].value_counts().rename_axis(['Month','Year']).\
reset_index(name='counts')
Revenue["Sale"]=Revenue['counts']*500
Xpred=[i for i in range(2,13)]
pred=pd.DataFrame()
pred['Month']=Xpred
pred['Year']=2021
Revenue.sort_values(by=['Year','Month'],inplace=True)
reg=LinearRegression()
X = Revenue.drop(columns=['counts','Sale'])
y = Revenue['Sale'].values
reg.fit(X,y)
prediction_rev=reg.predict(pred)
ax=sns.lineplot(x=list(range(16)),y=Revenue.Sale,label="Past Sale",color='#9FD1FF');
ax=sns.lineplot(x=[i for i in range(15,len(prediction_rev)+15)],y=prediction_rev,
label="Future Sale",color='#FF8B8B');
ax.set(title='Predicting future sales till December 2021 Using Linear Regression ');
Year_books=db[['Book Name','Year']].value_counts().rename_axis(['Book','Year']).reset_index(name='counts')
Year2019=Year_books[Year_books['Year']==2019].nlargest(10, 'counts')
Year2020=Year_books[Year_books['Year']==2020].nlargest(10, 'counts')
Year2021=Year_books[Year_books['Year']==2021].nlargest(10, 'counts')
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 10))
# Generate some sequential data
sns.barplot(x='Book', y='counts', palette='Set2', ax=ax1,data=Year2019)
ax1.axhline(0, color="k", clip_on=False)
ax1.set_ylabel("2019")
# Center the data to make it diverging
sns.barplot(x='Book', y='counts', palette='Set2', ax=ax2,data=Year2020)
ax2.axhline(0, color="k", clip_on=False)
ax2.set_ylabel("2020")
# Randomly reorder the data to make it qualitative
sns.barplot(x='Book', y='counts', palette='Set2', ax=ax3,data=Year2021)
ax3.axhline(0, color="k", clip_on=False)
ax3.set_ylabel("2021")
# Finalize the plot
sns.despine(bottom=True)
plt.setp(f.axes, yticks=[])
for p in ax1.patches:
ax1.annotate(format(p.get_height(), '1.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
for p in ax2.patches:
ax2.annotate(format(p.get_height(), '1.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
for p in ax3.patches:
ax3.annotate(format(p.get_height(), '1.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax3.set_xticklabels(ax3.get_xticklabels(),rotation=90)
ax1.set_xticklabels([ "Data Science",'Kaggle', 'R ka Taaruf','(C++)','Apna Elaaj','Shaoor','Molo Masali',
'Kaggle for Begginers','CryptoCurrency','Blockchain'])
ax2.set_xticklabels([ "Earn Money",'Python Programming','Product Management', 'Blockchain','Justju ka safar',
'Artificial Intelligence','Molo Masali','(C++)',
'Python Programming 2020','Sukkur To Florida'])
ax3.set_xticklabels([ "Lucky Draw",'Earn Money','Column Nigari', 'Python Programming','Waqfa e Pareshani',
'Data Science','Arif Kareem','Machine Learning',
'Artificial Intelligence','Blockchain'])
plt.tight_layout(h_pad=2)
ax1.set(title='Best Seller in 2019');
ax2.set(title='Best Seller in 2020');
ax3.set(title='Best Seller in 2021');
Month_books=db[['Book Name','Month','Year']].value_counts().rename_axis(['Book','Month','Year']).reset_index(name='counts')
Month_2019=Month_books.sort_values(by=['counts'],ascending=False)
Month_2019=Month_2019[Month_2019['Year']==2019]
Range= pd.Series(range(10,13))
Top_Month=[]
Top_Book=[]
Top_counts=[]
for x in Range:
Book=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['Book'].values
Month=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['Month'].values
Counts=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['counts'].values
Top_Book.append(Book)
Top_Month.append(Month)
Top_counts.append(Counts)
Top_Month=[['Oct','Nov','Dec']]
Top_Book=np.transpose(Top_Book)
Top_counts=np.transpose(Top_counts)
Total=pd.DataFrame(np.concatenate([Top_Month,Top_Book,Top_counts]).transpose(),
columns=['Month','Book','Sale'])
Total
ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h'
,dodge=False)
# ax4.set_yticklabels(ax4.get_xticklabels(),rotation=90)
ax4.set_yticklabels([ "Kaggle for Begginers",'Apna Elaaj Khud Karay','Data Science'])
sns.despine(bottom=True)
plt.setp(f.axes, yticks=[]);
ax4.set(title='Bestseller by Month in 2019 ');
Month_2020=Month_books.sort_values(by=['counts'],ascending=False)
Month_2020=Month_2020[Month_2020['Year']==2020]
Range= pd.Series(range(1,13))
Top_Month=[]
Top_Book=[]
Top_counts=[]
for x in Range:
Book=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['Book'].values
Month=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['Month'].values
Counts=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['counts'].values
Top_Book.append(Book)
Top_Month.append(Month)
Top_counts.append(Counts)
Top_Month=[['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']]
Top_Book=np.transpose(Top_Book)
Top_counts=np.transpose(Top_counts)
Total=pd.DataFrame(np.concatenate([Top_Month,Top_Book,Top_counts]).transpose(),
columns=['Month','Book','Sale'])
Total
ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h'
,dodge=False,alpha=0.9)
# ax4.set_yticklabels(ax4.get_xticklabels(),rotation=90)
ax4.set_yticklabels([ "C++",'Data Science','Justju ka safar-1','Artificial Intelligence',
'Product Management','Python Programming','Earn Money From Internet'])
sns.despine(bottom=True)
plt.setp(f.axes, yticks=[]);
ax4.set(title='Bestseller by Month in 2020 ');
Month_2021=Month_books.sort_values(by=['counts'],ascending=False)
Month_2021=Month_2021[Month_2021['Year']==2021]
Total=Month_2021[Month_2021['Month']==1].nlargest(1,'counts')
Total