Gufhtugu publication analysis

import seaborn as sns import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt import plotly as pl import lightgbm as lgb

db=pd.read_csv('Data/GP Orders - 4.csv', parse_dates=['Order Date']) db.head()

Order Numberint64

Order Statusobject

0

70978

Completed

1

70962

Completed

2

70960

Completed

3

70959

Completed

4

70958

Completed

print('Checking null values in each Columns') db.isnull().sum()

Checking null values in each Columns

print("Bestseller :",db['Book Name'].value_counts().nlargest(1,keep='all').to_frame().index.values[0]); # bestseller print("Top City :",db['City (Billing)'].value_counts().nlargest(1,keep='all').to_frame().index.values[0]); # top city for orders db['Book Name'] = db['Book Name'].fillna('انٹرنیٹ سے پیسہ کمائیں') # filling null values with most sold book db['City (Billing)'] = db['City (Billing)'].fillna('Karachi') # filling null valuse with top city display("Checking null values in each Columns") db.isnull().sum()

Bestseller : انٹرنیٹ سے پیسہ کمائیں
Top City : Karachi

db['Date'] = [d.date() for d in db['Order Date']] db['Time'] = [d.time() for d in db['Order Date']] db['Month'] = [d.month for d in db['Order Date']] db['Month_Year'] = [d.to_period("M") for d in db['Order Date']] db['Year'] = [d.year for d in db['Order Date']] db['Weekday'] = [d.week for d in db['Order Date']] db.head()

Order Numberint64

Order Statusobject

0

70978

Completed

1

70962

Completed

2

70960

Completed

3

70959

Completed

4

70958

Completed

cities=pd.read_csv('Data/pk.csv') cities.head()

cityobject

latfloat64

0

Karachi

24.86

1

Lahore

31.5497

2

Sialkot City

32.5

3

Faisalabad

31.418

4

Rawalpindi

33.6007

cities['city'] = cities['city'].replace(['Sialkot City'],'Sialkot') # Replacing names for merggin DB

geo=db['City (Billing)'].value_counts().rename_axis('City').reset_index(name='counts') clean_geo=geo.merge(cities,how='inner',left_on='City', right_on='city') clean_geo=clean_geo[["City","lat","lng","admin_name", "population_proper","counts"]] clean_geo=clean_geo.rename(columns={"lat":"Latitude","lng":"Longitude", "population_proper":"Population", "admin_name":"Province","counts":"Total Orders"}) clean_geo.head()

Cityobject

Latitudefloat64

0

Karachi

24.86

1

Lahore

31.5497

2

Islamabad

33.6989

3

Rawalpindi

33.6007

4

Faisalabad

31.418

import plotly.graph_objects as go clean_geo['text'] = clean_geo['City'] + '<br> Book Sold ' + (clean_geo['Total Orders']).astype(str) # limits = [(0,99),(100,299),(300,499),(500,999),(1000,3000)] limits = [(0,3),(3,9),(9,19),(19,49),(49,3000)] colors = ["royalblue","crimson","lightseagreen","orange","lightgrey"] scale = 5000 fig = go.Figure() for i in range(len(limits)): lim = limits[i] df_sub = clean_geo[lim[0]:lim[1]] fig.add_trace(go.Scattergeo( lon = df_sub['Longitude'], lat = df_sub['Latitude'], text = df_sub['text'], marker = dict( size = df_sub['Total Orders'], color = colors[i], line_color='rgb(40,40,40)', line_width=0.5, sizemode = 'area' ), name = 'Top {0} - {1}'.format(lim[0]+1,lim[1]))) fig.update_layout( title_text = 'Total Books Sold by Gufhtugu per City', showlegend = True, legend_title="Top Books", legend_title_font_size=14, geo = dict( scope = 'asia', landcolor = 'rgb(217, 217, 217)', lonaxis = dict(range = [60.578993, 82.65129]), lataxis = dict(range = [24.407138, 36.885931]), ), ) fig.show()

month=db[['City (Billing)','Month']].value_counts().rename_axis(['City','Month']).reset_index(name='counts') month_geo=month.merge(clean_geo,how='inner',left_on='City', right_on='City').sort_values(['Month']) month_geo.head()

Cityobject

Monthint64

0

Karachi

1

318

Dera Ismail Khan

1

327

Nowshera

1

340

Lodhran

1

346

Sanghar

1

import plotly.express as px # df = px.data.gapminder() px.scatter(month_geo, x="Total Orders", y="counts", animation_frame="Month", animation_group="City", size="counts", color="Province", hover_name="City",title='Number of Books bought by cities over month', log_x=True, size_max=40, range_x=[1,3000], range_y=[0,500]) # px.update_xaxes(autorange=True)

weekday=db[['City (Billing)','Weekday']].value_counts().rename_axis(['City','Week']).reset_index(name='counts') weekday_geo=weekday.merge(clean_geo,how='inner',left_on='City', right_on='City').sort_values(['Week'])

px.scatter(weekday_geo, x="Total Orders", y="counts", animation_frame="Week", animation_group="City", size="counts", color="Province", hover_name="City",title='Number of Books bought by cities over Week', log_x=True, size_max=40, range_x=[1,3000], range_y=[0,200])

# sns.set_theme(style="whitegrid", palette="muted") total_month=db['Month'].value_counts().rename_axis(['Month']).reset_index(name='counts') sns.set(rc={'figure.figsize':(12,8)},style="whitegrid", palette="muted") # Draw a categorical scatterplot to show each observation ax = sns.barplot(data=total_month, x="Month", y="counts",palette='CMRmap') ax.set(ylabel="",title="Number of Books sold per month",); # ax.legend(loc='upper right', bbox_to_anchor=(0.3, 1), ncol=1);

db=db.merge(cities,how='left',left_on='City (Billing)', right_on='city').set_index('Order Number')

total=db[["Order Status","Book Name","Date","Time","City (Billing)", "lat","lng","population_proper","admin_name","Month"]] total=total.rename(columns={"Order Status":"Status","Book Name":"Book","City (Billing)":"City", "lat":"Latitude","lng":"Longitude","population_proper":"Population","admin_name":"Province"}) total.head()

Statusobject

Bookobject

70978

Completed

Column Nigari Kese Sekhain

70962

Completed

انٹرنیٹ سے پیسہ کمائیں

70960

Completed

Justju ka safar-1/Waqfa e Pareshani

70959

Completed

R ka Taaruf

70958

Completed

انٹرنیٹ سے پیسہ کمائیں

import matplotlib as mpl ax = sns.histplot(data=total, x="Month", hue="Province", multiple="stack", palette="rocket", edgecolor=".3",binwidth=1,kde=True, linewidth=.5) ax.set(ylabel="",title="Number of Books sold to province per month") ax.set_xticklabels([ "",'Feb', 'Apr','Jun','Aug','Oct','Dec']);

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:8: UserWarning:

FixedFormatter should only be used together with FixedLocator

status=db[['Order Status','Month']].value_counts().rename_axis(['Status','Month']).reset_index(name='counts') pal = dict(Completed="#6495ED", Returned="#F08080",Canceled="#90ee90") # Show the survival probability as a function of age and sex g = sns.lmplot(x="Month", y="counts", col="Status", hue="Status", data=status, palette=pal, y_jitter=.02, logistic=False, truncate=True,);

status_pro=total[['Status','Month','Province']].value_counts().rename_axis(['Status','Month','Province']).reset_index(name='counts') g = sns.relplot( data=status_pro, x="Month", y="counts", hue="Status", size="Province", palette=pal, sizes=(10, 200),alpha=0.9,height=8,aspect=1.2 ) g.set( yscale="log") g.set(ylabel="",title="Status of Books sold to Province per month") g.despine(left=True, bottom=True);

from catboost import CatBoostRegressor,Pool from sklearn.model_selection import train_test_split from sklearn.preprocessing import OneHotEncoder

train=db[['Order Status','Year','admin_name','City (Billing)','Weekday']].value_counts().rename_axis(['Status','Year','Province','City','Week']).reset_index(name='counts')

train[['Year','Week']]=train[['Year','Week']].astype('int')

DataX = train.drop(columns=['counts']) Datay = train['counts'].values x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3)

train_pool = Pool(x_train, y_train, cat_features=['Status','Province','City']) test_pool = Pool(x_val, y_val, cat_features=['Status','Province','City'])

model=CatBoostRegressor(iterations=1000, learning_rate=0.001, depth=12, eval_metric='RMSE', random_seed = 23, od_type='Iter', metric_period = 100, od_wait=100) # catboost is used for avoiding over fitting

model.fit(train_pool, eval_set=test_pool, use_best_model=True, verbose=False);

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

predict=model.predict(test_pool)

res = model.calc_feature_statistics(train_pool, feature=1)

import shap shap.initjs()

/shared-libs/python3.7/py/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning:

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(x_train)

shap.dependence_plot("Week", shap_values, x_train)

shap.summary_plot(shap_values, x_train)

x=range(len(predict)) y=y_val ax=sns.lineplot(x=x,y=y,label='Train',color='#9FD1FF') ax=sns.lineplot(x=x,y=predict,label='Predicted',color='#FF8B8B') ax.set(title='Trainig and predicted by Weekday data over the time');

Status ='Completed' Year = 2021 Province = 'Punjab' City = 'Lahore' Week= 20 data1 = [Status,Year,Province,City,Week] test1 = model.predict(data1)

print(int(test1),"Books Order",Status,"in week",Week,"-",Year," from ",City,",",Province,"." )

9 Books Order Completed in week 20 - 2021  from  Lahore , Punjab .

Status =['Completed','Completed','Completed','Returned'] Year = [2021,2021,2022,2021] Province = ['Punjab','Punjab','Sindh','Punjab'] City = ['Lahore','Bahawalpur','Karachi','Lahore'] Week= [20,10,13,14] data2 = pd.DataFrame(np.transpose([Status,Year,Province,City,Week]), columns=['Status','Year','Province','City','Week']) test2 = model.predict(data2).astype('int')

for i in range(len(test2)): print(i+1,')',test2[i],"Books Order",data2.Status[i],"in","in week",data2.Week[i],"-",data2.Year[i], " from ",data2.City[i],",",data2.Province[i],".\n" )

1 ) 9 Books Order Completed in in week 20 - 2021  from  Lahore , Punjab .

2 ) 3 Books Order Completed in in week 10 - 2021  from  Bahawalpur , Punjab .

3 ) 14 Books Order Completed in in week 13 - 2022  from  Karachi , Sindh .

4 ) 4 Books Order Returned in in week 14 - 2021  from  Lahore , Punjab .

Week=DataX Week['Year']=Week['Year'].replace([2019,2020],[2021,2022]) ypred=model.predict(Week) Week['counts']=ypred.astype('int') Totaldf=pd.concat([train, Week], axis=0) Totaldf.shape

ax=sns.lineplot(x='Week',y='counts',hue='Year',data=Totaldf,palette='Set2') ax.set( yscale="log") ax.set(title='Trainig and predicted by Weekday data over the time');

train1=db[['Order Status','Month','Year','admin_name','City (Billing)']].value_counts().rename_axis(['Status','Month','Year','Province','City']).reset_index(name='counts') train1[['Year','Month']]=train1[['Year','Month']].astype('int') DataX = train1.drop(columns=['counts']) Datay = train1['counts'].values x_train, x_val, y_train, y_val = train_test_split(DataX, Datay, test_size=0.3) train_pool = Pool(x_train, y_train, cat_features=['Status','Province','City']) test_pool = Pool(x_val, y_val, cat_features=['Status','Province','City'])

model=CatBoostRegressor(iterations=10000, learning_rate=0.001, depth=12, eval_metric='RMSE', random_seed = 23, od_type='Iter', metric_period = 100, od_wait=100) model.fit(train_pool, eval_set=test_pool, use_best_model=True, verbose=False);

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

Execution error

KernelInterrupted: Execution interrupted by the Jupyter kernel.

predict=model.predict(test_pool) res = model.calc_feature_statistics(train_pool, feature=1, )

x=range(len(predict)) y=y_val ax=sns.lineplot(x=x,y=y,label='Train',color='#9FD1FF') ax=sns.lineplot(x=x,y=predict,label='Predicted',color='#FF8B8B') ax.set(title='Trainig and predicted by Month data over the time');

Make = train1.drop(columns=['counts']) Week=Make Week['Year']=Week['Year'].replace([2019,2020],[2021,2022]) ypred=model.predict(Week) Week['counts']=ypred.astype('int') Total=pd.concat([train1, Week], axis=0) ax=sns.lineplot(x='Month',y='counts',hue='Year',data=Total,palette='Set2') ax.set( yscale="log") ax.set(title='Trainig and predicted by Month data over the time');

from sklearn.linear_model import LinearRegression ReadDb=pd.read_csv('Data/GP Orders - 5.csv', parse_dates=['Order Date & Time']) ReadDb['Month'] = [d.month for d in ReadDb['Order Date & Time']] ReadDb['Month_Year'] = [d.to_period("M") for d in ReadDb['Order Date & Time']] ReadDb['Year'] = [d.year for d in ReadDb['Order Date & Time']] ReadDb['Weekday'] = [d.week for d in ReadDb['Order Date & Time']] Revenue=ReadDb[['Month','Year']].value_counts().rename_axis(['Month','Year']).\ reset_index(name='counts') Revenue["Sale"]=Revenue['counts']*500 Xpred=[i for i in range(2,13)] pred=pd.DataFrame() pred['Month']=Xpred pred['Year']=2021 Revenue.sort_values(by=['Year','Month'],inplace=True) reg=LinearRegression() X = Revenue.drop(columns=['counts','Sale']) y = Revenue['Sale'].values reg.fit(X,y) prediction_rev=reg.predict(pred) ax=sns.lineplot(x=list(range(16)),y=Revenue.Sale,label="Past Sale",color='#9FD1FF'); ax=sns.lineplot(x=[i for i in range(15,len(prediction_rev)+15)],y=prediction_rev, label="Future Sale",color='#FF8B8B'); ax.set(title='Predicting future sales till December 2021 Using Linear Regression ');

Year_books=db[['Book Name','Year']].value_counts().rename_axis(['Book','Year']).reset_index(name='counts')

Year2019=Year_books[Year_books['Year']==2019].nlargest(10, 'counts') Year2020=Year_books[Year_books['Year']==2020].nlargest(10, 'counts') Year2021=Year_books[Year_books['Year']==2021].nlargest(10, 'counts')

f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 10)) # Generate some sequential data sns.barplot(x='Book', y='counts', palette='Set2', ax=ax1,data=Year2019) ax1.axhline(0, color="k", clip_on=False) ax1.set_ylabel("2019") # Center the data to make it diverging sns.barplot(x='Book', y='counts', palette='Set2', ax=ax2,data=Year2020) ax2.axhline(0, color="k", clip_on=False) ax2.set_ylabel("2020") # Randomly reorder the data to make it qualitative sns.barplot(x='Book', y='counts', palette='Set2', ax=ax3,data=Year2021) ax3.axhline(0, color="k", clip_on=False) ax3.set_ylabel("2021") # Finalize the plot sns.despine(bottom=True) plt.setp(f.axes, yticks=[]) for p in ax1.patches: ax1.annotate(format(p.get_height(), '1.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 9), textcoords = 'offset points') for p in ax2.patches: ax2.annotate(format(p.get_height(), '1.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 9), textcoords = 'offset points') for p in ax3.patches: ax3.annotate(format(p.get_height(), '1.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 9), textcoords = 'offset points') ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90) ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90) ax3.set_xticklabels(ax3.get_xticklabels(),rotation=90) ax1.set_xticklabels([ "Data Science",'Kaggle', 'R ka Taaruf','(C++)','Apna Elaaj','Shaoor','Molo Masali', 'Kaggle for Begginers','CryptoCurrency','Blockchain']) ax2.set_xticklabels([ "Earn Money",'Python Programming','Product Management', 'Blockchain','Justju ka safar', 'Artificial Intelligence','Molo Masali','(C++)', 'Python Programming 2020','Sukkur To Florida']) ax3.set_xticklabels([ "Lucky Draw",'Earn Money','Column Nigari', 'Python Programming','Waqfa e Pareshani', 'Data Science','Arif Kareem','Machine Learning', 'Artificial Intelligence','Blockchain']) plt.tight_layout(h_pad=2) ax1.set(title='Best Seller in 2019'); ax2.set(title='Best Seller in 2020'); ax3.set(title='Best Seller in 2021');

Month_books=db[['Book Name','Month','Year']].value_counts().rename_axis(['Book','Month','Year']).reset_index(name='counts')

Month_2019=Month_books.sort_values(by=['counts'],ascending=False) Month_2019=Month_2019[Month_2019['Year']==2019] Range= pd.Series(range(10,13)) Top_Month=[] Top_Book=[] Top_counts=[] for x in Range: Book=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['Book'].values Month=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['Month'].values Counts=Month_2019[Month_2019['Month']==x].nlargest(1,'counts')['counts'].values Top_Book.append(Book) Top_Month.append(Month) Top_counts.append(Counts) Top_Month=[['Oct','Nov','Dec']] Top_Book=np.transpose(Top_Book) Top_counts=np.transpose(Top_counts) Total=pd.DataFrame(np.concatenate([Top_Month,Top_Book,Top_counts]).transpose(), columns=['Month','Book','Sale']) Total

ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h' ,dodge=False) # ax4.set_yticklabels(ax4.get_xticklabels(),rotation=90) ax4.set_yticklabels([ "Kaggle for Begginers",'Apna Elaaj Khud Karay','Data Science']) sns.despine(bottom=True) plt.setp(f.axes, yticks=[]); ax4.set(title='Bestseller by Month in 2019 ');

Month_2020=Month_books.sort_values(by=['counts'],ascending=False) Month_2020=Month_2020[Month_2020['Year']==2020] Range= pd.Series(range(1,13)) Top_Month=[] Top_Book=[] Top_counts=[] for x in Range: Book=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['Book'].values Month=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['Month'].values Counts=Month_2020[Month_2020['Month']==x].nlargest(1,'counts')['counts'].values Top_Book.append(Book) Top_Month.append(Month) Top_counts.append(Counts) Top_Month=[['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] Top_Book=np.transpose(Top_Book) Top_counts=np.transpose(Top_counts) Total=pd.DataFrame(np.concatenate([Top_Month,Top_Book,Top_counts]).transpose(), columns=['Month','Book','Sale']) Total

ax4=sns.barplot(data=Total,y='Book',x='Sale',hue='Month',palette='Set2',orient='h' ,dodge=False,alpha=0.9) # ax4.set_yticklabels(ax4.get_xticklabels(),rotation=90) ax4.set_yticklabels([ "C++",'Data Science','Justju ka safar-1','Artificial Intelligence', 'Product Management','Python Programming','Earn Money From Internet']) sns.despine(bottom=True) plt.setp(f.axes, yticks=[]); ax4.set(title='Bestseller by Month in 2020 ');

Month_2021=Month_books.sort_values(by=['counts'],ascending=False) Month_2021=Month_2021[Month_2021['Year']==2021] Total=Month_2021[Month_2021['Month']==1].nlargest(1,'counts') Total