!pip install pandas==1.3.3
!pip install matplotlib==3.4.2
!pip install seaborn==0.11.0
!pip install plotly==5.3.1
!pip install folium==0.12.1
!pip install geocoder==1.38.1
!pip install geopy==2.2.0
!pip install scipy==1.7.1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import folium
import geocoder
import geopy
import plotly.graph_objects as go
from scipy.stats import ttest_ind
orders = pd.read_csv('orders.csv', index_col = None)
accounts = pd.read_csv('accounts.csv',index_col = None)
sr = pd.read_csv('sales_representatives.csv',index_col = None)
regions = pd.read_csv('regions.csv',index_col = None)
for table in [orders, accounts, sr, regions]:
print(table.info())
print('\n')
# change the some of the columns names in the loaded tables
orders.rename(columns = {'occurred_at':'order_date_time','id':'order_id'}, inplace = True)
accounts.rename(columns = {'id':'account_id','sales_rep_id':'sales_person_id','name':'account'}, inplace = True)
sr.rename(columns = {'id':'sales_person_id','name':'sales_person'}, inplace = True)
regions.rename(columns = {'id':'region_id','name':'region'}, inplace = True)
# merge the 4 dataframes
df = (orders.merge(accounts, 'inner', left_on = 'account_id', right_on = 'account_id').
merge(sr, 'inner', left_on = 'sales_person_id', right_on = 'sales_person_id').
merge(regions, 'inner', left_on = 'region_id', right_on = 'region_id')
)
# delete the coluns that cotains 'id' except for the 'orders_id'
cols = [col for col in df.columns if 'id' in col and col != 'order_id']
df.drop(cols, axis = 1, inplace = True)
df.head()
# make a new coumln of 'coordinates'
df['coordinates'] = list(df[['lat', 'long']].itertuples(index=False, name=None))
# drop the 'lat' and 'long' columns
df.drop(columns = ['lat', 'long'], inplace = True)
# convert the 'order_date_time' to Datetime data type
df['order_date_time'] = pd.to_datetime(df['order_date_time'])
df.info()
# list the categorical columns
cate_cols = df.select_dtypes(include = 'object').columns
cate_cols
df['account'].value_counts()
print('Total number of distinct accounts is', len(df['account'].value_counts()))
print("Due to the large number of accounts, I'll focus on the top 10")
top10_accounts_by_count = (df['account'].
value_counts().
head(10).
to_frame().
reset_index().
rename(columns = {'index':'account','account':'transactions'})
)
print(top10_accounts_by_count)
print('\n')
# plot the values in top10_accounts_by_count
plt.figure(figsize = (18,6), facecolor = 'lightblue')
sns.barplot(x = 'account', y = 'transactions', data = top10_accounts_by_count, color ='#8d8339')
plt.title('Top 10 Accounts by Transactions', size = 22)
plt.xlabel('Accounts', size = 18)
plt.xticks(rotation = 14)
plt.ylabel('Transactions', size = 18)
plt.yticks(size =14)
plt.show()
df['website'].value_counts()
print('Total number of distinct websites is',len(df['website'].value_counts()))
print("Due to the large number of websites, I'll focus on the top 10")
top10_websites_by_count = (df['website'].
value_counts().
head(10).
to_frame().
reset_index().rename(columns = {'index':'website','website':'transactions'})
)
print(top10_websites_by_count)
print('\n')
# plot top10_websites_by_count
plt.figure(figsize = (20,6), facecolor = 'lightblue')
sns.barplot(y= 'website', x = 'transactions', data = top10_websites_by_count, orient = 'h',color ='#335EFF' )
plt.title('Top 10 Websites by Transactions', size=22)
plt.xlabel('Transactions',size =18)
plt.xticks(size = 14)
plt.ylabel('Websites',size =18)
plt.yticks(size = 14)
plt.show()
regions_count = (df['region'].value_counts().
to_frame().reset_index().
rename(columns = {'index':'region','region':'transactions'}))
print(regions_count)
print('\n')
# plot the values in regions_count
plt.figure(figsize = (18,6), facecolor = 'lightblue')
plt.suptitle('Regions by Transactions', size = 22)
plt.subplot(1,2,1)
sns.barplot(x = 'region',y= 'transactions', data = regions_count, color = '#59398d' )
plt.xlabel('Regions',size = 18)
plt.xticks(size = 14)
plt.ylabel('Transactions',size = 18)
plt.yticks(size = 14)
plt.subplot(1,2,2)
plt.pie(x = regions_count['transactions'],
labels = regions_count['region'],
startangle= 90,
autopct= '%2.1F%%')
circle = plt.Circle((0,0), 0.78, color = 'white')
get_current_figure = plt.gcf()
get_current_figure.gca().add_artist(circle)
plt.show()
df['sales_person'].value_counts()
print('Total number of distinct salespeople is',len(df['sales_person'].value_counts()))
print("Due to the large number of salespeople, I'll focus on the top 10")
sales_persons_count = df['sales_person'].\
value_counts().\
to_frame().reset_index().\
rename(columns = {'index':'sales_person','sales_person':'transactions'}).\
head(10)
print(sales_persons_count)
print('\n')
# plot the values in sales_persons_count
plt.figure(figsize = (18,6), facecolor = 'lightblue')
ax = sns.barplot(x = 'sales_person',y= 'transactions', data = sales_persons_count, color = '#8D5939' )
ax.set_xticklabels(labels = ['Earlie Schleusner','Tia Amato','Vernita Plump','Moon Torian',
'Nelle Meaux','Maren Musto','Maryanna Fiorentino','Arica Stoltzfus',
'Charles Bidwell','Gianna Dossey'], ha = 'center')
plt.title('Top 10 Sales_Person by Transactions', size = 22)
plt.xlabel('Sales_Person',size = 18)
plt.xticks(size = 14, rotation = 10)
plt.ylabel('Transactions',size = 18)
plt.yticks(size = 14)
plt.show()
df['primary_poc'].value_counts()
print('Total number of distinct primary-poc is',len(df['primary_poc'].value_counts()))
print("Due to the large number of primary-poc, I'll focus on the top 10")
priamry_poc_counts=(df['primary_poc'].value_counts().
to_frame().reset_index().
rename(columns = {'index':'primary_poc','primary_poc':'transactions'}).\
head(10)
)
print(priamry_poc_counts)
print('\n')
# plot the values in sales_persons_count
plt.figure(figsize = (18,6), facecolor = 'lightblue')
ax = sns.barplot(x = 'primary_poc',y= 'transactions', data = priamry_poc_counts, color = '#8D5939' )
ax.set_xticklabels(labels = ['Earlie Schleusner','Tia Amato','Vernita Plump','Moon Torian',
'Nelle Meaux','Maren Musto','Maryanna Fiorentino','Arica Stoltzfus',
'Charles Bidwell','Gianna Dossey'], ha = 'center')
plt.title('Top 10 Primary Contact Person by Transactions', size = 22)
plt.xlabel('Sales_Person',size = 18)
plt.xticks(size = 14, rotation = 10)
plt.ylabel('Transactions',size = 18)
plt.yticks(size = 14)
plt.show()
df['coordinates'].value_counts()
print('Total number of distinct coordinates tuples is',len(df['coordinates'].value_counts()))
print("Due to the large number of coordinates, I'll focus on the top 10")
top10_locations_count = df['coordinates'].value_counts().\
to_frame().reset_index().\
rename(columns = {'index':'coordinates','coordinates':'transactions'}).\
head(10)
top10_locations_count
# import geopy.geocoders
from geopy.geocoders import Nominatim
# initialize the geopy.geocoders
geolocator = Nominatim(user_agent="locations_count")
# make an empty list of locations
locations = []
# loop through each values in the 'oordinates'
for val in top10_locations_count['coordinates'].values:
# get the location info for each coordinate which is a dictionary
loc = geolocator.reverse(str(val[0])+","+str(val[1]))
# use the .raw method to get the dictionary assigned to the 'address' key in the loc ditionary
# use the .keys() method to get the keys of the 'address' dictionary
# check if 'city' among the keys of the 'address' dictionary
if 'city' in loc.raw['address'].keys():
# append the value of the key 'city' to 'locations' list
city = loc.raw['address']['city']
locations.append(city)
# if 'city' is not among the keys of the 'address' dictionary, check for 'county'
elif 'county' in loc.raw['address'].keys():
# assign the valu of th 'county' key to variable named county
county = loc.raw['address']['county']
# append the county value to the locations
locations.append(county)
else:
locations.append('NA')
# assign the values of locations list ot the new column 'locations' in the dataframe
top10_locations_count['locations'] = locations
top10_locations_count
# make a basic map
top10_locations_map = folium.Map(location = (42.0988316272597 ,-98.82989549371113),zoom_start=4)
# loop through th rows of top10_locations_count
for index,row in top10_locations_count.iterrows():
# add a merker for each corrdinate with popup and tooltip
folium.Marker(location = [str(row.loc['coordinates'][0]),str(row.loc['coordinates'][1])],
popup = row.loc['locations'],
tooltip = row.loc['locations']+' '+'Transactions: '+ str(row.loc['transactions'])).add_to(top10_locations_map)
print('Top 10 Locations by Transactions (Zoom in to see the 2 values of Atlanta)')
top10_locations_map
rev_cols = ['total_amt_usd','standard_amt_usd','gloss_amt_usd','poster_amt_usd']
qty_cols = ['total','standard_qty','gloss_qty','poster_qty']
for col in rev_cols:
print(col,' Statistics:')
print(df[col].describe())
print('\n\n')
plt.figure(figsize = (18,20), facecolor = 'lightblue')
for i,col in enumerate(rev_cols):
plt.suptitle('Revenue Columns Distribution', fontsize = 24)
plt.subplot(4,2,i*2+1)
#plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2)
sns.histplot(x= df[col], bins = 100)
plt.title(f'{col} Histogram', fontsize = 24)
plt.xlabel(f'{col} in $', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
plt.subplot(4,2,i*2+2)
sns.boxplot(x = df[col])
plt.title(f'{col} Boxplot', fontsize = 24)
plt.xlabel(f'{col}', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
q1 = df['total_amt_usd'].quantile(q = 0.25)
q3 = df['total_amt_usd'].quantile(q = 0.75)
iqr = q3-q1
lower_limit = q1 - 1.5*iqr
upper_limit = q3 + 1.5*iqr
df_without_outliers = df[(df['total_amt_usd'] > lower_limit) & (df['total_amt_usd'] < upper_limit)]
plt.figure(figsize = (18,20), facecolor = 'lightblue')
for i,col in enumerate(rev_cols):
plt.suptitle('Revenue Columns Distribution', fontsize = 24)
plt.subplot(4,2,i*2+1)
#plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2)
sns.histplot(x= df_without_outliers[col], bins = 30)
plt.title(f'{col} Histogram', fontsize = 24)
plt.xlabel(f'{col} in $', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
plt.subplot(4,2,i*2+2)
sns.boxplot(x = df_without_outliers[col])
plt.title(f'{col} Boxplot', fontsize = 24)
plt.xlabel(f'{col}', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
for col in qty_cols:
print(col,' Statistics:')
print(df[col].describe())
print('\n\n')
plt.figure(figsize = (18,20), facecolor = 'lightblue')
for i,col in enumerate(rev_cols):
plt.suptitle('Revenue Columns Distribution', fontsize = 24)
plt.subplot(4,2,i*2+1)
#plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2)
sns.histplot(x= df[col], bins = 100)
plt.title(f'{col} Histogram', fontsize = 24)
plt.xlabel(f'{col} in $', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
plt.subplot(4,2,i*2+2)
sns.boxplot(x = df[col])
plt.title(f'{col} Boxplot', fontsize = 24)
plt.xlabel(f'{col}', fontsize = 14)
plt.ylabel(f'Count', fontsize = 14)
plt.tight_layout(pad=5.0)
q1 = df.loc[:,['account','total_amt_usd']].\
groupby('account').\
sum().sort_values('total_amt_usd', ascending = False).\
head(10).\
rename(columns = {'total_amt_usd':'Total Revenue'})
q1.index.name = 'Account'
px.bar(q1,x = q1.index,
y = 'Total Revenue',
title = 'Top 10 Accounts by Total Revenue',
template = 'plotly_dark')
del q1
# aggregating the 'total_amt_usd' by 'coordinates'
q2 = df.loc[:,['coordinates','total_amt_usd']].\
groupby('coordinates').sum().\
sort_values('total_amt_usd', ascending = False).\
head(10)
# initialize the geopy.geocoders
geolocator = Nominatim(user_agent="top10revenuelocations")
locations = []
# loop through each values in the 'oordinates'
for val in q2.index.values:
# get the location info for each coordinate which is a dictionary
loc = geolocator.reverse(str(val[0])+","+str(val[1]))
# use the .raw method to get the dictionary assigned to the 'address' key in the loc ditionary
# use the .keys() method to get the keys of the 'address' dictionary
# check if 'city' among the keys of the 'address' dictionary
if 'city' in loc.raw['address'].keys():
# append the value of the key 'city' to 'locations' list
city = loc.raw['address']['city']
locations.append(city)
# if 'city' is not among the keys of the 'address' dictionary, check for 'county'
elif 'county' in loc.raw['address'].keys():
# assign the valu of th 'county' key to variable named county
county = loc.raw['address']['county']
# append the county value to the locations
locations.append(county)
else:
locations.append('NA')
q2['location'] = locations
q2
q2.reset_index(inplace = True)
# make a basic map
top10_revenue_locations_map = folium.Map(location = (42.0988316272597 ,-98.82989549371113),zoom_start=4)
# loop through th rows of top10_locations_count
for index,row in q2.iterrows():
# add a merker for each corrdinate with popup and tooltip
folium.Marker(location = [str(row['coordinates'][0]),str(row['coordinates'][1])],
popup = row['location'],
tooltip = row['location']+' Revenue ' +str(row['total_amt_usd'])).add_to(top10_revenue_locations_map)
top10_revenue_locations_map
q3 = df.loc[:, ['account','region','primary_poc','total_amt_usd','total']].\
groupby(['region','account','primary_poc']).\
sum()
cond1 = q3['total_amt_usd'] > 200000 ; cond2 = q3['total'] > 2000
q3 = q3.loc[(cond1) & (cond2), 'total_amt_usd'].\
reset_index().\
sort_values('total_amt_usd',ascending = False).\
rename(columns = {'total_amt_usd':'Total Amount Spent'})
q3.to_csv('sales&marketingq3.csv')
q3.head()
px.sunburst(q3,
path = ['region','account'],
values = 'Total Amount Spent',
title = 'Accounts with Total Quantity > 2000 and Total Amount Spent > 200000',
hover_name= 'primary_poc',
color = 'Total Amount Spent',
color_continuous_scale='RdBu',
template = 'plotly_dark')
q4 = df.loc[:,['standard_amt_usd','gloss_amt_usd','poster_amt_usd']].\
melt(var_name = 'Paper Type', value_name = 'Total Revenue').\
groupby('Paper Type').mean().\
reset_index().\
sort_values('Total Revenue', ascending = False)
q4['Paper Type'] = q4['Paper Type'].str.replace('_amt_usd','')
px.bar(q4, x = 'Paper Type', y = 'Total Revenue',
title = 'Average Revenue by Paper Type',
height = 500,
template = 'plotly_dark')
from scipy.stats import ttest_ind
stats, p = ttest_ind(df['standard_amt_usd'], df['gloss_amt_usd'])
print('t_score:',stats, 'p_value:',p)
stats, p = ttest_ind(df['gloss_amt_usd'], df['poster_amt_usd'])
print('t_score:',stats, 'p_value:',p)
del q2, q3,q4
q1 = df[['region','sales_person','account']].sort_values(by = 'account', ascending= True).drop_duplicates()
print(q1.head())
q1.to_csv('hrq1.csv',index = False)
del q1
q2 = df[df['region']=='Midwest'][['region','sales_person','account']].\
sort_values('account').\
drop_duplicates()
q2.to_csv('hrq2.csv')
print(q2.head())
del q2
q3 = df[df['sales_person'].str.startswith('S')][['region','sales_person','account']].\
sort_values('account').\
drop_duplicates()
q3.to_csv('hrq3.csv')
print(q3.head())
del q3
sales_person_condition = df['sales_person'].str.split(' ').str.get(1).str.startswith('K')
region_condition = df['region'] == 'Midwest'
q4 = df[sales_person_condition & region_condition][['region','sales_person','account']].\
sort_values('account').\
drop_duplicates()
print(q4.head())
q4.to_csv('hrq4.csv')
del q4
q5 = (df.groupby('region').
agg({'sales_person':'count'}).
reset_index().
rename(columns = {'sales_person':'sales_person_count'}).
sort_values('sales_person_count'))
print(q5)
del q5
q6 = df[['sales_person','account']].\
drop_duplicates().\
groupby('sales_person').\
agg({'account':'count'}).\
rename(columns = {'account':'account_count'})
print(q6[q6['account_count']>10])
q6 = len(q6[q6['account_count']>10])
print('Number of sales person worked with more than one account:', q6)
del q6
q7 = df[['sales_person','account']].\
drop_duplicates().\
groupby('sales_person').\
agg({'account':'count'}).\
rename(columns = {'account':'account_count'})
q7 = len(q7[q7['account_count']>4])
print('Number of sales person worked with more than one account:', q7)
del q7
q8 = df.groupby('sales_person').\
agg({'order_id':'count','total_amt_usd':'sum'}).\
rename(columns = {'order_id':'orders_count','total_amt_usd':'total_sales'}).\
reset_index()
q8.loc[(q8['orders_count'] > 200)|(q8['total_sales']>750000), 'sales_person_level'] = 'high'
q8.loc[(q8['orders_count'] > 150)|(q8['total_sales'] >500000),'sales_person_level'] = 'middle'
q8.loc[(q8['orders_count'] <= 150)|(q8['total_sales'] <=500000),'sales_person_level'] = 'low'
q8.sort_values('total_sales', ascending=False , inplace = True)
print(q8)
q8.to_csv('hrq8.csv')
del q8
q1 = (df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','total_amt_usd']].
set_index('order_date_time').resample('m').
sum()
)
q1.to_csv('financialq1.csv')
q1= px.line(q1, y = 'total_amt_usd',
title = 'Monthly Revenue of 2015',
template = 'plotly_dark',
height =500)
q1.update_layout(xaxis_title = 'Date',yaxis_title = 'Monthly Rev.')
del q1
# make a dataframe for monthly revenue of standard paper in 2015
q2_standard = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','standard_amt_usd']]
q2_standard = q2_standard.set_index('order_date_time').resample('m').sum()
q2_standard = q2_standard.rename(columns = {'standard_amt_usd':'revenue'})
q2_standard['paper_type'] = 'Standard Paper'
# make a dataframe for monthly revenue of gloss paper in 2015
q2_gloss = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','gloss_amt_usd']]
q2_gloss = q2_gloss.set_index('order_date_time').resample('m').sum()
q2_gloss = q2_gloss.rename(columns = {'gloss_amt_usd':'revenue'})
q2_gloss['paper_type'] = 'Gloss Paper'
# make a dataframe for monthly revenue of poster paper in 2015
q2_poster = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','poster_amt_usd']]
q2_poster = q2_poster.set_index('order_date_time').resample('m').sum()
q2_poster = q2_poster.rename(columns = {'poster_amt_usd':'revenue'})
q2_poster['paper_type'] = 'Poster Paper'
# concatenate the 3 dataframes
q2 = q2_standard.append(q2_gloss).append(q2_poster)
q2.to_csv('financialq2.csv')
# plot the resultant dataframe
q2 = px.line(q2, y = 'revenue',
title = 'Monthly Revenue of 2015',
template = 'plotly_dark',
color = 'paper_type',
height =500)
q2.update_layout(xaxis_title = 'Date',yaxis_title = 'Monthly Rev.')
del q2, q2_standard, q2_gloss, q2_poster
q3 = df.loc[df['order_date_time'].dt.year == 2016, ['order_date_time','total_amt_usd']].\
set_index('order_date_time').\
resample('M').sum().\
rename(columns = {'total_amt_usd':'monthly_revenue'})
q3['running_monthly_revenue'] = q3.expanding().sum()
q3.to_csv('financialq4.csv')
trace1 = go.Scatter(x = q3.index,y= q3['monthly_revenue'], mode ='lines', name = 'Monthly Average')
trace2 = go.Scatter(x = q3.index,y= q3['running_monthly_revenue'], mode ='lines', name = 'Running Monthly Revenue')
traces = [trace1, trace2]
fig = go.Figure(data = traces)
fig.update_layout(title='Monthly Avg & Monthly Running Revenue',
xaxis_title='Date',
yaxis_title='Amount in USD',
height = 500)
fig.layout.template = ('plotly_dark')
fig.layout.legend.x = 0.05
fig.show()
del q3
q4 = df.loc[:,['order_date_time','total_amt_usd']].\
set_index('order_date_time').\
resample('M').sum()
q4['monthly_moving_avg_of_revenue'] = q4.rolling(2).mean()
q4.rename(columns = {'total_amt_usd':'monthly_revenue'}, inplace = True)
trace1 = go.Scatter(x = q4.index,y= q4['monthly_revenue'], mode ='lines', name = 'Monthly Average')
trace2 = go.Scatter(x = q4.index,y= q4['monthly_moving_avg_of_revenue'], mode ='lines',
name = 'Monthly Moving Average of Revenue')
traces = [trace1, trace2]
fig = go.Figure(data = traces,
)
fig.update_layout(title='Monthly Avg vs. Monthly Moving Avg. of Rev.',
xaxis_title='Date',
yaxis_title='Amount in USD',
height = 500)
fig.layout.template = ('plotly_dark')
fig.layout.legend.x = 0.05
fig.show()
del q4
q5 = df.loc[:,['order_date_time','total_amt_usd']].\
set_index('order_date_time').\
resample('M').sum()
q5['change_in_monthly_revenue'] = q5['total_amt_usd'].pct_change()
fig = px.bar(q5, x = q5.index, y = 'change_in_monthly_revenue',color = 'change_in_monthly_revenue',
title = 'Change in Monthly Revenue',
template = 'plotly_dark'
)
fig.update_layout( xaxis_title='Date',
yaxis_title='Change in Revenue',
height = 500)
fig.show()
del q5