Parch&PoseyDataAnalytics

!pip install pandas==1.3.3 !pip install matplotlib==3.4.2 !pip install seaborn==0.11.0 !pip install plotly==5.3.1 !pip install folium==0.12.1 !pip install geocoder==1.38.1 !pip install geopy==2.2.0 !pip install scipy==1.7.1 import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import folium import geocoder import geopy import plotly.graph_objects as go from scipy.stats import ttest_ind

orders = pd.read_csv('orders.csv', index_col = None) accounts = pd.read_csv('accounts.csv',index_col = None) sr = pd.read_csv('sales_representatives.csv',index_col = None) regions = pd.read_csv('regions.csv',index_col = None)

for table in [orders, accounts, sr, regions]: print(table.info()) print('\n')

# change the some of the columns names in the loaded tables orders.rename(columns = {'occurred_at':'order_date_time','id':'order_id'}, inplace = True) accounts.rename(columns = {'id':'account_id','sales_rep_id':'sales_person_id','name':'account'}, inplace = True) sr.rename(columns = {'id':'sales_person_id','name':'sales_person'}, inplace = True) regions.rename(columns = {'id':'region_id','name':'region'}, inplace = True)

# merge the 4 dataframes df = (orders.merge(accounts, 'inner', left_on = 'account_id', right_on = 'account_id'). merge(sr, 'inner', left_on = 'sales_person_id', right_on = 'sales_person_id'). merge(regions, 'inner', left_on = 'region_id', right_on = 'region_id') )

# delete the coluns that cotains 'id' except for the 'orders_id' cols = [col for col in df.columns if 'id' in col and col != 'order_id'] df.drop(cols, axis = 1, inplace = True) df.head()

# make a new coumln of 'coordinates' df['coordinates'] = list(df[['lat', 'long']].itertuples(index=False, name=None)) # drop the 'lat' and 'long' columns df.drop(columns = ['lat', 'long'], inplace = True) # convert the 'order_date_time' to Datetime data type df['order_date_time'] = pd.to_datetime(df['order_date_time'])

df.info()

# list the categorical columns cate_cols = df.select_dtypes(include = 'object').columns cate_cols

df['account'].value_counts()

print('Total number of distinct accounts is', len(df['account'].value_counts())) print("Due to the large number of accounts, I'll focus on the top 10")

top10_accounts_by_count = (df['account']. value_counts(). head(10). to_frame(). reset_index(). rename(columns = {'index':'account','account':'transactions'}) ) print(top10_accounts_by_count) print('\n') # plot the values in top10_accounts_by_count plt.figure(figsize = (18,6), facecolor = 'lightblue') sns.barplot(x = 'account', y = 'transactions', data = top10_accounts_by_count, color ='#8d8339') plt.title('Top 10 Accounts by Transactions', size = 22) plt.xlabel('Accounts', size = 18) plt.xticks(rotation = 14) plt.ylabel('Transactions', size = 18) plt.yticks(size =14) plt.show()

df['website'].value_counts()

print('Total number of distinct websites is',len(df['website'].value_counts())) print("Due to the large number of websites, I'll focus on the top 10")

top10_websites_by_count = (df['website']. value_counts(). head(10). to_frame(). reset_index().rename(columns = {'index':'website','website':'transactions'}) ) print(top10_websites_by_count) print('\n') # plot top10_websites_by_count plt.figure(figsize = (20,6), facecolor = 'lightblue') sns.barplot(y= 'website', x = 'transactions', data = top10_websites_by_count, orient = 'h',color ='#335EFF' ) plt.title('Top 10 Websites by Transactions', size=22) plt.xlabel('Transactions',size =18) plt.xticks(size = 14) plt.ylabel('Websites',size =18) plt.yticks(size = 14) plt.show()

regions_count = (df['region'].value_counts(). to_frame().reset_index(). rename(columns = {'index':'region','region':'transactions'})) print(regions_count) print('\n') # plot the values in regions_count plt.figure(figsize = (18,6), facecolor = 'lightblue') plt.suptitle('Regions by Transactions', size = 22) plt.subplot(1,2,1) sns.barplot(x = 'region',y= 'transactions', data = regions_count, color = '#59398d' ) plt.xlabel('Regions',size = 18) plt.xticks(size = 14) plt.ylabel('Transactions',size = 18) plt.yticks(size = 14) plt.subplot(1,2,2) plt.pie(x = regions_count['transactions'], labels = regions_count['region'], startangle= 90, autopct= '%2.1F%%') circle = plt.Circle((0,0), 0.78, color = 'white') get_current_figure = plt.gcf() get_current_figure.gca().add_artist(circle) plt.show()

df['sales_person'].value_counts()

print('Total number of distinct salespeople is',len(df['sales_person'].value_counts())) print("Due to the large number of salespeople, I'll focus on the top 10")

sales_persons_count = df['sales_person'].\ value_counts().\ to_frame().reset_index().\ rename(columns = {'index':'sales_person','sales_person':'transactions'}).\ head(10) print(sales_persons_count) print('\n') # plot the values in sales_persons_count plt.figure(figsize = (18,6), facecolor = 'lightblue') ax = sns.barplot(x = 'sales_person',y= 'transactions', data = sales_persons_count, color = '#8D5939' ) ax.set_xticklabels(labels = ['Earlie Schleusner','Tia Amato','Vernita Plump','Moon Torian', 'Nelle Meaux','Maren Musto','Maryanna Fiorentino','Arica Stoltzfus', 'Charles Bidwell','Gianna Dossey'], ha = 'center') plt.title('Top 10 Sales_Person by Transactions', size = 22) plt.xlabel('Sales_Person',size = 18) plt.xticks(size = 14, rotation = 10) plt.ylabel('Transactions',size = 18) plt.yticks(size = 14) plt.show()

df['primary_poc'].value_counts()

print('Total number of distinct primary-poc is',len(df['primary_poc'].value_counts())) print("Due to the large number of primary-poc, I'll focus on the top 10")

priamry_poc_counts=(df['primary_poc'].value_counts(). to_frame().reset_index(). rename(columns = {'index':'primary_poc','primary_poc':'transactions'}).\ head(10) ) print(priamry_poc_counts) print('\n') # plot the values in sales_persons_count plt.figure(figsize = (18,6), facecolor = 'lightblue') ax = sns.barplot(x = 'primary_poc',y= 'transactions', data = priamry_poc_counts, color = '#8D5939' ) ax.set_xticklabels(labels = ['Earlie Schleusner','Tia Amato','Vernita Plump','Moon Torian', 'Nelle Meaux','Maren Musto','Maryanna Fiorentino','Arica Stoltzfus', 'Charles Bidwell','Gianna Dossey'], ha = 'center') plt.title('Top 10 Primary Contact Person by Transactions', size = 22) plt.xlabel('Sales_Person',size = 18) plt.xticks(size = 14, rotation = 10) plt.ylabel('Transactions',size = 18) plt.yticks(size = 14) plt.show()

df['coordinates'].value_counts()

print('Total number of distinct coordinates tuples is',len(df['coordinates'].value_counts())) print("Due to the large number of coordinates, I'll focus on the top 10")

top10_locations_count = df['coordinates'].value_counts().\ to_frame().reset_index().\ rename(columns = {'index':'coordinates','coordinates':'transactions'}).\ head(10) top10_locations_count

# import geopy.geocoders from geopy.geocoders import Nominatim # initialize the geopy.geocoders geolocator = Nominatim(user_agent="locations_count") # make an empty list of locations locations = [] # loop through each values in the 'oordinates' for val in top10_locations_count['coordinates'].values: # get the location info for each coordinate which is a dictionary loc = geolocator.reverse(str(val[0])+","+str(val[1])) # use the .raw method to get the dictionary assigned to the 'address' key in the loc ditionary # use the .keys() method to get the keys of the 'address' dictionary # check if 'city' among the keys of the 'address' dictionary if 'city' in loc.raw['address'].keys(): # append the value of the key 'city' to 'locations' list city = loc.raw['address']['city'] locations.append(city) # if 'city' is not among the keys of the 'address' dictionary, check for 'county' elif 'county' in loc.raw['address'].keys(): # assign the valu of th 'county' key to variable named county county = loc.raw['address']['county'] # append the county value to the locations locations.append(county) else: locations.append('NA') # assign the values of locations list ot the new column 'locations' in the dataframe top10_locations_count['locations'] = locations top10_locations_count

# make a basic map top10_locations_map = folium.Map(location = (42.0988316272597 ,-98.82989549371113),zoom_start=4) # loop through th rows of top10_locations_count for index,row in top10_locations_count.iterrows(): # add a merker for each corrdinate with popup and tooltip folium.Marker(location = [str(row.loc['coordinates'][0]),str(row.loc['coordinates'][1])], popup = row.loc['locations'], tooltip = row.loc['locations']+' '+'Transactions: '+ str(row.loc['transactions'])).add_to(top10_locations_map) print('Top 10 Locations by Transactions (Zoom in to see the 2 values of Atlanta)') top10_locations_map

rev_cols = ['total_amt_usd','standard_amt_usd','gloss_amt_usd','poster_amt_usd'] qty_cols = ['total','standard_qty','gloss_qty','poster_qty']

for col in rev_cols: print(col,' Statistics:') print(df[col].describe()) print('\n\n')

plt.figure(figsize = (18,20), facecolor = 'lightblue') for i,col in enumerate(rev_cols): plt.suptitle('Revenue Columns Distribution', fontsize = 24) plt.subplot(4,2,i*2+1) #plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2) sns.histplot(x= df[col], bins = 100) plt.title(f'{col} Histogram', fontsize = 24) plt.xlabel(f'{col} in $', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0) plt.subplot(4,2,i*2+2) sns.boxplot(x = df[col]) plt.title(f'{col} Boxplot', fontsize = 24) plt.xlabel(f'{col}', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0)

q1 = df['total_amt_usd'].quantile(q = 0.25) q3 = df['total_amt_usd'].quantile(q = 0.75) iqr = q3-q1 lower_limit = q1 - 1.5*iqr upper_limit = q3 + 1.5*iqr

df_without_outliers = df[(df['total_amt_usd'] > lower_limit) & (df['total_amt_usd'] < upper_limit)]

plt.figure(figsize = (18,20), facecolor = 'lightblue') for i,col in enumerate(rev_cols): plt.suptitle('Revenue Columns Distribution', fontsize = 24) plt.subplot(4,2,i*2+1) #plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2) sns.histplot(x= df_without_outliers[col], bins = 30) plt.title(f'{col} Histogram', fontsize = 24) plt.xlabel(f'{col} in $', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0) plt.subplot(4,2,i*2+2) sns.boxplot(x = df_without_outliers[col]) plt.title(f'{col} Boxplot', fontsize = 24) plt.xlabel(f'{col}', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0)

for col in qty_cols: print(col,' Statistics:') print(df[col].describe()) print('\n\n')

plt.figure(figsize = (18,20), facecolor = 'lightblue') for i,col in enumerate(rev_cols): plt.suptitle('Revenue Columns Distribution', fontsize = 24) plt.subplot(4,2,i*2+1) #plt.hist(df[col],bins = 100,color = '#335EFF',alpha = 0.75, edgecolor = 'black', linewidth=2) sns.histplot(x= df[col], bins = 100) plt.title(f'{col} Histogram', fontsize = 24) plt.xlabel(f'{col} in $', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0) plt.subplot(4,2,i*2+2) sns.boxplot(x = df[col]) plt.title(f'{col} Boxplot', fontsize = 24) plt.xlabel(f'{col}', fontsize = 14) plt.ylabel(f'Count', fontsize = 14) plt.tight_layout(pad=5.0)

q1 = df.loc[:,['account','total_amt_usd']].\ groupby('account').\ sum().sort_values('total_amt_usd', ascending = False).\ head(10).\ rename(columns = {'total_amt_usd':'Total Revenue'}) q1.index.name = 'Account' px.bar(q1,x = q1.index, y = 'Total Revenue', title = 'Top 10 Accounts by Total Revenue', template = 'plotly_dark')

del q1

# aggregating the 'total_amt_usd' by 'coordinates' q2 = df.loc[:,['coordinates','total_amt_usd']].\ groupby('coordinates').sum().\ sort_values('total_amt_usd', ascending = False).\ head(10) # initialize the geopy.geocoders geolocator = Nominatim(user_agent="top10revenuelocations") locations = [] # loop through each values in the 'oordinates' for val in q2.index.values: # get the location info for each coordinate which is a dictionary loc = geolocator.reverse(str(val[0])+","+str(val[1])) # use the .raw method to get the dictionary assigned to the 'address' key in the loc ditionary # use the .keys() method to get the keys of the 'address' dictionary # check if 'city' among the keys of the 'address' dictionary if 'city' in loc.raw['address'].keys(): # append the value of the key 'city' to 'locations' list city = loc.raw['address']['city'] locations.append(city) # if 'city' is not among the keys of the 'address' dictionary, check for 'county' elif 'county' in loc.raw['address'].keys(): # assign the valu of th 'county' key to variable named county county = loc.raw['address']['county'] # append the county value to the locations locations.append(county) else: locations.append('NA') q2['location'] = locations q2

q2.reset_index(inplace = True) # make a basic map top10_revenue_locations_map = folium.Map(location = (42.0988316272597 ,-98.82989549371113),zoom_start=4) # loop through th rows of top10_locations_count for index,row in q2.iterrows(): # add a merker for each corrdinate with popup and tooltip folium.Marker(location = [str(row['coordinates'][0]),str(row['coordinates'][1])], popup = row['location'], tooltip = row['location']+' Revenue ' +str(row['total_amt_usd'])).add_to(top10_revenue_locations_map) top10_revenue_locations_map

q3 = df.loc[:, ['account','region','primary_poc','total_amt_usd','total']].\ groupby(['region','account','primary_poc']).\ sum() cond1 = q3['total_amt_usd'] > 200000 ; cond2 = q3['total'] > 2000 q3 = q3.loc[(cond1) & (cond2), 'total_amt_usd'].\ reset_index().\ sort_values('total_amt_usd',ascending = False).\ rename(columns = {'total_amt_usd':'Total Amount Spent'}) q3.to_csv('sales&marketingq3.csv') q3.head()

px.sunburst(q3, path = ['region','account'], values = 'Total Amount Spent', title = 'Accounts with Total Quantity > 2000 and Total Amount Spent > 200000', hover_name= 'primary_poc', color = 'Total Amount Spent', color_continuous_scale='RdBu', template = 'plotly_dark')

q4 = df.loc[:,['standard_amt_usd','gloss_amt_usd','poster_amt_usd']].\ melt(var_name = 'Paper Type', value_name = 'Total Revenue').\ groupby('Paper Type').mean().\ reset_index().\ sort_values('Total Revenue', ascending = False) q4['Paper Type'] = q4['Paper Type'].str.replace('_amt_usd','') px.bar(q4, x = 'Paper Type', y = 'Total Revenue', title = 'Average Revenue by Paper Type', height = 500, template = 'plotly_dark')

from scipy.stats import ttest_ind

stats, p = ttest_ind(df['standard_amt_usd'], df['gloss_amt_usd']) print('t_score:',stats, 'p_value:',p)

stats, p = ttest_ind(df['gloss_amt_usd'], df['poster_amt_usd']) print('t_score:',stats, 'p_value:',p)

del q2, q3,q4

q1 = df[['region','sales_person','account']].sort_values(by = 'account', ascending= True).drop_duplicates() print(q1.head()) q1.to_csv('hrq1.csv',index = False) del q1

q2 = df[df['region']=='Midwest'][['region','sales_person','account']].\ sort_values('account').\ drop_duplicates() q2.to_csv('hrq2.csv') print(q2.head()) del q2

q3 = df[df['sales_person'].str.startswith('S')][['region','sales_person','account']].\ sort_values('account').\ drop_duplicates() q3.to_csv('hrq3.csv') print(q3.head()) del q3

sales_person_condition = df['sales_person'].str.split(' ').str.get(1).str.startswith('K') region_condition = df['region'] == 'Midwest' q4 = df[sales_person_condition & region_condition][['region','sales_person','account']].\ sort_values('account').\ drop_duplicates() print(q4.head()) q4.to_csv('hrq4.csv') del q4

q5 = (df.groupby('region'). agg({'sales_person':'count'}). reset_index(). rename(columns = {'sales_person':'sales_person_count'}). sort_values('sales_person_count')) print(q5) del q5

q6 = df[['sales_person','account']].\ drop_duplicates().\ groupby('sales_person').\ agg({'account':'count'}).\ rename(columns = {'account':'account_count'}) print(q6[q6['account_count']>10]) q6 = len(q6[q6['account_count']>10]) print('Number of sales person worked with more than one account:', q6) del q6

q7 = df[['sales_person','account']].\ drop_duplicates().\ groupby('sales_person').\ agg({'account':'count'}).\ rename(columns = {'account':'account_count'}) q7 = len(q7[q7['account_count']>4]) print('Number of sales person worked with more than one account:', q7) del q7

q8 = df.groupby('sales_person').\ agg({'order_id':'count','total_amt_usd':'sum'}).\ rename(columns = {'order_id':'orders_count','total_amt_usd':'total_sales'}).\ reset_index() q8.loc[(q8['orders_count'] > 200)|(q8['total_sales']>750000), 'sales_person_level'] = 'high' q8.loc[(q8['orders_count'] > 150)|(q8['total_sales'] >500000),'sales_person_level'] = 'middle' q8.loc[(q8['orders_count'] <= 150)|(q8['total_sales'] <=500000),'sales_person_level'] = 'low' q8.sort_values('total_sales', ascending=False , inplace = True) print(q8) q8.to_csv('hrq8.csv') del q8

q1 = (df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','total_amt_usd']]. set_index('order_date_time').resample('m'). sum() ) q1.to_csv('financialq1.csv') q1= px.line(q1, y = 'total_amt_usd', title = 'Monthly Revenue of 2015', template = 'plotly_dark', height =500) q1.update_layout(xaxis_title = 'Date',yaxis_title = 'Monthly Rev.')

del q1

# make a dataframe for monthly revenue of standard paper in 2015 q2_standard = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','standard_amt_usd']] q2_standard = q2_standard.set_index('order_date_time').resample('m').sum() q2_standard = q2_standard.rename(columns = {'standard_amt_usd':'revenue'}) q2_standard['paper_type'] = 'Standard Paper' # make a dataframe for monthly revenue of gloss paper in 2015 q2_gloss = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','gloss_amt_usd']] q2_gloss = q2_gloss.set_index('order_date_time').resample('m').sum() q2_gloss = q2_gloss.rename(columns = {'gloss_amt_usd':'revenue'}) q2_gloss['paper_type'] = 'Gloss Paper' # make a dataframe for monthly revenue of poster paper in 2015 q2_poster = df.loc[df['order_date_time'].dt.year == 2015,['order_date_time','poster_amt_usd']] q2_poster = q2_poster.set_index('order_date_time').resample('m').sum() q2_poster = q2_poster.rename(columns = {'poster_amt_usd':'revenue'}) q2_poster['paper_type'] = 'Poster Paper' # concatenate the 3 dataframes q2 = q2_standard.append(q2_gloss).append(q2_poster) q2.to_csv('financialq2.csv') # plot the resultant dataframe q2 = px.line(q2, y = 'revenue', title = 'Monthly Revenue of 2015', template = 'plotly_dark', color = 'paper_type', height =500) q2.update_layout(xaxis_title = 'Date',yaxis_title = 'Monthly Rev.')

del q2, q2_standard, q2_gloss, q2_poster

q3 = df.loc[df['order_date_time'].dt.year == 2016, ['order_date_time','total_amt_usd']].\ set_index('order_date_time').\ resample('M').sum().\ rename(columns = {'total_amt_usd':'monthly_revenue'}) q3['running_monthly_revenue'] = q3.expanding().sum() q3.to_csv('financialq4.csv') trace1 = go.Scatter(x = q3.index,y= q3['monthly_revenue'], mode ='lines', name = 'Monthly Average') trace2 = go.Scatter(x = q3.index,y= q3['running_monthly_revenue'], mode ='lines', name = 'Running Monthly Revenue') traces = [trace1, trace2] fig = go.Figure(data = traces) fig.update_layout(title='Monthly Avg & Monthly Running Revenue', xaxis_title='Date', yaxis_title='Amount in USD', height = 500) fig.layout.template = ('plotly_dark') fig.layout.legend.x = 0.05 fig.show()

del q3

q4 = df.loc[:,['order_date_time','total_amt_usd']].\ set_index('order_date_time').\ resample('M').sum() q4['monthly_moving_avg_of_revenue'] = q4.rolling(2).mean() q4.rename(columns = {'total_amt_usd':'monthly_revenue'}, inplace = True) trace1 = go.Scatter(x = q4.index,y= q4['monthly_revenue'], mode ='lines', name = 'Monthly Average') trace2 = go.Scatter(x = q4.index,y= q4['monthly_moving_avg_of_revenue'], mode ='lines', name = 'Monthly Moving Average of Revenue') traces = [trace1, trace2] fig = go.Figure(data = traces, ) fig.update_layout(title='Monthly Avg vs. Monthly Moving Avg. of Rev.', xaxis_title='Date', yaxis_title='Amount in USD', height = 500) fig.layout.template = ('plotly_dark') fig.layout.legend.x = 0.05 fig.show()

del q4

q5 = df.loc[:,['order_date_time','total_amt_usd']].\ set_index('order_date_time').\ resample('M').sum() q5['change_in_monthly_revenue'] = q5['total_amt_usd'].pct_change() fig = px.bar(q5, x = q5.index, y = 'change_in_monthly_revenue',color = 'change_in_monthly_revenue', title = 'Change in Monthly Revenue', template = 'plotly_dark' ) fig.update_layout( xaxis_title='Date', yaxis_title='Change in Revenue', height = 500) fig.show()

del q5