#import required libraries for dataframes and visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the order records from the exported challenge spreadsheet (CSV).
df = pd.read_csv(
    '2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv'
)
# Print the schema: column names, non-null counts, dtypes and memory usage.
df.info()
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 5000 non-null int64
1 shop_id 5000 non-null int64
2 user_id 5000 non-null int64
3 order_amount 5000 non-null int64
4 total_items 5000 non-null int64
5 payment_method 5000 non-null object
6 created_at 5000 non-null object
dtypes: int64(5), object(2)
memory usage: 273.6+ KB
order_idfloat64
shop_idfloat64
count
5000
5000
mean
2500.5
50.0788
std
1443.520003
29.00611766
min
1
1
25%
1250.75
24
50%
2500.5
50
75%
3750.25
75
max
5000
100
# Box plots of the two numeric order columns, each with and without fliers.
# sns.boxplot draws onto the current Axes when no `ax` is passed, so every
# plot gets its own plt.figure(); otherwise consecutive calls executed together
# would be painted on top of one another in a single chart.

# order_amount, outlier points (fliers) shown
plt.figure()
sns.boxplot(y='order_amount', data=df, showfliers=True)
# order_amount, outlier points hidden
plt.figure()
sns.boxplot(y='order_amount', data=df, showfliers=False, color='#95BF47')
# total_items, outlier points (fliers) shown
plt.figure()
sns.boxplot(y='total_items', data=df, showfliers=True)
# total_items, outlier points hidden
plt.figure()
sns.boxplot(y='total_items', data=df, showfliers=False, color='#95BF47')
# Peek at the first few extreme-outlier orders: rows with more than 100 items.
df.loc[df['total_items'] > 100].head()
order_idint64
shop_idint64
15
16
42
60
61
42
520
521
42
1104
1105
42
1362
1363
42
# Peek at the first few orders worth more than $10000, excluding the bulk
# orders already inspected above. Those were selected with total_items > 100,
# so the complementary condition total_items <= 100 removes exactly that set
# (a `< 2000` cut-off would still let orders of 101-1999 items through).
df[(df['order_amount'] > 10000) & (df['total_items'] <= 100)].head()
order_idint64
shop_idint64
160
161
78
490
491
78
493
494
78
511
512
78
617
618
78
# Build the outlier-free frame used for the rest of the analysis.
# The bounds are inclusive so this is the exact complement of the outlier
# filters used above (total_items > 100, order_amount > 10000): a row sitting
# exactly on a threshold was never flagged as an outlier and is therefore kept.
# .copy() makes new_df an independent frame rather than a slice of df, so
# adding or modifying columns later does not trigger SettingWithCopyWarning.
new_df = df[(df['total_items'] <= 100) & (df['order_amount'] <= 10000)].copy()
# Confirm the row count and dtypes of the filtered frame.
new_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4937 entries, 0 to 4999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 4937 non-null int64
1 shop_id 4937 non-null int64
2 user_id 4937 non-null int64
3 order_amount 4937 non-null int64
4 total_items 4937 non-null int64
5 payment_method 4937 non-null object
6 created_at 4937 non-null object
dtypes: int64(5), object(2)
memory usage: 308.6+ KB
# Re-plot both columns on the outlier-free frame. Each plot gets its own
# plt.figure() because sns.boxplot draws onto the current Axes when no `ax`
# is passed, which would overlay the two boxes in one chart.

# order_amount after outlier removal, remaining fliers shown
plt.figure()
sns.boxplot(y='order_amount', data=new_df, showfliers=True, color='#95BF47')
# total_items after outlier removal, remaining fliers shown
plt.figure()
sns.boxplot(y='total_items', data=new_df, showfliers=True, color='#95BF47')
# Descriptive statistics of the numeric columns of the filtered frame.
new_df.describe()
order_idfloat64
shop_idfloat64
count
4937
4937
mean
2499.551347
49.84646546
std
1444.069407
29.06113136
min
1
1
25%
1248
24
50%
2497
50
75%
3751
74
max
5000
100