#import required libraries for dataframes and visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the challenge data set from its CSV export into a dataframe.
df = pd.read_csv(
    filepath_or_buffer='2019 Winter Data Science Intern Challenge Data Set - Sheet1.csv',
)
# Print a concise summary of the dataframe (columns, dtypes, non-null counts).
df.info()
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
# Box plots of the raw data, each column shown with and without outlier points.
# Every plot is drawn on its own new figure: sns.boxplot draws onto the current
# axes, so back-to-back calls executed together would otherwise be overlaid in
# a single chart.

# order_amount, outlier points shown
plt.figure()
sns.boxplot(y='order_amount', data=df, showfliers=True)
# order_amount, outlier points hidden
plt.figure()
sns.boxplot(y='order_amount', data=df, showfliers=False, color='#95BF47')
# total_items, outlier points shown
plt.figure()
sns.boxplot(y='total_items', data=df, showfliers=True)
# total_items, outlier points hidden
plt.figure()
sns.boxplot(y='total_items', data=df, showfliers=False, color='#95BF47')
# A closer look at the extreme outliers: orders containing more than 100 items.
df[df['total_items'] > 100].head()
# A closer look at orders above $10000 that were NOT already matched by the
# filter above. The item condition is the exact complement of that filter
# (<= 100 rather than a hard-coded < 2000), so no row can appear in both views.
df[(df['order_amount'] > 10000) & (df['total_items'] <= 100)].head()
# Create a new dataframe that excludes the extreme outliers for further analysis.
# Outliers were inspected as total_items > 100 or order_amount > 10000, so the
# rows kept here use the complementary, inclusive bounds; with strict "<" a row
# sitting exactly on a threshold would be dropped without ever being treated
# as an outlier.
# .copy() makes new_df an independent dataframe rather than a slice of df, so
# later modifications to it do not trigger SettingWithCopyWarning.
new_df = df[(df['total_items'] <= 100) & (df['order_amount'] <= 10000)].copy()
# Print a concise summary of the filtered dataframe to confirm the remaining row count.
new_df.info()
# Box plots of the filtered data (extreme outliers removed), outlier points shown.
# Each plot gets its own new figure: sns.boxplot draws onto the current axes,
# so consecutive calls executed together would otherwise be overlaid.

# order_amount in the filtered dataframe
plt.figure()
sns.boxplot(y='order_amount', data=new_df, showfliers=True, color='#95BF47')
# total_items in the filtered dataframe
plt.figure()
sns.boxplot(y='total_items', data=new_df, showfliers=True, color='#95BF47')
# Descriptive statistics of the filtered dataframe, for comparison with the raw data.
new_df.describe()