import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the trip log into a DataFrame.
df = pd.read_csv('My Uber Drives - 2016.csv')

# Preview the first five rows.
df.head()

# Column names and the overall (rows, columns) shape.
df.columns
df.shape

# Number of missing values in each column.
df.isna().sum()

# Rows that exactly repeat an earlier row.
df.loc[df.duplicated()]

# Remove the repeated rows, keeping the first occurrence of each
# (the original author reports a single duplicate in this file).
df = df.drop_duplicates()
# Rename the columns to plain upper-case names. The assignment is positional,
# so the list has to follow the column order of the loaded file.
df.columns = ['START_DATE', 'END_DATE', 'CATEGORY', 'START', 'STOP', 'MILES', 'PURPOSE']

# Parse the trip timestamps; values that cannot be parsed become NaT.
df['START_DATE'] = pd.to_datetime(df['START_DATE'], errors='coerce')
df['END_DATE'] = pd.to_datetime(df['END_DATE'], errors='coerce')

# Drop trips whose start and end time are exactly equal (zero trip time)
# although they report a non-zero distance, which cannot be possible.
# The rows are selected by that condition instead of by hard-coded positions:
# positions shift whenever earlier rows (e.g. duplicates) are removed.
# Rows with an unparsed (NaT) timestamp never match, because NaT != NaT.
zero_time_trips = (df['START_DATE'] == df['END_DATE']) & (df['MILES'] > 0)
df.drop(df.index[zero_time_trips], inplace=True)

df.info()
df.head()
# Count plot: number of trips per CATEGORY value.
# The column is passed by keyword (x=..., data=...) rather than positionally,
# which is the calling form current seaborn releases expect.
plt.figure(figsize=(10, 5))
sns.countplot(x='CATEGORY', data=df);
# Bar plot: the ten most frequent START locations.
start_labels = df.START.value_counts().nlargest(10)
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x/y are keyword arguments: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=start_labels.index, y=start_labels.values);
plt.ylabel('Value Counts');

# Bar plot: the ten most frequent STOP locations.
stop_labels = df.STOP.value_counts().nlargest(10)
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(x=stop_labels.index, y=stop_labels.values);
plt.ylabel('Value Counts');
# Extract the calendar month number of each trip from START_DATE.
df['MONTH'] = df['START_DATE'].dt.month

# Month number -> display label. Integer keys also match float month values
# (1 == 1.0), which is what the column holds when some dates are missing.
month_label = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}
df['MONTH'] = df['MONTH'].map(month_label)
df.MONTH.unique()
df.head()

# Bar plot: number of trips per month, most frequent month first.
month_count = df.MONTH.value_counts()
plt.figure(figsize=(10, 5))
# x/y are keyword arguments: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=month_count.index, y=month_count.values);
plt.xlabel('Months')
plt.ylabel('Value Counts');
# Group the trip distances into 10-mile buckets. Every bucket includes its
# lower edge and excludes its upper edge; the first bucket takes everything
# below 10 and the last one everything from 50 upwards.
miles_bin_edges = [-np.inf, 10, 20, 30, 40, 50, np.inf]
miles_bin_names = [
    '0-10 miles', '10-20 miles', '20-30 miles',
    '30-40 miles', '40-50 miles', 'Above 50 miles',
]
miles_bucket = pd.cut(
    df['MILES'], bins=miles_bin_edges, labels=miles_bin_names, right=False
)

# Bucket label -> list of the trip distances that fall into it. Only buckets
# that actually occur are present, listed in ascending bucket order. Trips
# with a missing distance get no bucket and are left out.
miles_dic = {
    label: trips.tolist()
    for label, trips in df['MILES'].groupby(miles_bucket, observed=True)
}

# (label, number of trips) pairs, then split into the two plot axes.
len_miles = [(label, len(trips)) for label, trips in miles_dic.items()]
a = [label for label, _ in len_miles]
b = [count for _, count in len_miles]

# Bar plot: number of trips per distance bucket.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x/y are keyword arguments: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=a, y=b)
plt.xlabel('Miles')
plt.ylabel('Count');
df.head()
# Now we want to know how many trips were made in the day time and how many
# at night. ``t`` is the cutoff: a trip that starts later than 18:00:00 is a
# night ride. It is a plain ``datetime.time`` so comparisons yield a bool.
t = pd.to_datetime('18:00:00').time()


def check_time(tim):
    """Classify a time of day as a day ride or a night ride.

    Args:
        tim: A ``datetime.time`` to compare against the module-level
            cutoff ``t``.

    Returns:
        ``'NIGHT RIDE'`` when ``tim`` is later than ``t``, otherwise
        ``'DAY RIDE'``. A time exactly equal to the cutoff counts as a day
        ride, the same rule used to fill the ``DAY/NIGHT`` column.
    """
    return 'NIGHT RIDE' if tim > t else 'DAY RIDE'
# Label every trip by whether it started later than the cutoff time ``t``.
# Only START_DATE is needed, so the column itself is mapped instead of
# building a row object per trip with DataFrame.apply(axis=1).
df['DAY/NIGHT'] = df['START_DATE'].map(
    lambda start: 'NIGHT RIDE' if start.time() > t else 'DAY RIDE'
)
day_night_label = df['DAY/NIGHT'].value_counts()

# Bar plot: number of day rides vs. night rides.
plt.figure(figsize=(10, 5))
# x/y are keyword arguments: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=day_night_label.index, y=day_night_label.values);
plt.ylabel('COUNT')
plt.xlabel('DAY/NIGHT');
# Weekday number of each trip start, then its short display name.
df['DAY'] = df.START_DATE.dt.weekday
day_names = {
    0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun',
}
df['DAY'] = df['DAY'].map(day_names)

# Number of trips per weekday, most frequent day first.
day_label = df.DAY.value_counts()

# Bar plot: trips per weekday.
plt.figure(figsize=(10, 5))
# x/y are keyword arguments: seaborn >= 0.12 rejects positional x and y.
sns.barplot(x=day_label.index, y=day_label.values);
plt.xlabel('DAY')
plt.ylabel('COUNT');

# Count plot: trips per weekday, split by CATEGORY.
plt.figure(figsize=(10, 5))
sns.countplot(hue='CATEGORY', x='DAY', data=df);