import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw trip log.
df = pd.read_csv('My Uber Drives - 2016.csv')

# First look at the data: leading rows, column names, shape and the
# per-column count of missing values.
df.head()
df.columns
df.shape
df.isnull().sum()

# Show the fully duplicated rows, then remove them.
df[df.duplicated()]
df.drop_duplicates(inplace=True)

# Replace the raw headers with the plain names used by the rest of the analysis.
df.columns = ['START_DATE', 'END_DATE', 'CATEGORY', 'START', 'STOP', 'MILES', 'PURPOSE']

# Parse the timestamps; values that cannot be parsed become NaT (errors='coerce').
df['START_DATE'] = pd.to_datetime(df['START_DATE'], errors='coerce')
df['END_DATE'] = pd.to_datetime(df['END_DATE'], errors='coerce')

# Remove trips whose start and end timestamps are identical (zero trip time)
# while the recorded distance is non-zero, which cannot be a real trip.
# The rows are selected by that condition instead of by hard-coded row
# positions: positions shift as soon as rows are removed (such as the
# duplicates above) or the CSV changes. NaT never compares equal to NaT, so
# rows without parsable timestamps are not affected by this filter.
# NOTE(review): confirm this selects the rows that were intended to be dropped.
zero_time_trips = (df['START_DATE'] == df['END_DATE']) & (df['MILES'] != 0)
df.drop(df.index[zero_time_trips], inplace=True)

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1151 entries, 0 to 1155
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 START_DATE 1150 non-null datetime64[ns]
1 END_DATE 1150 non-null datetime64[ns]
2 CATEGORY 1150 non-null object
3 START 1150 non-null object
4 STOP 1150 non-null object
5 MILES 1151 non-null float64
6 PURPOSE 652 non-null object
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 71.9+ KB
df.head()

# Count plot: number of rows for each CATEGORY value.
plt.figure(figsize=(10, 5))
# The column is passed by keyword together with `data`: seaborn warns that
# positional data arguments stop being accepted from version 0.12.
sns.countplot(x='CATEGORY', data=df);
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# The ten most frequent START values and how often each occurs.
start_labels = df.START.value_counts().nlargest(10)

# Bar plot of those counts, with rotated tick labels so they do not overlap.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=start_labels.index, y=start_labels);
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# The ten most frequent STOP values and how often each occurs.
stop_labels = df.STOP.value_counts().nlargest(10)

# Bar plot of those counts, with rotated tick labels so they do not overlap.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=stop_labels.index, y=stop_labels);
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# Add a MONTH column holding the month name of each START_DATE.
# The keys are floats, matching the float month numbers pandas produces
# when the date column contains missing values.
month_label = {
    1.0: 'Jan', 2.0: 'Feb', 3.0: 'Mar', 4.0: 'April',
    5.0: 'May', 6.0: 'June', 7.0: 'July', 8.0: 'Aug',
    9.0: 'Sep', 10.0: 'Oct', 11.0: 'Nov', 12.0: 'Dec',
}
month_number = df['START_DATE'].dt.month
df['MONTH'] = month_number.map(month_label)

# Inspect the distinct month labels and the updated frame.
df.MONTH.unique()
df.head()
# Number of rows per MONTH label, most frequent first.
month_count = df.MONTH.value_counts()

# Bar plot of the per-month counts.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=month_count.index, y=month_count);
plt.xlabel('Months')
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# Group the MILES values into 10-mile buckets and plot how many trips fall
# into each bucket.
#
# Buckets are half-open: a value v lands in "10-20 miles" when 10 <= v < 20,
# values below 10 land in the first bucket, and values of 50 or more land in
# the last one.
bucket_upper_bounds = [10, 20, 30, 40, 50]
bucket_names = [
    '0-10 miles',
    '10-20 miles',
    '20-30 miles',
    '30-40 miles',
    '40-50 miles',
    'Above 50 miles',
]

# np.digitize returns, for every value, the position of its bucket
# (0 for v < 10, 1 for 10 <= v < 20, ..., 5 for v >= 50).
bucket_position = np.digitize(df.MILES, bucket_upper_bounds)

# miles_dic maps bucket name -> list of the MILES values in that bucket.
# Buckets are inserted in ascending distance order so the bars read left to
# right from shortest to longest trips; buckets without any trip are omitted.
miles_dic = {}
for position, bucket_name in enumerate(bucket_names):
    bucket_values = df.MILES[bucket_position == position].tolist()
    if bucket_values:
        miles_dic[bucket_name] = bucket_values

# (bucket name, number of trips) pairs, plus the same data as two parallel
# lists for plotting.
len_miles = [(bucket_name, len(values)) for bucket_name, values in miles_dic.items()]
a = [bucket_name for bucket_name, _ in len_miles]
b = [trip_count for _, trip_count in len_miles]

plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=a, y=b)
plt.xlabel('Miles')
plt.ylabel('Count');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
df.head()

# Classify every trip as a day ride or a night ride from its start time.

# Cut-off between day and night rides, as a single datetime.time value so
# that comparisons against it yield a plain bool.
t = pd.to_datetime('18:00:00').time()


def check_time(tim):
    """Return the ride label for a time of day.

    Times strictly later than the cut-off ``t`` are labelled 'NIGHT RIDE';
    every other time, including exactly ``t``, is labelled 'DAY RIDE'.
    A missing value (NaT/None/NaN) yields None, so a trip without a start
    time is left unlabelled instead of being compared.
    """
    if pd.isna(tim):
        return None
    return 'NIGHT RIDE' if tim > t else 'DAY RIDE'


# START_DATE is parsed with errors='coerce' and can therefore hold NaT;
# check_time leaves those rows unlabelled.
df['DAY/NIGHT'] = df['START_DATE'].dt.time.map(check_time)

# Number of trips per label (unlabelled rows are not counted).
day_night_label = df['DAY/NIGHT'].value_counts()
# Bar plot of the number of day rides versus night rides.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=day_night_label.index, y=day_night_label);
plt.ylabel('COUNT')
plt.xlabel('DAY/NIGHT');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# Add a DAY column holding the weekday name of each START_DATE.
# Series.dt.weekday numbers the days with Monday as 0 and Sunday as 6.
weekday_names = {
    0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun',
}
df['DAY'] = df.START_DATE.dt.weekday
df['DAY'] = df['DAY'].map(weekday_names)

# Number of trips per weekday name, most frequent first.
day_label = df.DAY.value_counts()
# Bar plot of the number of trips per weekday.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.barplot(x=day_label.index, y=day_label);
plt.xlabel('DAY')
plt.ylabel('COUNT');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# Count plot of trips per DAY, with one bar per CATEGORY within each day.
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='DAY', hue='CATEGORY');