import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy import stats
from haversine import haversine, Unit
from scipy.stats import skew, kurtosis
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()
# Jupyter magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Importing the provided dataset from the data folder.
df = pd.read_csv('data/nyc_taxi_trip_duration.csv')
# Checking the first five rows of the dataset
df.head()
# Checking the last five rows of the dataset
df.tail()
# Using df.info() to list each column's non-null count and dtype
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 729322 non-null object
1 vendor_id 729322 non-null int64
2 pickup_datetime 729322 non-null object
3 dropoff_datetime 729322 non-null object
4 passenger_count 729322 non-null int64
5 pickup_longitude 729322 non-null float64
6 pickup_latitude 729322 non-null float64
7 dropoff_longitude 729322 non-null float64
8 dropoff_latitude 729322 non-null float64
9 store_and_fwd_flag 729322 non-null object
10 trip_duration 729322 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 61.2+ MB
# Counting the missing values in each column
df.isnull().sum()
# Counting the distinct values in each column
df.nunique()
# Summary statistics for the numeric columns
df.describe ()
# Summary statistics for the object (string) columns
df.describe (include=object)
# Changing the pickup_datetime and dropoff_datetime from object to datetime datatype
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime']=pd.to_datetime(df['dropoff_datetime'])
# Checking the new datatype for pickup_datetime and dropoff_datetime
print(df['pickup_datetime'].dtypes)
print(df['dropoff_datetime'].dtypes)
datetime64[ns]
datetime64[ns]
# Keep a handle on each parsed timestamp column so every calendar feature
# below is derived from the same source Series.
pickup_ts = df['pickup_datetime']
dropoff_ts = df['dropoff_datetime']
# Month number of the pickup / dropoff
df['pickup_by_month'] = pickup_ts.dt.month
df['dropoff_by_month'] = dropoff_ts.dt.month
# Weekday number of the pickup / dropoff
df['pickup_by_weekday'] = pickup_ts.dt.weekday
df['dropoff_by_weekday'] = dropoff_ts.dt.weekday
# Weekday name of the pickup / dropoff
df['pickup_by_day'] = pickup_ts.dt.day_name()
df['dropoff_by_day'] = dropoff_ts.dt.day_name()
# Hour of the day of the pickup / dropoff
df['pickup_by_hour'] = pickup_ts.dt.hour
df['dropoff_by_hour'] = dropoff_ts.dt.hour
def part_of_day(t):
    """Return the part-of-day label for an hour value.

    Hours 6-11 map to "Morning", 12-17 to "Afternoon" and 18-20 to
    "Evening". Any other value falls through to "Night".
    """
    # Ordered (hours, label) buckets; the first bucket containing ``t`` wins.
    buckets = (
        (range(6, 12), "Morning"),
        (range(12, 18), "Afternoon"),
        (range(18, 21), "Evening"),
    )
    for hours, label in buckets:
        if t in hours:
            return label
    return "Night"
# Creating two new features called pickup_part_of_day and dropoff_part_of_day.
df['pickup_part_of_day']=df['pickup_by_hour'].apply(part_of_day)
df['dropoff_part_of_day']=df['dropoff_by_hour'].apply(part_of_day)
# Check to see if the formula has been applied correctly
df[['pickup_part_of_day','dropoff_part_of_day']].head()
# Trying the haversine helper on a single pair of points; the result is requested in miles.
point_a = (40.778873, -73.953918) # (lat, lon)
point_b = (40.771164, -73.963875)
haversine(point_a, point_b, unit=Unit.MILES)
# Function to determine the distance between two coordinates
def trip_distance(pickup_latitude,pickup_longitude, dropoff_latitude,dropoff_longitude):
    """Return the haversine distance in miles between pickup and dropoff.

    Each point is handed to ``haversine`` as a (latitude, longitude) pair.
    """
    return haversine(
        (pickup_latitude, pickup_longitude),
        (dropoff_latitude, dropoff_longitude),
        unit=Unit.MILES,
    )
# Creating the distance feature (in miles) by applying trip_distance to every row
df['distance'] = df.apply(lambda x: trip_distance(x['pickup_latitude'],x['pickup_longitude'],x['dropoff_latitude'],x['dropoff_longitude']), axis=1)
# Checking to see that distance has been calculated
df['distance'].head()
# Creating the speed feature using the formula (s = d/t) <- Given in Miles per Hour
# trip_duration / 3600 converts the duration to hours (assumes trip_duration is in seconds - confirm)
df['average_speed'] = df['distance']/(df['trip_duration']/3600)
df['average_speed'].head().round(4)
# Display the first five rows of the newly created features
df[['pickup_by_month','dropoff_by_month','pickup_by_weekday', 'dropoff_by_weekday','pickup_by_day','dropoff_by_day','pickup_by_hour','dropoff_by_hour','pickup_part_of_day','dropoff_part_of_day','distance','average_speed']].head()
# Check to see that there are no missing values
df.isnull().sum()
# Summary statistics for the numeric columns, now including the new features
df.describe()
# Summary statistics for the target variable, rounded to two decimals
df['trip_duration'].describe().round(2)
# Calculating the mode, skewness, kurtosis and a normality check of a variable
def add_stat(x, data=None, alpha=0.05):
    """Print the mode, skewness, kurtosis and a Shapiro-Wilk verdict for a column.

    Parameters
    ----------
    x : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``df``.
    alpha : float, optional
        Significance level for the Shapiro-Wilk test. The null hypothesis
        of normality is rejected when the p-value is ``<= alpha``.

    Returns
    -------
    None
        The statistics are printed, not returned.

    Notes
    -----
    scipy warns that the Shapiro-Wilk p-value may not be accurate for
    samples larger than 5000 observations.
    """
    frame = df if data is None else data
    # Drop missing values once and reuse the cleaned column everywhere.
    values = frame[x].dropna()
    m = values.mode()
    s = skew(values)
    k = kurtosis(values)
    # Test the requested column rather than a fixed one, so the verdict
    # matches the statistics printed above it.
    _, p = stats.shapiro(values)
    print(f'Mode = {m}')
    print(f'Skew = {s}')
    print(f'Kurtosis = {k}')
    if p <= alpha:
        print('Null hypothesis of normality is rejected.')
    else:
        # "accepted" here means: not rejected at the chosen alpha.
        print('Null hypothesis of normality is accepted.')
# Calculating additional statistics (mode, skewness, kurtosis, normality check) for trip_duration
add_stat('trip_duration')
Mode = 0 348
dtype: int64
Skew = 186.67805293547696
Kurtosis = 87141.8659722644
Null hypothesis of normality is rejected.
/shared-libs/python3.7/py/lib/python3.7/site-packages/scipy/stats/morestats.py:1681: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
# Plotting a Histogram to inspect distribution and outliers
plt.title('Histogram for Trip_Duration')
sns.histplot(df['trip_duration'], kde=False, bins=25)
# Plotting a Boxplot to inspect the spread and outliers
plt.title('Boxplot for Trip_Duration')
sns.boxplot(x = df['trip_duration'])
# Checking the outlier (maximum) row
df[df['trip_duration'] == df['trip_duration'].max()]
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:822: RuntimeWarning: divide by zero encountered in double_scalars
norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:850: RuntimeWarning: invalid value encountered in multiply
f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
# Dropping the maximum outlier: every row whose trip_duration equals 1939736 is removed in place
df.drop(df[df['trip_duration'] == 1939736.00].index, inplace = True)
# Checking the new maximum
df[df['trip_duration'] == df['trip_duration'].max()]
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:822: RuntimeWarning: divide by zero encountered in double_scalars
norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:850: RuntimeWarning: invalid value encountered in multiply
f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
# Summary statistics for vendor_id
df['vendor_id'].describe()
# Calculating additional statistics (mode, skewness, kurtosis, normality check) for vendor_id
add_stat('vendor_id')
Mode = 0 2
dtype: int64
Skew = -0.14197025212279726
Kurtosis = -1.9798444475121897
Null hypothesis of normality is rejected.
/shared-libs/python3.7/py/lib/python3.7/site-packages/scipy/stats/morestats.py:1681: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
# Number of trips per vendor
sns.countplot(x='vendor_id', data=df)
# Summary statistics and frequency of each passenger_count value
df['passenger_count'].describe()
df.passenger_count.value_counts()
# Number of trips per passenger count
sns.countplot(x = 'passenger_count', data=df)
# Summary of the pickup / dropoff day-name columns
df['pickup_by_day'].describe(include=object)
df['dropoff_by_day'].describe(include=object)
# Number of pickups / dropoffs per day name
sns.countplot(x = 'pickup_by_day', data=df)
sns.countplot(x = 'dropoff_by_day', data=df)
# Summary statistics for the distance feature
df['distance'].describe()
# Calculating additional statistics (mode, skewness, kurtosis, normality check) for distance
add_stat('distance')
Mode = 0 0.0
dtype: float64
Skew = 40.983132210426184
Kurtosis = 9795.40134091124
Null hypothesis of normality is rejected.
/shared-libs/python3.7/py/lib/python3.7/site-packages/scipy/stats/morestats.py:1681: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
# Frequency of each distance value
df['distance'].value_counts()
# Looking at the rows with the minimum distance (including their store_and_fwd_flag value)
df[df['distance'] == df['distance'].min()]
# Summary statistics for the average_speed feature
df['average_speed'].describe()
# Calculating additional statistics (mode, skewness, kurtosis, normality check) for average_speed
add_stat('average_speed')
Mode = 0 0.0
dtype: float64
Skew = 194.01366329779924
Kurtosis = 76874.06048746621
Null hypothesis of normality is rejected.
/shared-libs/python3.7/py/lib/python3.7/site-packages/scipy/stats/morestats.py:1681: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
# Frequency of each average_speed value
df['average_speed'].value_counts()
# Checking the row(s) with the maximum average_speed
df[df['average_speed'] == df['average_speed'].max()]
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:822: RuntimeWarning: divide by zero encountered in double_scalars
norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
/shared-libs/python3.7/py/lib/python3.7/site-packages/numpy/lib/histograms.py:850: RuntimeWarning: invalid value encountered in multiply
f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
# Trip duration by vendor id
sns.barplot(x='vendor_id', y='trip_duration', data=df)
# Trip duration per passenger count
sns.barplot(x='passenger_count',y='trip_duration', data=df)
# Trip Duration by Day of the Week
sns.barplot(x="pickup_by_day", y='trip_duration', data=df)
# Trip Duration by Part of the Day
sns.barplot(x="pickup_part_of_day", y='trip_duration', data=df)
# Average Speed (Miles per Hour) by Day
sns.barplot(x="pickup_by_day", y='average_speed', data=df)
# Trip Duration by Part of the Day
# NOTE(review): identical to the part-of-day plot above; y='average_speed' may have been intended - confirm
sns.barplot(x="pickup_part_of_day", y='trip_duration', data=df)
# Trip Duration per hour
sns.lineplot(x='pickup_by_hour', y='trip_duration', data=df)
# Calculating the pairwise correlation between the numeric columns.
# The non-numeric columns (ids, timestamps, day names, flags) are excluded
# explicitly: DataFrame.corr() raises on them in pandas >= 2.0 instead of
# silently dropping them, and select_dtypes works on every pandas version.
corr = df.select_dtypes(include='number').corr()
corr
# Visualizing the relationship based on correlation
sns.heatmap(corr)