# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
# importing the datasets
# One CSV per calendar month of 2021, named '<YYYYMM>-divvy-tripdata.csv'.
monthly_files = ['2021{:02d}-divvy-tripdata.csv'.format(month) for month in range(1, 13)]
monthly_frames = [pd.read_csv(path) for path in monthly_files]
# merging all data
# ignore_index=True gives the merged frame a fresh, unique 0..N-1 index.
# Without it each monthly file contributes its own 0..n labels, so labels
# repeat across months and any label-based operation (such as dropping rows
# by index) would touch rows from several months at once.
Cyclistic_data = pd.concat(monthly_frames, ignore_index=True)
# the per-month frames are no longer needed once merged; free their memory
del monthly_frames
Cyclistic_data.shape
Cyclistic_data.head()
Cyclistic_data.info()
# creating a backup of the raw merged data (the index is written as the first column)
Cyclistic_data.to_csv('Cyclistic_data.csv')
# Store the two label columns as 'category' dtype instead of object.
for column in ('rideable_type', 'member_casual'):
    Cyclistic_data[column] = Cyclistic_data[column].astype('category')
# Remove the start/end latitude and longitude columns from the dataset.
Cyclistic_data.drop(columns=['start_lat', 'start_lng', 'end_lat', 'end_lng'], inplace=True)
# Parse the trip start and end timestamps into datetime values.
for column in ('started_at', 'ended_at'):
    Cyclistic_data[column] = pd.to_datetime(Cyclistic_data[column])
# Count the missing values per column, then print the frame summary.
Cyclistic_data.isnull().sum()
Cyclistic_data.info()
# adding a new column ride_length (ended_at - started_at) to calculate the time travelled
Cyclistic_data['ride_length'] = Cyclistic_data['ended_at'] - Cyclistic_data['started_at']
Cyclistic_data['ride_length']
Cyclistic_data.info()
# extracting the calendar date of 'started_at' (time of day reset to midnight).
# normalize() stays in datetime dtype, so there is no detour through python
# date objects followed by a second to_datetime conversion.
Cyclistic_data['start_date'] = Cyclistic_data['started_at'].dt.normalize()
Cyclistic_data.start_date.head()
Cyclistic_data.dtypes
# finding the weekday name of the start date
Cyclistic_data['day_name'] = Cyclistic_data['start_date'].dt.day_name()
Cyclistic_data['day_name'].unique()
# changing day_name to an ORDERED categorical, Sunday through Saturday.
# A plain astype('category') sorts the categories alphabetically (Friday,
# Monday, Saturday, ...), and every groupby('day_name') then returns its rows
# in that order, which does not match the Sunday..Saturday tick labels used
# by the weekday plots.
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
Cyclistic_data['day_name'] = Cyclistic_data['day_name'].astype(
    pd.CategoricalDtype(categories=weekday_order, ordered=True)
)
Cyclistic_data.info()
# calendar year and month number of the start date
Cyclistic_data['year'] = Cyclistic_data['start_date'].dt.year
Cyclistic_data['month'] = Cyclistic_data['start_date'].dt.month
Cyclistic_data.info()
Cyclistic_data.head()
# sort the dataframe in ascending order of trip start time
Cyclistic_data.sort_values(by='started_at', ascending=True, inplace=True)
# number of missing values per column, and the overall frame size
Cyclistic_data.isnull().sum()
Cyclistic_data.shape
# share of missing values in each column, computed from the data itself
# (replaces a hand-typed ratio that would go stale if the input files change)
null_share = Cyclistic_data.isnull().mean()
null_share
# The original analysis notes roughly 13% missing values, below its 25%
# threshold, so every ROW that contains a NaN is dropped (columns are kept).
Cyclistic_data.dropna(inplace=True)
# save the cleaned file
Cyclistic_data.to_csv('Cyclistic_data_cleaned.csv')
# print cleaned dataset
Cyclistic_data.head()
# check that no null values remain in the dataset
Cyclistic_data.isnull().sum()
# check for any fully duplicated rows
Cyclistic_data.duplicated().any()
# summary statistics (count, mean, quartiles, ...) of ride length for all users.
# describe() handles timedelta columns directly; the 'datetime_is_numeric'
# keyword was removed in pandas 2.0 and raises TypeError there.
Cyclistic_data.ride_length.describe()
# flag rides that end before they start (negative duration);
# compare against an explicit zero Timedelta rather than a string
negative_duration = Cyclistic_data['ride_length'] < pd.Timedelta(0)
# showing the number of rides with a negative duration
negative_duration.sum()
# removing the negative durations with a boolean mask. Dropping by index
# label instead would also delete every other row that happens to share a
# label with a flagged row whenever the index contains duplicates.
Cyclistic_data = Cyclistic_data[~negative_duration]
# ride length statistics for members
member_filter = Cyclistic_data['member_casual'] == 'member'
Cyclistic_data[member_filter].ride_length.describe()
# ride length statistics for all remaining (non-member) riders
Cyclistic_data[~member_filter].ride_length.describe()
# Row masks for the two rider types, reused by the statistics below.
is_member = Cyclistic_data['member_casual'] == 'member'
is_casual = Cyclistic_data['member_casual'] == 'casual'

# Most frequent weekday: all riders, then members, then casual riders.
mode_week_day = Cyclistic_data['day_name'].mode()
mode_week_day
mode_week_day_member = Cyclistic_data.loc[is_member, 'day_name'].mode()
mode_week_day_member
mode_week_day_casual = Cyclistic_data.loc[is_casual, 'day_name'].mode()
mode_week_day_casual

# Number of rides per weekday (non-null ride_id values), for the same three groups.
count_users = Cyclistic_data.groupby('day_name')['ride_id'].count().sort_index()
count_users
member_rides = Cyclistic_data[is_member]
count_members = member_rides.groupby('day_name')['ride_id'].count()
count_members
casual_rides = Cyclistic_data[is_casual]
count_casual = casual_rides.groupby('day_name')['ride_id'].count()
count_casual
# plot the number of rides per weekday for annual members and casual riders
labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# Put both series into Sunday..Saturday order before plotting. Re-labelling
# the ticks alone would paste these names over data that is still in the
# groupby's own order, attaching the wrong weekday to each point.
members_by_day = count_members.reindex(labels)
casual_by_day = count_casual.reindex(labels)
plt.figure(figsize=(10, 8))
plt.plot(labels, members_by_day.to_numpy())
plt.plot(labels, casual_by_day.to_numpy())
plt.title('Calculate the number of rides by Annual Members and Casual Riders')
plt.legend(['members', 'casual'])
plt.show()
# Mean ride duration per weekday for all riders.
rides_by_day = Cyclistic_data.groupby('day_name')['ride_length']
user_avg_rides = rides_by_day.mean(numeric_only=False)
user_avg_rides
# Mean ride duration per weekday for members only.
member_rows = Cyclistic_data[Cyclistic_data['member_casual'] == 'member']
user_avg_members = member_rows.groupby('day_name')['ride_length'].mean(numeric_only=False)
user_avg_members
# Mean ride duration per weekday for casual riders only.
casual_rows = Cyclistic_data[Cyclistic_data['member_casual'] == 'casual']
user_avg_casual = casual_rows.groupby('day_name')['ride_length'].mean(numeric_only=False)
user_avg_casual
# plot the average ride length (in minutes) per weekday for both user types
labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# Reorder both series to Sunday..Saturday so each point sits under its own
# weekday, then convert the timedeltas to minutes by dividing by one minute.
one_minute = pd.Timedelta(minutes=1)
member_minutes = user_avg_members.reindex(labels) / one_minute
casual_minutes = user_avg_casual.reindex(labels) / one_minute
plt.figure(figsize=(10, 6))
plt.plot(labels, member_minutes.to_numpy())
plt.plot(labels, casual_minutes.to_numpy())
plt.title('The average ride duration for both user types during the week')
plt.legend(['members', 'casual'])
# plt.show must be called; the bare attribute reference does nothing
plt.show()
# number of rides recorded for each user type over the 12 months
user_types = Cyclistic_data['member_casual'].value_counts()
user_types
# bar chart of the count per user type, drawn on its own new figure so it
# cannot land on a previous figure that was never shown
plt.figure()
plt.title('count of users in each type')
plt.bar(user_types.index, user_types.values)
plt.show()
# pie chart of the share of each user type
plt.figure(figsize=(9, 6))
plt.pie(user_types.values, labels=user_types.index, autopct='%1.1f%%')
plt.legend(user_types.index)
# the title has to be set before the figure is shown, and show must be called
plt.title('Pie Chart of users in each type')
plt.show()
# count of rides for every (user type, rideable type) combination
ride_type = Cyclistic_data.groupby('member_casual')['rideable_type'].value_counts()
ride_type
# unpack the two-level index into parallel lists for plotting:
# list_member holds the user type, list_x the rideable type, list_y the count
list_member = [str(user_type) for user_type, _ in ride_type.index]
list_x = [bike_type for _, bike_type in ride_type.index]
list_y = list(ride_type.values)
# grouped bar chart: one group per rideable type, one bar colour per user type
plt.figure(figsize=(12, 8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.barplot(x=list_x, y=list_y, hue=list_member)
plt.title('Member and Casual riders over the months based on ride_type')
plt.show()
# count of rides per user type for each year
users_over_year = Cyclistic_data.groupby('year')['member_casual'].value_counts()
users_over_year
# count of rides per user type for each month
users_over_month = Cyclistic_data.groupby('month')['member_casual'].value_counts()
users_over_month
# unpack the two-level index into parallel lists for plotting:
# list_month holds the month number (as text), list_x the user type, list_y the count
list_month = [str(month) for month, _ in users_over_month.index]
list_x = [user_type for _, user_type in users_over_month.index]
list_y = users_over_month.values
# grouped bar chart: one group per user type, one bar colour per month
plt.figure(figsize=(15, 7))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.barplot(x=list_x, y=list_y, hue=list_month)
plt.title('Count of member and casual riders over the months')
plt.show()