# Cyclistic Bike Sharing Case Study

# importing libraries
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the twelve monthly trip exports for 2021. The file names differ only
# in the zero-padded month number (202101 ... 202112).
monthly_frames = [
    pd.read_csv(f'2021{month:02d}-divvy-tripdata.csv') for month in range(1, 13)
]

# Keep the per-month names bound for any code that still refers to them.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11, df12) = monthly_frames

# Merge all months into a single frame. ignore_index=True gives every row a
# unique label; without it each month restarts at 0, so label-based
# operations (e.g. dropping by index) would hit rows from several months.
Cyclistic_data = pd.concat(monthly_frames, ignore_index=True)
Cyclistic_data.shape

# Preview the first rows of the merged data.
Cyclistic_data.head()

# Print a concise summary of the frame (columns and dtypes).
Cyclistic_data.info()

# Write a backup copy of the merged data to disk before any cleaning.
Cyclistic_data.to_csv(path_or_buf='Cyclistic_data.csv')

# Convert rideable_type and member_casual from object to categorical dtype.
for column in ('rideable_type', 'member_casual'):
    Cyclistic_data[column] = Cyclistic_data[column].astype('category')

# Remove the start/end coordinate columns
# (start_lat, start_lng, end_lat, end_lng) from the dataset.
Cyclistic_data.drop(
    columns=['start_lat', 'start_lng', 'end_lat', 'end_lng'],
    inplace=True,
)

# Parse the 'started_at' and 'ended_at' columns into datetime dtype.
for column in ('started_at', 'ended_at'):
    Cyclistic_data[column] = pd.to_datetime(Cyclistic_data[column])

# Count the missing values in each column.
Cyclistic_data.isna().sum()

Cyclistic_data.info()

# New column ride_length: time travelled, i.e. ended_at minus started_at.
Cyclistic_data['ride_length'] = Cyclistic_data['ended_at'].sub(
    Cyclistic_data['started_at']
)
Cyclistic_data['ride_length']

Cyclistic_data.info()

# New column start_date: the calendar day of 'started_at', kept as a datetime
# with the time-of-day reset to midnight. dt.normalize() produces the datetime
# dtype directly, so there is no detour through Python date objects followed
# by a second pd.to_datetime pass.
Cyclistic_data['start_date'] = Cyclistic_data['started_at'].dt.normalize()
Cyclistic_data.start_date.head()

# Confirm the resulting column dtypes.
Cyclistic_data.dtypes

# Name of the weekday on which each ride started.
Cyclistic_data['day_name'] = Cyclistic_data['start_date'].dt.day_name()
Cyclistic_data['day_name'].unique()

# Store day_name as a categorical with an explicit Sunday..Saturday order.
# A plain astype('category') orders the categories alphabetically, so anything
# grouped or sorted by day_name would come out as Friday, Monday, Saturday, ...
# instead of in calendar order.
weekday_dtype = pd.CategoricalDtype(
    categories=[
        'Sunday', 'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday',
    ],
    ordered=True,
)
Cyclistic_data['day_name'] = Cyclistic_data['day_name'].astype(weekday_dtype)
Cyclistic_data.info()

# Derive separate year and month columns from the start date.
start_date_index = pd.DatetimeIndex(Cyclistic_data['start_date'])
Cyclistic_data['year'] = start_date_index.year
Cyclistic_data['month'] = start_date_index.month

Cyclistic_data.info()

Cyclistic_data.head()

# Order the rides chronologically by their start timestamp.
Cyclistic_data.sort_values('started_at', ascending=True, inplace=True)

# Missing values per column. About 13% of the dataset is affected, which is
# below the 25% ceiling used here, so the incomplete rows are dropped below.
Cyclistic_data.isna().sum()

Cyclistic_data.shape

# Share of null values in the dataset.
# NOTE(review): both figures are hand-typed literals, presumably copied from
# the two outputs above - re-check them whenever the input data changes.
739170 / 5595063

# Drop every row that contains at least one NaN value.
Cyclistic_data.dropna(axis=0, how='any', inplace=True)

# Persist the cleaned dataset.
Cyclistic_data.to_csv(path_or_buf='Cyclistic_data_cleaned.csv')

# Preview the cleaned dataset.
Cyclistic_data.head()

# Verify that no null values remain.
Cyclistic_data.isna().sum()

# Check whether any row is an exact duplicate of another.
Cyclistic_data.duplicated().any()

# Summary statistics (including the mean) of ride length across all users.
# describe() handles timedelta columns without extra options; the
# datetime_is_numeric keyword was removed in pandas 2.0 and raises a
# TypeError there, so it is not passed.
Cyclistic_data['ride_length'].describe()

# Flag rides with a negative duration (ended_at earlier than started_at).
# Compare against a real Timedelta rather than the string '00:00:00' so the
# comparison does not depend on implicit string parsing.
negative_duration = Cyclistic_data['ride_length'] < pd.Timedelta(0)

# Number of flagged rows, reported per column.
Cyclistic_data[negative_duration].count()

# Remove the flagged rows by boolean mask. Dropping by index label instead
# would also delete every other row that happens to share a label, which
# occurs whenever the frame's index is not unique.
Cyclistic_data = Cyclistic_data.loc[~negative_duration].copy()

# Ride-length statistics for annual members.
member_filter = Cyclistic_data['member_casual'].eq('member')
Cyclistic_data.loc[member_filter, 'ride_length'].describe()

# Ride-length statistics for everyone who is not a member (casual riders).
Cyclistic_data.loc[~member_filter, 'ride_length'].describe()

membership = Cyclistic_data['member_casual']

# Most frequent weekday across all users.
mode_week_day = Cyclistic_data['day_name'].mode()
mode_week_day

# Most frequent weekday among members.
mode_week_day_member = Cyclistic_data.loc[membership == 'member', 'day_name'].mode()
mode_week_day_member

# Most frequent weekday among casual riders.
mode_week_day_casual = Cyclistic_data.loc[membership == 'casual', 'day_name'].mode()
mode_week_day_casual

# Number of rides per weekday for all users (count of ride_id), sorted by day.
count_users = Cyclistic_data.groupby('day_name')['ride_id'].count().sort_index()
count_users

# Number of rides per weekday for members only.
member_rides = Cyclistic_data[Cyclistic_data['member_casual'] == 'member']
count_members = member_rides.groupby('day_name')['ride_id'].count()
count_members

# Number of rides per weekday for casual riders only.
casual_rides = Cyclistic_data[Cyclistic_data['member_casual'] == 'casual']
count_casual = casual_rides.groupby('day_name')['ride_id'].count()
count_casual

# Plot the number of rides per weekday for annual members and casual riders.
labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Put both series into calendar order explicitly and plot them against the
# day names. Overwriting the tick text with plt.xticks(index, labels) would
# relabel whatever order the grouping produced, attaching day names to
# values that belong to other days.
members_by_day = count_members.reindex(labels)
casual_by_day = count_casual.reindex(labels)

plt.figure(figsize=(10, 8))
plt.plot(labels, members_by_day.values)
plt.plot(labels, casual_by_day.values)
plt.title('Calculate the number of rides by Annual Members and Casual Riders')
plt.legend(['members', 'casual'])
plt.show()

# Average ride length per weekday for all users.
rides_by_day = Cyclistic_data.groupby('day_name')['ride_length']
user_avg_rides = rides_by_day.mean(numeric_only=False)
user_avg_rides

# Average ride length per weekday for members.
members_only = Cyclistic_data[Cyclistic_data['member_casual'] == 'member']
user_avg_members = members_only.groupby('day_name')['ride_length'].mean(numeric_only=False)
user_avg_members

# Average ride length per weekday for casual riders.
casuals_only = Cyclistic_data[Cyclistic_data['member_casual'] == 'casual']
user_avg_casual = casuals_only.groupby('day_name')['ride_length'].mean(numeric_only=False)
user_avg_casual

# Plot the average ride length (in minutes) per weekday for both user types.
labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Reorder both series into calendar order and convert the timedeltas to
# minutes. Plotting against the day names keeps every label attached to its
# own value instead of overwriting tick text by position.
members_minutes = user_avg_members.reindex(labels) / pd.Timedelta(minutes=1)
casual_minutes = user_avg_casual.reindex(labels) / pd.Timedelta(minutes=1)

plt.figure(figsize=(10, 6))
plt.plot(labels, members_minutes.values)
plt.plot(labels, casual_minutes.values)
plt.title('The average ride duration for both user types during the week')
plt.legend(['members', 'casual'])
# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

# Number of rides recorded for each user type (member vs. casual).
user_types = Cyclistic_data['member_casual'].value_counts()
user_types

# Bar chart of the count per user type.
plt.bar(x=user_types.index, height=user_types.values)
plt.title('count of users in each type')
plt.show()

# Pie chart of the share of each user type.
plt.figure(figsize=(9, 6))
plt.pie(user_types.values, labels=user_types.index, autopct='%1.1f%%')
plt.legend(user_types.index)
# Set the title before showing the figure, and actually call plt.show();
# a bare `plt.show` is a no-op and a title set afterwards is never drawn
# when the figure is rendered by show().
plt.title('Pie Chart of users in each type')
plt.show()

# Count rides per (member_casual, rideable_type) combination.
ride_type = Cyclistic_data.groupby('member_casual')['rideable_type'].value_counts()
ride_type

# Unpack the two-level index into parallel lists for plotting:
# list_x holds the bike type, list_member the user type, list_y the counts.
list_x = [bike for _, bike in ride_type.index]
list_member = [str(rider) for rider, _ in ride_type.index]
list_y = list(ride_type.values)

# Grouped bar chart of ride counts per bike type, split by user type.
# x, y and hue are passed by keyword: seaborn 0.12+ rejects them as
# positional arguments.
plt.figure(figsize=(12, 8))
sns.barplot(x=list_x, y=list_y, hue=list_member)
plt.title('Member and Casual riders over the months based on ride_type')

# Count rides per user type for each year.
users_over_year = Cyclistic_data.groupby('year')['member_casual'].value_counts()
users_over_year

# Count rides per user type for each month.
users_over_month = Cyclistic_data.groupby('month')['member_casual'].value_counts()
users_over_month

# Unpack the two-level (month, member_casual) index into parallel lists:
# list_x holds the user type, list_month the month as text, list_y the counts.
list_x = [rider for _, rider in users_over_month.index]
list_month = [str(month) for month, _ in users_over_month.index]
list_y = users_over_month.values

# Grouped bar chart of member and casual ride counts, coloured by month.
# x, y and hue are passed by keyword: seaborn 0.12+ rejects them as
# positional arguments.
plt.figure(figsize=(15, 7))
sns.barplot(x=list_x, y=list_y, hue=list_month)
plt.title('Count of member and casual riders over the months')