import pandas as pd
import matplotlib.pyplot as plt
# Read the comma-delimited .dat file that lives in the same directory
# into a DataFrame, then preview its first five rows.
bike_df = pd.read_csv('bycicle_df.dat', delimiter=',')
bike_df.head(n=5)
# Run to view results
# Derive a day-of-month column from the 'dteday' date column.
date_index = pd.DatetimeIndex(bike_df['dteday'])
bike_df['day'] = date_index.day

# Begin the analysis with descriptive statistics for the numeric features
# together with the 'rentals' label.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
bike_df[[*numeric_features, 'rentals']].describe()
# Run to view results
%matplotlib inline
# Plot the distribution of the label ('rentals') as a histogram plus a boxplot.

# Get the label column
label = bike_df['rentals']

# Create a figure for 2 subplots (2 rows, 1 column):
# histogram on top, boxplot underneath
fig, ax = plt.subplots(2, 1, figsize=(9, 12))

# Plot the histogram
ax[0].hist(label, bins=100)
ax[0].set_ylabel('Frequency')

# Add dashed reference lines for the mean (magenta) and the median (cyan)
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2)
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2)

# Plot the boxplot horizontally so it shares the histogram's orientation
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel('Rentals')

# Add a title to the Figure
fig.suptitle('Rental Distribution')

# Show the figure. plt.show() is used rather than fig.show(): Figure.show()
# does not manage an event loop and warns on non-GUI backends, and every
# other plot in this file is rendered through plt.show().
plt.show()
# Run to view results
# Draw one 100-bin histogram per numeric feature, marking its mean and median.
for feature_name in numeric_features:
    values = bike_df[feature_name]
    figure = plt.figure(figsize=(9, 6))
    axes = figure.gca()
    values.hist(ax=axes, bins=100)
    # Dashed vertical markers: mean in magenta, then median in cyan.
    for stat, colour in ((values.mean(), 'magenta'), (values.median(), 'cyan')):
        axes.axvline(stat, color=colour, linestyle='dashed', linewidth=2)
    axes.set_title(feature_name)
# Render the figures created above.
plt.show()
# Run to view results
# For every categorical feature, draw a bar chart of how often each value occurs.
categorical_features = ['season','mnth','holiday','weekday','workingday','weathersit', 'day']
for cat_name in categorical_features:
    figure = plt.figure(figsize=(9, 6))
    axes = figure.gca()
    # Occurrences per distinct value, ordered by the value itself.
    value_counts = bike_df[cat_name].value_counts().sort_index()
    value_counts.plot.bar(ax=axes, color='steelblue')
    axes.set_title(f'{cat_name} counts')
    axes.set_xlabel(cat_name)
    axes.set_ylabel("Frequency")
# Render the figures created above.
plt.show()
# Run to view results
# Scatter-plot the rentals label against each numeric feature and report
# their correlation in the plot title.

# The label column is identical for every feature, so fetch it once
# instead of on every loop iteration.
label = bike_df['rentals']
for col in numeric_features:
    fig = plt.figure(figsize=(9, 6))
    ax = fig.gca()
    feature = bike_df[col]
    correlation = feature.corr(label)
    # Draw through the explicit Axes object throughout, rather than mixing
    # in pyplot's implicit "current axes" calls.
    ax.scatter(x=feature, y=label)
    ax.set_xlabel(col)
    ax.set_ylabel('Bike Rentals')
    ax.set_title('rentals vs ' + col + '- correlation: ' + str(correlation))
# Render the figures created above.
plt.show()
# Run to view results
# Box plot of the 'rentals' label grouped by each categorical feature.
for cat_name in categorical_features:
    figure = plt.figure(figsize=(9, 6))
    axes = figure.gca()
    bike_df.boxplot(column='rentals', by=cat_name, ax=axes)
    # Set the axes title and y-axis label after pandas has drawn the plot.
    axes.set_title(f'Label by {cat_name}')
    axes.set_ylabel("Bike Rentals")
# Render the figures created above.
plt.show()
# Run to view results