import html
import math

import pandas as pd
#chart
import seaborn as sns
import matplotlib.pyplot as plt
#Map
import folium
import branca
# Select seaborn's 'darkgrid' style for the plots created after this call.
sns.set(style='darkgrid')
def get_label(g):
    """Write each bar's rounded height as a white label centred inside the bar.

    Args:
        g: Object exposing ``patches`` (items providing ``get_x``,
            ``get_width`` and ``get_height``) and a ``text`` method. The
            script passes the value returned by ``sns.barplot``.

    Returns:
        None. The labels are added through ``g.text``.

    Bars whose height is NaN or infinite are skipped, because ``round``
    cannot convert those values to an integer.
    """
    for bar in g.patches:
        height = bar.get_height()
        if not math.isfinite(height):
            # round() raises ValueError on NaN and OverflowError on infinity.
            continue
        g.text(
            bar.get_x() + bar.get_width() / 2.,  # horizontal centre of the bar
            height / 2,                          # halfway up the bar
            '{}'.format(round(height)),
            ha="center",
            color='white',
        )
# Input files, one JSON export per month (01 .. 06).
jan = '/work/files/01.json'
feb = '/work/files/02.json'
mar = '/work/files/03.json'
apr = '/work/files/04.json'
may = '/work/files/05.json'
jun = '/work/files/06.json'
js = [jan, feb, mar, apr, may, jun]

# Read every file and stack the frames into one.
# * A list comprehension is used rather than the builtin ``map`` so this
#   statement does not depend on that name staying unshadowed in the session.
# * ``ignore_index=True`` gives the combined frame a fresh 0..n-1 index
#   instead of repeating each file's own row labels.
df = pd.concat([pd.read_json(path) for path in js], ignore_index=True)

# Quick look at the combined data. The bare expressions below only show
# output when evaluated interactively (e.g. as the last line of a notebook
# cell); they have no other effect.
df.tail()
df.shape
df.describe()
print(f'Number of rows {df.shape[0]}')
print(f'Number of cols {df.shape[1]}')
df.sample()
# Drop the station description columns: they are treated as irrelevant here
# (the original note says their values are unique to each trip).
cols = ['start_station_description', 'end_station_description']
df.drop(columns=cols, inplace=True)

# Inspect missing values and column types before converting the timestamps.
df.isna().sum()
df.dtypes

# Parse both trip timestamps and strip any timezone information.
for stamp_col in ('started_at', 'ended_at'):
    df[stamp_col] = pd.to_datetime(df[stamp_col]).dt.tz_localize(None)
df.sample()
# Work on a copy so the cleaned frame ``df`` stays untouched.
df1 = df.copy()

# Add the month name and weekday name of each trip, taken from its start time.
month_map = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}
df1['month_trip'] = df1['started_at'].dt.month.map(month_map)

# The weekday lookup has its own name; it is deliberately NOT also assigned
# to ``month_map``, so the month table above stays available afterwards.
# Keys follow ``Series.dt.weekday`` numbering (0 = Monday .. 6 = Sunday).
day_week = {
    0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
    4: 'Friday', 5: 'Saturday', 6: 'Sunday',
}
df1['day_week'] = df1['started_at'].dt.weekday.map(day_week)

# Add a "<start station> at <end station>" label for every trip.
# Column-wise concatenation replaces the row-by-row ``agg(' at '.join, axis=1)``.
# NOTE: a missing station name now yields a missing route instead of raising
# TypeError.
df1['route'] = df1['start_station_name'] + ' at ' + df1['end_station_name']
df1.sample()
df2 = df1.copy()

# Only January-May are analysed; June is left out because it was still in
# progress when this analysis was written.
mask = df2['month_trip'].ne('Jun')
df_mask = df2[mask]

# Number of trips per month, largest total first.
month_trips = (
    df_mask.groupby('month_trip', as_index=False)['route']
    .count()
    .rename(columns={'route': 'total'})
    .sort_values('total', ascending=False)
)
month_trips.head()

# Bar chart of the monthly totals, with the value written inside each bar.
plt.figure(figsize=(15, 6))
g = sns.barplot(data=month_trips, x='month_trip', y='total')
plt.xlabel("Month Trip")
get_label(g)
# Number of trips per weekday, largest total first.
total_day = (
    df_mask.groupby('day_week', as_index=False)['route']
    .count()
    .rename(columns={'route': 'total'})
    .sort_values('total', ascending=False)
)

# Bar chart of the weekday totals, with the value written inside each bar.
plt.figure(figsize=(15, 6))
g = sns.barplot(data=total_day, x='day_week', y='total')
plt.xlabel("Day of week")
get_label(g)
df_mask.sample()

# Number of trips per start station, largest total first.
total_start = (
    df_mask.groupby('start_station_name', as_index=False)['route']
    .count()
    .rename(columns={'route': 'total'})
    .sort_values('total', ascending=False)
)

# Split the trips by start hour: before 12:00 versus 12:00 and later.
mask_AM = df_mask['started_at'].dt.hour.lt(12)
mask_PM = df_mask['started_at'].dt.hour.ge(12)
df_am = df_mask[mask_AM]
df_pm = df_mask[mask_PM]

# Trips per start hour over the whole day.
plt.figure(figsize=(20, 6))
g = sns.countplot(data=df_mask, x=df_mask['started_at'].dt.hour)

# Morning trips per start hour, one bar colour per weekday.
plt.figure(figsize=(20, 6))
g = sns.countplot(data=df_am, x=df_am['started_at'].dt.hour, hue='day_week')

# Afternoon/evening trips per start hour, one bar colour per weekday.
plt.figure(figsize=(20, 6))
sns.countplot(data=df_pm, x=df_pm['started_at'].dt.hour, hue='day_week')
df_mask.sample()
# Ten start stations with the most trips.
started_station = (
    df_mask.groupby('start_station_name', as_index=False)['start_station_id']
    .count()
    .rename(columns={'start_station_id': 'total'})
    .sort_values('total', ascending=False)
    .head(10)
)

# Ten end stations with the most trips.
ended_station = (
    df_mask.groupby('end_station_name', as_index=False)['end_station_id']
    .count()
    .rename(columns={'end_station_id': 'total'})
    .sort_values('total', ascending=False)
    .head(10)
)

# Ten start/end station pairs with the most trips.
route = (
    df_mask.groupby('route', as_index=False)['start_station_id']
    .count()
    .rename(columns={'start_station_id': 'total'})
    .sort_values('total', ascending=False)
    .head(10)
)
# Bar chart of the top start stations, with the value written inside each bar.
plt.figure(figsize=(20, 6))
g = sns.barplot(data=started_station, x='start_station_name', y='total')
plt.xlabel("Start Station")
get_label(g)

# Bar chart of the top end stations, with the value written inside each bar.
plt.figure(figsize=(20, 6))
g = sns.barplot(data=ended_station, x='end_station_name', y='total')
plt.xlabel("End Station")
get_label(g)
df_mask.sample()
# Horizontal bar chart of the most frequent routes (route names on the y axis).
plt.figure(figsize=(18, 10))
g = sns.barplot(data=route, x='total', y='route')
# Enlarge the tick labels on both axes.
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.title('MOST FREQUENT ROUTE')
# Trips started per station, keeping the station coordinates alongside.
aux = (
    df_mask.groupby(
        ['start_station_name', 'start_station_latitude', 'start_station_longitude'],
        as_index=False,
    )['route']
    .count()
    .rename(columns={'route': 'total'})
    .sort_values('total', ascending=False)
)

# Trips ended per station. The name column is renamed to match ``aux`` so the
# two tables can be joined on it; the end coordinates are not needed.
aux1 = (
    df_mask.groupby(
        ['end_station_name', 'end_station_latitude', 'end_station_longitude'],
        as_index=False,
    )['route']
    .count()
    .rename(columns={'route': 'end_total', 'end_station_name': 'start_station_name'})
    .sort_values('end_total', ascending=False)
    .drop(columns=['end_station_latitude', 'end_station_longitude'])
)

# One row per station with its started and ended trip counts. The inner join
# keeps only station names present in both tables.
aux3 = aux.merge(aux1, how='inner', on='start_station_name').rename(
    columns={
        'start_station_name': 'Stations',
        'start_station_latitude': 'latitude',
        'start_station_longitude': 'longitude',
        'total': 'start_total',
    }
)
aux3.head()
# Centre the map on the mean latitude/longitude of all stations.
location = aux3['latitude'].mean(), aux3['longitude'].mean()

# The map object is named ``station_map`` rather than ``map`` so that the
# builtin ``map`` is not shadowed for the rest of the session.
station_map = folium.Map(location=location, zoom_start=14, width='100%', height=600)

icon_url = 'https://cdn3.iconfinder.com/data/icons/flat-icons-web/40/Bicycle-512.png'

# One marker per station; clicking it opens a popup with the trip counts.
for _, row in aux3.iterrows():
    # The station name comes from the data files, so it is HTML-escaped before
    # being placed in the popup markup (e.g. a name containing '&' or '<').
    station_name = html.escape(str(row['Stations']))
    popup_html = (
        "\n"
        "<h3> Travel information </h3>\n"
        f"<strong>Station: {station_name}<br><br>"
        f"Started Trips: {row['start_total']}<br><br>"
        f"Ended Trips: {row['end_total']}</strong>\n"
    )
    iframe = folium.IFrame(html=popup_html, width=260, height=190)
    popup = folium.Popup(iframe, max_width=2650)
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        # A new icon object is created for every marker.
        icon=folium.features.CustomIcon(icon_url, icon_size=(50, 50)),
        popup=popup,
        tooltip="Click for more information",
    ).add_to(station_map)

# Bare expression: shows the map when evaluated interactively (e.g. as the
# last line of a notebook cell).
station_map