0. Setting up the notebook
# Installing packages
!pip install openpyxl
!pip install rtree
!pip install graphviz
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
import os
import time
import seaborn as sns
from shapely.geometry import Point
import geopandas
from shapely import wkt
import matplotlib.dates as md
import json
import requests
import geopandas as gpd
from folium.plugins import HeatMap, HeatMapWithTime
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
import graphviz
from sklearn import tree
from IPython.display import Image
from bokeh.models import ColumnDataSource, Legend, HoverTool, FactorRange
from bokeh.plotting import figure, output_file, show, save
from bokeh.io import output_notebook, curdoc
from bokeh.layouts import layout, column, row, gridplot
from bokeh.models.widgets import Tabs, Panel
from bokeh.transform import dodge
import tempfile
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')
sns.set()
pal = sns.color_palette()
colors = pal.as_hex()
def plot_bokeh(plot):
    # Save the plot to a temporary HTML file and embed it inline
    output_filename = tempfile.NamedTemporaryFile(suffix='.html').name
    output_file(output_filename)
    save(plot)
    with open(output_filename, "r") as f:
        display(HTML(f.read()))
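The helper writes each plot to a temporary HTML file and embeds it inline, which keeps the interactive Bokeh plots visible in exported notebooks. A minimal smoke test (safe to delete):
# Quick smoke test of the helper with a trivial figure
p_test = figure(width=200, height=150, title='plot_bokeh test')
p_test.line([1, 2, 3], [1, 4, 9])
plot_bokeh(p_test)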
# Loading the datasets
df_tt = pd.read_csv('data/trafiktaelling.csv').drop(columns='FID')
df_traffic = pd.read_excel('data/SQL_2019.xlsx',
parse_dates= {"date" : ["ÅR","DATO"]})
# Convert the combined column to a proper datetime
df_traffic['date'] = pd.to_datetime(df_traffic['date'], format='%Y %m%d')
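To make the format string concrete: read_excel joins the two columns with a space, so, assuming a (hypothetical) row with ÅR=2017 and DATO='0314', the combined value '2017 0314' parses as year, month, day:
# Hypothetical example of one combined "ÅR DATO" value being parsed
pd.to_datetime(pd.Series(['2017 0314']), format='%Y %m%d')  # -> 2017-03-14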
# Look at the 2009-2018 time window
df_traffic = df_traffic[df_traffic.date.dt.year > 2008]
# Extract the relevant columns for mapping
df_tt_mapping = df_tt[['t_nr', 'wkb_geometry']]
# Inspect duplicate values of t_nr
t_nr_mult = df_tt_mapping.groupby('t_nr').size()
df_tt_mapping[df_tt_mapping.t_nr.isin(t_nr_mult[t_nr_mult > 1].keys())]
# Map the datapoints
d_mapping = df_tt_mapping.drop_duplicates(subset='t_nr', keep='last').set_index('t_nr')['wkb_geometry'].to_dict()
df_traffic['wkb_geometry'] = df_traffic['TS'].map(d_mapping)
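Series.map with a dict is a plain lookup, and counter ids missing from the dict become NaN, which is exactly what the next cells inspect and drop. A toy example with made-up ids:
# Toy example of the dict lookup (hypothetical ids and WKT strings)
toy = {101: 'POINT (12.57 55.68)', 102: 'POINT (12.55 55.70)'}
pd.Series([101, 102, 999]).map(toy)  # 999 has no entry -> NaN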
# Print street names where the location is NaN
df_traffic[df_traffic['wkb_geometry'].isna()].VEJNAVN.unique()
# Drop rows without a location
df_traffic.dropna(subset=['wkb_geometry'], inplace=True)
# Inspect null values
df_traffic.info(memory_usage="deep")
# Keep rows where the 6-7 interval was not measured
df_traffic = df_traffic[df_traffic['ANTAL 6-7'].isna()]
# Convert into geometry objects
df_traffic['wkb_geometry'] = geopandas.GeoSeries.from_wkt(df_traffic['wkb_geometry'])
# Extract lon and lat for plotting
df_traffic['lon'] = df_traffic.wkb_geometry.apply(lambda p: p.x)
df_traffic['lat'] = df_traffic.wkb_geometry.apply(lambda p: p.y)
# Select only cars and bikes
df = df_traffic[df_traffic['KATEGORI'].str.lower().isin(['personbiler', 'cykler + knallerter'])]
# Select relevant columns
df = df[['TS', 'date', 'KATEGORI', 'lon', 'lat', 'ANTAL 7-8', 'ANTAL 8-9',
'ANTAL 9-10', 'ANTAL 10-11', 'ANTAL 11-12', 'ANTAL 12-13',
'ANTAL 13-14', 'ANTAL 14-15', 'ANTAL 15-16', 'ANTAL 16-17',
'ANTAL 17-18', 'ANTAL 18-19']].reset_index(drop=True)
# Translate remaining columns to English
df.rename(columns={
"TS": "id",
"KATEGORI": "category",
"ANTAL 7-8": "7-8",
'ANTAL 8-9': '8-9',
'ANTAL 9-10': '9-10',
'ANTAL 10-11': '10-11',
'ANTAL 11-12': '11-12',
'ANTAL 12-13': '12-13',
'ANTAL 13-14': '13-14',
'ANTAL 14-15': '14-15',
'ANTAL 15-16': '15-16',
'ANTAL 16-17': '16-17',
'ANTAL 17-18': '17-18',
'ANTAL 18-19': '18-19'
}, inplace=True)
# Translating cars and bikes from Danish to English
cb_mapper = {'PERSONBILER': 'cars', 'CYKLER + KNALLERTER': 'bikes'}
df['category'] = df.category.map(cb_mapper)
# Sum the observations of an entire day
df['entire_day'] = df[["7-8", '8-9', '9-10', '10-11', '11-12', '12-13', '13-14', '14-15', '15-16',
'16-17', '17-18', '18-19']].sum(axis=1)
# Create year column
df['year'] = df.date.dt.year
# Select modes of transport
modes_of_transport=['bikes', 'cars']
# Sum up the number of observations for each year and category
df_temp = df.groupby(['year', 'category'])['entire_day'].sum().reset_index()
df_year_count = pd.pivot_table(df_temp, index='year', columns=['category'], values='entire_day')
df_year_count = df_year_count.reset_index()
df_year_count['year'] = df_year_count['year'].astype(str)
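As a reminder of what the reshape above does: pivot_table turns the long (year, category, total) rows into one column per category. A toy example:
# Toy example of the long-to-wide reshape (made-up numbers)
toy = pd.DataFrame({'year': [2009, 2009], 'category': ['bikes', 'cars'], 'entire_day': [10, 20]})
pd.pivot_table(toy, index='year', columns=['category'], values='entire_day')
# -> one row for 2009 with columns 'bikes' (10) and 'cars' (20)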
# Calculate fractions for bikes and cars
df_year_count['frac_bikes'] = df_year_count['bikes']/df_year_count[['cars', 'bikes']].sum(axis=1)*100
df_year_count['frac_cars'] = df_year_count['cars']/df_year_count[['cars', 'bikes']].sum(axis=1)*100
# Find number of counters each year
df_year_count['counters'] = df.groupby('year').count()['id'].values
# Make into ColumnDataSource
cds_year = ColumnDataSource(df_year_count)
# Create a list of years for plotting
years = list(df_year_count['year'])
# Create the figure
p1 = figure(title='Number of observations for each mode of transport by year',
width=800, height=400, x_range=FactorRange(factors=years), y_range=[0,2000000])
p1.add_layout(Legend(), 'right')
# Adding vertical bars
p1.vbar_stack(['bikes', 'cars'], x='year', source=cds_year,
width=.5, color=[colors[0], colors[1]], legend_label=['Bikes', 'Cars'])
p1.add_tools(HoverTool(tooltips=[("Fraction of bikes", "@frac_bikes%"),
('Fraction of cars', "@frac_cars%"),
('Total counters', '@counters')]))
# Formatters for plot
p1.yaxis.formatter.use_scientific = False
p1.xaxis.axis_label = 'Year'
p1.yaxis.axis_label = 'Number of observations'
p1.xgrid.grid_line_color = None
p1.background_fill_color = None
p1.border_fill_color = None
p1.outline_line_width = 0
p1.legend.border_line_width = 0
p1.legend.background_fill_color = None
plot_bokeh(p1)
# Extract values from 2017
df_2017_measurements = df[df.year == 2017].groupby(['date']).id.nunique()
# Plot our figure
fig, ax = plt.subplots(figsize=(12,3), dpi=200)
df_2017_measurements.plot(kind='bar',
ax=ax,
xlabel='Date',
ylabel='Number of counting spots',
title='Counting spots for each day in 2017')
ax.set_xticklabels([x.strftime("%m-%d") for x in df_2017_measurements.index])
plt.show()
# Find number of observations for each month and each category
df['month'] = df['date'].dt.month
df_temp = df.groupby(['month', 'category'])['entire_day'].sum().reset_index()
df_month_count = pd.pivot_table(df_temp, index='month', columns=['category'], values='entire_day')
# Add number of counters for each month
df_month_count['count'] = df.groupby('month').id.count()
df_month_count.reset_index(inplace=True)
# Map to get proper month names
month_mapper = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
df_month_count['month'] = df_month_count['month'].map(month_mapper)
# Transform into CDS
cds_month = ColumnDataSource(df_month_count)
# Plotting
p2 = figure(title='Number of observations for each mode of transport by month',
width=800, height=400, x_range=FactorRange(factors=list(month_mapper.values())),
y_range=[0, 7000000])
p2.add_layout(Legend(), 'right')
# Adding vertical bars
p2.vbar_stack(['bikes', 'cars'], x='month', source=cds_month,
width=.5, color=[colors[0], colors[1]], legend_label=['Bikes', 'Cars'])
p2.add_tools(HoverTool(tooltips=[("Total counters", "@count")]))
# Formatter tools
p2.yaxis.formatter.use_scientific = False
p2.xaxis.major_label_orientation = "vertical"
p2.xaxis.axis_label = 'Month'
p2.yaxis.axis_label = 'Number of observations'
p2.xgrid.grid_line_color = None
p2.background_fill_color = None
p2.border_fill_color = None
p2.outline_line_width = 0
p2.legend.border_line_width = 0
p2.legend.background_fill_color = None
plot_bokeh(p2)
# Plot a map of Copenhagen
map_hooray = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
# Plot the counting locations using folium's CircleMarker
for lng, lat in zip(df.lon, df.lat):
folium.CircleMarker([lat, lng], radius=1.5,
fill=True,
).add_to(map_hooray)
# Show the map
map_hooray
# Separate dataframe into cars and bikes
df_cars = df[df.category == 'cars']
df_bikes = df[df.category == 'bikes']
# Create a dataframe with the hourly totals
df_traffic_hour = pd.DataFrame()
df_traffic_hour['cars'] = df_cars[["7-8", '8-9', '9-10', '10-11', '11-12', '12-13', '13-14', '14-15', '15-16',
'16-17', '17-18', '18-19']].sum()
df_traffic_hour['bikes'] = df_bikes[["7-8", '8-9', '9-10', '10-11', '11-12', '12-13', '13-14', '14-15', '15-16',
'16-17', '17-18', '18-19']].sum()
cds_hour = ColumnDataSource(df_traffic_hour)
p3 = figure(title='Number of observations for each hour',
width=800, height=400, x_range=FactorRange(factors=list(df_traffic_hour.index)),
y_range=[0,1500000])
p3.add_layout(Legend(), 'right')
# Add bars
p3.vbar(x=dodge('index', -0.15, range=p3.x_range), top='bikes', width=0.3, source=cds_hour,
legend_label="Bikes", color=colors[0])
p3.vbar(x=dodge('index', 0.15, range=p3.x_range), top='cars', width=0.3, source=cds_hour,
legend_label="Cars", color=colors[1])
# Formatter settings
p3.yaxis.formatter.use_scientific = False
p3.xaxis.axis_label = 'Time'
p3.yaxis.axis_label = 'Number of observations'
p3.xgrid.grid_line_color = None
p3.background_fill_color = None
p3.border_fill_color = None
p3.outline_line_width = 0
p3.legend.border_line_width = 0
p3.legend.background_fill_color = None
p3.legend.click_policy="hide"
plot_bokeh(p3)
# Import Copenhagen districts as raw GeoJSON (used below to build tooltips)
with open('data/copenhagen_districts.geojson') as f:
data = json.load(f)
# Import copenhagen districts (ref: https://giedriusk.carto.com/tables/copenhagen_districts/public)
districts = gpd.read_file('data/copenhagen_districts.geojson')[['name', 'geometry']]
# Create dataframe with geographical locations
df_geo = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df.lon, df.lat))
# Join with the districts (each point gets the district polygon it falls within)
df_with_districts = gpd.sjoin(df_geo, districts, predicate='within')
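gpd.sjoin with predicate='within' attaches to each point the attributes of the polygon containing it; a minimal sketch with a toy square district:
# Minimal sketch of the point-in-polygon join (toy geometries)
from shapely.geometry import Polygon
toy_poly = gpd.GeoDataFrame({'name': ['toy_district']},
                            geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])])
toy_pts = gpd.GeoDataFrame(geometry=gpd.points_from_xy([0.5], [0.5]))
gpd.sjoin(toy_pts, toy_poly, predicate='within')  # the point inherits name='toy_district'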
# Sum the observations to find the total number of observations for each district
district_info_temp = df_with_districts.groupby(['name', 'category'])[["7-8", '8-9', '9-10', '10-11', '11-12', '12-13',
                                                                      '13-14', '14-15', '15-16', '16-17', '17-18',
                                                                      '18-19']].sum().sum(axis=1).reset_index()
district_info_temp = pd.pivot_table(district_info_temp, values=0, index='name', columns=['category']).reset_index()
# Add back districts
df_districts = district_info_temp.merge(districts)
# Add properties with number of cars and bikes to make tooltips for plotting
for i in range(len(data['features'])):
name = data['features'][i]['properties']['name']
data['features'][i]['properties']['Bikes'] = int(df_districts[df_districts.name == name]['bikes'].values[0])
data['features'][i]['properties']['Cars'] = int(df_districts[df_districts.name == name]['cars'].values[0])
data['features'][i]['properties']['District'] = str(name)
# Create the district outlines (a choropleth with transparent fill) for the bikes map
Choropleth_bikes = folium.Choropleth(geo_data=data, data=df_districts,
                                     columns=['name', 'bikes'], key_on='feature.properties.name',
                                     fill_opacity=0, line_opacity=1, line_weight=2)
# Attach tooltips with the district name and bike counts
Choropleth_bikes.geojson.add_child(folium.features.GeoJsonTooltip(['District', 'Bikes']))
# Deleting the color-scale legend (redundant when fill_opacity=0); iterate over a copied key list since we delete entries while looping
for key in list(Choropleth_bikes._children):
    if key.startswith('color_map'):
        del Choropleth_bikes._children[key]
# Finding the relevant lon and lat coordinates
heat_data = [[row['lat'], row['lon'], row['entire_day']] for index, row in df_bikes.iterrows()]
# Creating the map
map_copenhagen = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
# Adding Districts
Choropleth_bikes.add_to(map_copenhagen)
# Adding the data to the map
HeatMap(heat_data, radius=15, blur=25).add_to(map_copenhagen)
map_copenhagen
Choropleth_cars = folium.Choropleth(geo_data=data, data=df_districts,
                                    columns=['name', 'cars'], key_on='feature.properties.name',
                                    fill_opacity=0, line_opacity=1, line_weight=2)
# Attach tooltips with the district name and car counts
Choropleth_cars.geojson.add_child(folium.features.GeoJsonTooltip(['District', 'Cars']))
# Deleting the color-scale legend (redundant when fill_opacity=0); iterate over a copied key list since we delete entries while looping
for key in list(Choropleth_cars._children):
    if key.startswith('color_map'):
        del Choropleth_cars._children[key]
# Finding the relevant lon and lat coordinates
heat_data = [[row['lat'], row['lon'], row['entire_day']] for index, row in df_cars.iterrows()]
# Creating the map
map_copenhagen = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
# Adding Districts
Choropleth_cars.add_to(map_copenhagen)
# Adding the data to the map
HeatMap(heat_data, radius=15, blur=25).add_to(map_copenhagen)
map_copenhagen
# Extract the hours
hours = ["7-8", '8-9', '9-10', '10-11', '11-12', '12-13', '13-14',
'14-15', '15-16', '16-17', '17-18', '18-19']
# Build, for each hour, a list of [lat, lon, weight] triples; the counts are scaled down (/10000) to bring the weights into roughly [0, 1]
heat_time_data_bike = []
for hour in hours:
y_vals = df_bikes.lat.values
x_vals = df_bikes.lon.values
w_vals = df_bikes[hour].values
heat_data = [[y, x, w/10000] for y, x, w in zip(y_vals, x_vals, w_vals)]
heat_time_data_bike.append(heat_data)
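For clarity, HeatMapWithTime expects one list of [lat, lon, weight] triples per time step, with weights roughly in [0, 1]. A minimal sketch with made-up points:
# Minimal sketch of the expected nested structure (hypothetical values)
toy_frames = [
    [[55.68, 12.57, 0.3], [55.70, 12.55, 0.8]],  # first time step
    [[55.68, 12.57, 0.5], [55.70, 12.55, 0.2]],  # second time step
]
toy_map = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
HeatMapWithTime(toy_frames, index=['7-8', '8-9']).add_to(toy_map)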
# Creating the map
map_time_bike = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
# Adding Districts
Choropleth_bikes.add_to(map_time_bike)
# Adding the data to the map
HeatMapWithTime(heat_time_data_bike,
auto_play=True,
radius=30,
gradient={.2: 'blue', .5: 'cyan', 0.75: 'lime', 1: 'yellow'},
index=hours,
).add_to(map_time_bike)
map_time_bike
# Same weighting as above, now for cars
heat_time_data_cars = []
for hour in hours:
y_vals = df_cars.lat.values
x_vals = df_cars.lon.values
w_vals = df_cars[hour].values
heat_data = [[y, x, w/10000] for y, x, w in zip(y_vals, x_vals, w_vals)]
heat_time_data_cars.append(heat_data)
# Creating the map
map_time_cars = folium.Map(location=[55.676098, 12.568337], zoom_start=12)
# Adding Districts
Choropleth_bikes.add_to(map_time_cars)
# Adding the data to the map
HeatMapWithTime(heat_time_data_cars,
auto_play=True,
radius=30,
gradient={.2: 'blue', .5: 'cyan', 0.75: 'lime', 1: 'yellow'},
index=hours,
).add_to(map_time_cars)
map_time_cars
# Bounding box for Copenhagen, ordered (lon_min, lat_min, lon_max, lat_max):
# 12.447848, 55.606376, 12.692826, 55.731079
url = 'https://dmigw.govcloud.dk/v2/metObs/collections/observation/items'
header={'X-Gravitee-Api-Key': '4017479a-aed9-4909-aaac-0cb6169d7743'} #hidden
# Parameters: https://confluence.govcloud.dk/pages/viewpage.action?pageId=26476616
param_list = ['temp_mean_past1h','wind_speed_past1h','precip_past1h']
# Construct the weather dataframe
df_weather = pd.DataFrame()
for param in param_list:
# Specify query parameters
params = {'datetime' : '2009-01-01T00:00:00Z/2018-12-31T23:59:00Z',
'parameterId' : param,
'limit' : '300000',
# Use bbox from above
'bbox' : '12.447848,55.606376,12.692826,55.731079'
}
r = requests.get(url,headers=header,params=params) # submit GET request based on url and headers
json_t = r.json()['features']
df_weather_t = pd.json_normalize(json_t)
df_weather = pd.concat([df_weather, df_weather_t])
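pd.json_normalize flattens each nested feature into dot-separated columns while leaving list values (such as the coordinates) intact, which is why they are split out manually below. A schematic example with one made-up feature:
# Schematic of the flattening on one hypothetical feature
sample = [{'geometry': {'coordinates': [12.65, 55.61]},
           'properties': {'observed': '2017-03-14T09:00:00Z',
                          'parameterId': 'temp_mean_past1h',
                          'value': 4.2}}]
pd.json_normalize(sample).columns.tolist()
# -> ['geometry.coordinates', 'properties.observed', 'properties.parameterId', 'properties.value']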
# Plot the different weather stations
df_weather[['lon', 'lat']] = pd.DataFrame(df_weather['geometry.coordinates'].tolist(), index= df_weather.index)
lon = df_weather['lon'].unique()
lat = df_weather['lat'].unique()
position = np.array(['Copenhagen Airport',
'Landbohøjskolen',
'Botanical Garden'])
measure_type = ["Precipitation, Wind, Temperature", "Temperature", "Precipitation"]
# Plot a map of Copenhagen
map_cph = folium.Map(location=[55.65, 12.6], zoom_start=12)
# Plot the weather stations using folium's Marker
# Loop over the stations and attach a popup with the station name and measured parameters
for lng, lat, pos, m in zip(lon[:-1], lat[:-1],position, measure_type):
pops=folium.Popup(f"""<b>{pos}</b> <br> {m}""", max_width=len(pos)*10, min_width=len(pos)*10)
    folium.Marker([lat, lng],
                  popup=pops,
                  icon=folium.Icon(color="red"),
                  ).add_to(map_cph)
# Show the map
map_cph
# Convert to 'datetime'
df_weather['properties.observed'] = pd.to_datetime(df_weather['properties.observed']).dt.tz_localize(None)
# Extract date and hour components
df_weather['date'] = pd.to_datetime(df_weather['properties.observed']).dt.date
df_weather['h'] = pd.to_datetime(df_weather['properties.observed']).dt.hour
# Select only daytime hours to align with the traffic data
df_weather = df_weather.loc[(df_weather['h'] >= 7) & (df_weather['h'] <= 18)]
# Select columns of interest
df_weather = df_weather[['geometry.coordinates', 'properties.observed', 'date', 'h', 'properties.parameterId', 'properties.value']]
# Aggregate over day
df_temp = df_weather[df_weather['properties.parameterId'] == 'temp_mean_past1h'].groupby(pd.Grouper(key='date')).agg({ 'properties.value': 'mean'})
df_wind = df_weather[df_weather['properties.parameterId'] == 'wind_speed_past1h'].groupby(pd.Grouper(key='date')).agg({ 'properties.value': 'mean'})
df_precip = df_weather[df_weather['properties.parameterId'] == 'precip_past1h'].groupby(pd.Grouper(key='date')).agg({ 'properties.value': 'mean'})
# Convert datetime to date (to match the weather data)
df['date'] = df['date'].dt.date
# Keep only months with a reasonable amount of data (April, May, June, September, and October)
df = df.loc[((df["month"] >= 4) & (df["month"] <= 6)) | (df["month"] == 9) | (df["month"] == 10)]
# Convert index into columns (prepare for merge)
df_temp.reset_index(inplace=True)
df_wind.reset_index(inplace=True)
df_precip.reset_index(inplace=True)
# Rename columns
df_temp = df_temp.rename(columns={"properties.value": "temp"})
df_wind = df_wind.rename(columns={"properties.value": "wind"})
df_precip = df_precip.rename(columns={"properties.value": "precip"})
# Merge dataframes
merged_weather1 = pd.merge(df_temp, df_wind, how='outer', on='date')
merged_weather2 = pd.merge(merged_weather1, df_precip, how='outer', on='date')
df_with_weather = pd.merge(merged_weather2, df, how='inner', on='date')
Let's look at the weather data by plotting the features.
# Keep the full weather series, but open the plot view on 2016-2018
plotWeather = df_with_weather
# Make a date range to plot only a subset of the dataset, but with the ability to scroll
date_range = pd.date_range('2016-01-01', '2019-01-01')
x_min = date_range[0]
x_max = date_range[-1]
# Plot for temperature
s1 = figure(title='Development in temperature',
width=800, height=200, x_axis_type="datetime", x_range=(x_min, x_max))
s1.line(plotWeather['date'], plotWeather['temp'], color=colors[1])
s1.circle(plotWeather['date'].values, plotWeather['temp'].values, size=2, color=colors[1], alpha=0.8)
s1.xaxis.axis_label = 'Date'
s1.yaxis.axis_label = 'Temperature [˚C]'
# Plot for wind speed
s2 = figure(title='Development in wind speed',
width=800, height=200, x_axis_type="datetime", x_range=(x_min, x_max))
s2.line(plotWeather['date'], plotWeather['wind'], color=colors[2])
s2.circle(plotWeather['date'].values, plotWeather['wind'].values, size=2, color=colors[2], alpha=0.8)
s2.xaxis.axis_label = 'Date'
s2.yaxis.axis_label = 'Wind Speed [m/s]'
# Plot for precipitation
s3 = figure(title='Development in precipitation',
width=800, height=200, x_axis_type="datetime", x_range=(x_min, x_max))
s3.line(plotWeather['date'], plotWeather['precip'], color=colors[0])
s3.circle(plotWeather['date'].values, plotWeather['precip'].values, size=2, color=colors[0], alpha=0.8)
s3.xaxis.axis_label = 'Date'
s3.yaxis.axis_label = 'Precipitation [mm]'
p4 = column(s1, s2, s3)
plot_bokeh(p4)
# Extract columns of interest for bikes and cars
cols_of_interest = ['date', 'temp', 'wind', 'precip', 'category', 'entire_day']
df_weather_subset = df_with_weather[cols_of_interest]
# Use copies so the interval columns added below do not trigger chained-assignment warnings
df_weather_bikes = df_weather_subset.loc[df_weather_subset['category'] == 'bikes'].copy()
df_weather_cars = df_weather_subset.loc[df_weather_subset['category'] == 'cars'].copy()
# Divide into intervals
## bikes
df_weather_bikes["temp_bikes"] = pd.cut(df_weather_bikes.temp,
bins=[0, 5, 10, 15, 20, 100],
labels=['<5', '5-10', '10-15', '15-20', '>20'],
right=False)
df_weather_bikes["wind_bikes"] = pd.cut(df_weather_bikes.wind,
bins=[0, 2, 4, 6, 8, 10],
labels=['<2', '2-4', '4-6', '6-8', '8-10'],
right=False)
df_weather_bikes["precip_bikes"] = pd.cut(df_weather_bikes.precip,
bins=[0, 0.1, 0.2],
labels=['0-0.1', '>0.1'],
right=False)
## cars
df_weather_cars["temp_cars"] = pd.cut(df_weather_cars.temp,
bins=[0, 5, 10, 15, 20, 100],
labels=['<5', '5-10', '10-15', '15-20', '>20'],
right=False)
df_weather_cars["wind_cars"] = pd.cut(df_weather_cars.wind,
bins=[0, 2, 4, 6, 8, 10],
labels=['<2', '2-4', '4-6', '6-8', '8-10'],
right=False)
df_weather_cars["precip_cars"] = pd.cut(df_weather_cars.precip,
bins=[0, 0.1, 0.2],
labels=['0-0.1', '>0.1'],
right=False)
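As a sanity check on the binning: right=False makes the bins left-closed, so a temperature of exactly 5 ˚C lands in '5-10' rather than '<5', and values outside the outermost edges become NaN. A toy example:
# Toy example of left-closed binning with pd.cut
pd.cut(pd.Series([4.9, 5.0, 19.9, 25.0]),
       bins=[0, 5, 10, 15, 20, 100],
       labels=['<5', '5-10', '10-15', '15-20', '>20'],
       right=False)
# -> ['<5', '5-10', '15-20', '>20']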
### Temperature
# Compute the average daily count within each temperature interval
df_temp_bikes = df_weather_bikes.groupby('temp_bikes')['entire_day'].mean().to_frame('Bikes')
df_temp_cars = df_weather_cars.groupby('temp_cars')['entire_day'].mean().to_frame('Cars')
# Join dataframes
df_temp = df_temp_bikes.join(df_temp_cars)
### Wind
df_wind_bikes = df_weather_bikes.groupby('wind_bikes')['entire_day'].mean().to_frame('Bikes')
df_wind_cars = df_weather_cars.groupby('wind_cars')['entire_day'].mean().to_frame('Cars')
# Join dataframes
df_wind = df_wind_bikes.join(df_wind_cars)
### Precipitation
df_precip_bikes = df_weather_bikes.groupby('precip_bikes')['entire_day'].mean().to_frame('Bikes')
df_precip_cars = df_weather_cars.groupby('precip_cars')['entire_day'].mean().to_frame('Cars')
# Join dataframes
df_precip = df_precip_bikes.join(df_precip_cars)
# Transform into ColumnDataSources
cds_temp = ColumnDataSource(df_temp)
cds_wind = ColumnDataSource(df_wind)
cds_precip = ColumnDataSource(df_precip)
## Plot for temperature
b1 = figure(width=275, height=300,
x_range=FactorRange(factors=list(df_temp.index)),
y_range=[0,8500])
b1b = b1.vbar(x=dodge('temp_bikes', -0.15, range=b1.x_range),
top='Bikes', width=0.3, source=cds_temp,
legend_label="Bikes", color=colors[0])
b1c = b1.vbar(x=dodge('temp_bikes', 0.15, range=b1.x_range),
top='Cars', width=0.3, source=cds_temp,
legend_label="Cars", color=colors[1])
b1.legend.visible=False
b1.xgrid.grid_line_color = None
b1.background_fill_color = None
b1.border_fill_color = None
b1.outline_line_width = 0
b1.xaxis.axis_label = 'Temperature interval [˚C]'
b1.yaxis.axis_label = 'Average number of observations'
legend1 = Legend(items=[
("Bikes", [b1b]),
("Cars", [b1c])
], location=(0, 0), border_line_width=0,background_fill_color=None, orientation="horizontal", click_policy="hide")
b1.add_layout(legend1, 'above')
## Plot for wind speed
b2 = figure(width=250, height=300,
x_range=FactorRange(factors=list(df_wind.index)),
y_range=[0,7000])
b2b = b2.vbar(x=dodge('wind_bikes', -0.15, range=b2.x_range),
top='Bikes', width=0.3, source=cds_wind,
legend_label="Bikes", color=colors[0])
b2c = b2.vbar(x=dodge('wind_bikes', 0.15, range=b2.x_range),
top='Cars', width=0.3, source=cds_wind,
legend_label="Cars", color=colors[1])
b2.legend.visible=False
b2.xgrid.grid_line_color = None
b2.background_fill_color = None
b2.border_fill_color = None
b2.outline_line_width = 0
b2.xaxis.axis_label = 'Wind speed interval [m/s]'
legend2 = Legend(items=[
("Bikes", [b2b]),
("Cars", [b2c])
], location=(0, 0),border_line_width=0,background_fill_color=None, orientation="horizontal", click_policy="hide")
b2.add_layout(legend2, 'above')
## Plot for precipitation
b3 = figure(width=250, height=300,
x_range=FactorRange(factors=list(df_precip.index)),
y_range=[0,7500])
b3b = b3.vbar(x=dodge('precip_bikes', -0.15, range=b3.x_range),
top='Bikes', width=0.3, source=cds_precip,
legend_label="Bikes", color=colors[0])
b3c = b3.vbar(x=dodge('precip_bikes', 0.15, range=b3.x_range),
top='Cars', width=0.3, source=cds_precip,
legend_label="Cars", color=colors[1])
b3.xaxis.axis_label = 'Precipitation interval [mm]'
b3.xgrid.grid_line_color = None
b3.background_fill_color = None
b3.border_fill_color = None
b3.outline_line_width = 0
b3.legend.visible=False
legend3 = Legend(items=[
("Bikes", [b3b]),
("Cars", [b3c])
], location=(0, 0),border_line_width=0, background_fill_color=None, orientation="horizontal", click_policy="hide")
b3.add_layout(legend3, 'above')
p5 = gridplot([[b1,b2, b3]])
plot_bokeh(p5)
# Convert the 'category' column to one-hot encoded columns
df_regression = df_with_weather.join(pd.get_dummies(df_with_weather['category']))
# Define the feature matrix (one-hot category columns plus weather) and the target
X = df_regression[['bikes', 'cars', 'temp', 'wind', 'precip']]
y = df_regression['entire_day']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
# Baseline: always predict the mean of y_test, then compute the RMSE
rmse = np.sqrt(np.mean((y_test - np.mean(y_test))**2))
print('RMSE Baseline:', rmse)
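Since the baseline prediction is just the mean of y_test, its RMSE is by definition the population standard deviation of y_test; a quick sanity check:
# Sanity check: baseline RMSE equals the population standard deviation of y_test
assert np.isclose(rmse, y_test.std(ddof=0))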
# Compute the linear regression model using 'LinearRegression'
reg = LinearRegression().fit(X_train, y_train)
# Predict using the test data
y_pred = reg.predict(X_test)
# Compute the RMSE
rmse_lin = np.sqrt(np.mean((y_test - y_pred)**2))
print('RMSE Linear Regression:', rmse_lin)
# Define the alphas to try
r_alphas = np.logspace(0, 5, 100)
Ridge_clf = RidgeCV(alphas=r_alphas, scoring='r2').fit(X_train, y_train)
print('Optimal alpha:', Ridge_clf.alpha_)
# Compute RMSE
rmse_clf = np.sqrt(np.mean((y_test - Ridge_clf.predict(X_test))**2))
print('RMSE Ridge Regression:', rmse_clf)
Lasso_clf = LassoCV(cv=5, random_state=0).fit(X_train, y_train)
print('Optimal alpha:', Lasso_clf.alpha_)
rmse_clf = np.sqrt(np.mean((y_test - Lasso_clf.predict(X_test))**2))
print('RMSE Lasso Regression:', rmse_clf)
Elas_clf = ElasticNetCV(cv=5, random_state=0).fit(X_train, y_train)
print("Optimal alpha: ", Elas_clf.alpha_)
print("Optimal lambda: ", Elas_clf.l1_ratio)
rmse_clf = np.sqrt(np.mean((y_test - Elas_clf.predict(X_test))**2))
print('RMSE ElasticNet Regression:', rmse_clf)
# Creating the RandomForestRegressor
model = RandomForestRegressor()
# Number of trees in the forest
n_estimators = [20, 40, 60, 80, 100]
# Maximum depth of a tree
max_depth = [2, 4, 8, 10]
# Maximum number of features
max_features = [1, 2, 3, 4, 5]
# Creating the dictionary
param_distributions = {'n_estimators': n_estimators,
'max_depth': max_depth,
'max_features':max_features}
# Finding the best parameters
rsCV = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, cv=10, random_state=42)
search = rsCV.fit(X_train, y_train)
search.best_params_
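Note that RandomizedSearchCV samples only n_iter=10 of the 5 × 4 × 5 = 100 parameter combinations by default; a wider search is a one-line change (a variant, not what was run above):
# Hypothetical wider search sampling 50 of the 100 combinations
rsCV_wide = RandomizedSearchCV(estimator=model, param_distributions=param_distributions,
                               n_iter=50, cv=10, random_state=42)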
# Build the model with the best hyperparameters found by the search
model_opt = RandomForestRegressor(n_estimators=80, max_features=2, max_depth=10)
# Fitting the model
model_opt.fit(X_train, y_train)
# Compute the model's predictions on the test set
y_pred_opt = model_opt.predict(X_test)
print('RMSE Random Forest:', np.sqrt(np.mean((y_test - y_pred_opt)**2)))