ML-basics

import pandas as pd

# Load the training dataset from the CSV file into a DataFrame
bike_data = pd.read_csv('data/daily-bike-share.csv')

# Preview the first rows (notebook cell output)
bike_data.head()

# Add a 'day' column holding the day-of-month parsed from the 'dteday' column
bike_data['day'] = pd.DatetimeIndex(bike_data['dteday']).day

# Display the first 32 rows (notebook cell output)
bike_data.head(32)

# Names of the numeric feature columns
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']

# Summary statistics for the numeric features together with the label column
bike_data[[*numeric_features, 'rentals']].describe()

import pandas as pd import matplotlib.pyplot as plt # This ensures plots are displayed inline in the Jupyter notebook %matplotlib inline # Get the label column label = bike_data['rentals'] # Create a figure for 2 subplots (2 rows, 1 column) fig, ax = plt.subplots(2, 1, figsize = (9,12)) # Plot the histogram ax[0].hist(label, bins=100) ax[0].set_ylabel('Frequency') # Add lines for the mean, median, and mode ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2) ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2) # Plot the boxplot ax[1].boxplot(label, vert=False) ax[1].set_xlabel('Rentals') # Add a title to the Figure fig.suptitle('Rental Distribution') # Show the figure fig.show()

# Draw one histogram per numeric feature, each on its own 9x6 figure,
# with dashed lines marking the mean (magenta) and the median (cyan)
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    feature = bike_data[col]
    feature.hist(bins=100, ax=ax)
    ax.axvline(feature.mean(), color='magenta', linestyle='dashed', linewidth=2)
    ax.axvline(feature.median(), color='cyan', linestyle='dashed', linewidth=2)
    ax.set_title(col)

# Render all the figures created above
plt.show()

import numpy as np

# Columns treated as categorical features
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'day']

# Plot a bar chart of the value counts for each categorical feature
for col in categorical_features:
    # Count rows per category value, ordered by the category value itself
    counts = bike_data[col].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(9, 6))
    counts.plot.bar(ax=ax, color='steelblue')
    ax.set_title(col + ' counts')
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")

# Render all the figures created above
plt.show()

# Scatter each numeric feature against the rentals label and report
# their correlation in the plot title
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    feature = bike_data[col]
    label = bike_data['rentals']
    correlation = feature.corr(label)
    ax.scatter(x=feature, y=label)
    ax.set_xlabel(col)
    ax.set_ylabel('Bike Rentals')
    ax.set_title('rentals vs ' + col + '- correlation: ' + str(correlation))

# Render all the figures created above
plt.show()

# Plot a boxplot of the rentals label grouped by each categorical feature
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    bike_data.boxplot(column='rentals', by=col, ax=ax)
    ax.set_title('Label by ' + col)
    ax.set_ylabel("Bike Rentals")

# Render all the figures created above
plt.show()

# Separate features (X) and labels (y) as plain numpy arrays.
# The feature column order defined here is what later positional
# column indices refer to.
X = bike_data[['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
               'temp', 'atemp', 'hum', 'windspeed']].values
y = bike_data['rentals'].values

# Show the first ten rows of each
print('Features:', X[:10], '\nLabels:', y[:10], sep='\n')

from sklearn.model_selection import train_test_split

# Split data 70%-30% into training set and test set
# (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

print('Training Set: %d rows\nTest Set: %d rows' % (len(X_train), len(X_test)))

# Train the model
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training set
model = LinearRegression()
model.fit(X_train, y_train)
print(model)

import numpy as np

# Predict labels for the held-out test features
predictions = model.predict(X_test)

# Print floats in plain (non-scientific) notation
np.set_printoptions(suppress=True)

# Compare the first ten rounded predictions with the actual labels
print('Predicted labels: ', np.round(predictions)[:10])
print('Actual labels : ', y_test[:10])

import matplotlib.pyplot as plt %matplotlib inline plt.scatter(y_test, predictions) plt.xlabel('Actual Labels') plt.ylabel('Predicted Labels') plt.title('Daily Bike Share Predictions') # overlay the regression line z = np.polyfit(y_test, predictions, 1) p = np.poly1d(z) plt.plot(y_test,p(y_test), color='magenta') plt.show()

from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error, its square root, and R-squared of the
# predictions against the test labels
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

from sklearn.linear_model import Lasso

# Fit a lasso model on the training set
model = Lasso()
model.fit(X_train, y_train)
print(model, "\n")

# Evaluate the model using the test data
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

from sklearn.tree import DecisionTreeRegressor, export_text

# Train a decision tree regressor on the training set
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
print(model, "\n")

# Visualize the fitted tree as a text report
tree = export_text(model)
print(tree)

# Evaluate the current model using the test data
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

from sklearn.ensemble import RandomForestRegressor

# Train a random forest regressor on the training set.
# NOTE: no random_state is set, so results can vary between runs.
model = RandomForestRegressor()
model.fit(X_train, y_train)
print(model, "\n")

# Evaluate the model using the test data
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

# Train the model
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor on the training set
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
print(model, "\n")

# Evaluate the model using the test data
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Use a Gradient Boosting algorithm
alg = GradientBoostingRegressor()

# Hyperparameter grid to search (3 x 3 = 9 combinations)
params = {
    'learning_rate': [0.1, 0.5, 1.0],
    'n_estimators': [50, 100, 150],
}

# Find the best hyperparameter combination to optimize the R2 metric,
# using 3-fold cross-validation on the training set
score = make_scorer(r2_score)
gridsearch = GridSearchCV(
    alg,
    params,
    scoring=score,
    cv=3,
    return_train_score=True,
)
gridsearch.fit(X_train, y_train)
print("Best parameter combination:", gridsearch.best_params_, "\n")

# Get the best model
model = gridsearch.best_estimator_
print(model, "\n")

# Evaluate the model using the test data
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

# Train the model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
import numpy as np

# Define preprocessing for numeric columns (scale them).
# NOTE: these are positional column indices into X, and this rebinds the
# name that earlier cells used for a list of column names.
numeric_features = [6, 7, 8, 9]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Define preprocessing for categorical features (one-hot encode them);
# categories unseen during fit are ignored at transform time.
categorical_features = [0, 1, 2, 3, 4, 5]
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Create preprocessing and training pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor()),
    ]
)

# Fit the pipeline to train a gradient boosting regressor on the training set
model = pipeline.fit(X_train, y_train)
print(model)

# Get predictions
predictions = model.predict(X_test)

# Display metrics
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

# Use a different estimator in the pipeline (same preprocessor as before)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor()),
    ]
)

# Fit the pipeline to train a random forest regressor on the training set
model = pipeline.fit(X_train, y_train)
print(model, "\n")

# Get predictions
predictions = model.predict(X_test)

# Display metrics
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Plot predicted vs actual
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Bike Share Predictions - Preprocessed')

# Overlay a degree-1 least-squares trend line fitted to (actual, predicted)
z = np.polyfit(y_test, predictions, deg=1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), color='magenta')
plt.show()

import os

import joblib

# Save the model as a pickle file
filename = './models/bike-share.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(filename), exist_ok=True)

joblib.dump(model, filename)

# Load the model from the file.
# NOTE: joblib files are pickles; only load files from a trusted source.
loaded_model = joblib.load(filename)

# Create a numpy array containing a new observation (for example tomorrow's
# seasonal and weather forecast information); values follow the feature
# column order used to build X
X_new = np.array(
    [[1, 1, 0, 3, 1, 1, 0.226957, 0.22927, 0.436957, 0.1869]],
    dtype='float64',
)
print('New sample: {}'.format(list(X_new[0])))

# Use the model to predict tomorrow's rentals
result = loaded_model.predict(X_new)
print('Prediction: {:.0f} rentals'.format(np.round(result[0])))

# An array of features based on five-day weather forecast
# (one row per day, columns in the same order as the training features)
X_new = np.array([
    [0, 1, 1, 0, 0, 1, 0.344167, 0.363625, 0.805833, 0.160446],
    [0, 1, 0, 1, 0, 1, 0.363478, 0.353739, 0.696087, 0.248539],
    [0, 1, 0, 2, 0, 1, 0.196364, 0.189405, 0.437273, 0.248309],
    [0, 1, 0, 3, 0, 1, 0.2, 0.212122, 0.590435, 0.160296],
    [0, 1, 0, 4, 0, 1, 0.226957, 0.22927, 0.436957, 0.1869],
])

# Use the model to predict rentals
results = loaded_model.predict(X_new)

print('5-day rental predictions:')
for prediction in results:
    print(np.round(prediction))