Importing the dataset
# Import the data from the source CSV file
import pandas as pd
df = pd.read_csv('temps.csv')
df.head()
# Check the shape and descriptive statistics
df.shape, df.describe()
"""Getting the count of missing values for each column
We do not have any missing data to transform
"""
df.isnull().sum(axis=0),df.info()
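If the dataset did contain gaps, a light imputation step would go here. A minimal sketch of what that could look like (a no-op on this data, since nothing is missing):
# Hypothetical cleanup: fill numeric gaps with column means, then drop any rows
# that are still incomplete (both are no-ops here because nothing is missing)
df = df.fillna(df.mean(numeric_only=True))
df = df.dropna()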
Data analysis & preprocessing
"""Preprocessing
Using one-hot encoding to transform categorical data
into some type of numeric values for machine understanding
"""
df = pd.get_dummies(df)
df.iloc[:,:12].head(5)
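To see what pd.get_dummies does, here is a tiny standalone example on toy data (not from temps.csv): each category in a column becomes its own indicator column.
# Toy illustration of one-hot encoding, separate from the temps data
toy = pd.DataFrame({'week': ['Mon', 'Tues', 'Mon']})
print(pd.get_dummies(toy))  # produces indicator columns week_Mon and week_Tues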
import numpy as np
# Separate the target: the 'actual' column holds the observed max temperature we want to predict
labels = np.array(df['actual'])
df = df.drop('actual', axis=1)
# Keep the feature names for indexing into the NumPy array later
df_list = list(df.columns)
df = np.array(df)
Training and testing sets
"""
We will split our data into training set and test set.
We use a stochastic process to help improve accuracy
Using Skicit-learn to split data into training and testing sets
"""
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(df, labels, test_size = 0.25, random_state = 42)
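A quick sanity check that the split worked as intended (roughly 75% of the rows for training and 25% for testing):
# Verify the shapes of the resulting sets
print('Training features shape:', train_features.shape)
print('Testing features shape:', test_features.shape)
print('Training labels shape:', train_labels.shape)
print('Testing labels shape:', test_labels.shape)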
"""
Establish a baseline for the model
The baseline predictions are the historical averages
"""
baseline_preds = test_features[:, df_list.index('average')]
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
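To make the comparison with the model's accuracy below easier, the same MAPE-based accuracy can also be computed for the baseline (a small sketch reusing the variables above):
# Express the baseline error as a MAPE-based accuracy, like the model below
baseline_mape = 100 * (baseline_errors / test_labels)
print('Baseline accuracy:', round(100 - np.mean(baseline_mape), 2), '%.')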
Training our model
# Import the model: a random forest regressor
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);
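As an optional sanity check, a random forest can estimate its own generalization performance from the samples each tree never saw during bootstrapping (the out-of-bag data). This refit with oob_score=True is a sketch, not part of the original pipeline:
# Optional: refit with out-of-bag scoring to get a built-in R^2 estimate
rf_oob = RandomForestRegressor(n_estimators = 1000, random_state = 42, oob_score = True)
rf_oob.fit(train_features, train_labels)
print('Out-of-bag R^2:', round(rf_oob.oob_score_, 3))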
Predictions on test set
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
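The accuracy reported here is 100 - MAPE, where MAPE = (100 / n) * sum(|prediction - actual| / actual). The same numbers can be cross-checked with scikit-learn's built-in metrics (available from scikit-learn 0.24 on); note that mean_absolute_percentage_error returns a fraction rather than a percentage.
# Cross-check MAE and MAPE with scikit-learn's metrics
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
print('MAE (sklearn):', round(mean_absolute_error(test_labels, predictions), 2), 'degrees.')
print('MAPE (sklearn):', round(100 * mean_absolute_percentage_error(test_labels, predictions), 2), '%.')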
Our model predicts the next day's maximum temperature in Seattle with roughly 94% accuracy (a mean absolute percentage error of about 6%).
import seaborn as sns
import matplotlib.pyplot as plt
# Use datetime for creating date objects for plotting
import datetime
# Dates for all observations (used to plot the actual values)
months = df[:, df_list.index('month')]
days = df[:, df_list.index('day')]
years = df[:, df_list.index('year')]
# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})
# Dates of predictions
months = test_features[:, df_list.index('month')]
days = test_features[:, df_list.index('day')]
years = test_features[:, df_list.index('year')]
# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions})
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')
# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
plt.xticks(rotation = 'vertical');
plt.legend()
# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');