Importing the dataset
# Import the data from the source CSV file
import pandas as pd
df = pd.read_csv('temps.csv')
df.head()
# Check the shape and descriptive statistics
df.shape, df.describe()
"""Getting the count of missing values for each column
We do not have any missing data to transform
"""
df.isnull().sum(axis=0),df.info()
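If the dataset did contain gaps, a light imputation step would go here. A minimal sketch of what that could look like (a no-op on this data, since nothing is missing):
# Hypothetical cleanup: fill numeric gaps with column means, then drop any rows
# that are still incomplete (both are no-ops here because nothing is missing)
df = df.fillna(df.mean(numeric_only=True))
df = df.dropna()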
Data analysis & preprocessing
"""Preprocessing
Using one-hot encoding to transform categorical data
into some type of numeric values for machine understanding
"""
df = pd.get_dummies(df)
df.iloc[:,:12].head(5)
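To see what pd.get_dummies does, here is a tiny standalone example on toy data (not from temps.csv): each category in a column becomes its own indicator column.
# Toy illustration of one-hot encoding, separate from the temps data
toy = pd.DataFrame({'week': ['Mon', 'Tues', 'Mon']})
print(pd.get_dummies(toy))  # produces indicator columns week_Mon and week_Tues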
import numpy as np
# Separate the target: the 'actual' column holds the observed max temperature we want to predict
labels = np.array(df['actual'])
df = df.drop('actual', axis=1)
# Keep the feature names for indexing into the NumPy array later
df_list = list(df.columns)
df = np.array(df)
Training and testing sets
"""
We will split our data into training set and test set.
We use a stochastic process to help improve accuracy
Using Skicit-learn to split data into training and testing sets
"""
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(df, labels, test_size = 0.25, random_state = 42)
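A quick sanity check that the split worked as intended (roughly 75% of the rows for training and 25% for testing):
# Verify the shapes of the resulting sets
print('Training features shape:', train_features.shape)
print('Testing features shape:', test_features.shape)
print('Training labels shape:', train_labels.shape)
print('Testing labels shape:', test_labels.shape)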
"""
Establish a baseline for the model
The baseline predictions are the historical averages
"""
baseline_preds = test_features[:, df_list.index('average')]
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
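To make the comparison with the model's accuracy below easier, the same MAPE-based accuracy can also be computed for the baseline (a small sketch reusing the variables above):
# Express the baseline error as a MAPE-based accuracy, like the model below
baseline_mape = 100 * (baseline_errors / test_labels)
print('Baseline accuracy:', round(100 - np.mean(baseline_mape), 2), '%.')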
Training our model
# Import the model: a random forest regressor
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);
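As an optional sanity check, a random forest can estimate its own generalization performance from the samples each tree never saw during bootstrapping (the out-of-bag data). This refit with oob_score=True is a sketch, not part of the original pipeline:
# Optional: refit with out-of-bag scoring to get a built-in R^2 estimate
rf_oob = RandomForestRegressor(n_estimators = 1000, random_state = 42, oob_score = True)
rf_oob.fit(train_features, train_labels)
print('Out-of-bag R^2:', round(rf_oob.oob_score_, 3))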
Predictions on test set
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
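The accuracy reported here is 100 - MAPE, where MAPE = (100 / n) * sum(|prediction - actual| / actual). The same numbers can be cross-checked with scikit-learn's built-in metrics (available from scikit-learn 0.24 on); note that mean_absolute_percentage_error returns a fraction rather than a percentage.
# Cross-check MAE and MAPE with scikit-learn's metrics
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
print('MAE (sklearn):', round(mean_absolute_error(test_labels, predictions), 2), 'degrees.')
print('MAPE (sklearn):', round(100 * mean_absolute_percentage_error(test_labels, predictions), 2), '%.')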
Our model predicts the next day's maximum temperature in Seattle with roughly 94% accuracy (a mean absolute percentage error of about 6%).
import seaborn as sns
import matplotlib.pyplot as plt
# Use datetime for creating date objects for plotting
import datetime
# Dates for all observations (used to plot the actual values)
months = df[:, df_list.index('month')]
days = df[:, df_list.index('day')]
years = df[:, df_list.index('year')]
# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})
# Dates of predictions
months = test_features[:, df_list.index('month')]
days = test_features[:, df_list.index('day')]
years = test_features[:, df_list.index('year')]
# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions})
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')
# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
plt.xticks(rotation = 'vertical');
plt.legend()
# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');