#Install important packages
!pip install statsmodels==0.13.1
!pip install pmdarima
!pip install xgboost
#imports
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn
import math
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
from datetime import date, timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
#Section 1.1
#getting the data with date as index
data = pd.read_csv('CFX Code Snippet (1).csv',index_col="Date",parse_dates=True)
#let's look at the data for NaN
data.info()
#Looks like we have a few NaN values in the time series. In order to observe a continuous time series, we impute these NaN values through pandas interpolation.
data = data.interpolate() #using interpolation for filling NaN values
data.info()
#Now we can see the time series is complete. Let's visualize it
plt.figure(figsize=(19,9))
plt.plot(data["Adj Close"],label="Commodity price")
plt.plot(data["Adj Close"].rolling(window=30).mean(),label="30_MA")
plt.plot(data["Adj Close"].rolling(window=60).mean(),label="60_MA")
plt.plot(data["Adj Close"].rolling(window=90).mean(),label="90_MA")
plt.plot(data["Adj Close"].rolling(window=120).mean(),label="120_MA")
plt.xlabel("Date")
plt.ylabel("Price")
plt.title("Commodity Prices")
plt.grid()
plt.legend()
# The commodity prices show a varying trend over the past 4 years.
# But whenever the price trends above the 90-day MA it corrects itself. This suggests mean-reverting behaviour.
#Section 1.2
#Let's try to decompose the series for a baseline model where we assume that there are no structural breaks
#But this is a simplistic assumption; we will try to split the data to explore any changes in the trend
#We will use 312 (6 prices every week) as the period to investigate cyclicity within a year; a quick check of that count follows
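#A minimal sanity check of the 312 figure (my addition): counting observations
#per calendar year in the index should come out near 6 prices x 52 weeks = 312
print(data.groupby(data.index.year).size())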
seasonal_dec = seasonal_decompose(data['Adj Close'], model='additive',period=312,two_sided=True)
seasonal_dec.plot()
#Visualizing decomposed plots
seasonal_dec.resid.plot()
plt.ylabel("Residual")
plt.title("Residual plot - Additive Decomposition")
plt.grid()
plt.axhline(y=0,color="black",linestyle="dotted") #residuals of an additive model centre on 0
seasonal_dec.seasonal.plot()
plt.ylabel("Seasonality magnitude")
plt.title("Seasonality plot - Additive Decomposition")
plt.grid()
plt.axhline(y=0,color="black",linestyle="dotted") #the additive seasonal component centres on 0
seasonal_dec.trend.plot()
plt.ylabel("Trend magnitude")
plt.title("Trend plot - Additive Decomposition")
plt.grid()
#We notice that the trend direction changes approximately every 12 months
#We can divide this time series into three sections - 2017 Sep - 2018 Jul, 2018 Sep - 2019 Jul, 2019 Sep - 2020 Jul - and investigate the trend further, as sketched below
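#A minimal sketch of that per-window investigation (my addition; the window
#boundaries are the ones suggested above and assume the dataset covers them):
windows = [("2017-09", "2018-07"), ("2018-09", "2019-07"), ("2019-09", "2020-07")]
for start, end in windows:
    segment = seasonal_dec.trend.loc[start:end].dropna()
    #mean day-over-day change of the trend component within the window
    print(start, "to", end, ": mean trend change =", segment.diff().mean())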
#a
def get_trend(date):
    return seasonal_dec.trend.loc[date]
#b
def get_price(date):
    return seasonal_dec.trend.loc[date] + seasonal_dec.seasonal.loc[date] + seasonal_dec.resid.loc[date]
#b-i)
#We use this measure because Price = Trend + Seasonality + Residuals.
#c We lose around 312 values: the centred moving average behind the trend is undefined for half a period (156 points) at each end of the series, and the period used here is 312.
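#A quick numerical check of the identity in b-i and of the values lost (my addition):
reconstructed = seasonal_dec.trend + seasonal_dec.seasonal + seasonal_dec.resid
print("Max reconstruction error :", (reconstructed - data["Adj Close"]).abs().max())
print("Values lost to the moving average :", reconstructed.isna().sum())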
#Q3)-a,b,c) Note: I did this part using two approaches; one uses classification models and the other statistical models. I will add the statistical models after the classification models.
#Here we use the three-month return (72 days, as the data has 6 prices per week) to gauge whether the commodity price is trending up or down
#Our binary target variable will be 1 if the 3-month return is > 0 and 0 otherwise
data["3_month_return"] = data["Adj Close"].pct_change(72)
data["binary_target"] = data["3_month_return"].apply(lambda x:1 if x>0 else 0)
data["binary_target_future"] = data["binary_target"].shift(-72)
data["trend"] = seasonal_dec.trend
data["price_deviation"] = data["trend"] - data["Adj Close"]
#Lets visualize this price deviation with our future 3 month return
#We shift the binary target 3 months forward to compare current deviations with future returns
fig,ax = plt.subplots(2,1,figsize=(25,12.5))
ax[0].plot(data["binary_target_future"])
ax[0].set_title("Binary_target (3 month return)",fontsize=14)
ax[1].plot(data["price_deviation"],color="black")
ax[1].set_ylabel("Deviation from trend",fontsize=14)
ax[1].set_xlabel("Date",fontsize=14)
ax[1].set_title("Price deviation",fontsize=14)
ax[1].axhline(y=0,linestyle="dotted",color="black")
print("Correlation between price deviation and binary target : ", data["binary_target_future"].corr(data["price_deviation"]))
#Correlation between price deviation and binary target : 0.567698271340469
#We notice that the binary target has a considerable correlation with price deviations. As the price deviations turn negative (price is above trend) we expect the price to revert back down to the mean. Hence this can suggest a downtrend in 3 months.
#In this section we look at some classification models to predict the 3 month price direction
#imports for classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
#imports for metrics and functionality
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import RocCurveDisplay
from xgboost import XGBClassifier
print("Shape of the dataset before removing NaN:",data.shape)
#dropping NaN created from the rolling
data = data.dropna()
print("Shape of the dataset after removing NaN:",data.shape)
#In order to create the metric for trend we performed a seasonal decomposition with period 312 (number of values in a year). Hence we need to truncate our series by 312 data points.
#Logistic Regression
X = np.array(data["price_deviation"]).reshape(-1,1) #predictor variable (price deviation from trend)
y = data["binary_target_future"] #target variable - future 3 month trend direction
#performing train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
#logistic regression
logist_base = LogisticRegression(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_logist = cross_val_score(logist_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_logist.mean()) )
#predicting test data
y_pred = logist_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", logist_base.score(X_test,y_test))
#Average Cross Val Score : 0.7730569948186529
#Out of sample accuracy : 0.768595041322314
#We achieve a decent accuracy with our simplistic logistic regression model
#Let's look at the confusion matrix to investigate the model performance better
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(logist_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - Logistic Regression")
RocCurveDisplay.from_estimator(logist_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7763157894736842
#Recall score : 0.8428571428571429
#F1 score : 0.8082191780821917
#AUC - 0.84
#Our logistic regression model correctly predicted a positive return ~84% of the time and a negative return ~67% of the time
#A high F1 score of ~0.8 suggests that the price deviations from the trend have some predictive power in estimating the future trend direction
#Our area under the curve is 0.84, suggesting good model performance
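#A caveat and an alternative check (my addition): train_test_split shuffles rows,
#and with a 72-day forward-looking target a shuffled split can leak future
#information into training. A time-ordered sketch using sklearn's TimeSeriesSplit:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
cv_scores_ordered = cross_val_score(LogisticRegression(random_state=42), X, y, cv=tscv)
print("Time-ordered CV scores :", cv_scores_ordered)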
#Random Forest
#random forest
rf_base = RandomForestClassifier(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_rf = cross_val_score(rf_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_rf.mean()) )
#predicting test data
y_pred = rf_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", logist_base.score(X_test,y_test))
#Average Cross Val Score : 0.6808290155440414
#Out of sample accuracy : 0.768595041322314
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(rf_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - Random Forests")
RocCurveDisplay.from_estimator(rf_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7815126050420168
#Recall score : 0.6642857142857143
#F1 score : 0.7181467181467182
#We were able to improve the prediction accuracy for class 0, but the model's performance struggled in classifying class 1.
#Tuning the model can help increase the prediction accuracy and support the conclusion that the price deviations have some predictive power over the future price trend
#Support Vector Machines (SVM)
#support vector classifier
SVC_base = SVC(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_svc = cross_val_score(SVC_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_svc.mean()) )
#predicting test data
y_pred = SVC_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", SVC_base.score(X_test,y_test))
#Average Cross Val Score : 0.7720207253886011
#Out of sample accuracy : 0.7851239669421488
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(SVC_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - SVM")
RocCurveDisplay.from_estimator(SVC_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.8055555555555556
#Recall score : 0.8285714285714286
#F1 score : 0.8169014084507044
#XGBoost (Boosting algorithm)
#XGBoost classifier
XG_base = XGBClassifier(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_xg = cross_val_score(XG_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_xg.mean()) )
#predicting test data
y_pred = XG_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", XG_base.score(X_test,y_test))
#Average Cross Val Score : 0.7336787564766839
#Out of sample accuracy : 0.731404958677686
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(XG_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - XGBoost")
RocCurveDisplay.from_estimator(XG_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7952755905511811
#Recall score : 0.7214285714285714
#F1 score : 0.7565543071161049
#Based on the high performance of our univariate machine learning models, we can conclude that the price deviations from the trend do have some predictive power in estimating the future trend direction
#Logistic Regression has the best performance; the cross-validation scores are collected below for comparison
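#A small summary table (my addition) built from the mean CV scores computed above:
cv_summary = pd.DataFrame({
    "model": ["Logistic Regression", "Random Forest", "SVM", "XGBoost"],
    "mean_cv_score": [cross_val_scores_logist.mean(), cross_val_scores_rf.mean(),
                      cross_val_scores_svc.mean(), cross_val_scores_xg.mean()],
})
print(cv_summary.sort_values("mean_cv_score", ascending=False))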
#d) Supply-demand shocks, arising for various reasons, can make prices volatile in ways that cannot be forecast by a model
#EXTRA -- ANOTHER APPROACH TO PREDICT FUTURE PRICES
#There is another approach to predict future prices where I use an ARIMAX model. The solution provided above is my main analysis, but this is something I tried as well.
#Function running a stationarity check: rolling statistics plot plus the ADF test
def get_stationarity(timeseries):
    # rolling statistics
    rolling_mean = timeseries.rolling(window=12).mean()
    rolling_std = timeseries.rolling(window=12).std()
    # rolling statistics plot
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Dickey-Fuller test:
    result = adfuller(timeseries)
    print('ADF Statistic: {}'.format(result[0]))
    print('p-value: {}'.format(result[1]))
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t{}: {}'.format(key, value))
#Reading the data and setting the date index
df = pd.read_csv('CFX Code Snippet (1).csv' )
df.set_index('Date',inplace=True)
df.index=pd.to_datetime(df.index)
df.describe()
#Checking for low volume
perc =[0.01,0.05,0.1,.11,.12,.13,.14,.15,.16,.17,.18,.19,.20, .40, .60, .80]
df['Volume'].describe(percentiles=perc)
#As the volume is very low at times, I will drop days with volume below 500 contracts; the low volume can be due to contract expiry dates or delivery issues
#When volume is insignificant or low, the close prices for that day are not significant
#At 500 contracts the volume may look low, but for some major commodities it equates to a large quantity
#For example, 500 contracts means 500,000 barrels of oil or 2,500,000 bushels of wheat, corn or soybeans - four major commodities - as computed below
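#Illustrating that contract-size arithmetic (my addition; the multipliers are the
#standard contract sizes: 1,000 barrels per crude oil contract, 5,000 bushels per
#wheat/corn/soybean contract):
contract_units = {"crude oil (barrels)": 1000, "wheat/corn/soybeans (bushels)": 5000}
for name, units in contract_units.items():
    print("500 contracts of", name, "=", 500 * units)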
df['Volume'].hist(bins =30)
#As prices when volume was low are not significant, I will set them to NaN
cols = [col for col in df.columns if col != 'Volume']
df.loc[df['Volume'] <=500, cols ] = np.nan
df
#Dropping columns not needed and interpolating prices so time series is continuous
df = df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'])
df = df.interpolate() #interpolating for the missing values
df.plot()
plt.grid()
plt.ylabel("Price")
plt.title("Commodity price")
#Testing for trend
!pip install pymannkendall
import pymannkendall as mk
mk.original_test(df['Adj Close'])
#Increasing trend detected
#As a trend was detected, I will decompose the time series. I will use additive decomposition as the variation isn't increasing much over time
#We will use 312 (6 prices every week) as the period to investigate cyclicity within a year
from statsmodels.tsa.seasonal import seasonal_decompose
res = seasonal_decompose(df['Adj Close'], model='additive',period=312)
res.plot()
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
num_lags = 40
#plotting ACF
acf_values = plot_acf(res.resid.dropna(), lags=num_lags, zero = True)
#plotting PACF
pacf_values = plot_pacf(res.resid.dropna(), lags=num_lags, zero = True)
#Adding the trend to the dataframe
df['Trend'] = res.trend
#Adding the difference between price and trend to the dataframe
df["TrendDiff"] = res.observed - res.trend
df.plot()
#The decomposition leaves NaN values at both ends, so we drop them. Note the time series is still continuous
df = df.dropna()
df.describe()
#Stationarity test: the p-value is below 5%, so the series is stationary
get_stationarity(df['Adj Close'])
#The trend difference is also stationary
get_stationarity(df['TrendDiff'])
from statsmodels.stats.stattools import durbin_watson
#perform Durbin-Watson test
durbin_watson(df['Adj Close'])
#A value between 0 and 2 indicates positive autocorrelation, which is common in time series data; see the follow-up check below
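#As a follow-up (my addition): the same statistic on the first-differenced series
#shows whether differencing removes most of the autocorrelation (values near 2
#indicate little remaining autocorrelation)
print("DW on levels :", durbin_watson(df['Adj Close']))
print("DW on diffs  :", durbin_watson(df['Adj Close'].diff().dropna()))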
df1 = df.copy()
df.head()
df1.tail(10)
#Defining df2 and df3, splitting the data into the first n-90 values for training and the last 90 for testing our model
df2 = df1.iloc[:-90]
df2
df3 = df1.iloc[-90:].copy() #copy so the forecast column can be added without a SettingWithCopyWarning
df3.head()
get_stationarity(df2['Adj Close'])
get_stationarity(df2['TrendDiff'])
df3.describe()
#Fitting an ARIMAX model: as the question asks how well the price-minus-trend value can predict prices, we treat it as an exogenous variable
from pmdarima.arima import auto_arima
#Note: the exogenous keyword is the pmdarima < 2.0 API; pmdarima >= 2.0 renamed it to X
model = auto_arima(df2[['Adj Close']], exogenous=df2[['TrendDiff']], trace=True, error_action="ignore", suppress_warnings=True)
model.fit(df2[['Adj Close']], exogenous=df2[['TrendDiff']])
forecast = model.predict(n_periods=len(df3), exogenous=df3[['TrendDiff']])
df3["For_ARIMAX"] = forecast
#Plot showing the actual and predicted values. Time series models are good for predicting prices over shorter time frames,
#but as the time horizon increases, the difference between the actual and predicted values increases
plt.figure(figsize=(15,7))
plt.plot(df3['Adj Close'],color='g')
plt.plot(df3['For_ARIMAX'],color='r')
plt.xlabel('Time')
plt.ylabel('Price in $')
plt.legend(['Actual Price', 'Predicted Price'])
plt.title('Actual vs Forecasted price')
plt.show()
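#Quantifying the forecast error over the 90-day holdout (my addition), using
#sklearn's regression metrics on the actual vs. predicted series:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(df3['Adj Close'], df3['For_ARIMAX'])
rmse = math.sqrt(mean_squared_error(df3['Adj Close'], df3['For_ARIMAX']))
print("MAE  over 90-day holdout :", mae)
print("RMSE over 90-day holdout :", rmse)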