#Install important packages
!pip install statsmodels==0.13.1
!pip install pmdarima
!pip install xgboost
#imports
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn
import math
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
from datetime import date, timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
#Section 1.1
#getting the data with date as index
data = pd.read_csv('CFX Code Snippet (1).csv',index_col="Date",parse_dates=True)
#let's look at the data for NaN
data.info()
#Looks like we have a few NaN values in the time series. In order to observe a continuous time series, we impute these NaN values through pandas interpolation.
data = data.interpolate() #using interpolation for filling NaN values
data.info()
#Now we can see the time series is complete. Let's visualize it
plt.figure(figsize=(19,9))
plt.plot(data["Adj Close"],label="Commodity price")
plt.plot(data["Adj Close"].rolling(window=30).mean(),label="30_MA")
plt.plot(data["Adj Close"].rolling(window=60).mean(),label="60_MA")
plt.plot(data["Adj Close"].rolling(window=90).mean(),label="90_MA")
plt.plot(data["Adj Close"].rolling(window=120).mean(),label="120_MA")
plt.xlabel("Date")
plt.ylabel("Price")
plt.title("Commodity Prices")
plt.grid()
plt.legend()
# The commodity prices show a varying trend over the past 4 years.
# But whenever the price trends above the 90-day MA it corrects itself. This suggests mean-reverting behaviour.
#Section 1.2
#Let's try to decompose the series for a baseline model where we assume that there are no structural breaks
#But this is a simplistic assumption; we will try to split the data to explore any changes in the trend
#We will use 312 (6 prices every week) as the period to investigate cyclicity within a year; a quick check of that count follows
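#A minimal sanity check of the 312 figure (my addition): counting observations
#per calendar year in the index should come out near 6 prices x 52 weeks = 312
print(data.groupby(data.index.year).size())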
seasonal_dec = seasonal_decompose(data['Adj Close'], model='additive',period=312,two_sided=True)
seasonal_dec.plot()
#Visualizing decomposed plots
seasonal_dec.resid.plot()
plt.ylabel("Residual")
plt.title("Residual plot - Additive Decomposition")
plt.grid()
plt.axhline(y=0,color="black",linestyle="dotted") #residuals of an additive model centre on 0
seasonal_dec.seasonal.plot()
plt.ylabel("Seasonality magnitude")
plt.title("Seasonality plot - Additive Decomposition")
plt.grid()
plt.axhline(y=0,color="black",linestyle="dotted") #the additive seasonal component centres on 0
seasonal_dec.trend.plot()
plt.ylabel("Trend magnitude")
plt.title("Trend plot - Additive Decomposition")
plt.grid()
#We notice that the trend direction changes approximately every 12 months
#We can divide this time series into three sections - 2017 Sep - 2018 Jul, 2018 Sep - 2019 Jul, 2019 Sep - 2020 Jul - and investigate the trend further, as sketched below
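#A minimal sketch of that per-window investigation (my addition; the window
#boundaries are the ones suggested above and assume the dataset covers them):
windows = [("2017-09", "2018-07"), ("2018-09", "2019-07"), ("2019-09", "2020-07")]
for start, end in windows:
    segment = seasonal_dec.trend.loc[start:end].dropna()
    #mean day-over-day change of the trend component within the window
    print(start, "to", end, ": mean trend change =", segment.diff().mean())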
#a
def get_trend(date):
    return seasonal_dec.trend.loc[date]
#b
def get_price(date):
    return seasonal_dec.trend.loc[date] + seasonal_dec.seasonal.loc[date] + seasonal_dec.resid.loc[date]
#b-i)
#We use this measure because Price = Trend + Seasonality + Residuals.
#c We lose around 312 values: the centred moving average behind the trend is undefined for half a period (156 points) at each end of the series, and the period used here is 312.
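#A quick numerical check of the identity in b-i and of the values lost (my addition):
reconstructed = seasonal_dec.trend + seasonal_dec.seasonal + seasonal_dec.resid
print("Max reconstruction error :", (reconstructed - data["Adj Close"]).abs().max())
print("Values lost to the moving average :", reconstructed.isna().sum())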
#Q3)-a,b,c) Note: I did this part using two approaches; one uses classification models and the other statistical models. I will add the statistical models after the classification models.
#Here we use the three-month return (72 days, as the data has 6 prices per week) to gauge whether the commodity price is trending up or down
#Our binary target variable will be 1 if the 3-month return is > 0 and 0 otherwise
data["3_month_return"] = data["Adj Close"].pct_change(72)
data["binary_target"] = data["3_month_return"].apply(lambda x:1 if x>0 else 0)
data["binary_target_future"] = data["binary_target"].shift(-72)
data["trend"] = seasonal_dec.trend
data["price_deviation"] = data["trend"] - data["Adj Close"]
#Lets visualize this price deviation with our future 3 month return
#We shift the binary target 3 months forward to compare current deviations with future returns
fig,ax = plt.subplots(2,1,figsize=(25,12.5))
ax[0].plot(data["binary_target_future"])
ax[0].set_title("Binary_target (3 month return)",fontsize=14)
ax[1].plot(data["price_deviation"],color="black")
ax[1].set_ylabel("Deviation from trend",fontsize=14)
ax[1].set_xlabel("Date",fontsize=14)
ax[1].set_title("Price deviation",fontsize=14)
ax[1].axhline(y=0,linestyle="dotted",color="black")
print("Correlation between price deviation and binary target : ", data["binary_target_future"].corr(data["price_deviation"]))
#Correlation between price deviation and binary target : 0.567698271340469
#We notice that the binary target has a considerable correlation with price deviations. As the price deviations turn negative (price is above trend) we expect the price to revert back down to the mean. Hence this can suggest a downtrend in 3 months.
#In this section we look at some classification models to predict the 3 month price direction
#imports for classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
#imports for metrics and functionality
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import RocCurveDisplay
from xgboost import XGBClassifier
print("Shape of the dataset before removing NaN:",data.shape)
#dropping NaN created from the rolling
data = data.dropna()
print("Shape of the dataset after removing NaN:",data.shape)
#In order to create the metric for trend we performed a seasonal decomposition with period 312 (number of values in a year). Hence we need to truncate our series by 312 data points.
#Logistic Regression
X = np.array(data["price_deviation"]).reshape(-1,1) #predictor variable (price deviation from trend)
y = data["binary_target_future"] #target variable - future 3 month trend direction
#performing train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
#logistic regression
logist_base = LogisticRegression(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_logist = cross_val_score(logist_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_logist.mean()) )
#predicting test data
y_pred = logist_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", logist_base.score(X_test,y_test))
#Average Cross Val Score : 0.7730569948186529
#Out of sample accuracy : 0.768595041322314
#We achieve a decent accuracy with our simplistic logistic regression model
#Let's look at the confusion matrix to investigate the model performance better
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(logist_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - Logistic Regression")
RocCurveDisplay.from_estimator(logist_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7763157894736842
#Recall score : 0.8428571428571429
#F1 score : 0.8082191780821917
#AUC - 0.84
#Our logistic regression model correctly predicted a positive return ~84% of the time and a negative return ~67% of the time
#A high F1 score of ~0.8 suggests that the price deviations from the trend have some predictive power in estimating the future trend direction
#Our area under the curve is 0.84, suggesting good model performance
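#A caveat and an alternative check (my addition): train_test_split shuffles rows,
#and with a 72-day forward-looking target a shuffled split can leak future
#information into training. A time-ordered sketch using sklearn's TimeSeriesSplit:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
cv_scores_ordered = cross_val_score(LogisticRegression(random_state=42), X, y, cv=tscv)
print("Time-ordered CV scores :", cv_scores_ordered)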
#Random Forest
#random forest
rf_base = RandomForestClassifier(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_rf = cross_val_score(rf_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_rf.mean()) )
#predicting test data
y_pred = rf_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", logist_base.score(X_test,y_test))
#Average Cross Val Score : 0.6808290155440414
#Out of sample accuracy : 0.768595041322314
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(rf_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - Random Forests")
RocCurveDisplay.from_estimator(rf_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7815126050420168
#Recall score : 0.6642857142857143
#F1 score : 0.7181467181467182
#We were able to improve the prediction accuracy for class 0, but the model's performance struggled in classifying class 1.
#Tuning the model can help increase the prediction accuracy and support the conclusion that the price deviations have some predictive power over the future price trend
#Support Vector Machines (SVM)
#support vector classifier
SVC_base = SVC(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_svc = cross_val_score(SVC_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_svc.mean()) )
#predicting test data
y_pred = SVC_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", SVC_base.score(X_test,y_test))
#Average Cross Val Score : 0.7720207253886011
#Out of sample accuracy : 0.7851239669421488
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(SVC_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - SVM")
RocCurveDisplay.from_estimator(SVC_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.8055555555555556
#Recall score : 0.8285714285714286
#F1 score : 0.8169014084507044
#XGBoost (Boosting algorithm)
#XGBoost classifier
XG_base = XGBClassifier(random_state=42).fit(X_train,y_train)
#cross validation parameters (folds)
k = 5
#fitting the transformations and the model
cross_val_scores_xg = cross_val_score(XG_base,X_train,y_train,cv=k)
print("Average Cross Val Score : ", format(cross_val_scores_xg.mean()) )
#predicting test data
y_pred = XG_base.predict(X_test)
#Accuracy out of sample
print("Out of sample accuracy : ", XG_base.score(X_test,y_test))
#Average Cross Val Score : 0.7336787564766839
#Out of sample accuracy : 0.731404958677686
#plotting the confusion matrix
ConfusionMatrixDisplay.from_estimator(XG_base,X_test,y_test,normalize="true")
plt.title("Confusion matrix - XGBoost")
RocCurveDisplay.from_estimator(XG_base,X_test,y_test)
print("-"*50)
print("Out of sample results")
print("-"*50)
print("Precision score : ",precision_score(y_test,y_pred))
print("Recall score : ",recall_score(y_test,y_pred))
print("F1 score : ",f1_score(y_test,y_pred))
#Out of sample results
#--------------------------------------------------
#Precision score : 0.7952755905511811
#Recall score : 0.7214285714285714
#F1 score : 0.7565543071161049
#Based on the high performance of our univariate machine learning models, we can conclude that the price deviations from the trend do have some predictive power in estimating the future trend direction
#Logistic Regression has the best performance; the cross-validation scores are collected below for comparison
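#A small summary table (my addition) built from the mean CV scores computed above:
cv_summary = pd.DataFrame({
    "model": ["Logistic Regression", "Random Forest", "SVM", "XGBoost"],
    "mean_cv_score": [cross_val_scores_logist.mean(), cross_val_scores_rf.mean(),
                      cross_val_scores_svc.mean(), cross_val_scores_xg.mean()],
})
print(cv_summary.sort_values("mean_cv_score", ascending=False))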
#d) Supply-demand shocks, arising for various reasons, can make prices volatile in ways that cannot be forecast by a model
#EXTRA -- ANOTHER APPROACH TO PREDICT FUTURE PRICES
#There is another approach to predict future prices where I use an ARIMAX model. The solution provided above is my main analysis, but this is something I tried as well.
#Function running a stationarity check: rolling statistics plot plus the ADF test
def get_stationarity(timeseries):
    # rolling statistics
    rolling_mean = timeseries.rolling(window=12).mean()
    rolling_std = timeseries.rolling(window=12).std()
    # rolling statistics plot
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Dickey-Fuller test:
    result = adfuller(timeseries)
    print('ADF Statistic: {}'.format(result[0]))
    print('p-value: {}'.format(result[1]))
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t{}: {}'.format(key, value))
#Reading the data and setting the date index
df = pd.read_csv('CFX Code Snippet (1).csv' )
df.set_index('Date',inplace=True)
df.index=pd.to_datetime(df.index)
df.describe()
#Checking for low volume
perc =[0.01,0.05,0.1,.11,.12,.13,.14,.15,.16,.17,.18,.19,.20, .40, .60, .80]
df['Volume'].describe(percentiles=perc)
#As the volume is very low at times, I will drop days with volume below 500 contracts; the low volume can be due to contract expiry dates or delivery issues
#When volume is insignificant or low, the close prices for that day are not significant
#At 500 contracts the volume may look low, but for some major commodities it equates to a large quantity
#For example, 500 contracts means 500,000 barrels of oil or 2,500,000 bushels of wheat, corn or soybeans - four major commodities - as computed below
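#Illustrating that contract-size arithmetic (my addition; the multipliers are the
#standard contract sizes: 1,000 barrels per crude oil contract, 5,000 bushels per
#wheat/corn/soybean contract):
contract_units = {"crude oil (barrels)": 1000, "wheat/corn/soybeans (bushels)": 5000}
for name, units in contract_units.items():
    print("500 contracts of", name, "=", 500 * units)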
df['Volume'].hist(bins =30)
#As prices when volume was low are not significant, I will set them to NaN
cols = [col for col in df.columns if col != 'Volume']
df.loc[df['Volume'] <=500, cols ] = np.nan
df
#Dropping columns not needed and interpolating prices so time series is continuous
df = df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'])
df = df.interpolate() #interpolating for the missing values
df.plot()
plt.grid()
plt.ylabel("Price")
plt.title("Commodity price")
#Testing for trend
!pip install pymannkendall
import pymannkendall as mk
mk.original_test(df['Adj Close'])
#Increasing trend detected
#As a trend was detected, I will decompose the time series. I will use additive decomposition as the variation isn't increasing much over time
#We will use 312 (6 prices every week) as the period to investigate cyclicity within a year
from statsmodels.tsa.seasonal import seasonal_decompose
res = seasonal_decompose(df['Adj Close'], model='additive',period=312)
res.plot()
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
num_lags = 40
#plotting ACF
acf_values = plot_acf(res.resid.dropna(), lags=num_lags, zero = True)
#plotting PACF
pacf_values = plot_pacf(res.resid.dropna(), lags=num_lags, zero = True)
#Adding the trend to the dataframe
df['Trend'] = res.trend
#Adding the difference between price and trend to the dataframe
df["TrendDiff"] = res.observed - res.trend
df.plot()
#The decomposition leaves NaN values at both ends, so we drop them. Note the time series is still continuous
df = df.dropna()
df.describe()
#Stationarity test: the p-value is below 5%, so the series is stationary
get_stationarity(df['Adj Close'])
#The trend difference is also stationary
get_stationarity(df['TrendDiff'])
from statsmodels.stats.stattools import durbin_watson
#perform Durbin-Watson test
durbin_watson(df['Adj Close'])
#A value between 0 and 2 indicates positive autocorrelation, which is common in time series data; see the follow-up check below
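#As a follow-up (my addition): the same statistic on the first-differenced series
#shows whether differencing removes most of the autocorrelation (values near 2
#indicate little remaining autocorrelation)
print("DW on levels :", durbin_watson(df['Adj Close']))
print("DW on diffs  :", durbin_watson(df['Adj Close'].diff().dropna()))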
df1 = df.copy()
df.head()
df1.tail(10)
#Defining df2 and df3, splitting the data into the first n-90 values for training and the last 90 for testing our model
df2 = df1.iloc[:-90]
df2
df3 = df1.iloc[-90:].copy() #copy so the forecast column can be added without a SettingWithCopyWarning
df3.head()
get_stationarity(df2['Adj Close'])
get_stationarity(df2['TrendDiff'])
df3.describe()
#Fitting an ARIMAX model: as the question asks how well the price-minus-trend value can predict prices, we treat it as an exogenous variable
from pmdarima.arima import auto_arima
#Note: the exogenous keyword is the pmdarima < 2.0 API; pmdarima >= 2.0 renamed it to X
model = auto_arima(df2[['Adj Close']], exogenous=df2[['TrendDiff']], trace=True, error_action="ignore", suppress_warnings=True)
model.fit(df2[['Adj Close']], exogenous=df2[['TrendDiff']])
forecast = model.predict(n_periods=len(df3), exogenous=df3[['TrendDiff']])
df3["For_ARIMAX"] = forecast
#Plot showing the actual and predicted values. Time series models are good for predicting prices over shorter time frames,
#but as the time horizon increases, the difference between the actual and predicted values increases
plt.figure(figsize=(15,7))
plt.plot(df3['Adj Close'],color='g')
plt.plot(df3['For_ARIMAX'],color='r')
plt.xlabel('Time')
plt.ylabel('Price in $')
plt.legend(['Actual Price', 'Predicted Price'])
plt.title('Actual vs Forecasted price')
plt.show()
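#Quantifying the forecast error over the 90-day holdout (my addition), using
#sklearn's regression metrics on the actual vs. predicted series:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(df3['Adj Close'], df3['For_ARIMAX'])
rmse = math.sqrt(mean_squared_error(df3['Adj Close'], df3['For_ARIMAX']))
print("MAE  over 90-day holdout :", mae)
print("RMSE over 90-day holdout :", rmse)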