!pip install pandas seaborn statsmodels
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# --- Load the raw time-series CSVs and normalize their 'Date' columns ---

# Gold price history, 1968-2023; 'Close' is the price series we model.
gold_price = pd.read_csv('gold price(1968-2023).csv')
gold_price['Date'] = pd.to_datetime(gold_price['Date'], format="%d/%m/%Y")
gold_price = gold_price.rename(columns={'Close': 'Gold Price'})
print(gold_price.head())

# US Consumer Price Index, monthly.
cpi = pd.read_csv('US CPI monthly.csv')
cpi['Date'] = pd.to_datetime(cpi['Date'], format="%d/%m/%Y")
cpi = cpi.rename(columns={'Close': 'CPI'})
print(cpi.head())

# US M2 money supply, monthly; its 'Close' values carry T/B magnitude
# suffixes and are converted to floats further below.
money_supply = pd.read_csv('US Money Supply (M2) monthly.csv')
money_supply['Date'] = pd.to_datetime(money_supply['Date'], format="%d/%m/%Y")
def convert_to_float(value):
    """Convert a magnitude-suffixed string (e.g. '2.1T', '350B') to a float.

    'T' means trillions (1e12) and 'B' means billions (1e9). Any other
    string is parsed as a plain float, and already-numeric values (including
    NaN) pass through unchanged, so the function is safe under
    ``Series.apply`` on partially-parsed columns.

    Raises ValueError for strings that are not numeric after suffix removal.
    """
    # Pass through values pandas has already parsed as numbers (incl. NaN).
    if isinstance(value, (int, float)):
        return float(value)
    value = value.strip()
    if value.endswith('T'):
        return float(value[:-1]) * 1e12
    if value.endswith('B'):
        return float(value[:-1]) * 1e9
    # Fix: the original fell off the end here and implicitly returned None,
    # silently turning unsuffixed values into NaN downstream.
    return float(value)
# Scale the suffixed money-supply figures to raw dollars, then rename.
money_supply['Close'] = money_supply['Close'].apply(convert_to_float)
money_supply = money_supply.rename(columns={'Close': 'Money Supply'})
print(money_supply.head())

# Effective federal funds (overnight) rate.
interest_rate = pd.read_csv('US OVERNIGHT FED FUNDS EFFECT.csv')
interest_rate['Date'] = pd.to_datetime(interest_rate['Date'], format="%d/%m/%Y")
interest_rate = interest_rate.rename(columns={'Close': 'Interest Rate %'})
print(interest_rate.head())

# S&P 500 index; note the different date format and the thousands
# separators that must be stripped before the cast to float.
sp500 = pd.read_csv('S&P 500 Index.csv')
sp500['Date'] = pd.to_datetime(sp500['Date'], format="%d-%b-%Y")
sp500 = sp500.rename(columns={'Close': 'S&P 500'})
sp500['S&P 500'] = sp500['S&P 500'].str.replace(',', '').astype(float)
print(sp500.head())

# Yearly world GDP, also using T/B magnitude suffixes.
world_gdp = pd.read_csv('World GDP yearly.csv')
world_gdp['Date'] = pd.to_datetime(world_gdp['Date'], format="%d/%m/%Y")
world_gdp['Close'] = world_gdp['Close'].apply(convert_to_float)
world_gdp = world_gdp.rename(columns={'Close': 'World GDP'})
print(world_gdp.head())
# World Uncertainty Index, quarterly, with quarters encoded like '1990q1'.
world_uncertainty = pd.read_csv('uncertainty.csv')
# Rewriting 'q' as '-Q' ('1990-Q1') lets pd.to_datetime parse the quarter.
world_uncertainty['Year'] = pd.to_datetime(world_uncertainty['Year'].str.replace('q', '-Q'))
# Rename the 'Year' column to 'Date' to match the other dataframes.
world_uncertainty = world_uncertainty.rename(columns={'Year': 'Date'})
# Drop the duplicated year column if the export included one.
world_uncertainty = world_uncertainty.drop(columns=['Year.1'], errors='ignore')
# Sort newest-first with a clean positional index.
world_uncertainty = world_uncertainty.sort_values(by='Date', ascending=False).reset_index(drop=True)
# These columns arrive as strings with thousands separators; coerce to float.
# Fix: removed the unused 'string_columns' computation that was dead code.
columns_to_convert = ['Global (simple average)', 'Global (GDP weighted average)',
                      'Advanced economies', 'Emerging economies', 'Low-income economies']
for column in columns_to_convert:
    world_uncertainty[column] = world_uncertainty[column].str.replace(',', '').astype(float)
print(world_uncertainty.head())
# --- Align every series onto a common month-end index and merge ---

# Gold is daily; collapse it to monthly means up front.
gold_price_monthly = gold_price.resample('M', on='Date').mean()

dataframes = [gold_price_monthly, sp500, cpi, money_supply, interest_rate,
              world_gdp, world_uncertainty]

for df in dataframes:
    # Promote 'Date' to a DatetimeIndex where it is still a column.
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
        df.set_index('Date', inplace=True)
    # Force every remaining column to numeric; unparsable values become NaN.
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Resample each series to monthly frequency and forward-fill the gaps
# (yearly GDP and quarterly uncertainty repeat until the next reading).
for i, df in enumerate(dataframes):
    df_resampled = df.resample('M').mean()
    # Fix: fillna(method='ffill') is deprecated in pandas 2.x; use .ffill().
    dataframes[i] = df_resampled.ffill()

# Join on the shared date index and keep only fully observed months.
df_combined = pd.concat(dataframes, axis=1)
df_cleaned_and_combined = df_combined.dropna()
print(df_cleaned_and_combined.tail(24))
# Overlay interest rates and gold prices on one chart with twin y-axes.
rate_ax = df_cleaned_and_combined['Interest Rate %'].plot(
    figsize=(10, 6), color='blue', label='Interest Rates')
gold_ax = rate_ax.twinx()
df_cleaned_and_combined['Gold Price'].plot(
    ax=gold_ax, color='gold', label='Gold Price USD/oz')
rate_ax.set_xlabel("Date")
gold_ax.set_ylabel("Gold Prices USD/oz")
rate_ax.set_title("Interest Rates and Gold Prices")
# Merge the handles from both axes into a single legend.
handles_left, labels_left = rate_ax.get_legend_handles_labels()
handles_right, labels_right = gold_ax.get_legend_handles_labels()
rate_ax.legend(handles_left + handles_right, labels_left + labels_right,
               loc='upper left')
plt.tight_layout()
plt.show()
# Small-multiples view of every series to eyeball trends.
df_cleaned_and_combined.plot(subplots=True, figsize=(10, 15))
plt.tight_layout()
plt.xlabel("Date")
plt.show()

# Pairwise correlations across all series, rounded for readability.
correlation_matrix = df_cleaned_and_combined.corr().round(2)
print(correlation_matrix)

# Heatmap of the same correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5)
plt.title("Correlation Heatmap")
plt.show()

# Money supply on its own, since its scale dwarfs the other series.
df_cleaned_and_combined['Money Supply'].plot(subplots=True, figsize=(8, 5))
plt.tight_layout()
plt.xlabel("Date")
plt.show()
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# OLS of gold price on the full macro feature set.
Y = df_cleaned_and_combined['Gold Price']
X = df_cleaned_and_combined[['S&P 500', 'CPI', 'Money Supply', 'Interest Rate %',
                             'World GDP', 'Global (simple average)',
                             'Global (GDP weighted average)', 'Advanced economies',
                             'Emerging economies', 'Low-income economies']]
# Include an intercept term.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print(model.summary())

# Variance inflation factors flag multicollinearity among the regressors.
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vif_data)
There is strong multicollinearity: any VIF value higher than 10 indicates multicollinearity.
World GDP and S&P 500 are dropped as they are strongly correlated with the money supply. Global (GDP weighted average) will remain as the only measure of uncertainty. Feel free to experiment with the other uncertainty measures if needed.
# Refit with the reduced regressor set: S&P 500 and World GDP are removed
# (they track the money supply), and only the GDP-weighted uncertainty
# measure is retained.
Y = df_cleaned_and_combined['Gold Price']
X = df_cleaned_and_combined[['CPI', 'Money Supply', 'Interest Rate %',
                             'Global (GDP weighted average)']]
# Include an intercept term.
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print(model.summary())

# Re-check VIFs on the reduced model.
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                   for i in range(X.shape[1])]
print(vif_data)
# Test the fitted model for heteroscedasticity with the Breusch-Pagan test.
from statsmodels.stats.diagnostic import het_breuschpagan

residuals = model.resid
exog = model.model.exog
bp_stats = het_breuschpagan(residuals, exog)
stat_names = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
print(dict(zip(stat_names, bp_stats)))
There is no heteroscedasticity: all p-values are above 0.05.
Testing across different interest rate periods.
# Compare regimes: refit the reduced model separately on the high- and
# low-interest-rate months.
columns_to_drop = ['S&P 500', 'World GDP', 'Advanced economies',
                   'Emerging economies', 'Low-income economies',
                   'Global (simple average)']
df_new = df_cleaned_and_combined.drop(columns=columns_to_drop)

# Regime cut-off in percent (adjustable).
threshold = 1.5
df_high_interest = df_new[df_new['Interest Rate %'] > threshold]
df_low_interest = df_new[df_new['Interest Rate %'] <= threshold]

# High-rate regime.
X_high = sm.add_constant(df_high_interest.drop('Gold Price', axis=1))
model_high = sm.OLS(df_high_interest['Gold Price'], X_high).fit()
print("High Interest Rate Period Regression Results:")
print(model_high.summary())

# Low-rate regime.
X_low = sm.add_constant(df_low_interest.drop('Gold Price', axis=1))
model_low = sm.OLS(df_low_interest['Gold Price'], X_low).fit()
print("\nLow Interest Rate Period Regression Results:")
print(model_low.summary())