!pip install yfinance
!pip install openpyxl
!pip install statsmodels==0.13.0
!pip install lazypredict
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
# set matplotlib.pyplot to default style
plt.style.use('default')
# Randomly selected companies from 4 industries
Manufacturing = ["TSM", "TXN", "AMAT", "IRBT", "LRCX", "PATK", "ACCO", "AIN", "EL", "ECL", "ZBRA", "RRX", "TSLA", "DOV", "KMB", "JBL", "FLEX", "ENTG", "ACH", "HELE", "MHK", "CAJ", "JJSF", "LULU", "CHD"] #25 companies
Medical = ["EW", "WST", "IDXX", "RMD","VRTX","AMGN", "ALGN", "PFE", "ZTS", "DHR", "TMO", "CERN", "UNH", "COO", "CNC", "QGEN", "CAH", "HCA", "MDT", "MRK", "JNJ", "REGN", "TFX", "LLY", "DVA", "ABC"] #26 companies
Logistic_transportation = ["KSU", "CHRW", "EXPD", "UNP", "FDX", "TIAIY", "ORAN", "EIX", "ASR", "DAL", "KNX", "SKYW", "HUBG", "HTLD", "JBHT", "OKE", "KEX", "NCLH", "ODFL", "PAC", "ZNH", "UAL", "TDW"] # 23 companies (excl. "TIGO", "OSG")
Energy = ['NEE','ORA','ES','SRE','BWXT', "AEP", "DUK", "ATO", "PAGP", "DVN", "VLO", "PDCE", "HLX", "RRC", "KOS", "FRO", "TRGP", "LIN", "HE", "ENLAY", "CPE", "EC", "WLL", "CRK"] # 24 companies in use (excl. 'PLUG', 'ENPH', 'BTU')
# Stock return
start=dt.datetime(2014,12,31)
end=dt.datetime(2021,9,30)
overall = []
overall.extend(Manufacturing)
overall.extend(Medical)
overall.extend(Logistic_transportation)
overall.extend(Energy)
data_O=yf.download(overall,start=start,end=end)
df_O_all=data_O['Adj Close']
df_O_all=df_O_all.ffill()
# We use five years of data (2015-2019); 2020 returns are excluded because they are too volatile due to the pandemic
df_O = df_O_all[:dt.datetime(2019,12,31)]
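# Optional sanity check (sketch, not part of the original workflow): confirm the
# truncated price panel covers the expected 2015-2019 window and see how many
# tickers still contain missing prices after forward-filling.
print(df_O.shape, df_O.index.min(), df_O.index.max())
print("Tickers with remaining NaNs:", df_O.isna().any().sum())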
# Benchmark data (market indices & ESG ETFs)
benchmark_df = yf.download(['SPY'], start=start, end=end)
benchmark_SPY_all = benchmark_df['Adj Close']
benchmark_SPY = benchmark_SPY_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['^IXIC'], start=start, end=end)
benchmark_IXIC_all = benchmark_df['Adj Close']
benchmark_IXIC = benchmark_IXIC_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['^DJI'], start=start, end=end)
benchmark_DJI_all = benchmark_df['Adj Close']
benchmark_DJI = benchmark_DJI_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGD'], start=start, end=end)
benchmark_ESGD_all = benchmark_df['Adj Close']
benchmark_ESG = benchmark_ESGD_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGV'], start=start, end=end)
benchmark_ESGV_all = benchmark_df['Adj Close']
benchmark_ESG2 = benchmark_ESGV_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ICLN'], start=start, end=end)
benchmark_ICLN_all = benchmark_df['Adj Close']
benchmark_ESG3 = benchmark_ICLN_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['SUSA'], start=start, end=end)
benchmark_SUSA_all = benchmark_df['Adj Close']
benchmark_ESG4 = benchmark_SUSA_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGHX'], start=start, end=end)
benchmark_ESGHX_all = benchmark_df['Adj Close']
benchmark_ESG5 = benchmark_ESGHX_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['NEXTX'], start=start, end=end)
benchmark_NEXTX_all = benchmark_df['Adj Close']
benchmark_ESG6 = benchmark_NEXTX_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ATEYX'], start=start, end=end)
benchmark_ATEYX_all = benchmark_df['Adj Close']
benchmark_ESG7 = benchmark_ATEYX_all[:dt.datetime(2019,12,31)]
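# Helper sketch (assumption: this wrapper is not used below, names are illustrative):
# the repeated download-and-truncate pattern above can be factored into one function.
def download_adj_close(ticker, cutoff=dt.datetime(2019,12,31)):
    # Download adjusted close prices for a single ticker and truncate at the cutoff date
    px = yf.download([ticker], start=start, end=end)['Adj Close']
    return px[:cutoff]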
# ESG Benchmark
_5benchmark =['ICLN','SUSA','ESGHX','NEXTX','ATEYX']
# Market index benchmarks
_3benchmark = ['SPY', 'IXIC','DJI']
!pip install xlrd
# Read and clean the Fama-French 3-factor data
path_1 = "FF3.CSV"
FF3=pd.read_csv(path_1, skiprows=4)
#FF3 = FF3.sort_index(ascending = False)
#FF3 = FF3.reset_index()
FF3 = FF3.rename(columns={'Unnamed: 0': 'Year'})
FF3 = FF3[:-1]
FF3["Year"] = pd.to_datetime(FF3['Year'],format='%Y%m%d')
FF3 = FF3[FF3['Year'] >= dt.datetime(2014,12,31)]
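# Quick check (sketch): after cleaning, the factor file is expected to hold the daily
# Fama-French columns used in the merge and regression below.
print(FF3.columns.tolist())   # expected: ['Year', 'Mkt-RF', 'SMB', 'HML', 'RF']
print(FF3.head())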
df = pd.DataFrame()
for i in overall:
path = "Archive/ESG Table for " + i + ".xlsx"
temp_n=pd.read_excel(path,engine='openpyxl')
temp_n = temp_n[1:14]
temp_n = temp_n.drop(temp_n.index[1:7])
# Transpose so that report years become rows
temp_n = temp_n.T
temp_n.columns = temp_n.iloc[0]
temp_n = temp_n.drop(temp_n.index[0])
# Set the first column as the index
temp_n.set_index(temp_n.columns[0],inplace=True)
temp_n = temp_n[1:6]
temp_n = temp_n.reset_index()
# Rename column names
temp_n = temp_n.rename(columns={np.nan: 'Year'})
temp_n.columns = ['Year','ESG_C_S', 'ESG_S', 'Env_S', 'Soc_S', 'Gov_S', 'ESG_Controversial']
# Map letter grades (D- to A+) to integer scores (0-11)
temp_n = temp_n.replace(["D-", "D","D+","C-", "C","C+","B-", "B","B+","A-", "A","A+"], [0,1,2,3,4,5,6,7,8,9,10,11])
temp_n['Year'] = pd.to_datetime(temp_n['Year'])
temp_n = temp_n.set_index("Year")
# Add boundary rows at 2014-12-31 and 2019-12-31 (copies of the frame's last row) so the daily resample below spans the full sample window
new_date1 = pd.DataFrame(temp_n[-1:].values, index=[pd.to_datetime('2014-12-31')], columns=temp_n.columns)
new_date2 = pd.DataFrame(temp_n[-1:].values, index=[pd.to_datetime('2019-12-31')], columns=temp_n.columns)
temp_n = pd.concat([temp_n, new_date1, new_date2])
temp_n = temp_n.sort_index()
temp_n = temp_n.resample('D').asfreq()
temp_n = temp_n.ffill()
# adjust column type
#temp_n["Year"] = pd.to_numeric(temp_n["Year"])
temp_n = temp_n.apply(pd.to_numeric)
# Merge the ESG scores with the Fama-French factors
temp_n = temp_n.merge(FF3, how = "inner", left_on = temp_n.index, right_on = FF3['Year'])
temp_n = temp_n.set_index("Year")
temp_n = temp_n.drop(columns = 'key_0')
# Compute Daily Return
temp_return = df_O[i].ffill().pct_change()
temp_return = temp_return.to_frame()
temp_return.columns = [i]
temp_return = temp_return[1:]
temp_return = temp_return.round(3)
temp_return = temp_return.reset_index()
temp_return = temp_return.rename(columns={'Date': 'Year', i: "Ret"})
temp_return['Year'] = pd.DatetimeIndex(temp_return['Year'])
temp_n = temp_n.merge(temp_return, how = "outer", on = "Year")
#Set the company name as index
temp_n["Company"] = i
if i in Manufacturing:
temp_n['Industry'] = 'Manufacturing'
elif i in Medical:
temp_n['Industry'] = 'Medical'
elif i in Logistic_transportation:
temp_n['Industry'] = 'Logistic'
else:
temp_n['Industry'] = 'Energy'
temp_n = temp_n.set_index("Company")
df = pd.concat([df,temp_n], axis = 0)
df_I = df
# get dummy variable for Industry
df = pd.get_dummies(df, prefix=['I'], columns=['Industry'],drop_first=True)
df = df.dropna()
df
df.isna().sum()
# Regression plot to check for a relationship between the combined ESG score and daily returns
plt.figure(figsize = (16,8))
sns.regplot(x = 'ESG_C_S', y = "Ret", data = df, scatter_kws={"color": "black"}, line_kws={"color": "red"})
plt.ylabel("Daily return")
plt.xlabel("ESG combined score")
def CI_test(df, i):
bs_replicates = np.empty(10000)
for j in range(10000):
bs_replicates[j] = np.mean(np.random.choice(df[i], len(df[i])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval for', i, '=', conf_int)
# Theoretical confidence intervals
conf_int_actual_upper = df[i].mean() + df[i].std()/np.sqrt(df[i].count())*1.96
conf_int_actual_lower = df[i].mean() - df[i].std()/np.sqrt(df[i].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval for', i, '=', conf_int_actual)
CI_test(df, 'Ret')
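# The two intervals printed above should roughly agree: the theoretical interval is the
# normal approximation mean(Ret) +/- 1.96 * std(Ret) / sqrt(n), while the bootstrap
# interval takes the 2.5th and 97.5th percentiles of 10,000 resampled means.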
# Drop Year, which introduces a multicollinearity problem
df = df.drop(columns = 'Year')
df.columns
# Correlation heatmap --> identify potential correlated factors
corr = df.corr()
plt.figure(figsize = (16,8))
sns.heatmap(corr,annot=True)
plt.show()
# dependent/target/outcome variable
y = df['Ret']
# independent/predictor/explanatory variable
X = df[['ESG_C_S','ESG_S','Env_S', 'Soc_S','Gov_S','ESG_Controversial', 'Mkt-RF','SMB', 'RF' ,'HML','I_Logistic', 'I_Manufacturing', 'I_Medical']]
#
#'ESG_C_S', 'ESG_S', 'Env_S', 'Soc_S','Gov_S','ESG_Controversial'
#'Mkt-RF','SMB', 'RF' ,'HML','I_Logistic', 'I_Manufacturing', 'I_Medical',
# OLS Regression
X = sm.add_constant(X)
ols_model = sm.OLS(y,X.astype(float))
result = ols_model.fit()
print(result.summary2())
# The condition number should ideally be below ~20; larger values indicate multicollinearity
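# Multicollinearity check (sketch, assuming the design matrix X defined above): variance
# inflation factors for each regressor; values far above ~10, like a large condition
# number, point to collinear predictors (e.g. the composite ESG score vs. its components).
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_float = X.astype(float)
vif = pd.Series(
    [variance_inflation_factor(X_float.values, j) for j in range(X_float.shape[1])],
    index=X_float.columns,
)
print(vif.round(2))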
df_O.index
df_O = df_O.drop(df_O.index[0] ,axis = 0)
df_1 = df_O.ffill()
df_1 = df_1.pct_change()
df_1 = df_1.drop(df_1.index[0] ,axis = 0)
df_1
# Stationarity test
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, i):
# Determining rolling statistics
rolmean = timeseries.rolling(14).mean()
rolstd = timeseries.rolling(14).std()
# Plot rolling statistics:
#orig = plt.plot(timeseries, color='blue',label='Original')
#mean = plt.plot(rolmean, color='red', label='Rolling Mean')
#std = plt.plot(rolstd, color='black', label = 'Rolling Std')
#plt.legend(loc='best')
#plt.title('Rolling Mean & Standard Deviation')
#plt.show(block=False)
# Perform Dickey-Fuller test:
#print ('Results of Dickey-Fuller Test:' + i)
timeseries = timeseries.iloc[:].values
dftest = adfuller(timeseries, autolag='AIC')
#dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
#for key,value in dftest[4].items():
# dfoutput['Critical Value (%s)'%key] = value
#print(dfoutput)
if dftest[1] > 0.05:
print(i)
for i in overall:
test_stationarity(df_1[i], i)
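# Interpretation: the loop prints only tickers whose ADF p-value exceeds 0.05, i.e. where
# we fail to reject the unit-root null and the return series may be non-stationary;
# an empty output means every daily return series looks stationary.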
# Train/test split (80/20) for the model comparison below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
#LazyPredict on regression models
# fit all models
reg = LazyRegressor(predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models
predictions
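# Follow-up sketch (assumption: a plain linear regression baseline, not necessarily
# LazyPredict's top-ranked model): refit one model explicitly so its fit can be
# compared with the statsmodels OLS summary above.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Baseline test R^2:", r2_score(y_test, lr.predict(X_test)))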
!pip install PyPortfolioOpt==1.2.1
!pip install --upgrade scipy
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import cla
from pypfopt.plotting import Plotting
from pypfopt.discrete_allocation import DiscreteAllocation, get_latest_prices
from pypfopt import objective_functions
import scipy.stats as st
# compute efficient frontier & portfolio statistics
class EF:
def __init__(self, asset, asset_list):
self.asset = asset
self.asset_list = asset_list
self.computation(asset, asset_list)
def computation(self, asset, asset_list):
# Check NaN values in the data
nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
# Calculate portfolio mean return
mu = expected_returns.mean_historical_return(self.asset)
# Calculate portfolio return variance
sigma = risk_models.sample_cov(self.asset)
# weight_bounds=(0,1) restricts the portfolio to long-only; negative lower bounds would allow shorting
ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
ef.add_objective(objective_functions.L2_reg, gamma=2)
# optional constraints possible, read pypfopt documentation.
sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
sharpe_portfolio_wt=ef.clean_weights()
latest_prices = get_latest_prices(self.asset)
# Allocate Portfolio Value in $ as required to show number of shares/stocks to buy,
# also bounds for shorting will affect allocation
# Maximum Sharpe Portfolio Allocation $1000000
da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
allocation, leftover = da.greedy_portfolio()
max_sharpe_cla = cla.CLA(mu, sigma)
max_sharpe_cla.max_sharpe()
sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
ret_data1 = self.asset.pct_change()[1:]
weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
self.sharpe_portfolio_wt = sharpe_portfolio_wt
self.portfolio_ret1 = portfolio_ret1
self.allocation = allocation
self.max_sharpe_cla = max_sharpe_cla
self.ret_data1 = ret_data1
self.leftover = leftover
def SP_weight(self):
print(self.sharpe_portfolio_wt)
print(self.allocation)
print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
return P_weight
def Efficient_frontier(self):
EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
return EF_plot
def cumulative_return(self):
Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'green')
print(self.ret_data1['cumulative_portfolio_ret'][-1])
return Cum_ret
def plot_hist(self):
hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
print(st.norm.interval(alpha=0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
return hist_ret
def port_stat(self):
geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
port_sharpe_ratio = geometric_port_return / annual_std
print("Annual Return", geometric_port_return)
print("Stdev", annual_std)
print("Sharpe Ratio", port_sharpe_ratio)
def CI_test(self):
bs_replicates = np.empty(10000)
for i in range(10000):
bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval =', conf_int, '%')
# Theoretical confidence intervals
conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval =', conf_int_actual, '%')
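# Note on EF.port_stat above (descriptive comment): the annualized statistics assume
# 252 trading days:
#   geometric annual return = (prod(1 + r_t)) ** (252 / T) - 1
#   annualized stdev        = std(r_t) * sqrt(252)
#   Sharpe ratio            = annual return / annualized stdev (risk-free rate omitted)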
# compute efficient frontier & portfolio statistics with benchmarks
class EF_benchmark5:
def __init__(self, asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5, benchmark_name = None):
self.asset = asset
self.asset_list = asset_list
self.benchmark1 = benchmark1
self.benchmark2 = benchmark2
self.benchmark3 = benchmark3
self.benchmark4 = benchmark4
self.benchmark5 = benchmark5
self.benchmark_name = benchmark_name
self.computation(asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5)
def computation(self, asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5):
# Check NaN values in the data
nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
# Calculate portfolio mean return
mu = expected_returns.mean_historical_return(self.asset)
# Calculate portfolio return variance
sigma = risk_models.sample_cov(self.asset)
# weight_bounds=(0,1) restricts the portfolio to long-only; negative lower bounds would allow shorting
ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
ef.add_objective(objective_functions.L2_reg, gamma=2)
# optional constraints possible, read pypfopt documentation.
sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
sharpe_portfolio_wt=ef.clean_weights()
latest_prices = get_latest_prices(self.asset)
# Allocate Portfolio Value in $ as required to show number of shares/stocks to buy,
# also bounds for shorting will affect allocation
# Maximum Sharpe Portfolio Allocation $1000000
da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
allocation, leftover = da.greedy_portfolio()
max_sharpe_cla = cla.CLA(mu, sigma)
max_sharpe_cla.max_sharpe()
sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
ret_data1 = self.asset.pct_change()[1:]
weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
ret_benchmark1 = self.benchmark1.pct_change()[1:]
ret_benchmark1['cumulative_benchmark_ret'] = (ret_benchmark1 + 1).cumprod()
ret_benchmark2 = self.benchmark2.pct_change()[1:]
ret_benchmark2['cumulative_benchmark_ret'] = (ret_benchmark2 + 1).cumprod()
ret_benchmark3 = self.benchmark3.pct_change()[1:]
ret_benchmark3['cumulative_benchmark_ret'] = (ret_benchmark3 + 1).cumprod()
ret_benchmark4 = self.benchmark4.pct_change()[1:]
ret_benchmark4['cumulative_benchmark_ret'] = (ret_benchmark4 + 1).cumprod()
ret_benchmark5 = self.benchmark5.pct_change()[1:]
ret_benchmark5['cumulative_benchmark_ret'] = (ret_benchmark5 + 1).cumprod()
self.sharpe_portfolio_wt = sharpe_portfolio_wt
self.portfolio_ret1 = portfolio_ret1
self.allocation = allocation
self.max_sharpe_cla = max_sharpe_cla
self.ret_data1 = ret_data1
self.ret_benchmark1 = ret_benchmark1
self.ret_benchmark2 = ret_benchmark2
self.ret_benchmark3 = ret_benchmark3
self.ret_benchmark4 = ret_benchmark4
self.ret_benchmark5 = ret_benchmark5
self.leftover = leftover
def SP_weight(self):
print(self.sharpe_portfolio_wt)
print(self.allocation)
print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
return P_weight
def Efficient_frontier(self):
EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
return EF_plot
def cumulative_return(self):
Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'yellowgreen', size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark1, color = 'olivedrab',size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark2, color = 'darkslategray',size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark3, color = 'darkseagreen',size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark4, color = 'darkgreen',size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark5, color = 'teal',size = 1)
if self.benchmark_name != None:
plt.legend(labels = ["ESG", *self.benchmark_name])
else:
plt.legend(labels = ["ESG", "Benchmark"])
print(self.ret_data1['cumulative_portfolio_ret'][-1])
return Cum_ret
def plot_hist(self):
hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
print(st.norm.interval(alpha=0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
return hist_ret
def port_stat(self):
geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
port_sharpe_ratio = geometric_port_return / annual_std
print("Annual Return", geometric_port_return)
print("Stdev", annual_std)
print("Sharpe Ratio", port_sharpe_ratio)
def hypothesis_testing(self):
#import scipy.stats as stats
plt1 = self.ret_data1['portfolio_ret'].hist(bins=100, color='r', alpha=0.5, figsize = (16,8))
plt1 = self.benchmark2.pct_change()[1:].hist(bins=100, color='olivedrab', alpha=0.5)
print("Difference in mean return (" + self.benchmark_name[1] + ", ESG): ")
print((self.ret_data1['portfolio_ret'].mean() - self.benchmark2.pct_change()[1:].mean())*100)
stat, p = st.ttest_ind(self.ret_data1['portfolio_ret'], self.benchmark2.pct_change()[1:], equal_var=False)#, alternative='greater')
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
print('The difference in mean return is significantly different (reject null hypothesis)')
else:
print('The difference in mean return is not significantly different (failed to reject null hypothesis)')
def CI_test(self):
bs_replicates = np.empty(10000)
for i in range(10000):
bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval =', conf_int, '%')
# Theoretical confidence intervals
conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval =', conf_int_actual, '%')
def CI_test_benchmark(self):
bs_replicates = np.empty(10000)
for i in range(10000):
bs_replicates[i] = np.mean(np.random.choice(self.benchmark2.pct_change()[1:], len(self.benchmark2.pct_change()[1:])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval =', conf_int, '%')
# Theoretical confidence intervals
conf_int_actual_upper = self.benchmark2.pct_change()[1:].mean() + self.benchmark2.pct_change()[1:].std()/np.sqrt(self.benchmark2.pct_change()[1:].count())*1.96
conf_int_actual_lower = self.benchmark2.pct_change()[1:].mean() - self.benchmark2.pct_change()[1:].std()/np.sqrt(self.benchmark2.pct_change()[1:].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval =', conf_int_actual, '%')
# compute efficient frontier & portfolio statistics with benchmarks
class EF_benchmark3:
def __init__(self, asset, asset_list,benchmark1,benchmark2,benchmark3, benchmark_name = None):
self.asset = asset
self.asset_list = asset_list
self.benchmark1 = benchmark1
self.benchmark2 = benchmark2
self.benchmark3 = benchmark3
self.benchmark_name = benchmark_name
self.computation(asset, asset_list,benchmark1,benchmark2,benchmark3)
def computation(self, asset, asset_list,benchmark1,benchmark2,benchmark3):
# Check NaN values in the data
nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
# Calculate portfolio mean return
mu = expected_returns.mean_historical_return(self.asset)
# Calculate portfolio return variance
sigma = risk_models.sample_cov(self.asset)
# weight_bounds=(0,1) restricts the portfolio to long-only; negative lower bounds would allow shorting
ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
ef.add_objective(objective_functions.L2_reg, gamma=2)
# optional constraints possible, read pypfopt documentation.
sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
sharpe_portfolio_wt=ef.clean_weights()
latest_prices = get_latest_prices(self.asset)
# Allocate Portfolio Value in $ as required to show number of shares/stocks to buy,
# also bounds for shorting will affect allocation
# Maximum Sharpe Portfolio Allocation $1000000
da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
allocation, leftover = da.greedy_portfolio()
max_sharpe_cla = cla.CLA(mu, sigma)
max_sharpe_cla.max_sharpe()
sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
ret_data1 = self.asset.pct_change()[1:]
weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
ret_benchmark1 = self.benchmark1.pct_change()[1:]
ret_benchmark1['cumulative_benchmark_ret'] = (ret_benchmark1 + 1).cumprod()
ret_benchmark2 = self.benchmark2.pct_change()[1:]
ret_benchmark2['cumulative_benchmark_ret'] = (ret_benchmark2 + 1).cumprod()
ret_benchmark3 = self.benchmark3.pct_change()[1:]
ret_benchmark3['cumulative_benchmark_ret'] = (ret_benchmark3 + 1).cumprod()
self.sharpe_portfolio_wt = sharpe_portfolio_wt
self.portfolio_ret1 = portfolio_ret1
self.allocation = allocation
self.max_sharpe_cla = max_sharpe_cla
self.ret_data1 = ret_data1
self.ret_benchmark1 = ret_benchmark1
self.ret_benchmark2 = ret_benchmark2
self.ret_benchmark3 = ret_benchmark3
self.leftover = leftover
def SP_weight(self):
print(self.sharpe_portfolio_wt)
print(self.allocation)
print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
return P_weight
def Efficient_frontier(self):
EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
return EF_plot
def cumulative_return(self):
Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'yellowgreen', size = 1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark1, color = 'teal',size =1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark2, color = 'olivedrab',size =1)
Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark3, color = 'darkgreen',size =1)
if self.benchmark_name != None:
plt.legend(labels = ["ESG", *self.benchmark_name])
else:
plt.legend(labels = ["ESG", "Benchmark"])
print(self.ret_data1['cumulative_portfolio_ret'][-1])
return Cum_ret
def plot_hist(self):
hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
print(st.norm.interval(alpha=0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
return hist_ret
def port_stat(self):
geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
port_sharpe_ratio = geometric_port_return / annual_std
print("Annual Return", geometric_port_return)
print("Stdev", annual_std)
print("Sharpe Ratio", port_sharpe_ratio)
def hypothesis_testing(self):
#import scipy.stats as stats
plt1 = self.ret_data1['portfolio_ret'].hist(bins=100, color='r', alpha=0.5, figsize = (16,8))
plt1 = self.benchmark1.pct_change()[1:].hist(bins=100, color='olivedrab', alpha=0.5)
print("Difference in mean return (" + self.benchmark_name[0] + ", ESG): ")
print((self.ret_data1['portfolio_ret'].mean() - self.benchmark1.pct_change()[1:].mean())*100)
stat, p = st.ttest_ind(self.ret_data1['portfolio_ret'], self.benchmark1.pct_change()[1:], equal_var=False)#, alternative='greater')
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
print('The difference in mean return is significantly different (reject null hypothesis)')
else:
print('The difference in mean return is not significantly different (failed to reject null hypothesis)')
def CI_test(self):
bs_replicates = np.empty(10000)
for i in range(10000):
bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval =', conf_int, '%')
# Theoretical confidence intervals
conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval =', conf_int_actual, '%')
def CI_test_benchmark(self):
bs_replicates = np.empty(10000)
for i in range(10000):
bs_replicates[i] = np.mean(np.random.choice(self.benchmark1.pct_change()[1:], len(self.benchmark1.pct_change()[1:])))
# Bootstrapped confidence intervals
conf_int = np.percentile(bs_replicates,[2.5, 97.5])
print('95% bootstrapped confidence interval =', conf_int, '%')
# Theoretical confidence intervals
conf_int_actual_upper = self.benchmark1.pct_change()[1:].mean() + self.benchmark1.pct_change()[1:].std()/np.sqrt(self.benchmark1.pct_change()[1:].count())*1.96
conf_int_actual_lower = self.benchmark1.pct_change()[1:].mean() - self.benchmark1.pct_change()[1:].std()/np.sqrt(self.benchmark1.pct_change()[1:].count())*1.96
conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
print('-'*120)
print('95% theoretical confidence interval =', conf_int_actual, '%')
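# Refactoring sketch (assumption: helper not used below): EF, EF_benchmark3 and
# EF_benchmark5 repeat the same per-benchmark cumulative-return logic, which could
# be expressed once, e.g.:
def benchmark_cumulative_returns(benchmarks):
    # benchmarks: iterable of benchmark price Series; returns one DataFrame per benchmark
    out = []
    for bm in benchmarks:
        ret = pd.DataFrame({'benchmark_ret': pd.Series(bm.squeeze()).pct_change()[1:]})
        ret['cumulative_benchmark_ret'] = (ret['benchmark_ret'] + 1).cumprod()
        out.append(ret)
    return out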
# Rank the companies within each industry into 3 ESG segments (terciles)
def gmb(df, subset = "Energy"):
test = df[df["Industry"] == subset]
test = test['ESG_C_S']
test = test.groupby(test.index).mean()
category = pd.qcut(test, 3,labels=["bad", "medium", "good"])
category = pd.DataFrame(category)
bad = list(category[category['ESG_C_S'] == "bad"].index)
medium = list(category[category['ESG_C_S'] == "medium"].index)
good = list(category[category['ESG_C_S'] == "good"].index)
return bad, medium, good
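# Worked example (sketch, hypothetical values): pd.qcut splits the per-company mean ESG
# scores into equal-sized terciles, e.g. six scores 1..6 map to bad/medium/good pairs.
print(pd.qcut(pd.Series([1, 2, 3, 4, 5, 6]), 3, labels=["bad", "medium", "good"]).tolist())
# -> ['bad', 'bad', 'medium', 'medium', 'good', 'good']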
E_bad, E_medium, E_good = gmb(df_I,"Energy")
M1_bad, M1_medium, M1_good = gmb(df_I,"Manufacturing")
M2_bad, M2_medium, M2_good = gmb(df_I,"Medical")
L_bad, L_medium, L_good = gmb(df_I,"Logistic")
bad = []
bad += E_bad
bad += M1_bad
bad += M2_bad
bad += L_bad
medium = []
medium += E_medium
medium += M1_medium
medium += M2_medium
medium += L_medium
good = []
good += E_good
good += M1_good
good += M2_good
good += L_good
sns.violinplot(x="Industry", y="ESG_C_S", data=df_I)# hue = 'Industry',
g_ef = EF(df_O[good], good)
b_ef = EF(df_O[bad], bad)
m_ef = EF(df_O[medium], medium)
g_ef.cumulative_return()
# good ESG stocks in each industry
E_good,M1_good,M2_good,L_good
!pip install yahoo_fin
!pip install requests_html
# Extract company data
import yahoo_fin.stock_info as si
test = si.get_stats_valuation('AEP')
test['Unnamed: 0']
items = ['Market Cap (intraday) 5', 'Enterprise Value 3', 'Trailing P/E', 'Forward P/E 1', 'PEG Ratio (5 yr expected) 1', 'Price/Sales (ttm)', 'Price/Book (mrq)', 'Enterprise Value/Revenue 3', 'Enterprise Value/EBITDA 7']
def stats1(asset_list, item):
dict_temp = {}
for i in asset_list:
#print(i)
temp = si.get_stats_valuation(i)
temp = temp.iloc[:,:2]
temp.columns = ["Attribute", "Recent"]
dict_temp[i] = temp[(temp["Attribute"] == item)]
combined_stats = pd.concat(dict_temp)
combined_stats = combined_stats.reset_index()
if item == 'Market Cap (intraday) 5' or item == 'Enterprise Value 3':
combined_stats.Recent = combined_stats.Recent.apply(lambda x: float(x[:-1]) if 'B' in x else x)
else:
combined_stats.Recent = combined_stats.Recent.apply(lambda x: float(x))
combined_stats = combined_stats[["level_0", "Recent"]]
combined_stats.columns = ['Firm',item]
combined_stats = combined_stats.sort_values(by=[item],ascending=False)
return combined_stats
def stats2(asset_list, *item):
output = pd.DataFrame(columns=['Firm'])
for i in item:
#print(i)
temp = stats1(asset_list, i)
output = output.merge(temp, how = 'outer', on = "Firm")
return output
# Select stocks based on their financial ratios
output_E = stats2(E_good, items[0],items[3],items[5])
output_M1 = stats2(M1_good, items[0],items[3],items[5])
output_M2 = stats2(M2_good, items[0],items[3],items[5])
output_L = stats2(L_good, items[0],items[3],items[5])
output_E
output_M1
output_M2
output_L
# Select stocks based on the above criteria
Selected_Stock = ["NEE", "LIN", "ENLAY", "ES", "SRE",'UNH', 'TMO', 'WST','RMD','LLY', "TSM", "TXN", "ECL", "AMAT", "KMB", "UNP", "FDX", "ASR", "EIX", "ZNH"]
def alpha_beta_calculation(asset, assetlist, benchmark, start_date = "2014-12-31", end_date = "2019-12-31"):
nullin_df = pd.DataFrame(asset.loc[start_date:end_date],columns=assetlist)
# Calculate portfolio mean return
mu = expected_returns.mean_historical_return(asset.loc[start_date:end_date])
# Calculate portfolio return variance
sigma = risk_models.sample_cov(asset.loc[start_date:end_date])
# weight_bounds=(0,1) restricts the portfolio to long-only; negative lower bounds would allow shorting
ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
ef.add_objective(objective_functions.L2_reg, gamma=2)
# optional constraints possible, read pypfopt documentation.
sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
sharpe_portfolio_wt=ef.clean_weights()
latest_prices = get_latest_prices(asset.loc[start_date:end_date])
# Allocate Portfolio Value in $ as required to show number of shares/stocks to buy,
# also bounds for shorting will affect allocation
# Maximum Sharpe Portfolio Allocation $1000000
da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
allocation, leftover = da.greedy_portfolio()
max_sharpe_cla = cla.CLA(mu, sigma)
max_sharpe_cla.max_sharpe()
sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
ret_data1 = asset.loc[start_date:end_date].pct_change()[1:]
weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
portfolio_ret1 =weighted_returns1.sum(axis=1)
benchmark1 = benchmark.loc[start_date:end_date].pct_change()[1:]
(Beta, Alpha) = st.linregress(benchmark1.values, portfolio_ret1.values)[0:2]
year = int(end_date[:4]) - int(start_date[:4])
print(str(year) + "-year " + f"Beta: {Beta:.6f}", str(year) + "-year " +f"Alpha: {Alpha:.6f}")
#print(str(year) + "-year " + f"Beta: {res[0]:.6f}", str(year) + "-year " +f"Alpha: {res[1]:.6f}")
# Idiosyncratic risk works in favor of the stock (shown as the residual / Alpha)
# --> good ESG & attractively valued stocks
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY)
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY,"2018-12-31","2019-12-31")
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY,"2016-12-31","2019-12-31")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2016-09-30","2021-09-30")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2020-09-30","2021-09-30")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2018-09-30","2021-09-30")
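# The fits above are CAPM-style regressions r_portfolio,t = Alpha + Beta * r_benchmark,t + e_t,
# so Alpha is the average daily return not explained by the benchmark.
# Cross-check sketch (assumption: an equal-weighted portfolio is used here purely for
# illustration; the max-Sharpe weights inside alpha_beta_calculation are not re-derived):
ret_sel = df_O[Selected_Stock].pct_change().mean(axis=1)
ret_spy = benchmark_SPY.pct_change()
aligned = pd.concat([ret_sel, ret_spy], axis=1, join='inner').dropna()
ols_capm = sm.OLS(aligned.iloc[:, 0], sm.add_constant(aligned.iloc[:, 1])).fit()
print("Equal-weight Alpha, Beta:", ols_capm.params.values)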
selected_EF_3 = EF_benchmark3(df_O[Selected_Stock], Selected_Stock,benchmark_SPY, benchmark_IXIC,benchmark_DJI, _3benchmark)
M_EF = selected_EF_3.cumulative_return()
selected_EF_3.Efficient_frontier()
selected_EF_3.SP_weight()
g_ef.Efficient_frontier()
selected_EF_3.port_stat()
selected_EF_3.CI_test()
selected_EF_3.CI_test_benchmark()
selected_EF_5 = EF_benchmark5(df_O[Selected_Stock], Selected_Stock,benchmark_ESG3, benchmark_ESG4,benchmark_ESG5,benchmark_ESG6,benchmark_ESG7, _5benchmark)
S_EF = selected_EF_5.cumulative_return()
selected_EF_3.hypothesis_testing()
selected_EF_5.hypothesis_testing()
selected_EF_5.port_stat()
a = {'NEE': 0.0652, 'LIN': 0.03926, 'ENLAY': 0.05335, 'ES': 0.04204, 'SRE': 0.03119, 'UNH': 0.07758, 'TMO': 0.06344, 'WST': 0.07308, 'RMD': 0.07704, 'LLY': 0.05329, 'TSM': 0.07588, 'TXN': 0.06643, 'ECL': 0.04255, 'AMAT': 0.06896, 'KMB': 0.02258, 'UNP': 0.0358, 'FDX': 0.0, 'ASR': 0.03637, 'EIX': 0.02486, 'ZNH': 0.05111}
a = pd.DataFrame.from_dict(a,orient='index')
a = a.rename(columns={0: 'Weight'})
a = a.sort_values('Weight', ascending = False)
a.head(10)
b = df_I.loc[Selected_Stock]
test = b[['ESG_C_S','Env_S','Soc_S','Gov_S','ESG_Controversial','Industry']]
test1 = round(test.groupby('Industry').mean(),2)
test1
test1_2 = test1.mean()
test1_2 # portfolio
c = df_I
test = c[['ESG_C_S','Env_S','Soc_S','Gov_S','ESG_Controversial','Industry']]
test2 = round(test.groupby('Industry').mean(),2)
test2
test2_2 = test.mean()
test2_2 # overall (98) companies
features = ['Combined \n Score','Environment','Social','Governance','Controversial','Combined \n Score'] # last label repeats the first to close the radar chart
def plot_rader(value1, value2, features, value1_name = 'Portfolio',value2_name = 'Overall', rader_size = 10):
# Use the ggplot plotting style
plt.style.use('ggplot')
# Build the data to plot
values = value1.values.tolist()
values2 = value2.values.tolist()
feature = features
N = len(values)
# Angles for the radar chart, dividing the circle into equal sectors
angles=np.linspace(0, 2*np.pi, N, endpoint=False)
# Repeat the first value/angle so the radar polygon closes
values=np.concatenate((values,[values[0]]))
values2=np.concatenate((values2,[values2[0]]))
angles=np.concatenate((angles,[angles[0]]))
# Plot
fig=plt.figure()
ax = fig.add_subplot(111, polar=True)
# Draw the first series
ax.plot(angles, values, 'o-', linewidth=2, label = value1_name, color = 'limegreen')
# Fill the enclosed area
ax.fill(angles, values, alpha=0.25,color= 'limegreen')
# Draw the second series
ax.plot(angles, values2, 'o-', linewidth=2, label = value2_name, color = 'darkgreen')
ax.fill(angles, values2, alpha=0.25, color = 'darkgreen')
# Label each feature
ax.set_thetagrids(angles * 180/np.pi, feature)
# Set the radial range of the radar chart
ax.set_ylim(0,rader_size)
# Add a title
plt.title('ESG Radar')
# Add grid lines
ax.grid(True)
# Add the legend
plt.legend(loc = 2, prop={'size': 7.4})
# Show the figure
plt.show()
plot_rader(test1_2, test2_2, features, 'Portfolio', 'Overall',10)
plot_rader(test1.loc['Energy'], test2.loc['Energy'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Medical'], test2.loc['Medical'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Manufacturing'], test2.loc['Manufacturing'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Logistic'], test2.loc['Logistic'], features, 'Portfolio', 'Overall',12)
# Portfolio weights aggregated by industry
I = pd.DataFrame(df_I.loc[Selected_Stock][['Year','Industry']])
I_1 = I[I['Year']=='2014-12-31 00:00:00']
I_2 = a.merge(I_1["Industry"], how = 'inner', left_on = a.index, right_on = I_1.index)
I_2 = I_2.groupby("Industry").sum()
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = I_2.index
sizes = round(I_2['Weight'],2)
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=False, startangle=90, colors = ['darkseagreen','olivedrab','green', 'mediumseagreen'])
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Portfolio Weighting by Industry")
plt.show()