!pip install yfinance
!pip install openpyxl
!pip install statsmodels==0.13.0
!pip install lazypredict
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm       
from statsmodels.formula.api import ols, logit
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
# set matplotlib.pyplot to default style
plt.style.use('default')
# Randomly selected companies from 4 industries
Manufacturing = ["TSM", "TXN", "AMAT", "IRBT", "LRCX", "PATK", "ACCO", "AIN", "EL", "ECL", "ZBRA", "RRX", "TSLA", "DOV", "KMB", "JBL", "FLEX", "ENTG", "ACH", "HELE", "MHK", "CAJ", "JJSF", "LULU", "CHD"] #25 companies
Medical = ["EW", "WST", "IDXX", "RMD","VRTX","AMGN", "ALGN", "PFE", "ZTS", "DHR", "TMO", "CERN", "UNH", "COO", "CNC", "QGEN", "CAH", "HCA", "MDT", "MRK", "JNJ", "REGN", "TFX", "LLY", "DVA", "ABC"] #26 companies
Logistic_transportation = ["KSU", "CHRW", "EXPD", "UNP", "FDX", "TIAIY", "ORAN", "EIX", "ASR", "DAL", "KNX", "SKYW", "HUBG", "HTLD", "JBHT", "OKE", "KEX", "NCLH", "ODFL", "PAC", "ZNH", "UAL", "TDW"] # 23 companies; excl. "TIGO", "OSG"
Energy = ['NEE','ORA','ES','SRE','BWXT', "AEP", "DUK", "ATO", "PAGP", "DVN", "VLO", "PDCE", "HLX", "RRC", "KOS", "FRO", "TRGP", "LIN", "HE", "ENLAY", "CPE", "EC", "WLL", "CRK"] # 24 companies in use; excl. 'PLUG', 'ENPH', 'BTU'
# Stock return
start=dt.datetime(2014,12,31)
end=dt.datetime(2021,9,30) 
overall = []
overall.extend(Manufacturing)
overall.extend(Medical)
overall.extend(Logistic_transportation)
overall.extend(Energy)
data_O=yf.download(overall,start=start,end=end)
df_O_all=data_O['Adj Close']
df_O_all=df_O_all.ffill()
# We use five years of data (2015-2019); returns for 2020 are excluded as they are too volatile due to the pandemic
df_O = df_O_all[:dt.datetime(2019,12,31)]
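# Quick sanity check (an optional addition, not in the original workflow): report any
# tickers whose price history still contains missing values after forward-filling,
# e.g. stocks that were not yet listed at the start of the window.
missing_counts = df_O.isna().sum()
print(missing_counts[missing_counts > 0])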
# Benchmark data (market indices & ESG ETFs)
benchmark_df = yf.download(['SPY'], start=start, end=end)
benchmark_SPY_all = benchmark_df['Adj Close']
benchmark_SPY = benchmark_SPY_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['^IXIC'], start=start, end=end)
benchmark_IXIC_all = benchmark_df['Adj Close']
benchmark_IXIC = benchmark_IXIC_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['^DJI'], start=start, end=end)
benchmark_DJI_all = benchmark_df['Adj Close']
benchmark_DJI = benchmark_DJI_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGD'], start=start, end=end)
benchmark_ESGD_all = benchmark_df['Adj Close']
benchmark_ESG = benchmark_ESGD_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGV'], start=start, end=end)
benchmark_ESGV_all = benchmark_df['Adj Close']
benchmark_ESG2 = benchmark_ESGV_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ICLN'], start=start, end=end)
benchmark_ICLN_all = benchmark_df['Adj Close']
benchmark_ESG3 = benchmark_ICLN_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['SUSA'], start=start, end=end)
benchmark_SUSA_all = benchmark_df['Adj Close']
benchmark_ESG4 = benchmark_SUSA_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ESGHX'], start=start, end=end)
benchmark_ESGHX_all = benchmark_df['Adj Close']
benchmark_ESG5 = benchmark_ESGHX_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['NEXTX'], start=start, end=end)
benchmark_NEXTX_all = benchmark_df['Adj Close']
benchmark_ESG6 = benchmark_NEXTX_all[:dt.datetime(2019,12,31)]
benchmark_df = yf.download(['ATEYX'], start=start, end=end)
benchmark_ATEYX_all = benchmark_df['Adj Close']
benchmark_ESG7 = benchmark_ATEYX_all[:dt.datetime(2019,12,31)]
# ESG Benchmark
_5benchmark =['ICLN','SUSA','ESGHX','NEXTX','ATEYX']
# market index benchmarks
_3benchmark = ['SPY', 'IXIC','DJI']
!pip install xlrd
# Read and clean the Fama-French 3-factor data
path_1 = "FF3.CSV"
FF3=pd.read_csv(path_1, skiprows=4)
#FF3 = FF3.sort_index(ascending = False)
#FF3 = FF3.reset_index()
FF3 = FF3.rename(columns={'Unnamed: 0': 'Year'})
FF3 = FF3[:-1]
FF3["Year"] = pd.to_datetime(FF3['Year'],format='%Y%m%d')
FF3 = FF3[FF3['Year'] >= dt.datetime(2014,12,31)]
df = pd.DataFrame()
for i in overall:
    path = "Archive/ESG Table for " + i + ".xlsx"
    temp_n=pd.read_excel(path,engine='openpyxl')
    temp_n = temp_n[1:14]
    temp_n = temp_n.drop(temp_n.index[1:7])
    # transpose so that dates become rows
    temp_n = temp_n.T
    temp_n.columns = temp_n.iloc[0]
    temp_n = temp_n.drop(temp_n.index[0])
    # set the first column as the index
    temp_n.set_index(temp_n.columns[0],inplace=True)
    temp_n = temp_n[1:6]
    temp_n = temp_n.reset_index()
    # Rename column names
    temp_n = temp_n.rename(columns={np.nan: 'Year'})
    temp_n.columns = ['Year','ESG_C_S', 'ESG_S', 'Env_S', 'Soc_S', 'Gov_S', 'ESG_Controversial']
    # Map letter grades (D- to A+) to integer scores (0-11)
    temp_n = temp_n.replace(["D-", "D","D+","C-", "C","C+","B-", "B","B+","A-", "A","A+"], [0,1,2,3,4,5,6,7,8,9,10,11])
    temp_n['Year'] = pd.to_datetime(temp_n['Year'])
    temp_n = temp_n.set_index("Year")
    new_date1 = pd.DataFrame(temp_n[-1:].values, index=[pd.to_datetime('2014-12-31')], columns=temp_n.columns)
    new_date2 = pd.DataFrame(temp_n[-1:].values, index=[pd.to_datetime('2019-12-31')], columns=temp_n.columns)
    # append the boundary dates (DataFrame.append is deprecated, so use pd.concat)
    temp_n = pd.concat([temp_n, new_date1, new_date2])
    temp_n = temp_n.sort_index()
    temp_n = temp_n.resample('D').asfreq()
    temp_n = temp_n.fillna(method = 'ffill')
    # adjust column type
    #temp_n["Year"] = pd.to_numeric(temp_n["Year"])
    temp_n = temp_n.apply(pd.to_numeric)
    # Merge the ESG data with the Fama-French factors
    temp_n = temp_n.merge(FF3, how = "inner", left_on = temp_n.index, right_on = FF3['Year'])
    temp_n = temp_n.set_index("Year")
    temp_n = temp_n.drop(columns = 'key_0')
    # Compute Daily Return
    temp_return = df_O[i].ffill().pct_change()
    temp_return = temp_return.to_frame()
    temp_return.columns = [i]
    temp_return = temp_return[1:]
    temp_return = temp_return.round(3)
    temp_return = temp_return.reset_index()
    temp_return = temp_return.rename(columns={'Date': 'Year', i: "Ret"})
    temp_return['Year'] = pd.DatetimeIndex(temp_return['Year'])
    temp_n = temp_n.merge(temp_return, how = "outer", on = "Year")
    #Set the company name as index
    temp_n["Company"] = i
    if i in Manufacturing:
        temp_n['Industry'] = 'Manufacturing'
    elif i in Medical:
        temp_n['Industry'] = 'Medical'
    elif i in Logistic_transportation:
        temp_n['Industry'] = 'Logistic'
    else:
        temp_n['Industry'] = 'Energy'
    temp_n = temp_n.set_index("Company")
    df = pd.concat([df,temp_n], axis = 0)
df_I = df
# get dummy variable for Industry
df = pd.get_dummies(df, prefix=['I'], columns=['Industry'],drop_first=True)
df = df.dropna()
df
df.isna().sum()
# Regression plot to check for a trend between ESG combined score and daily return
plt.figure(figsize = (16,8))
sns.regplot(x = 'ESG_C_S', y = "Ret", data = df, scatter_kws={"color": "black"}, line_kws={"color": "red"})
plt.ylabel("return")
plt.xlabel("ESG index")
def CI_test(df, i):
    bs_replicates = np.empty(10000)
    for j in range(10000):
        bs_replicates[j] = np.mean(np.random.choice(df[i], len(df[i])))
    # Bootstrapped confidence intervals
    conf_int = np.percentile(bs_replicates,[2.5, 97.5])
    print('95% bootstrapped confidence interval for', i, '=', conf_int)
    # Theoretical confidence intervals
    conf_int_actual_upper = df[i].mean() + df[i].std()/np.sqrt(df[i].count())*1.96
    conf_int_actual_lower = df[i].mean() - df[i].std()/np.sqrt(df[i].count())*1.96
    conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
    print('-'*120)
    print('95% theoretical confidence interval for', i, '=', conf_int_actual)
CI_test(df, 'Ret')
# drop Year, as including it introduces multicollinearity
df = df.drop(columns = 'Year')
df.columns
# Correlation heatmap --> identify potential correlated factors
corr = df.corr()
plt.figure(figsize = (16,8))
sns.heatmap(corr,annot=True)
plt.show()
# dependent/target/outcome variable
y = df['Ret']
# independent/predictor/explanatory variable
X = df[['ESG_C_S','ESG_S','Env_S', 'Soc_S','Gov_S','ESG_Controversial', 'Mkt-RF','SMB', 'RF' ,'HML','I_Logistic', 'I_Manufacturing', 'I_Medical']]
# candidate predictors for reference:
# 'ESG_C_S', 'ESG_S', 'Env_S', 'Soc_S', 'Gov_S', 'ESG_Controversial'
# 'Mkt-RF', 'SMB', 'RF', 'HML', 'I_Logistic', 'I_Manufacturing', 'I_Medical'
# OLS Regression
X = sm.add_constant(X)
ols_model = sm.OLS(y,X.astype(float))
result = ols_model.fit()
print(result.summary2())
# The condition number reported above should be below ~20; larger values indicate multicollinearity
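# Optional multicollinearity diagnostic (a small sketch added here, not part of the
# original notebook): the condition number summarises collinearity for the whole
# design matrix, while variance inflation factors (VIFs) attribute it to individual
# regressors; VIFs above ~10 are usually considered problematic.
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_float = X.astype(float)
vif = pd.Series([variance_inflation_factor(X_float.values, j) for j in range(X_float.shape[1])],
                index=X_float.columns)
print(vif.sort_values(ascending=False))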
df_O.index
df_O = df_O.drop(df_O.index[0] ,axis = 0)
df_1 = df_O.fillna(method = 'ffill')
df_1 = df_1.pct_change()
df_1 = df_1.drop(df_1.index[0] ,axis = 0)
df_1
# Stationarity test (Augmented Dickey-Fuller)
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, i):
    
    # Determining rolling statistics
    rolmean = timeseries.rolling(14).mean()
    rolstd = timeseries.rolling(14).std()
    # Plot rolling statistics:
    #orig = plt.plot(timeseries, color='blue',label='Original')
    #mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    #std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    #plt.legend(loc='best')
    #plt.title('Rolling Mean & Standard Deviation')
    #plt.show(block=False)
    
    # Perform Dickey-Fuller test:
    #print ('Results of Dickey-Fuller Test:' + i)
    timeseries = timeseries.iloc[:].values
    dftest = adfuller(timeseries, autolag='AIC')
    #dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    #for key,value in dftest[4].items():
    #    dfoutput['Critical Value (%s)'%key] = value
    #print(dfoutput)
    # flag tickers where we fail to reject the unit-root null (i.e. the return series looks non-stationary)
    if dftest[1] > 0.05:
        print(i)
for i in overall:
    test_stationarity(df_1[i], i)
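# A minimal follow-up sketch (an addition, not part of the original analysis): for any
# ticker flagged above as non-stationary, one standard remedy is to first-difference
# the series and re-run the ADF test.
def adf_pvalue_after_differencing(series):
    # drop the NaN introduced by differencing, then return the ADF p-value
    diffed = series.diff().dropna()
    return adfuller(diffed, autolag='AIC')[1]
# Example (hypothetical ticker; substitute one printed by the loop above):
# print(adf_pvalue_after_differencing(df_1['TSLA']))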
# Train/test split for LazyPredict
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
#LazyPredict on regression models
# fit all models
reg = LazyRegressor(predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models
predictions
!pip install PyPortfolioOpt==1.2.1
!pip install --upgrade scipy
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import cla
from pypfopt.plotting import Plotting
from pypfopt.discrete_allocation import DiscreteAllocation, get_latest_prices
from pypfopt import objective_functions
import scipy.stats as st
# compute efficient frontier & portfolio statistics
class EF:
    def __init__(self, asset, asset_list):
        self.asset = asset
        self.asset_list = asset_list
        self.computation(asset, asset_list)
    def computation(self, asset, asset_list):
        # Check NaN values in the data
        nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
        # Expected returns: annualised mean historical return for each asset
        mu = expected_returns.mean_historical_return(self.asset)
        # Risk model: annualised sample covariance matrix of asset returns
        sigma = risk_models.sample_cov(self.asset)
        
        # weight_bounds=(0, 1) keeps the portfolio long-only; a negative lower bound would allow shorting
        ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
        ef.add_objective(objective_functions.L2_reg, gamma=2)
        # optional constraints are possible; see the PyPortfolioOpt documentation
        sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
        sharpe_portfolio_wt=ef.clean_weights()
        latest_prices = get_latest_prices(self.asset)
        # Allocate Portfolio Value in $ as required to show number of shares/stocks to buy, 
        # also bounds for shorting will affect allocation
        # Maximum Sharpe Portfolio Allocation $1000000
        da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
        allocation, leftover = da.greedy_portfolio()
        max_sharpe_cla = cla.CLA(mu, sigma)
        max_sharpe_cla.max_sharpe()
        sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
        ret_data1 = self.asset.pct_change()[1:]
        weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
        portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
        ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
        ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
        ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
        
        self.sharpe_portfolio_wt = sharpe_portfolio_wt
        self.portfolio_ret1 = portfolio_ret1
        self.allocation = allocation
        self.max_sharpe_cla = max_sharpe_cla
        self.ret_data1 = ret_data1
        self.leftover = leftover
    def SP_weight(self):
        print(self.sharpe_portfolio_wt)
        print(self.allocation)
        print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
        P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
        return P_weight
    def Efficient_frontier(self):
        EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
        return EF_plot
    def cumulative_return(self):
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'green')
        print(self.ret_data1['cumulative_portfolio_ret'][-1])
        return Cum_ret
    
    def plot_hist(self):
        hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
        print(st.norm.interval(0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
        return hist_ret
    def port_stat(self):
        geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
        annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
        port_sharpe_ratio = geometric_port_return / annual_std
        print("Annual Return", geometric_port_return)
        print("Stdev", annual_std)
        print("Sharpe Ratio", port_sharpe_ratio)
    
    def CI_test(self):
        bs_replicates = np.empty(10000)
        for i in range(10000):
            bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
        # Bootstrapped confidence intervals
        conf_int = np.percentile(bs_replicates,[2.5, 97.5])
        print('95% bootstrapped confidence interval =', conf_int, '%')
        # Theoretical confidence intervals
        conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
        print('-'*120)
        print('95% theoretical confidence interval =', conf_int_actual, '%')
# compute efficient frontier & portfolio statistics with benchmarks
class EF_benchmark5:
    def __init__(self, asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5, benchmark_name = None):
        self.asset = asset
        self.asset_list = asset_list
        self.benchmark1 = benchmark1
        self.benchmark2 = benchmark2
        self.benchmark3 = benchmark3
        self.benchmark4 = benchmark4
        self.benchmark5 = benchmark5
        self.benchmark_name = benchmark_name
        self.computation(asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5)
    def computation(self, asset, asset_list,benchmark1,benchmark2,benchmark3,benchmark4,benchmark5):
        # Check NaN values in the data
        nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
        # Expected returns: annualised mean historical return for each asset
        mu = expected_returns.mean_historical_return(self.asset)
        # Risk model: annualised sample covariance matrix of asset returns
        sigma = risk_models.sample_cov(self.asset)
        
        # weight_bounds=(0, 1) keeps the portfolio long-only; a negative lower bound would allow shorting
        ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
        ef.add_objective(objective_functions.L2_reg, gamma=2)
        # optional constraints are possible; see the PyPortfolioOpt documentation
        sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
        sharpe_portfolio_wt=ef.clean_weights()
        latest_prices = get_latest_prices(self.asset)
        # Allocate Portfolio Value in $ as required to show number of shares/stocks to buy, 
        # also bounds for shorting will affect allocation
        # Maximum Sharpe Portfolio Allocation $1000000
        da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
        allocation, leftover = da.greedy_portfolio()
        max_sharpe_cla = cla.CLA(mu, sigma)
        max_sharpe_cla.max_sharpe()
        sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
        ret_data1 = self.asset.pct_change()[1:]
        weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
        portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
        ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
        ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
        ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
        ret_benchmark1 = self.benchmark1.pct_change()[1:]
        ret_benchmark1['cumulative_benchmark_ret'] = (ret_benchmark1 + 1).cumprod()
        ret_benchmark2 = self.benchmark2.pct_change()[1:]
        ret_benchmark2['cumulative_benchmark_ret'] = (ret_benchmark2 + 1).cumprod()
        ret_benchmark3 = self.benchmark3.pct_change()[1:]
        ret_benchmark3['cumulative_benchmark_ret'] = (ret_benchmark3 + 1).cumprod()
        ret_benchmark4 = self.benchmark4.pct_change()[1:]
        ret_benchmark4['cumulative_benchmark_ret'] = (ret_benchmark4 + 1).cumprod()
        ret_benchmark5 = self.benchmark5.pct_change()[1:]
        ret_benchmark5['cumulative_benchmark_ret'] = (ret_benchmark5 + 1).cumprod()
        
        self.sharpe_portfolio_wt = sharpe_portfolio_wt
        self.portfolio_ret1 = portfolio_ret1
        self.allocation = allocation
        self.max_sharpe_cla = max_sharpe_cla
        self.ret_data1 = ret_data1
        self.ret_benchmark1 = ret_benchmark1
        self.ret_benchmark2 = ret_benchmark2
        self.ret_benchmark3 = ret_benchmark3
        self.ret_benchmark4 = ret_benchmark4
        self.ret_benchmark5 = ret_benchmark5
        self.leftover = leftover
    def SP_weight(self):
        print(self.sharpe_portfolio_wt)
        print(self.allocation)
        print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
        P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
        return P_weight
    def Efficient_frontier(self):
        EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
        return EF_plot
    def cumulative_return(self):
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'yellowgreen', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark1, color = 'olivedrab', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark2, color = 'darkslategray', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark3, color = 'darkseagreen', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark4, color = 'darkgreen', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark5, color = 'teal', size = 1)
        if self.benchmark_name is not None:
            plt.legend(labels = ["ESG", *self.benchmark_name])
        else:
            plt.legend(labels = ["ESG", "Benchmark"])
        print(self.ret_data1['cumulative_portfolio_ret'][-1])
        return Cum_ret
    
    def plot_hist(self):
        hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
        print(st.norm.interval(0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
        return hist_ret
    def port_stat(self):
        geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
        annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
        port_sharpe_ratio = geometric_port_return / annual_std
        print("Annual Return", geometric_port_return)
        print("Stdev", annual_std)
        print("Sharpe Ratio", port_sharpe_ratio)
    
    def hypothesis_testing(self):
        #import scipy.stats as stats
        plt1 = self.ret_data1['portfolio_ret'].hist(bins=100, color='r', alpha=0.5, figsize = (16,8))
        plt1 = self.benchmark2.pct_change()[1:].hist(bins=100, color='olivedrab', alpha=0.5)
        
        print("Difference in mean return (" + self.benchmark_name[1] + ", ESG): ")
        print((self.ret_data1['portfolio_ret'].mean() - self.benchmark2.pct_change()[1:].mean())*100)
        stat, p = st.ttest_ind(self.ret_data1['portfolio_ret'], self.benchmark2.pct_change()[1:], equal_var=False)#, alternative='greater')
        alpha = 0.05
        print("p value is " + str(p))
        if p <= alpha:
            print('The difference in mean return is significantly different (reject null hypothesis)')
        else:
            print('The difference in mean return is not significantly different (failed to reject null hypothesis)')
    def CI_test(self):
        bs_replicates = np.empty(10000)
        for i in range(10000):
            bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
        # Bootstrapped confidence intervals
        conf_int = np.percentile(bs_replicates,[2.5, 97.5])
        print('95% bootstrapped confidence interval =', conf_int, '%')
        # Theoretical confidence intervals
        conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
        print('-'*120)
        print('95% theoretical confidence interval =', conf_int_actual, '%')
    
    def CI_test_benchmark(self):
        bs_replicates = np.empty(10000)
        for i in range(10000):
            bs_replicates[i] = np.mean(np.random.choice(self.benchmark2.pct_change()[1:], len(self.benchmark2.pct_change()[1:])))
        # Bootstrapped confidence intervals
        conf_int = np.percentile(bs_replicates,[2.5, 97.5])
        print('95% bootstrapped confidence interval =', conf_int, '%')
        # Theoretical confidence intervals
        conf_int_actual_upper = self.benchmark2.pct_change()[1:].mean() + self.benchmark2.pct_change()[1:].std()/np.sqrt(self.benchmark2.pct_change()[1:].count())*1.96
        conf_int_actual_lower = self.benchmark2.pct_change()[1:].mean() - self.benchmark2.pct_change()[1:].std()/np.sqrt(self.benchmark2.pct_change()[1:].count())*1.96
        conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
        print('-'*120)
        print('95% theoretical confidence interval =', conf_int_actual, '%')
# compute efficient frontier & portfolio statistics with benchmarks
class EF_benchmark3:
    def __init__(self, asset, asset_list,benchmark1,benchmark2,benchmark3, benchmark_name = None):
        self.asset = asset
        self.asset_list = asset_list
        self.benchmark1 = benchmark1
        self.benchmark2 = benchmark2
        self.benchmark3 = benchmark3
        self.benchmark_name = benchmark_name
        self.computation(asset, asset_list,benchmark1,benchmark2,benchmark3)
    def computation(self, asset, asset_list,benchmark1,benchmark2,benchmark3):
        # Check NaN values in the data
        nullin_df = pd.DataFrame(self.asset,columns=self.asset_list)
        # Expected returns: annualised mean historical return for each asset
        mu = expected_returns.mean_historical_return(self.asset)
        # Risk model: annualised sample covariance matrix of asset returns
        sigma = risk_models.sample_cov(self.asset)
        
        # weight_bounds=(0, 1) keeps the portfolio long-only; a negative lower bound would allow shorting
        ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
        ef.add_objective(objective_functions.L2_reg, gamma=2)
        # optional constraints are possible; see the PyPortfolioOpt documentation
        sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
        sharpe_portfolio_wt=ef.clean_weights()
        latest_prices = get_latest_prices(self.asset)
        # Allocate Portfolio Value in $ as required to show number of shares/stocks to buy, 
        # also bounds for shorting will affect allocation
        # Maximum Sharpe Portfolio Allocation $1000000
        da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
        allocation, leftover = da.greedy_portfolio()
        max_sharpe_cla = cla.CLA(mu, sigma)
        max_sharpe_cla.max_sharpe()
        sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
        ret_data1 = self.asset.pct_change()[1:]
        weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
        portfolio_ret1 = pd.DataFrame(weighted_returns1.sum(axis=1))
        ret_data1 = ret_data1.merge(portfolio_ret1, on="Date", how="left")
        ret_data1 = ret_data1.rename(columns={0: "portfolio_ret"})
        ret_data1['cumulative_portfolio_ret'] = (ret_data1['portfolio_ret'] + 1).cumprod()
        ret_benchmark1 = self.benchmark1.pct_change()[1:]
        ret_benchmark1['cumulative_benchmark_ret'] = (ret_benchmark1 + 1).cumprod()
        ret_benchmark2 = self.benchmark2.pct_change()[1:]
        ret_benchmark2['cumulative_benchmark_ret'] = (ret_benchmark2 + 1).cumprod()
        ret_benchmark3 = self.benchmark3.pct_change()[1:]
        ret_benchmark3['cumulative_benchmark_ret'] = (ret_benchmark3 + 1).cumprod()
        
        self.sharpe_portfolio_wt = sharpe_portfolio_wt
        self.portfolio_ret1 = portfolio_ret1
        self.allocation = allocation
        self.max_sharpe_cla = max_sharpe_cla
        self.ret_data1 = ret_data1
        self.ret_benchmark1 = ret_benchmark1
        self.ret_benchmark2 = ret_benchmark2
        self.ret_benchmark3 = ret_benchmark3
        self.leftover = leftover
    def SP_weight(self):
        print(self.sharpe_portfolio_wt)
        print(self.allocation)
        print("Leftover Fund value for the maximum Sharpe portfolio is ${:.2f}".format(self.leftover))
        P_weight = Plotting.plot_weights(self.sharpe_portfolio_wt)
        return P_weight
    def Efficient_frontier(self):
        EF_plot = Plotting.plot_efficient_frontier(self.max_sharpe_cla, show_assets="True")
        return EF_plot
    def cumulative_return(self):
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_portfolio_ret', data=self.ret_data1, color = 'yellowgreen', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark1, color = 'teal', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark2, color = 'olivedrab', size = 1)
        Cum_ret = sns.scatterplot(x='Date', y='cumulative_benchmark_ret', data=self.ret_benchmark3, color = 'darkgreen', size = 1)
        if self.benchmark_name is not None:
            plt.legend(labels = ["ESG", *self.benchmark_name])
        else:
            plt.legend(labels = ["ESG", "Benchmark"])
        print(self.ret_data1['cumulative_portfolio_ret'][-1])
        return Cum_ret
    
    def plot_hist(self):
        hist_ret = plt.hist(self.ret_data1['portfolio_ret'], bins=10)
        print(st.norm.interval(0.95, loc=np.mean(self.ret_data1['portfolio_ret']), scale=st.sem(self.ret_data1['portfolio_ret'])))
        return hist_ret
    def port_stat(self):
        geometric_port_return = np.prod(self.portfolio_ret1 + 1) ** (252/self.portfolio_ret1.shape[0]) - 1
        annual_std = np.std(self.portfolio_ret1) * np.sqrt(252)
        port_sharpe_ratio = geometric_port_return / annual_std
        print("Annual Return", geometric_port_return)
        print("Stdev", annual_std)
        print("Sharpe Ratio", port_sharpe_ratio)
    def hypothesis_testing(self):
        #import scipy.stats as stats
        plt1 = self.ret_data1['portfolio_ret'].hist(bins=100, color='r', alpha=0.5, figsize = (16,8))
        plt1 = self.benchmark1.pct_change()[1:].hist(bins=100, color='olivedrab', alpha=0.5)
        
        print("Difference in mean return (" + self.benchmark_name[0] + ", ESG): ")
        print((self.ret_data1['portfolio_ret'].mean() - self.benchmark1.pct_change()[1:].mean())*100)
        stat, p = st.ttest_ind(self.ret_data1['portfolio_ret'], self.benchmark1.pct_change()[1:], equal_var=False)#, alternative='greater')
        alpha = 0.05
        print("p value is " + str(p))
        if p <= alpha:
            print('The difference in mean return is significantly different (reject null hypothesis)')
        else:
            print('The difference in mean return is not significantly different (failed to reject null hypothesis)')
    def CI_test(self):
        bs_replicates = np.empty(10000)
        for i in range(10000):
            bs_replicates[i] = np.mean(np.random.choice(self.ret_data1['portfolio_ret'], len(self.ret_data1['portfolio_ret'])))
        # Bootstrapped confidence intervals
        conf_int = np.percentile(bs_replicates,[2.5, 97.5])
        print('95% bootstrapped confidence interval =', conf_int, '%')
        # Theoretical confidence intervals
        conf_int_actual_upper = self.ret_data1['portfolio_ret'].mean() + self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual_lower = self.ret_data1['portfolio_ret'].mean() - self.ret_data1['portfolio_ret'].std()/np.sqrt(self.ret_data1['portfolio_ret'].count())*1.96
        conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
        print('-'*120)
        print('95% theoretical confidence interval =', conf_int_actual, '%')
    
    def CI_test_benchmark(self):
        bs_replicates = np.empty(10000)
        for i in range(10000):
            bs_replicates[i] = np.mean(np.random.choice(self.benchmark1.pct_change()[1:], len(self.benchmark1.pct_change()[1:])))
        # Bootstrapped confidence intervals
        conf_int = np.percentile(bs_replicates,[2.5, 97.5])
        print('95% bootstrapped confidence interval =', conf_int, '%')
        # Theoretical confidence intervals
        conf_int_actual_upper = self.benchmark1.pct_change()[1:].mean() + self.benchmark1.pct_change()[1:].std()/np.sqrt(self.benchmark1.pct_change()[1:].count())*1.96
        conf_int_actual_lower = self.benchmark1.pct_change()[1:].mean() - self.benchmark1.pct_change()[1:].std()/np.sqrt(self.benchmark1.pct_change()[1:].count())*1.96
        conf_int_actual = [conf_int_actual_lower, conf_int_actual_upper]
        print('-'*120)
        print('95% theoretical confidence interval =', conf_int_actual, '%')
# Rank companies into three ESG segments (bad / medium / good) within each industry
def gmb(df, subset = "Energy"):
    test = df[df["Industry"] == subset]
    test = test['ESG_C_S']
    test = test.groupby(test.index).mean()
    category = pd.qcut(test, 3,labels=["bad", "medium", "good"])
    category = pd.DataFrame(category)
    bad = list(category[category['ESG_C_S'] == "bad"].index) 
    medium = list(category[category['ESG_C_S'] == "medium"].index) 
    good = list(category[category['ESG_C_S'] == "good"].index) 
    return bad, medium, good
E_bad, E_medium, E_good = gmb(df_I,"Energy")
M1_bad, M1_medium, M1_good = gmb(df_I,"Manufacturing")
M2_bad, M2_medium, M2_good = gmb(df_I,"Medical")
L_bad, L_medium, L_good = gmb(df_I,"Logistic")
bad = []
bad += E_bad
bad += M1_bad
bad += M2_bad
bad += L_bad
medium = []
medium += E_medium
medium += M1_medium
medium += M2_medium
medium += L_medium
good = []
good += E_good
good += M1_good
good += M2_good
good += L_good
sns.violinplot(x="Industry", y="ESG_C_S", data=df_I)# hue = 'Industry',
g_ef = EF(df_O[good], good)
b_ef = EF(df_O[bad], bad)
m_ef = EF(df_O[medium], medium)
g_ef.cumulative_return()
# good ESG stocks in each industry
E_good,M1_good,M2_good,L_good
!pip install yahoo_fin
!pip install requests_html
# Extract company data
import yahoo_fin.stock_info as si
test = si.get_stats_valuation('AEP')
test['Unnamed: 0']
items = ['Market Cap (intraday) 5', 'Enterprise Value 3', 'Trailing P/E', 'Forward P/E 1', 'PEG Ratio (5 yr expected) 1', 'Price/Sales (ttm)', 'Price/Book (mrq)', 'Enterprise Value/Revenue 3', 'Enterprise Value/EBITDA 7']
     
def stats1(asset_list, item):
    dict_temp = {}
    for i in asset_list:
        #print(i)
        temp = si.get_stats_valuation(i)
        temp = temp.iloc[:,:2]
        temp.columns = ["Attribute", "Recent"]
        dict_temp[i] = temp[(temp["Attribute"] == item)]
    combined_stats = pd.concat(dict_temp)
    combined_stats = combined_stats.reset_index()
    if item == 'Market Cap (intraday) 5' or item == 'Enterprise Value 3':
        combined_stats.Recent = combined_stats.Recent.apply(lambda x: float(x[:-1]) if 'B' in x else x)
    else:
        combined_stats.Recent = combined_stats.Recent.apply(lambda x: float(x))
    combined_stats = combined_stats[["level_0", "Recent"]]
    combined_stats.columns = ['Firm',item]
    combined_stats = combined_stats.sort_values(by=[item],ascending=False)
    return combined_stats
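# Note: the conversion above only strips a trailing 'B' (billions); other rows come
# back from yahoo_fin as strings such as '800.5M' or '1.02T'. A hypothetical helper
# (not used in the original pipeline) that normalises all suffixes to billions could
# look like this:
def to_billions(x):
    # convert Yahoo-style strings ('800M', '45.2B', '1.02T') to a value in billions
    multipliers = {'M': 1e-3, 'B': 1.0, 'T': 1e3}
    if isinstance(x, str) and x and x[-1] in multipliers:
        return float(x[:-1]) * multipliers[x[-1]]
    return float(x)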
def stats2(asset_list, *item):
    output = pd.DataFrame(columns=['Firm']) 
    for i in item:
        #print(i)
        temp = stats1(asset_list, i)
        output = output.merge(temp, how = 'outer', on = "Firm")
    return output
# Select stocks based on their financial ratios
output_E = stats2(E_good, items[0],items[3],items[5])
output_M1 = stats2(M1_good, items[0],items[3],items[5])
output_M2 = stats2(M2_good, items[0],items[3],items[5])
output_L = stats2(L_good, items[0],items[3],items[5])
output_E
output_M1
output_M2
output_L
# Select stocks based on the above criteria
Selected_Stock = ["NEE", "LIN", "ENLAY", "ES", "SRE",'UNH', 'TMO', 'WST','RMD','LLY', "TSM", "TXN", "ECL", "AMAT", "KMB", "UNP", "FDX", "ASR", "EIX", "ZNH"]
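# Optional consistency check (an addition, not in the original notebook): confirm every
# selected ticker belongs to one of the 'good' ESG groups identified above.
good_set = set(E_good + M1_good + M2_good + L_good)
print([t for t in Selected_Stock if t not in good_set])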
def alpha_beta_calculation(asset, assetlist, benchmark, start_date = "2014-12-31", end_date = "2019-12-31"):
    nullin_df = pd.DataFrame(asset.loc[start_date:end_date],columns=assetlist)
    # Expected returns: annualised mean historical return for each asset
    mu = expected_returns.mean_historical_return(asset.loc[start_date:end_date])
    # Risk model: annualised sample covariance matrix of asset returns
    sigma = risk_models.sample_cov(asset.loc[start_date:end_date])
    # weight_bounds=(0, 1) keeps the portfolio long-only; a negative lower bound would allow shorting
    ef = EfficientFrontier(mu, sigma, weight_bounds=(0,1))
    ef.add_objective(objective_functions.L2_reg, gamma=2)
    # optional constraints are possible; see the PyPortfolioOpt documentation
    sharpe_portfolio=ef.max_sharpe(risk_free_rate=0.00078) #US Bond
    sharpe_portfolio_wt=ef.clean_weights()
    latest_prices = get_latest_prices(asset.loc[start_date:end_date])
    # Allocate Portfolio Value in $ as required to show number of shares/stocks to buy, 
    # also bounds for shorting will affect allocation
    # Maximum Sharpe Portfolio Allocation $1000000
    da = DiscreteAllocation(sharpe_portfolio_wt, latest_prices, total_portfolio_value=1000000)
    allocation, leftover = da.greedy_portfolio()
    max_sharpe_cla = cla.CLA(mu, sigma)
    max_sharpe_cla.max_sharpe()
    sharpe_portfolio_wt_list = list(sharpe_portfolio_wt.values())
    ret_data1 = asset.loc[start_date:end_date].pct_change()[1:]
    weighted_returns1 = (sharpe_portfolio_wt_list * ret_data1)
    portfolio_ret1 =weighted_returns1.sum(axis=1)
    benchmark1 = benchmark.loc[start_date:end_date].pct_change()[1:]
    (Beta, Alpha) = st.linregress(benchmark1.values, portfolio_ret1.values)[0:2]
    year = int(end_date[:4]) - int(start_date[:4])
    print(str(year) + "-year " + f"Beta: {Beta:.6f}", str(year) + "-year " +f"Alpha: {Alpha:.6f}")
    #print(str(year) + "-year " + f"Beta: {res[0]:.6f}", str(year) + "-year " +f"Alpha: {res[1]:.6f}")
# Idiosyncratic risk works in favour of the stock (shown as the regression residual / alpha)
# --> good-ESG & value-screened stocks
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY)
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY,"2018-12-31","2019-12-31")
alpha_beta_calculation(df_O[Selected_Stock], Selected_Stock,benchmark_SPY,"2016-12-31","2019-12-31")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2016-09-30","2021-09-30")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2020-09-30","2021-09-30")
alpha_beta_calculation(df_O_all[Selected_Stock], Selected_Stock,benchmark_SPY_all,"2018-09-30","2021-09-30")
selected_EF_3 = EF_benchmark3(df_O[Selected_Stock], Selected_Stock,benchmark_SPY, benchmark_IXIC,benchmark_DJI, _3benchmark)
M_EF = selected_EF_3.cumulative_return()
selected_EF_3.Efficient_frontier()
selected_EF_3.SP_weight()
g_ef.Efficient_frontier()
selected_EF_3.port_stat()
selected_EF_3.CI_test()
selected_EF_3.CI_test_benchmark()
selected_EF_5 = EF_benchmark5(df_O[Selected_Stock], Selected_Stock,benchmark_ESG3, benchmark_ESG4,benchmark_ESG5,benchmark_ESG6,benchmark_ESG7, _5benchmark)
S_EF = selected_EF_5.cumulative_return()
selected_EF_3.hypothesis_testing()
selected_EF_5.hypothesis_testing()
selected_EF_5.port_stat()
# Hard-coded max-Sharpe portfolio weights for the selected stocks
a = {'NEE': 0.0652, 'LIN': 0.03926, 'ENLAY': 0.05335, 'ES': 0.04204, 'SRE': 0.03119, 'UNH': 0.07758, 'TMO': 0.06344, 'WST': 0.07308, 'RMD': 0.07704, 'LLY': 0.05329, 'TSM': 0.07588, 'TXN': 0.06643, 'ECL': 0.04255, 'AMAT': 0.06896, 'KMB': 0.02258, 'UNP': 0.0358, 'FDX': 0.0, 'ASR': 0.03637, 'EIX': 0.02486, 'ZNH': 0.05111}
a = pd.DataFrame.from_dict(a,orient='index')
a = a.rename(columns={0: 'Weight'})
a = a.sort_values('Weight', ascending = False)
a.head(10)
b = df_I.loc[Selected_Stock]
test = b[['ESG_C_S','Env_S','Soc_S','Gov_S','ESG_Controversial','Industry']]
test1 = round(test.groupby('Industry').mean(),2)
test1
test1_2 = test1.mean()
test1_2 # portfolio
c = df_I
test = c[['ESG_C_S','Env_S','Soc_S','Gov_S','ESG_Controversial','Industry']]
test2 = round(test.groupby('Industry').mean(),2)
test2
test2_2 = test.mean()
test2_2 # overall (98) companies
features = ['Combined \n Score','Environment','Social','Governance','Controversial','Combined \n Score']
def plot_rader(value1, value2,  features, value1_name = 'Portfolio',value2_name = 'Overall', rader_size = 10):
    # use ggplot's plotting style
    plt.style.use('ggplot')
    # prepare the data
    values = value1.values.tolist()
    values2 = value2.values.tolist()
    feature = features
    N = len(values)
    # angles that divide the circle evenly, one spoke per feature
    angles=np.linspace(0, 2*np.pi, N, endpoint=False)
    # repeat the first value/angle so the radar outline closes
    values=np.concatenate((values,[values[0]]))
    values2=np.concatenate((values2,[values2[0]]))
    angles=np.concatenate((angles,[angles[0]]))
    # draw the chart
    fig=plt.figure()
    ax = fig.add_subplot(111, polar=True)
    # plot the first series
    ax.plot(angles, values, 'o-', linewidth=2, label = value1_name, color = 'limegreen')
    # fill the enclosed area
    ax.fill(angles, values, alpha=0.25,color= 'limegreen')
    # plot the second series
    ax.plot(angles, values2, 'o-', linewidth=2, label = value2_name, color = 'darkgreen')
    ax.fill(angles, values2, alpha=0.25, color = 'darkgreen')
    # label each spoke with its feature name
    ax.set_thetagrids(angles * 180/np.pi, feature)
    # set the radial axis range
    ax.set_ylim(0,rader_size)
    # add the title
    plt.title('ESG Radar')
    # add grid lines
    ax.grid(True)
    # add the legend
    plt.legend(loc = 2, prop={'size': 7.4})
    # show the figure
    plt.show()
plot_rader(test1_2, test2_2, features, 'Portfolio', 'Overall',10)
plot_rader(test1.loc['Energy'], test2.loc['Energy'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Medical'], test2.loc['Medical'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Manufacturing'], test2.loc['Manufacturing'], features, 'Portfolio', 'Overall',12)
plot_rader(test1.loc['Logistic'], test2.loc['Logistic'], features, 'Portfolio', 'Overall',12)
# Portfolio weights aggregated by industry
I = pd.DataFrame(df_I.loc[Selected_Stock][['Year','Industry']])
I_1 = I[I['Year']=='2014-12-31 00:00:00']
I_2 = a.merge(I_1["Industry"], how = 'inner', left_on = a.index, right_on = I_1.index)
I_2 = I_2.groupby("Industry").sum()
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = I_2.index
sizes = round(I_2['Weight'],2)
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90, colors = ['darkseagreen','olivedrab','green', 'mediumseagreen'])
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Portfolio Weighting by Industry")
plt.show()