!pip install pyarrow
import simfin as sf
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
Import data
# Set your API-key for downloading data. This key gets the free data.
sf.set_api_key('free')
# Set the local directory where data-files are stored.
# The directory will be created if it does not already exist.
sf.set_data_dir('~/simfin_data/')
# Download the data from the SimFin server and load into a Pandas DataFrame.
# The frame is indexed by (Ticker, Date) -- see the Grouper levels used in the
# monthly down-sampling step below -- with price/dividend/volume columns as
# listed in column_dtypes further down.
df = sf.load_shareprices(variant='daily', market='us')
Get the last entry for each Ticker per month and reset index.
# Down-sample to monthly frequency: keep the final daily record for each
# (Ticker, month) pair, then flatten the MultiIndex back into columns.
monthly_keys = [pd.Grouper(level='Ticker'), pd.Grouper(level='Date', freq='M')]
df = df.groupby(by=monthly_keys).last().reset_index()
# Column dtypes for the share-price table.  Referenced only by the
# commented-out read_csv ingestion path below (an offline alternative to the
# SimFin download) -- NOTE(review): presumably kept for that workflow; confirm.
column_dtypes = {
'Ticker' : 'str',
'Date' : 'str' ,
'SimFinId' : 'uint32',
'Open' : 'float64',
'Low' : 'float64',
'High' : 'float64',
'Close' : 'float64',
'Adj. Close' : 'float64',
'Dividend' : 'float64',
'Volume' : 'float64',
'Shares Outstanding': 'float64'}
#parse_dates = ['Date']
#df= pd.read_csv("SimFin_mth.csv", dtype = column_dtypes, parse_dates=parse_dates)
# Normalize 'Date' to datetime.  The round-trip via str makes this work
# whether 'Date' arrived as a timestamp (download path) or text (CSV path).
df['Date'] = pd.to_datetime(df['Date'].astype('str'))
# Derive string Year/Month keys from the timestamp and re-index on
# (Ticker, Year, Month).
df = df.assign(Year=df['Date'].dt.strftime('%Y'),
               Month=df['Date'].dt.strftime('%m'))
df = df.set_index(['Ticker', 'Year', 'Month'])
# Most recent observation per ticker, joined back as *_last columns; used
# below as the fallback endpoint when a fixed-horizon return is unavailable.
per_ticker_last = df.groupby(level='Ticker').last()[['Date', 'Adj. Close']]
df = df.join(per_ticker_last, rsuffix='_last')
df = df.loc[df['Close'] < 99999999]  # exclude bad data
Compute forward 1/2/3/5-year returns by offsetting each price series, falling back to the last known entry where the horizon extends past the data
#temp reduction in scope
#df = df['UFS':'UG'][['Adj. Close','Adj. Close_last']]
# Forward gross returns per ticker.  pct_change(periods=-k) gives
# (x_t - x_{t+k}) / x_{t+k}, so 1 / (1 + that) = x_{t+k} / x_t, i.e. the
# gross return over the next k months.
df['1Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -12))),5)
df['2Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -24))),5)
df['3Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -36))),5)
df['5Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -60))),5)
# BUG FIX: replace() returns a new frame; the result was previously discarded,
# so +/-inf (from zero prices) survived and escaped the fillna() fallbacks below.
df = df.replace([np.inf, -np.inf], np.nan)
# Fallback for horizons running past the end of the data: return to the last
# known price.  NOTE(review): this fills with a NET return (ratio - 1) while
# the primary values above are GROSS returns (ratio) -- confirm which
# convention is intended before comparing the two.
df['1Y']= round(df['1Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['2Y']= round(df['2Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['3Y']= round(df['3Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['5Y']= round(df['5Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['MktCap'] = round(df['Shares Outstanding'] * df['Close'],0)
# Yearly dollar volume.  Select 'Volume' before summing instead of the former
# agg(sum) over every column (the builtin-sum form is deprecated and summed
# columns that were then thrown away).
# NOTE(review): the groupby result is indexed on (Ticker, Year) while df is
# indexed on (Ticker, Year, Month); verify the multiplication aligns as intended.
df['VolAmt'] = round(df.groupby(['Ticker','Year'])['Volume'].sum() * df['Close'],0)
Export to CSV
#df.to_csv('SimFin_mth_out.csv')
Weird selections - data quality considerations
#df['1Y'].max()
#df[df['Dividend'].isnull()]['1Y'].mean() # much higher than with div
#for n in range(1,100,5):
# print(n, df[df['Adj. Close']>n]['1Y'].mean() - df['1Y'].mean()) # 11% less return for stocks with adj. close higher than 10??
#[df[df['Shares Outstanding'].isnull()]['1Y'].count(), df[df['Shares Outstanding'].isnull()]['1Y'].mean() - df['1Y'].mean()] # missing shares outstanding = outperformance
Analysis of the delta between the full sample of the 500 largest companies each month and an n-stock sample, for 1-5 year returns. Note that a time horizon of x years is not meaningful for start points later than x years before 2019.
def run(s_st, s_end):
    """Compare small random samples against the full monthly universe.

    For each month in 2009-2018, take the 500 stocks with the highest
    dollar volume ('VolAmt'), record the universe mean/median annualized
    1/2/3/5-year returns, then for each sample size n in {10, 15, 20, 25}
    and each seed s in range(s_st, s_end) record the delta between the
    sample's annualized return and the universe mean.

    Parameters
    ----------
    s_st, s_end : int
        Half-open range of random seeds to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns y, m, n, s, delta1y..delta5y.  'n' is a zero-padded string;
        rows with n == 'mean' / 'median' carry the universe statistics.

    Reads the module-level DataFrame ``df`` (indexed Ticker/Year/Month).
    """
    df1y = df.loc(axis=0)[:,'2007':'2018',:]
    out_list = []
    # BUG FIX: the month list previously contained '04' twice, so every
    # April was processed and appended twice per year.
    months = ['01','02','03','04','05','06','07','08','09','10','11','12']
    for y in ['2009', '2010','2011', '2012', '2013','2014', '2015', '2016', '2017', '2018']:
        for m in months:
            # Universe for this month: 500 highest dollar-volume stocks.
            df_run = df1y.loc(axis=0)[:,y,m].nlargest(500,'VolAmt')
            period_mean = df_run[['1Y', '2Y', '3Y', '5Y']].mean()
            # Multi-year gross returns are annualized via the k-th root.
            out_list.append([y, m, 'mean', 0,
                             period_mean['1Y'],
                             period_mean['2Y']**(1/2),
                             period_mean['3Y']**(1/3),
                             period_mean['5Y']**(1/5)])
            period_median = df_run[['1Y', '2Y', '3Y', '5Y']].median()
            # NOTE(review): the 1Y entry is median-minus-mean while the other
            # horizons are raw annualized medians -- confirm which is intended.
            out_list.append([y, m, 'median', 0,
                             period_median['1Y']-period_mean['1Y'],
                             period_median['2Y']**(1/2),
                             period_median['3Y']**(1/3),
                             period_median['5Y']**(1/5)])
            for n in [10, 15, 20, 25]:
                for s in range(s_st, s_end):
                    # Delta of the sample's annualized return vs the universe mean.
                    sample_mean = df_run.sample(n, random_state = s)[['1Y', '2Y', '3Y', '5Y']].mean()
                    out_list.append([y, m, n, s,
                                     sample_mean['1Y']-period_mean['1Y'],
                                     sample_mean['2Y']**(1/2)-period_mean['2Y']**(1/2),
                                     sample_mean['3Y']**(1/3)-period_mean['3Y']**(1/3),
                                     sample_mean['5Y']**(1/5)-period_mean['5Y']**(1/5)])
    out = pd.DataFrame(out_list)
    out.columns = ['y', 'm', 'n','s', 'delta1y', 'delta2y', 'delta3y', 'delta5y']
    # float16 keeps the (large) result compact; deltas are small so the
    # reduced precision is acceptable here.
    out = out.astype(dtype={'y': 'int64',
                            'm': 'category',
                            'n': 'str',
                            's': 'int64',
                            'delta1y': 'float16',
                            'delta2y': 'float16',
                            'delta3y': 'float16',
                            'delta5y': 'float16'})
    # Exclude incomplete series (e.g. 5-year analysis with start later than 2014).
    out.loc[(out.y > 2014) , 'delta5y'] = np.nan
    out.loc[(out.y > 2016) , 'delta3y'] = np.nan
    out.loc[(out.y > 2017) , 'delta2y'] = np.nan
    # Zero-pad numeric n so string sorting/grouping orders '010' < '015' < ...
    out['n'] = out.n.str.zfill(3)
    return out
#consider also weights df1.sample(n = 3, weights = test_weights)
# Seed ranges: each is run and pickled separately so a long job can be
# resumed / parallelized chunk by chunk.
s_range = [(101,300), (301,700), (701,1100)]
"""
Generate data
"""
filename = 'TMP_output_analysis_s'
for s_st, s_end in s_range:
    out = run(s_st, s_end)
    out.to_pickle(filename + str(s_st) + '-' + str(s_end) + '.pickle')
    out = ''  # drop the reference so the frame can be collected between runs
# Reload every chunk and stack them into one frame.
# COMPAT FIX: DataFrame.append() (used previously in a loop) was deprecated
# and removed in pandas 2.0; a single pd.concat is the supported equivalent
# and avoids the quadratic per-iteration copies.
out = pd.concat([pd.read_pickle(filename + str(s_st) + '-' + str(s_end) + '.pickle')
                 for s_st, s_end in s_range])
"""out['tmp'] = out['y']
out['y'] = out['m'].astype('int')
out['m'] = out['tmp'].astype('category')
out = out.drop('tmp', axis = 1)
out"""
Visualize: Boxplot of CAGR return of sample minus full universe:
- Median (Solid)
- Mean (Dashed)
- Bottom to top quintile (Box)
- 10-90 percentile (Whiskers)
# Keep only the random-sample delta rows; drop the universe summary rows.
out_plot = out.loc[~out.n.isin(['mean', 'median'])]
# One box per sample size n: outliers suppressed (sym=''), whiskers at the
# 10th/90th percentiles, dashed mean line drawn alongside the median.
delta_cols = ['n', 'delta1y', 'delta2y', 'delta3y', 'delta5y']
fig = out_plot[delta_cols].groupby('n').boxplot(
    figsize=[20, 10], sym='', whis=[10, 90],
    meanline=True, showmeans=True, layout=(1, 6))
# Print summary statistics for each sample size (universe rows excluded).
sample_sizes = out.loc[~out.n.isin(['mean', 'median']), 'n'].unique()
for size in sample_sizes:
    stats = out.loc[out.n == size, ['delta1y', 'delta2y', 'delta3y', 'delta5y']]
    print(size, stats.describe())
# Per-year summary statistics for the n=100 sample size.
# BUG FIX: out.y is cast to int64 in run(), so the previous comparison
# against the STRING years '2009'..'2018' never matched and every
# describe() printed an empty summary.  Iterate integer years instead.
# NOTE(review): run() currently only draws samples of n in {10, 15, 20, 25},
# so the '100' filter also selects no rows with the present settings --
# confirm the intended sample size.
for year in range(2009, 2019):
    print(year, out[(out.n == '100') & (out.y == year)][['delta1y', 'delta2y', 'delta3y', 'delta5y']].describe())
TODO:
- Percentile plot per year for n=100, delta5y, to find the percentile at which the delta exceeds 0.008, the "affordable" ER. Answer: approx. 30% risk.
out[(out.n == '100')].quantile([0.288, 0.29])