!pip install pyarrow
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Requirement already satisfied: pyarrow in /opt/venv/lib/python3.7/site-packages (2.0.0)
Requirement already satisfied: numpy>=1.14 in /opt/venv/lib/python3.7/site-packages (from pyarrow) (1.19.4)
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
import simfin as sf
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
# Set your API-key for downloading data. This key gets the free data.
sf.set_api_key('free')
# Set the local directory where data-files are stored.
# The directory will be created if it does not already exist.
sf.set_data_dir('~/simfin_data/')
# Download the data from the SimFin server and load into a Pandas DataFrame.
df = sf.load_shareprices(variant='daily', market='us')
Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... Done!
df= df.groupby(by=[pd.Grouper(level='Ticker'), pd.Grouper(level='Date', freq='M')]).last()
df = df.reset_index()
column_dtypes = {
'Ticker' : 'str',
'Date' : 'str' ,
'SimFinId' : 'uint32',
'Open' : 'float64',
'Low' : 'float64',
'High' : 'float64',
'Close' : 'float64',
'Adj. Close' : 'float64',
'Dividend' : 'float64',
'Volume' : 'float64',
'Shares Outstanding': 'float64'}
#parse_dates = ['Date']
#df= pd.read_csv("SimFin_mth.csv", dtype = column_dtypes, parse_dates=parse_dates)
df['Date'] = pd.to_datetime(df['Date'].astype('str'))
df['Year'] = df['Date'].dt.strftime('%Y')
df['Month'] = df['Date'].dt.strftime('%m')
df = df.set_index(['Ticker','Year','Month'])
last = df.groupby(level = 'Ticker').last()[['Date','Adj. Close']]
df = df.join(last, rsuffix='_last')
df = df[df['Close'] < 99999999] #exclude bad data
#temp reduction in scope
#df = df['UFS':'UG'][['Adj. Close','Adj. Close_last']]
df['1Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -12))),5)
df['2Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -24))),5)
df['3Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -36))),5)
df['5Y'] = round(1/(1+df.groupby(level='Ticker')['Adj. Close'].apply(lambda x: x.pct_change(periods = -60))),5)
df.replace([np.inf, -np.inf], np.nan)
df['1Y']= round(df['1Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['2Y']= round(df['2Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['3Y']= round(df['3Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['5Y']= round(df['5Y'].fillna(df['Adj. Close_last']/df['Adj. Close']-1),5)
df['MktCap'] = round(df['Shares Outstanding'] * df['Close'],0)
df['VolAmt'] = round(df.groupby(['Ticker','Year']).agg(sum)['Volume'] * df['Close'],0)
#df.to_csv('SimFin_mth_out.csv')
#df['1Y'].max()
#df[df['Dividend'].isnull()]['1Y'].mean() # much higher than with div
#for n in range(1,100,5):
# print(n, df[df['Adj. Close']>n]['1Y'].mean() - df['1Y'].mean()) # 11% less return for stocks with adj. close higher than 10??
#[df[df['Shares Outstanding'].isnull()]['1Y'].count(), df[df['Shares Outstanding'].isnull()]['1Y'].mean() - df['1Y'].mean()] # missing shares outstanding = outperformance
def run(s_st, s_end):
df1y = df.loc(axis=0)[:,'2007':'2018',:]
out_list = []
for y in ['2009', '2010','2011', '2012', '2013','2014', '2015', '2016', '2017', '2018']:
for m in ['01','02','03','04','04','05','06','07','08','09','10','11','12']:
df_run = df1y.loc(axis=0)[:,y,m].nlargest(500,'VolAmt')
period_mean = df_run[['1Y', '2Y', '3Y', '5Y']].mean()
out_list.append([y, m, 'mean', 0,
period_mean['1Y'],
period_mean['2Y']**(1/2),
period_mean['3Y']**(1/3),
period_mean['5Y']**(1/5)])
period_median = df_run[['1Y', '2Y', '3Y', '5Y']].median()
out_list.append([y, m, 'median', 0,
period_median['1Y']-period_mean['1Y'],
period_median['2Y']**(1/2),
period_median['3Y']**(1/3),
period_median['5Y']**(1/5)])
for n in [10, 15, 20, 25]:
for s in range (s_st,s_end):
sample_mean = df_run.sample(n, random_state = s)[['1Y', '2Y', '3Y', '5Y']].mean()
out_list.append([y, m, n, s,
sample_mean['1Y']-period_mean['1Y'],
sample_mean['2Y']**(1/2)-period_mean['2Y']**(1/2),
sample_mean['3Y']**(1/3)-period_mean['3Y']**(1/3),
sample_mean['5Y']**(1/5)-period_mean['5Y']**(1/5)])
out = pd.DataFrame(out_list)
out.columns = ['y', 'm', 'n','s', 'delta1y', 'delta2y', 'delta3y', 'delta5y']
out = out.astype(dtype={'y': 'int64',
'm': 'category',
'n': 'str',
's': 'int64',
'delta1y': 'float16',
'delta2y': 'float16',
'delta3y': 'float16',
'delta5y': 'float16'})
#Exclude incomplete series (e.g. 5 year analysis with start later than 2014)
out.loc[(out.y > 2014) , 'delta5y'] = np.nan
out.loc[(out.y > 2016) , 'delta3y'] = np.nan
out.loc[(out.y > 2017) , 'delta2y'] = np.nan
out['n'] = out.n.str.zfill(3)
return out
#consider also weights df1.sample(n = 3, weights = test_weights)
s_range = [(101,300), (301,700), (701,1100)]
"""
Generate data
"""
filename = 'TMP_output_analysis_s'
for s_st, s_end in s_range:
out = run(s_st,s_end)
out.to_pickle(filename+str(s_st)+'-'+str(s_end)+'.pickle')
out = ''
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:27: RuntimeWarning: invalid value encountered in double_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:26: RuntimeWarning: invalid value encountered in double_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:25: RuntimeWarning: invalid value encountered in double_scalars
s_st, s_end = s_range[0]
out = pd.read_pickle(filename+str(s_st)+'-'+str(s_end)+'.pickle')
for s_st, s_end in s_range[1:]:
out = out.append(pd.read_pickle(filename+str(s_st)+'-'+str(s_end)+'.pickle'))
"""out['tmp'] = out['y']
out['y'] = out['m'].astype('int')
out['m'] = out['tmp'].astype('category')
out = out.drop('tmp', axis = 1)
out"""
out_plot = out[~out.n.isin(['mean', 'median'])]
fig = out_plot[['n','delta1y', 'delta2y', 'delta3y', 'delta5y']].groupby('n').boxplot(figsize = [20,10], sym = '', whis = [10, 90], meanline = True, showmeans = True, layout = (1,6))
for show in out[~out.n.isin(['mean', 'median'])].n.unique():
#print(show, out[out.n.isin([show])][['delta1y', 'delta2y', 'delta3y', 'delta5y']].describe(percentiles = [.5]))
print(show, out[out.n.isin([show])][['delta1y', 'delta2y', 'delta3y', 'delta5y']].describe())
010 delta1y delta2y delta3y delta5y
count 129610.000000 116649.000000 103688.000000 77766.000000
mean -0.000000 -0.000000 -0.000000 -0.000000
std 0.000000 0.000000 0.000000 0.000000
min -0.586914 -0.408691 -0.341064 -0.352539
25% -0.069458 -0.051331 -0.041992 -0.034607
50% -0.002370 -0.003813 -0.003727 -0.004448
75% 0.065002 0.043884 0.034058 0.025894
max 1.596680 0.516602 0.342041 0.252441
015 delta1y delta2y delta3y delta5y
count 129610.000000 116649.000000 103688.000000 77766.000000
mean 0.000000 -0.000000 -0.000000 -0.000000
std 0.000000 0.000000 0.000000 0.000000
min -0.548828 -0.309814 -0.265869 -0.217896
25% -0.056671 -0.041534 -0.033905 -0.028000
50% -0.001673 -0.002502 -0.002659 -0.002907
75% 0.054199 0.036865 0.028687 0.022110
max 1.250000 0.432373 0.265625 0.210693
020 delta1y delta2y delta3y delta5y
count 129610.000000 116649.000000 103688.000000 77766.000000
mean 0.000000 -0.000000 -0.000000 -0.000000
std 0.000000 0.000000 0.000000 0.000000
min -0.497803 -0.235596 -0.209961 -0.182251
25% -0.048950 -0.035736 -0.029099 -0.024048
50% -0.001482 -0.001957 -0.002224 -0.002396
75% 0.046997 0.032135 0.025070 0.019608
max 0.941406 0.335205 0.217773 0.159546
025 delta1y delta2y delta3y delta5y
count 129610.000000 116649.000000 103688.000000 77766.000000
mean 0.000000 -0.000000 -0.000000 -0.000000
std 0.000000 0.000000 0.000000 0.000000
min -0.396973 -0.218506 -0.208252 -0.152344
25% -0.043427 -0.031769 -0.025986 -0.021225
50% -0.000999 -0.001452 -0.001778 -0.001842
75% 0.042480 0.029083 0.022675 0.017868
max 0.812012 0.361816 0.199463 0.164185
for show in ['2009', '2010', '2011','2012', '2013', '2014','2015', '2016', '2017', '2018']:
print(show, out[(out.n == '100') & (out.y.isin([show]))][['delta1y', 'delta2y', 'delta3y', 'delta5y']].describe())
2009 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean -0.000516 -0.000194 -0.000183 -0.000265
std 0.046692 0.024567 0.017334 0.013199
min -0.249146 -0.113342 -0.074097 -0.053894
25% -0.026398 -0.015694 -0.011314 -0.008972
50% -0.001304 -0.000327 -0.000212 -0.000367
75% 0.024551 0.014755 0.010757 0.008396
max 0.284180 0.136963 0.075989 0.051697
2010 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean -0.000078 -0.000084 -0.000104 -0.000233
std 0.026962 0.019180 0.014397 0.012215
min -0.115295 -0.075073 -0.061615 -0.047913
25% -0.018143 -0.013283 -0.009872 -0.008450
50% -0.000306 -0.000229 -0.000069 -0.000189
75% 0.017609 0.012665 0.009464 0.008087
max 0.106262 0.082397 0.056946 0.046753
2011 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean 0.000401 0.000071 -0.000081 -0.000174
std 0.026917 0.019714 0.016602 0.012413
min -0.116821 -0.071228 -0.062927 -0.046692
25% -0.017700 -0.013191 -0.011639 -0.008678
50% 0.000320 0.000203 -0.000134 -0.000191
75% 0.018440 0.013302 0.011028 0.008213
max 0.112427 0.082458 0.073425 0.045349
2012 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean -0.000241 -0.000346 -0.000269 -0.000377
std 0.031586 0.021896 0.018250 0.014259
min -0.106140 -0.082581 -0.069641 -0.053894
25% -0.021446 -0.015240 -0.012833 -0.009911
50% -0.000561 -0.000524 -0.000325 -0.000417
75% 0.020126 0.014324 0.012032 0.009094
max 0.156006 0.095215 0.074646 0.061035
2013 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean -0.000238 -0.000526 -0.000417 -0.000600
std 0.028793 0.021317 0.015839 0.015793
min -0.122620 -0.085571 -0.065796 -0.057037
25% -0.019531 -0.014824 -0.011116 -0.011467
50% -0.000470 -0.000585 -0.000535 -0.001034
75% 0.018517 0.013412 0.010330 0.009918
max 0.145142 0.089783 0.060181 0.055939
2014 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 12935.000000
mean 0.000422 -0.000025 -0.000290 -0.000434
std 0.029114 0.019333 0.018250 0.015419
min -0.113098 -0.072449 -0.068115 -0.061401
25% -0.019203 -0.013149 -0.012558 -0.010632
50% 0.000269 0.000088 -0.000695 -0.000402
75% 0.019966 0.013161 0.011723 0.010056
max 0.101196 0.065857 0.064514 0.059692
2015 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 0.0
mean -0.000135 -0.000359 -0.000360 NaN
std 0.026337 0.021698 0.019775 NaN
min -0.101379 -0.077393 -0.066528 NaN
25% -0.017792 -0.014969 -0.013794 NaN
50% -0.000054 -0.000937 -0.000975 NaN
75% 0.017075 0.013954 0.012207 NaN
max 0.106750 0.090759 0.077026 NaN
2016 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 12935.000000 0.0
mean -0.000078 -0.000347 -0.000586 NaN
std 0.032135 0.024017 0.018448 NaN
min -0.124084 -0.091064 -0.085022 NaN
25% -0.021332 -0.016586 -0.013008 NaN
50% -0.000185 -0.000621 -0.000568 NaN
75% 0.020950 0.015663 0.011948 NaN
max 0.125000 0.090088 0.065613 NaN
2017 delta1y delta2y delta3y delta5y
count 12935.000000 12935.000000 0.0 0.0
mean -0.000702 -0.000515 NaN NaN
std 0.031738 0.022064 NaN NaN
min -0.129761 -0.088806 NaN NaN
25% -0.021973 -0.015301 NaN NaN
50% -0.000684 -0.000143 NaN NaN
75% 0.020988 0.014568 NaN NaN
max 0.109558 0.092224 NaN NaN
2018 delta1y delta2y delta3y delta5y
count 12935.000000 0.0 0.0 0.0
mean -0.000370 NaN NaN NaN
std 0.030136 NaN NaN NaN
min -0.133179 NaN NaN NaN
25% -0.020233 NaN NaN NaN
50% -0.000224 NaN NaN NaN
75% 0.020004 NaN NaN NaN
max 0.130493 NaN NaN NaN
out[(out.n == '100')].quantile([0.288, 0.29])