# start by loading in our typical set of packages
import numpy as np # NumPy - array math, np.where, np.nan
import pandas as pd # pandas - DataFrame handling
import seaborn as sns # seaborn - statistical plotting (imported for later use)
import matplotlib.pyplot as plt # matplotlib - figures and histograms
import sys
# print the interpreter version so the notebook records which Python ran it
print(sys.version)
3.7.12 (default, Oct 12 2021, 03:36:26)
[GCC 8.3.0]
# use pip to install the package 'xlrd'
# pandas needs it as the engine to read the legacy .xls Excel file below
# NOTE: the leading '!' is Jupyter/IPython shell-escape syntax, not plain Python;
# this line only works inside a notebook cell.
!pip install xlrd
Collecting xlrd
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
|████████████████████████████████| 96 kB 5.6 MB/s
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Input file: a Capital IQ-style screening report in legacy .xls format.
fnamein = 'Company Screening Report 0yr Lag.xls'
# Read the 'Screening' sheet with pandas. The first 8 rows are report boilerplate
# (hence skiprows=8), there is no usable header row (header=None), and the vendor
# marks missing/not-meaningful values as '-' or 'NM' (hence na_values).
df_mult0 = pd.read_excel(fnamein, sheet_name='Screening', skiprows=8,
                         header=None, na_values=['-', 'NM'])
# Assign our own short column names in place of the long ones in the Excel file.
i = ['name', 'ex:tick', 'comp_type', 'exchange', 'industry', 'sector',
     'marketcap', 'total_debt', 'shrout', 'revenue', 'gross_profit', 'sga',
     'advertising', 'marketing', 'sales_marketing', 'gen_admin', 'rd_expense',
     'int_expense', 'op_lease_int', 'ebitda', 'ebit', 'net_income', 'cash',
     'capex', 'acquisitions', 'changeNWC', 'beta1', 'beta2', 'beta5', 'etr',
     'tev', 'gRev1yr', 'gRev3yr', 'ebit_margin', 'ebitda_margin',
     'gross_margin', 'roe', 'roc', 'altman']
df_mult0.columns = i
# The exchange:ticker string becomes our unique row identifier, so rows without
# one are useless - drop them first, then promote the column to the index.
# verify_integrity=True raises if any ticker appears twice.
df_mult0 = df_mult0.dropna(axis=0, how='any', subset=['ex:tick'])
df_mult0 = df_mult0.set_index('ex:tick', verify_integrity=True)
print(df_mult0.info())
df_mult0.describe()
<class 'pandas.core.frame.DataFrame'>
Index: 5294 entries, NasdaqGS:FLWS to NasdaqGS:ZNGA
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 5294 non-null object
1 comp_type 5294 non-null object
2 exchange 4904 non-null object
3 industry 5294 non-null object
4 sector 5294 non-null object
5 marketcap 5282 non-null float64
6 total_debt 4998 non-null float64
7 shrout 5282 non-null float64
8 revenue 5156 non-null float64
9 gross_profit 5160 non-null float64
10 sga 5160 non-null float64
11 advertising 5160 non-null float64
12 marketing 5160 non-null float64
13 sales_marketing 5160 non-null float64
14 gen_admin 5160 non-null float64
15 rd_expense 5160 non-null float64
16 int_expense 5160 non-null float64
17 op_lease_int 5160 non-null float64
18 ebitda 5156 non-null float64
19 ebit 5160 non-null float64
20 net_income 5160 non-null float64
21 cash 4920 non-null float64
22 capex 4917 non-null float64
23 acquisitions 4913 non-null float64
24 changeNWC 5046 non-null float64
25 beta1 4351 non-null float64
26 beta2 4351 non-null float64
27 beta5 4101 non-null float64
28 etr 3424 non-null float64
29 tev 4641 non-null float64
30 gRev1yr 4280 non-null float64
31 gRev3yr 3967 non-null float64
32 ebit_margin 3529 non-null float64
33 ebitda_margin 3510 non-null float64
34 gross_margin 3851 non-null float64
35 roe 4481 non-null float64
36 roc 4161 non-null float64
37 altman 3509 non-null float64
dtypes: float64(33), object(5)
memory usage: 1.6+ MB
None
# Summary stats for the two variables mentioned above.
df_mult0.loc[:, ['revenue', 'tev']].describe()
# Clean-up strategy - pick some (not all) of the following options:
# Option 1: set nonsensical values to NaN, keep the rest the same.
# A negative revenue or negative enterprise value makes no economic sense, so
# mask() replaces any value where the condition holds with NaN.
df_mult0.loc[:, 'revenue'] = df_mult0['revenue'].mask(df_mult0['revenue'] < 0)
df_mult0.loc[:, 'tev'] = df_mult0['tev'].mask(df_mult0['tev'] < 0)
# Option 2: fill the NaN values with something?
#df_mult0['revenue'].fillna(value='count',inplace=True)
#df_mult0['tev'].fillna(value='count',inplace=True)
# Option 3: drop the rows that still have NaN in either column.
df_mult0 = df_mult0.dropna(axis=0, how='any', subset=['revenue', 'tev'])
# Check whether the problem is solved.
df_mult0.loc[:, ['revenue', 'tev']].describe()
#delete possibly before submission
df_mult0.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4579 entries, NasdaqGS:FLWS to NasdaqGS:ZNGA
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 4579 non-null object
1 comp_type 4579 non-null object
2 exchange 4231 non-null object
3 industry 4579 non-null object
4 sector 4579 non-null object
5 marketcap 4579 non-null float64
6 total_debt 4418 non-null float64
7 shrout 4579 non-null float64
8 revenue 4579 non-null float64
9 gross_profit 4579 non-null float64
10 sga 4579 non-null float64
11 advertising 4579 non-null float64
12 marketing 4579 non-null float64
13 sales_marketing 4579 non-null float64
14 gen_admin 4579 non-null float64
15 rd_expense 4579 non-null float64
16 int_expense 4579 non-null float64
17 op_lease_int 4579 non-null float64
18 ebitda 4579 non-null float64
19 ebit 4579 non-null float64
20 net_income 4579 non-null float64
21 cash 4347 non-null float64
22 capex 4381 non-null float64
23 acquisitions 4377 non-null float64
24 changeNWC 4501 non-null float64
25 beta1 3807 non-null float64
26 beta2 3807 non-null float64
27 beta5 3573 non-null float64
28 etr 2899 non-null float64
29 tev 4579 non-null float64
30 gRev1yr 3760 non-null float64
31 gRev3yr 3453 non-null float64
32 ebit_margin 3509 non-null float64
33 ebitda_margin 3490 non-null float64
34 gross_margin 3748 non-null float64
35 roe 3929 non-null float64
36 roc 4124 non-null float64
37 altman 3488 non-null float64
dtypes: float64(33), object(5)
memory usage: 1.4+ MB
# let's look at the dataframe, but only where the 'exchange' variable is NaN
df_mult0[df_mult0.exchange.isna()]
# this looks a lot like they are just international firms. We should keep these.
# Let's fill them with a value that may help us sort later.
# Also, later functions won't like NaN in our data.
# NOTE: df['col'].fillna(..., inplace=True) is the deprecated chained-assignment
# pattern (FutureWarning in pandas 2.x, an error under Copy-on-Write), so we
# assign the filled Series back to the column instead.
df_mult0['exchange'] = df_mult0['exchange'].fillna('NonUS')
# all this does is replace the NaN above with the text NonUS for Non-US Exchanges.
# check how many NaN are left in the data frame (info() shows non-null counts)
df_mult0.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4579 entries, NasdaqGS:FLWS to NasdaqGS:ZNGA
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 4579 non-null object
1 comp_type 4579 non-null object
2 exchange 4579 non-null object
3 industry 4579 non-null object
4 sector 4579 non-null object
5 marketcap 4579 non-null float64
6 total_debt 4418 non-null float64
7 shrout 4579 non-null float64
8 revenue 4579 non-null float64
9 gross_profit 4579 non-null float64
10 sga 4579 non-null float64
11 advertising 4579 non-null float64
12 marketing 4579 non-null float64
13 sales_marketing 4579 non-null float64
14 gen_admin 4579 non-null float64
15 rd_expense 4579 non-null float64
16 int_expense 4579 non-null float64
17 op_lease_int 4579 non-null float64
18 ebitda 4579 non-null float64
19 ebit 4579 non-null float64
20 net_income 4579 non-null float64
21 cash 4347 non-null float64
22 capex 4381 non-null float64
23 acquisitions 4377 non-null float64
24 changeNWC 4501 non-null float64
25 beta1 3807 non-null float64
26 beta2 3807 non-null float64
27 beta5 3573 non-null float64
28 etr 2899 non-null float64
29 tev 4579 non-null float64
30 gRev1yr 3760 non-null float64
31 gRev3yr 3453 non-null float64
32 ebit_margin 3509 non-null float64
33 ebitda_margin 3490 non-null float64
34 gross_margin 3748 non-null float64
35 roe 3929 non-null float64
36 roc 4124 non-null float64
37 altman 3488 non-null float64
dtypes: float64(33), object(5)
memory usage: 1.4+ MB
# working from the top, the next one with missing values is Market Cap. (depending on what you did exactly - it may be ok)
# if the firm is missing MarketCap, then that means we don't have stock price or shares outstanding.
# Either is a problem, so we have to drop these firms.
df_mult0.dropna(axis=0, how='any', subset=['marketcap'], inplace=True)
# next we see total_debt.
# what does it mean if total_debt is missing? It likely means that the firm just has no debt.
# filter the Data Frame by only looking at firms where total_debt is NaN.
df_mult0.loc[df_mult0.total_debt.isna()]
# Maybe we need to look by sector. Where are these firms with missing total_debt?
# Count the NaN in total_debt by sector: same filter as above, then groupby('sector').
df_mult0.loc[df_mult0.total_debt.isna()].groupby(['sector']).count()
# nifty trick - combine two conditional statements: which firms have NaN total_debt
# and are NOT in the 'Financials' sector
# (the parentheses around each conditional statement matter for operator precedence)
df_mult0.loc[(df_mult0.total_debt.isna()) & (df_mult0.sector != 'Financials')]
# since we feel comfortable that these are normal companies that just have 0 total_debt,
# fill the NaN with 0.
# NOTE: df['col'].fillna(..., inplace=True) is the deprecated chained-assignment
# pattern (FutureWarning in pandas 2.x), so assign the result back to the column.
df_mult0['total_debt'] = df_mult0['total_debt'].fillna(0)
df_mult0.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4579 entries, NasdaqGS:FLWS to NasdaqGS:ZNGA
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 4579 non-null object
1 comp_type 4579 non-null object
2 exchange 4579 non-null object
3 industry 4579 non-null object
4 sector 4579 non-null object
5 marketcap 4579 non-null float64
6 total_debt 4579 non-null float64
7 shrout 4579 non-null float64
8 revenue 4579 non-null float64
9 gross_profit 4579 non-null float64
10 sga 4579 non-null float64
11 advertising 4579 non-null float64
12 marketing 4579 non-null float64
13 sales_marketing 4579 non-null float64
14 gen_admin 4579 non-null float64
15 rd_expense 4579 non-null float64
16 int_expense 4579 non-null float64
17 op_lease_int 4579 non-null float64
18 ebitda 4579 non-null float64
19 ebit 4579 non-null float64
20 net_income 4579 non-null float64
21 cash 4347 non-null float64
22 capex 4381 non-null float64
23 acquisitions 4377 non-null float64
24 changeNWC 4501 non-null float64
25 beta1 3807 non-null float64
26 beta2 3807 non-null float64
27 beta5 3573 non-null float64
28 etr 2899 non-null float64
29 tev 4579 non-null float64
30 gRev1yr 3760 non-null float64
31 gRev3yr 3453 non-null float64
32 ebit_margin 3509 non-null float64
33 ebitda_margin 3490 non-null float64
34 gross_margin 3748 non-null float64
35 roe 3929 non-null float64
36 roc 4124 non-null float64
37 altman 3488 non-null float64
dtypes: float64(33), object(5)
memory usage: 1.4+ MB
# here is another trick, though you may not need it.
# what if a firm has EBIT but not EBITDA? We want to keep the firm, so we fill the missing
# EBITDA field with the EBIT, assuming that D&A is 0 rather than unknown (NaN).
#df_mult0['ebitda'].fillna(value=df_mult0['ebit'], inplace=True)
You are now a lot of the way there. The goal is to do this kind of analysis all the way down for all of your variables, finding why there is any odd data - negatives, NaN, zeros, etc. You can use filters, look at the data, summary stats, etc.
# lets put all of our conditions into a single index object, then use that to filter our dataframe
# do an internet search for "pandas dataframe slice by multiple conditions" if you want more info on what I'm doing here
# each of these creates a boolean Series using specific criteria.
idx_margin = (df_mult0.ebit_margin > 10) & (df_mult0.ebit_margin < 20)
idx_roc = (df_mult0.roc > 10) & (df_mult0.roc < 20)
idx_growth = (df_mult0.gRev1yr > 0) & (df_mult0.gRev1yr < 20)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx, ['ebit_margin', 'roc', 'gRev1yr', 'beta2']].describe()
# compute the EV/EBITDA multiple.
# Guard the division with np.where() so a zero denominator yields NaN
# instead of +/-inf polluting the summary stats below.
df_mult0['ev_ebitda'] = np.where(df_mult0['ebitda'] != 0,
                                 df_mult0['tev'] / df_mult0['ebitda'],
                                 np.nan)
# check out your multiple
df_mult0[idx]['ev_ebitda'].describe()
#print(df_mult0[idx]['ev_ebitda']) #to see individual multiples
df_mult0[idx].sort_values(by='ev_ebitda')
# histogram of the multiple for the screened firms
#df_mult0[idx]['ev_ebitda'].hist()
#hist=df_mult0.hist(bins=5)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# color='g' sets the bar color directly - no need to loop over the patches
ax.hist(df_mult0[idx]['ev_ebitda'], bins=10, range=(0, 100),
        histtype='bar', color='g')
plt.title('EV/EBITDA Frequency Plot')
plt.xlabel('EV/EBITDA')
plt.ylabel('Frequency')
# second screen: weaker firms - low margins, low ROC, shrinking revenue
idx_margin = (df_mult0.ebit_margin > 0) & (df_mult0.ebit_margin < 10)
idx_roc = (df_mult0.roc > 0) & (df_mult0.roc < 10)
idx_growth = (df_mult0.gRev1yr < 0) & (df_mult0.gRev1yr > -20)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx, ['ebit_margin', 'roc', 'gRev1yr', 'beta2']].describe()
# compute the EV/EBIT multiple.
# Guard the division with np.where() so a zero denominator yields NaN
# instead of +/-inf polluting the summary stats below.
df_mult0['ev_ebit'] = np.where(df_mult0['ebit'] != 0,
                               df_mult0['tev'] / df_mult0['ebit'],
                               np.nan)
# check out your multiple
df_mult0[idx]['ev_ebit'].describe()
#print(df_mult0[idx]['ev_ebit']) #to see individual multiples
df_mult0[idx].sort_values(by='ev_ebit')
# histogram of the multiple for the screened firms
#df_mult0[idx]['ev_ebit'].hist()
#hist=df_mult0.hist(bins=10)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# color='r' sets the bar color directly - no need to loop over the patches
ax.hist(df_mult0[idx]['ev_ebit'], bins=15, range=(0, 100),
        histtype='bar', color='r')
plt.title('EV/EBIT Frequency Plot')
plt.xlabel('EV/EBIT')
plt.ylabel('Frequency')
# We identified IntriCon Corporation (NasdaqGM:IIN) as the poorest performing company from our screen and
# are giving it a SELL recommendation. We came to this conclusion after observing its negative growth rates,
# negative return on equity, declining revenues, negative net income, high expenses, and extremely high multiples.
# Stats: ROE = -2.74, Revenue 1 year growth = -9.45, EV/EBIT multiple = 2,147, Net Income = -2.52
# We would recommend selling IntriCon Corp because we expect a significant decline in the next several years, as
# evidenced by the large negative 1-year growth rate of -9.45% and negative net income of -$2.52.
# third screen: stronger firms - high margins, high ROC, growing revenue
idx_margin = (df_mult0.ebit_margin > 20) & (df_mult0.ebit_margin < 30)
idx_roc = (df_mult0.roc > 20) & (df_mult0.roc < 30)
idx_growth = (df_mult0.gRev1yr > 0) & (df_mult0.gRev1yr < 30)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx, ['ebit_margin', 'roc', 'gRev1yr', 'beta2']].describe()
# compute the EV/EBIT multiple.
# Guard the division with np.where() so a zero denominator yields NaN
# instead of +/-inf polluting the summary stats below.
df_mult0['ev_ebit'] = np.where(df_mult0['ebit'] != 0,
                               df_mult0['tev'] / df_mult0['ebit'],
                               np.nan)
# check out your multiple
df_mult0[idx]['ev_ebit'].describe()
#print(df_mult0[idx]['ev_ebit']) #to see individual multiples
df_mult0[idx].sort_values(by='ev_ebit')
# histogram of the multiple for the screened firms
#df_mult0[idx]['ev_ebit'].hist()
#hist=df_mult0.hist(bins=10)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# color='b' sets the bar color directly - no need to loop over the patches
ax.hist(df_mult0[idx]['ev_ebit'], bins=15, range=(0, 100),
        histtype='bar', color='b')
plt.title('EV/EBIT Frequency Plot')
plt.xlabel('EV/EBIT')
plt.ylabel('Frequency')
# We identified Apple as the best performing company from our screen
# and are giving it a BUY recommendation. We came to this conclusion after observing it's positive ROE, high revenue growth rate of roughly 10%
# and an established, mature EV/EBIT multiple of 28.35. While these numbers are not the highest amongst its peers
# we felt that Apple is an established company and will continue to see growth through its innovation and technology
# Stats: ROE = 82.1, Revenue 1 year growth = 9.88, EV/EBIT multiple = 28.35, Net Income = 63,930