# start by loading in our typical set of packages
import numpy as np # NumPy
import pandas as pd # pandas
import seaborn as sns
import matplotlib.pyplot as plt
import sys
print(sys.version)

# use pip to install the package 'xlrd'
# need it to read the excel file below
!pip install xlrd

fnamein = 'Company Screening Report 0yr Lag.xls'
# use pandas to read the excel file. Open it if you like and check out why the parameters are set the way they are.
df_mult0 = pd.read_excel(fnamein,sheet_name='Screening',header=None,na_values=['-','NM'],skiprows=8)
# here we are making our own column names instead of the longer ones in the Excel file.
i = ['name','ex:tick','comp_type','exchange','industry','sector','marketcap','total_debt','shrout','revenue','gross_profit','sga','advertising','marketing','sales_marketing','gen_admin','rd_expense','int_expense','op_lease_int','ebitda','ebit','net_income','cash','capex','acquisitions','changeNWC','beta1','beta2','beta5','etr','tev','gRev1yr','gRev3yr','ebit_margin','ebitda_margin','gross_margin','roe','roc','altman']
# set column names
df_mult0.columns = i
# We are going to make the ticker our index - i.e. the unique identifier. So we can't have any NA values, so .dropna()
df_mult0.dropna(axis=0, how='any', subset=['ex:tick'],inplace=True)
df_mult0.set_index('ex:tick',inplace=True, verify_integrity=True)
print(df_mult0.info())
df_mult0.describe()

# fill in the two variables mentioned above to see their summary stats
df_mult0.loc[:,['revenue','tev']].describe()

# lets fix that. What should we do? Pick some (not all) of the following
# set values that are nonsensical to NaN, keep the rest the same?
df_mult0.loc[:,'revenue'] = np.where(df_mult0['revenue']< 0,np.nan,df_mult0['revenue'])
df_mult0.loc[:,'tev'] = np.where(df_mult0['tev']<0,np.nan,df_mult0['tev'])
# fill the NaN values with something?
#df_mult0['revenue'].fillna(value='count',inplace=True)
#df_mult0['tev'].fillna(value='count',inplace=True)
# drop NaN values?
df_mult0.dropna(axis=0, how='any', subset=['revenue','tev'], inplace=True)
# see if you solved the problem
df_mult0.loc[:,['revenue','tev']].describe()

#delete possibly before submission
df_mult0.info()

# let's look at the dataframe, but only where the 'exchange' variable is NaN
df_mult0[df_mult0.exchange.isna()]

# this looks a lot like they are just international firms. We should keep these.
# Let's fill them with a value that may help us sort later.
# Also, later functions won't like NaN in our data.
df_mult0['exchange'].fillna(value='NonUS', inplace=True)
#all this does is replace the NaN above with the text NonUS for Non-US Exchanges.
#check how many NaN are left in the data frame
# what function did we use above to see the non-null counts?
df_mult0.info()

# working from the top, the next one with missing values is Market Cap. (depending on what you did exactly - it may be ok)
# if the firm is missing MarketCap, then that means we don't have stock price or shares outstanding.
# Either is a problem, so we have to drop these firms. How do we do that?
# drop NaN values
df_mult0.dropna(axis=0, how='any', subset=['marketcap'], inplace=True)

# next we see total_debt.
# what does it mean if total_debt is missing? It likely means that the firm just has no debt.
# filter the Data Frame by only looking at firms where total_debt is NaN.
df_mult0.loc[df_mult0.total_debt.isna()]

# Maybe we need to look by sector. Where are these firms with missing total_debt?
# Let's look at how many NaN we have in total_debt by sector.
# the first part is the same filter as above, then use the groupby() function on the variable 'sector'
df_mult0.loc[df_mult0.total_debt.isna()].groupby(['sector']).count()

# here is a nifty trick - combine two conditional statements: which firms have NaN total_debt and are NOT in the 'Financials' sector
# (note the parentheses around each conditional statement - this is important)
df_mult0.loc[(df_mult0.total_debt.isna()) & (df_mult0.sector != 'Financials')]

# since we feel comfortalbe that these are normal companies but that they just have 0 total_debt, let's fill the NaN with 0
df_mult0['total_debt'].fillna(value=0,inplace=True) # make sure you set the correct second parameter to update the DataFrame.
df_mult0.info()

# here is another trick, though you may not need it.
# what if a firm has EBIT but not EBITDA? We want to keep the firm, so we fill the EBITDA field with the EBIT, assuming that DA are 0 instead of NaN
#df_mult0['ebitda'].fillna(value=df_mult0['ebit'], inplace=True)

You are now a lot of the way there. The goal is to do this kind of analysis all the way down for all of your variables, finding why there is any odd data - negatives, NaN, zeros, etc. You can use filters, look at the data, summary stats, etc.

# lets put all of our conditions into a single index object, then use that to filter our dataframe
# do an internet search for "pandas dataframe slice by multiple conditions" if you want more info on what I'm doing here
# each of these create a boolean series using specific criteria.
idx_margin = (df_mult0.ebit_margin > 10) & (df_mult0.ebit_margin < 20)
idx_roc = (df_mult0.roc > 10) & (df_mult0.roc < 20)
idx_growth = (df_mult0.gRev1yr > 0) & (df_mult0.gRev1yr < 20)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx,['ebit_margin','roc','gRev1yr','beta2']].describe()

# now, compute the EV/EBITDA multiple. Or EV/EBIT. Or perhaps Price/Sales (MarketCap/Revenue).
# note: you may want to use np.where() if your denominator is zero. You'll see inf or NaN for your multiple in that case.
df_mult0['ev_ebitda'] = df_mult0['tev']/df_mult0['ebitda']
# check out your multiple
df_mult0[idx]['ev_ebitda'].describe()
#print(df_mult0[idx]['ev_ebitda']) #to see individual multiples

df_mult0[idx].sort_values(by='ev_ebitda')

# want to do a histogram?
#df_mult0[idx]['ev_ebitda'].hist()
#hist=df_mult0.hist(bins=5)
fig=plt.figure()
ax=fig.add_subplot(1,1,1,)
n, bins, patches=ax.hist(df_mult0[idx]['ev_ebitda'],bins=10,range=(0,100),histtype='bar')
for patch in patches:
patch.set_facecolor('g')
plt.title('EV/EBITDA Frequency Plot')
plt.xlabel('EV/EBITDA')
plt.ylabel('Frequency')

idx_margin = (df_mult0.ebit_margin > 0) & (df_mult0.ebit_margin < 10)
idx_roc = (df_mult0.roc > 0) & (df_mult0.roc < 10)
idx_growth = (df_mult0.gRev1yr < 0) & (df_mult0.gRev1yr > -20)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx,['ebit_margin','roc','gRev1yr','beta2']].describe()

# now, compute the EV/EBITDA multiple. Or EV/EBIT. Or perhaps Price/Sales (MarketCap/Revenue).
# note: you may want to use np.where() if your denominator is zero. You'll see inf or NaN for your multiple in that case.
df_mult0['ev_ebit'] = df_mult0['tev']/df_mult0['ebit']
# check out your multiple
df_mult0[idx]['ev_ebit'].describe()
#print(df_mult0[idx]['ev_ebit']) #to see individual multiples

df_mult0[idx].sort_values(by='ev_ebit')

# want to do a histogram?
#df_mult0[idx]['ev_ebit'].hist()
#hist=df_mult0.hist(bins=10)
fig=plt.figure()
ax=fig.add_subplot(1,1,1,)
n, bins, patches=ax.hist(df_mult0[idx]['ev_ebit'],bins=15,range=(0,100),histtype='bar')
for patch in patches:
patch.set_facecolor('r')
plt.title('EV/EBIT Frequency Plot')
plt.xlabel('EV/EBIT')
plt.ylabel('Frequency')

# We identified IntriCon Corporation (NasdaqGM:IIN) as the poorest performing company from our screen and
# are giving it a SELL recommendation. We came to this conclusion after observing it's negative growth rates,
# return on equity, no revenues, negative net income, high expenses, and extremely high mutliples.
# Stats: ROE = -2.74, Revenue 1 year growth = -9.45, EV/EBIT multiple = 2,147, Net Income = -2.52
# We would recommend selling IntriCon Corp bacause we expect a significant decline in the next several years as
# evidenced by the large negative 1-year growth rate of -9.45% and negative net income of -$2.52.

idx_margin = (df_mult0.ebit_margin > 20) & (df_mult0.ebit_margin < 30)
idx_roc = (df_mult0.roc > 20) & (df_mult0.roc < 30)
idx_growth = (df_mult0.gRev1yr > 0) & (df_mult0.gRev1yr < 30)
idx_beta = (df_mult0.beta2 > 0.8) & (df_mult0.beta2 < 1.2)
# these are all boolean conditions, so we just use AND to combine them
idx = idx_margin & idx_roc & idx_growth & idx_beta
# now we pass idx for the row portion, and the list of columns we want to see
df_mult0.loc[idx,['ebit_margin','roc','gRev1yr','beta2']].describe()

# now, compute the EV/EBITDA multiple. Or EV/EBIT. Or perhaps Price/Sales (MarketCap/Revenue).
# note: you may want to use np.where() if your denominator is zero. You'll see inf or NaN for your multiple in that case.
df_mult0['ev_ebit'] = df_mult0['tev']/df_mult0['ebit']
# check out your multiple
df_mult0[idx]['ev_ebit'].describe()
#print(df_mult0[idx]['ev_ebit']) #to see individual multiples

df_mult0[idx].sort_values(by='ev_ebit')

# want to do a histogram?
#df_mult0[idx]['ev_ebit'].hist()
#hist=df_mult0.hist(bins=10)
fig=plt.figure()
ax=fig.add_subplot(1,1,1,)
n, bins, patches=ax.hist(df_mult0[idx]['ev_ebit'],bins=15,range=(0,100),histtype='bar')
for patch in patches:
patch.set_facecolor('b')
plt.title('EV/EBIT Frequency Plot')
plt.xlabel('EV/EBIT')
plt.ylabel('Frequency')

# We identified Apple as the best performing company from our screen
# and are giving it a BUY recommendation. We came to this conclusion after observing it's positive ROE, high revenue growth rate of roughly 10%
# and an established, mature EV/EBIT multiple of 28.35. While these numbers are not the highest amongst its peers
# we felt that Apple is an established company and will continue to see growth through its innovation and technology
# Stats: ROE = 82.1, Revenue 1 year growth = 9.88, EV/EBIT multiple = 28.35, Net Income = 63,930