import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# import geopandas as gpd
# from shapely.geometry import Point
# from shapely import wkt
# import geopandas
# try:
#     from pymc3 import *
#     import pymc3 as pm
# except ImportError:
#     ! pip install pymc3
#     from pymc3 import *
#     import pymc3 as pm
# import arviz as az
from pymc3 import glm
# from scipy.stats import poisson, norm, gamma
%matplotlib inline
## Reading in the UNFILTERED Chronic Disease Indicator dataset.
us_cdi = pd.read_csv('us_cdi.csv')
#All values in the columns dropped below are null; the line here is a quick check that 'Response' has no non-null rows
us_cdi[us_cdi['Response'].notnull()]
us_cdi = us_cdi.drop(['Response','StratificationCategory2','Stratification2','StratificationCategory3','Stratification3','StratificationCategoryID2','StratificationID2','StratificationCategoryID3','StratificationID3','ResponseID'], axis=1)
us_cdi.head()
df = us_cdi.loc[us_cdi['Topic'] == 'Cardiovascular Disease']
df = df.loc[(df['Question'] == 'Mortality from total cardiovascular diseases') & (df['Topic'] == 'Cardiovascular Disease')]
df = df.loc[(df['DataValueType'] == 'Age-adjusted Rate')]
df = df.loc[(df['StratificationCategory1'] == 'Race/Ethnicity')]
df = df[df['DataValue'].notna()]
df = df.drop(['YearEnd', 'LocationAbbr', 'DataSource', 'Topic', 'Question', 'DataValueUnit', 'DataValueAlt', 'DataValueType', 'DataValueFootnoteSymbol','DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit', 'StratificationCategory1','GeoLocation', 'LocationID', 'TopicID', 'QuestionID', 'DataValueTypeID', 'StratificationCategoryID1','StratificationID1'], axis=1)
# df.head()
#Diagnosed Diabetes; Total; Adults Aged 18+ Years; Age-Adjusted Percentage; U.S. States; 2008-2018
diabetes = pd.read_csv('DiabetesAtlasData.csv')
diabetes_drop = diabetes.iloc[:-3, :]
diabetes_drop.isna().sum()
diabetes_drop.head()
## Insert Diabetes DF
diabetes_copy = diabetes_drop.copy() ## Copy diabetes rates without US Territories
diabetes_copy['State'] = diabetes_drop['State']
df2_diabetes = df2.copy() ## Copy Original DF (with race labels updated)
df2_diabetes['Diabetes Rate'] = np.nan ## Insert Empty Column for Diabetes
def diabetes_match(row):
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    return diabetes_copy[diabetes_copy['State'].str.contains(correct_state)][str(correct_date)].values[0]
df2_diabetes['Diabetes Rate'] = df2_diabetes.apply(lambda row: diabetes_match(row), axis = 1)
df2_diabetes.head()
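## Alternative sketch: the row-by-row apply above can also be expressed as one vectorized melt + merge.
## This assumes diabetes_copy has string year columns '2010'-'2018'; 'long_rates' and 'merged_check' are
## illustrative names, and unlike str.contains above this requires exact state-name matches.
long_rates = diabetes_copy.melt(id_vars='State', value_vars=[str(y) for y in range(2010, 2019)],
                                var_name='YearStart', value_name='Diabetes Rate')
long_rates['YearStart'] = long_rates['YearStart'].astype(int)
merged_check = df2.merge(long_rates, left_on=['LocationDesc', 'YearStart'],
                         right_on=['State', 'YearStart'], how='left')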
tobacco = pd.read_csv('The_Tax_Burden_on_Tobacco__1970-2019.csv', usecols=[ 1, 2, 5, 6, 7])
tobacco = tobacco[tobacco["SubMeasureDesc"] == "Cigarette Consumption (Pack Sales Per Capita)"]
yearly_consumption = tobacco.groupby('Year')['Data_Value'].sum()
yearly_consumption = yearly_consumption.reset_index(name = "Year Sum")
combine = tobacco.merge(yearly_consumption, how='left', on='Year')
tobacco_consumption = combine
tobacco_consumption['Tobacco Consumption Based on US Total'] = (tobacco_consumption['Data_Value'] / tobacco_consumption['Year Sum'])
tobacco_consumption = tobacco_consumption.sort_values(['Year', 'LocationDesc'], ascending = True)
tobacco_consumption = tobacco_consumption.drop(columns =['Year Sum', 'Data_Value', 'SubMeasureDesc', 'MeasureDesc' ])
tobacco_consumption = tobacco_consumption.pivot(index='LocationDesc', columns='Year', values='Tobacco Consumption Based on US Total').reset_index()
tobacco_consumption = tobacco_consumption.rename(columns={'LocationDesc':'State'})
tobacco_consumption = tobacco_consumption[['State', 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]]
tobacco_consumption.iloc[:,[1,2,3,4,5,6,7,8,9]] = tobacco_consumption.iloc[:,[1,2,3,4,5,6,7,8,9]]*100
tobacco_consumption.head()
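## Sanity check: each year's values were computed as state share of the US total (x100),
## so every year column should sum to roughly 100 across states
tobacco_consumption[[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]].sum()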
## Insert Tobacco into DF
tobacco_copy = tobacco_consumption.copy() ## Copy tobacco consump. rate
tobacco_copy['State'] = tobacco_consumption['State']
df3_tobacco = df2_diabetes.copy() ## Copy DF with added diabetes from before
df3_tobacco['Tobacco Consumption Rate'] = np.nan #Insert empty vals into col
def tobacco_match(row):
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    return tobacco_copy[tobacco_copy['State'].str.contains(correct_state)][correct_date].values[0]
df3_tobacco['Tobacco Consumption Rate'] = df3_tobacco.apply(lambda row: tobacco_match(row), axis = 1)
df3_tobacco.head()
#obesity rate by state 2011-2020
obesity = pd.read_csv('ExportCSV.csv')
obesity_clean = obesity[['YearStart','LocationDesc','Data_Value']]#,'Low_Confidence_Limit','High_Confidence_Limit']]
obesity_clean = obesity_clean.rename(columns={'Data_Value':'Obesity_Rate'})#,'Low_Confidence_Limit':'Obesity_Low_Conf_Limit','High_Confidence_Limit':'Obesity_High_Conf_Limit'})
obesity_clean2 = obesity_clean.copy()
# obesity_clean3 = obesity_clean2.copy()
obesity_clean2 = obesity_clean2.pivot(index='LocationDesc',columns = 'YearStart', values = 'Obesity_Rate').reset_index()
obesity_clean2 = obesity_clean2.rename(columns={'LocationDesc':'State'})
obesity_clean2 = obesity_clean2[['State', 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]]
obesity_clean2.head()
# Insert Obesity into DF
df4_obesity = df3_tobacco.copy() ## Copy
def obesity_match(row):
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # Look up the obesity rate in the wide state-by-year table built above
    return obesity_clean2[obesity_clean2['State'].str.contains(correct_state)][correct_date].values[0]
df4_obesity['Obesity Rate'] = df4_obesity.apply(lambda row: obesity_match(row), axis = 1)
df4_obesity.head()
#Cleaning Air Quality Data for Q1
daily10 = pd.read_csv('daily_88101_2010.csv')
daily11 = pd.read_csv('daily_88101_2011.csv')
daily12 = pd.read_csv('daily_88101_2012.csv')
daily13 = pd.read_csv('daily_88101_2013.csv')
daily14 = pd.read_csv('daily_88101_2014.csv')
daily15 = pd.read_csv('daily_88101_2015.csv')
daily16 = pd.read_csv('daily_88101_2016.csv')
daily17 = pd.read_csv('daily_88101_2017.csv')
daily18 = pd.read_csv('daily_88101_2018.csv')
daily18['Sample Duration'].value_counts()
#Cleaning PM2.5 data
#daily18[daily18['Sample Duration'] == '24 HOUR']
def get_year_average(df):
    #Columns kept: State Name, Parameter Name, Sample Duration, Pollutant Standard, Date Local, Arithmetic Mean, AQI
    df = df[df['Sample Duration'] == '24-HR BLK AVG'].copy()
    df = df[['State Name', 'Parameter Name', 'Sample Duration', 'Pollutant Standard', 'Date Local', 'Arithmetic Mean', 'AQI']]
    df['Year'] = pd.DatetimeIndex(df['Date Local']).year
    df = df.groupby('State Name').mean(numeric_only=True)  #averages Arithmetic Mean, AQI, and Year per state
    return df
annual18 = get_year_average(daily18)
annual17 = get_year_average(daily17)
annual16 = get_year_average(daily16)
annual15 = get_year_average(daily15)
annual14 = get_year_average(daily14)
annual13 = get_year_average(daily13)
annual12 = get_year_average(daily12)
annual11 = get_year_average(daily11)
annual10 = get_year_average(daily10)
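## The nine per-year state averages can be stacked into one long frame when needed; each frame already
## carries a 'Year' column from get_year_average, so a plain concat works ('annual_all' is an illustrative name)
annual_all = pd.concat([annual10, annual11, annual12, annual13, annual14,
                        annual15, annual16, annual17, annual18]).reset_index()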
# Read the population-by-sex file and build a dataframe from it
pop_sex = pd.read_csv('pop_sex.csv')
pop_sex.head()
sex_header = pop_sex.iloc[1]  #the second row holds the real column names
pop_sex_df = pd.DataFrame(pop_sex.values[2:], columns=sex_header)  #data begins after the header rows
pop_sex_df = pop_sex_df.drop(['2008__Male', '2008__Female', '2009__Male', '2009__Female', '2019__Male', '2019__Female', '2019__Total', 'Footnotes'], axis = 1)
pop_sex_df = pop_sex_df.drop(['2008__Total', '2009__Total', '2010__Total', '2011__Total', '2012__Total', '2013__Total', '2014__Total', '2015__Total', '2016__Total', '2017__Total', '2018__Total'], axis = 1)
pop_sex_df = pop_sex_df.sort_values('Location').dropna()
pop_sex_df_male = pop_sex_df[['Location', '2010__Male', '2011__Male', '2012__Male', '2013__Male', '2014__Male', '2015__Male', '2016__Male', '2017__Male', '2018__Male']]
pop_sex_df_male = pop_sex_df_male.rename(columns={'Location': 'State', '2010__Male': 2010, '2011__Male': 2011, '2012__Male':2012, '2013__Male':2013, '2014__Male':2014, '2015__Male':2015, '2016__Male':2016, '2017__Male':2017, '2018__Male':2018})
pop_sex_df_male
#Convert the year columns to numeric percentages, then impute missing values with each column's mean
for yr in range(2010, 2019):
    pop_sex_df_male[yr] = pd.to_numeric(pop_sex_df_male[yr], errors='coerce') * 100
    pop_sex_df_male[yr] = pop_sex_df_male[yr].fillna(pop_sex_df_male[yr].mean())
pop_sex_df_male.isna().sum()
## Insert Sex into DF
sex_copy = pop_sex_df_male.copy()
df4_sex = df4_obesity.copy()
df4_sex['Male Sex %'] = np.nan
def sex_match(row):
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    return sex_copy[sex_copy['State'].str.contains(correct_state)][correct_date].values[0]
df4_sex['Male Sex %'] = df4_sex.apply(lambda row: sex_match(row), axis = 1)
df4_sex.head()
# Table 23 extracted from XLSX file: Medicare Per Enrollee (Dollars per Person) + Cleaning
health23 = pd.read_csv('Table 23 Medicare Per Enrollee.csv')
health23.drop(health23.tail(2).index,inplace=True)
health23 = health23.iloc[: , :-2]
new_header23 = health23.iloc[0] #grab the first row for the header
health23 = health23[1:] #take the data less the header row
health23.columns = new_header23 #set the header row as the df header
health23_yr = health23.drop(['1991', '1992','1993', '1994', '1995',
'1996', '1997', '1998', '1999', '2000',
'2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009'], axis = 1)
health23_yr = health23_yr.iloc[:, : -1] #removes avg growth column
health23_yr = health23_yr.rename(columns={'Region/state of residence':'State'})
health23_yr.head()
#Add Medicare Expenditure as new column to make new df
medicare = health23_yr.copy()
# df4['Medicare Expenditure'] = np.nan
df4_medicare = df4_sex.copy()
df4_medicare['Medicare Expenditure'] = np.nan
medicare = medicare[medicare.columns[1:]].replace(r'[\$,]', '', regex=True).astype(float)
#estimates for 2015-2018 based on 1.7% growth rate annually in expenditures
medicare['2015'] = medicare['2014'] * 1.017
medicare['2016'] = medicare['2015'] * 1.017
medicare['2017'] = medicare['2016'] * 1.017
medicare['2018'] = medicare['2017'] * 1.017
#Restore the State column, which was dropped by the numeric conversion above
medicare['State'] = health23_yr['State']
medicare
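## Quick check of the compounding above (not in the original): the 2018 estimate should equal
## the 2014 figure grown at 1.7% for four consecutive years
assert np.allclose(medicare['2018'], medicare['2014'] * 1.017 ** 4)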
#match the year and state and grab the appropriate input
def match(row):
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    return medicare[medicare['State'].str.contains(correct_state)][str(correct_date)].values[0]
#apply function to get final column in medicare expenditure
df4_medicare['Medicare Expenditure'] = df4_medicare.apply(lambda row: match(row), axis=1)
df4_medicare.head()
Comparing GLMs with non-parametric methods: decision trees (DT) and random forests (RF)
from sklearn.model_selection import train_test_split
train, test = train_test_split(cleaned_data,test_size=.3, random_state = 4)
#If a Poisson distribution were a good fit for mortality rates, the mean and variance would be close.
#Since they are not, a Poisson distribution is NOT a good fit for age-adjusted mortality rates.
np.mean(cleaned_data.DataValue), np.var(cleaned_data.DataValue)
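## A concrete version of this check is the dispersion ratio (variance / mean): it is ~1 for Poisson data,
## and a value far above 1 signals the overdispersion that motivates the Negative Binomial models below
dispersion = np.var(cleaned_data.DataValue) / np.mean(cleaned_data.DataValue)
print("variance/mean dispersion ratio:", round(dispersion, 1))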
#Frequentist GLM - Poisson (race ONLY)
y = cleaned_data.DataValue.astype(float)
poisson_model_race = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]),
family=sm.families.Poisson()
)
poisson_results_race = poisson_model_race.fit()
print(poisson_results_race.summary())
#Again, the same problem: standard errors are implausibly low, so the model is too confident.
#We can conclude a Poisson distribution is not a good fit for this data.
#We will try this again with location once we use medicare expenditure, etc. as features
#Frequentist GLM - Poisson (race + medicare expenditure + year)
poisson_model_race_medicare_year = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic', 'Medicare Expenditure', 'YearStart']]),
family=sm.families.Poisson()
)
poisson_results_race_medicare_year = poisson_model_race_medicare_year.fit()
print(poisson_results_race_medicare_year.summary())
#Again: non-significant p-values, and this time overly wide confidence intervals; the log-likelihood & chi-squared statistics tell a similar story
#Frequentist GLM - Negative Binomial (race + year)
negbin_model_raceyear = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic', 'YearStart']]),
family=sm.families.NegativeBinomial()
)
negbin_results_raceyear = negbin_model_raceyear.fit()
print(negbin_results_raceyear.summary())
#Non-significant p-values, overly wide confidence intervals, and high standard errors
#Frequentist Model - Negative Binomial (race only)
negbin_model_race = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]),
family=sm.families.NegativeBinomial()
)
negbin_results_race = negbin_model_race.fit()
print(negbin_results_race.summary())
#The log-likelihood was okay: an average log-likelihood of about -6 per observation (log-likelihoods close to 0 are good).
#The chi-square statistic & deviance should be less than n - p, and they WERE:
#n - p = 1885 - 5 = 1880, but the deviance is about 123; since it is smaller, this indicates a better fit!
#However, the confidence intervals & standard errors are a bit TOO small, so the model might be overconfident
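## These diagnostics can be read straight off the fitted results object instead of the printed
## summary (standard statsmodels GLMResults attributes)
print("deviance:", negbin_results_race.deviance)
print("Pearson chi2:", negbin_results_race.pearson_chi2)
print("df_resid (n - p):", negbin_results_race.df_resid)
print("avg log-likelihood:", negbin_results_race.llf / negbin_results_race.nobs)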
#Frequentist Model - Negative Binomial (race + expenditure)
negbin_model_race_medicare = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic', 'Medicare Expenditure']]),
family=sm.families.NegativeBinomial()
)
negbin_results_race_medicare = negbin_model_race_medicare.fit()
print(negbin_results_race_medicare.summary())
#Similar picture for deviance / chi-squared / log-likelihood --> these seem good
#Medicare expenditure had very little impact; race seems to matter much more than
# state-level differences in medicare expenditure
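## Since the Negative Binomial GLM uses a log link, exponentiating the coefficients gives
## multiplicative rate ratios, which makes the race-vs-expenditure comparison direct
np.exp(negbin_results_race_medicare.params)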
#Frequentist Model - Negative Binomial (race + expenditure + year)
negbin_model_race_medicare_year = sm.GLM(
y, sm.add_constant(cleaned_data[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic', 'Medicare Expenditure', 'YearStart']]),
family=sm.families.NegativeBinomial()
)
negbin_results_race_medicare_year = negbin_model_race_medicare_year.fit()
print(negbin_results_race_medicare_year.summary())
#Similar picture for deviance / chi-squared / log-likelihood --> these seem good
#Medicare expenditure had very little impact, and the same goes for year.
#The confidence intervals are TOO wide, so this model was not good
#predicting mortality rate based on race
from sklearn.tree import DecisionTreeRegressor
race_tree_model = DecisionTreeRegressor()
X_train = train[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]
X_test = test[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]
y_train = train['DataValue']
y_test = test['DataValue']
race_tree_model.fit(X_train,y_train)
train["race_tree_pred"] = race_tree_model.predict(X_train)
test["race_tree_pred"] = race_tree_model.predict(X_test)
#RMSE for race predictor
train_rmse = np.mean((train["race_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["race_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
from sklearn.tree import plot_tree
plt.figure(figsize=(16, 4))
plot_tree(race_tree_model);
#predicting mortality rate based on medicare expenditure
from sklearn.tree import DecisionTreeRegressor
medicare_tree_model = DecisionTreeRegressor()
y_train = train['DataValue']
y_test = test['DataValue']
X_train = train[['Medicare Expenditure']]
X_test = test[['Medicare Expenditure']]
medicare_tree_model.fit(X_train,y_train)
train["medicare_tree_pred"] = medicare_tree_model.predict(X_train)
test["medicare_tree_pred"] = medicare_tree_model.predict(X_test)
#RMSE for expenditure predictor
train_rmse = np.mean((train["medicare_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
from sklearn.tree import plot_tree
plt.figure(figsize=(50, 50))
plot_tree(medicare_tree_model);
#closer look of nodes at the top
X_cols = ['Medicare Expenditure']
plt.figure(figsize=(12, 7))
plot_tree(medicare_tree_model, max_depth=2, fontsize=14, feature_names=X_cols);
#predicting mortality from race + expenditure
from sklearn.tree import DecisionTreeRegressor
medicare_race_tree_model = DecisionTreeRegressor()
y_train = train['DataValue']
y_test = test['DataValue']
X_train = train[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']]
X_test = test[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']]
medicare_race_tree_model.fit(X_train,y_train)
train["medicare_race_tree_pred"] = medicare_race_tree_model.predict(X_train)
test["medicare_race_tree_pred"] = medicare_race_tree_model.predict(X_test)
#RMSE for race + expenditure predictor
train_rmse = np.mean((train["medicare_race_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_race_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
from sklearn.tree import plot_tree
plt.figure(figsize=(30, 15))
plot_tree(medicare_race_tree_model);
#closer look at nodes
X_cols = ['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']
plt.figure(figsize=(20, 15))
plot_tree(medicare_race_tree_model, max_depth=7, fontsize=14, feature_names=X_cols);
from sklearn.ensemble import RandomForestRegressor
race_forest_model = RandomForestRegressor(max_features = 1)
X_train = train[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]
X_test = test[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic']]
y_train = train['DataValue']
y_test = test['DataValue']
race_forest_model.fit(X_train,y_train)
train_pred = race_forest_model.predict(X_train)
test_pred = race_forest_model.predict(X_test)
train["race_forest_pred"] = train_pred
test["race_forest_pred"] = test_pred
train_rmse = np.mean((train["race_forest_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["race_forest_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for random forest:", train_rmse)
print("Test set error for random forest: ", test_rmse)
from sklearn.ensemble import RandomForestRegressor
medicare_forest_model = RandomForestRegressor(max_features = 1)
X_train = train[['Medicare Expenditure']]
X_test = test[['Medicare Expenditure']]
y_train = train['DataValue']
y_test = test['DataValue']
medicare_forest_model.fit(X_train, y_train)
train_pred = medicare_forest_model.predict(X_train)
test_pred = medicare_forest_model.predict(X_test)
train["medicare_forest_pred"] = train_pred
test["medicare_forest_pred"] = test_pred
train_rmse = np.mean((train["medicare_forest_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_forest_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for random forest:", train_rmse)
print("Test set error for random forest: ", test_rmse)
## OLS Functions
def fit_OLS_model(df, target_variable, explanatory_variables, intercept=True):
    target = df[target_variable]
    inputs = df[explanatory_variables]
    if intercept:
        inputs = sm.add_constant(inputs)
    fitted_model = sm.OLS(target, inputs).fit()
    return fitted_model

def compute_OLS_predictions(input_array, input_params):
    #Simple linear prediction: intercept + slope * x
    predictions = input_params[0] + input_params[1] * input_array
    return predictions
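## Hypothetical usage of the helpers above (the explanatory variable chosen here is illustrative)
ols_model = fit_OLS_model(cleaned_data, 'DataValue', ['Medicare Expenditure'])
print(ols_model.summary())
ols_preds = compute_OLS_predictions(cleaned_data['Medicare Expenditure'], ols_model.params.values)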