library(dplyr) # for manipulating data (group_by, filter, select, summarise, etc.)
library(ggplot2) # for visualizing data using the ggplot command
# note that dplyr and ggplot2 are part of a larger ecosystem of packages called `tidyverse`
# in the future we can simply run `library(tidyverse)` to load all dplyr and ggplot2
library(jtools) # summ() commands to display regression output
library(stargazer) # stargazer() command for displaying tables and regression output
library(GGally) # ggpairs() command
# convert scientific notation to numerals throughout the notebook
options(scipen=999)
# change plot size to 6in x 5in to fit the Deepnote window
options(repr.plot.width=6, repr.plot.height=5)
df_store <- read.csv("Data_Store24A_Class3.csv")
str(df_store)
summary(df_store)
round(cor(df_store),2)
model1 <- lm(profit ~ ctenure, data = df_store)
summ(model1)
summ(model1, confint=TRUE)
model1 <- lm(profit ~ mtenure, data = df_store)
summ(model1)
model2 <- lm(profit ~ ctenure, data = df_store)
summ(model2)
model3 <- lm(profit ~ mtenure + ctenure, data = df_store)
summ(model3)
model_1 <- lm(profit ~ mtenure + ctenure + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summ(model_1)
fit_profit3 <- lm(profit ~ mtenure + ctenure, data = df_store)
fit_profit4 <- lm(profit ~ pop + comp + visibility + pedcount + res + hours24,
data = df_store)
fit_profit5 <- lm(profit ~ mtenure + ctenure + pop + comp + visibility +
pedcount + res + hours24, data = df_store)
stargazer(fit_profit3, fit_profit4, fit_profit5, type = "text", digits = 2, report = ('vcstp*'))
anova(fit_profit4,fit_profit5)
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)")
# add a regression line to the plot using `geom_smooth` with the `lm` method
# this helps to visualize how well a straight line fits the observations
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)") +
geom_smooth(method = "lm", formula = y~x, se = FALSE, col = "purple3")
fit_mt_low <- lm(profit ~ mtenure + ctenure + pop + comp +
visibility + pedcount + res + hours24,
data = subset(df_store, mtenure < 45))
summ(fit_mt_low)
fit_mt_high <- lm(profit ~ mtenure + ctenure + pop + comp +
visibility + pedcount + res + hours24,
data = subset(df_store, mtenure >= 45))
summ(fit_mt_high)
# you can manually create a squared variable
df_store <- df_store %>% # this line updates the dataframe with the new variable
mutate(MT2 = mtenure^2) # the mutate function appends the new variable "MT2" to the dataframe
fit_nonlinear <- lm(profit ~ mtenure + MT2 +
ctenure + pop + comp + visibility + pedcount + res + hours24, data = df_store)
summ(fit_nonlinear)
#or you can use an R short-cut within the lm command to do the same thing:
fit_profit6 <- lm(profit ~ mtenure + I(mtenure^2) +
ctenure + pop + comp + visibility + pedcount + res + hours24, data = df_store)
summ(fit_profit6)
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", formula = y~x, se = FALSE, col = "purple3") +
geom_smooth(method = "lm", formula = y~x + I(x^2), se = FALSE, col = "green3") +
scale_x_continuous("Manager Tenure in Months, by Store",breaks=c(0,50,100,150,200)) +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)") +
theme_classic()
df_storeB <- read.csv("Data_Store24B_Class3.csv")
names(df_storeB)
summary(df_storeB)
# this uses the GGally package, already loaded above
ggpairs(df_storeB, columns= c("profit", "mtenure", "mgrskill", "servqual"))
model_7 <- lm(profit ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
mgrskill, data = df_storeB)
summ(model_7)
model_8 <- lm(servqual ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
mgrskill, data = df_storeB)
summ(model_8)
model_9 <- lm(profit ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
servqual + mgrskill, data = df_storeB)
summ(model_9)
model_1 <- lm(profit ~ mtenure + ctenure + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summary(model_1)
stargazer(df_store[c("mtenure", "ctenure")],
type = "text", digits = 2)
#df_store$mtenure.scaled <- (df_store$mtenure - mean(df_store$mtenure)) / sd(df_store$mtenure)
df_store <- df_store %>% mutate(mtenure.scaled = (mtenure - mean(mtenure)) / sd(mtenure))
df_store$mtenure.scaled1 <- scale(df_store$mtenure)
df_store$ctenure.scaled <- scale(df_store$ctenure)
stargazer(df_store[c("mtenure", "mtenure.scaled", "mtenure.scaled1")],
type = "text", digits = 2)
model_1_scaled <- lm(profit ~ mtenure.scaled + ctenure.scaled + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summary(model_1_scaled)
stargazer(model_1, model_1_scaled, type = "text", digits = 2, report = ('vcstp*'))
# BONUS code
# You can also scale individual variables directly within the linear model command
model_6 <- lm(profit ~ scale(mtenure) + scale(ctenure) +
pop + comp + visibility + pedcount + res + hours24,
data = df_store)
summ(model_6)
library(sjPlot)
plot_model(model_6)
# you can compare the graph above to a regression table
# display confidence intervals with `ci=TRUE`
# the confidence intervals below should correspond to those in the graph above
stargazer(model_6, type = "text", digits = 2, report = ('vcsp*'), ci=TRUE)