library(dplyr) # for manipulating data (group_by, filter, select, summarise, etc.)
library(ggplot2) # for visualizing data using the ggplot command
# note that dplyr and ggplot2 are part of a larger ecosystem of packages called `tidyverse`
# in the future we can simply run `library(tidyverse)` to load all dplyr and ggplot2
library(jtools) # summ() commands to display regression output
library(stargazer) # stargazer() command for displaying tables and regression output
library(GGally) # ggpairs() command
# convert scientific notation to numerals throughout the notebook
options(scipen=999)
# change plot size to 6in x 5in to fit the Deepnote window
options(repr.plot.width=6, repr.plot.height=5)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Please cite as:
Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
df_store <- read.csv("Data_Store24A_Class3.csv")
names(df_store)
str(df_store)
'data.frame': 75 obs. of 11 variables:
$ store : int 1 2 3 4 5 6 7 8 9 10 ...
$ sales : int 1060294 1619874 1099921 1053860 1227841 1703140 1809256 1378482 2113089 1080979 ...
$ profit : int 265014 424007 222735 210122 300480 469050 476355 361115 474725 278625 ...
$ mtenure : num 0 86.22 23.89 0 3.88 ...
$ ctenure : num 24.8 6.64 5.03 5.37 6.87 ...
$ pop : int 7535 8630 9695 2797 20335 16926 17754 20824 26519 16381 ...
$ comp : num 2.8 4.24 4.49 4.25 1.65 ...
$ visibility: int 3 4 3 4 2 3 2 4 2 4 ...
$ pedcount : int 3 3 3 2 5 4 5 3 4 3 ...
$ res : int 1 1 1 1 0 1 1 1 1 1 ...
$ hours24 : int 1 1 1 1 1 0 1 1 1 0 ...
summary(df_store)
round(cor(df_store),2)
model1 <- lm(profit ~ ctenure, data = df_store)
summ(model1)
summ(model1, confint=TRUE)
model1 <- lm(profit ~ mtenure, data = df_store)
summ(model1)
model2 <- lm(profit ~ ctenure, data = df_store)
summ(model2)
model3 <- lm(profit ~ mtenure + ctenure, data = df_store)
summ(model3)
model_1 <- lm(profit ~ mtenure + ctenure + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summ(model_1)
fit_profit3 <- lm(profit ~ mtenure + ctenure, data = df_store)
fit_profit4 <- lm(profit ~ pop + comp + visibility + pedcount + res + hours24,
data = df_store)
fit_profit5 <- lm(profit ~ mtenure + ctenure + pop + comp + visibility +
pedcount + res + hours24, data = df_store)
stargazer(fit_profit3, fit_profit4, fit_profit5, type = "text", digits = 2, report = ('vcstp*'))
===================================================================================
Dependent variable:
---------------------------------------------------------------
profit
(1) (2) (3)
-----------------------------------------------------------------------------------
mtenure 619.84 760.99
(166.70) (127.09)
t = 3.72 t = 5.99
p = 0.0004*** p = 0.0000***
ctenure 810.13 944.98
(543.22) (421.69)
t = 1.49 t = 2.24
p = 0.15 p = 0.03**
pop 2.74 3.67
(1.91) (1.47)
t = 1.43 t = 2.50
p = 0.16 p = 0.02**
comp -18,280.54 -25,286.89
(7,044.85) (5,491.94)
t = -2.59 t = -4.60
p = 0.02** p = 0.0001***
visibility 23,715.37 12,625.45
(11,750.75) (9,087.62)
t = 2.02 t = 1.39
p = 0.05** p = 0.17
pedcount 35,034.67 34,087.36
(11,641.29) (9,073.20)
t = 3.01 t = 3.76
p = 0.004*** p = 0.0004***
res 28,097.48 91,584.67
(47,921.39) (39,231.28)
t = 0.59 t = 2.33
p = 0.56 p = 0.03**
hours24 36,987.98 63,233.31
(25,214.59) (19,641.11)
t = 1.47 t = 3.22
p = 0.15 p = 0.002***
Constant 236,950.70 83,889.40 7,610.04
(13,124.73) (82,529.25) (66,821.99)
t = 18.05 t = 1.02 t = 0.11
p = 0.00*** p = 0.32 p = 0.91
-----------------------------------------------------------------------------------
Observations 75 75 75
R2 0.22 0.36 0.64
Adjusted R2 0.20 0.30 0.59
Residual Std. Error 80,212.74 (df = 72) 74,854.22 (df = 68) 56,965.49 (df = 66)
F Statistic 9.97*** (df = 2; 72) 6.26*** (df = 6; 68) 14.53*** (df = 8; 66)
===================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
anova(fit_profit4,fit_profit5)
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)")
# add a regression line to the plot using `geom_smooth` with the `lm` method
# this helps to visualize how well a straight line fits the observations
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)") +
geom_smooth(method = "lm", formula = y~x, se = FALSE, col = "purple3")
fit_mt_low <- lm(profit ~ mtenure + ctenure + pop + comp +
visibility + pedcount + res + hours24,
data = subset(df_store, mtenure < 45))
summ(fit_mt_low)
fit_mt_high <- lm(profit ~ mtenure + ctenure + pop + comp +
visibility + pedcount + res + hours24,
data = subset(df_store, mtenure >= 45))
summ(fit_mt_high)
# you can manually create a squared variable
df_store <- df_store %>% # this line updates the dataframe with the new variable
mutate(MT2 = mtenure^2) # the mutate function appends the new variable "MT2" to the dataframe
fit_nonlinear <- lm(profit ~ mtenure + MT2 +
ctenure + pop + comp + visibility + pedcount + res + hours24, data = df_store)
summ(fit_nonlinear)
#or you can use an R short-cut within the lm command to do the same thing:
fit_profit6 <- lm(profit ~ mtenure + I(mtenure^2) +
ctenure + pop + comp + visibility + pedcount + res + hours24, data = df_store)
summ(fit_profit6)
ggplot(df_store, aes(x = mtenure, y = profit / 1000)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", formula = y~x, se = FALSE, col = "purple3") +
geom_smooth(method = "lm", formula = y~x + I(x^2), se = FALSE, col = "green3") +
scale_x_continuous("Manager Tenure in Months, by Store",breaks=c(0,50,100,150,200)) +
xlab("Manager Tenure (in months)") +
ylab("Store Profit (in thousands)") +
theme_classic()
df_storeB <- read.csv("Data_Store24B_Class3.csv")
names(df_storeB)
summary(df_storeB)
# this uses the GGally package, already loaded above
ggpairs(df_storeB, columns= c("profit", "mtenure", "mgrskill", "servqual"))
model_7 <- lm(profit ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
mgrskill, data = df_storeB)
summ(model_7)
model_8 <- lm(servqual ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
mgrskill, data = df_storeB)
summ(model_8)
model_9 <- lm(profit ~ mtenure + I(mtenure^2) + ctenure +
pop + comp + visibility + pedcount + res + hours24 +
servqual + mgrskill, data = df_storeB)
summ(model_9)
model_1 <- lm(profit ~ mtenure + ctenure + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summary(model_1)
stargazer(df_store[c("mtenure", "ctenure")],
type = "text", digits = 2)
=========================================================
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
---------------------------------------------------------
mtenure 75 45.30 57.67 0 6.7 50.9 278
ctenure 75 13.93 17.70 0.89 4.39 17.22 114.15
---------------------------------------------------------
#df_store$mtenure.scaled <- (df_store$mtenure - mean(df_store$mtenure)) / sd(df_store$mtenure)
df_store <- df_store %>% mutate(mtenure.scaled = (mtenure - mean(mtenure)) / sd(mtenure))
df_store$mtenure.scaled1 <- scale(df_store$mtenure)
df_store$ctenure.scaled <- scale(df_store$ctenure)
stargazer(df_store[c("mtenure", "mtenure.scaled", "mtenure.scaled1")],
type = "text", digits = 2)
==============================================================
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
--------------------------------------------------------------
mtenure 75 45.30 57.67 0 6.7 50.9 278
mtenure.scaled 75 0.00 1.00 -0.79 -0.67 0.10 4.03
mtenure.scaled1 75 0.00 1.00 -0.79 -0.67 0.10 4.03
--------------------------------------------------------------
model_1_scaled <- lm(profit ~ mtenure.scaled + ctenure.scaled + pop +
comp + visibility + pedcount + res + hours24,
data = df_store)
summary(model_1_scaled)
stargazer(model_1, model_1_scaled, type = "text", digits = 2, report = ('vcstp*'))
==========================================================
Dependent variable:
----------------------------
profit
(1) (2)
----------------------------------------------------------
mtenure 760.99
(127.09)
t = 5.99
p = 0.0000***
ctenure 944.98
(421.69)
t = 2.24
p = 0.03**
mtenure.scaled 43,887.63
(7,329.23)
t = 5.99
p = 0.0000***
ctenure.scaled 16,723.76
(7,462.82)
t = 2.24
p = 0.03**
pop 3.67 3.67
(1.47) (1.47)
t = 2.50 t = 2.50
p = 0.02** p = 0.02**
comp -25,286.89 -25,286.89
(5,491.94) (5,491.94)
t = -4.60 t = -4.60
p = 0.0001*** p = 0.0001***
visibility 12,625.45 12,625.45
(9,087.62) (9,087.62)
t = 1.39 t = 1.39
p = 0.17 p = 0.17
pedcount 34,087.36 34,087.36
(9,073.20) (9,073.20)
t = 3.76 t = 3.76
p = 0.0004*** p = 0.0004***
res 91,584.67 91,584.67
(39,231.28) (39,231.28)
t = 2.33 t = 2.33
p = 0.03** p = 0.03**
hours24 63,233.31 63,233.31
(19,641.11) (19,641.11)
t = 3.22 t = 3.22
p = 0.002*** p = 0.002***
Constant 7,610.04 55,245.27
(66,821.99) (65,393.81)
t = 0.11 t = 0.84
p = 0.91 p = 0.41
----------------------------------------------------------
Observations 75 75
R2 0.64 0.64
Adjusted R2 0.59 0.59
Residual Std. Error (df = 66) 56,965.49 56,965.49
F Statistic (df = 8; 66) 14.53*** 14.53***
==========================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# BONUS code
# You can also scale individual variables directly within the linear model command
model_6 <- lm(profit ~ scale(mtenure) + scale(ctenure) +
pop + comp + visibility + pedcount + res + hours24,
data = df_store)
summ(model_6)
library(sjPlot)
plot_model(model_6)
Registered S3 methods overwritten by 'broom':
method from
tidy.glht jtools
tidy.summary.glht jtools
Learn more about sjPlot with 'browseVignettes("sjPlot")'.
# you can compare the graph above to a regression table
# display confidence intervals with `ci=TRUE`
# the confidence intervals below should correspond to those in the graph above
stargazer(model_6, type = "text", digits = 2, report = ('vcsp*'), ci=TRUE)
===============================================
Dependent variable:
---------------------------
profit
-----------------------------------------------
scale(mtenure) 43,887.63
(29,522.61, 58,252.65)
p = 0.0000***
scale(ctenure) 16,723.76
(2,096.91, 31,350.62)
p = 0.03**
pop 3.67
(0.79, 6.54)
p = 0.02**
comp -25,286.89
(-36,050.88, -14,522.89)
p = 0.0001***
visibility 12,625.45
(-5,185.96, 30,436.85)
p = 0.17
pedcount 34,087.36
(16,304.22, 51,870.50)
p = 0.0004***
res 91,584.67
(14,692.77, 168,476.60)
p = 0.03**
hours24 63,233.31
(24,737.43, 101,729.20)
p = 0.002***
Constant 55,245.27
(-72,924.25, 183,414.80)
p = 0.41
-----------------------------------------------
Observations 75
R2 0.64
Adjusted R2 0.59
Residual Std. Error 56,965.49 (df = 66)
F Statistic 14.53*** (df = 8; 66)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01