suppressPackageStartupMessages({
library(dplyr)
library(ggformula)
library(mosaic)
library(supernova)
library(lsr)})
#load data
EAMMIdata <- read.csv(file = "https://bit.ly/2NGiyrS", header = TRUE)
head(EAMMIdata)
#The variables we'll use are age and Political_views
#The following code discard all the rows that do not have valid values in the attributes age and Political_views
EAMMIdata <- filter(EAMMIdata,age!="NA")
EAMMIdata <- filter(EAMMIdata,Political_views!="NA")
#check if there is any missing data in age and Political_views
favstats(~Political_views,data=EAMMIdata)
favstats(~age,data=EAMMIdata)
#obtain the favstats of Political_views
EAMMI.stats = favstats(~ Political_views, data = EAMMIdata)
#generate a histogram
gf_histogram(~Political_views, data = EAMMIdata, fill = "pink", color = "black", bins = "7") %>%
#draw a vertical line on the mean
gf_vline(xintercept = EAMMI.stats$mean, color = "blue")
#generate a point plot
gf_point(Political_views~age,data=EAMMIdata) %>%
#draw a horizontal line to emulate the simple model
gf_hline(yintercept=EAMMI.stats$mean, color= "blue")
Warning message:
“geom_vline(): Ignoring `mapping` because `xintercept` was provided.”
Warning message:
“geom_hline(): Ignoring `mapping` because `yintercept` was provided.”
#fit a simple model
Empty.model <- lm(Political_views ~ NULL, data= EAMMIdata)
Empty.model
#Make the ANOVA table
anova(Empty.model)
EAMMI.stats <- favstats(~ Political_views, data= EAMMIdata)
#obtain 1000 means from samples generated from a normal distribution with means and standard deviations that are the same as our original sample
sampled_b0s <- do(1000) * mean(rnorm(EAMMI.stats$n, EAMMI.stats$mean, EAMMI.stats$sd))
head(sampled_b0s)
gf_histogram(~mean, data=sampled_b0s)
#run favstats on the sampling distribution
sampled_b0s.stats <- favstats(sampled_b0s$mean)
sampled_b0s.stats
#obtain sum of squared errors of the sampling distribution
anova(lm(sampled_b0s$mean~NULL))
confint(Empty.model)
#sort the values of the sampling distribution of the mean
sampled_b0s <- arrange(sampled_b0s,mean)
n=sampled_b0s.stats$n
#Higher bound based on our sampling distribution
sampled_b0s$mean[0.975*n]
#Lower bound based on our sampling distribution
sampled_b0s$mean[0.025*n]
#divide age into 3 equally sized groups
EAMMIdata$agefactor <- ntile(EAMMIdata$age, 3)
#recode 1 as young, 2 as middle, 3 as old
EAMMIdata$agefactor <- factor(EAMMIdata$agefactor, levels =c(1,2,3), labels = c("young", "middle","old"))
#sanity check to see if code works as intended
tally(EAMMIdata$agefactor)
head(select(EAMMIdata,age,agefactor))
#Create a jitter plot
gf_jitter(Political_views ~ agefactor, data = EAMMIdata)
#box plot
gf_boxplot(Political_views ~ agefactor, data =EAMMIdata)
#fit a linear model with agefactor as the explanatory variable
Political.model <- lm(Political_views ~ agefactor, data = EAMMIdata)
Political.model
#obtain the anova table
anova(Political.model)
supernova(Political.model)
#create 1000 linear models, each from a resampled data
resampled_models <- do(1000) * lm(Political_views~agefactor, data=resample(EAMMIdata,1723))
#see what it looks like
head(resampled_models)
#we see that b0,b1,b2 are stored in columns
#visualization of the three sampling distributions
gf_histogram(~resampled_models$Intercept)
gf_histogram(~resampled_models$agefactormiddle)
gf_histogram(~resampled_models$agefactorold)
#gf_histogram(~resampled_models$F)
#run favstats on the sampling distributions
favstats(~resampled_models$Intercept) #b0
favstats(~resampled_models$agefactormiddle) #b1
favstats(~resampled_models$agefactorold) #b2
#construct confident intervals for the model
confint(Political.model)
supernova(Political.model)
#point plot
gf_jitter(Political_views ~ age, data = EAMMIdata) %>%
#create a line to emulate the linear model
gf_lm(color = "orange")
#fit a linear model for the two continuous variables
quant.model <- lm(Political_views ~ age, data = EAMMIdata)
quant.model
anova(quant.model)
#obtain 1000 b1s by resampling our original sample
SDoB1 <- do(1000)* b1(lm(Political_views ~ age, data = resample(EAMMIdata,1724)))
head(SDoB1)
#create a histogram
gf_histogram(~b1, data = SDoB1)
favstats(SDoB1$b1)
#run confident interval on the model
confint(quant.model)
#obtain anova table for model comparison
supernova(quant.model)