# load two R packages
library(ggplot2) # for visualizing data
library(dplyr) # for manipulating data
# Read the introductory Glassdoor data set and preview its first rows
df_glassdoor <- read.csv("Data_Glassdoor_Intro.csv")
head(df_glassdoor)

# Inspect the raw data: the file is expected to hold 197 observations,
# with two integer variables and two character variables
str(df_glassdoor)
summary(df_glassdoor)

# Recode gender and jobtitle as factors, then inspect the data again to see
# how str() and summary() report them after the conversion
df_glassdoor <- df_glassdoor %>%
  mutate(
    gender = as.factor(gender),
    jobtitle = as.factor(jobtitle)
  )
str(df_glassdoor)
summary(df_glassdoor)
# Correlation between income and age at full precision
with(df_glassdoor, cor(income, age))

# The same correlation rounded to two decimal places
with(df_glassdoor, round(cor(income, age), digits = 2))

# Before plotting, resize figures so they fit on your screen
options(repr.plot.width = 6, repr.plot.height = 5)

# Pairs plot displaying the relationships between all variables;
# pairs of numeric variables display a correlation coefficient
library(GGally)
ggpairs(df_glassdoor)
# Basic scatterplot of income (in thousands) against age
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point()

# Same scatterplot with a fitted linear-model line and no confidence band
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# The appearance can be customized in many ways; this version colors the
# points and the line, adds a centered title, relabels both axes, and
# enlarges the text
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "green3") +
  labs(
    title = "Scatterplot with Regression Line",
    x = "Age",
    y = "Income (thousands)"
  ) +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5, size = 17)
  )
# Fit a linear regression of income on age and keep the fitted model
model_1 <- lm(income ~ age, data = df_glassdoor)

# Print the fitted model (call and coefficients), then the base-R summary
model_1
summary(model_1)

# summ() from jtools gives an alternative summary of the same model
library(jtools)
summ(model_1)
# Average income for each gender
df_glassdoor %>%
  group_by(gender) %>%
  summarise(Avg_income = mean(income))

# Average income, standard deviation, and group size for each job title
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarise(
    Avg_income = mean(income),
    Std.Dev. = sd(income),
    Sample_size = n()
  )

# Boxplots of income, one box per gender
ggplot(data = df_glassdoor, mapping = aes(x = gender, y = income)) +
  geom_boxplot()
# Regress income on each categorical predictor in turn. Each fit gets its own
# name so the gender model is not overwritten by the job-title model and both
# stay available for later comparison.
model_2 <- lm(income ~ gender, data = df_glassdoor)
summ(model_2)

model_3 <- lm(income ~ jobtitle, data = df_glassdoor)
summ(model_3)
### Enter your own code below
# Read the full Glassdoor data set and inspect its structure
df_gd <- read.csv("Data_Glassdoor_Full.csv")
str(df_gd)

# Change all character variables to factor variables in one step.
# across() with where() is the current dplyr idiom; it replaces the
# superseded scoped verb mutate_if().
df_gd <- df_gd %>%
  mutate(across(where(is.character), as.factor))
# Histogram of income (in thousands) using 30 bins, drawn with the classic
# theme, a centered title, and enlarged text
ggplot(data = df_gd, mapping = aes(x = income / 1000)) +
  geom_histogram(bins = 30, color = "black", fill = "green3") +
  labs(
    title = "Histogram of Income",
    x = "Income (thousands)",
    y = "Frequency"
  ) +
  theme_classic() +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5, size = 20)
  )
# Number of observations for each job title
df_gd %>%
  group_by(jobtitle) %>%
  summarise(Freq = n())

# Subsets of the data can be stored as new data frames: keep only the rows
# whose job title is one of the two listed, then inspect the result
df_subset <- df_gd %>%
  filter(jobtitle %in% c("Data Scientist", "Software Engineer"))
str(df_subset)