# Load the introductory Glassdoor dataset from its CSV file
df_glassdoor <- read.csv(file = "Data_Glassdoor_Intro.csv")

# Preview the first few rows
head(df_glassdoor)

# Inspect the structure (column names and types);
# note that this dataframe has 197 observations
str(df_glassdoor)
# Run this first: it resizes figures so that they fit on your screen
options(
  repr.plot.width = 6,
  repr.plot.height = 5
)
# load the ggplot2 package (which is pre-installed in this Deepnote project)
library(ggplot2)
# A first, unadorned histogram of the income column
ggplot(data = df_glassdoor, mapping = aes(x = income)) +
  geom_histogram()
# Same histogram with income expressed in thousands,
# plus a plot title and explicit axis labels
ggplot(data = df_glassdoor, mapping = aes(x = income / 1000)) +
  geom_histogram() +
  labs(
    title = "Income Distribution",
    x = "Income (thousands)",
    y = "Frequency"
  )
# Styled version of the income histogram: 20 bins, black outlines with a
# green fill, the classic theme, a centered size-20 title and size-15 text
ggplot(data = df_glassdoor, mapping = aes(x = income / 1000)) +
  geom_histogram(bins = 20, fill = "green3", color = "black") +
  labs(
    title = "Income Distribution",
    x = "Income (thousands)",
    y = "Frequency"
  ) +
  theme_classic() + # applied before theme() so the tweaks below are kept
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    text = element_text(size = 15)
  )
# A density plot is another way to visualize the distribution of income
ggplot(data = df_glassdoor, mapping = aes(x = income)) +
  geom_density()
# Density of income drawn separately (one colored curve) per job title,
# with the legend placed above the plot
ggplot(
  data = df_glassdoor,
  mapping = aes(x = income, color = jobtitle)
) +
  geom_density() +
  theme(legend.position = "top")
# Scatterplot: shows how two variables (age and income) relate to each other
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point() + # geom_point() is what draws the scatterplot
  labs(x = "Age", y = "Income (thousands)")
# Scatterplot of age against income, with the points colored by a
# third variable (gender) to segment the observations
ggplot(
  data = df_glassdoor,
  mapping = aes(x = age, y = income / 1000, color = gender)
) +
  geom_point() +
  labs(x = "Age", y = "Income (thousands)") +
  theme(legend.position = "top")
# Scatterplot of age against income, split into separate panels ("facets")
# according to a third variable
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point() +
  facet_wrap(~gender) + # one facet per value of gender
  labs(x = "Age", y = "Income (thousands)")
library(dplyr)
# Three different cuts of the data for mean income

# 1. Overall mean income across every row
df_glassdoor %>%
  summarise(
    "Avg Income" = mean(income)
  )

# 2. Mean income within each job title
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarise(
    "Avg Income" = mean(income)
  )

# 3. Mean income and mean age within each gender
df_glassdoor %>%
  group_by(gender) %>%
  summarise(
    "Avg Income" = mean(income),
    "Avg Age" = mean(age)
  )
# use filter() to choose a subset of observations (rows)
df_glassdoor %>%
  filter(age > 45) %>% # keep only the rows where age exceeds 45
  summarise(
    "Frequency" = n(), # number of rows remaining after the filter
    "Avg Income" = mean(income),
    "Std Dev" = sd(income)
  )
# Keep only the rows matching both conditions, then show the first six
# rows with head(); filter() combines comma-separated conditions with AND
df_glassdoor %>%
  filter(
    gender == "Female",
    jobtitle == "Data Scientist"
  ) %>%
  head()
# Build a new dataframe, df_managers, holding only the rows whose
# jobtitle is "Manager"
df_managers <- filter(df_glassdoor, jobtitle == "Manager")

# Preview the first few rows of the new dataframe
head(df_managers)
# Plot the df_managers dataframe created above: blue age-vs-income points
# with a green fitted line (method = "lm", no standard-error band) on top
ggplot(data = df_managers, mapping = aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "green3") +
  labs(
    title = "Scatterplot with Regression Line",
    x = "Age of Manager",
    y = "Income (thousands)"
  ) +
  theme_classic() + # applied before theme() so the tweaks below are kept
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    text = element_text(size = 14)
  )
# Load the full Glassdoor dataset from its CSV file and preview the first rows
df_gd_full <- read.csv(file = "Data_Glassdoor_Full.csv")
head(df_gd_full)
# insert your own code here and below