# Class1_Intro_Project

# Read the introductory Glassdoor data set into a data frame.
df_glassdoor <- read.csv("Data_Glassdoor_Intro.csv")

# Preview the first six rows.
head(df_glassdoor)

# Inspect the structure (column names and types).
# Note that this dataframe has 197 observations.
str(df_glassdoor)

# Before we start, run this command to resize figures so they fit on your
# screen (these repr.* options control plot size in the notebook).
options(repr.plot.width = 6, repr.plot.height = 5)

# Load the ggplot2 package (which is pre-installed in this Deepnote project).
library(ggplot2)

# Create a simple histogram of income.
ggplot(data = df_glassdoor, aes(x = income)) +
  geom_histogram()

# Histogram of income in thousands, with a title and axis labels.
ggplot(df_glassdoor, aes(x = income / 1000)) +
  geom_histogram() +
  labs(
    title = "Income Distribution",
    x = "Income (thousands)",
    y = "Frequency"
  )

# Polished histogram of income in thousands: 20 bins, green bars with black
# outlines, classic theme, centred 20-pt title, and 15-pt text elsewhere.
ggplot(df_glassdoor, aes(x = income / 1000)) +
  geom_histogram(bins = 20, color = "black", fill = "green3") +
  labs(
    title = "Income Distribution",
    x = "Income (thousands)",
    y = "Frequency"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    text = element_text(size = 15)
  )

# Here is a density plot, another way to visualize a distribution.
ggplot(df_glassdoor, aes(x = income)) +
  geom_density()

# This density plot groups the data by job title: one coloured curve per
# value of jobtitle, with the legend placed above the plot.
ggplot(df_glassdoor, aes(x = income, color = jobtitle)) +
  geom_density() +
  theme(legend.position = "top")

# Use a scatterplot to visualize the relationship between two variables.
ggplot(df_glassdoor, aes(x = age, y = income / 1000)) +
  # geom_point() is the command that draws the scatterplot
  geom_point() +
  xlab("Age") +
  ylab("Income (thousands)")

# Use a scatterplot to visualize the relationship between two variables.
# Segment the observations by a third variable, using color to do so.
ggplot(df_glassdoor, aes(x = age, y = income / 1000, color = gender)) +
  geom_point() +
  xlab("Age") +
  ylab("Income (thousands)") +
  theme(legend.position = "top")

# Use a scatterplot to visualize the relationship between two variables.
# Split the data into "facets", based on a third variable.
ggplot(df_glassdoor, aes(x = age, y = income / 1000)) +
  geom_point() +
  # facet_wrap() splits the plot into one panel ("facet") per value of gender
  facet_wrap(~gender) +
  xlab("Age") +
  ylab("Income (thousands)")

library(dplyr)

# Here are three different cuts of the data for mean income.

# 1. Overall mean income across all observations.
df_glassdoor %>%
  summarise("Avg Income" = mean(income))

# 2. Mean income within each job title.
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarise("Avg Income" = mean(income))

# 3. Mean income and mean age within each gender.
df_glassdoor %>%
  group_by(gender) %>%
  summarise("Avg Income" = mean(income), "Avg Age" = mean(age))

# Use filter() to choose a subset of observations (rows): here, people older
# than 45. Then report the row count, mean income, and income std deviation.
df_glassdoor %>%
  filter(age > 45) %>%
  summarise(
    "Frequency" = n(),
    "Avg Income" = mean(income),
    "Std Dev" = sd(income)
  )

# Choose a subset of observations (female Data Scientists), then display the
# first six rows using head().
df_glassdoor %>%
  filter(gender == "Female" & jobtitle == "Data Scientist") %>%
  head()

# Create a new dataframe called df_managers containing only Managers.
df_managers <- df_glassdoor %>%
  filter(jobtitle == "Manager")

# Preview the first six rows of the new dataframe.
head(df_managers)

# Use the new dataframe created above, df_managers, in a ggplot:
# blue scatter of income (thousands) against age, with a green fitted
# linear-regression line (no confidence band, since se = FALSE).
ggplot(df_managers, aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  theme_classic() +
  ggtitle("Scatterplot with Regression Line") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  theme(text = element_text(size = 14)) +
  xlab("Age of Manager") +
  ylab("Income (thousands)") +
  geom_smooth(method = "lm", se = FALSE, color = "green3")

# Read the full Glassdoor data set into its own data frame.
df_gd_full <- read.csv("Data_Glassdoor_Full.csv")

# Preview the first six rows.
head(df_gd_full)

# insert your own code here and below