# Class2_Stats_Project

# Load packages ----
library(ggplot2) # for visualizing data
library(dplyr)   # for manipulating data
library(GGally)  # for the ggpairs() pairs plot used below

# Read in the data ----
df_glassdoor <- read.csv("Data_Glassdoor_Intro.csv")

# Peek at the first rows
head(df_glassdoor)

# note that this dataframe has 197 observations
# it contains two integer variables and two character variables
str(df_glassdoor)

summary(df_glassdoor)

# Convert gender and jobtitle to factors so that str() and summary()
# report their levels and per-level counts
df_glassdoor$gender <- as.factor(df_glassdoor$gender)
df_glassdoor$jobtitle <- as.factor(df_glassdoor$jobtitle)

# Re-inspect the data after the conversion
str(df_glassdoor)
summary(df_glassdoor)

# Correlation between income and age ----
cor(df_glassdoor$income, df_glassdoor$age)

# run the same correlation but round to two decimal places
round(cor(df_glassdoor$income, df_glassdoor$age), digits = 2)

# Pairs plot ----
# create a pairs plot displaying the relationships between all variables
# pairs of numeric variables will display a correlation coefficient
# before we start, run this command to resize figures so they fit on your screen
options(repr.plot.width = 6, repr.plot.height = 5)
ggpairs(df_glassdoor)

# Scatterplots of income against age ----
# income is divided by 1000 so the y axis is expressed in thousands

# use ggplot to produce a basic scatterplot
ggplot(df_glassdoor, aes(x = age, y = income / 1000)) +
  geom_point()

# same scatterplot with a fitted linear regression line
# (se = FALSE suppresses the confidence band around the line)
ggplot(df_glassdoor, aes(x = age, y = income / 1000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# you can customize the appearance of the scatterplot in many ways
# here are a few examples
ggplot(df_glassdoor, aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  ggtitle("Scatterplot with Regression Line") +
  xlab("Age") +
  ylab("Income (thousands)") +
  theme(plot.title = element_text(hjust = 0.5, size = 17)) + # centered title
  theme(text = element_text(size = 15)) +
  geom_smooth(method = "lm", se = FALSE, color = "green3")

# Simple linear regression of income on age ----
# calling lm() without assigning it just prints the fitted coefficients
lm(income ~ age, data = df_glassdoor)

# store the fitted model so it can be summarized
model_1 <- lm(income ~ age, data = df_glassdoor)

summary(model_1)

# jtools provides summ(), an alternative regression summary
library(jtools)

summ(model_1)

# Income by group ----
# average income for each gender
df_glassdoor %>%
  group_by(gender) %>%
  summarize(Avg_income = mean(income))

# average, standard deviation and sample size of income for each job title
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarize(
    Avg_income = mean(income),
    Std.Dev. = sd(income),
    Sample_size = n()
  )

# boxplot of income for each gender
ggplot(df_glassdoor, aes(x = gender, y = income)) +
  geom_boxplot()

# Regression with a categorical predictor ----
# income explained by gender
model_2 <- lm(income ~ gender, data = df_glassdoor)
summ(model_2)

# income explained by job title
# NOTE: this reuses the name model_2, so the gender model above is replaced
model_2 <- lm(income ~ jobtitle, data = df_glassdoor)
summ(model_2)

### Enter your own code below

# Read in the full Glassdoor data ----
df_gd <- read.csv("Data_Glassdoor_Full.csv")

str(df_gd)

# here is a way to change all character variables to factor variables
# (across() + where() is the current dplyr replacement for mutate_if())
df_gd <- df_gd %>%
  mutate(across(where(is.character), as.factor))

# create a histogram of Income
# income is divided by 1000 so the x axis is expressed in thousands
ggplot(df_gd, aes(x = income / 1000)) +
  geom_histogram(bins = 30, color = "black", fill = "green3") +
  theme_classic() +
  ggtitle("Histogram of Income") +
  theme(plot.title = element_text(hjust = 0.5, size = 20)) + # centered title
  theme(text = element_text(size = 15)) +
  xlab("Income (thousands)") +
  ylab("Frequency")

# number of observations for each job title
df_gd %>%
  group_by(jobtitle) %>%
  summarise(Freq = n())

# you can create new dataframes containing subsets of the data
# here only the rows for two job titles are kept
df_subset <- df_gd %>%
  filter(jobtitle %in% c("Data Scientist", "Software Engineer"))

str(df_subset)