# load the two core R packages (GGally and jtools are loaded later, where first used)
library(ggplot2)  # for visualizing data
library(dplyr)    # for manipulating data
# Import the introductory Glassdoor data set and preview its first rows.
df_glassdoor <- read.csv(file = "Data_Glassdoor_Intro.csv")
head(df_glassdoor)

# Inspect the raw import.
# note that this dataframe has 197 observations
# it contains two integer variables and two character variables
str(df_glassdoor)
summary(df_glassdoor)

# Convert gender and jobtitle to factors in a single assignment, then inspect
# the structure and summary again to see the effect of the conversion.
df_glassdoor[c("gender", "jobtitle")] <- lapply(
  df_glassdoor[c("gender", "jobtitle")],
  as.factor
)
str(df_glassdoor)
summary(df_glassdoor)
# Correlation between income and age (cor() defaults to Pearson).
with(df_glassdoor, cor(income, age))
# Same correlation, rounded to two decimal places.
with(df_glassdoor, round(cor(income, age), digits = 2))
# GGally supplies ggpairs().
library(GGally)

# Resize figures before plotting so they fit on the screen.
# NOTE(review): the repr.plot.* options look like notebook display settings;
# confirm they have an effect in your environment.
options(repr.plot.width = 6, repr.plot.height = 5)

# Pairs plot showing the relationships between all variables; panels for
# pairs of numeric variables display a correlation coefficient.
ggpairs(data = df_glassdoor)
# Basic scatterplot of income (in thousands) against age.
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point()

# Same scatterplot with a fitted linear regression line and no
# confidence band (se = FALSE).
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# The appearance can be customized in many ways; this version sets point and
# line colors, a centered title, axis labels, and larger text.
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "green3") +
  labs(
    title = "Scatterplot with Regression Line",
    x = "Age",
    y = "Income (thousands)"
  ) +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5, size = 17)
  )
# Simple linear regression of income on age, stored as model_1.
model_1 <- lm(formula = income ~ age, data = df_glassdoor)

# Printing the fitted object shows the call and the coefficients.
model_1

# Full regression table from base R.
summary(model_1)

# jtools supplies summ(), an alternative model summary.
library(jtools)
summ(model_1)
# Average income for each gender.
df_glassdoor %>%
  group_by(gender) %>%
  summarise(Avg_income = mean(income))

# Average income, standard deviation, and group size for each job title.
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarise(
    Avg_income = mean(income),
    Std.Dev. = sd(income),
    Sample_size = n()
  )

# Side-by-side boxplots of income by gender.
ggplot(data = df_glassdoor, mapping = aes(x = gender, y = income)) +
  geom_boxplot()
# Regression of income on gender (a factor predictor).
model_2 <- lm(formula = income ~ gender, data = df_glassdoor)
summ(model_2)

# Regression of income on job title. The name model_2 is reused here, so the
# gender model above is no longer available after this assignment.
model_2 <- lm(formula = income ~ jobtitle, data = df_glassdoor)
summ(model_2)
### Enter your own code below
# Import the full Glassdoor data set and inspect its structure.
df_gd <- read.csv("Data_Glassdoor_Full.csv")
str(df_gd)

# Convert every character column to a factor in one step. across() with
# where() replaces the superseded scoped verb mutate_if(); the result is the
# same, with non-character columns left untouched.
df_gd <- df_gd %>%
  mutate(across(where(is.character), as.factor))
# Histogram of income (in thousands) using 30 bins, drawn with the classic
# theme, a centered title, and enlarged text.
ggplot(data = df_gd, mapping = aes(x = income / 1000)) +
  geom_histogram(bins = 30, color = "black", fill = "green3") +
  theme_classic() +
  labs(
    title = "Histogram of Income",
    x = "Income (thousands)",
    y = "Frequency"
  ) +
  theme(
    text = element_text(size = 15),
    plot.title = element_text(hjust = 0.5, size = 20)
  )
# Number of rows for each job title, reported in a column named Freq.
df_gd %>%
  group_by(jobtitle) %>%
  summarise(Freq = n())
# You can create new dataframes containing subsets of the data. Here only the
# rows for two job titles are kept; %in% tests membership in the set directly
# instead of chaining == comparisons with |, and selects the same rows.
# NOTE(review): if jobtitle is a factor, the subset still carries every
# original level; add droplevels() if unused levels should be removed.
df_subset <- df_gd %>%
  filter(jobtitle %in% c("Data Scientist", "Software Engineer"))
str(df_subset)