# load two R packages
library(ggplot2) # for visualizing data
library(dplyr) # for manipulating data
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#>     intersect, setdiff, setequal, union
# Import the introductory Glassdoor sample (path is relative to the working directory)
df_glassdoor <- read.csv(file = "Data_Glassdoor_Intro.csv")
# Preview the first rows
head(x = df_glassdoor)
# Inspect the structure: 197 observations of four variables,
# two stored as integers and two as character strings
str(object = df_glassdoor)
#> 'data.frame': 197 obs. of 4 variables:
#>  $ income  : int 92067 74523 113252 96355 87121 70643 89415 71105 71193 59212 ...
#>  $ age     : int 30 22 45 29 32 21 33 24 49 18 ...
#>  $ gender  : chr "Female" "Female" "Female" "Male" ...
#>  $ jobtitle: chr "Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
# Summary statistics while gender and jobtitle are still character columns
summary(object = df_glassdoor)
# Convert both categorical columns to factors in a single assignment
df_glassdoor[c("gender", "jobtitle")] <-
  lapply(df_glassdoor[c("gender", "jobtitle")], as.factor)
# Re-inspect the data frame now that the two columns are factors
str(object = df_glassdoor)
summary(object = df_glassdoor)
#> 'data.frame': 197 obs. of 4 variables:
#>  $ income  : int 92067 74523 113252 96355 87121 70643 89415 71105 71193 59212 ...
#>  $ age     : int 30 22 45 29 32 21 33 24 49 18 ...
#>  $ gender  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 1 1 1 1 2 ...
#>  $ jobtitle: Factor w/ 2 levels "Data Scientist",..: 1 1 1 1 1 1 1 1 1 1 ...
# Correlation between income and age
with(df_glassdoor, cor(income, age))
# The same correlation, rounded to two decimal places
with(df_glassdoor, round(cor(income, age), digits = 2))
# Resize figures so they fit on your screen (run this before plotting)
options(repr.plot.width = 6, repr.plot.height = 5)
# Pairs plot displaying the relationships between all variables;
# pairs of numeric variables display a correlation coefficient
library(GGally)
ggpairs(data = df_glassdoor)
#> Registered S3 method overwritten by 'GGally':
#>   method from
#>   +.gg   ggplot2
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Basic scatterplot of income (in thousands) against age
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point()
# Same scatterplot with a fitted linear-model line and no confidence band
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
#> `geom_smooth()` using formula 'y ~ x'
# The appearance of the scatterplot can be customized in many ways;
# this example sets point and line colors, a centered title,
# axis labels and font sizes
ggplot(data = df_glassdoor, mapping = aes(x = age, y = income / 1000)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "green3") +
  labs(
    title = "Scatterplot with Regression Line",
    x = "Age",
    y = "Income (thousands)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 17),
    text = element_text(size = 15)
  )
#> `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of income on age, stored as model_1
model_1 <- lm(formula = income ~ age, data = df_glassdoor)
# Printing the fitted object shows the call and the estimated coefficients
model_1
# Base-R regression summary
summary(object = model_1)
# jtools provides summ(), an alternative regression summary
library(jtools)
summ(model_1)
# Mean income for each gender
df_glassdoor %>%
  group_by(gender) %>%
  summarise(Avg_income = mean(income))
# Mean, standard deviation and group size of income for each job title
df_glassdoor %>%
  group_by(jobtitle) %>%
  summarise(
    Avg_income = mean(income),
    Std.Dev. = sd(income),
    Sample_size = n()
  )
# Boxplots of income, one box per gender
ggplot(data = df_glassdoor, mapping = aes(x = gender, y = income)) +
  geom_boxplot()
# Regress income on gender. The fit is stored under its own name so that the
# job-title regression below does not overwrite it.
model_gender <- lm(income ~ gender, data = df_glassdoor)
summ(model_gender)
# Regress income on job title; this fit is the one held in model_2.
model_2 <- lm(income ~ jobtitle, data = df_glassdoor)
summ(model_2)
### Enter your own code below
# Load the full Glassdoor data set (1000 observations of 15 variables)
# and inspect its structure
df_gd <- read.csv(file = "Data_Glassdoor_Full.csv")
str(object = df_gd)
#> 'data.frame': 1000 obs. of 15 variables:
#>  $ jobtitle   : chr "Graphic Designer" "Software Engineer" "Warehouse Associate" "Software Engineer" ...
#>  $ gender     : chr "Female" "Male" "Female" "Male" ...
#>  $ age        : int 18 21 19 20 26 20 20 18 33 35 ...
#>  $ performance: int 5 5 4 5 5 5 5 4 5 5 ...
#>  $ education  : chr "College" "College" "PhD" "Masters" ...
#>  $ department : chr "Operations" "Management" "Administration" "Sales" ...
#>  $ seniority  : int 2 5 5 4 5 4 4 5 5 5 ...
#>  $ income     : int 42363 108476 90208 108080 99464 70890 67585 97523 112976 106524 ...
#>  $ bonus      : int 9938 11128 9268 10154 9319 10126 10541 10240 9836 9941 ...
#>  $ pay        : int 52301 119604 99476 118234 108783 81016 78126 107763 122812 116465 ...
#>  $ female     : int 1 0 1 0 0 1 1 0 1 1 ...
#>  $ HIghSchool : int 0 0 0 0 0 0 0 0 1 0 ...
#>  $ College    : int 1 1 0 0 0 0 1 0 0 1 ...
#>  $ Masters    : int 0 0 0 1 1 0 0 0 0 0 ...
#>  $ PhD        : int 0 0 0 0 0 0 0 0 0 0 ...
# Convert every character column of df_gd to a factor in one step.
# across(where(...)) is the current dplyr idiom and replaces the superseded
# mutate_if(); non-character columns are left untouched.
df_gd <- df_gd %>%
  mutate(across(where(is.character), as.factor))
# Histogram of income (in thousands) using 30 bins
ggplot(data = df_gd, mapping = aes(x = income / 1000)) +
  geom_histogram(bins = 30, color = "black", fill = "green3") +
  labs(
    title = "Histogram of Income",
    x = "Income (thousands)",
    y = "Frequency"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 20),
    text = element_text(size = 15)
  )
# Number of observations for each job title
df_gd %>%
  group_by(jobtitle) %>%
  summarise(Freq = n())
# New data frames can hold subsets of the data;
# this one keeps only the rows for the two listed job titles
df_subset <- df_gd %>%
  filter(jobtitle %in% c("Data Scientist", "Software Engineer"))
# Inspect the structure of the subset
str(object = df_subset)
#> 'data.frame': 216 obs. of 15 variables:
#>  $ jobtitle   : Factor w/ 10 levels "Data Scientist",..: 9 9 9 1 9 9 9 1 1 9 ...
#>  $ gender     : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 2 2 1 1 2 ...
#>  $ age        : int 21 20 18 30 21 19 20 22 45 27 ...
#>  $ performance: int 5 5 4 5 4 5 5 5 5 5 ...
#>  $ education  : Factor w/ 4 levels "College","High School",..: 1 3 4 1 4 3 3 2 4 4 ...
#>  $ department : Factor w/ 5 levels "Administration",..: 3 5 2 2 2 1 3 2 3 3 ...
#>  $ seniority  : int 5 4 5 5 5 4 2 3 5 4 ...
#>  $ income     : int 108476 108080 97523 92067 132823 100305 66359 74523 113252 96040 ...
#>  $ bonus      : int 11128 10154 10240 9838 9625 9618 10137 9972 10139 10050 ...
#>  $ pay        : int 119604 118234 107763 101905 142448 109923 76496 84495 123391 106090 ...
#>  $ female     : int 0 0 0 1 0 0 0 1 1 0 ...
#>  $ HIghSchool : int 0 0 0 0 0 0 0 1 0 0 ...
#>  $ College    : int 1 0 0 1 0 0 0 0 0 0 ...
#>  $ Masters    : int 0 1 0 0 0 1 1 0 0 0 ...
#>  $ PhD        : int 0 0 0 0 0 0 0 0 0 0 ...