from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import pandas as pd
import sys
import re as regex
from datetime import datetime
def initialize_driver():
    driver = webdriver.Edge()
    driver.implicitly_wait(5)
    return driver
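# Note: Selenium 4.6+ resolves the Edge driver automatically (Selenium Manager);
# on older setups msedgedriver must be available on PATH.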
def open_indeed_and_deny_cookies(driver):
    driver.get('https://de.indeed.com/')
    driver.find_element(By.XPATH, '//*[@id="onetrust-reject-all-handler"]').click()
    sleep(1)
def get_url(job_title, page):
    job_title = job_title.strip().replace(' ', '+')
    # Indeed paginates in steps of 10 results, so page n starts at n*10.
    return f'https://de.indeed.com/jobs?q={job_title}&l=M%C3%BCnchen&start={page * 10}'
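# e.g. get_url('Data Analyst', 2) -> 'https://de.indeed.com/jobs?q=Data+Analyst&l=M%C3%BCnchen&start=20'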
def parse_page(driver, job_title, page):
    # Dismiss sign-in / job-alert pop-ups if they appear; otherwise just continue.
    try:
        driver.find_element(By.XPATH, '//*[@id="google-Only-Modal"]/div/div[1]/button').click()
        driver.find_element(By.XPATH, '//*[@id="mosaic-modal-mosaic-provider-desktopserp-jobalert-popup"]/div/div/div[1]/div/button').click()
    except Exception:
        pass
    driver.get(get_url(job_title, page))
    beacons = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
    postings = parse_beacons(driver, beacons)
    return parse_all_postings(postings)
def parse_beacons(driver, beacons):
    postings = []
    for beacon in beacons:
        # Clicking a search-result card ("beacon") opens the full posting in the side pane.
        beacon.click()
        try:
            # Presence check for the pane's button; the implicit wait gives the pane time to load.
            driver.find_element(By.XPATH, '//*[@id="jobsearch-ViewjobPaneWrapper"]/div/button')
        except Exception:
            pass
        # Randomized delay to avoid hammering the site.
        sleep(randint(1, 7))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        header = soup.find('div', {'class': 'jobsearch-JobComponent-embeddedHeader'})
        body = soup.find('div', {'class': 'jobsearch-JobComponent-embeddedBody'})
        if (header is not None) and (body is not None):
            postings.append((header, body))
        else:
            print('Posting did not load (fast enough).')
    return postings
def parse_all_postings(postings):
    records = []
    for (header, body) in postings:
        posting = parse_posting(header, body)
        if posting:
            records.append(posting)
    return records
def parse_posting(header, body):
    record = {}
    try:
        # Strip gender suffixes such as "(m/w/d)" and the "- job post" suffix from the title.
        record['Job_Title'] = regex.sub(r'\(\w(/\w)+\)', '', header.find('h1').text.replace('- job post', '').title()).strip()
        record['Company'] = header.find_all('div', {'class': 'jobsearch-InlineCompanyRating-companyHeader'})[1].text
    except AttributeError:
        print('Header not found')
        return
    # Ratings such as "4,2 von 5" may fail float() conversion (German decimal comma); skip the rating then.
    try:
        record['Rating'] = float(header.find('a', {'class': 'icl-Ratings-starsCountWrapper icl-Ratings-link'})['aria-label'].split(' von')[0])
    except (TypeError, ValueError):
        pass
    try:
        record['Compensation'] = regex.sub('[^0-9–]', '', header.find('span', {'class': 'icl-u-xs-mr--xs attribute_snippet'}).text).replace('–', ' to ')
    except AttributeError:
        pass
    try:
        e_types = body.find('div', {'class': 'jobsearch-JobDescriptionSection-sectionItemKey icl-u-textBold'}).find_next_siblings('div')
        record['Employment_Type'] = [e_type.text for e_type in e_types]
    except AttributeError:
        pass
    record['Description'] = body.find('div', {'id': 'jobDescriptionText'}).text
    # "vor N(+) Tag(en) geschaltet" = "posted N(+) day(s) ago".
    try:
        record['Insights'] = regex.search(r'vor \d+\+? Tag(en)? geschaltet', body.find('div', {'id': 'hiringInsightsSectionRoot'}).text).group(0)
    except AttributeError:
        pass
    try:
        record['Link'] = header.find('div', {'id': 'applyButtonLinkContainer'}).find('a')['href']
    except AttributeError:
        record['Link'] = 'Fast application'
    return record
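# Illustration of the title clean-up above (hypothetical input):
# 'Senior Data Analyst (m/w/d) - job post' -> 'Senior Data Analyst'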
def main(queries):
    driver = initialize_driver()
    open_indeed_and_deny_cookies(driver)
    # Total number of result pages across all queries, for progress reporting.
    tot_pages = 0
    tot_page_counter = 0
    for query in queries:
        tot_pages += query[1]
    results = []
    for (job_title, pages) in queries:
        page_counter = 0
        for page in range(pages):
            try:
                results.extend(parse_page(driver, job_title, page))
            except Exception as e:
                print(type(e))
            page_counter += 1
            tot_page_counter += 1
            print(f'({tot_page_counter}/{tot_pages}):\n'
                  f'\tFinished scraping page {page_counter} out of {pages} page(s) for keyword {job_title}.\n'
                  f'\t{len(results)} results so far.')
    return results
if __name__ == '__main__':
    # sys.argv[1]: number of result pages to scrape per query.
    scrape_result = main([('Data Analyst', int(sys.argv[1])), ('Data Scientist', int(sys.argv[1])),
                          ('Business Analyst', int(sys.argv[1])), ('Business Intelligence', int(sys.argv[1]))])
    df = pd.DataFrame(scrape_result)
    today = datetime.today().date()
    df['Scraped_Date'] = today
    df.to_csv(f'data{today}.csv', index=False)
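# Example invocation (script name assumed): python indeed_scraper.py 3
# scrapes 3 result pages per query and writes the postings to data<YYYY-MM-DD>.csv.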
import pandas as pd
from glob import glob
import re as regex
import numpy as np
from datetime import datetime, timedelta
# The *_patterns dicts referenced below are defined further down in this script,
# but they only need to exist by the time these functions are applied.
def check_job_level(job):
    # Return the first seniority level whose pattern matches the job title.
    job_title = job.Job_Title
    for level, pattern in level_patterns.items():
        if regex.search(pattern, job_title):
            return level
    return 'Other'
def check_job_position(job):
    # Return the first position whose pattern matches the job title.
    job_title = job.Job_Title
    for position, pattern in position_patterns.items():
        if regex.search(pattern, job_title):
            return position
    return 'Other'
def check_skills(job):
    # Return the list of skills mentioned in the job description.
    description = job.Description
    skills = []
    for skill, pattern in skill_patterns.items():
        if regex.search(pattern, description):
            skills.append(skill)
    return skills
# Combine all scraped batches into a single frame.
files = glob('Data/data*.csv')
dfs = []
for file in files:
    dfs.append(pd.read_csv(file, parse_dates=['Scraped_Date']))
df = pd.concat(dfs)
df['Insights'] = df['Insights'].fillna(value=False)
# 'vor 3 Tagen geschaltet' -> 3 days; postings without an insight count as published on the scrape date.
df['Days_Published'] = df.Insights.apply(lambda x: int(regex.sub('[^0-9]', '', x)) if x else 0)
df['Created_at'] = pd.Series(df.Scraped_Date - df.Days_Published.apply(lambda x: timedelta(days=x))).apply(datetime.date)
# Sort by creation date so drop_duplicates keeps the earliest occurrence of each posting.
df.sort_values(by=['Created_at'], ascending=True, inplace=True)
# Link, Rating and Insights can change over time, so they are not used to assess duplicates.
df.drop_duplicates(subset=['Company', 'Job_Title', 'Description'], inplace=True)
df.reset_index(drop=True, inplace=True)
# First word of the company name, used for grouping later on.
df['Company_short'] = df.Company.apply(lambda x: x.split(' ')[0])
df.head(3)
employments_types = {
    'Praktikum': 'Intern', 'Werkstudent': 'Student', 'Berufsausbildung': 'Trainee', 'Freie Mitarbeit': 'Freelancer',
    'Teilzeit': 'Part_Time', 'Homeoffice': 'Homeoffice', 'Festanstellung': 'Permanent', 'Vollzeit': 'Full_Time'}
skill_patterns = {
    'AI/Deep Learning': r'\WAI\W|[Aa]rtificial [Ii]ntelligence|[Dd]eep[ -]?[Ll]earning', 'API': r'\WAPI\W',
    'C': r'\WC[^a-zA-Z0-9#]', 'C++': r'[Cc]\+\+', 'C#': r'\W[Cc]\#', 'Cloud Computing': r'[Cc]loud[ -]?([Cc]omput|[Pp]lat)',
    'ERP software': r'\WERPs?\W', 'Excel': r'[Ee]xcel', 'GoogleSheets': r'[Gg](oogle)?[Ss]heets?', 'Java': r'[Jj]ava[^Ss]',
    'JavaScript': r'[Jj]ava[Ss]cript|\WJS\W', 'Machine Learning': r'[Mm]achine[ -]?[Ll]earning|ML[- ]?model',
    'NLP': r'\WNLP\W|[Nn]atural [Ll]anguage [Pp]rocessing', 'PowerBI': r'Power ?B[Ii]',
    'PowerQuery': r'Power ?[Qq]uery', 'PowerPivot': r'Power ?[Pp]ivot', 'Python': r'[Pp]ython',
    'R': r'\WR\W|RStudio', 'Spark': r'[Ss]park', 'SQL': r'SQL|sql', 'Tableau': r'[Tt]ableau',
}
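# Quick illustration of check_skills (hypothetical description, not from the scraped data):
# check_skills(pd.Series({'Description': 'We use Python, SQL and Power BI; R or Tableau is a plus.'}))
# -> ['PowerBI', 'Python', 'R', 'SQL', 'Tableau']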
position_patterns = {
    'Data Scientist': r'Data.*[Ss]cien(ce|tist)', 'Data Engineer': r'Data.*[Ee]ngineer',
    'Data Analyst': r'Data.*[Aa]naly(st|tics)', 'Business Analyst': r'Business.*[Aa]naly(st|tics)',
    'Business Intelligence': r'Business.*[Ii]ntellig|BI\W', 'Machine Learning': r'[Mm]achine.?[Ll]earning'
}
level_patterns = {
    'Director': r'^.?[Dd]irector', 'Manager': r'^.?[Mm]anag(er|ment)',
    'Senior': r'^.?[Ss]enior', 'Expert': r'^.?[Ee]xpert', 'Junior': r'^.?[Jj]unior',
    'Consultant': r'^.?[Cc]onsult', 'Entry': r'^.?[Ee](ntry|instieg)|[Tt]rainee'
}
# One boolean column per German employment-type keyword.
for keyword, emp_type in employments_types.items():
    df[emp_type] = df['Employment_Type'].str.contains(keyword)
df['Job_Level'] = df.apply(check_job_level, axis=1)
df['Position'] = df.apply(check_job_position, axis=1)
df['Skills'] = df.apply(check_skills, axis=1)
# One boolean column per skill.
skills = skill_patterns.keys()
for skill in skills:
    df[skill] = df['Skills'].apply(lambda x: skill in x)
df.drop(columns=['Employment_Type', 'Insights', 'Scraped_Date', 'Days_Published'], inplace=True)
# str.contains returns NaN where Employment_Type was missing; fill those flag columns with False.
df[df.columns[7:15]] = df[df.columns[7:15]].fillna(value=False)
df
df.to_csv('full_data.csv', index=False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('full_data.csv')
df.Position = pd.Categorical(df.Position,
                             categories=['Other', 'Machine Learning', 'Data Scientist', 'Data Analyst', 'Data Engineer',
                                         'Business Intelligence', 'Business Analyst'], ordered=True)
# Converting Position to an ordered categorical fixes the category order when plotting.
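# Minimal plotting sketch (illustrative addition, not part of the original analysis):
# with an ordered Categorical, the bars follow the category order defined above.
sns.countplot(data=df, y='Position', order=df.Position.cat.categories)
plt.title('Job postings per position')
plt.show()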
top_companies = df.Company_short.value_counts().index[:15]
df.Company_short = pd.Categorical(df.Company_short.copy(), categories=top_companies, ordered=True)
df_tc = df[df.Company_short.isin(top_companies)]
df_tc.head(2)
print(f'The dataset contains {len(df)} deduplicated job postings from {df.Company.nunique()} different companies.')
# Count postings whose title matched one of the position patterns (i.e. Position != 'Other').
print(f'For {np.sum(df.Position != "Other")} postings, a definitive job position was determined.')
# Counts and percentage share of postings per position.
pd.merge(df.Position.value_counts(), df.Position.value_counts(normalize=True).apply(lambda x: format(x, '.1%')),
         left_index=True, right_index=True, how='left').rename(columns={'Position_x': 'Counts', 'Position_y': 'Share'})