import pandas as pd
# Load the job records; each row is one job with (at least) the columns
# companySize, companyName, details and title used below.
df = pd.read_json('data.json')
df

# define company size
# Raw companySize labels -> coarse size bucket. Series.replace leaves any
# label that is not listed here unchanged.
size_labels = {
    '1-10 EMPLOYEES': 'Small',
    '11-50 EMPLOYEES': 'Medium',
    '51-200 EMPLOYEES': 'Large',
    '201-500 EMPLOYEES': 'X-Large',
}
df['sizeCategory'] = df['companySize'].replace(size_labels)

# count unique companies
# One row per company name (the first occurrence is kept), so that companies
# rather than jobs can be counted per size bucket.
unique_df = df.drop_duplicates(subset='companyName')
# Visualize company and job counts
# Bar chart with, for each size bucket, the number of distinct companies and
# the number of jobs. Saved as figure1.png.
order = ['Small', 'Medium', 'Large', 'X-Large']
combined = (
    pd.DataFrame({
        # reindex(order) fixes the bar order; buckets with no rows become NaN.
        'Company count': unique_df['sizeCategory'].value_counts().reindex(order),
        'Job count': df['sizeCategory'].value_counts().reindex(order),
    })
    # Name the index explicitly before turning it into a column. Relying on
    # reset_index() to produce a column called 'index' breaks on pandas 2.x,
    # where value_counts() names its index after the counted column.
    .rename_axis('Company size')
    .reset_index()
)
plot = combined.plot.bar(x='Company size', y=['Company count', 'Job count'], rot=0)
# Print the count on top of every bar.
for container in plot.containers:
    plot.bar_label(container)
plot.get_figure().savefig('figure1.png', dpi=400)
# check how many jobs have specified salary
# A job counts as having a salary when its details text contains a '$'.
# normalize=True divides by the actual number of rows instead of a hard-coded
# total, so the shares stay correct if the data changes. regex=False makes '$'
# a literal character; na=False counts missing details as "no salary".
df['details'].str.contains('$', regex=False, na=False).value_counts(normalize=True)
#check how many jobs have specified location
def has_location(det):
    """Return True when a job's details text names a concrete location.

    A details value counts as located when it is exactly 'San Francisco' or
    'Los Angeles', or when it contains a '•' and does not start with
    'Remote possible' (i.e. it is not a remote-only entry). The '•' is
    presumably the separator between the location and the other detail
    fields -- verify against the data.

    Empty strings and non-string values (e.g. None or NaN for missing
    details) are reported as not located.
    """
    if not isinstance(det, str):
        # Missing details carry no location; avoid a TypeError on `in`.
        return False
    if det in ('San Francisco', 'Los Angeles'):
        return True
    # '• Remote possible' also contains '•', so one membership test covers
    # both the "city • ..." and the "city • Remote possible" forms.
    return '•' in det and not det.startswith('Remote possible')
# Flag every job whose details name a concrete location.
df['located'] = df['details'].map(has_location)
# Manual inspection of both groups to validate the has_location heuristic.
list(df.loc[df['located'], 'details'])
list(df.loc[~df['located'], 'details'])
# Share of located vs. non-located jobs. normalize=True divides by the actual
# row count instead of a hard-coded total.
df['located'].value_counts(normalize=True)
# Brainstorming how to categorize technical vs managerial
# Candidate keyword sets, matched against the job title:
#   engineering: ['engineer', 'programmer', 'developer', 'scientist', 'architect']
#   manager:     ['chief', 'head of', 'vp ', 'vice president', 'manager', 'director', 'lead']
# Draft rules:
#   ENGINEERING MANAGER must have ... (this note was left unfinished)
#   ENGINEER: a title containing 'engineer' is treated as an engineer
# List every job title to eyeball the keyword sets against the data.
list(df['title'])
# formal categorization starts here
# Titles are classified by case-insensitive keyword matching. Every keyword is
# a plain substring except 'cto', which must be a whole word: as a substring
# it also occurs inside "director", "contractor", "doctor", ..., which would
# flag every Director title as technical.
manager_keywords = ['chief', 'head of', 'vp ', 'vice president', 'mangager',
                    'manager', 'director', 'lead']
technical_keywords = ['engineer', 'programmer', 'developer', 'scientist',
                      'architect', 'software development']
whole_word_keywords = [r'\bcto\b']  # counted as both manager and technical

titles = df['title'].str.lower()
# na=False: a missing title matches nothing instead of raising.
df['Manager'] = titles.str.contains(
    '|'.join(manager_keywords + whole_word_keywords), regex=True, na=False
).astype(int)
df['Technical'] = titles.str.contains(
    '|'.join(technical_keywords + whole_word_keywords), regex=True, na=False
).astype(int)

# Technical and Manager are 0/1 flags, so their product is 1 only when both
# apply. The three boolean columns below cover the remaining combinations,
# which makes the four categories mutually exclusive and exhaustive.
df['Technical Manager'] = df['Technical'] * df['Manager']
df['Only Technical'] = (df['Technical'] == 1) & (df['Manager'] == 0)
df['Only Manager'] = (df['Technical'] == 0) & (df['Manager'] == 1)
df['Neither'] = (df['Manager'] == 0) & (df['Technical'] == 0)

# One representative head count per companySize label, used as the grouping
# key for the plot. Labels not listed here are left unchanged by replace().
df['Company size'] = df['companySize'].replace({
    '1-10 EMPLOYEES': 5,
    '11-50 EMPLOYEES': 35,
    '51-200 EMPLOYEES': 125,
    '201-500 EMPLOYEES': 350,
})
# Visualize categorization results
# Mean of every numeric/boolean column per company-size group, i.e. the
# fraction of jobs carrying each flag. numeric_only=True skips the text
# columns, which pandas 2.x otherwise rejects with a TypeError.
plot_df = df.groupby('Company size').mean(numeric_only=True)
plot_df['Non-technical'] = 1 - plot_df['Technical']
plot_df['Non-manager'] = 1 - plot_df['Manager']
plot_df *= 100  # fractions -> percentages
plot_df = plot_df.rename(columns={'Neither': 'Non-technical Non-manager'})
plot_df
# Swap the numeric group keys back to readable size labels for the x axis.
plot_df = plot_df.rename(index={5: 'Small', 35: 'Medium', 125: 'Large', 350: 'X-Large'})

# The four plotted categories are mutually exclusive and exhaustive, so each
# stacked bar adds up to 100%.
lgd = plot_df.plot.bar(  # NOTE: despite its name, lgd is the Axes
    y=['Only Technical', 'Technical Manager', 'Only Manager', 'Non-technical Non-manager'],
    stacked=True,
    figsize=(8, 4),
    color=['blue', 'green', 'orange', 'red'],
    rot=0,
)

# Vertical shift (in data units, i.e. percentage points) that pulls each label
# just below the top edge of its bar segment.
y_offset = -4
for bar in lgd.patches:
    lgd.text(
        # Horizontal centre of the segment: get_x() is its left edge.
        bar.get_x() + bar.get_width() / 2,
        # Top edge of the segment (bottom + height), shifted by the offset.
        bar.get_y() + bar.get_height() + y_offset,
        # The label is the segment's own height, rounded to a whole percent.
        str(round(bar.get_height())) + '%',
        ha='center',
        color='w',
        weight='bold',
        size=10,
    )

# Place the legend outside the plot area; lgdbbox is the Legend artist.
lgdbbox = lgd.legend(bbox_to_anchor=(1.0, 1.0))
# Hand the legend to the tight bounding box so it is not cropped from the
# saved image.
lgdbbox.figure.savefig(
    'figure2.png', dpi=400, bbox_extra_artists=(lgdbbox,), bbox_inches='tight'
)
# output results file
# Write the full frame, including every derived column, as UTF-8 CSV without
# the row index.
df.to_csv(
    'SoftwareS.csv',
    index=False,
    encoding='utf-8',
)