# Startup

import pandas as pd

# Load the job records from data.json into a DataFrame.
df = pd.read_json('data.json')
# Bare expression: shows the loaded frame when run as a notebook cell.
df

# Define company size: translate the raw `companySize` labels into short
# category names. Labels not listed here are left unchanged by `replace`.
size_labels = {
    '1-10 EMPLOYEES': 'Small',
    '11-50 EMPLOYEES': 'Medium',
    '51-200 EMPLOYEES': 'Large',
    '201-500 EMPLOYEES': 'X-Large',
}
df['sizeCategory'] = df['companySize'].replace(size_labels)

# Count unique companies: keep only the first row seen for each company name.
unique_df = df.drop_duplicates(subset='companyName')

# Visualize company and job counts per size category.
order = ['Small', 'Medium', 'Large', 'X-Large']
combined = (
    pd.DataFrame({
        # One row per company, so this counts companies in each category.
        'Company count': unique_df['sizeCategory'].value_counts().reindex(order),
        # Every row of `df` is counted here, i.e. jobs in each category.
        'Job count': df['sizeCategory'].value_counts().reindex(order),
    })
    # Name the index *before* resetting it so the resulting column is always
    # 'Company size'. The earlier `.rename({'index': ...})` had no `columns=`
    # argument, so it targeted row labels and renamed nothing; it also relied
    # on the reset column being called 'index', which depends on the pandas
    # version.
    .rename_axis('Company size')
    .reset_index()
)

plot = combined.plot.bar(
    x='Company size',
    y=['Company count', 'Job count'],
    rot=0,
)
# Write each bar's value on the bar.
for container in plot.containers:
    plot.bar_label(container)

# Write the company/job count chart to disk at 400 dpi.
figure1 = plot.get_figure()
figure1.savefig('figure1.png', dpi=400)

# Check how many jobs have a specified salary: a job counts as having one
# when its details string contains a '$' character.
has_salary = df['details'].map(lambda text: '$' in text)
# normalize=True divides by the actual number of rows, replacing the
# hard-coded total (253 + 34) that silently goes stale if the data changes.
has_salary.value_counts(normalize=True)

# Check how many jobs have a specified location.
def has_location(det):
    """Return True when the details string `det` names a location.

    A details string counts as located when it is exactly 'San Francisco'
    or 'Los Angeles', or when it contains a '•' separator and does not
    start with 'Remote possible' (i.e. it has a location plus other
    details, is not only "remote possible", and is not empty).
    """
    if det in ('San Francisco', 'Los Angeles'):
        return True
    # '• Remote possible' already contains '•', so a single check covers both.
    return '•' in det and not det.startswith('Remote possible')


df['located'] = df['details'].map(has_location)
# Bare expression: lists the details that were classified as located.
list(df.loc[df['located'], 'details'])

# Inspect the details strings that were NOT classified as having a location.
list(df.loc[~df['located'], 'details'])

# Share of jobs with / without a specified location. normalize=True divides
# by the actual row count instead of the hard-coded total (265 + 22).
df['located'].value_counts(normalize=True)

# Brainstorming how to categorize technical vs managerial
# engineering wordset ['engineer', 'programmer', 'developer', 'scientist', 'architect']
# manager wordset ['chief', 'head of', 'vp ', 'vice president', 'manager', 'director', 'lead']
# ENGINEERING MANAGER must have '   (note appears unfinished in the original)
# ENGINEER iff has 'engineer' then engineer
# Bare expression: dumps every job title for manual inspection.
list(df['title'])

# Formal categorization starts here: flag each title as managerial and/or
# technical by case-insensitive substring match against a keyword list.
manager_words = ['chief', 'head of', 'vp ', 'vice president', 'mangager',
                 'manager', 'director', 'lead', 'cto']
technical_words = ['engineer', 'programmer', 'developer', 'scientist',
                   'architect', 'cto', 'software development']

# Lower-case every title once, then test it against both keyword lists.
lowered_titles = df['title'].map(lambda title: title.lower())
df['Manager'] = lowered_titles.map(
    lambda text: any(word in text for word in manager_words)
)
df['Technical'] = lowered_titles.map(
    lambda text: any(word in text for word in technical_words)
)

# Turn the boolean flags into 0/1 integers so they can be multiplied and averaged.
df['Technical'] = df['Technical'].astype('int64')
df['Manager'] = df['Manager'].astype('int64')

is_technical = df['Technical'] == 1
is_manager = df['Manager'] == 1

# 1 only when both flags are set.
df['Technical Manager'] = df['Technical'] * df['Manager']
# Mutually exclusive boolean categories derived from the two flags.
df['Only Technical'] = is_technical & ~is_manager
df['Only Manager'] = ~is_technical & is_manager
df['Neither'] = ~is_manager & ~is_technical

# Numeric stand-in for each size bucket; used later as the grouping key.
size_values = {
    '1-10 EMPLOYEES': 5,
    '11-50 EMPLOYEES': 35,
    '51-200 EMPLOYEES': 125,
    '201-500 EMPLOYEES': 350,
}
df['Company size'] = df['companySize'].replace(size_values)

# Visualize categorization results: per company size, the percentage of jobs
# falling into each category.
# numeric_only=True restricts the mean to numeric/boolean columns. Without it
# pandas >= 2.0 raises a TypeError on the string columns of `df`.
plot_df = df.groupby('Company size').mean(numeric_only=True)
plot_df['Non-technical'] = 1 - plot_df['Technical']
plot_df['Non-manager'] = 1 - plot_df['Manager']
# Convert fractions to percentages.
plot_df *= 100
plot_df = plot_df.rename(columns={'Neither': 'Non-technical Non-manager'})
# Bare expression: shows the summary table when run as a notebook cell.
plot_df

# Swap the numeric size keys in the index back to readable category names.
size_names = {5: 'Small', 35: 'Medium', 125: 'Large', 350: 'X-Large'}
plot_df = plot_df.rename(index=size_names)

# Stacked bar chart of the four category percentages per company size.
# (Original note: only-manager + only-technical + technical-manager + neither.)
# NOTE: despite its name, `lgd` is the Axes returned by `plot.bar`; the
# legend object is `lgdbbox`, created further down.
lgd = plot_df.plot.bar(
    y=['Only Technical', 'Technical Manager', 'Only Manager',
       'Non-technical Non-manager'],
    stacked=True,
    figsize=(8, 4),
    color=['blue', 'green', 'orange', 'red'],
    rot=0,
)

# Vertical shift applied to each label relative to the top of its segment.
y_offset = -4
for bar in lgd.patches:
    lgd.text(
        # Put the text in the middle of each bar. get_x returns the start,
        # so we add half the width to get to the middle.
        bar.get_x() + bar.get_width() / 2,
        # Vertically, add the height of the bar to the start of the bar,
        # along with the offset.
        bar.get_height() + bar.get_y() + y_offset,
        # This is the actual value we'll show.
        f'{round(bar.get_height())}%',
        # Center the labels and style them a bit.
        ha='center',
        color='w',
        weight='bold',
        size=10,
    )

# Anchor the legend at the top-right corner of the axes.
lgdbbox = lgd.legend(bbox_to_anchor=(1.0, 1.0))
# Pass the legend itself as the extra artist so the tight bounding box is
# guaranteed to include it. The original passed the Axes (`lgd`) here.
lgdbbox.figure.savefig(
    'figure2.png',
    dpi=400,
    bbox_extra_artists=(lgdbbox,),
    bbox_inches='tight',
)

# Output results file: write the annotated frame as UTF-8 CSV, without the
# row index.
df.to_csv('SoftwareS.csv', index=False, encoding='utf-8')