Analysis

Loading data

import pandas as pd import seaborn as sns import numpy as np import matplotlib.pyplot as plt import plotly.express as px

# Load the raw job postings; the relative path is resolved against the
# current working directory.
DATA_FILE = 'DataScientist.csv'
df = pd.read_csv(DATA_FILE)

# Inspect which columns the dataset provides.
df.columns

# Number of postings per distinct job title.
df.loc[:, 'Job Title'].value_counts()

Cleaning Data

# Keep only postings whose title is exactly one of these three roles.
jobs_list = ['Data Scientist', 'Data Engineer', 'Data Analyst']

# .copy() makes df_p an independent frame. Later cells add columns to it
# (through the df_test / df_status names); doing that on a bare boolean-mask
# selection of df raises pandas' SettingWithCopyWarning.
df_p = df[df['Job Title'].isin(jobs_list)].copy()
df_p

Is it possible to predict salary based on location? [In progress]

# Narrow the filtered postings to the columns used for the salary model.
# .copy() detaches the result from df_p so that the columns assigned to
# df_pred in the following cells do not raise SettingWithCopyWarning.
df_pred = df_p[['Job Title', 'Salary Estimate', 'Location']].copy()
df_pred

# Distinct salary strings and how often each one occurs.
df_pred['Salary Estimate'].value_counts()

# Character offset of "Per Hour" inside each salary string; str.find
# returns -1 when the marker is absent.
df_pred['PerHour'] = df_pred['Salary Estimate'].str.find('Per Hour')
df_pred.sort_values(by='PerHour', ascending=False)

# Keep only the rows without the marker. Testing for -1 removes every hourly
# estimate wherever the marker sits; comparing against one fixed offset such
# as 8 only catches strings shaped like "$10-$26 Per Hour" and lets through
# e.g. "$9-$15 Per Hour" (offset 7). Rows whose estimate is missing (NaN
# offset) are dropped as well, since they cannot be parsed into numbers later.
# .copy() keeps the following column assignments free of SettingWithCopyWarning.
df_pred = df_pred[df_pred['PerHour'] == -1].copy()
df_pred.sort_values(by='PerHour', ascending=False)

Transforming salary estimate from string to int

import re

# Matches each run of consecutive ASCII digits.
_DIGIT_RUN = re.compile(r'[0-9]+')


def find_number(text):
    """Return every run of digits found in *text*, joined by single spaces.

    For example ``"$111K-$181K"`` becomes ``"111 181"``; a string without
    digits becomes ``""``.
    """
    digit_runs = _DIGIT_RUN.findall(text)
    return " ".join(digit_runs)


# Extract the numeric parts of each salary string into a helper column.
df_pred['Salaries'] = df_pred['Salary Estimate'].apply(find_number)
df_pred.head()

# Split the space-separated digit runs into one column per number and convert
# them to integers; the two resulting columns are the lower and upper bound.
salary_bounds = df_pred['Salaries'].str.split(' ', expand=True)
salary_bounds = salary_bounds.astype(str).astype(int)
df_pred[['InicialSalary', 'FinalSalary']] = salary_bounds
df_pred.head()

# Row-wise midpoint of the two bounds.
bound_columns = ['InicialSalary', 'FinalSalary']
df_pred['Mean'] = df_pred[bound_columns].mean(axis=1)
df_pred

# Remove the intermediate columns that were only needed to derive the bounds.
df_pred = df_pred.drop(columns=['Salary Estimate', 'PerHour', 'Salaries'])

# Split each location on its LAST ", " so the trailing token becomes the state
# even when the city part itself contains a comma. n=1 caps the result at two
# columns and reindex guarantees both exist, so the assignment cannot fail on
# a column-count mismatch; locations without a comma get a missing State.
location_parts = (
    df_pred['Location']
    .str.rsplit(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df_pred['City'] = location_parts[0]
df_pred['State'] = location_parts[1]
df_pred.head()

# Encode each distinct state as an integer code, assigned in order of first
# appearance; pd.factorize uses -1 for missing values.
df_pred['NormStates'] = pd.factorize(df_pred['State'])[0]
df_pred

All positions

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Select feature and target by column name instead of position, so the model
# keeps using the intended columns if df_pred gains or loses a column.
# Feature: integer state code. Target: midpoint of the salary range.
x = df_pred['NormStates'].values
y = df_pred['Mean'].values

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
X_train

# Reshape the 1-D arrays into single-column 2-D arrays, the layout
# scikit-learn estimators expect for their feature matrix.
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
Y_train = Y_train.reshape(-1, 1)
# The test targets must be derived from Y_test itself. Building them from
# X_test would overwrite the held-out salaries with the feature values and
# make any score computed on the test split meaningless.
Y_test = Y_test.reshape(-1, 1)

# Fit an ordinary least-squares line on the training split.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Plot the training points (blue) and the fitted regression line (black).
# The title and axis labels are Spanish: "Salary vs City", "City", "Salary".
# NOTE(review): X_train presumably holds the factorized state codes built
# earlier, so the "Ciudad" (city) label may not match the feature - confirm.
viz_train = plt
train_predictions = regressor.predict(X_train)
plt.scatter(X_train, Y_train, color='blue')
plt.plot(X_train, train_predictions, color='black')
plt.xlabel('Ciudad')
plt.ylabel('Salario')
plt.title('Salario vs Ciudad')
plt.show()

# Coefficient of determination (R^2) of the fitted model on the test split.
regressor.score(X_test, Y_test)

Job offers by state

# Number of postings per state as a two-column frame (State, Total).
# rename_axis/reset_index name both columns explicitly, which keeps the counts
# as integers and does not depend on how the installed pandas version names
# the value_counts() result.
states = (
    df_pred['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Total')
)
states

from states import code

# Invert the imported mapping so it can be looked up by its values.
# NOTE(review): presumably `code` maps full state name -> two-letter code,
# which makes `d` two-letter code -> full name; confirm against states.py.
d = dict(zip(code.values(), code.keys()))

# Attach the looked-up name to every row; unmatched states become NaN.
states['State Name'] = states['State'].map(d)

# Centered, top-anchored figure title.
title_settings = {
    'text': 'Total Job Offers by State',
    'xanchor': 'center',
    'yanchor': 'top',
    'x': 0.5,
}

# US map shaded by the number of postings in each state.
fig = px.choropleth(
    states,
    locations='State',
    locationmode='USA-states',
    scope='usa',
    color='Total',
    color_continuous_scale='spectral_r',
    hover_name='State Name',
    labels={'Total': 'Total'},
)

# Overlay each state's code as a text label; hover is disabled on this layer
# so only the choropleth layer responds to the pointer.
fig.add_scattergeo(
    locations=states['State'],
    locationmode='USA-states',
    text=states['State'],
    mode='text',
    hoverinfo='skip',
)

fig.update_layout(title=title_settings)
fig.show()

Finding relevant words from job descriptions

# Work on an independent copy: a plain `df_test = df_p` only creates a second
# name for the same frame, so the Verb/Adjective/Noun columns added below
# would also appear in df_p and in everything derived from it afterwards.
df_test = df_p.copy()

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK resources requested by this notebook: the 'punkt' tokenizer
# data and the 'averaged_perceptron_tagger' part-of-speech model.
for resource_name in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

def _tag_words(text):
    """Tokenize *text* and return its list of (word, POS tag) pairs.

    Non-string input (for example a missing description read as NaN) yields
    an empty list instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return []
    return nltk.pos_tag(nltk.word_tokenize(text))


def _words_with_tag(tagged, tag_fragment):
    """Return the words of *tagged* whose POS tag contains *tag_fragment*.

    When nothing matches, an empty string (not an empty list) is returned;
    this is the contract the public find_* functions have always had.
    """
    matches = [word for word, tag in tagged if tag_fragment in tag]
    return matches if matches else ""


def find_verb(keyword):
    """Return the words of *keyword* tagged as verbs ("VB" tags), or ""."""
    return _words_with_tag(_tag_words(keyword), "VB")


def find_adjective(keyword):
    """Return the words of *keyword* tagged as adjectives ("JJ" tags), or ""."""
    return _words_with_tag(_tag_words(keyword), "JJ")


def find_noun(keyword):
    """Return the words of *keyword* tagged as nouns ("NN" tags), or ""."""
    return _words_with_tag(_tag_words(keyword), "NN")


# Tokenizing and tagging is the expensive step, so run it once per description
# and derive all three columns from the same tagged result.
tagged_descriptions = [_tag_words(text) for text in df_test['Job Description']]
df_test['Verb'] = [_words_with_tag(pairs, "VB") for pairs in tagged_descriptions]
df_test['Adjective'] = [_words_with_tag(pairs, "JJ") for pairs in tagged_descriptions]
df_test['Noun'] = [_words_with_tag(pairs, "NN") for pairs in tagged_descriptions]

# Collapse each cell's word list into one space-separated string. Cells that
# hold "" (no match) stay "" because joining an empty sequence yields "".
for column in ('Verb', 'Adjective', 'Noun'):
    df_test[column] = [' '.join(map(str, words)) for words in df_test[column]]
df_test.head()

from collections import Counter

# 50 most frequent adjectives across all descriptions.
adjective_counts = Counter(
    word for text in df_test["Adjective"] for word in text.split()
)
adjective_counts.most_common(50)

# 50 most frequent nouns across all descriptions.
noun_counts = Counter(
    word for text in df_test["Noun"] for word in text.split()
)
noun_counts.most_common(50)

# 50 most frequent verbs across all descriptions.
verb_counts = Counter(
    word for text in df_test["Verb"] for word in text.split()
)
verb_counts.most_common(50)

Adding a status

# Pool of application statuses to sample from. A status that appears more
# often in the list is proportionally more likely to be drawn.
options = [
    'Applied', 'Applied', 'Applied', 'Applied',
    'Technical Interview', 'Technical Interview', 'Technical Interview',
    'First Interview', 'First Interview', 'First Interview',
    'Second Interview', 'Second Interview',
    'Rejected',
    'Job Offered',
]

# Copy the filtered postings so the synthetic Status column is added to
# df_status only; a plain `df_status = df_p` would write it into df_p too.
df_status = df_p.copy()

# Draw one status per row, uniformly over the entries of `options`.
# The draw is unseeded, so the result differs between runs.
df_status['Status'] = np.random.choice(options, len(df_status))
df_status.head(5)

# How many postings fall into each status.
status_column = df_status['Status']
freq = status_column.value_counts()
freq

# Use a 13x9 figure and plot the distribution of statuses.
figure_settings = {'figure.figsize': (13, 9)}
sns.set(rc=figure_settings)
sns.histplot(status_column)

Analysis