Analysis
Loading data
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# Read the raw postings from the CSV in the working directory, then show the
# frame, its column labels, and how often each job title occurs.
df = pd.read_csv(filepath_or_buffer='DataScientist.csv')
df
df.columns
df.loc[:, 'Job Title'].value_counts()
Cleaning Data
# Restrict the analysis to postings whose title is exactly one of these three.
jobs_list = ['Data Scientist', 'Data Engineer', 'Data Analyst']
# .copy() detaches the filtered rows from df, so later column assignments on
# df_p (or on names bound to it) modify a real frame instead of a slice and
# do not raise SettingWithCopyWarning.
df_p = df[df['Job Title'].isin(jobs_list)].copy()
df_p
Is it possible to predict salary based on location? [In progress]
# Keep only the columns used for the salary model. .copy() gives an
# independent frame so the helper columns added below do not trigger
# SettingWithCopyWarning.
df_pred = df_p[['Job Title', 'Salary Estimate', 'Location']].copy()
df_pred
df_pred['Salary Estimate'].value_counts()
# Character offset of "Per Hour" inside each estimate; str.find gives -1 when
# the text is absent.
df_pred['PerHour'] = df_pred["Salary Estimate"].str.find("Per Hour")
df_pred.sort_values(by='PerHour', ascending=False)
# Drop hourly-rate postings. Testing for "not found" (-1) instead of one fixed
# offset removes hourly estimates no matter how many characters precede the
# marker. Rows with a missing estimate compare False here and are dropped too.
df_pred = df_pred[df_pred['PerHour'] < 0].copy()
df_pred.sort_values(by='PerHour', ascending=False)
Transforming salary estimate from string to int
import re as re
def find_number(text):
    """Return every run of ASCII digits in *text*, joined by single spaces.

    The runs keep their order of appearance; a text without digits yields
    an empty string.
    """
    return " ".join(re.findall(r'[0-9]+', text))
# Pull the digit runs out of each estimate as one space-separated string.
df_pred['Salaries'] = df_pred['Salary Estimate'].apply(find_number)
df_pred.head()
# Split that string into the lower and upper bound and convert both to int.
# This assumes every estimate contains exactly two numbers.
df_pred[['InicialSalary', 'FinalSalary']] = df_pred['Salaries'].str.split(' ', expand=True).astype(int)
df_pred.head()
# Midpoint of the two bounds, used later as the regression target.
df_pred['Mean'] = df_pred[['InicialSalary', 'FinalSalary']].mean(axis=1)
df_pred
df_pred = df_pred.drop(['Salary Estimate', 'PerHour', 'Salaries'], axis=1)
# Split on the LAST ", " only: a location such as "City, County, ST" would
# otherwise expand to three columns and break the two-column assignment.
df_pred[['City', 'State']] = df_pred['Location'].str.rsplit(', ', n=1, expand=True)
df_pred.head()
# Integer code per distinct state (order of first appearance; -1 for missing).
df_pred['NormStates'] = pd.factorize(df_pred['State'])[0]
df_pred
All positions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Feature: the integer state code; target: the mean salary. Columns are
# selected by name so the model does not depend on column positions.
# NOTE(review): NormStates is an arbitrary factorize() code, so a linear fit
# treats a nominal variable as if it were ordered - confirm this is intended.
x = df_pred['NormStates'].values
y = df_pred['Mean'].values
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)
X_train
# scikit-learn expects 2-D arrays, so turn each 1-D vector into one column.
X_train = X_train.reshape(-1, 1)
Y_train = Y_train.reshape(-1, 1)
# The test target must come from Y_test; reshaping X_test here would score
# the model against its own input features instead of the salaries.
Y_test = Y_test.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
# Training points (blue) with the fitted line (black).
viz_train = plt
viz_train.scatter(X_train, Y_train, color = 'blue')
viz_train.plot(X_train, regressor.predict(X_train), color = 'black')
viz_train.title('Salario vs Ciudad')
viz_train.xlabel('Ciudad')
viz_train.ylabel('Salario')
viz_train.show()
# R^2 of the fitted model on the held-out 20 %.
regressor.score(X_test, Y_test)
Job offers by state
# Count postings per state and lay the result out as a two-column frame:
# 'State' holds the label, 'Total' the integer number of postings.
state_counts = df_pred['State'].value_counts()
states = state_counts.rename_axis('State').reset_index(name='Total')
states['Total'] = states['Total'].astype(int)
states
from states import code
# Invert the imported `code` mapping (its values become the keys) so it can be
# looked up by the labels stored in states['State'].
# Presumably `code` maps full state names to two-letter codes - confirm in states.py.
d = dict(zip(code.values(), code.keys()))
states['State Name'] = states['State'].map(d)
# US choropleth coloured by the number of postings in each state.
fig = px.choropleth(
    data_frame=states,
    locations='State',
    locationmode='USA-states',
    scope='usa',
    color='Total',
    color_continuous_scale='spectral_r',
    hover_name='State Name',
    labels={'Total': 'Total'},
)
# Overlay each state's label as plain text; the labels carry no hover box.
fig.add_scattergeo(
    locations=states['State'],
    locationmode='USA-states',
    text=states['State'],
    mode='text',
    hoverinfo='skip',
)
# Centre the title horizontally and anchor it at the top.
fig.update_layout(
    title=dict(
        text='Total Job Offers by State',
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
fig.show()
Finding relevant words from job descriptions
# Work on an independent copy: a plain assignment would only alias df_p, so
# the part-of-speech columns added to df_test would also appear in df_p.
df_test = df_p.copy()
import nltk
# Fetch the tokenizer model and the POS-tagger model used by the helpers below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize, word_tokenize
def find_verb(keyword):
    """Return the words of *keyword* whose POS tag contains "VB".

    The text is tokenized and tagged with NLTK. The result is the list of
    matching words in order of appearance, or an empty string when no word
    matches.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(keyword))
    verbs = [word for word, tag in tagged_tokens if "VB" in tag]
    # An empty list is falsy, so "no match" falls through to the empty string.
    return verbs or ""
def find_adjective(keyword):
    """Return the words of *keyword* whose POS tag contains "JJ".

    The text is tokenized and tagged with NLTK. The result is the list of
    matching words in order of appearance, or an empty string when no word
    matches.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(keyword))
    adjectives = [word for word, tag in tagged_tokens if "JJ" in tag]
    # An empty list is falsy, so "no match" falls through to the empty string.
    return adjectives or ""
def find_noun(keyword):
    """Return the words of *keyword* whose POS tag contains "NN".

    The text is tokenized and tagged with NLTK. The result is the list of
    matching words in order of appearance, or an empty string when no word
    matches.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(keyword))
    nouns = [word for word, tag in tagged_tokens if "NN" in tag]
    # An empty list is falsy, so "no match" falls through to the empty string.
    return nouns or ""
# Extract verbs, adjectives and nouns from every job description.
df_test['Verb'] = df_test['Job Description'].map(find_verb)
df_test['Adjective'] = df_test['Job Description'].map(find_adjective)
df_test['Noun'] = df_test['Job Description'].map(find_noun)
# Flatten each result (a word list, or "" when nothing matched) into a single
# space-separated string.
df_test['Verb'] = df_test['Verb'].map(lambda words: ' '.join(map(str, words)))
df_test['Adjective'] = df_test['Adjective'].map(lambda words: ' '.join(map(str, words)))
df_test['Noun'] = df_test['Noun'].map(lambda words: ' '.join(map(str, words)))
df_test.head()
from collections import Counter
# The 50 most frequent words of each kind across all descriptions.
Counter(word for text in df_test["Adjective"] for word in text.split()).most_common(50)
Counter(word for text in df_test["Noun"] for word in text.split()).most_common(50)
Counter(word for text in df_test["Verb"] for word in text.split()).most_common(50)
Adding a status
# Candidate statuses. A status repeated k times is k times as likely to be
# drawn, so the multipliers act as sampling weights (14 entries in total).
options = (
    ['Applied'] * 4
    + ['Technical Interview'] * 3
    + ['First Interview'] * 3
    + ['Second Interview'] * 2
    + ['Rejected', 'Job Offered']
)
# Copy so the synthetic Status column is not written into the shared df_p frame.
df_status = df_p.copy()
# One random status per posting. No seed is set, so each run draws differently.
df_status['Status'] = np.random.choice(options, len(df_status))
df_status.head(5)
freq = df_status['Status'].value_counts()
freq
# Distribution of the simulated statuses.
sns.set(rc={'figure.figsize':(13,9)})
sns.histplot(df_status['Status'])