import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
jobs = pd.read_csv('NYC_Jobs.csv')
jobs.shape
jobs.head(5)
!pip install plotly
import plotly
import plotly.express as px
Collecting plotly
Downloading plotly-4.12.0-py2.py3-none-any.whl (13.1 MB)
|████████████████████████████████| 13.1 MB 6.9 MB/s
Collecting retrying>=1.3.3
Downloading retrying-1.3.3.tar.gz (10 kB)
Requirement already satisfied: six in /opt/venv/lib/python3.7/site-packages (from plotly) (1.15.0)
Building wheels for collected packages: retrying
Building wheel for retrying (setup.py) ... done
Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=e63d74684086cea2876f7e8cbde9b03373fa99ff3bc3803e1b0d23f074954ee4
Stored in directory: /home/jovyan/.cache/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.12.0 retrying-1.3.3
# Median of every numeric column, per agency.
# numeric_only=True restricts the aggregation to numeric columns: pandas >= 2.0
# raises a TypeError when asked for the median of text columns instead of
# silently dropping them as older releases did.
jobs_by_cat = jobs.groupby(by=['Agency']).median(numeric_only=True).reset_index()
jobs_by_cat.head()

# Bar chart of the median 'Salary Range From' for each agency, one colour per
# agency taken from the qualitative Prism palette.
fig = px.bar(
    jobs_by_cat,
    x='Agency',
    y='Salary Range From',
    color='Agency',
    color_discrete_sequence=plotly.colors.qualitative.Prism,
    title='NYC Jobs',
)
fig
# Scatter plot of each posting's lower salary bound against its upper bound.
# opacity=0.3 draws the markers semi-transparent; both salary columns are also
# shown in the hover tooltip.
fig = px.scatter(
    jobs,
    x='Salary Range From',
    y='Salary Range To',
    hover_data=['Salary Range From', 'Salary Range To'],
    opacity=0.3,
    color_discrete_sequence=plotly.colors.sequential.BuPu_r,
    title='Correlation of Starting and Maximum Salaries',
)
fig
# Group by career level and average the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns raises a TypeError rather than dropping them silently.
job_salary = jobs.groupby(by=['Career Level']).mean(numeric_only=True).reset_index()

# Bar graph of the average 'Salary Range To' for each career level; bars are
# coloured by the same value and the y-axis is fixed to 0-250,000.
fig = px.bar(
    job_salary,
    x='Career Level',
    y='Salary Range To',
    range_y=(0, 250000),
    hover_name='Career Level',
    color='Salary Range To',
    title='Average Salary Range Across Career Levels',
)
fig
## The model predicts the career level from the "Salary Range To" variable, using classification.
# Mean of every numeric column per career level. numeric_only=True keeps the
# aggregation to numeric columns, which pandas >= 2.0 requires when the frame
# also holds text columns (it raises a TypeError otherwise).
avgsalaries = jobs.groupby(by=['Career Level']).mean(numeric_only=True)
avgsalaries['Salary Range To']
# Columns removed before modelling.
cols_to_drop = [
    'Salary Frequency',
    'Full-Time/Part-Time indicator',
    'Title Code No',
    'Salary Range From',
    'Level',
    'Process Date',
    'Job ID',
    '# Of Positions',
    'Agency',
    'Posting Type',
    'Business Title',
    'Civil Service Title',
    'Title Classification',
    'Job Category',
    'Work Location',
    'Division/Work Unit',
    'Job Description',
    'Minimum Qual Requirements',
    'Preferred Skills',
    'Additional Information',
    'To Apply',
    'Hours/Shift',
    'Work Location 1',
    'Recruitment Contact',
    'Residency Requirement',
    'Posting Date',
    'Post Until',
    'Posting Updated',
]

# Condensed data frame: drop the unused columns, then discard every row that
# still contains a missing value.
df_ml = jobs.drop(columns=cols_to_drop).dropna()
df_ml
# Encode the target: each career level becomes the integer class the model
# will predict from "Salary Range To".
career_level_codes = {
    'Entry-Level': 0,
    'Executive': 1,
    'Experienced (non-manager)': 2,
    'Manager': 3,
    'Student': 4,
}
df_ml['Career Level'] = df_ml['Career Level'].map(career_level_codes)

# One-hot encode any remaining categorical columns.
df_ml_cleaned = pd.get_dummies(df_ml, drop_first=True)

# Remove columns that are entirely NaN. dropna() returns a new frame rather
# than modifying in place, so the result has to be assigned back.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
df_ml_cleaned = df_ml_cleaned.dropna(axis='columns', how='all')
df_ml_cleaned.head()

# X: model input (every column except the target); y: encoded career level.
X = df_ml_cleaned.drop(columns=['Career Level'])
y = df_ml_cleaned['Career Level']
print(X)  # X is the data frame with solely the "Salary Range To" variable; input for the model
X
y
Salary Range To
0 43079.0
1 63794.0
2 49394.0
3 66000.0
4 100000.0
.. ...
884 75318.0
885 62215.0
886 68181.0
887 106023.0
888 57590.0
[887 rows x 1 columns]
from sklearn import tree
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# fit() returns the estimator itself, so construction and training can chain.
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

# Predictions for the held-out rows next to their true labels.
predicted = clf.predict(X_test)
actual = np.array(y_test)

# The integers printed below are the encoded career levels (0-4), not column
# numbers. NOTE: despite the heading, the complete arrays are printed.
print('Look at first few predictions:')
print('Predicted Career Level: ', predicted)
print('Actual Career Level: ', actual)
Look at first few predictions:
Predicted Career Level: [0 0 2 2 3 2 2 3 2 2 2 2 2 2 3 2 2 2 2 2 1 0 2 0 2 3 2 2 2 2 2 2 0 2 3 2 2
2 3 2 2 2 2 0 2 2 2 2 3 2 2 2 2 0 0 2 0 2 1 3 2 3 3 2 2 2 2 2 2 0 2 3 3 3
2 2 2 2 2 2 3 1 2 0 2 2 2 0 2 2 2 0 3 2 2 2 2 2 2 3 2 2 1 2 0 2 2 2 2 2 2
1 3 2 3 3 3 2 2 3 0 2 2 0 1 2 2 2 3 3 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2
2 3 2 2 2 2 0 2 0 2 2 2 2 2 3 2 2 0 2 2 2 2 3 3 0 2 2 2 1 2 2 2 2 2 2 2 3
2 3 2 2 2 2 2 0 2 2 2 4 0 2 2 2 3 2 3 3 2 3 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 3 2 3 2 2 0 2 2 2 2 2 2 0 0 3 2 2 2 0 3 2 2 3 2 3 0 3 2 3 2 2 2 2
2 2 2 2 0 2 2 1 2 2 0 2 0 2 2 1 2 2 2 2 3 2 2 4 3 2 2 2 0 2 3 3 2 0]
Actual Career Level: [2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 1 3 1 2 2 0 2 3 2 2 2 2 2 2 0 2 2 2 2
2 2 2 2 0 2 2 2 2 2 2 3 2 2 2 2 0 2 2 2 2 1 3 2 2 2 2 2 2 2 3 2 0 2 2 3 2
2 2 2 2 2 3 2 1 2 0 2 2 2 2 0 2 2 0 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
2 3 2 3 3 3 2 1 1 0 2 2 2 1 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 0 2 2 2 2 0 2 0 3 2 2 3 2 2 2 2 2 2 2 3
2 3 2 3 2 2 2 2 2 2 2 4 2 2 2 2 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 0 0 3 2 2 2 0 1 2 2 3 2 3 2 3 2 3 0 3 2 2
3 3 2 2 2 2 2 1 2 2 2 3 2 2 2 1 3 2 2 2 3 2 2 4 2 2 0 2 0 3 3 3 2 0]
from sklearn.metrics import accuracy_score
accuracy_score(predicted,actual)
!pip install graphviz==0.14.2
Collecting graphviz==0.14.2
Downloading graphviz-0.14.2-py2.py3-none-any.whl (18 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.14.2
# Visualization of Decision Tree:
import graphviz
from graphviz import Source
from IPython.display import HTML, SVG, display
from sklearn.tree import export_graphviz

# clf is a single DecisionTreeClassifier. Only ensemble models (for example
# RandomForestClassifier) expose estimators_, so the fitted tree itself is
# what gets exported.
estimator = clf

# Human-readable label for each encoded career level.
career_level_names = {
    0: 'Entry-Level',
    1: 'Executive',
    2: 'Experienced (non-manager)',
    3: 'Manager',
    4: 'Student',
}

# export_graphviz expects one class name per target class, in ascending
# order of the class values, which is the order of estimator.classes_.
class_names = [
    career_level_names.get(int(code), str(code)) for code in estimator.classes_
]

dot_source = export_graphviz(
    estimator,
    out_file=None,  # return the DOT text instead of writing a file
    feature_names=list(X_train.columns),
    class_names=class_names,
    filled=True,
    rounded=True,
)
graph = Source(dot_source)
graph

# save to file
png_bytes = graph.pipe(format='png')
with open('tree.png', 'wb') as f:
    f.write(png_bytes)

# Offer the PNG as a browser download only when running on Google Colab;
# elsewhere the google.colab package is unavailable and the file simply
# stays on disk.
try:
    from google.colab import files
except ImportError:
    files = None
if files is not None:
    files.download('tree.png')

# display in notebook
SVG(graph.pipe(format='svg'))
AttributeError: 'DecisionTreeClassifier' object has no attribute 'estimators_'