# spaCy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc
# gensim
import gensim
from gensim import corpora
# Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt
# Data loading / manipulation
import pandas as pd
import numpy as np
import jsonlines
# Text cleaning / NLTK
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(["stopwords", "wordnet", "omw-1.4"])  # omw-1.4: WordNet data required by newer NLTK releases
# Warnings
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("/work/resume-dataset/Resume/Resume.csv")
# Shuffle the rows, then work with a 200-resume sample
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[:200]
data.head()
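# The rest of the notebook relies on two columns of this dataset:
# Resume_str (the raw resume text) and Category (the job category label).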
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "jz_skill_patterns.jsonl"
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names
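# Assumed shape of each line in jz_skill_patterns.jsonl (spaCy's EntityRuler
# JSONL format; the actual patterns ship with the file):
# {"label": "SKILL", "pattern": [{"LOWER": "python"}]}
# {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}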
def get_skills(text):
    # Return every SKILL entity found in the text (duplicates included)
    doc = nlp(text)
    skills = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.append(ent.text)
    return skills
def unique_skills(x):
    # Deduplicate a list of skills (order is not preserved)
    return list(set(x))
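# Hypothetical sanity check (the exact output depends on the patterns file):
# unique_skills(get_skills("proficient in python, sql and python scripting"))
# might return ["python", "sql"]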
clean = []
stop_words = set(stopwords.words("english"))  # build once, not once per word
lm = WordNetLemmatizer()
for i in range(data.shape[0]):
    # Strip mentions, URLs, and non-alphanumeric characters
    review = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^rt|http.+?",
        " ",
        data["Resume_str"].iloc[i],
    )
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    review = " ".join(review)
    clean.append(review)
data["Clean_Resume"] = clean
data["skills"] = data["Clean_Resume"].str.lower().apply(get_skills)
data["skills"] = data["skills"].apply(unique_skills)
data.head()
fig = px.histogram(
    data, x="Category", title="Distribution of Job Categories"
).update_xaxes(categoryorder="total descending")
fig.show()
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")
Job_Category = Job_cat[0]  # pick any entry from Job_cat, or "ALL" for every category
Total_skills = []
if Job_Category != "ALL":
    fltr = data[data["Category"] == Job_Category]["skills"]
else:
    fltr = data["skills"]
for x in fltr:
    for i in x:
        Total_skills.append(i)
fig = px.histogram(
x=Total_skills,
labels={"x": "Skills"},
title=f"{Job_Category} Distribution of Skills",
).update_xaxes(categoryorder="total descending")
fig.show()
text = ""
for i in data[data["Category"] == Job_Category]["Clean_Resume"].values:
text += i + " "
plt.figure(figsize=(8, 8))
# Circular mask: pixels farther than 130 px from the center (150, 150) get the
# value 255, which WordCloud treats as masked out
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
wc = WordCloud(
width=800,
height=800,
background_color="white",
min_font_size=6,
repeat=True,
mask=mask,
)
wc.generate(text)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title(f"Most Used Words in {Job_Category} Resume", fontsize=20)
sent = nlp(data["Resume_str"].iloc[0])
displacy.render(sent, style="ent", jupyter=True)
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})
# Register each job category string as an additional EntityRuler pattern
patterns = df["Category"].unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
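# Subsequent nlp() calls will now also tag these category strings as
# Job-Category entities.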
colors = {
"Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
"SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
"ORG": "#ffd966",
"PERSON": "#e06666",
"GPE": "#9fc5e8",
"DATE": "#c27ba0",
"ORDINAL": "#674ea7",
"PRODUCT": "#f9cb9c",
}
options = {
"ents": [
"Job-Category",
"SKILL",
"ORG",
"PERSON",
"GPE",
"DATE",
"ORDINAL",
"PRODUCT",
],
"colors": colors,
}
sent = nlp(data["Resume_str"].iloc[5])
displacy.render(sent, style="ent", jupyter=True, options=options)
# input_resume should hold the raw text of the resume to score; as a stand-in,
# reuse one resume from the dataset
input_resume = data["Resume_str"].iloc[5]
sent2 = nlp(input_resume)
displacy.render(sent2, style="ent", jupyter=True, options=options)
# input_skills is a comma-separated requirements string; these values are an
# arbitrary example
input_skills = "python,sql,machine learning"
req_skills = [s.strip() for s in input_skills.lower().split(",")]
resume_skills = unique_skills(get_skills(input_resume.lower()))
score = 0
for x in req_skills:
if x in resume_skills:
score += 1
req_skills_len = len(req_skills)
match = round(score / req_skills_len * 100, 1)
print(f"The current Resume is {match}% matched to your requirements")
print(resume_skills)
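# A small follow-up: list the required skills the resume does not cover
# (uses only names defined above)
missing_skills = [s for s in req_skills if s not in resume_skills]
print(f"Missing skills: {missing_skills}")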
docs = data["Clean_Resume"].values
dictionary = corpora.Dictionary(d.split() for d in docs)
bow = [dictionary.doc2bow(d.split()) for d in docs]
num_topics = 4
ldamodel = gensim.models.ldamodel.LdaModel(
    bow,
    num_topics=num_topics,
    id2word=dictionary,
    passes=50,
    minimum_probability=0,
)
ldamodel.print_topics(num_topics=num_topics)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, bow, dictionary)
vis
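# Optional: save the interactive topic visualization to a standalone HTML file
# ("lda_topics.html" is an arbitrary example path)
pyLDAvis.save_html(vis, "lda_topics.html")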