# spaCy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc
# gensim
import gensim
from gensim import corpora
# Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt
# Data loading / manipulation
import pandas as pd
import numpy as np
import jsonlines
# Text cleaning / NLTK
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(["stopwords", "wordnet", "omw-1.4"])  # omw-1.4: WordNet data required by newer NLTK releases
# Warnings
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("/work/resume-dataset/Resume/Resume.csv")
# Shuffle the rows, then work with a 200-resume sample
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[:200]
data.head()
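# The rest of the notebook relies on two columns of this dataset:
# Resume_str (the raw resume text) and Category (the job category label).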
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "jz_skill_patterns.jsonl"
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names
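# Assumed shape of each line in jz_skill_patterns.jsonl (spaCy's EntityRuler
# JSONL format; the actual patterns ship with the file):
# {"label": "SKILL", "pattern": [{"LOWER": "python"}]}
# {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}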
def get_skills(text):
    # Return every SKILL entity found in the text (duplicates included)
    doc = nlp(text)
    skills = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.append(ent.text)
    return skills
def unique_skills(x):
    # Deduplicate a list of skills (order is not preserved)
    return list(set(x))
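# Hypothetical sanity check (the exact output depends on the patterns file):
# unique_skills(get_skills("proficient in python, sql and python scripting"))
# might return ["python", "sql"]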
clean = []
stop_words = set(stopwords.words("english"))  # build once, not once per word
lm = WordNetLemmatizer()
for i in range(data.shape[0]):
    # Strip mentions, URLs, and non-alphanumeric characters
    review = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^rt|http.+?",
        " ",
        data["Resume_str"].iloc[i],
    )
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    review = " ".join(review)
    clean.append(review)
data["Clean_Resume"] = clean
data["skills"] = data["Clean_Resume"].str.lower().apply(get_skills)
data["skills"] = data["skills"].apply(unique_skills)
data.head()
fig = px.histogram(
    data, x="Category", title="Distribution of Job Categories"
).update_xaxes(categoryorder="total descending")
fig.show()
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")
Job_Category = Job_cat[0]  # pick any entry from Job_cat, or "ALL" for every category
Total_skills = []
if Job_Category != "ALL":
    fltr = data[data["Category"] == Job_Category]["skills"]
else:
    fltr = data["skills"]
for x in fltr:
    for i in x:
        Total_skills.append(i)
fig = px.histogram(
x=Total_skills,
labels={"x": "Skills"},
title=f"{Job_Category} Distribution of Skills",
).update_xaxes(categoryorder="total descending")
fig.show()
text = ""
for i in data[data["Category"] == Job_Category]["Clean_Resume"].values:
text += i + " "
plt.figure(figsize=(8, 8))
# Circular mask: pixels farther than 130 px from the center (150, 150) get the
# value 255, which WordCloud treats as masked out
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
wc = WordCloud(
width=800,
height=800,
background_color="white",
min_font_size=6,
repeat=True,
mask=mask,
)
wc.generate(text)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title(f"Most Used Words in {Job_Category} Resume", fontsize=20)
sent = nlp(data["Resume_str"].iloc[0])
displacy.render(sent, style="ent", jupyter=True)
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})
# Register each job category string as an additional EntityRuler pattern
patterns = df["Category"].unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
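# Subsequent nlp() calls will now also tag these category strings as
# Job-Category entities.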
colors = {
"Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
"SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
"ORG": "#ffd966",
"PERSON": "#e06666",
"GPE": "#9fc5e8",
"DATE": "#c27ba0",
"ORDINAL": "#674ea7",
"PRODUCT": "#f9cb9c",
}
options = {
"ents": [
"Job-Category",
"SKILL",
"ORG",
"PERSON",
"GPE",
"DATE",
"ORDINAL",
"PRODUCT",
],
"colors": colors,
}
sent = nlp(data["Resume_str"].iloc[5])
displacy.render(sent, style="ent", jupyter=True, options=options)
# input_resume should hold the raw text of the resume to score; as a stand-in,
# reuse one resume from the dataset
input_resume = data["Resume_str"].iloc[5]
sent2 = nlp(input_resume)
displacy.render(sent2, style="ent", jupyter=True, options=options)
# input_skills is a comma-separated requirements string; these values are an
# arbitrary example
input_skills = "python,sql,machine learning"
req_skills = [s.strip() for s in input_skills.lower().split(",")]
resume_skills = unique_skills(get_skills(input_resume.lower()))
score = 0
for x in req_skills:
if x in resume_skills:
score += 1
req_skills_len = len(req_skills)
match = round(score / req_skills_len * 100, 1)
print(f"The current Resume is {match}% matched to your requirements")
print(resume_skills)
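# A small follow-up: list the required skills the resume does not cover
# (uses only names defined above)
missing_skills = [s for s in req_skills if s not in resume_skills]
print(f"Missing skills: {missing_skills}")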
docs = data["Clean_Resume"].values
dictionary = corpora.Dictionary(d.split() for d in docs)
bow = [dictionary.doc2bow(d.split()) for d in docs]
num_topics = 4
ldamodel = gensim.models.ldamodel.LdaModel(
    bow,
    num_topics=num_topics,
    id2word=dictionary,
    passes=50,
    minimum_probability=0,
)
ldamodel.print_topics(num_topics=num_topics)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, bow, dictionary)
vis
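# Optional: save the interactive topic visualization to a standalone HTML file
# ("lda_topics.html" is an arbitrary example path)
pyLDAvis.save_html(vis, "lda_topics.html")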