# Importing libraries: numpy, pandas, matplotlib, and warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Loading the data as a DataFrame: reading the CSV file into the variable resumeData
resumeData = pd.read_csv('resume_dataset.csv', encoding='utf-8')
# Creating a new column, structured_resume, to hold the cleaned text
resumeData['structured_resume'] = ''
resumeData.head()
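# Quick sanity check on the loaded data (an illustrative sketch; assumes the CSV
# provides the 'Category' and 'Resume' columns used throughout this walkthrough)
print(resumeData.shape)  # number of resumes and columns
print(resumeData.columns.tolist())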
# Printing the unique categories present in the resumes
print("Displaying the unique categories in the resumes")
print(resumeData['Category'].unique())
Displaying the unique categories in the resumes
['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
'Electrical Engineering' 'Operations Manager' 'Python Developer'
'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']
# Printing each unique resume category and the number of records in it
print("Displaying the unique categories of resumes and the number of records")
category_counts = resumeData['Category'].value_counts()
print(category_counts)
Displaying the unique categories of resumes and the number of records
Java Developer 84
Testing 70
DevOps Engineer 55
Python Developer 48
Web Designing 45
HR 44
Hadoop 42
Blockchain 40
Mechanical Engineer 40
Data Science 40
ETL Developer 40
Operations Manager 40
Sales 40
Arts 36
Database 33
PMO 30
Health and fitness 30
Electrical Engineering 30
Business Analyst 28
DotNet Developer 28
Automation Testing 26
Network Security Engineer 25
SAP Developer 24
Civil Engineer 24
Advocate 20
Name: Category, dtype: int64
# Importing seaborn and plotting the count of resumes per category
import seaborn as sns
plt.figure(figsize=(10,10))
ax = sns.countplot(x="Category", data=resumeData, palette="bright")
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.title("Category vs Count")
plt.tight_layout()
plt.show()
# Plotting a pie chart using matplotlib
from matplotlib.gridspec import GridSpec
targetCount = resumeData['Category'].value_counts()
targetLabel = targetCount.index  # keep labels aligned with the sorted counts
# Making a square figure and axes
plt.figure(1, figsize=(22, 22))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Wistia')
colors = [cmap(i) for i in np.linspace(0, 1, len(targetCount))]  # one shade per category
plt.subplot(the_grid[0, 1], aspect=1, title='Category Distribution')
source_pie = plt.pie(targetCount, labels=targetLabel, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
# Importing the re library
import re
# Function for cleaning the resume text
def clean_resume(text):
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    text = re.sub(r'@\S+', ' ', text)  # remove mentions
    text = re.sub(r'#\S+', ' ', text)  # remove hashtags (before punctuation strips the '#')
    text = re.sub(r'RT|cc', ' ', text)  # remove RT and cc
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""), ' ', text)  # remove punctuation
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # collapse extra whitespace
    return text
resumeData['structured_resume'] = resumeData['Resume'].apply(clean_resume)
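# Quick illustrative check that clean_resume behaves as expected on a sample string
sample = "RT @user check https://example.com #hiring, 5+ yrs Python!!"
print(clean_resume(sample))  # URLs, mentions, hashtags, and punctuation are stripped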
#Importing NLTK library
import nltk
from nltk.corpus import stopwords
import string
# Importing the WordCloud library
from wordcloud import WordCloud
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
# Cleaning the sentences and collecting the non-stopword tokens
Set_Of_StopWords = set(stopwords.words('english') + ['``', "''"])
total_Words =[]
Sentences = resumeData['Resume'].values
cleaned_Sentences = ""
for i in range(160):  # build the word-frequency corpus from the first 160 resumes
    cleanedText = clean_resume(Sentences[i])
    cleaned_Sentences += cleanedText
    requiredWords = nltk.word_tokenize(cleanedText)
    for word in requiredWords:
        if word not in Set_Of_StopWords and word not in string.punctuation:
            total_Words.append(word)
# Using nltk's FreqDist to find the most common words
wordfrequencydist = nltk.FreqDist(total_Words)
mostCommon = wordfrequencydist.most_common(50)
print(mostCommon)
[('Details', 484), ('Exprience', 446), ('months', 376), ('company', 330), ('description', 310), ('1', 290), ('year', 232), ('January', 216), ('Less', 204), ('Data', 200), ('data', 192), ('Skill', 166), ('Maharashtra', 166), ('6', 164), ('Python', 156), ('Science', 154), ('I', 146), ('Education', 142), ('College', 140), ('The', 126), ('project', 126), ('like', 126), ('Project', 124), ('Learning', 116), ('India', 114), ('Machine', 112), ('University', 112), ('Web', 106), ('using', 104), ('monthsCompany', 102), ('B', 98), ('C', 98), ('SQL', 96), ('time', 92), ('learning', 90), ('Mumbai', 90), ('Pune', 90), ('Arts', 90), ('A', 84), ('application', 84), ('Engineering', 78), ('24', 76), ('various', 76), ('Software', 76), ('Responsibilities', 76), ('Nagpur', 76), ('development', 74), ('Management', 74), ('projects', 74), ('Technologies', 72)]
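# Optional: the same frequencies as a bar chart (a matplotlib sketch)
words, counts = zip(*mostCommon[:20])
plt.figure(figsize=(12, 6))
plt.bar(words, counts)
plt.xticks(rotation=75, ha='right')
plt.title("Top 20 words in the cleaned resumes")
plt.tight_layout()
plt.show()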
# Plotting the word frequencies as a word cloud
word_cloud = WordCloud(background_color="white").generate(cleaned_Sentences)
plt.figure(figsize=(14,14))
plt.imshow(word_cloud,interpolation="bilinear")
plt.axis("off")
plt.show()
# Importing the scikit-learn modules used below
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
#Importing LabelEncoder from sklearn
from sklearn.preprocessing import LabelEncoder
# Converting the category labels into numeric values
var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    resumeData[i] = le.fit_transform(resumeData[i])
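# Optional: the numeric codes can be mapped back to the original category names
# via the fitted encoder (illustrative; le.classes_ holds the labels in encoded order)
print(dict(enumerate(le.classes_)))  # e.g. {0: 'Advocate', 1: 'Arts', ...}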
# Importing the helper for splitting into training and testing datasets
from sklearn.model_selection import train_test_split
#Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
required_Text = resumeData['structured_resume'].values
required_Target = resumeData['Category'].values
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=1500)
word_vectorizer.fit(required_Text)
WordFeatures = word_vectorizer.transform(required_Text)
print ("Feature completed")
Feature completed
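# Optional: inspect the vocabulary the vectorizer kept (a sketch;
# get_feature_names_out exists in scikit-learn >= 1.0, older versions use get_feature_names())
feature_names = word_vectorizer.get_feature_names_out()
print(len(feature_names))  # 1500, capped by max_features
print(feature_names[:10])  # first few vocabulary terms, alphabetical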
#Splitting training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(WordFeatures, required_Target, random_state=0, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
(769, 1500)
(193, 1500)
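# Note (optional): with 25 imbalanced classes, a stratified split keeps each
# category's proportion equal in the train and test sets. An alternative sketch,
# not used for the results below:
# X_train, X_test, y_train, y_test = train_test_split(
#     WordFeatures, required_Target, random_state=0, test_size=0.2,
#     stratify=required_Target)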
# Training the model and printing the classification report
# Here we wrap KNeighborsClassifier in a one-vs-rest classifier
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print("KNeighbors Classifier")
print('Accuracy on training dataset: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy on test dataset: {:.2f}'.format(clf.score(X_test, y_test)))
print(metrics.classification_report(y_test, prediction))
KNeighbors Classifier
Accuracy on training dataset: 0.99
Accuracy on test dataset: 0.99
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3
           2       1.00      0.80      0.89         5
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         6
           5       0.83      1.00      0.91         5
           6       1.00      1.00      1.00         9
           7       1.00      1.00      1.00         7
           8       1.00      0.91      0.95        11
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         8
          11       0.90      1.00      0.95         9
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         9
          14       1.00      1.00      1.00         7
          15       1.00      1.00      1.00        19
          16       1.00      1.00      1.00         3
          17       1.00      1.00      1.00         4
          18       1.00      1.00      1.00         5
          19       1.00      1.00      1.00         6
          20       1.00      1.00      1.00        11
          21       1.00      1.00      1.00         4
          22       1.00      1.00      1.00        13
          23       1.00      1.00      1.00        15
          24       1.00      1.00      1.00         8

    accuracy                           0.99       193
   macro avg       0.99      0.99      0.99       193
weighted avg       0.99      0.99      0.99       193
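# End-to-end prediction on a new, unseen resume (an illustrative sketch;
# new_resume is a hypothetical example string, not part of the dataset)
new_resume = "Experienced Python developer with Django, Flask and SQL skills"
vector = word_vectorizer.transform([clean_resume(new_resume)])
predicted_category = le.inverse_transform(clf.predict(vector))
print(predicted_category)  # e.g. ['Python Developer'], depending on the trained model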