import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Load dataset
def load_data(DATA_PATH):
    """Read the raw dataset CSV into a DataFrame.

    Parameters:
        DATA_PATH: path (str) to the dataset CSV file.

    Returns:
        pandas.DataFrame read with ``header=None`` (the file has no
        header row, so columns are numbered 0, 1, ...).
    """
    return pd.read_csv(DATA_PATH, header=None)
#sample_data/Roman Urdu DataSet.csv
DATA_PATH = 'Roman Urdu DataSet.csv'
roman_urdu_df = load_data(DATA_PATH)  # loading the data
roman_urdu_df.head()  # notebook-style peek; no effect when run as a plain script
roman_urdu_df = roman_urdu_df.drop([2], axis=1)  # drop the unused third column
roman_urdu_df.columns = ['Sentence', 'Response']  # name the two remaining columns
roman_urdu_df.head()
roman_urdu_df.isnull().sum()  # inspect missing values
roman_urdu_df.dropna(inplace = True)  # drop rows with missing text or label
roman_urdu_df.isnull().sum()
roman_urdu_df['Response'].value_counts()
# One row is labelled 'Neative' (a typo for 'Negative');
# inspect it and then repair the label in place.
roman_urdu_df[roman_urdu_df['Response'] == 'Neative']
roman_urdu_df.loc[roman_urdu_df['Response']=='Neative', 'Response'] = 'Negative'
roman_urdu_df['Response'].value_counts()
# Class sizes for the three sentiment labels.
positive = roman_urdu_df[roman_urdu_df['Response'] == 'Positive'].shape[0]
Negative = roman_urdu_df[roman_urdu_df['Response'] == 'Negative'].shape[0]
Neutral = roman_urdu_df[roman_urdu_df['Response'] == 'Neutral'].shape[0]
# Bar plot of the 3 classes (x positions 10/15/20, bar width 3).
plt.bar(10,positive,3, label="Positive",color=['green'])
plt.bar(15,Negative,3, label="Negative",color=['red'])
plt.bar(20,Neutral,3, label="Neutral",color=['blue'])
plt.legend()
plt.ylabel('count')
plt.title('Overall Sentiment Responses')
plt.show()
# Word count per sentence. NOTE(review): rsplit() with no args behaves
# like split() here; split() would be the conventional choice.
roman_urdu_df['len'] = roman_urdu_df['Sentence'].str.rsplit().str.len()
roman_urdu_df.groupby(['Response'], sort=False)['len'].mean().plot(kind='bar',title = "Average number of words per sentiment", xlabel="Sentiment",color=['g', 'b', 'r'])
roman_urdu_df.groupby(['Response'], sort=False)['len'].max().plot(kind='bar',title = "Maximum number of words per sentiment", xlabel="Sentiment",color=['g', 'b', 'r'])
# Word cloud over the full corpus (all sentences joined into one string).
text_l = roman_urdu_df['Sentence'].tolist()
text = " ".join(review for review in text_l)
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
print('Dataset size:',roman_urdu_df.shape)
print('Columns are:',roman_urdu_df.columns)
Dataset size: (20228, 3)
Columns are: Index(['Sentence', 'Response', 'len'], dtype='object')
roman_urdu_df.dtypes  # notebook-style inspection of column dtypes
# Work on a copy holding only the text and label columns.
df = pd.DataFrame(roman_urdu_df[['Sentence', 'Response']]) #converting into dataframe
string.punctuation  # notebook-style peek at the punctuation set used below
def remove_punct(text):
    """Remove punctuation characters and digit runs from *text*.

    Parameters:
        text: (str) one raw sentence.

    Returns:
        str: the sentence with every character in ``string.punctuation``
        deleted, then every run of digits deleted.
    """
    # Filter punctuation character-by-character (the original used the
    # confusing `for [char] in text` single-element unpacking).
    text = "".join(ch for ch in text if ch not in string.punctuation)
    # Strip digit runs; same non-raw pattern as the original.
    text = re.sub('[0-9]+', '', text)
    return text
# Remove punctuation and digits from every sentence.
df['Sentence_Removal'] = df['Sentence'].apply(lambda x: remove_punct(str(x)))
df.head(5) #testing to view transformed data
def convert_to_lower_case(text):
    """Lower-case *text*, dropping any punctuation characters.

    Parameters:
        text: (str) one sentence (already punctuation-free in this
        pipeline, but punctuation is filtered again for safety).

    Returns:
        str: the lower-cased, punctuation-free sentence.
    """
    # Same behavior as the original's `for [char] in text` unpacking,
    # written as a plain per-character generator.
    return "".join(ch.lower() for ch in text if ch not in string.punctuation)
df['Lower_Case'] = df['Sentence_Removal'].apply(lambda x: convert_to_lower_case(str(x))) # lower-case every cleaned sentence
df.head(5) #testing lower case of alphabet
new = df['Lower_Case'].str # NOTE(review): unused StringMethods accessor; 'new' is never read afterwards
# List of Roman-Urdu stop words.
# NOTE(review): the list contains duplicates ('lye', 'rha', 'waisay', 'ki',
# 'ya' appear twice) — harmless for membership tests, but could be deduped.
stopwords=['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh', 'rha',
'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch',
'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to','is', 'hi', 'jo', 'kya', 'thi', 'se', 'pe', 'phr', 'wala',
'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski', 'ne', 'haan', 'acha', 'nai',
'sent', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono', 'hoa', 'aese', 'de', 'wohi',
'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil',
'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou','h','je','or','jee','he','in','un','kay','ki','ya','ap','meri','me']
#some words like Mein - Ma - Maa
# after running clusters removed stopped words again
dictStopWords = {} # global: stop word -> removal count (filled by removeStopWordss)
forFastTextData = []  # global: tokenised cleaned sentences collected for fastText
def removeStopWordss(text):
    """Strip non-alphabetic characters and stop words from *text*.

    Parameters:
        text: one sentence (any type; coerced to str).

    Side effects:
        - increments the module-level ``dictStopWords`` counter for every
          stop word occurrence that is removed;
        - appends the cleaned token list to the module-level
          ``forFastTextData``.

    Returns:
        str: the cleaned sentence, tokens joined by single spaces.
    """
    text = re.sub('[^a-zA-Z]', ' ', str(text))  # keep letters only
    text = text.lower()
    kept = []
    for word in text.split():
        if word in stopwords:
            # Bug fix: the original called wordList.remove(word) while
            # iterating wordList, which skips the element right after every
            # removed stop word (and removes only the first occurrence).
            # Counting here assumes only removed stop words are tallied —
            # consistent with the "most common stopwords" analysis below.
            dictStopWords[word] = dictStopWords.get(word, 0) + 1
        else:
            kept.append(word)
    newSentence = " ".join(kept)
    forFastTextData.append(newSentence.split())
    return newSentence
df['Sen_Out_StopWord'] = df['Lower_Case'].apply(lambda x: removeStopWordss(x)) # sentences with stop words removed
df.head()
## checking for the most common stop words from the dictionary
import collections
from collections import Counter
dictGraph = {}
d = Counter(dictStopWords) # counter over stop-word removal counts
d.most_common(10)  # notebook-style peek
# Copy the 10 most frequent stop words into a plain dict for plotting.
for k,v in d.most_common(10):
    dictGraph[k] = v
dictGraph  # notebook-style peek
# Bar graph of the 10 most common stop words.
plt.bar(dictGraph.keys(), dictGraph.values(), align='center', color = 'green')
plt.title('Bargraph for Most Common StopWords')
plt.xlabel('StopWords')
plt.ylabel('Count')
plt.show()
def replacing_characters(word):
    """Normalise common Roman-Urdu spelling variants in *word*.

    Applies an ordered sequence of regex rewrites (collapse repeated
    letters, unify vowel variants such as ai/ae, u/o, map suffixes like
    -ay/-ey to -e, etc.). Order matters: later rules operate on the
    output of earlier ones.

    Parameters:
        word: a whitespace-joined string (a whole cleaned sentence in
        this pipeline, despite the name).

    Returns:
        str: the normalised string.
    """
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'aa+', r'aa', word)
    word = re.sub(r'e+', r'ee', word)
    word = re.sub(r'ai', r'ahi', word)  # e.g. "sahi and sai nahi"
    # (the original repeated the 'ai' -> 'ahi' rule twice; the second
    # application was a no-op and has been dropped)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    if not re.match(r'ar', word):
        # drop 'ar' -> 'r' unless the word starts with 'ar'
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Bug fix: the original searched the literal string 'word' (in quotes)
    # instead of the variable, so this branch could never fire.
    if re.search(r'[rst]y', word) and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[^a]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[a-z]h', word):
        word = re.sub(r'h', '', word)
    return word
df['stem'] = df['Sen_Out_StopWord'].apply(lambda x: replacing_characters(x)) # stemming applied after stop-word removal
df.head()
df['Stem_On_Original'] = df['Lower_Case'].apply(lambda x: replacing_characters(x)) # stemming applied without stop-word removal
df.head()
df.drop(columns=['Sentence_Removal']) # NOTE(review): result is discarded — df still contains the column
# Lower_Case column: punctuation, unique characters and digits removed, lower-cased
# Sen_Out_StopWord column: stop words removed
# stem column: stemming function applied after stop-word removal
# Stem_On_Original column: stemming function applied without stop-word removal
stats = df.drop(labels=["Sentence", "Lower_Case",], axis=1)
stats.describe()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy
def generateWordCloudForClusters(n_clusters,clustermTerms):
    """Render one word cloud per cluster.

    Parameters:
        n_clusters: number of clusters to draw.
        clustermTerms: list of {"cluster": id, "terms": word} records;
            entries are grouped by their "cluster" value.
    """
    for cluster_no in range(n_clusters):
        # Collect every term record belonging to this cluster.
        cluster_terms = [record.get('terms')
                         for record in clustermTerms
                         if record.get('cluster') == cluster_no]
        cloud = WordCloud(width = 500, height = 500, background_color='black',
                          random_state=10).generate(transformForWordCount(cluster_terms))
        plot_cloud(cloud)
def transformForWordCount(terms):
    """Join one cluster's terms into a single string for WordCloud.

    Parameters:
        terms: list of term strings belonging to one cluster.

    Returns:
        str: the terms separated by single spaces.
    """
    # " ".join replaces the original index-based copy loop (same output).
    return " ".join(terms)
def plot_cloud(wordcloud):
    '''Input parameter: wordcloud (an already-generated WordCloud object)
    Renders the word cloud on a 10x10-inch figure with the axes hidden.
    '''
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off");
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def plotKneeElbow(matrix,vale_for_range):
    """Plot the elbow curve (k-means inertia vs. k) to help pick a cluster count.

    Parameters:
        matrix: vectorised document-term matrix to cluster.
        vale_for_range: exclusive upper bound for candidate k values
            (candidates are 2 .. vale_for_range - 1).
    """
    candidate_ks = range(2, vale_for_range)
    # Fit a fresh model per k and record its within-cluster sum of squares.
    inertias = [
        KMeans(n_clusters=k, max_iter=500, n_init=10).fit(matrix).inertia_
        for k in candidate_ks
    ]
    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal clusters')
    plt.show()
wholedf = df  # alias only — NOTE(review): not a copy; both names refer to the same frame
# Shared TF-IDF vectorizer; re-fitted on each vectorFeatures() call.
vectorizer = TfidfVectorizer()
def vectorFeatures(dataSet):
    """TF-IDF-vectorise the 'Sen_Out_StopWord' column of *dataSet*.

    Parameters:
        dataSet: DataFrame slice (whole corpus or one sentiment subset)
            with a 'Sen_Out_StopWord' text column.

    Returns:
        (matrixForm, words): the sparse TF-IDF matrix and the vocabulary
        terms indexed by feature position.
    """
    matrixForm = vectorizer.fit_transform(dataSet['Sen_Out_StopWord'])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    try:
        words = vectorizer.get_feature_names_out()
    except AttributeError:
        words = vectorizer.get_feature_names()
    return matrixForm,words
def performKmeansClustering(n_clusters,max_iter,matrixForm):
    """Fit a verbose k-means model on a vectorised matrix.

    Parameters:
        n_clusters: number of clusters to fit.
        max_iter: maximum iterations per run.
        matrixForm: vectorised (TF-IDF) document-term matrix.

    Returns:
        The fitted KMeans model.
    """
    clusterer = KMeans(n_clusters=n_clusters, max_iter=max_iter, verbose=1)
    return clusterer.fit(matrixForm)
# Cluster the whole corpus: elbow plot first, then k-means with k=5.
matrixdfForm,words = vectorFeatures(wholedf)
plotKneeElbow(matrixdfForm,10)
modelKmeans = performKmeansClustering(5,500,matrixdfForm)
Initialization complete
Iteration 0, inertia 39464.91952387971
Iteration 1, inertia 19736.529300699673
Iteration 2, inertia 19699.614332907473
Iteration 3, inertia 19695.581065485247
Iteration 4, inertia 19694.93685748799
Iteration 5, inertia 19694.758586128377
Iteration 6, inertia 19694.692723584663
Iteration 7, inertia 19694.667489350337
Iteration 8, inertia 19694.65644209507
Iteration 9, inertia 19694.65299893773
Iteration 10, inertia 19694.652457123768
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 39133.35807931186
Iteration 1, inertia 19751.034792292696
Iteration 2, inertia 19743.273824804633
Iteration 3, inertia 19741.739038349013
Iteration 4, inertia 19740.282680556782
Iteration 5, inertia 19738.721435872205
Iteration 6, inertia 19737.992955016143
Iteration 7, inertia 19736.964608624014
Iteration 8, inertia 19736.4438698105
Iteration 9, inertia 19736.406865964924
Iteration 10, inertia 19736.38470255855
Iteration 11, inertia 19736.369765899908
Iteration 12, inertia 19736.36297773892
Iteration 13, inertia 19736.359583057867
Iteration 14, inertia 19736.35923782925
Iteration 15, inertia 19736.358899685827
Converged at iteration 15: strict convergence.
Initialization complete
Iteration 0, inertia 39309.526377127586
Iteration 1, inertia 19844.160056323464
Iteration 2, inertia 19821.040672551437
Iteration 3, inertia 19793.340721174733
Iteration 4, inertia 19778.66369600173
Iteration 5, inertia 19774.566874958873
Iteration 6, inertia 19773.540603500904
Iteration 7, inertia 19772.681411897353
Iteration 8, inertia 19771.458482332146
Iteration 9, inertia 19770.663516694975
Iteration 10, inertia 19770.646677777495
Iteration 11, inertia 19770.638404702415
Iteration 12, inertia 19770.633368150688
Iteration 13, inertia 19770.633035402614
Converged at iteration 13: strict convergence.
Initialization complete
Iteration 0, inertia 39458.764891141705
Iteration 1, inertia 19758.593938721533
Iteration 2, inertia 19730.57215804752
Iteration 3, inertia 19725.22272560077
Iteration 4, inertia 19723.722899563163
Iteration 5, inertia 19722.299925512187
Iteration 6, inertia 19720.93874112168
Iteration 7, inertia 19720.772561767757
Iteration 8, inertia 19720.759860121125
Iteration 9, inertia 19720.752173151857
Iteration 10, inertia 19720.748589273684
Iteration 11, inertia 19720.747551719367
Converged at iteration 11: strict convergence.
Initialization complete
Iteration 0, inertia 39378.40430178112
Iteration 1, inertia 19834.044877798166
Iteration 2, inertia 19782.661302112487
Iteration 3, inertia 19760.784949052733
Iteration 4, inertia 19751.67915633921
Iteration 5, inertia 19747.955158575718
Iteration 6, inertia 19746.807553061473
Iteration 7, inertia 19746.709786066567
Iteration 8, inertia 19746.67262421769
Iteration 9, inertia 19746.655410358933
Iteration 10, inertia 19746.642646635344
Iteration 11, inertia 19746.635159931757
Iteration 12, inertia 19746.62743307551
Iteration 13, inertia 19746.625821078687
Iteration 14, inertia 19746.624509947203
Iteration 15, inertia 19746.622828599822
Iteration 16, inertia 19746.6220819861
Iteration 17, inertia 19746.621714769757
Converged at iteration 17: strict convergence.
Initialization complete
Iteration 0, inertia 39284.23070747275
Iteration 1, inertia 19813.62639971561
Iteration 2, inertia 19784.5678554152
Iteration 3, inertia 19775.61551894443
Iteration 4, inertia 19770.153864715856
Iteration 5, inertia 19766.356373644485
Iteration 6, inertia 19763.268746926562
Iteration 7, inertia 19760.921183788603
Iteration 8, inertia 19759.75993487198
Iteration 9, inertia 19758.731641400776
Iteration 10, inertia 19758.210580140985
Iteration 11, inertia 19757.810514897228
Iteration 12, inertia 19756.946028443126
Iteration 13, inertia 19754.808598981297
Iteration 14, inertia 19753.247523025093
Iteration 15, inertia 19753.014074557872
Iteration 16, inertia 19752.503144356713
Iteration 17, inertia 19751.747798919547
Iteration 18, inertia 19750.96722526373
Iteration 19, inertia 19750.436981482722
Iteration 20, inertia 19750.10613481798
Iteration 21, inertia 19749.88556305881
Iteration 22, inertia 19749.395522413684
Iteration 23, inertia 19748.52774297249
Iteration 24, inertia 19747.886797980584
Iteration 25, inertia 19747.67613003122
Iteration 26, inertia 19747.543455911273
Iteration 27, inertia 19747.410402175785
Iteration 28, inertia 19747.303827283147
Iteration 29, inertia 19747.22624375544
Iteration 30, inertia 19747.143081115646
Iteration 31, inertia 19747.05127625431
Iteration 32, inertia 19746.949371104343
Iteration 33, inertia 19746.920842569147
Iteration 34, inertia 19746.912406952928
Iteration 35, inertia 19746.901135126216
Iteration 36, inertia 19746.88763509133
Iteration 37, inertia 19746.88184025604
Iteration 38, inertia 19746.877401113597
Iteration 39, inertia 19746.876803090214
Iteration 40, inertia 19746.875552112197
Iteration 41, inertia 19746.874818191085
Converged at iteration 41: strict convergence.
Initialization complete
Iteration 0, inertia 19980.0
Iteration 1, inertia 19909.04455179889
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 39429.08528586997
Iteration 1, inertia 19786.82378285647
Iteration 2, inertia 19761.084698263618
Iteration 3, inertia 19745.821394283634
Iteration 4, inertia 19732.313672760192
Iteration 5, inertia 19721.57724864382
Iteration 6, inertia 19715.204053676858
Iteration 7, inertia 19710.13012928767
Iteration 8, inertia 19707.06391581758
Iteration 9, inertia 19706.864130322778
Iteration 10, inertia 19706.83025105547
Iteration 11, inertia 19706.81598197696
Iteration 12, inertia 19706.805363467294
Iteration 13, inertia 19706.800773204526
Iteration 14, inertia 19706.800018657497
Converged at iteration 14: strict convergence.
Initialization complete
Iteration 0, inertia 39232.084409077695
Iteration 1, inertia 19847.629760862328
Iteration 2, inertia 19811.637892525127
Iteration 3, inertia 19768.06012137028
Iteration 4, inertia 19749.973048051233
Iteration 5, inertia 19743.07706694091
Iteration 6, inertia 19738.39898148222
Iteration 7, inertia 19734.761045948948
Iteration 8, inertia 19732.246286486323
Iteration 9, inertia 19730.942285756348
Iteration 10, inertia 19730.142588392788
Iteration 11, inertia 19729.651382533586
Iteration 12, inertia 19728.646129445497
Iteration 13, inertia 19726.297363057394
Iteration 14, inertia 19724.092558360524
Iteration 15, inertia 19721.25798445305
Iteration 16, inertia 19718.70615123572
Iteration 17, inertia 19718.510697194157
Iteration 18, inertia 19718.49685970273
Iteration 19, inertia 19718.48465038936
Iteration 20, inertia 19718.474169102094
Iteration 21, inertia 19718.464701500554
Converged at iteration 21: strict convergence.
Initialization complete
Iteration 0, inertia 39505.98657287062
Iteration 1, inertia 19772.693973841884
Iteration 2, inertia 19745.250760846528
Iteration 3, inertia 19736.070606489127
Iteration 4, inertia 19726.46057377655
Iteration 5, inertia 19722.55679055249
Iteration 6, inertia 19721.881813624612
Iteration 7, inertia 19721.226577173507
Iteration 8, inertia 19720.77976745177
Iteration 9, inertia 19720.55353391951
Iteration 10, inertia 19720.415880025444
Iteration 11, inertia 19720.291597000163
Iteration 12, inertia 19720.11166572816
Iteration 13, inertia 19719.76808826904
Iteration 14, inertia 19719.408179205642
Iteration 15, inertia 19719.241955503923
Iteration 16, inertia 19719.178211977654
Iteration 17, inertia 19719.10331737394
Iteration 18, inertia 19719.016180342474
Iteration 19, inertia 19718.913773308446
Iteration 20, inertia 19718.763038208966
Iteration 21, inertia 19718.6108678666
Iteration 22, inertia 19718.48459332001
Iteration 23, inertia 19718.368590630238
Iteration 24, inertia 19718.243406324418
Iteration 25, inertia 19718.09556412852
Iteration 26, inertia 19717.901029527762
Iteration 27, inertia 19717.525166285326
Iteration 28, inertia 19717.29919327221
Iteration 29, inertia 19717.236344958827
Iteration 30, inertia 19717.225420178464
Iteration 31, inertia 19717.22081570878
Iteration 32, inertia 19717.211626470784
Iteration 33, inertia 19717.177948207802
Iteration 34, inertia 19717.144894763605
Iteration 35, inertia 19717.123406962706
Iteration 36, inertia 19717.109464870366
Iteration 37, inertia 19717.094139196768
Iteration 38, inertia 19717.08394701262
Iteration 39, inertia 19717.06843745714
Iteration 40, inertia 19717.05589911266
Iteration 41, inertia 19717.031655943738
Iteration 42, inertia 19717.014634806448
Iteration 43, inertia 19717.008962254138
Converged at iteration 43: strict convergence.
labels = modelKmeans.labels_  # cluster id assigned to each document
# Term indices per cluster, sorted by descending centroid weight.
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("centers:", modelKmeans.cluster_centers_)
print("labels", labels)
print("intertia:", modelKmeans.inertia_)  # NOTE(review): 'intertia' typo in the output label
centers: [[2.45891398e-03 6.07174473e-05 9.70392545e-05 ... 2.40227532e-04
1.04988722e-05 0.00000000e+00]
[1.37775121e-03 0.00000000e+00 0.00000000e+00 ... 4.03382411e-04
0.00000000e+00 0.00000000e+00]
[2.41059223e-03 0.00000000e+00 0.00000000e+00 ... 9.81660073e-04
0.00000000e+00 0.00000000e+00]
[1.82024620e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[1.13858812e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 1.78381469e-04]]
labels [0 0 0 ... 0 0 0]
intertia: 19694.652457123768
n_clusters = 5
# Count how many documents fell into each cluster.
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    for label in labels:
        if label==i_cluster:
            texts_per_cluster[i_cluster] +=1
clusterDictionary={}
clustermTerms=[]  # list of {"cluster": id, "terms": word} records, consumed by generateWordCloudForClusters
print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
    # Top 10 vocabulary terms by centroid weight for this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster" : i_cluster,"terms" : words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t"+words[term])
Top words per cluster:
Cluster: 0 texts: 14392
ha
kia
nahi
ni
hahaha
bhai
bht
good
nhi
hai
Cluster: 1 texts: 850
allah
ameen
pak
salamat
ata
masha
ko
rakhy
farma
dy
Cluster: 2 texts: 826
ye
ha
kia
nahi
hai
ke
bhai
kon
he
baat
Cluster: 3 texts: 2112
ke
mein
aik
sath
the
kiya
nahi
bad
khan
leye
Cluster: 4 texts: 2048
mein
ke
aik
kiya
pakistan
khan
hasil
aur
ki
jis
n_clusters = 5
# Repeat the clustering pipeline on the Positive subset only.
positivedf = df[df['Response'] == 'Positive'] # rows labelled Positive
matrixForm,words = vectorFeatures(positivedf)
plotKneeElbow(matrixForm,20)
modelKmeans = performKmeansClustering(5,500,matrixForm)
Initialization complete
Iteration 0, inertia 11638.70242586698
Iteration 1, inertia 5918.231765076469
Iteration 2, inertia 5912.779323247318
Iteration 3, inertia 5909.458164931391
Iteration 4, inertia 5905.209134354751
Iteration 5, inertia 5898.485345650065
Iteration 6, inertia 5893.752087178129
Iteration 7, inertia 5889.890504252205
Iteration 8, inertia 5887.045430832817
Iteration 9, inertia 5886.080601435889
Iteration 10, inertia 5885.48015143132
Iteration 11, inertia 5885.433516343923
Iteration 12, inertia 5885.428245417689
Iteration 13, inertia 5885.424211706235
Iteration 14, inertia 5885.418479233584
Iteration 15, inertia 5885.417424161298
Iteration 16, inertia 5885.416178679513
Iteration 17, inertia 5885.414037916491
Iteration 18, inertia 5885.4118669801965
Iteration 19, inertia 5885.409829680953
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 11734.932091062374
Iteration 1, inertia 5921.481987676169
Iteration 2, inertia 5915.309587310876
Iteration 3, inertia 5913.618194909429
Iteration 4, inertia 5912.427306797081
Iteration 5, inertia 5912.001630453081
Iteration 6, inertia 5911.760579610323
Iteration 7, inertia 5911.477909868069
Iteration 8, inertia 5911.181744226007
Iteration 9, inertia 5910.966891159394
Iteration 10, inertia 5910.742629468331
Iteration 11, inertia 5910.515429090009
Iteration 12, inertia 5910.376324792047
Iteration 13, inertia 5910.277095705676
Iteration 14, inertia 5910.113739729431
Iteration 15, inertia 5909.839286831771
Iteration 16, inertia 5909.56373670623
Iteration 17, inertia 5909.3117207236655
Iteration 18, inertia 5909.212198656154
Iteration 19, inertia 5909.103499692147
Iteration 20, inertia 5909.040073871786
Iteration 21, inertia 5908.993974453955
Iteration 22, inertia 5908.940764158584
Iteration 23, inertia 5908.901898129075
Iteration 24, inertia 5908.861667369804
Iteration 25, inertia 5908.815816049354
Iteration 26, inertia 5908.796528192912
Iteration 27, inertia 5908.7737385811515
Iteration 28, inertia 5908.763654917698
Iteration 29, inertia 5908.755477449177
Iteration 30, inertia 5908.751590707737
Iteration 31, inertia 5908.737187466088
Iteration 32, inertia 5908.733463351095
Converged at iteration 32: strict convergence.
Initialization complete
Iteration 0, inertia 11497.438655378413
Iteration 1, inertia 5885.285248161624
Iteration 2, inertia 5873.885867702664
Iteration 3, inertia 5870.500946625707
Iteration 4, inertia 5868.783811865338
Iteration 5, inertia 5867.86456710237
Iteration 6, inertia 5867.194216698158
Iteration 7, inertia 5866.846633354333
Iteration 8, inertia 5866.564304563144
Iteration 9, inertia 5866.384705447043
Iteration 10, inertia 5866.203415622214
Iteration 11, inertia 5866.028743539231
Iteration 12, inertia 5866.003330873057
Iteration 13, inertia 5865.990961966805
Iteration 14, inertia 5865.981345302518
Iteration 15, inertia 5865.9768853426185
Iteration 16, inertia 5865.974676089983
Iteration 17, inertia 5865.9725236418235
Converged at iteration 17: strict convergence.
Initialization complete
Iteration 0, inertia 11639.671406206526
Iteration 1, inertia 5909.941105413439
Iteration 2, inertia 5891.142717309343
Iteration 3, inertia 5886.749377489325
Iteration 4, inertia 5884.919999552638
Iteration 5, inertia 5883.749602297031
Iteration 6, inertia 5883.426920130498
Iteration 7, inertia 5883.364470640962
Iteration 8, inertia 5883.34946290216
Iteration 9, inertia 5883.344398602182
Iteration 10, inertia 5883.340375693296
Iteration 11, inertia 5883.337438774463
Iteration 12, inertia 5883.332754995878
Iteration 13, inertia 5883.331827949402
Converged at iteration 13: strict convergence.
Initialization complete
Iteration 0, inertia 11723.36506633276
Iteration 1, inertia 5915.861150167836
Iteration 2, inertia 5907.539800562356
Iteration 3, inertia 5900.294964490317
Iteration 4, inertia 5895.015647707014
Iteration 5, inertia 5892.093880437664
Iteration 6, inertia 5890.54296358035
Iteration 7, inertia 5889.9275858487945
Iteration 8, inertia 5889.644874220706
Iteration 9, inertia 5889.507865254292
Iteration 10, inertia 5889.4180379317595
Iteration 11, inertia 5889.360033313932
Iteration 12, inertia 5889.343513411737
Iteration 13, inertia 5889.328594855136
Iteration 14, inertia 5889.305777439089
Iteration 15, inertia 5889.257240586189
Iteration 16, inertia 5889.112942049793
Iteration 17, inertia 5888.727770847838
Iteration 18, inertia 5888.4293982213785
Iteration 19, inertia 5888.40865360887
Iteration 20, inertia 5888.402623359306
Converged at iteration 20: strict convergence.
Initialization complete
Iteration 0, inertia 11501.586083466704
Iteration 1, inertia 5913.101515401837
Iteration 2, inertia 5893.07846135935
Iteration 3, inertia 5887.3517802674505
Iteration 4, inertia 5885.77130288727
Iteration 5, inertia 5884.475883980248
Iteration 6, inertia 5881.5728538205585
Iteration 7, inertia 5878.96018308709
Iteration 8, inertia 5877.140985913238
Iteration 9, inertia 5876.032023386732
Iteration 10, inertia 5875.586938985602
Iteration 11, inertia 5875.117407917631
Iteration 12, inertia 5874.275038651936
Iteration 13, inertia 5872.144578274599
Iteration 14, inertia 5869.680326139892
Iteration 15, inertia 5867.753649124899
Iteration 16, inertia 5861.64540049664
Iteration 17, inertia 5851.399616965806
Iteration 18, inertia 5850.277247719815
Iteration 19, inertia 5850.105932274929
Iteration 20, inertia 5850.058334576849
Iteration 21, inertia 5850.046389102763
Converged at iteration 21: strict convergence.
Initialization complete
Iteration 0, inertia 11669.449597911604
Iteration 1, inertia 5914.81459215129
Iteration 2, inertia 5896.409056434139
Iteration 3, inertia 5889.392296105472
Iteration 4, inertia 5885.990849830837
Iteration 5, inertia 5883.289600357054
Iteration 6, inertia 5881.278932111127
Iteration 7, inertia 5880.564954258311
Iteration 8, inertia 5879.898837353021
Iteration 9, inertia 5879.584580012188
Iteration 10, inertia 5879.523299428006
Iteration 11, inertia 5879.505769332775
Iteration 12, inertia 5879.494127767516
Converged at iteration 12: strict convergence.
Initialization complete
Iteration 0, inertia 11652.032810599107
Iteration 1, inertia 5901.99069908003
Iteration 2, inertia 5866.409069481149
Iteration 3, inertia 5859.730966084252
Iteration 4, inertia 5857.508184489947
Iteration 5, inertia 5856.73396995692
Iteration 6, inertia 5856.655877796674
Iteration 7, inertia 5856.63720778182
Iteration 8, inertia 5856.630737963067
Iteration 9, inertia 5856.62247525072
Iteration 10, inertia 5856.621537194215
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 11626.782489642372
Iteration 1, inertia 5900.938971909339
Iteration 2, inertia 5892.845862025027
Iteration 3, inertia 5887.627689307445
Iteration 4, inertia 5882.716379500174
Iteration 5, inertia 5879.47032224411
Iteration 6, inertia 5878.193746387502
Iteration 7, inertia 5877.934229283638
Iteration 8, inertia 5877.898276298322
Iteration 9, inertia 5877.878503464438
Iteration 10, inertia 5877.871669592799
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 11452.586928398776
Iteration 1, inertia 5907.8939313333885
Iteration 2, inertia 5882.986140897086
Iteration 3, inertia 5880.067907551267
Iteration 4, inertia 5879.75222814334
Iteration 5, inertia 5879.653063638107
Iteration 6, inertia 5879.595178177778
Iteration 7, inertia 5879.524669236001
Iteration 8, inertia 5879.504826475844
Iteration 9, inertia 5879.494041082983
Iteration 10, inertia 5879.485749766454
Iteration 11, inertia 5879.4838688787
Converged at iteration 11: strict convergence.
labels = modelKmeans.labels_  # cluster id per Positive document
# Term indices per cluster, sorted by descending centroid weight.
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("Positive centers:", modelKmeans.cluster_centers_)
print("Positive labels", labels)
print("Positive intertia:", modelKmeans.inertia_)  # NOTE(review): 'intertia' typo in the output label
Positive centers: [[0.00429722 0.00011493 0.00016658 ... 0.00112581 0.00041271 0. ]
[0. 0. 0. ... 0. 0. 0. ]
[0.00101542 0. 0. ... 0.0006425 0. 0.00041574]
[0.00169045 0. 0. ... 0.00111931 0. 0. ]
[0.00163406 0. 0. ... 0.00018646 0. 0. ]]
Positive labels [0 0 0 ... 0 0 0]
Positive intertia: 5850.046389102763
n_clusters = 5
# Count how many Positive documents fell into each cluster.
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    for label in labels:
        if label==i_cluster:
            texts_per_cluster[i_cluster] +=1
clusterDictionary={}
clustermTerms=[]  # {"cluster": id, "terms": word} records for the Positive word clouds
print("Top words per Positive cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
    # Top 10 vocabulary terms by centroid weight for this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster" : i_cluster,"terms" : words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t"+words[term])
Top words per Positive cluster:
Cluster: 0 texts: 3593
ha
bht
dua
achi
ye
bohat
love
mubarak
eid
bhai
Cluster: 1 texts: 86
good
very
qalandars
lahore
one
news
job
man
love
team
Cluster: 2 texts: 839
mein
ke
pakistan
khan
aik
kiya
hasil
ki
aur
jis
Cluster: 3 texts: 595
allah
ameen
pak
salamat
ata
ko
rakhy
madad
farma
lambi
Cluster: 4 texts: 900
ke
mein
sath
aik
the
nahi
kiya
film
leye
apne
generateWordCloudForClusters(5,clustermTerms)  # one word cloud per Positive cluster
# Repeat the clustering pipeline on the Negative subset with k=6.
negtivedf = df[df['Response'] == 'Negative']
matrixFormNeg,words = vectorFeatures(negtivedf)
plotKneeElbow(matrixFormNeg,10)
modelKmeansNeg = performKmeansClustering(6,500,matrixFormNeg)
Initialization complete
Iteration 0, inertia 10303.742734709333
Iteration 1, inertia 5228.594782623688
Iteration 2, inertia 5222.940893107798
Iteration 3, inertia 5218.864960117691
Iteration 4, inertia 5215.152362689033
Iteration 5, inertia 5213.330649645359
Iteration 6, inertia 5213.060218887514
Iteration 7, inertia 5212.927631473143
Iteration 8, inertia 5212.769203533682
Iteration 9, inertia 5212.591560064694
Iteration 10, inertia 5212.191306107012
Iteration 11, inertia 5211.844687594022
Iteration 12, inertia 5211.791756257817
Iteration 13, inertia 5211.766328932482
Iteration 14, inertia 5211.76105543869
Iteration 15, inertia 5211.755630868982
Iteration 16, inertia 5211.741302148792
Iteration 17, inertia 5211.731081248946
Iteration 18, inertia 5211.717502575058
Iteration 19, inertia 5211.712359653547
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 10305.249065873839
Iteration 1, inertia 5238.042740207974
Iteration 2, inertia 5229.759760326679
Iteration 3, inertia 5223.688141862334
Iteration 4, inertia 5220.491556682696
Iteration 5, inertia 5218.313707962636
Iteration 6, inertia 5217.516527478591
Iteration 7, inertia 5217.042734695239
Iteration 8, inertia 5216.511307053478
Iteration 9, inertia 5215.920017457348
Iteration 10, inertia 5215.310619368511
Iteration 11, inertia 5214.842462600434
Iteration 12, inertia 5214.606025345368
Iteration 13, inertia 5214.287694912614
Iteration 14, inertia 5214.136068074448
Iteration 15, inertia 5214.093081530752
Iteration 16, inertia 5214.089611736968
Converged at iteration 16: strict convergence.
Initialization complete
Iteration 0, inertia 10317.851248304221
Iteration 1, inertia 5223.86226870125
Iteration 2, inertia 5216.257169737982
Iteration 3, inertia 5214.082439276656
Iteration 4, inertia 5213.038166981094
Iteration 5, inertia 5211.696268543017
Iteration 6, inertia 5210.823116116533
Iteration 7, inertia 5210.436620037931
Iteration 8, inertia 5210.197074752983
Iteration 9, inertia 5209.804972714274
Iteration 10, inertia 5208.881224807817
Iteration 11, inertia 5208.045649338146
Iteration 12, inertia 5207.849669163953
Iteration 13, inertia 5207.792404567299
Iteration 14, inertia 5207.755144832485
Iteration 15, inertia 5207.704649332525
Iteration 16, inertia 5207.5663612106955
Iteration 17, inertia 5207.406056099841
Iteration 18, inertia 5207.323749141013
Iteration 19, inertia 5207.17603758917
Iteration 20, inertia 5206.8753075761915
Iteration 21, inertia 5206.633951004779
Iteration 22, inertia 5206.582185951139
Iteration 23, inertia 5206.563724840729
Iteration 24, inertia 5206.556179539833
Iteration 25, inertia 5206.553771549646
Converged at iteration 25: strict convergence.
Initialization complete
Iteration 0, inertia 10250.453972798969
Iteration 1, inertia 5225.637870373032
Iteration 2, inertia 5216.050066559699
Iteration 3, inertia 5213.1046334551365
Iteration 4, inertia 5212.070388331261
Iteration 5, inertia 5211.732786766746
Iteration 6, inertia 5211.601913921875
Iteration 7, inertia 5211.551008860261
Iteration 8, inertia 5211.504764607487
Iteration 9, inertia 5211.479623114431
Iteration 10, inertia 5211.442576378529
Iteration 11, inertia 5211.402330675882
Iteration 12, inertia 5211.362512410847
Iteration 13, inertia 5211.337531122896
Iteration 14, inertia 5211.31408430728
Iteration 15, inertia 5211.291031787827
Iteration 16, inertia 5211.236933079989
Iteration 17, inertia 5211.171574350635
Iteration 18, inertia 5211.105244297966
Iteration 19, inertia 5211.023080464815
Iteration 20, inertia 5210.933919405853
Iteration 21, inertia 5210.809806617233
Iteration 22, inertia 5210.667471836696
Iteration 23, inertia 5210.519617471342
Iteration 24, inertia 5210.3435732590615
Iteration 25, inertia 5210.08262954554
Iteration 26, inertia 5209.72980939678
Iteration 27, inertia 5209.246792529882
Iteration 28, inertia 5208.519197077622
Iteration 29, inertia 5207.758033674253
Iteration 30, inertia 5207.059345864303
Iteration 31, inertia 5206.215648588574
Iteration 32, inertia 5205.667721708727
Iteration 33, inertia 5205.5123646367665
Iteration 34, inertia 5205.429840347002
Iteration 35, inertia 5205.344737747916
Iteration 36, inertia 5205.260628476409
Iteration 37, inertia 5205.165797834137
Iteration 38, inertia 5205.096198241194
Iteration 39, inertia 5205.031081413257
Iteration 40, inertia 5204.947772913913
Iteration 41, inertia 5204.7126804439695
Iteration 42, inertia 5204.298688062699
Iteration 43, inertia 5203.938696827631
Iteration 44, inertia 5203.693308767637
Iteration 45, inertia 5203.647437174001
Iteration 46, inertia 5203.642250548399
Iteration 47, inertia 5203.639673865389
Iteration 48, inertia 5203.637201290469
Converged at iteration 48: strict convergence.
Initialization complete
Iteration 0, inertia 10273.245656616287
Iteration 1, inertia 5223.448619254626
Iteration 2, inertia 5214.289332771368
Iteration 3, inertia 5211.028380544402
Iteration 4, inertia 5209.386977774231
Iteration 5, inertia 5207.870229887036
Iteration 6, inertia 5204.9626745251135
Iteration 7, inertia 5202.674845919278
Iteration 8, inertia 5201.6953667971275
Iteration 9, inertia 5201.135149183095
Iteration 10, inertia 5200.615393680914
Iteration 11, inertia 5199.857968550326
Iteration 12, inertia 5197.953520713508
Iteration 13, inertia 5197.360596798279
Iteration 14, inertia 5197.292351381995
Iteration 15, inertia 5197.255952682909
Iteration 16, inertia 5197.246033065122
Converged at iteration 16: strict convergence.
Initialization complete
Iteration 0, inertia 10235.501631552635
Iteration 1, inertia 5209.124029636427
Iteration 2, inertia 5200.910781519787
Iteration 3, inertia 5197.87744328855
Iteration 4, inertia 5197.027809279148
Iteration 5, inertia 5196.571566288594
Iteration 6, inertia 5196.1031295272915
Iteration 7, inertia 5195.386746953562
Iteration 8, inertia 5194.389028247872
Iteration 9, inertia 5193.520426153572
Iteration 10, inertia 5193.309368635764
Iteration 11, inertia 5193.264225632266
Iteration 12, inertia 5193.2436520545425
Iteration 13, inertia 5193.217967645818
Iteration 14, inertia 5193.210578651943
Iteration 15, inertia 5193.208390102702
Iteration 16, inertia 5193.206219611384
Iteration 17, inertia 5193.20445771019
Converged at iteration 17: strict convergence.
Initialization complete
Iteration 0, inertia 10317.194588653037
Iteration 1, inertia 5230.199054039625
Iteration 2, inertia 5221.764316390704
Iteration 3, inertia 5219.409501549969
Iteration 4, inertia 5218.716894146067
Iteration 5, inertia 5217.799639736188
Iteration 6, inertia 5216.397304134611
Iteration 7, inertia 5214.9616841237175
Iteration 8, inertia 5213.913060257858
Iteration 9, inertia 5213.5736152170675
Iteration 10, inertia 5213.349814363319
Iteration 11, inertia 5213.049716593562
Iteration 12, inertia 5212.625205614983
Iteration 13, inertia 5209.856134815305
Iteration 14, inertia 5204.0372794795785
Iteration 15, inertia 5201.837296539975
Iteration 16, inertia 5201.661581909572
Iteration 17, inertia 5201.6523442268835
Converged at iteration 17: strict convergence.
Initialization complete
Iteration 0, inertia 10339.045820302412
Iteration 1, inertia 5224.982691116074
Iteration 2, inertia 5219.859187569986
Iteration 3, inertia 5219.2408211528855
Iteration 4, inertia 5219.032707656427
Iteration 5, inertia 5218.8087118932035
Iteration 6, inertia 5218.569898554157
Iteration 7, inertia 5218.470671121645
Iteration 8, inertia 5218.463278492839
Iteration 9, inertia 5218.459790343884
Converged at iteration 9: strict convergence.
Initialization complete
Iteration 0, inertia 10304.837440677344
Iteration 1, inertia 5220.387456677699
Iteration 2, inertia 5214.61435991898
Iteration 3, inertia 5213.118366589331
Iteration 4, inertia 5212.479639586547
Iteration 5, inertia 5211.724616174587
Iteration 6, inertia 5211.220262772481
Iteration 7, inertia 5211.0352022435745
Iteration 8, inertia 5210.820627472691
Iteration 9, inertia 5210.524172669818
Iteration 10, inertia 5209.476909648838
Iteration 11, inertia 5207.945636782648
Iteration 12, inertia 5207.8010916458425
Iteration 13, inertia 5207.773854359629
Iteration 14, inertia 5207.742235976801
Iteration 15, inertia 5207.715057333058
Iteration 16, inertia 5207.7019609885365
Iteration 17, inertia 5207.691103730526
Iteration 18, inertia 5207.679628200775
Iteration 19, inertia 5207.672028510706
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 10298.287690632675
Iteration 1, inertia 5223.797445507748
Iteration 2, inertia 5208.6473270841425
Iteration 3, inertia 5205.734410732107
Iteration 4, inertia 5205.19881323019
Iteration 5, inertia 5204.677157282844
Iteration 6, inertia 5203.922472624651
Iteration 7, inertia 5203.31576564724
Iteration 8, inertia 5202.681457234208
Iteration 9, inertia 5202.175940717498
Iteration 10, inertia 5201.88509623427
Iteration 11, inertia 5201.686613903385
Iteration 12, inertia 5201.565414806731
Iteration 13, inertia 5201.281485646082
Iteration 14, inertia 5201.137226207673
Iteration 15, inertia 5201.073957367497
Iteration 16, inertia 5200.928330644729
Iteration 17, inertia 5200.723350207023
Iteration 18, inertia 5200.445043173151
Iteration 19, inertia 5200.2846772841995
Iteration 20, inertia 5200.233503942458
Iteration 21, inertia 5200.224342509543
Iteration 22, inertia 5200.220582686008
Converged at iteration 22: strict convergence.
# Number of clusters used for this Negative-sentiment KMeans run.
n_clusters = 6
# Cluster assignment for each Negative-sentiment document.
labelsNeg = modelKmeansNeg.labels_
# Indices of the highest-weighted words in each cluster centroid,
# sorted in descending order of weight (most representative first).
ordered_words = modelKmeansNeg.cluster_centers_.argsort()[:, ::-1]
print("Negative centers:", modelKmeansNeg.cluster_centers_)
print("Negative labels", labelsNeg)
# Bug fix: the printed label read "intertia"; corrected to "inertia".
print("Negative inertia:", modelKmeansNeg.inertia_)
Negative centers: [[2.44393731e-03 1.24873348e-04 2.39689421e-04 ... 8.71932662e-05
8.02991001e-04 0.00000000e+00]
[1.68325905e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[7.36198146e-04 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[5.22551787e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[2.87047647e-03 0.00000000e+00 0.00000000e+00 ... 1.36079642e-03
0.00000000e+00 5.39346758e-04]]
Negative labels [0 0 4 ... 0 2 0]
Negative intertia: 5193.20445771019
# Count how many Negative-sentiment texts fall into each cluster.
# Bug fixes:
#  - use `np` (the alias imported at the top of the file) instead of the
#    bare name `numpy`;
#  - count `labelsNeg` (this model's labels) instead of the stale `labels`
#    variable left over from an earlier run — the pasted output shows the
#    same counts (3593/86/839/595/900, with cluster 5 at 0) for every
#    model, which is exactly the symptom of that bug.
texts_per_cluster = np.zeros(n_clusters)
for label in labelsNeg:
    texts_per_cluster[label] += 1
print("Top Negative words per 6 cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    # NOTE: the original line ended with a stray trailing comma (a no-op
    # tuple expression); removed.
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # Top 10 highest-weighted vocabulary words for this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])
Top Negative words per 6 cluster:
Cluster: 0 texts: 3593
nahi
nhi
allah
kia
ni
hai
mein
say
aunty
chor
Cluster: 1 texts: 86
ke
mein
aik
nahi
bad
lekin
khan
kiya
apni
pakistan
Cluster: 2 texts: 839
ha
ye
ma
ni
kia
had
rhi
sa
bat
nhi
Cluster: 3 texts: 595
lanat
per
py
is
pr
lakh
allah
beshumar
in
pay
Cluster: 4 texts: 900
ye
nahi
kia
nhi
hai
aap
log
phir
tak
bhai
Cluster: 5 texts: 0
to
police
jo
ma
ko
allah
nahi
ky
ni
log
# Re-cluster the Negative-sentiment TF-IDF matrix, this time with 5 clusters
# and up to 500 iterations (performKmeansClustering is defined earlier in the file).
modelKmeansNeg = performKmeansClustering(5,500,matrixFormNeg)
Initialization complete
Iteration 0, inertia 10367.971511064723
Iteration 1, inertia 5239.534797172505
Iteration 2, inertia 5230.689387471342
Iteration 3, inertia 5227.212839455675
Iteration 4, inertia 5224.360529174281
Iteration 5, inertia 5220.422724979715
Iteration 6, inertia 5218.446324920202
Iteration 7, inertia 5217.711833881779
Iteration 8, inertia 5217.121298855013
Iteration 9, inertia 5216.161992549762
Iteration 10, inertia 5214.354001528078
Iteration 11, inertia 5213.74749871841
Iteration 12, inertia 5213.584816019727
Iteration 13, inertia 5213.497971419185
Iteration 14, inertia 5213.413220222515
Iteration 15, inertia 5213.207878172583
Iteration 16, inertia 5212.815765642439
Iteration 17, inertia 5212.185019205774
Iteration 18, inertia 5211.390935836019
Iteration 19, inertia 5210.536037117721
Iteration 20, inertia 5209.779204358627
Iteration 21, inertia 5209.328168274374
Iteration 22, inertia 5209.212841575734
Iteration 23, inertia 5209.100748383987
Iteration 24, inertia 5208.931508713822
Iteration 25, inertia 5208.788960619349
Iteration 26, inertia 5208.7666804203855
Iteration 27, inertia 5208.753343856375
Iteration 28, inertia 5208.723818650422
Iteration 29, inertia 5208.7073575515415
Iteration 30, inertia 5208.6980003306735
Iteration 31, inertia 5208.693853937419
Iteration 32, inertia 5208.6904745716465
Iteration 33, inertia 5208.684658577126
Iteration 34, inertia 5208.681389770343
Converged at iteration 34: strict convergence.
Initialization complete
Iteration 0, inertia 10286.01044531237
Iteration 1, inertia 5230.008944779925
Iteration 2, inertia 5217.75215467166
Iteration 3, inertia 5210.818956653625
Iteration 4, inertia 5207.925960803595
Iteration 5, inertia 5206.26414218851
Iteration 6, inertia 5204.696815826161
Iteration 7, inertia 5203.488770090054
Iteration 8, inertia 5201.647279097611
Iteration 9, inertia 5201.03245277984
Iteration 10, inertia 5200.922415394078
Iteration 11, inertia 5200.891886144676
Iteration 12, inertia 5200.874723365965
Iteration 13, inertia 5200.869485321099
Iteration 14, inertia 5200.865904981607
Iteration 15, inertia 5200.862482812042
Iteration 16, inertia 5200.856123032369
Iteration 17, inertia 5200.848139668265
Iteration 18, inertia 5200.841795567329
Iteration 19, inertia 5200.831895318417
Iteration 20, inertia 5200.82595556204
Iteration 21, inertia 5200.8202324989925
Iteration 22, inertia 5200.812635512074
Iteration 23, inertia 5200.806981294917
Iteration 24, inertia 5200.80313839146
Iteration 25, inertia 5200.80122149794
Iteration 26, inertia 5200.79768213062
Converged at iteration 26: strict convergence.
Initialization complete
Iteration 0, inertia 10306.321891814961
Iteration 1, inertia 5239.944513610131
Iteration 2, inertia 5236.175496961054
Iteration 3, inertia 5232.916227546449
Iteration 4, inertia 5230.967842211162
Iteration 5, inertia 5230.069964950288
Iteration 6, inertia 5229.023332793315
Iteration 7, inertia 5227.776685058156
Iteration 8, inertia 5227.574390443106
Iteration 9, inertia 5227.4162668711215
Iteration 10, inertia 5227.278070102368
Iteration 11, inertia 5227.14492974001
Iteration 12, inertia 5227.0325149052205
Iteration 13, inertia 5226.890115469238
Iteration 14, inertia 5226.66592379859
Iteration 15, inertia 5226.445131799424
Iteration 16, inertia 5226.150485283455
Iteration 17, inertia 5225.8321542789545
Iteration 18, inertia 5225.704835600837
Iteration 19, inertia 5225.582510994679
Iteration 20, inertia 5225.482528577921
Iteration 21, inertia 5225.425986652441
Iteration 22, inertia 5225.372977282373
Iteration 23, inertia 5225.292693584771
Iteration 24, inertia 5225.059142301402
Iteration 25, inertia 5225.020119230901
Iteration 26, inertia 5225.0153490488
Iteration 27, inertia 5225.013021474682
Converged at iteration 27: strict convergence.
Initialization complete
Iteration 0, inertia 10348.934768036026
Iteration 1, inertia 5223.748605897562
Iteration 2, inertia 5216.385316826969
Iteration 3, inertia 5213.890952542815
Iteration 4, inertia 5212.836573030191
Iteration 5, inertia 5212.022269569161
Iteration 6, inertia 5211.89759366832
Iteration 7, inertia 5211.881619152217
Iteration 8, inertia 5211.8749841474155
Iteration 9, inertia 5211.866150537964
Iteration 10, inertia 5211.862008254057
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 10363.744262175549
Iteration 1, inertia 5231.5968234560005
Iteration 2, inertia 5224.4524943628
Iteration 3, inertia 5219.899769850876
Iteration 4, inertia 5218.02153302897
Iteration 5, inertia 5217.69454768609
Iteration 6, inertia 5217.607249580759
Iteration 7, inertia 5217.489340096163
Iteration 8, inertia 5217.37221045002
Iteration 9, inertia 5217.283108799347
Iteration 10, inertia 5217.186300368063
Iteration 11, inertia 5217.108491326713
Iteration 12, inertia 5217.03111302197
Iteration 13, inertia 5216.9639787116275
Iteration 14, inertia 5216.84349129743
Iteration 15, inertia 5216.578110572913
Iteration 16, inertia 5216.467782177652
Iteration 17, inertia 5216.4256984962
Iteration 18, inertia 5216.41752521352
Converged at iteration 18: strict convergence.
Initialization complete
Iteration 0, inertia 10302.17949711364
Iteration 1, inertia 5221.3613954950415
Iteration 2, inertia 5217.371627414223
Iteration 3, inertia 5216.172983583581
Iteration 4, inertia 5215.564328914317
Iteration 5, inertia 5213.987082541497
Iteration 6, inertia 5212.785964675642
Iteration 7, inertia 5211.720287678367
Iteration 8, inertia 5210.872245731505
Iteration 9, inertia 5209.276732543929
Iteration 10, inertia 5205.680930044925
Iteration 11, inertia 5203.192233050764
Iteration 12, inertia 5203.019926236116
Iteration 13, inertia 5202.967200630625
Iteration 14, inertia 5202.940715308809
Iteration 15, inertia 5202.938905430869
Iteration 16, inertia 5202.9371190440625
Converged at iteration 16: strict convergence.
Initialization complete
Iteration 0, inertia 10294.998796804472
Iteration 1, inertia 5231.8712095109195
Iteration 2, inertia 5228.5282424637535
Iteration 3, inertia 5226.2424843255385
Iteration 4, inertia 5224.007273348677
Iteration 5, inertia 5221.9493239110125
Iteration 6, inertia 5219.934303445682
Iteration 7, inertia 5218.999732615921
Iteration 8, inertia 5218.559341485942
Iteration 9, inertia 5218.138410531308
Iteration 10, inertia 5217.769404014033
Iteration 11, inertia 5217.503281816122
Iteration 12, inertia 5217.455632766076
Iteration 13, inertia 5217.4139582940825
Iteration 14, inertia 5217.383765834922
Iteration 15, inertia 5217.373754130002
Iteration 16, inertia 5217.365188503025
Iteration 17, inertia 5217.3611078353915
Iteration 18, inertia 5217.357072579158
Iteration 19, inertia 5217.354921637761
Iteration 20, inertia 5217.353199858792
Iteration 21, inertia 5217.350821511714
Converged at iteration 21: strict convergence.
Initialization complete
Iteration 0, inertia 10309.242917688007
Iteration 1, inertia 5224.388672121628
Iteration 2, inertia 5219.373623364031
Iteration 3, inertia 5216.711591897442
Iteration 4, inertia 5214.19771106765
Iteration 5, inertia 5212.614781512186
Iteration 6, inertia 5211.759263254629
Iteration 7, inertia 5210.603793744328
Iteration 8, inertia 5209.554571957865
Iteration 9, inertia 5208.7713131358705
Iteration 10, inertia 5208.4737974696445
Iteration 11, inertia 5208.441762590871
Iteration 12, inertia 5208.439931177979
Converged at iteration 12: strict convergence.
Initialization complete
Iteration 0, inertia 10371.211675924033
Iteration 1, inertia 5229.039202517562
Iteration 2, inertia 5220.170037568129
Iteration 3, inertia 5214.480162610042
Iteration 4, inertia 5213.10554971685
Iteration 5, inertia 5212.992394089052
Iteration 6, inertia 5212.965538031837
Iteration 7, inertia 5212.958639504129
Iteration 8, inertia 5212.954719181341
Converged at iteration 8: strict convergence.
Initialization complete
Iteration 0, inertia 10334.150963248148
Iteration 1, inertia 5238.729508322428
Iteration 2, inertia 5225.136174599514
Iteration 3, inertia 5220.887064102745
Iteration 4, inertia 5218.8006508101535
Iteration 5, inertia 5216.59467526428
Iteration 6, inertia 5215.810898278978
Iteration 7, inertia 5215.621653731604
Iteration 8, inertia 5215.48333551108
Iteration 9, inertia 5215.348034338277
Iteration 10, inertia 5215.190517682781
Iteration 11, inertia 5215.03686625785
Iteration 12, inertia 5214.925115732293
Iteration 13, inertia 5214.810632031096
Iteration 14, inertia 5214.694364269147
Iteration 15, inertia 5214.59199822614
Iteration 16, inertia 5214.51998846058
Iteration 17, inertia 5214.4617559592425
Iteration 18, inertia 5214.43246432263
Iteration 19, inertia 5214.410157353982
Iteration 20, inertia 5214.381938541032
Iteration 21, inertia 5214.347195483555
Iteration 22, inertia 5214.307133480167
Iteration 23, inertia 5214.259613347713
Iteration 24, inertia 5214.20440997806
Iteration 25, inertia 5214.119188682478
Iteration 26, inertia 5214.0559740154185
Iteration 27, inertia 5214.01912146584
Iteration 28, inertia 5213.984644499517
Iteration 29, inertia 5213.970701416756
Iteration 30, inertia 5213.96541462727
Iteration 31, inertia 5213.952079151563
Iteration 32, inertia 5213.923219214851
Iteration 33, inertia 5213.862535939089
Iteration 34, inertia 5213.762778527236
Iteration 35, inertia 5213.725264879958
Iteration 36, inertia 5213.638069967985
Iteration 37, inertia 5213.434213337731
Iteration 38, inertia 5213.123505105519
Iteration 39, inertia 5213.01390448643
Iteration 40, inertia 5212.887127638456
Iteration 41, inertia 5212.810242893085
Iteration 42, inertia 5212.774182654379
Iteration 43, inertia 5212.739641998099
Iteration 44, inertia 5212.654865718542
Iteration 45, inertia 5212.521763001346
Iteration 46, inertia 5212.479294216212
Iteration 47, inertia 5212.445947216524
Iteration 48, inertia 5212.393604661411
Iteration 49, inertia 5212.325145313041
Iteration 50, inertia 5212.124062329532
Iteration 51, inertia 5211.89674786299
Iteration 52, inertia 5211.828097458641
Iteration 53, inertia 5211.777708929287
Iteration 54, inertia 5211.737688600058
Iteration 55, inertia 5211.676175007943
Iteration 56, inertia 5211.438702045662
Iteration 57, inertia 5210.759907826835
Iteration 58, inertia 5210.607125570732
Converged at iteration 58: strict convergence.
# Cluster assignment for each Negative-sentiment document (5-cluster model).
labelsNeg = modelKmeansNeg.labels_
# Indices of the highest-weighted words per cluster centroid, best first.
ordered_words = modelKmeansNeg.cluster_centers_.argsort()[:, ::-1]
print("Negative centers:", modelKmeansNeg.cluster_centers_)
print("Negative labels", labelsNeg)
# Bug fix: the printed label read "intertia"; corrected to "inertia".
print("Negative inertia:", modelKmeansNeg.inertia_)
Negative centers: [[2.60910141e-03 1.13785035e-04 2.18405845e-04 ... 1.67995784e-04
7.31688229e-04 3.50944808e-05]
[1.64136242e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[2.84553647e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[1.68766804e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]]
Negative labels [0 0 0 ... 0 4 0]
Negative intertia: 5200.79768213062
# Number of clusters for the second Negative-sentiment run.
n_clusters = 5
# Count how many Negative texts fall into each cluster.
# Bug fixes: `np` instead of the bare name `numpy`, and `labelsNeg`
# (this model's labels) instead of the stale `labels` variable — the
# pasted output repeats the counts of an earlier run verbatim.
texts_per_cluster = np.zeros(n_clusters)
for label in labelsNeg:
    texts_per_cluster[label] += 1
print("Top Negative words per cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # Top 10 highest-weighted vocabulary words for this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])
Top Negative words per cluster:
Cluster: 0 texts: 3593
ye
nahi
nhi
allah
ni
hai
mein
aunty
chor
bhai
Cluster: 1 texts: 86
ke
mein
aik
nahi
bad
lekin
khan
kiya
pakistan
apni
Cluster: 2 texts: 839
lanat
per
py
is
pr
lakh
allah
beshumar
in
pay
Cluster: 3 texts: 595
kia
ye
ni
nhi
nahi
ha
sath
tum
karo
rha
Cluster: 4 texts: 900
ha
ye
ma
ni
rhi
had
nahi
sa
mulk
band
# Render one word cloud per Negative cluster from the collected top terms.
generateWordCloudForClusters(n_clusters,clustermTerms)
# Build the Neutral-sentiment subset and its TF-IDF feature matrix.
neutraldf = df[df['Response'] == 'Neutral']
matrixFormNeu, words = vectorFeatures(neutraldf)
# Bug fix (copy-paste): the knee/elbow plot for the Neutral section must be
# computed on the Neutral matrix, not the Negative one from the previous
# section. Original passed matrixFormNeg here.
plotKneeElbow(matrixFormNeu, 10)
# Cluster the Neutral matrix: 5 clusters, up to 500 iterations.
modelKmeansNeu = performKmeansClustering(5, 500, matrixFormNeu)
Initialization complete
Iteration 0, inertia 8685.687452074248
Iteration 1, inertia 8655.363352379314
Iteration 2, inertia 8653.055069049224
Iteration 3, inertia 8651.746771326118
Iteration 4, inertia 8650.818982882623
Iteration 5, inertia 8650.595785944419
Converged at iteration 5: strict convergence.
Initialization complete
Iteration 0, inertia 17302.903026929635
Iteration 1, inertia 8603.237285790658
Iteration 2, inertia 8593.058516107822
Iteration 3, inertia 8576.206764985352
Iteration 4, inertia 8571.977658034217
Iteration 5, inertia 8571.212077284123
Iteration 6, inertia 8570.857859733043
Iteration 7, inertia 8570.433901575316
Iteration 8, inertia 8570.174009576494
Iteration 9, inertia 8570.161844057306
Iteration 10, inertia 8570.154602398767
Iteration 11, inertia 8570.152880949623
Iteration 12, inertia 8570.151206235027
Iteration 13, inertia 8570.150406657602
Converged at iteration 13: strict convergence.
Initialization complete
Iteration 0, inertia 17333.388878280406
Iteration 1, inertia 8624.46195840096
Iteration 2, inertia 8595.066631143636
Iteration 3, inertia 8581.398590614883
Iteration 4, inertia 8575.217585078615
Iteration 5, inertia 8571.851982194288
Iteration 6, inertia 8569.585183668647
Iteration 7, inertia 8569.150322193667
Iteration 8, inertia 8569.120147087579
Iteration 9, inertia 8569.114420601632
Iteration 10, inertia 8569.112046676899
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 17332.16541218326
Iteration 1, inertia 8607.126094802488
Iteration 2, inertia 8599.091627577483
Iteration 3, inertia 8593.426785293192
Iteration 4, inertia 8588.639117253193
Iteration 5, inertia 8585.841332164271
Iteration 6, inertia 8584.178763870148
Iteration 7, inertia 8582.99176691958
Iteration 8, inertia 8581.587994413305
Iteration 9, inertia 8580.81399508142
Iteration 10, inertia 8579.995717256817
Iteration 11, inertia 8579.394337228823
Iteration 12, inertia 8578.90277738711
Iteration 13, inertia 8578.032246119023
Iteration 14, inertia 8577.256095697618
Iteration 15, inertia 8576.855467373332
Iteration 16, inertia 8575.647633447099
Iteration 17, inertia 8574.476771392712
Iteration 18, inertia 8574.37568075778
Iteration 19, inertia 8574.37179917862
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 17342.71640745492
Iteration 1, inertia 8578.2390374423
Iteration 2, inertia 8567.283118986383
Iteration 3, inertia 8565.28979473164
Iteration 4, inertia 8565.067099348305
Iteration 5, inertia 8564.99611836088
Iteration 6, inertia 8564.980694213735
Iteration 7, inertia 8564.978260355128
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 17309.46998069188
Iteration 1, inertia 8635.4973197397
Iteration 2, inertia 8623.316195428777
Iteration 3, inertia 8618.211856255228
Iteration 4, inertia 8615.058873961314
Iteration 5, inertia 8612.835130443478
Iteration 6, inertia 8610.830512051452
Iteration 7, inertia 8609.163787076759
Iteration 8, inertia 8606.907958667267
Iteration 9, inertia 8601.332070545672
Iteration 10, inertia 8599.685095021097
Iteration 11, inertia 8599.243843664262
Iteration 12, inertia 8599.036228265859
Iteration 13, inertia 8599.0021407079
Iteration 14, inertia 8598.953475058774
Iteration 15, inertia 8598.84843804985
Iteration 16, inertia 8598.723849277727
Iteration 17, inertia 8598.396150421398
Iteration 18, inertia 8593.746528971084
Iteration 19, inertia 8572.522701769496
Iteration 20, inertia 8566.545576926672
Iteration 21, inertia 8564.973766901783
Iteration 22, inertia 8564.551739944789
Iteration 23, inertia 8564.309776741731
Iteration 24, inertia 8564.204156151141
Iteration 25, inertia 8564.145357860312
Iteration 26, inertia 8564.12661077553
Iteration 27, inertia 8564.120540227239
Iteration 28, inertia 8564.117026724636
Iteration 29, inertia 8564.11470634407
Converged at iteration 29: strict convergence.
Initialization complete
Iteration 0, inertia 17200.44876962317
Iteration 1, inertia 8580.178267297933
Iteration 2, inertia 8566.107095103993
Iteration 3, inertia 8562.849475093644
Iteration 4, inertia 8561.411618754193
Iteration 5, inertia 8561.259609602392
Iteration 6, inertia 8561.25397803743
Iteration 7, inertia 8561.252272527105
Iteration 8, inertia 8561.251439978687
Converged at iteration 8: strict convergence.
Initialization complete
Iteration 0, inertia 17241.67267523423
Iteration 1, inertia 8586.938095276511
Iteration 2, inertia 8578.829389204566
Iteration 3, inertia 8575.510287592742
Iteration 4, inertia 8573.765332174638
Iteration 5, inertia 8573.113374611059
Iteration 6, inertia 8572.905032113675
Iteration 7, inertia 8572.85347038085
Iteration 8, inertia 8572.825979392175
Iteration 9, inertia 8572.807076904379
Iteration 10, inertia 8572.796323930743
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 17294.083268623872
Iteration 1, inertia 8594.451453498988
Iteration 2, inertia 8578.235396309048
Iteration 3, inertia 8576.447045061324
Iteration 4, inertia 8576.106224243124
Iteration 5, inertia 8575.962907194373
Iteration 6, inertia 8575.8074437341
Iteration 7, inertia 8575.649721211583
Iteration 8, inertia 8575.34520200186
Iteration 9, inertia 8575.087985160564
Iteration 10, inertia 8574.873245456218
Iteration 11, inertia 8574.747605668716
Iteration 12, inertia 8574.558826425933
Iteration 13, inertia 8574.475141218318
Iteration 14, inertia 8574.44863705675
Iteration 15, inertia 8574.435453055223
Iteration 16, inertia 8574.432635408344
Converged at iteration 16: strict convergence.
Initialization complete
Iteration 0, inertia 17188.36985932812
Iteration 1, inertia 8610.689374629605
Iteration 2, inertia 8586.609962314269
Iteration 3, inertia 8579.968671582134
Iteration 4, inertia 8577.60181506925
Iteration 5, inertia 8576.369422922086
Iteration 6, inertia 8574.68519981899
Iteration 7, inertia 8573.213012384182
Iteration 8, inertia 8573.153032528946
Converged at iteration 8: strict convergence.
# Cluster assignment for each Neutral-sentiment document.
labelsNeu = modelKmeansNeu.labels_
# Bug fix (copy-paste): rank words by the *Neutral* model's centroids.
# The original used modelKmeansNeg here, so the Negative model's ordering
# was indexed into the Neutral vocabulary — which is why the pasted
# "Top Neutral words" output lists unrelated words for every cluster.
ordered_words = modelKmeansNeu.cluster_centers_.argsort()[:, ::-1]
print("Neutral centers:", modelKmeansNeu.cluster_centers_)
print("Neutral labels", labelsNeu)
# Bug fix: the printed label read "intertia"; corrected to "inertia".
print("Neutral inertia:", modelKmeansNeu.inertia_)
Neutral centers: [[1.75633321e-03 7.11455203e-05 1.43081986e-04 ... 5.32167572e-05
5.61054612e-05 4.84129607e-04]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[2.35210771e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[1.43402982e-03 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
0.00000000e+00 2.02042867e-03]]
Neutral labels [0 0 0 ... 0 0 0]
Neutral intertia: 8561.251439978687
# Count how many Neutral-sentiment texts fall into each cluster.
# Bug fixes: `np` instead of the bare name `numpy`, and `labelsNeu`
# (the Neutral model's labels) instead of the stale `labels` variable —
# the pasted output repeats the Negative section's counts verbatim.
texts_per_cluster = np.zeros(n_clusters)
for label in labelsNeu:
    texts_per_cluster[label] += 1
print("Top Neutral words per cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # Top 10 highest-weighted vocabulary words for this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])
Top Neutral words per cluster:
Cluster: 0 texts: 3593
wly
naysirf
notifications
alegre
now
haijhagra
miss
asifiya
chupo
bharhay
Cluster: 1 texts: 86
khandanon
miss
ahrram
naysirf
azazi
lone
khota
ksr
parties
angrez
Cluster: 2 texts: 839
lgany
pic
qoumiyyat
italian
pure
leony
alegre
betey
interupted
peyda
Cluster: 3 texts: 595
krdiaa
wly
now
notifications
naysirf
hahaahahahahaha
sedhi
treaty
kehta
root
Cluster: 4 texts: 900
hahaahahahahaha
wly
malaysia
now
rosterm
hahahahahahahahahahahahah
naysirf
sahi
mutamdi
bakrio
# Render one word cloud per Neutral cluster from the collected top terms.
generateWordCloudForClusters(n_clusters,clustermTerms)
# Train FastText word embeddings on the tokenized corpus and probe them.
# NOTE(review): `forFastTextData` is built earlier in the file; presumably a
# list of token lists (one per sentence) — confirm against its construction.
from gensim.models import FastText
#from gensim.test.utils import common_texts # some example sentences
# 15-dimensional vectors, context window of 4, keep every word (min_count=1).
model = FastText(vector_size=15, window=4, min_count=1) # instantiate
model.build_vocab(corpus_iterable=forFastTextData)
model.train(corpus_iterable=forFastTextData, total_examples=len(forFastTextData), epochs=10)
# Sanity-check the embeddings: nearest neighbours of two Roman Urdu words...
model.wv.most_similar("insan")
model.wv.most_similar("larki")
# ...and cosine similarity between two word pairs.
model.wv.similarity(w1 = 'khushi',w2 = 'achi')
model.wv.similarity(w1 = 'waqt',w2 = 'gahri')
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
### We can use this function to do cross-validation
# use for both naive bayes and SVM
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
def evaluate_cross_validation(clf, X, y, K):
    """Run K-fold cross-validation for ``clf`` on (X, y) and print the results.

    Uses a shuffled KFold with a fixed seed (reproducible splits) and the
    estimator's default scorer — accuracy for classifiers. Prints the
    per-fold scores followed by their mean and standard error.
    """
    fold_iterator = KFold(n_splits=K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=fold_iterator)
    print(scores)
    print("Mean score: %.3f (+/-%.3f)" % (np.mean(scores), sem(scores)))
def calc_params(X, y, clf, param_values, param_name, K):
    """Sweep one hyper-parameter of *clf* and report K-fold train/test accuracy.

    For every value in *param_values* the parameter *param_name* is set on the
    classifier, K-fold cross-validation is run, and the mean train/test
    accuracies are printed and collected. A semilog plot of both curves is
    drawn at the end.

    Returns
    -------
    (train_scores, test_scores) : two numpy arrays, one mean score per
    parameter value.
    """
    # Work on plain numpy arrays so fold index arrays can be used directly.
    X = np.array(X)
    y = np.array(y)

    n_params = len(param_values)
    train_scores = np.zeros(n_params)
    test_scores = np.zeros(n_params)

    for i, value in enumerate(param_values):
        # Point the classifier at the current parameter value.
        clf.set_params(**{param_name: value})

        # Per-fold accuracies for this parameter value.
        fold_train = np.zeros(K)
        fold_test = np.zeros(K)

        # Same seed as elsewhere in the notebook -> reproducible folds.
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
        for j, (train_idx, test_idx) in enumerate(cv.split(X)):
            # Fit on the training fold, score on both splits.
            clf.fit(X[train_idx], y[train_idx])
            fold_train[j] = clf.score(X[train_idx], y[train_idx])
            fold_test[j] = clf.score(X[test_idx], y[test_idx])

        # Average the K fold scores for this parameter value.
        train_scores[i] = np.mean(fold_train)
        test_scores[i] = np.mean(fold_test)
        print(param_name, '=', value, "Train =", train_scores[i], "Test =", test_scores[i])

    # Plot both curves on a log-scaled x axis.
    plt.semilogx(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    return train_scores, test_scores
# --- Experiment 1: stop-word-removed text ----------------------------------
# Features: cleaned sentences (stop words removed); labels: sentiment class.
X = df['Sen_Out_StopWord']
y = df['Response']
# 80/20 train/test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
df.columns
# TF-IDF matrices: fit the vocabulary on the training split only, then
# transform the test split with the same vocabulary.
# .astype('U') coerces any non-string cells to unicode for the vectorizer.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
X_tfidf
# Baseline: TF-IDF + Multinomial Naive Bayes pipeline, 5-fold cross-validation.
clf = Pipeline([
('vect', TfidfVectorizer()),
('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(clf, X_train, y_train, 5)
[0.60951498 0.60982391 0.61650185 0.60012361 0.6131026 ]
Mean score: 0.610 (+/-0.003)
# Sweep the NB smoothing parameter alpha over 20 log-spaced values in [0.01, 10].
alphas = np.logspace(-2, 1, 20)
print(alphas)
[ 0.01 0.0143845 0.02069138 0.02976351 0.04281332 0.06158482
0.08858668 0.1274275 0.18329807 0.26366509 0.37926902 0.54555948
0.78475997 1.12883789 1.62377674 2.33572147 3.35981829 4.83293024
6.95192796 10. ]
train_scores, test_scores = calc_params(X_train, y_train, clf, alphas, 'nb__alpha', 5)
nb__alpha = 0.01 Train = 0.9473488812935834 Test = 0.6098133906740397
nb__alpha = 0.01438449888287663 Train = 0.9461747352716937 Test = 0.6116055741459705
nb__alpha = 0.0206913808111479 Train = 0.9447688502802472 Test = 0.6132741100371821
nb__alpha = 0.029763514416313176 Train = 0.9431003347117901 Test = 0.6160549204519896
nb__alpha = 0.04281332398719394 Train = 0.9409219888753217 Test = 0.6175997514828735
nb__alpha = 0.06158482110660264 Train = 0.9376622028681757 Test = 0.6203190435985647
nb__alpha = 0.08858667904100823 Train = 0.9333673039574851 Test = 0.6229764355510852
nb__alpha = 0.12742749857031335 Train = 0.9275583809312916 Test = 0.6253246512721992
nb__alpha = 0.18329807108324356 Train = 0.9189376903090396 Test = 0.6280436569898497
nb__alpha = 0.26366508987303583 Train = 0.906439241900739 Test = 0.6286618567070412
nb__alpha = 0.37926901907322497 Train = 0.8895532054383789 Test = 0.6286617230546223
nb__alpha = 0.5455594781168517 Train = 0.8669509482207849 Test = 0.6271171211421707
nb__alpha = 0.7847599703514611 Train = 0.837844531973649 Test = 0.6205665487852332
nb__alpha = 1.1288378916846884 Train = 0.8015542589871932 Test = 0.6130884477340759
nb__alpha = 1.623776739188721 Train = 0.754943904910417 Test = 0.5996779740431728
nb__alpha = 2.3357214690901213 Train = 0.6992493106672354 Test = 0.5845993463251121
nb__alpha = 3.359818286283781 Train = 0.644697986352915 Test = 0.5708801164532619
nb__alpha = 4.832930238571752 Train = 0.5962645262010896 Test = 0.5491889971218906
nb__alpha = 6.951927961775605 Train = 0.5582593232595784 Test = 0.5315151640125205
nb__alpha = 10.0 Train = 0.5287203242610345 Test = 0.5145208770806341
# Refit MultinomialNB at the alpha chosen from the sweep above (~0.37,
# where the cross-validated test accuracy peaked).
mnb = MultinomialNB(alpha= 0.37)
mnb.fit(X_tfidf, y_train)
# Predict on the held-out TF-IDF test matrix.
mnb_pred = mnb.predict(X_test_tfidf)
print(mnb_pred)
['Positive' 'Neutral' 'Neutral' ... 'Neutral' 'Neutral' 'Negative']
from sklearn.metrics import classification_report
# NOTE(review): arguments look swapped — classification_report expects
# (y_true, y_pred); here the predictions are passed first, so the printed
# "support" column counts predicted labels rather than the ground truth.
mnb_cr = classification_report(mnb_pred, y_test)
print(mnb_cr)
precision recall f1-score support
Negative 0.39 0.67 0.49 622
Neutral 0.80 0.62 0.70 2312
Positive 0.61 0.64 0.62 1112
accuracy 0.64 4046
macro avg 0.60 0.64 0.61 4046
weighted avg 0.68 0.64 0.65 4046
X_train.shape
X_test.shape
# Perform classification with SVM (kernel='linear') inside a TF-IDF pipeline.
from sklearn import svm
from sklearn.svm import SVC
classifier_linear = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='linear')),
])
# SVM without tuning, evaluated with the cross-validation helper above.
evaluate_cross_validation(classifier_linear, X_train, y_train, 5)
[0.64102564 0.63762743 0.631644 0.63442522 0.64307787]
Mean score: 0.638 (+/-0.002)
# Tune the SVM regularization parameter C over a small fixed grid; a larger
# grid would take considerably longer to run.
c_vals = [1, 5, 10, 50, 100]
# Train/test scores via calc_params with 5 folds.
train_scores, test_scores = calc_params(X_train, y_train, classifier_linear, c_vals, 'svc__C', 5)
svc__C = 1 Train = 0.8854900586248359 Test = 0.6375600338026061
svc__C = 5 Train = 0.9827431568406352 Test = 0.615931024659635
svc__C = 10 Train = 0.9909621718333581 Test = 0.6049308959714488
svc__C = 50 Train = 0.9963539680559178 Test = 0.5884931758984211
svc__C = 100 Train = 0.9970955361000685 Test = 0.5847854095854751
# Grid-search the C x gamma combinations for an RBF-kernel SVM (3-fold CV).
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
parameters = {
'svc__gamma': np.logspace(-3, 0, 4),
'svc__C': [1, 5, 10, 50, 100],
}
clf_rbf = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(clf_rbf, parameters, verbose=2, cv=3)
# NOTE(review): the grid search is fit on the FULL X/y (train + test), so the
# selected hyper-parameters have seen the test split — mild data leakage.
# Fitting on X_train/y_train only would be cleaner.
gs.fit(X, y)
gs.best_params_, gs.best_score_
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 29.3s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 29.2s remaining: 0.0s
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 29.0s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 31.1s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 30.5s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 30.4s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 30.0s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 28.0s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 31.4s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 33.0s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 37.6s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 40.1s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 42.0s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 33.0s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 30.5s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 30.3s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 28.2s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 30.3s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 33.0s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 26.1s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 26.9s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 29.4s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 51.7s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 53.0s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 58.4s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 27.5s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 26.8s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 28.3s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 25.1s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 25.5s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 28.0s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 33.5s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 35.5s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 39.7s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 50.0s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 53.0s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 56.8s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 25.7s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 25.7s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 27.4s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 23.5s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 25.0s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 26.0s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 39.3s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 42.3s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 44.6s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 48.7s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 52.0s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 55.8s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 24.5s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 24.9s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 27.1s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 29.6s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 30.7s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 33.0s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 37.7s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 41.2s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 44.0s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 49.9s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 54.4s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 56.1s
[Parallel(n_jobs=1)]: Done 60 out of 60 | elapsed: 35.7min finished
# Final RBF-kernel SVM with the grid-searched hyper-parameters (C=5, gamma=1).
classifier_rbf = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='rbf', C=5, gamma=1)),
])
# Fit the final classifier on the training split.
classifier_rbf.fit(X_train, y_train)
# Predicting on test set
svc_pred = classifier_rbf.predict(X_test)
# Accuracy on the held-out test set.
print(classifier_rbf.score(X_test, y_test))
0.6626297577854672
print("SVM rbf Accuracy:" , metrics.accuracy_score(y_test,svc_pred))
SVM rbf Accuracy: 0.6626297577854672
# Classification report for the stop-word-removed dataset
# (argument order is correct here: y_true first, predictions second).
svm_cr = classification_report(y_test,svc_pred)
print(svm_cr)
precision recall f1-score support
Negative 0.67 0.50 0.58 1059
Neutral 0.65 0.79 0.72 1811
Positive 0.67 0.61 0.64 1176
accuracy 0.66 4046
macro avg 0.67 0.63 0.64 4046
weighted avg 0.66 0.66 0.66 4046
# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, svc_pred)
mat.T
# Plot the (transposed) confusion matrix as an annotated heatmap.
# NOTE(review): tick labels use y_test.unique() (order of appearance), while
# confusion_matrix orders classes sorted — verify the labels line up.
fig, ax = plt.subplots(figsize=(8,8))
ax = sns.heatmap(mat.T, square=True, linecolor='grey', linewidths=1, annot=True,
fmt='d', cbar=True, cmap='Reds', ax=ax, annot_kws={"fontsize":12, "weight":"bold"},
xticklabels=y_test.unique(),
yticklabels=y_test.unique())
# Common workaround for the matplotlib heatmap issue that clips the
# top/bottom rows of the plot.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label');
# Spot-check the model on a few hand-written Roman Urdu comments.
comment = """achha lagta hain """ # I like it
print(classifier_rbf.predict([comment]))
['Positive']
comment = """umaima ka dimag kharab hain aaj""" # Umiama is mad today
print(classifier_rbf.predict([comment]))
['Negative']
comment = """Rauf kya chal rahan hain""" # what's going on Rauf
print(classifier_rbf.predict([comment]))
['Neutral']
# --- Experiment 2: lower-cased text WITHOUT stop-word removal ---------------
# The original dataset lower-cased with punctuation removed, stop words kept.
X = df['Lower_Case']
y = df['Response']
X
# Same split parameters as experiment 1 so results are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
### Set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes.
clf_2 = Pipeline([
('vect', TfidfVectorizer()),
('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(clf_2, X_train, y_train, 5)
[0.61445783 0.60982391 0.61897404 0.60290482 0.618356 ]
Mean score: 0.613 (+/-0.003)
# Sweep NB alpha over the same 20 log-spaced values as before.
alphas = np.logspace(-2, 1, 20)
print(alphas)
[ 0.01 0.0143845 0.02069138 0.02976351 0.04281332 0.06158482
0.08858668 0.1274275 0.18329807 0.26366509 0.37926902 0.54555948
0.78475997 1.12883789 1.62377674 2.33572147 3.35981829 4.83293024
6.95192796 10. ]
train_scores, test_scores = calc_params(X_train, y_train, clf_2, alphas, 'nb__alpha', 5)
nb__alpha = 0.01 Train = 0.9469626818999227 Test = 0.6129033200406456
nb__alpha = 0.01438449888287663 Train = 0.9455567957150588 Test = 0.6142629660984913
nb__alpha = 0.0206913808111479 Train = 0.944104550040794 Test = 0.6172293051639858
nb__alpha = 0.029763514416313176 Train = 0.9423587869557337 Test = 0.6193922213528451
nb__alpha = 0.04281332398719394 Train = 0.9398096630642767 Test = 0.6219257175130111
nb__alpha = 0.06158482110660264 Train = 0.9360709348163214 Test = 0.6253864750625588
nb__alpha = 0.08858667904100823 Train = 0.931127173712692 Test = 0.6268696350487047
nb__alpha = 0.12742749857031335 Train = 0.9240513952331451 Test = 0.6290943750279239
nb__alpha = 0.18329807108324356 Train = 0.9147200544293774 Test = 0.6290944132143292
nb__alpha = 0.26366508987303583 Train = 0.9008466484396038 Test = 0.6280439242946876
nb__alpha = 0.37926901907322497 Train = 0.8824774699218556 Test = 0.628600128382695
nb__alpha = 0.5455594781168517 Train = 0.8576196503800407 Test = 0.6241506866106625
nb__alpha = 0.7847599703514611 Train = 0.826381203629397 Test = 0.6169818572569253
nb__alpha = 1.1288378916846884 Train = 0.7845601013020362 Test = 0.6071555404846543
nb__alpha = 1.623776739188721 Train = 0.7338247611061952 Test = 0.5930654251502541
nb__alpha = 2.3357214690901213 Train = 0.6785936889585686 Test = 0.5764418709352958
nb__alpha = 3.359818286283781 Train = 0.6278737486198874 Test = 0.5596328262560559
nb__alpha = 4.832930238571752 Train = 0.5847703539860765 Test = 0.541587974031717
nb__alpha = 6.951927961775605 Train = 0.5498549669760541 Test = 0.5256443096718908
nb__alpha = 10.0 Train = 0.5234984706655336 Test = 0.5101331063533396
# Refit NB at the best alpha from the sweep (~0.18) for the lower-cased data.
mnb_2 = MultinomialNB(alpha= 0.18)
mnb_2.fit(X_tfidf, y_train)
# Performance on the test data
mnb_pred_2 = mnb_2.predict(X_test_tfidf)
print(mnb_pred_2)
['Positive' 'Neutral' 'Neutral' ... 'Neutral' 'Neutral' 'Negative']
# NOTE(review): arguments look swapped — classification_report expects
# (y_true, y_pred); passing predictions first makes "support" count the
# predicted labels rather than the true ones.
mnb_cr_2 = classification_report(mnb_pred_2, y_test)
print(mnb_cr_2)
precision recall f1-score support
Negative 0.45 0.63 0.52 753
Neutral 0.76 0.65 0.70 2139
Positive 0.62 0.64 0.63 1154
accuracy 0.64 4046
macro avg 0.61 0.64 0.62 4046
weighted avg 0.66 0.64 0.65 4046
# Perform classification with SVM (kernel='linear') in a TF-IDF pipeline.
from sklearn import svm
from sklearn.svm import SVC
classifi_linear = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='linear')),
])
# SVM without tuning, via the cross-validation helper above.
evaluate_cross_validation(classifi_linear, X_train, y_train, 5)
[0.64874884 0.65400062 0.64647713 0.64276885 0.65111248]
Mean score: 0.649 (+/-0.002)
# Tune C over the same small fixed grid as experiment 1.
c_vals = [1, 5, 10, 50, 100]
# calling calc_params function from above
train_scores, test_scores = calc_params(X_train, y_train, classifi_linear, c_vals, 'svc__C', 5)
svc__C = 1 Train = 0.8785688205283533 Test = 0.6486215853238952
svc__C = 5 Train = 0.9812291374988014 Test = 0.6232232151960508
svc__C = 10 Train = 0.9901742514603102 Test = 0.612346734088584
svc__C = 50 Train = 0.9966629545420778 Test = 0.5944257012837888
svc__C = 100 Train = 0.9971727824232541 Test = 0.5910887058741766
# Grid-search the C x gamma combinations for an RBF-kernel SVM (3-fold CV).
parameters = {
'svc__gamma': np.logspace(-3, 0, 4),
'svc__C': [1, 5, 10, 50, 100],
}
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
clf_rbf_2 = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(clf_rbf_2, parameters, verbose=2, cv=3)
# NOTE(review): fit on the full X/y again (train + test) — same data-leakage
# caveat as the first grid search.
gs.fit(X, y)
gs.best_params_, gs.best_score_
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 38.4s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 38.3s remaining: 0.0s
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 35.7s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 40.7s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 39.0s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 41.3s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 38.7s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 38.1s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 35.3s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 36.4s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 43.4s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 44.3s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 51.6s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 36.5s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 34.8s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 36.3s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 32.0s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 34.3s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 37.6s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 32.0s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 33.5s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 39.0s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 1.2min
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 1.4min
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 1.4min
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 37.1s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 36.0s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 38.3s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 36.9s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 35.0s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 36.7s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 40.2s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 53.7s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 44.4s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 1.1min
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 1.3min
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 1.3min
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 36.8s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 40.2s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 40.1s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 31.0s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 31.1s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 34.2s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 50.9s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 52.9s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 57.6s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 1.1min
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 1.2min
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 1.3min
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 34.3s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 33.9s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 36.5s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 37.2s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 37.6s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 44.3s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 50.8s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 1.0min
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 1.0min
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 1.0min
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 1.2min
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 1.3min
[Parallel(n_jobs=1)]: Done 60 out of 60 | elapsed: 46.9min finished
# Final RBF SVM for the lower-cased dataset (C=5, gamma=1.0 from grid search).
classifi_rbf = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='rbf', C=5, gamma=1.0)),
])
# Fit on the full training split.
classifi_rbf.fit(X_train, y_train)
# predicting on test set
svc_pred_2 = classifi_rbf.predict(X_test)
# Accuracy on the held-out test split.
print(classifi_rbf.score(X_test, y_test))
0.6683143845773604
print("SVM rbf Accuracy:" , metrics.accuracy_score(y_test,svc_pred_2))
SVM rbf Accuracy: 0.6683143845773604
# Classification report for the dataset WITHOUT stop-word removal
# (correct argument order: y_true first).
svm_cr_2 = classification_report(y_test,svc_pred_2)
print(svm_cr_2)
precision recall f1-score support
Negative 0.65 0.51 0.57 1059
Neutral 0.67 0.79 0.72 1811
Positive 0.68 0.63 0.65 1176
accuracy 0.67 4046
macro avg 0.67 0.64 0.65 4046
weighted avg 0.67 0.67 0.66 4046
# --- Experiment 3: binary task — drop the Neutral class ---------------------
data = roman_urdu_df[roman_urdu_df.Response != 'Neutral']
data = pd.DataFrame(data[['Sentence', 'Response']]) #converting into dataframe
data['Response'].value_counts()
# Re-run the preprocessing helpers defined earlier in the notebook:
# punctuation removal, lower-casing, then stop-word removal.
data['Sentence_Removal'] = data['Sentence'].apply(lambda x: remove_punct(str(x)))
data['Lower_Case'] = data['Sentence_Removal'].apply(lambda x: convert_to_lower_case(str(x))) #calling lower case on each line
data['Sen_Out_StopWord'] = data['Lower_Case'].apply(lambda x: removeStopWordss(x)) # sentence with removed stop words
data.head()
X = data['Sen_Out_StopWord']
y = data['Response']
# Same split parameters as the 3-class experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train.shape
X_test.shape
# TF-IDF: fit vocabulary on the training split, reuse it for the test split.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
X_tfidf
### Set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes.
B_clf = Pipeline([
('vect', TfidfVectorizer()),
('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(B_clf, X_train, y_train, 5)
[0.76548673 0.76106195 0.76161504 0.77599558 0.75442478]
Mean score: 0.764 (+/-0.004)
# Sweep NB alpha for the binary (Positive/Negative) task.
alphas = np.logspace(-2, 1, 20)
print(alphas)
[ 0.01 0.0143845 0.02069138 0.02976351 0.04281332 0.06158482
0.08858668 0.1274275 0.18329807 0.26366509 0.37926902 0.54555948
0.78475997 1.12883789 1.62377674 2.33572147 3.35981829 4.83293024
6.95192796 10. ]
train_scores, test_scores = calc_params(X_train, y_train, B_clf, alphas, 'nb__alpha', 5)
nb__alpha = 0.01 Train = 0.9797013274336284 Test = 0.763716814159292
nb__alpha = 0.01438449888287663 Train = 0.9789823008849557 Test = 0.7644911504424778
nb__alpha = 0.0206913808111479 Train = 0.9779867256637168 Test = 0.7655973451327434
nb__alpha = 0.029763514416313176 Train = 0.9767146017699115 Test = 0.7673672566371682
nb__alpha = 0.04281332398719394 Train = 0.9752212389380531 Test = 0.7694690265486726
nb__alpha = 0.06158482110660264 Train = 0.9730088495575222 Test = 0.7724557522123894
nb__alpha = 0.08858667904100823 Train = 0.9705752212389381 Test = 0.7765486725663717
nb__alpha = 0.12742749857031335 Train = 0.9671183628318584 Test = 0.7790929203539824
nb__alpha = 0.18329807108324356 Train = 0.9629977876106196 Test = 0.7814159292035399
nb__alpha = 0.26366508987303583 Train = 0.9573285398230087 Test = 0.7837389380530974
nb__alpha = 0.37926901907322497 Train = 0.9513550884955752 Test = 0.7831858407079647
nb__alpha = 0.5455594781168517 Train = 0.9428097345132743 Test = 0.7804203539823009
nb__alpha = 0.7847599703514611 Train = 0.9329922566371682 Test = 0.7807522123893806
nb__alpha = 1.1288378916846884 Train = 0.9217367256637168 Test = 0.774778761061947
nb__alpha = 1.623776739188721 Train = 0.9089048672566371 Test = 0.7654867256637168
nb__alpha = 2.3357214690901213 Train = 0.8932798672566372 Test = 0.7596238938053097
nb__alpha = 3.359818286283781 Train = 0.8740597345132743 Test = 0.7504424778761063
nb__alpha = 4.832930238571752 Train = 0.8533185840707965 Test = 0.7382743362831857
nb__alpha = 6.951927961775605 Train = 0.830116150442478 Test = 0.7244469026548672
nb__alpha = 10.0 Train = 0.7997234513274337 Test = 0.7071902654867257
# Refit NB at the best alpha from the binary sweep (~0.78).
B_mnb = MultinomialNB(alpha= 0.78)
B_mnb.fit(X_tfidf, y_train)
### Performance on the test data
B_mnb_pred = B_mnb.predict(X_test_tfidf)
print(B_mnb_pred)
['Positive' 'Positive' 'Negative' ... 'Negative' 'Negative' 'Positive']
# NOTE(review): arguments look swapped — classification_report expects
# (y_true, y_pred); with predictions first, "support" counts predicted labels.
B_mnb_cr = classification_report(B_mnb_pred, y_test)
print(B_mnb_cr)
precision recall f1-score support
Negative 0.69 0.77 0.73 904
Positive 0.83 0.77 0.80 1356
accuracy 0.77 2260
macro avg 0.76 0.77 0.76 2260
weighted avg 0.78 0.77 0.77 2260
# Linear-kernel SVM classifier, wired into a pipeline so that TF-IDF
# vectorization is refit on each cross-validation fold.
from sklearn import svm
from sklearn.svm import SVC

B_classifier_linear = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='linear')),
])

# Baseline (untuned) performance: 5-fold cross-validation on the training set.
evaluate_cross_validation(B_classifier_linear, X_train, y_train, 5)
[0.77378319 0.78152655 0.78318584 0.78926991 0.77488938]
Mean score: 0.781 (+/-0.003)
# Tune the SVM regularization strength C over a small fixed grid.
# A wider grid would be more thorough, but would take much longer to search.
c_vals = [1, 5, 10, 50, 100]
# Compute train/test scores for each C via the shared calc_params helper,
# using 5-fold cross-validation ('svc__C' addresses the pipeline's SVC step).
train_scores, test_scores = calc_params(X_train, y_train, B_classifier_linear, c_vals, 'svc__C', 5)
svc__C = 1 Train = 0.954424778761062 Test = 0.7805309734513274
svc__C = 5 Train = 0.9958241150442477 Test = 0.7573008849557523
svc__C = 10 Train = 0.9981747787610619 Test = 0.7504424778761063
svc__C = 50 Train = 0.9990597345132743 Test = 0.7430309734513274
svc__C = 100 Train = 0.9994469026548674 Test = 0.7414823008849558
# Grid-search C and gamma jointly for an RBF-kernel SVM.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# gamma: 4 log-spaced values in [1e-3, 1]; C: the same grid used for the
# linear kernel above.
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
B_clf_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(B_clf_rbf, parameters, verbose=2, cv=3)
# Search on the training split only: the original fit on the full dataset
# (X, y), which leaks the held-out test set into hyper-parameter selection
# and inflates the test-set scores reported below.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 10.2s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 10.1s remaining: 0.0s
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 9.3s
[CV] svc__C=1, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 9.9s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 11.2s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 9.4s
[CV] svc__C=1, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=1, svc__gamma=0.01, total= 9.4s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 8.3s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 7.7s
[CV] svc__C=1, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=1, svc__gamma=0.1, total= 8.3s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 8.4s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 8.0s
[CV] svc__C=1, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=1, svc__gamma=1.0, total= 9.5s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 9.5s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 9.7s
[CV] svc__C=5, svc__gamma=0.001 ......................................
[CV] ....................... svc__C=5, svc__gamma=0.001, total= 10.3s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 8.9s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 7.9s
[CV] svc__C=5, svc__gamma=0.01 .......................................
[CV] ........................ svc__C=5, svc__gamma=0.01, total= 9.1s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 8.4s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 7.1s
[CV] svc__C=5, svc__gamma=0.1 ........................................
[CV] ......................... svc__C=5, svc__gamma=0.1, total= 8.2s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 10.1s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 10.2s
[CV] svc__C=5, svc__gamma=1.0 ........................................
[CV] ......................... svc__C=5, svc__gamma=1.0, total= 10.2s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 8.6s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 8.4s
[CV] svc__C=10, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=10, svc__gamma=0.001, total= 9.6s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 8.4s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 9.4s
[CV] svc__C=10, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=10, svc__gamma=0.01, total= 9.6s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 9.0s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 8.3s
[CV] svc__C=10, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=10, svc__gamma=0.1, total= 9.7s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 9.0s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 10.9s
[CV] svc__C=10, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=10, svc__gamma=1.0, total= 11.9s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 8.9s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 9.2s
[CV] svc__C=50, svc__gamma=0.001 .....................................
[CV] ...................... svc__C=50, svc__gamma=0.001, total= 11.0s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.4s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.1s
[CV] svc__C=50, svc__gamma=0.01 ......................................
[CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.8s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 9.8s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 9.4s
[CV] svc__C=50, svc__gamma=0.1 .......................................
[CV] ........................ svc__C=50, svc__gamma=0.1, total= 10.9s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 10.5s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 10.0s
[CV] svc__C=50, svc__gamma=1.0 .......................................
[CV] ........................ svc__C=50, svc__gamma=1.0, total= 10.7s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 9.1s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 7.5s
[CV] svc__C=100, svc__gamma=0.001 ....................................
[CV] ..................... svc__C=100, svc__gamma=0.001, total= 8.6s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 7.4s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 8.4s
[CV] svc__C=100, svc__gamma=0.01 .....................................
[CV] ...................... svc__C=100, svc__gamma=0.01, total= 10.1s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 8.2s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 7.8s
[CV] svc__C=100, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=100, svc__gamma=0.1, total= 9.5s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 8.7s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 9.8s
[CV] svc__C=100, svc__gamma=1.0 ......................................
[CV] ....................... svc__C=100, svc__gamma=1.0, total= 9.8s
[Parallel(n_jobs=1)]: Done 60 out of 60 | elapsed: 9.2min finished
# Final RBF-kernel SVM using the best grid-search parameters (C=1, gamma=1.0).
B_classifier_rbf = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='rbf', C=1, gamma= 1.0)),
])
# Fitting the final classifier on the training split
B_classifier_rbf.fit(X_train, y_train)
# Predicting on test set
B_svc_pred = B_classifier_rbf.predict(X_test)
# Mean accuracy on the held-out test set (score() only evaluates -- it does
# not train, despite what the original comment said).
print(B_classifier_rbf.score(X_test, y_test))
0.772566371681416
# Same accuracy as score() above, computed via the metrics helper.
rbf_accuracy = metrics.accuracy_score(y_test, B_svc_pred)
print("SVM rbf Accuracy:", rbf_accuracy)
SVM rbf Accuracy: 0.772566371681416
# Per-class precision/recall/F1 on the test set (y_true passed first, as
# classification_report expects).
B_svm_cr = classification_report(y_test,B_svc_pred)
print(B_svm_cr)
precision recall f1-score support
Negative 0.73 0.78 0.75 1006
Positive 0.81 0.77 0.79 1254
accuracy 0.77 2260
macro avg 0.77 0.77 0.77 2260
weighted avg 0.77 0.77 0.77 2260
# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Rows = true labels, columns = predictions; both axes are ordered by
# *sorted* label name (sklearn's default when labels= is not given).
mat_3 = confusion_matrix(y_test, B_svc_pred)
mat_3
# Plot the confusion matrix as an annotated heatmap.
# confusion_matrix orders its rows/columns by sorted label name, so the tick
# labels must be sorted the same way. The original used y_test.unique(),
# which returns labels in order of first appearance and can silently swap
# the axis labels relative to the matrix cells.
class_labels = np.unique(y_test)
fig, ax = plt.subplots(figsize=(8,8))
ax = sns.heatmap(mat_3.T, square=True, linecolor='grey', linewidths=1, annot=True,
                 fmt='d', cbar=True, cmap='Reds', ax=ax, annot_kws={"fontsize":12, "weight":"bold"},
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# Workaround for the matplotlib 3.1.1 regression that cropped the top and
# bottom heatmap rows in half.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# Axes are swapped by the .T above: x shows true labels, y shows predictions.
plt.xlabel('true label')
plt.ylabel('predicted label');
import matplotlib.ticker as mtick
# Extract the overall accuracy out of each classification-report string by
# fixed character offsets. NOTE(review): these offsets are brittle -- they
# assume sklearn's exact report layout (3-class reports sliced to char 261,
# 2-class to char 207) and will silently break if the layout changes.
#mnb_cr[mnb_cr.find('Negative')+len('Negative'):100].split()
#mnb_cr[mnb_cr.find('Neutral')+len('Neutral'):155].split()
#mnb_cr[mnb_cr.find('Positive')+len('Positive'):210].split()
mnb_cr[mnb_cr.find('accuracy')+35:261] # dataset using stopword on unbalanced dataset
#svm_cr[svm_cr.find('Negative')+len('Negative'):100].split()
#svm_cr[svm_cr.find('Neutral')+len('Neutral'):155].split()
#svm_cr[svm_cr.find('Positive')+len('Positive'):210].split()
svm_cr[svm_cr.find('accuracy')+35:261] # dataset using stopword on unbalanced dataset
#B_svm_cr[B_svm_cr.find('Negative')+len('Negative'):100].split()
#B_svm_cr[B_svm_cr.find('Neutral')+len('Neutral'):155].split()
#B_svm_cr[B_svm_cr.find('Positive')+len('Positive'):210].split()
B_svm_cr[B_svm_cr.find('accuracy')+35:207] # dataset using stopWord on balanced dataset
# NOTE(review): the commented lines below call .find on mnb_cr while slicing
# B_mnb_cr -- looks like a copy-paste slip; harmless while commented out.
#B_mnb_cr[mnb_cr.find('Negative')+len('Negative'):100].split()
#B_mnb_cr[mnb_cr.find('Neutral')+len('Neutral'):155].split()
#B_mnb_cr[mnb_cr.find('Positive')+len('Positive'):210].split()
B_mnb_cr[B_mnb_cr.find('accuracy')+35:207] # dataset using stopWord on balanced dataset
def accuracy_graph(cr1, cr2, cr3, cr4):
    '''Bar-plot accuracy (%) of MNB vs SVM on balanced and unbalanced data.

    Parameters are sklearn classification_report strings:
    cr1 = MNB unbalanced, cr2 = SVM unbalanced,
    cr3 = MNB balanced,   cr4 = SVM balanced.
    '''
    def _accuracy(cr):
        # Parse the "accuracy   0.77   2260" line by its tokens instead of the
        # original fixed character offsets (+35:261 / +35:207), which depend
        # on sklearn's exact report layout and class count.
        for line in cr.splitlines():
            parts = line.split()
            if parts and parts[0] == 'accuracy':
                return float(parts[1])
        raise ValueError('no accuracy line found in classification report')

    SVM = [_accuracy(cr4) * 100, _accuracy(cr2) * 100]
    MultinomialNB = [_accuracy(cr3) * 100, _accuracy(cr1) * 100]
    index = ['Balanced_Data', 'UnBalanced_Data']
    acc_pd = pd.DataFrame(data={'SVM': SVM, 'MultinomialNB': MultinomialNB}, index=index)
    ax = acc_pd.plot(kind='bar', ylim=(0,100), xlabel='dataSetType', ylabel = 'Accuracy', legend=True)
    # Show the y axis as percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_title('Accuracy Score of SVM/MNB')
# Compare the four models' accuracies side by side.
accuracy_graph(mnb_cr, svm_cr, B_mnb_cr, B_svm_cr)