import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Load dataset
def load_data(DATA_PATH):
    '''Read the CSV file at DATA_PATH into a DataFrame.

    Input parameter:
    DATA_PATH = path of the CSV file to read

    The file is read with ``header=None``, i.e. its first line is treated as
    data rather than as column names. Returns the loaded DataFrame.
    '''
    return pd.read_csv(DATA_PATH, header=None)
# Location of the raw CSV (alternative location: sample_data/Roman Urdu DataSet.csv).
DATA_PATH = 'Roman Urdu DataSet.csv'
roman_urdu_df = load_data(DATA_PATH) #loading the data
roman_urdu_df.head()
# Drop the column labelled 2 and name the two remaining columns.
roman_urdu_df = roman_urdu_df.drop([2], axis=1) #dropping column
roman_urdu_df.columns = ['Sentence', 'Response'] #adding cols names to data
roman_urdu_df.head()
# Count missing values per column, drop every row that has one, then re-check.
roman_urdu_df.isnull().sum()
roman_urdu_df.dropna(inplace = True)
roman_urdu_df.isnull().sum()
roman_urdu_df['Response'].value_counts()
# The label counts show a misspelled 'Neative' response:
# inspect the affected rows, then correct the label to 'Negative' in place.
roman_urdu_df[roman_urdu_df['Response'] == 'Neative']
roman_urdu_df.loc[roman_urdu_df['Response']=='Neative', 'Response'] = 'Negative'
roman_urdu_df['Response'].value_counts()
# Number of rows carrying each sentiment label.
positive = roman_urdu_df[roman_urdu_df['Response'] == 'Positive'].shape[0]
Negative = roman_urdu_df[roman_urdu_df['Response'] == 'Negative'].shape[0]
Neutral = roman_urdu_df[roman_urdu_df['Response'] == 'Neutral'].shape[0]
# bar plot of the 3 classes: one bar per class at x = 10, 15 and 20, each 3 wide
plt.bar(10,positive,3, label="Positive",color=['green'])
plt.bar(15,Negative,3, label="Negative",color=['red'])
plt.bar(20,Neutral,3, label="Neutral",color=['blue'])
plt.legend()
plt.ylabel('count')
plt.title('Overall Sentiment Responses')
plt.show()
# Words per sentence: split each sentence on whitespace and count the tokens.
roman_urdu_df['len'] = roman_urdu_df['Sentence'].str.rsplit().str.len()
# Mean and maximum sentence length per sentiment. sort=False keeps the groups in
# order of first appearance, and the colour list is applied in that same order.
# NOTE(review): confirm the group order matches the intended g/b/r colours.
roman_urdu_df.groupby(['Response'], sort=False)['len'].mean().plot(kind='bar',title = "Average number of words per sentiment", xlabel="Sentiment",color=['g', 'b', 'r'])
roman_urdu_df.groupby(['Response'], sort=False)['len'].max().plot(kind='bar',title = "Maximum number of words per sentiment", xlabel="Sentiment",color=['g', 'b', 'r'])
# Word cloud over all sentences joined into one space-separated string.
text_l = roman_urdu_df['Sentence'].tolist()
text = " ".join(review for review in text_l)
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
print('Dataset size:',roman_urdu_df.shape)
print('Columns are:',roman_urdu_df.columns)
roman_urdu_df.dtypes
# Working frame for the preprocessing steps: only the text and the label columns.
df = pd.DataFrame(roman_urdu_df[['Sentence', 'Response']]) #converting into dataframe
# The set of characters that remove_punct / convert_to_lower_case strip out.
string.punctuation
# Translation table that deletes every ASCII punctuation character and every
# ASCII digit; built once so each call is a single pass over the text.
_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def remove_punct(text):
    '''Strip punctuation and digits from a line of text.

    Input parameter:
    text: (str)

    Every character found in ``string.punctuation`` and every digit 0-9 is
    deleted (not replaced by a blank), so surrounding whitespace is left
    exactly as it was. Returns the cleaned string.
    '''
    return text.translate(_PUNCT_AND_DIGITS)
# Remove punctuation and digits from every sentence; str(x) guards against
# non-string cell values before remove_punct is applied.
df['Sentence_Removal'] = df['Sentence'].apply(lambda x: remove_punct(str(x)))
df.head(5) #testing to view transformed data
def convert_to_lower_case(text):
    '''Lower-case a line of text, dropping any punctuation it still contains.

    Input parameter:
    text: (str)

    Characters listed in ``string.punctuation`` are deleted; every remaining
    character is lower-cased individually and the result is joined back into
    a single string, which is returned.
    '''
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return "".join(map(str.lower, without_punctuation))
# Lower-case the punctuation-free sentences.
df['Lower_Case'] = df['Sentence_Removal'].apply(lambda x: convert_to_lower_case(str(x))) #calling lower case on each line
df.head(5) #testing lower case of alphabet
# String accessor of the lower-cased column (text with no other preprocessing).
# NOTE(review): `new` is not referenced again in the visible part of the file.
new = df['Lower_Case'].str #WITHOUT ANY PRE-PROCESSING CHANGED TO LOWER CASE
# list of Roman Urdu stop words
stopwords=['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh', 'rha',
           'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch',
           'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to','is', 'hi', 'jo', 'kya', 'thi', 'se', 'pe', 'phr', 'wala',
           'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski', 'ne', 'haan', 'acha', 'nai',
           'sent', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono', 'hoa', 'aese', 'de', 'wohi',
           'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil',
           'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou','h','je','or','jee','he','in','un','kay','ki','ya','ap','meri','me']
# some words have several spellings, e.g. Mein - Ma - Maa
# the list was extended again after inspecting the clusters
dictStopWords = {} # global variable: stop word -> number of times it was removed
forFastTextData = [] # global variable: one token list per processed sentence


def removeStopWordss(text):
    '''Remove stop words from a sentence and record what was removed.

    Input parameter:
    text: sentence to clean (converted with str() first)

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, and each token found in
    ``stopwords`` is dropped.

    Side effects: each dropped token increments its count in the global
    ``dictStopWords``, and the list of kept tokens is appended to the global
    ``forFastTextData``.

    Returns the kept tokens joined by single spaces.
    '''
    cleaned = re.sub('[^a-zA-Z]', ' ', str(text)).lower()
    kept = []
    # Build a new list instead of removing from the list being iterated:
    # removing in place skips the token that follows each removed stop word,
    # which let consecutive stop words slip through.
    for word in cleaned.split():
        if word in stopwords:
            dictStopWords[word] = dictStopWords.get(word, 0) + 1
        else:
            kept.append(word)
    forFastTextData.append(kept)
    return " ".join(kept)
# Sentences with stop words removed; removeStopWordss also fills dictStopWords.
df['Sen_Out_StopWord'] = df['Lower_Case'].apply(removeStopWordss)
df.head()
## checking for most common stopwords from the dictionary
import collections
from collections import Counter
d = Counter(dictStopWords) # creating a counter
d.most_common(10)
# Ten most frequently removed stop words, kept in descending order of count.
dictGraph = dict(d.most_common(10))
dictGraph
# plotting a graph of the 10 most common stopwords from the dictionary
plt.bar(dictGraph.keys(), dictGraph.values(), align='center', color = 'green')
plt.title('Bargraph for Most Common StopWords')
plt.xlabel('StopWords')
plt.ylabel('Count')
plt.show()
def replacing_characters(word):
    '''Normalise the spelling of a Roman Urdu word with ordered regex rewrites.

    Input Parameter:
    word: word from the sentences

    The substitutions are applied strictly in the order written; later rules
    operate on the output of earlier ones. Patterns anchored with ``^`` or
    ``$`` refer to the start / end of the string that is passed in.

    Returns the rewritten string (an empty string is returned unchanged).
    '''
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'aa+', r'aa', word)
    word = re.sub(r'e+', r'ee', word)
    # e.g "sahi and sai nahi". Every "ai" was already rewritten to "ae" above,
    # so this rule currently never matches; it is kept so that the rule order
    # stays intact (the original repeated this line twice).
    word = re.sub(r'ai', r'ahi', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    # Drop the "a" of "ar" unless the string itself starts with "ar".
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # The original searched the literal string 'word' here, so this rule could
    # never fire. endswith() also avoids indexing into an empty string.
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)
    if re.search(r'[^a]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[a-z]h', word):
        word = re.sub(r'h', '', word)
    return word
# replacing_characters is a word-level normaliser (its ^ and $ anchors refer to
# the start / end of one word), so apply it to every whitespace-delimited token
# instead of to the sentence as a whole. re.sub with r'\S+' rewrites each token
# in place and leaves the original spacing untouched.
df['stem'] = df['Sen_Out_StopWord'].apply(
    lambda x: re.sub(r'\S+', lambda m: replacing_characters(m.group()), x)
) # stemming on the sentences with stop words removed
df.head()
df['Stem_On_Original'] = df['Lower_Case'].apply(
    lambda x: re.sub(r'\S+', lambda m: replacing_characters(m.group()), x)
) # stemming on the sentences that still contain stop words
df.head()
# Shows the frame without Sentence_Removal; the result is not assigned, so df
# itself keeps that column.
df.drop(columns=['Sentence_Removal'])
# Lower_Case: sentences with punctuation and digits removed, lower-cased
# Sen_Out_StopWord: Lower_Case with stop words removed
# stem: stemming function applied to Sen_Out_StopWord
# Stem_On_Original: stemming function applied to Lower_Case (stop words kept)
stats = df.drop(labels=["Sentence", "Lower_Case",], axis=1)
stats.describe()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy
def generateWordCloudForClusters(n_clusters,clustermTerms):
    '''Draw one word cloud per cluster.

    Input parameter:
    n_clusters: number of clusters; clusters 0 .. n_clusters-1 are drawn in order
    clustermTerms: list of dictionaries, each holding a cluster number under
                   'cluster' and one of its terms under 'terms'
    '''
    for cluster_id in range(n_clusters):
        # Terms that belong to the cluster currently being drawn.
        cluster_terms = [
            entry.get('terms')
            for entry in clustermTerms
            if cluster_id == entry.get('cluster')
        ]
        cloud_text = transformForWordCount(cluster_terms)
        cloud = WordCloud(width = 500, height = 500, background_color='black', random_state=10).generate(cloud_text)
        plot_cloud(cloud)
def transformForWordCount(terms):
    '''Join the terms of one cluster into the text fed to the word cloud.

    Input parameter:
    terms: the terms (strings) of a single cluster

    Returns the terms separated by single spaces; an empty input gives ''.
    '''
    # str.join already walks the sequence; no index loop or copy is needed.
    return " ".join(terms)
def plot_cloud(wordcloud):
    '''Show a word cloud image on a new 10x10 figure with the axes hidden.

    Input parameter:
    wordcloud: the image to display (passed straight to plt.imshow)
    '''
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def plotKneeElbow(matrix,vale_for_range):
    '''Plot the elbow curve used to choose the number of clusters.

    Input parameter:
    matrix: feature matrix handed to KMeans.fit
    vale_for_range: exclusive upper bound of k; k runs from 2 to vale_for_range-1

    For every k a KMeans model is fitted and its inertia (sum of squared
    distances) is plotted against k.
    '''
    candidate_ks = range(2, vale_for_range)
    inertias = [
        KMeans(n_clusters=k, max_iter=500, n_init=10).fit(matrix).inertia_
        for k in candidate_ks
    ]
    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal clusters')
    plt.show()
wholedf = df
# vectorization of the texts; this single vectorizer is re-fitted on every
# vectorFeatures call, so it always reflects the most recent data set
vectorizer = TfidfVectorizer()


def vectorFeatures(dataSet):
    '''Fit the shared TF-IDF vectorizer on a data set and transform it.

    Input parameter:
    dataSet: frame with a 'Sen_Out_StopWord' column (the whole data set or
             the Positive / Negative / Neutral subset)

    Returns (matrixForm, words): the TF-IDF matrix and the list of feature
    names, where words[i] names column i of the matrix.
    '''
    matrixForm = vectorizer.fit_transform(dataSet['Sen_Out_StopWord'])
    # used words (axis in our multi-dimensional space).
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); fall back to the old name only when
    # the new one is unavailable. tolist() keeps the return type a plain list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()
    return matrixForm,words
def performKmeansClustering(n_clusters,max_iter,matrixForm):
    '''Fit a KMeans model and return it.

    Input Parameter:
    n_clusters: number of clusters
    max_iter: maximum number of iterations passed to KMeans
    matrixForm: vectorized (transformed) feature matrix to cluster

    The model is created with verbose=1. Returns the fitted KMeans object.
    '''
    return KMeans(n_clusters=n_clusters, max_iter=max_iter, verbose=1).fit(matrixForm)
# Cluster the complete data set (all sentiments together).
matrixdfForm, words = vectorFeatures(wholedf)
plotKneeElbow(matrixdfForm, 10)
modelKmeans = performKmeansClustering(5, 500, matrixdfForm)
labels = modelKmeans.labels_
# For every cluster: feature indices ordered from highest to lowest centre weight.
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("centers:", modelKmeans.cluster_centers_)
print("labels", labels)
print("intertia:", modelKmeans.inertia_)
n_clusters = 5
# Number of texts assigned to each cluster (kept as floats, as before).
texts_per_cluster = numpy.bincount(labels, minlength=n_clusters).astype(float)
clusterDictionary = {}
clustermTerms = []
print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # The ten highest-weighted terms of this cluster.
    for term in ordered_words[i_cluster, :10]:
        top_word = words[term]
        clusterDictionary = {"cluster": i_cluster, "terms": top_word}
        clustermTerms.append(clusterDictionary)
        print("\t" + top_word)
n_clusters = 5
# Cluster only the sentences labelled Positive.
positivedf = df[df['Response'] == 'Positive'] #segmenting positive clusters
matrixForm, words = vectorFeatures(positivedf)
plotKneeElbow(matrixForm, 20)
modelKmeans = performKmeansClustering(5, 500, matrixForm)
labels = modelKmeans.labels_
# For every cluster: feature indices ordered from highest to lowest centre weight.
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("Positive centers:", modelKmeans.cluster_centers_)
print("Positive labels", labels)
print("Positive intertia:", modelKmeans.inertia_)
n_clusters = 5
# Number of texts assigned to each cluster (kept as floats, as before).
texts_per_cluster = numpy.bincount(labels, minlength=n_clusters).astype(float)
clusterDictionary = {}
clustermTerms = []
print("Top words per Positive cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # The ten highest-weighted terms of this cluster.
    for term in ordered_words[i_cluster, :10]:
        top_word = words[term]
        clusterDictionary = {"cluster": i_cluster, "terms": top_word}
        clustermTerms.append(clusterDictionary)
        print("\t" + top_word)
generateWordCloudForClusters(5, clustermTerms)
# Cluster only the sentences labelled Negative.
negtivedf = df[df['Response'] == 'Negative']
matrixFormNeg, words = vectorFeatures(negtivedf)
plotKneeElbow(matrixFormNeg, 10)
# The clustering is run twice, first with 6 and then with 5 clusters; the
# variables keep the values of the last (5-cluster) run, which is the one
# drawn as word clouds below.
for n_clusters, header in (
    (6, "Top Negative words per 6 cluster:"),
    (5, "Top Negative words per cluster:"),
):
    modelKmeansNeg = performKmeansClustering(n_clusters, 500, matrixFormNeg)
    labelsNeg = modelKmeansNeg.labels_
    # For every cluster: feature indices ordered from highest to lowest centre weight.
    ordered_words = modelKmeansNeg.cluster_centers_.argsort()[:, ::-1]
    print("Negative centers:", modelKmeansNeg.cluster_centers_)
    print("Negative labels", labelsNeg)
    print("Negative intertia:", modelKmeansNeg.inertia_)
    # Count the texts per cluster from the labels of THIS model. The previous
    # code iterated over `labels`, which belongs to the Positive clustering,
    # so the printed counts described the wrong data.
    texts_per_cluster = numpy.bincount(labelsNeg, minlength=n_clusters).astype(float)
    print(header)
    clustermTerms = []
    clusterDictionary = {}
    for i_cluster in range(n_clusters):
        print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
        # The ten highest-weighted terms of this cluster.
        for term in ordered_words[i_cluster, :10]:
            clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
            clustermTerms.append(clusterDictionary)
            print("\t" + words[term])
generateWordCloudForClusters(n_clusters, clustermTerms)
# Cluster only the sentences labelled Neutral.
neutraldf = df[df['Response'] == 'Neutral']
matrixFormNeu, words = vectorFeatures(neutraldf)
# Elbow plot for the Neutral matrix (the Negative matrix was plotted here before).
plotKneeElbow(matrixFormNeu, 10)
# Set explicitly instead of relying on the value left over from an earlier section.
n_clusters = 5
modelKmeansNeu = performKmeansClustering(n_clusters, 500, matrixFormNeu)
labelsNeu = modelKmeansNeu.labels_
# Feature indices ordered from highest to lowest centre weight. They must come
# from the Neutral model: `words` is the Neutral vocabulary, and indices taken
# from the Negative model's centres would point at unrelated (or missing) terms.
ordered_words = modelKmeansNeu.cluster_centers_.argsort()[:, ::-1]
print("Neutral centers:", modelKmeansNeu.cluster_centers_)
print("Neutral labels", labelsNeu)
print("Neutral intertia:", modelKmeansNeu.inertia_)
# Texts per cluster counted from the Neutral labels (previously the labels of
# the Positive clustering were counted).
texts_per_cluster = numpy.bincount(labelsNeu, minlength=n_clusters).astype(float)
print("Top Neutral words per cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    # The ten highest-weighted terms of this cluster.
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])
generateWordCloudForClusters(n_clusters, clustermTerms)
from gensim.models import FastText
#from gensim.test.utils import common_texts # some example sentences
# Train FastText embeddings on the token lists that removeStopWordss collected
# in forFastTextData (one list per processed sentence).
model = FastText(vector_size=15, window=4, min_count=1) # instantiate
model.build_vocab(corpus_iterable=forFastTextData)
model.train(corpus_iterable=forFastTextData, total_examples=len(forFastTextData), epochs=10)
# Nearest neighbours of two sample words.
model.wv.most_similar("insan")
model.wv.most_similar("larki")
# Similarity between two sample word pairs.
model.wv.similarity(w1 = 'khushi',w2 = 'achi')
model.wv.similarity(w1 = 'waqt',w2 = 'gahri')
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
### We can use this function to do cross-validation
# use for both naive bayes and SVM
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
def evaluate_cross_validation(clf, X, y, K):
    '''Print the K-fold cross-validation scores of an estimator.

    Input parameter:
    clf: estimator (or pipeline) to evaluate
    X: samples
    y: targets
    K: number of folds

    The folds are shuffled with random_state=0. The scoring is whatever the
    estimator's own score method returns. Prints the per-fold scores followed
    by their mean and standard error; nothing is returned.
    '''
    folds = KFold(n_splits=K, random_state=0, shuffle=True)
    scores = cross_val_score(clf, X, y, cv=folds)
    print(scores)
    summary = "Mean score: %.3f (+/-%.3f)" % (np.mean(scores), sem(scores))
    print(summary)
def calc_params(X, y, clf, param_values, param_name, K):
    '''Score an estimator for several values of one parameter and plot the result.

    Input parameter:
    X: samples
    y: targets
    clf: estimator (or pipeline) supporting set_params / fit / score
    param_values: values to try for the parameter
    param_name: name of the parameter as understood by clf.set_params
    K: number of cross-validation folds

    For every value the estimator is fitted on each of K shuffled folds
    (random_state=0) and scored on both the training and the held-out part.
    The mean scores are printed per value and plotted on a logarithmic x axis.

    Returns (train_scores, test_scores), one mean score per parameter value.
    '''
    samples = np.array(X)
    targets = np.array(y)
    n_values = len(param_values)
    train_scores = np.zeros(n_values)
    test_scores = np.zeros(n_values)
    for i, param_value in enumerate(param_values):
        clf.set_params(**{param_name: param_value})
        fold_train_scores = np.zeros(K)
        fold_test_scores = np.zeros(K)
        splitter = KFold(n_splits=K, shuffle=True, random_state=0)
        for fold, (train_idx, test_idx) in enumerate(splitter.split(samples)):
            # Fit on the training part of the fold, score on both parts.
            clf.fit(samples[train_idx], targets[train_idx])
            fold_train_scores[fold] = clf.score(samples[train_idx], targets[train_idx])
            fold_test_scores[fold] = clf.score(samples[test_idx], targets[test_idx])
        train_scores[i] = np.mean(fold_train_scores)
        test_scores[i] = np.mean(fold_test_scores)
        print(param_name, '=', param_value, "Train =", train_scores[i], "Test =", test_scores[i])
    # Training and cross-validation curves share one log-scaled plot.
    plt.semilogx(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")
    return train_scores, test_scores
# Features: sentences with stop words removed; target: the sentiment label.
X = df['Sen_Out_StopWord']
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
df.columns
# TF-IDF vocabulary is fitted on the training split; the test split is only transformed.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
X_tfidf
# Pipeline used for cross-validation and for tuning alpha.
clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(clf, X_train, y_train, 5)
alphas = np.logspace(-2, 1, 20)
print(alphas)
train_scores, test_scores = calc_params(X_train, y_train, clf, alphas, 'nb__alpha', 5)
# Final Naive Bayes model with the hand-picked alpha.
mnb = MultinomialNB(alpha= 0.37)
mnb.fit(X_tfidf, y_train)
mnb_pred = mnb.predict(X_test_tfidf)
print(mnb_pred)
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). The arguments were swapped,
# which exchanges precision with recall and reports the support of the
# predictions instead of the true labels.
mnb_cr = classification_report(y_test, mnb_pred)
print(mnb_cr)
X_train.shape
X_test.shape
# Perform classification with SVM, kernel=linear and invoking a pipeline.
from sklearn import svm
from sklearn.svm import SVC
classifier_linear = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='linear')),
])
# SVM without tuning, evaluated with the cross-validation helper defined above.
evaluate_cross_validation(classifier_linear, X_train, y_train, 5)
# Tune C over a small fixed set of values. A larger set could be used, but it would take longer to run.
c_vals = [1, 5, 10, 50, 100]
# Train and cross-validation scores for every C value, using 5 folds.
train_scores, test_scores = calc_params(X_train, y_train, classifier_linear, c_vals, 'svc__C', 5)
# Grid over every combination of gamma and the C values used above.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
clf_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(clf_rbf, parameters, verbose=2, cv=3)
# Search on the training split only. Fitting the search on the full X, y lets
# the held-out test rows influence the chosen hyper-parameters, which makes
# the test score reported below optimistic.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# Final RBF classifier with fixed C and gamma.
classifier_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=5, gamma=1)),
])
# Fitting the final classifier on the training split
classifier_rbf.fit(X_train, y_train)
# Predicting on test set
svc_pred = classifier_rbf.predict(X_test)
# Accuracy on the test split, computed in two equivalent ways
print(classifier_rbf.score(X_test, y_test))
print("SVM rbf Accuracy:" , metrics.accuracy_score(y_test,svc_pred))
# classification report for the data set with stop words removed
svm_cr = classification_report(y_test,svc_pred)
print(svm_cr)
# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Fix the class order explicitly and reuse it for the tick labels.
# y_test.unique() lists classes in order of first appearance, which need not
# match the sorted order confusion_matrix uses for its rows and columns, so the
# heatmap could be labelled with the wrong classes.
class_labels = sorted(set(y_test) | set(svc_pred))
mat = confusion_matrix(y_test, svc_pred, labels=class_labels)
mat.T
# plotting a heatmap of the confusion matrix for better visualization;
# the matrix is transposed so that columns are true labels and rows predictions.
fig, ax = plt.subplots(figsize=(8,8))
ax = sns.heatmap(mat.T, square=True, linecolor='grey', linewidths=1, annot=True,
                 fmt='d', cbar=True, cmap='Reds', ax=ax, annot_kws={"fontsize":12, "weight":"bold"},
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# Widen the y limits by half a cell on each side.
# NOTE(review): looks like a workaround for clipped top/bottom heatmap rows in
# some matplotlib releases -- confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label');
# Spot-check the fitted RBF pipeline on three hand-written sentences.
# The raw text is passed straight to the pipeline, without the cleaning steps
# that were applied to the training sentences.
comment = """achha lagta hain """ # I like it
print(classifier_rbf.predict([comment]))
comment = """umaima ka dimag kharab hain aaj""" # Umaima is mad today
print(classifier_rbf.predict([comment]))
comment = """Rauf kya chal rahan hain""" # what's going on Rauf
print(classifier_rbf.predict([comment]))
# Same experiment on the Lower_Case column: punctuation and digits removed,
# lower-cased, but stop words NOT removed.
X = df['Lower_Case']
y = df['Response']
X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
# TF-IDF vocabulary is fitted on the training split; the test split is only transformed.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
### Let's set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes
clf_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(clf_2, X_train, y_train, 5)
alphas = np.logspace(-2, 1, 20)
print(alphas)
train_scores, test_scores = calc_params(X_train, y_train, clf_2, alphas, 'nb__alpha', 5)
# Final Naive Bayes model with the hand-picked alpha.
mnb_2 = MultinomialNB(alpha= 0.18)
mnb_2.fit(X_tfidf, y_train)
# Performance on the test data
mnb_pred_2 = mnb_2.predict(X_test_tfidf)
print(mnb_pred_2)
# classification_report takes (y_true, y_pred); the arguments were swapped,
# which exchanges precision with recall in the printed report.
mnb_cr_2 = classification_report(y_test, mnb_pred_2)
print(mnb_cr_2)
# Perform classification with SVM, kernel=linear, using a pipeline
from sklearn import svm
from sklearn.svm import SVC
classifi_linear = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='linear')),
])
# SVM without tuning, evaluated with the cross-validation helper defined above.
evaluate_cross_validation(classifi_linear, X_train, y_train, 5)
# Tune C over a small fixed set of values.
c_vals = [1, 5, 10, 50, 100]
# Train and cross-validation scores for every C value, using 5 folds.
train_scores, test_scores = calc_params(X_train, y_train, classifi_linear, c_vals, 'svc__C', 5)
# Grid over every combination of gamma and C.
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
clf_rbf_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(clf_rbf_2, parameters, verbose=2, cv=3)
# Search on the training split only, so the held-out test rows cannot
# influence the chosen hyper-parameters.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# using kernel = rbf with fixed C and gamma
classifi_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=5, gamma=1.0)),
])
# fitting on the training split
classifi_rbf.fit(X_train, y_train)
# predicting on test set
svc_pred_2 = classifi_rbf.predict(X_test)
# accuracy on the test split, computed in two equivalent ways
print(classifi_rbf.score(X_test, y_test))
print("SVM rbf Accuracy:" , metrics.accuracy_score(y_test,svc_pred_2))
# classification report for the data set with stop words kept
svm_cr_2 = classification_report(y_test,svc_pred_2)
print(svm_cr_2)
# Dropping neutral sentiment sentences, leaving a Positive / Negative data set.
data = roman_urdu_df[roman_urdu_df.Response != 'Neutral']
data = pd.DataFrame(data[['Sentence', 'Response']]) #converting into dataframe
data['Response'].value_counts()
# Same cleaning chain as for df: remove punctuation/digits, lower-case, drop stop words.
data['Sentence_Removal'] = data['Sentence'].apply(lambda x: remove_punct(str(x)))
data['Lower_Case'] = data['Sentence_Removal'].apply(lambda x: convert_to_lower_case(str(x))) #calling lower case on each line
# Note: removeStopWordss also appends to the globals dictStopWords and
# forFastTextData, so this second pass adds to what the first pass recorded.
data['Sen_Out_StopWord'] = data['Lower_Case'].apply(lambda x: removeStopWordss(x)) # sentence with removed stop words
data.head()
X = data['Sen_Out_StopWord']
y = data['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train.shape
X_test.shape
# TF-IDF vocabulary is fitted on the training split; the test split is only transformed.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_tfidf.shape
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))
X_test_tfidf.shape
X_tfidf
### Let's set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes
B_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])
evaluate_cross_validation(B_clf, X_train, y_train, 5)
alphas = np.logspace(-2, 1, 20)
print(alphas)
train_scores, test_scores = calc_params(X_train, y_train, B_clf, alphas, 'nb__alpha', 5)
# Final Naive Bayes model with the hand-picked alpha.
B_mnb = MultinomialNB(alpha= 0.78)
B_mnb.fit(X_tfidf, y_train)
### Performance on the test data
B_mnb_pred = B_mnb.predict(X_test_tfidf)
print(B_mnb_pred)
# classification_report takes (y_true, y_pred); the arguments were swapped,
# which exchanges precision with recall in the printed report.
B_mnb_cr = classification_report(y_test, B_mnb_pred)
print(B_mnb_cr)
# Perform classification with SVM, kernel=linear and invoking a pipeline.
from sklearn import svm
from sklearn.svm import SVC
B_classifier_linear = Pipeline([
('vect', TfidfVectorizer()),
('svc', SVC(kernel='linear')),
])
# SVM without tuning, evaluated with the cross-validation helper defined above.
evaluate_cross_validation(B_classifier_linear, X_train, y_train, 5)
# Tune C over a small fixed set of values. A larger set could be used, but it would take longer to run.
c_vals = [1, 5, 10, 50, 100]
# Train and cross-validation scores for every C value, using 5 folds.
train_scores, test_scores = calc_params(X_train, y_train, B_classifier_linear, c_vals, 'svc__C', 5)
# Grid over every combination of gamma and the C values used above.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
B_clf_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(B_clf_rbf, parameters, verbose=2, cv=3)
# Search on the training split only, so the held-out test rows cannot
# influence the chosen hyper-parameters.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# Final RBF classifier with fixed C and gamma.
B_classifier_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=1, gamma= 1.0)),
])
# Fitting the final classifier on the training split
B_classifier_rbf.fit(X_train, y_train)
# Predicting on test set
B_svc_pred = B_classifier_rbf.predict(X_test)
# Accuracy on the test split, computed in two equivalent ways
print(B_classifier_rbf.score(X_test, y_test))
print("SVM rbf Accuracy:" , metrics.accuracy_score(y_test,B_svc_pred))
# results
B_svm_cr = classification_report(y_test,B_svc_pred)
print(B_svm_cr)
# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Fix the class order explicitly and reuse it for the tick labels.
# y_test.unique() lists classes in order of first appearance, which need not
# match the sorted order confusion_matrix uses for its rows and columns, so the
# heatmap could be labelled with the wrong classes.
class_labels = sorted(set(y_test) | set(B_svc_pred))
mat_3 = confusion_matrix(y_test, B_svc_pred, labels=class_labels)
mat_3
# plotting a heatmap of the confusion matrix;
# the matrix is transposed so that columns are true labels and rows predictions.
fig, ax = plt.subplots(figsize=(8,8))
ax = sns.heatmap(mat_3.T, square=True, linecolor='grey', linewidths=1, annot=True,
                 fmt='d', cbar=True, cmap='Reds', ax=ax, annot_kws={"fontsize":12, "weight":"bold"},
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# Widen the y limits by half a cell on each side.
# NOTE(review): looks like a workaround for clipped top/bottom heatmap rows in
# some matplotlib releases -- confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label');
import matplotlib.ticker as mtick
# Each uncommented expression below slices one classification-report string,
# starting 35 characters after the word 'accuracy' and ending at a fixed
# absolute index (261 or 207). The commented-out lines are earlier per-class
# slicing attempts.
#mnb_cr[mnb_cr.find('Negative')+len('Negative'):100].split()
#mnb_cr[mnb_cr.find('Neutral')+len('Neutral'):155].split()
#mnb_cr[mnb_cr.find('Positive')+len('Positive'):210].split()
mnb_cr[mnb_cr.find('accuracy')+35:261] # Naive Bayes, stop words removed, unbalanced dataset
#svm_cr[svm_cr.find('Negative')+len('Negative'):100].split()
#svm_cr[svm_cr.find('Neutral')+len('Neutral'):155].split()
#svm_cr[svm_cr.find('Positive')+len('Positive'):210].split()
svm_cr[svm_cr.find('accuracy')+35:261] # SVM, stop words removed, unbalanced dataset
#B_svm_cr[B_svm_cr.find('Negative')+len('Negative'):100].split()
#B_svm_cr[B_svm_cr.find('Neutral')+len('Neutral'):155].split()
#B_svm_cr[B_svm_cr.find('Positive')+len('Positive'):210].split()
B_svm_cr[B_svm_cr.find('accuracy')+35:207] # SVM, stop words removed, balanced dataset
#B_mnb_cr[mnb_cr.find('Negative')+len('Negative'):100].split()
#B_mnb_cr[mnb_cr.find('Neutral')+len('Neutral'):155].split()
#B_mnb_cr[mnb_cr.find('Positive')+len('Positive'):210].split()
B_mnb_cr[B_mnb_cr.find('accuracy')+35:207] # Naive Bayes, stop words removed, balanced dataset
def _accuracy_from_report(report):
    '''Return the value printed on the 'accuracy' row of a classification report.'''
    match = re.search(r'^\s*accuracy\s+([0-9]*\.?[0-9]+)', report, re.MULTILINE)
    if match is None:
        raise ValueError("no 'accuracy' row found in classification report")
    return float(match.group(1))


def accuracy_graph(cr1, cr2, cr3, cr4):
    '''Bar chart comparing the accuracy of SVM and Multinomial Naive Bayes.

    Input parameter:
    cr1: classification report text of Naive Bayes on the unbalanced data
    cr2: classification report text of SVM on the unbalanced data
    cr3: classification report text of Naive Bayes on the balanced data
    cr4: classification report text of SVM on the balanced data

    The accuracy is read from the 'accuracy' row of each report rather than
    from fixed character offsets, so it no longer depends on the number of
    classes or the width of the label column. Raises ValueError when a report
    has no accuracy row.
    '''
    svm_scores = [_accuracy_from_report(cr4) * 100, _accuracy_from_report(cr2) * 100]
    # Named nb_scores so the local does not shadow the MultinomialNB class.
    nb_scores = [_accuracy_from_report(cr3) * 100, _accuracy_from_report(cr1) * 100]
    index = ['Balanced_Data','UnBalanced_Data']
    acc_pd = pd.DataFrame(data = {'SVM': svm_scores,'MultinomialNB': nb_scores},index=index)
    ax = acc_pd.plot(kind='bar', ylim=(0,100), xlabel='dataSetType', ylabel = 'Accuracy', legend=True)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_title('Accuracy Score of SVM/MNB')
# Compare Naive Bayes and SVM on the unbalanced (mnb_cr, svm_cr) and balanced
# (B_mnb_cr, B_svm_cr) data sets.
accuracy_graph(mnb_cr, svm_cr, B_mnb_cr, B_svm_cr)