# Sentiment Analysis on Roman Urdu

import numpy as np import pandas as pd import string import re import matplotlib.pyplot as plt from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Load dataset
def load_data(DATA_PATH):
    """Read the header-less CSV dataset into a DataFrame.

    Input parameter:
        DATA_PATH: location of the CSV file,
            e.g. 'sample_data/Roman Urdu DataSet.csv'.

    Returns:
        The DataFrame built by ``pd.read_csv`` with ``header=None``, i.e. the
        first row is treated as data and the columns are numbered from 0.
    """
    return pd.read_csv(DATA_PATH, header=None)

DATA_PATH = 'Roman Urdu DataSet.csv'
roman_urdu_df = load_data(DATA_PATH)  # raw frame, columns numbered 0..2
roman_urdu_df.head()

# Drop column 2 and give the two remaining columns readable names.
roman_urdu_df = roman_urdu_df.drop(columns=[2])
roman_urdu_df.columns = ['Sentence', 'Response']
roman_urdu_df.head()

# Missing-value counts before and after discarding incomplete rows.
roman_urdu_df.isnull().sum()
roman_urdu_df.dropna(inplace=True)
roman_urdu_df.isnull().sum()

roman_urdu_df['Response'].value_counts()

# The counts show a misspelt label ('Neative'): look at the affected rows,
# then fold them into the 'Negative' class.
misspelt_rows = roman_urdu_df['Response'] == 'Neative'
roman_urdu_df[misspelt_rows]
roman_urdu_df.loc[misspelt_rows, 'Response'] = 'Negative'

roman_urdu_df['Response'].value_counts()

# Number of sentences carrying each sentiment label.
response_column = roman_urdu_df['Response']
positive = len(roman_urdu_df[response_column == 'Positive'])
Negative = len(roman_urdu_df[response_column == 'Negative'])
Neutral = len(roman_urdu_df[response_column == 'Neutral'])

# bar plot of the 3 classes: (x position, height, legend label, colour)
bar_specs = (
    (10, positive, "Positive", 'green'),
    (15, Negative, "Negative", 'red'),
    (20, Neutral, "Neutral", 'blue'),
)
for x_position, height, legend_label, colour in bar_specs:
    plt.bar(x_position, height, 3, label=legend_label, color=[colour])
plt.legend()
plt.ylabel('count')
plt.title('Overall Sentiment Responses')
plt.show()

# Whitespace-separated word count of every sentence.
roman_urdu_df['len'] = roman_urdu_df['Sentence'].str.split().str.len()

# Sentence lengths grouped by label, in order of first appearance (sort=False).
lengths_by_response = roman_urdu_df.groupby(['Response'], sort=False)['len']

lengths_by_response.mean().plot(
    kind='bar',
    title="Average number of words per sentiment",
    xlabel="Sentiment",
    color=['g', 'b', 'r'],
)

lengths_by_response.max().plot(
    kind='bar',
    title="Maximum number of words per sentiment",
    xlabel="Sentiment",
    color=['g', 'b', 'r'],
)

# Word cloud built from every sentence joined into one long string.
text_l = roman_urdu_df['Sentence'].tolist()
text = " ".join(text_l)
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')

print('Dataset size:', roman_urdu_df.shape)
print('Columns are:', roman_urdu_df.columns)

roman_urdu_df.dtypes

# Working copy holding only the text and its label.
df = pd.DataFrame(roman_urdu_df.loc[:, ['Sentence', 'Response']])

# The characters treated as punctuation by the cleaning helpers.
string.punctuation

# Translation table deleting every ASCII punctuation character and digit.
_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def remove_punct(text):
    """Delete ASCII punctuation and digits from ``text``.

    Input parameter:
        text: (str) one line of text.

    Returns:
        ``text`` with every character of ``string.punctuation`` and every
        digit 0-9 removed. The characters are deleted outright, not replaced
        by a space, so surrounding whitespace is left exactly as it was.
    """
    # A single C-level pass replaces the per-character join and the
    # separate digit-stripping regex of the earlier implementation.
    return text.translate(_PUNCT_AND_DIGITS)

# removing all the punctuation (each value is coerced to str first)
df['Sentence_Removal'] = [
    remove_punct(str(sentence)) for sentence in df['Sentence']
]

df.head(5)  # testing to view transformed data

def convert_to_lower_case(text):
    """Lower-case ``text`` and drop any ASCII punctuation it still contains.

    Input parameter:
        text: (str) one line of text.

    Returns:
        A new string made of the lower-cased characters of ``text``, skipping
        every character found in ``string.punctuation``.
    """
    # Iterate the characters directly; the single-element unpacking target
    # used before (``for [char] in text``) was an unnecessary detour.
    return "".join(
        char.lower() for char in text if char not in string.punctuation
    )

# calling lower case on each line
lowered_sentences = df['Sentence_Removal'].map(
    lambda sentence: convert_to_lower_case(str(sentence))
)
df['Lower_Case'] = lowered_sentences

df.head(5)  # testing lower case of alphabet

# WITHOUT ANY PRE-PROCESSING CHANGED TO LOWER CASE
new = df['Lower_Case'].str

# list of stopWord
# some words like Mein - Ma - Maa
# after running clusters removed stopped words again
stopwords = ['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh', 'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch', 'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to','is', 'hi', 'jo', 'kya', 'thi', 'se', 'pe', 'phr', 'wala', 'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski', 'ne', 'haan', 'acha', 'nai', 'sent', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono', 'hoa', 'aese', 'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou','h','je','or','jee','he','in','un','kay','ki','ya','ap','meri','me']

# global variables filled as a side effect of removeStopWordss
dictStopWords = {}    # stop word -> number of occurrences removed so far
forFastTextData = []  # one token list per processed sentence


def removeStopWordss(text):
    """Return ``text`` lower-cased, letters only, with stop words removed.

    Input parameter:
        text: the sentence to clean; it is passed through ``str()`` first.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, and each token found in
    ``stopwords`` is dropped.

    Side effects:
        * ``dictStopWords`` gets the count of every removed stop word
          increased by one per occurrence.
        * ``forFastTextData`` gets the list of kept tokens appended.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = re.sub('[^a-zA-Z]', ' ', str(text)).lower()
    # Built per call so that later edits to ``stopwords`` are honoured.
    stopword_set = set(stopwords)

    # Build a new list instead of removing from the list being iterated:
    # removing in place skipped the token after every removed stop word,
    # so consecutive stop words were left in the sentence.
    kept_words = []
    for word in cleaned.split():
        if word in stopword_set:
            dictStopWords[word] = dictStopWords.get(word, 0) + 1
        else:
            kept_words.append(word)

    newSentence = " ".join(kept_words)
    forFastTextData.append(kept_words)
    return newSentence

# sentence with removed stop words
df['Sen_Out_StopWord'] = df['Lower_Case'].apply(removeStopWordss)
df.head()

## checking for most common stopwords from the dictionary
import collections
from collections import Counter

d = Counter(dictStopWords)  # creating a counter
d.most_common(10)
# Ten most frequent stop words mapped to their counts, most frequent first.
dictGraph = dict(d.most_common(10))
dictGraph

# plotting a graph of 10 most common stopwords from dictionary
plt.bar(dictGraph.keys(), dictGraph.values(), align='center', color='green')
plt.title('Bargraph for Most Common StopWords')
plt.xlabel('StopWords')
plt.ylabel('Count')
plt.show()

def replacing_characters(word):
    """Normalise the spelling of one Roman Urdu word with rewrite rules.

    Input Parameter:
        word: (str) a single word from a sentence. The ``^``/``$`` anchored
            rules refer to the start/end of this string.

    Returns:
        The word after all rewrite rules have been applied in order; the
        order matters because later rules see the output of earlier ones.
    """
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'aa+', r'aa', word)
    word = re.sub(r'e+', r'ee', word)
    # e.g "sahi and sai nahi". This rule used to be written twice; after one
    # pass no 'ai' is left, so the second pass could not change anything.
    word = re.sub(r'ai', r'ahi', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    # Drop the 'a' of 'ar' unless the word itself starts with 'ar'.
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Search the word itself: the previous code searched the literal string
    # 'word', so this rule could never fire. ``endswith`` also keeps the
    # check safe for an empty string.
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)
    if re.search(r'[^a]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[a-z]h', word):
        word = re.sub(r'h', '', word)
    return word

def stem_sentence(sentence):
    """Apply replacing_characters to every whitespace-separated word.

    replacing_characters is written for single words (its ^/$ rules anchor
    on the word boundaries), so the sentence is split first instead of being
    passed in as one string.
    """
    return " ".join(replacing_characters(word) for word in str(sentence).split())


# with stop words removed doing stem
df['stem'] = df['Sen_Out_StopWord'].apply(stem_sentence)
df.head()

df['Stem_On_Original'] = df['Lower_Case'].apply(stem_sentence)
df.head()

# NOTE: the result of this drop is not assigned, so df keeps the
# 'Sentence_Removal' column; the expression only shows the other columns:
#   Lower_Case       - punctuation, special characters and digits removed
#   Sen_Out_StopWord - Lower_Case with the stop words removed
#   stem             - stemming applied to Sen_Out_StopWord
#   Stem_On_Original - stemming applied to Lower_Case (stop words kept)
df.drop(columns=['Sentence_Removal'])

stats = df.drop(labels=["Sentence", "Lower_Case"], axis=1)
stats.describe()

from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.cluster import KMeans from sklearn.metrics import adjusted_rand_score import numpy

def generateWordCloudForClusters(n_clusters, clustermTerms):
    """Draw one word cloud per cluster from its collected terms.

    Input parameter:
        n_clusters: number of clusters; clusters 0..n_clusters-1 are drawn.
        clustermTerms: list of dictionaries, each with a 'cluster' number
            and one of that cluster's 'terms'.
    """
    for clusterno in range(n_clusters):
        # Terms whose entry belongs to the cluster currently being drawn.
        terms_of_cluster = [
            entry.get('terms')
            for entry in clustermTerms
            if entry.get('cluster') == clusterno
        ]
        cloud_maker = WordCloud(
            width=500, height=500, background_color='black', random_state=10
        )
        wordcloud = cloud_maker.generate(transformForWordCount(terms_of_cluster))
        plot_cloud(wordcloud)

def transformForWordCount(terms):
    """Join the terms of one cluster into the text fed to the word cloud.

    Input parameter:
        terms: the terms (strings) of one separate cluster.

    Returns:
        The terms separated by single spaces; an empty string when there
        are no terms.
    """
    # join() consumes the terms directly; the index-based copy into a
    # temporary list that preceded it was redundant.
    return " ".join(terms)

def plot_cloud(wordcloud):
    """Show ``wordcloud`` on a new figure with the axes hidden.

    Input parameter:
        wordcloud: the image-like object to display with ``plt.imshow``.
    """
    figure_size = (10, 10)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    plt.axis("off")

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


def plotKneeElbow(matrix, vale_for_range):
    """Plot k-means inertia against k to help choose the number of clusters.

    Input parameter:
        matrix: the vectorised documents to cluster.
        vale_for_range: exclusive upper bound for k; k runs from 2 to
            vale_for_range - 1.
    """
    K = range(2, vale_for_range)
    # Inertia (sum of squared distances to the closest centre) for each k.
    Sum_of_squared_distances = [
        KMeans(n_clusters=k, max_iter=500, n_init=10).fit(matrix).inertia_
        for k in K
    ]
    plt.plot(K, Sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal clusters')
    plt.show()

wholedf = df

# vectorization of the texts
vectorizer = TfidfVectorizer()


def vectorFeatures(dataSet):
    """Fit the shared TF-IDF vectorizer on one dataset and transform it.

    Input parameter:
        dataSet: DataFrame with a 'Sen_Out_StopWord' column, e.g. the whole
            data or only its Positive, Negative or Neutral rows.

    Returns:
        (matrixForm, words): the TF-IDF matrix of the column and the list of
        feature names, indexed like the matrix columns.

    The module-level ``vectorizer`` is re-fitted on every call, so it always
    reflects the most recent dataset only.
    """
    matrixForm = vectorizer.fit_transform(dataSet['Sen_Out_StopWord'])
    # used words (axis in our multi-dimensional space)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    words = list(feature_names())
    return matrixForm, words

def performKmeansClustering(n_clusters, max_iter, matrixForm):
    """Fit a verbose k-means model on a vectorised matrix.

    Input Parameter:
        n_clusters: number of clusters.
        max_iter: maximum number of iterations passed to KMeans.
        matrixForm: vectorised, transformed matrix to cluster.

    Returns:
        The fitted KMeans model.
    """
    kmeans_options = {
        "n_clusters": n_clusters,
        "max_iter": max_iter,
        "verbose": 1,
    }
    # fit() returns the estimator itself.
    return KMeans(**kmeans_options).fit(matrixForm)

# --- clustering of the whole dataset ---
matrixdfForm, words = vectorFeatures(wholedf)

plotKneeElbow(matrixdfForm, 10)

modelKmeans = performKmeansClustering(5, 500, matrixdfForm)

labels = modelKmeans.labels_
# indices of preferible words in each cluster (largest centre weight first)
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("centers:", modelKmeans.cluster_centers_)
print("labels", labels)
print("intertia:", modelKmeans.inertia_)

# number of texts assigned to each cluster
n_clusters = 5
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    texts_per_cluster[i_cluster] = numpy.count_nonzero(labels == i_cluster)

clusterDictionary = {}
clustermTerms = []
print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        top_word = words[term]
        clusterDictionary = {"cluster": i_cluster, "terms": top_word}
        clustermTerms.append(clusterDictionary)
        print("\t" + top_word)

n_clusters = 5

# segmenting positive clusters
positivedf = df[df['Response'] == 'Positive']

matrixForm, words = vectorFeatures(positivedf)

plotKneeElbow(matrixForm, 20)

modelKmeans = performKmeansClustering(5, 500, matrixForm)

labels = modelKmeans.labels_
# indices of preferible words in each cluster (largest centre weight first)
ordered_words = modelKmeans.cluster_centers_.argsort()[:, ::-1]
print("Positive centers:", modelKmeans.cluster_centers_)
print("Positive labels", labels)
print("Positive intertia:", modelKmeans.inertia_)

# number of positive texts assigned to each cluster
n_clusters = 5
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    texts_per_cluster[i_cluster] = numpy.count_nonzero(labels == i_cluster)

clusterDictionary = {}
clustermTerms = []
print("Top words per Positive cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        top_word = words[term]
        clusterDictionary = {"cluster": i_cluster, "terms": top_word}
        clustermTerms.append(clusterDictionary)
        print("\t" + top_word)

generateWordCloudForClusters(5, clustermTerms)

# segmenting negative clusters
negtivedf = df[df['Response'] == 'Negative']

matrixFormNeg, words = vectorFeatures(negtivedf)

plotKneeElbow(matrixFormNeg, 10)

# --- first attempt: 6 clusters ---
modelKmeansNeg = performKmeansClustering(6, 500, matrixFormNeg)

n_clusters = 6
labelsNeg = modelKmeansNeg.labels_
# indices of preferible words in each cluster (largest centre weight first)
ordered_words = modelKmeansNeg.cluster_centers_.argsort()[:, ::-1]
print("Negative centers:", modelKmeansNeg.cluster_centers_)
print("Negative labels", labelsNeg)
print("Negative intertia:", modelKmeansNeg.inertia_)

# Count with the labels of the negative model; the counts were previously
# taken from `labels`, which belongs to the positive-sentiment model.
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    texts_per_cluster[i_cluster] = numpy.count_nonzero(labelsNeg == i_cluster)

print("Top Negative words per 6 cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])

# --- second attempt: 5 clusters (the one used for the word clouds) ---
modelKmeansNeg = performKmeansClustering(5, 500, matrixFormNeg)

labelsNeg = modelKmeansNeg.labels_
# indices of preferible words in each cluster
ordered_words = modelKmeansNeg.cluster_centers_.argsort()[:, ::-1]
print("Negative centers:", modelKmeansNeg.cluster_centers_)
print("Negative labels", labelsNeg)
print("Negative intertia:", modelKmeansNeg.inertia_)

n_clusters = 5
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    texts_per_cluster[i_cluster] = numpy.count_nonzero(labelsNeg == i_cluster)

print("Top Negative words per cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])

generateWordCloudForClusters(n_clusters, clustermTerms)

# segmenting neutral clusters
neutraldf = df[df['Response'] == 'Neutral']

matrixFormNeu, words = vectorFeatures(neutraldf)

# Elbow plot of the neutral matrix (the negative matrix was plotted here).
plotKneeElbow(matrixFormNeu, 10)

modelKmeansNeu = performKmeansClustering(5, 500, matrixFormNeu)

# Set explicitly so this section does not rely on a value left over from
# the previous one; it matches the 5 clusters fitted above.
n_clusters = 5
labelsNeu = modelKmeansNeu.labels_
# indices of preferible words in each cluster. They must come from the
# neutral model: `words` is the neutral vocabulary, so indices taken from
# the negative model's centres would point at unrelated (or missing) words.
ordered_words = modelKmeansNeu.cluster_centers_.argsort()[:, ::-1]
print("Neutral centers:", modelKmeansNeu.cluster_centers_)
print("Neutral labels", labelsNeu)
print("Neutral intertia:", modelKmeansNeu.inertia_)

# Count with the labels of the neutral model (previously `labels`, which
# belongs to the positive-sentiment model).
texts_per_cluster = numpy.zeros(n_clusters)
for i_cluster in range(n_clusters):
    texts_per_cluster[i_cluster] = numpy.count_nonzero(labelsNeu == i_cluster)

print("Top Neutral words per cluster:")
clustermTerms = []
clusterDictionary = {}
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster]))
    for term in ordered_words[i_cluster, :10]:
        clusterDictionary = {"cluster": i_cluster, "terms": words[term]}
        clustermTerms.append(clusterDictionary)
        print("\t" + words[term])

generateWordCloudForClusters(n_clusters, clustermTerms)

from gensim.models import FastText #from gensim.test.utils import common_texts # some example sentences

# FastText embeddings trained on the token lists collected by
# removeStopWordss.
fasttext_corpus = forFastTextData

model = FastText(vector_size=15, window=4, min_count=1)  # instantiate

model.build_vocab(corpus_iterable=fasttext_corpus)
model.train(
    corpus_iterable=fasttext_corpus,
    total_examples=len(fasttext_corpus),
    epochs=10,
)

# nearest neighbours of single words
model.wv.most_similar("insan")

model.wv.most_similar("larki")

# similarity between word pairs
model.wv.similarity(w1='khushi', w2='achi')

model.wv.similarity(w1='waqt', w2='gahri')

from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split from sklearn import metrics from sklearn.metrics import classification_report from sklearn.naive_bayes import MultinomialNB

### We can use this function to do cross-validation
# use for both naive bayes and SVM
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem


def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` and their mean.

    Input parameter:
        clf: the estimator (or pipeline) to evaluate.
        X, y: samples and their labels.
        K: number of folds.
    """
    # shuffled K-fold iterator with a fixed seed
    folds = KFold(n_splits=K, random_state=0, shuffle=True)
    # by default the score used is the one returned by the score method of
    # the estimator (accuracy)
    fold_scores = cross_val_score(clf, X, y, cv=folds)
    print(fold_scores)
    mean_score = np.mean(fold_scores)
    standard_error = sem(fold_scores)
    print("Mean score: %.3f (+/-%.3f)" % (mean_score, standard_error))

def calc_params(X, y, clf, param_values, param_name, K):
    """Score ``clf`` with K-fold CV for each value of one hyper-parameter.

    Input parameter:
        X, y: samples and labels (converted to NumPy arrays).
        clf: estimator or pipeline supporting set_params/fit/score.
        param_values: the values to try.
        param_name: name of the parameter, as accepted by clf.set_params.
        K: number of folds.

    Prints the mean train/test score for every value, plots both curves on a
    logarithmic x axis and returns ``(train_scores, test_scores)``.
    """
    X = np.array(X)
    y = np.array(y)

    n_values = len(param_values)
    train_scores = np.zeros(n_values)
    test_scores = np.zeros(n_values)

    for i, param_value in enumerate(param_values):
        clf.set_params(**{param_name: param_value})

        # per-fold accuracies for the current parameter value
        fold_train_scores = np.zeros(K)
        fold_test_scores = np.zeros(K)

        cv = KFold(n_splits=K, shuffle=True, random_state=0)
        for fold, (train, test) in enumerate(cv.split(X)):
            clf.fit(X[train], y[train])
            fold_train_scores[fold] = clf.score(X[train], y[train])
            fold_test_scores[fold] = clf.score(X[test], y[test])

        # store the mean of the K fold scores
        train_scores[i] = np.mean(fold_train_scores)
        test_scores[i] = np.mean(fold_test_scores)
        print(param_name, '=', param_value,
              "Train =", train_scores[i], "Test =", test_scores[i])

    # training and testing scores on a log scale
    curves = (
        (train_scores, 'Train', 'b'),
        (test_scores, 'X-Val', 'g'),
    )
    for scores, curve_label, colour in curves:
        plt.semilogx(param_values, scores, label=curve_label,
                     alpha=0.4, lw=2, c=colour)
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    return train_scores, test_scores

# Features: sentences with stop words removed; target: sentiment label.
X, y = df['Sen_Out_StopWord'], df['Response']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

df.columns

# TF-IDF fitted on the training sentences only, then applied to the test set.
tfidf = TfidfVectorizer()

train_documents = X_train.values.astype('U')
X_tfidf = tfidf.fit_transform(train_documents)

X_tfidf.shape

test_documents = X_test.values.astype('U')
X_test_tfidf = tfidf.transform(test_documents)

X_test_tfidf.shape

X_tfidf

# Multinomial Naive Bayes on the stop-word-free sentences.
clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])

evaluate_cross_validation(clf, X_train, y_train, 5)

# candidate smoothing values on a log scale
alphas = np.logspace(-2, 1, 20)
print(alphas)

train_scores, test_scores = calc_params(X_train, y_train, clf, alphas, 'nb__alpha', 5)

# final model with the hand-picked alpha
mnb = MultinomialNB(alpha=0.37)

mnb.fit(X_tfidf, y_train)

mnb_pred = mnb.predict(X_test_tfidf)
print(mnb_pred)

from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); with the arguments the
# other way round precision and recall are swapped in the report.
mnb_cr = classification_report(y_test, mnb_pred)
print(mnb_cr)

X_train.shape

X_test.shape

# Perform classification with SVM, kernel=linear and invoking a pipeline.
from sklearn import svm
from sklearn.svm import SVC
classifier_linear = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='linear')),
])

# SVM without tuning, calling the cross validation method above.
evaluate_cross_validation(classifier_linear, X_train, y_train, 5)

# We are tuning the C parameter using a fixed range of values. Larger sets
# could be used but would take more time to execute, so a smaller set is
# taken here.
c_vals = [1, 5, 10, 50, 100]
# Train and test scores for each C value with 5 folds.
train_scores, test_scores = calc_params(X_train, y_train, classifier_linear, c_vals, 'svc__C', 5)

# Grid over combinations of gamma and the C values from above.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
clf_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])

gs = GridSearchCV(clf_rbf, parameters, verbose=2, cv=3)

# Search on the training split only: fitting the grid search on the full
# X, y lets the held-out test rows influence the chosen hyper-parameters.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_

# NOTE(review): C and gamma are hard-coded rather than read from
# gs.best_params_ - confirm they still match the search result.
classifier_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=5, gamma=1)),
])

# Fitting the final classifier
classifier_rbf.fit(X_train, y_train)

# Predicting on test set
svc_pred = classifier_rbf.predict(X_test)

# accuracy on the held-out test set
print(classifier_rbf.score(X_test, y_test))

print("SVM rbf Accuracy:", metrics.accuracy_score(y_test, svc_pred))

# classification report for the dataset with stop words removed
svm_cr = classification_report(y_test, svc_pred)
print(svm_cr)

# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix

# Fix the class order once and use it for both the matrix and the tick
# labels. y_test.unique() lists classes in order of first appearance, which
# need not match the sorted order confusion_matrix uses by default.
class_labels = sorted(set(y_test) | set(svc_pred))
mat = confusion_matrix(y_test, svc_pred, labels=class_labels)

mat.T

# plotting a heatmap of the confusion matrix for better visualisation;
# mat.T puts the predicted label on the rows and the true label on the columns.
fig, ax = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(mat.T, square=True, linecolor='grey', linewidths=1, annot=True,
                 fmt='d', cbar=True, cmap='Reds', ax=ax,
                 annot_kws={"fontsize": 12, "weight": "bold"},
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# Widen the y limits by half a cell at both ends.
# NOTE(review): presumably a workaround for a matplotlib release that cut
# off the first and last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label')

# Spot-check the final RBF classifier on a few hand-written comments.
sample_comments = (
    """achha lagta hain """,                 # I like it
    """umaima ka dimag kharab hain aaj""",   # Umiama is mad today
    """Rauf kya chal rahan hain""",          # what's going on Rauf
)
for comment in sample_comments:
    print(classifier_rbf.predict([comment]))

# the below is the original dataset changed to lower case with punctuation
# removed only, i.e. not cleaned of stop words.
X, y = df['Lower_Case'], df['Response']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# TF-IDF fitted on the training sentences only, then applied to the test set.
tfidf = TfidfVectorizer()

train_documents = X_train.values.astype('U')
X_tfidf = tfidf.fit_transform(train_documents)

X_tfidf.shape

test_documents = X_test.values.astype('U')
X_test_tfidf = tfidf.transform(test_documents)

X_test_tfidf.shape

### Let's set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes
clf_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])

evaluate_cross_validation(clf_2, X_train, y_train, 5)

# candidate smoothing values on a log scale
alphas = np.logspace(-2, 1, 20)
print(alphas)

train_scores, test_scores = calc_params(X_train, y_train, clf_2, alphas, 'nb__alpha', 5)

# final model with the hand-picked alpha
mnb_2 = MultinomialNB(alpha=0.18)

mnb_2.fit(X_tfidf, y_train)

# Performance on the test data
mnb_pred_2 = mnb_2.predict(X_test_tfidf)
print(mnb_pred_2)

# classification_report expects (y_true, y_pred); with the arguments the
# other way round precision and recall are swapped in the report.
mnb_cr_2 = classification_report(y_test, mnb_pred_2)
print(mnb_cr_2)

# Perform classification with SVM, kernel=linear, calling pipeline
from sklearn import svm
from sklearn.svm import SVC
classifi_linear = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='linear')),
])

# SVM without tuning, calling the cross validation method above.
evaluate_cross_validation(classifi_linear, X_train, y_train, 5)

# We are tuning the C parameter using a fixed range of values.
c_vals = [1, 5, 10, 50, 100]
# calling calc_params function from above
train_scores, test_scores = calc_params(X_train, y_train, classifi_linear, c_vals, 'svc__C', 5)

parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
clf_rbf_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])

gs = GridSearchCV(clf_rbf_2, parameters, verbose=2, cv=3)

# Search on the training split only: fitting the grid search on the full
# X, y lets the held-out test rows influence the chosen hyper-parameters.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_

# using kernel = rbf
# NOTE(review): C and gamma are hard-coded rather than read from
# gs.best_params_ - confirm they still match the search result.
classifi_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=5, gamma=1.0)),
])

# fitting on the entire training data
classifi_rbf.fit(X_train, y_train)

# predicting on test set
svc_pred_2 = classifi_rbf.predict(X_test)

# accuracy on the entire test dataset
print(classifi_rbf.score(X_test, y_test))

print("SVM rbf Accuracy:", metrics.accuracy_score(y_test, svc_pred_2))

# classification report using the dataset without removing stop words
svm_cr_2 = classification_report(y_test, svc_pred_2)
print(svm_cr_2)

# Dropping neutral sentiment sentences and keeping text and label only.
not_neutral = roman_urdu_df.Response != 'Neutral'
data = pd.DataFrame(roman_urdu_df.loc[not_neutral, ['Sentence', 'Response']])

data['Response'].value_counts()

# Same cleaning chain as for the full dataset:
# punctuation/digits removed -> lower case -> stop words removed.
data['Sentence_Removal'] = data['Sentence'].apply(
    lambda sentence: remove_punct(str(sentence))
)

data['Lower_Case'] = data['Sentence_Removal'].apply(
    lambda sentence: convert_to_lower_case(str(sentence))
)

data['Sen_Out_StopWord'] = data['Lower_Case'].apply(removeStopWordss)
data.head()

X, y = data['Sen_Out_StopWord'], data['Response']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

X_train.shape

X_test.shape

# TF-IDF fitted on the training sentences only, then applied to the test set.
tfidf = TfidfVectorizer()

train_documents = X_train.values.astype('U')
X_tfidf = tfidf.fit_transform(train_documents)

X_tfidf.shape

test_documents = X_test.values.astype('U')
X_test_tfidf = tfidf.transform(test_documents)

X_test_tfidf.shape

X_tfidf

### Let's set up a pipeline to perform preprocessing of the data and
### classification of the documents using Multinomial Naive Bayes
B_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', MultinomialNB(alpha=0.01)),
])

evaluate_cross_validation(B_clf, X_train, y_train, 5)

# candidate smoothing values on a log scale
alphas = np.logspace(-2, 1, 20)
print(alphas)

train_scores, test_scores = calc_params(X_train, y_train, B_clf, alphas, 'nb__alpha', 5)

# final model with the hand-picked alpha
B_mnb = MultinomialNB(alpha=0.78)

B_mnb.fit(X_tfidf, y_train)

### Performance on the test data
B_mnb_pred = B_mnb.predict(X_test_tfidf)
print(B_mnb_pred)

# classification_report expects (y_true, y_pred); with the arguments the
# other way round precision and recall are swapped in the report.
B_mnb_cr = classification_report(y_test, B_mnb_pred)
print(B_mnb_cr)

# Perform classification with SVM, kernel=linear and invoking a pipeline.
from sklearn import svm
from sklearn.svm import SVC
B_classifier_linear = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='linear')),
])

# SVM without tuning, calling the cross validation method above.
evaluate_cross_validation(B_classifier_linear, X_train, y_train, 5)

# Grid over combinations of gamma and C.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
parameters = {
    'svc__gamma': np.logspace(-3, 0, 4),
    'svc__C': [1, 5, 10, 50, 100],
}
B_clf_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf')),
])
gs = GridSearchCV(B_clf_rbf, parameters, verbose=2, cv=3)
# Search on the training split only: fitting the grid search on the full
# X, y lets the held-out test rows influence the chosen hyper-parameters.
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_

# NOTE(review): C and gamma are hard-coded rather than read from
# gs.best_params_ - confirm they still match the search result.
B_classifier_rbf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf', C=1, gamma=1.0)),
])

# Fitting the final classifier
B_classifier_rbf.fit(X_train, y_train)

# Predicting on test set
B_svc_pred = B_classifier_rbf.predict(X_test)

# accuracy on the entire test set
print(B_classifier_rbf.score(X_test, y_test))

print("SVM rbf Accuracy:", metrics.accuracy_score(y_test, B_svc_pred))

# results
B_svm_cr = classification_report(y_test, B_svc_pred)
print(B_svm_cr)

# creating a confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix

# Fix the class order once and use it for both the matrix and the tick
# labels. y_test.unique() lists classes in order of first appearance, which
# need not match the sorted order confusion_matrix uses by default.
class_labels = sorted(set(y_test) | set(B_svc_pred))
mat_3 = confusion_matrix(y_test, B_svc_pred, labels=class_labels)
mat_3

# plotting a heatmap of the confusion matrix values above;
# mat_3.T puts the predicted label on the rows and the true label on the columns.
fig, ax = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(mat_3.T, square=True, linecolor='grey', linewidths=1, annot=True,
                 fmt='d', cbar=True, cmap='Reds', ax=ax,
                 annot_kws={"fontsize": 12, "weight": "bold"},
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# Widen the y limits by half a cell at both ends.
# NOTE(review): presumably a workaround for a matplotlib release that cut
# off the first and last heatmap rows - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label')

import matplotlib.ticker as mtick

# Accuracy value quoted in each classification report. The value is located
# by pattern rather than by fixed character offsets, which only worked for
# one particular report layout.
accuracy_row = re.compile(r'accuracy\s+([0-9]*\.?[0-9]+)')

# dataset using stopword on unbalanced dataset
accuracy_row.search(mnb_cr).group(1)

# dataset using stopword on unbalanced dataset
accuracy_row.search(svm_cr).group(1)

# dataset using stopWord on balanced dataset
accuracy_row.search(B_svm_cr).group(1)

# dataset using stopWord on balanced dataset
accuracy_row.search(B_mnb_cr).group(1)

def _accuracy_percent(report):
    """Return the accuracy found in a classification-report text, in percent."""
    found = re.search(r'accuracy\s+([0-9]*\.?[0-9]+)', report)
    if found is None:
        raise ValueError("no 'accuracy' row found in the classification report")
    return float(found.group(1)) * 100


def accuracy_graph(cr1, cr2, cr3, cr4):
    """Bar chart comparing SVM and MultinomialNB accuracy on both datasets.

    Input parameter (classification-report texts):
        cr1: MultinomialNB report, unbalanced dataset.
        cr2: SVM report, unbalanced dataset.
        cr3: MultinomialNB report, balanced dataset.
        cr4: SVM report, balanced dataset.

    Raises:
        ValueError: when a report contains no 'accuracy' row.

    The accuracy is read from the 'accuracy' row of each report by pattern;
    the fixed slice offsets used before depended on the exact report layout.
    """
    svm_accuracy = [_accuracy_percent(cr4), _accuracy_percent(cr2)]
    mnb_accuracy = [_accuracy_percent(cr3), _accuracy_percent(cr1)]
    index = ['Balanced_Data', 'UnBalanced_Data']
    acc_pd = pd.DataFrame(
        data={'SVM': svm_accuracy, 'MultinomialNB': mnb_accuracy},
        index=index,
    )
    ax = acc_pd.plot(kind='bar', ylim=(0, 100), xlabel='dataSetType',
                     ylabel='Accuracy', legend=True)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_title('Accuracy Score of SVM/MNB')


accuracy_graph(mnb_cr, svm_cr, B_mnb_cr, B_svm_cr)