# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import accuracy_score
df = pd.read_csv('train.csv')  # read the dataset
del df['idk']  # drop the unwanted column
df.isnull().sum()  # check for null values
df = df.fillna(method='pad')  # forward-fill NaN values from the previous row
df.Sentiment.value_counts()  # count the values in the Sentiment column
df["Sentiment"] = df.Sentiment.replace(to_replace="Neative", value="Negative")  # fix the "Neative" typo
Sentiment_Count = df.groupby('Sentiment').count()
plt.bar(Sentiment_Count.index.values, Sentiment_Count['Text'])
plt.xlabel('Review Sentiments')
plt.ylabel('Number of Reviews')
plt.show()
df['Text'] = df['Text'].astype('str') #Convert to strings
def remove_pattern(input_txt, pattern):
    # Collect every match of the pattern, then strip each one from the text
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(re.escape(i), '', input_txt)
    return input_txt

df['Text'] = np.vectorize(remove_pattern)(df['Text'], r"@[\w]*")  # remove @mentions from tweets
df['Text'] = df['Text'].str.replace("[^a-zA-Z#]", " ", regex=True)  # remove special characters, numbers, punctuation
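To see what the cleaning does, a minimal sketch on a made-up tweet (the sample string is illustrative, not from the dataset):
sample = "@some_user loved this movie!!! 10/10 #mustwatch"  # hypothetical tweet
print(remove_pattern(sample, r"@[\w]*"))    # mention stripped
print(re.sub(r"[^a-zA-Z#]", " ", sample))   # non-letters (except #) replaced by spaces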
!pip install wordcloud==1.8.1
from wordcloud import WordCloud
# Start with one review:
text = df.Text[0]
# Create and generate a word cloud image:
wordcloud = WordCloud().generate(text)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
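A single review gives a sparse cloud; a hedged variant over the whole Text column, using the NLTK stopwords already downloaded above (parameter values are illustrative):
all_text = " ".join(df['Text'])
wc_all = WordCloud(stopwords=set(stopwords.words('english')), max_words=100).generate(all_text)
plt.imshow(wc_all, interpolation='bilinear')
plt.axis("off")
plt.show()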
from sklearn.feature_extraction.text import CountVectorizer #Import Count Vectorizer
cv = CountVectorizer()
from sklearn.model_selection import train_test_split  # train/validation split
train, valid = train_test_split(df, test_size=0.2)  # hold out 20% of the data for validation
train_set= cv.fit_transform(train['Text'])
train_tag = train['Sentiment']
valid_set= cv.transform(valid['Text'])
valid_tag = valid['Sentiment']
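Before modelling, it can help to confirm the shape of the bag-of-words matrices; a minimal sketch:
print(train_set.shape)      # (n_training_documents, vocabulary_size)
print(valid_set.shape)      # same vocabulary, so the column count matches
print(len(cv.vocabulary_))  # distinct tokens learned from the training split only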
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes classifier
clf = MultinomialNB()
# To train the classifier, simply call fit:
MU = clf.fit(train_set, train_tag)
from sklearn import metrics
Mu_train = MU.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, Mu_train) * 100)
)
f1_score = metrics.f1_score(train_tag, Mu_train, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
Mu_test = MU.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, Mu_test) * 100)
)
f1_score = metrics.f1_score(valid_tag, Mu_test, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 84.19946857813756
F1 Train classification score: 83.75705531490637
Test accuracy = 63.61838853188334
F1 Test classification score: 63.07121189882506
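The roughly 20-point gap between train and test accuracy suggests the model would benefit from tuning; a hedged sketch varying Naive Bayes' smoothing parameter alpha (the grid is illustrative, not tuned):
for alpha in (0.1, 0.5, 1.0, 2.0):  # illustrative values
    nb = MultinomialNB(alpha=alpha).fit(train_set, train_tag)
    print(alpha, accuracy_score(valid_tag, nb.predict(valid_set)) * 100)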
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=100, random_state=2, n_estimators=10)
RF = clf.fit(train_set, train_tag)
RF_train = RF.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, RF_train) * 100)
)
f1_score = metrics.f1_score(train_tag, RF_train, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
RF_test = RF.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, RF_test) * 100)
)
f1_score = metrics.f1_score(valid_tag, RF_test, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 81.80189087313849
F1 Train classification score: 80.97479127559534
Test accuracy = 59.07068709836876
F1 Test classification score: 53.1293733605071
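A hedged peek at which tokens the forest leans on most (get_feature_names() is the older scikit-learn spelling; newer releases use get_feature_names_out()):
importances = RF.feature_importances_
vocab = cv.get_feature_names()  # cv.get_feature_names_out() on scikit-learn >= 1.0
for i in np.argsort(importances)[::-1][:10]:
    print(vocab[i], importances[i])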
from sklearn import linear_model
clf2 = linear_model.SGDClassifier(max_iter=5, random_state=20, n_jobs=50, average=True, power_t=2, n_iter_no_change=1)
clf2.fit(train_set, train_tag)
# Note: with max_iter=5 the optimizer stops before converging, so scikit-learn
# raises a ConvergenceWarning suggesting a larger max_iter.
sgd_train = clf2.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, sgd_train) * 100)
)
f1_score = metrics.f1_score(train_tag, sgd_train, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
sgd_test = clf2.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, sgd_test) * 100)
)
f1_score = metrics.f1_score(valid_tag, sgd_test, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 84.98424272384601
F1 Train classification score: 84.52769575208578
Test accuracy = 66.55956500247157
F1 Test classification score: 65.01920454754072
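Given the convergence warning above, a hedged re-run with a larger iteration budget (values illustrative):
sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, random_state=20)
sgd.fit(train_set, train_tag)
print(accuracy_score(valid_tag, sgd.predict(valid_set)) * 100)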
import xgboost as xgb
clf = xgb.XGBClassifier(n_estimators=2000, random_state=2, max_depth=4)
xgb_model = clf.fit(train_set, train_tag,
                    verbose=True)
xgb_train = xgb_model.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, xgb_train) * 100)
)
f1_score = metrics.f1_score(train_tag, xgb_train, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
xgb_test = xgb_model.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, xgb_test) * 100)
)
f1_score = metrics.f1_score(valid_tag, xgb_test, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 77.48254340975097
F1 Train classification score: 76.08272317168199
Test accuracy = 66.53484923381117
F1 Test classification score: 63.836880252661935
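Note that xgboost 1.6 and later rejects string class labels; a hedged sketch of the encode/decode round trip in case an upgrade breaks the fit above (parameter values are illustrative):
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(train_tag)  # e.g. Negative/Neutral/Positive -> 0/1/2
clf_enc = xgb.XGBClassifier(n_estimators=200, max_depth=4, random_state=2).fit(train_set, y_enc)
pred_labels = le.inverse_transform(clf_enc.predict(valid_set))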
from sklearn.ensemble import AdaBoostClassifier
Adaclf = AdaBoostClassifier(n_estimators=1000, random_state=9, algorithm='SAMME.R')
Ada = Adaclf.fit(train_set, train_tag)
ada_train = Ada.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, ada_train) * 100)
)
f1_score = metrics.f1_score(train_tag, ada_train, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
ada_test = Ada.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, ada_test) * 100)
)
f1_score = metrics.f1_score(valid_tag, ada_test, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 71.21670889204721
F1 Train classification score: 69.9170103010853
Test accuracy = 64.75531389026199
F1 Test classification score: 62.59397130620956
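AdaBoost's learning_rate often matters as much as n_estimators; a hedged sweep (values illustrative, not tuned):
for lr in (0.1, 0.5, 1.0):  # illustrative values
    ada = AdaBoostClassifier(n_estimators=200, learning_rate=lr, random_state=9).fit(train_set, train_tag)
    print(lr, accuracy_score(valid_tag, ada.predict(valid_set)) * 100)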
from sklearn.linear_model import LogisticRegression
Log_pred = LogisticRegression(random_state=100, n_jobs=7, solver='lbfgs').fit(train_set, train_tag)
LogsT = Log_pred.predict(train_set)
print('Train accuracy = {}'.format(
    accuracy_score(train_tag, LogsT) * 100)
)
f1_score = metrics.f1_score(train_tag, LogsT, average='macro')
print('F1 Train classification score: {}'.format(f1_score * 100))
Logs = Log_pred.predict(valid_set)
print('Test accuracy = {}'.format(
    accuracy_score(valid_tag, Logs) * 100)
)
f1_score = metrics.f1_score(valid_tag, Logs, average='macro')
print('F1 Test classification score: {}'.format(f1_score * 100))
Train accuracy = 95.0997960823086
F1 Train classification score: 95.09024653454475
Test accuracy = 67.39990113692535
F1 Test classification score: 65.65344725013082
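The 95% train versus 67% test accuracy points to overfitting; a hedged grid search over the regularization strength C (the grid and cv value are illustrative):
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 10]}
search = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), param_grid, cv=3)
search.fit(train_set, train_tag)
print(search.best_params_, search.best_score_)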
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(valid_tag, xgb_test)
cm
import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(z=cm, hoverongaps=False))
fig.show()
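confusion_matrix orders rows and columns by sorted label; a hedged variant that puts the class names on the axes:
labels = sorted(set(valid_tag))
fig = go.Figure(data=go.Heatmap(z=cm, x=labels, y=labels, hoverongaps=False))
fig.update_layout(xaxis_title='Predicted label', yaxis_title='True label')
fig.show()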
custom = ['aanti toh gussa e kr gai hain']  # "Auntie has gotten angry"
custom = cv.transform(custom)
prediction_random = xgb_model.predict(custom)
prediction_random
prediction_random = Log_pred.predict(custom)
prediction_random
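A small convenience wrapper to avoid repeating the transform-then-predict steps (the helper name and sample input are hypothetical, not from the original):
def predict_sentiment(texts, model=Log_pred, vectorizer=cv):
    # Vectorize raw strings with the fitted CountVectorizer, then predict labels
    return model.predict(vectorizer.transform(texts))

predict_sentiment(['what a fantastic movie'])  # hypothetical input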