import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
# load the reviews dataset and preview the first rows
df = pd.read_csv('Reviews.csv')
df.head()
fig = px.histogram(df, x="Score")
fig.update_traces(marker_color="turquoise",marker_line_color='rgb(8,48,107)',
marker_line_width=1.5)
fig.update_layout(title_text='Product Score')
fig.show()
# create the stopword list; "br" and "href" are leftover HTML tokens in the review text
stopwords = set(STOPWORDS)
stopwords.update(["br", "href"])
# word cloud over the full review text
textt = " ".join(review for review in df.Text)
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud11.png')
plt.show()
# assign reviews with Score > 3 as positive sentiment (+1),
# Score < 3 as negative sentiment (-1), and drop neutral reviews (Score = 3)
df = df[df['Score'] != 3]
df['sentiment'] = df['Score'].apply(lambda rating: +1 if rating > 3 else -1)
df.head()
# split df - positive and negative sentiment:
positive = df[df['sentiment'] == 1]
negative = df[df['sentiment'] == -1]
stopwords = set(STOPWORDS)
stopwords.update(["br", "href", "good", "great"])
# "good" and "great" are removed because they also appear in negative summaries (e.g. "not good")
# word cloud of positive review summaries; str() guards against missing values
pos = " ".join(str(review) for review in positive.Summary)
wordcloud2 = WordCloud(stopwords=stopwords).generate(pos)
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")
plt.show()
neg = " ".join(str(review) for review in negative.Summary)
wordcloud3 = WordCloud(stopwords=stopwords).generate(neg)
plt.imshow(wordcloud3, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud33.png')
plt.show()
# map numeric sentiment to labels and plot the class distribution
df['sentimentt'] = df['sentiment'].replace({-1: 'negative', 1: 'positive'})
fig = px.histogram(df, x="sentimentt")
fig.update_traces(marker_color="indianred", marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Product Sentiment')
fig.show()
def remove_punctuation(text):
    """Strip a small set of punctuation characters from a string."""
    final = "".join(u for u in text if u not in ("?", ".", ";", ":", "!", '"'))
    return final

# clean the review text, drop rows with a missing Summary, then clean the summaries
df['Text'] = df['Text'].apply(remove_punctuation)
df = df.dropna(subset=['Summary'])
df['Summary'] = df['Summary'].apply(remove_punctuation)
dfNew = df[['Summary','sentiment']]
dfNew.head()
# random ~80/20 train/test split using a uniform random number per row
# (np.random.rand draws from [0, 1), so the 0.8 threshold gives the intended split)
df['random_number'] = np.random.rand(len(df))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]
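# A common alternative (a sketch, not the approach used above): sklearn's train_test_split
# gives a reproducible 80/20 split in a single call.
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(df, test_size=0.2, random_state=42)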
# count vectorizer: bag-of-words features over the Summary text
# (token_pattern r'\b\w+\b' also keeps single-character tokens)
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Summary'])
test_matrix = vectorizer.transform(test['Summary'])
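# Illustrative check (not in the original): the vocabulary size learned by the vectorizer,
# i.e. the number of bag-of-words columns in the train and test matrices.
print(len(vectorizer.vocabulary_), train_matrix.shape)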
# logistic regression on the bag-of-words features; a larger max_iter helps the solver converge
lr = LogisticRegression(max_iter=1000)
X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']
lr.fit(X_train,y_train)
predictions = lr.predict(X_test)
# accuracy, precision and recall on the held-out test set (true labels passed first)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
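# Optional check (not part of the original pipeline): overall accuracy via accuracy_score,
# plus the label distribution, since a skewed class balance can inflate accuracy.
from sklearn.metrics import accuracy_score
print("accuracy:", accuracy_score(y_test, predictions))
print(df['sentiment'].value_counts(normalize=True))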