## All-purpose
import pandas as pd
import numpy as np
## Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
## NLP
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from gensim.summarization import keywords  # requires gensim<4.0; the summarization module was removed in gensim 4.x
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
## Deep learning / ML
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import tensorflow.keras.layers as L
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
## Creating sentiment polarity scores with VADER
analyzer = SentimentIntensityAnalyzer()
def compound_score(txt):
    return analyzer.polarity_scores(txt)["compound"]
## Mapping compound scores to sentiment labels
def sentiment(score):
    if score >= 0.5:
        return "Positive"
    elif score <= -0.5:
        return "Negative"
    else:
        return "Neutral"
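## Quick sanity check of the two helpers on a made-up review
## (the example sentence is illustrative, not from the dataset):
sample = "The room was spotless and the staff were wonderful."
print(compound_score(sample))             # a clearly positive compound score
print(sentiment(compound_score(sample)))  # should map to "Positive"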
## Applying the functions
## Importing CSV file
df = pd.read_csv("tripadvisor_hotel_reviews.csv")
## Applying the compound score
polarity_scores = df["Review"].astype("str").apply(compound_score)
df["Sentiment_Score"] = polarity_scores
## Applying Sentiment
df["Sentiment"] = df["Sentiment_Score"].apply(sentiment)
## Saving the preprocessed file
df.to_csv("Trip-Advisor-rating-sentiments.csv", index=False)
df.Sentiment.value_counts()
## Loading the preprocessed dataset
# Importing the Trip-Advisor-Hotel-Review Dataset
data=pd.read_csv('Trip-Advisor-rating-sentiments.csv')
# Having a look at the data
data.head()
data.isna().sum()
sns.countplot(data=data,x="Sentiment",palette="pastel");
## Visualization
# Preparing data for visualization
Viz_1 = data[['Rating','Sentiment']].value_counts().rename_axis(['Rating','Sentiment']).reset_index(name='counts')
# Plotting the Bar Graph
fig = px.bar(x=Viz_1.Rating, y=Viz_1.counts, color=Viz_1.Sentiment,
             color_discrete_sequence=px.colors.qualitative.Pastel,
             title="Sentiment & Ratings",
             labels={'x': 'Ratings', 'y': 'Total Number'})
fig.show()
#Viz2 Data preparation
Viz_2 = data['Rating'].value_counts().rename_axis(['Rating']).reset_index(name='counts')
# Plotting pie chart for ratings
fig_pie = px.pie(values=Viz_2.counts, names=Viz_2.Rating,
                 title='Rating Distribution of the data',
                 color_discrete_sequence=px.colors.qualitative.Pastel)
fig_pie.show()
# Jointplot on the basis of Rating and Sentiment Score of the data
jp = sns.jointplot(data=data,x='Rating',y='Sentiment_Score',kind="reg",color='#ff7373')
# jp.fig.suptitle('Jointplot on the basis of Rating and Sentiment Score of the data',fontsize=20);
fig = go.Figure()
Ratings = [1, 2, 3, 4, 5]
for rating in Ratings:
    fig.add_trace(go.Violin(x=data['Rating'][data['Rating'] == rating],
                            y=data['Sentiment_Score'][data['Rating'] == rating],
                            name=str(rating),
                            box_visible=True,
                            meanline_visible=True))
fig.update_layout(
    title="Violin plot of Rating and Sentiment Score with box plot",
    xaxis_title="Rating",
    yaxis_title="Sentiment Score",
    font=dict(family="Courier New, monospace", size=12),
)
fig.show()
## Word cloud for one sentiment class. `Sentiment` was a notebook parameter;
## set it to one of "Positive", "Negative" or "Neutral":
Sentiment = "Positive"
text1 = ''
for i in data[data['Sentiment'] == Sentiment]['Review'].values:
    text1 += i + ' '
wc = WordCloud(width=800, height=800, background_color="white",
               min_font_size=10, repeat=True)
wc.generate(text1)
plt.figure(figsize=(8, 8), facecolor=None)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title(Sentiment + ' Reviews', fontsize=32);
# One word cloud per sentiment class, each clipped to a circular mask
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=[14, 14], facecolor=None)
# Circular mask: 255 (masked) outside a radius-130 circle, 0 (drawable) inside
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
text1 = ''
for i in data[data['Sentiment'] == 'Positive']['Review'].values:
    text1 += i + ' '
wc1 = WordCloud(width=800, height=800, background_color="white",
                min_font_size=10, repeat=True, mask=mask)
wc1.generate(text1)
ax1.axis("off")
ax1.imshow(wc1, interpolation="bilinear")
ax1.set_title('Positive Reviews', fontsize=20)
text2 = ''
for i in data[data['Sentiment'] == 'Negative']['Review'].values:
    text2 += i + ' '
wc2 = WordCloud(width=800, height=800, background_color="white",
                min_font_size=10, repeat=True, mask=mask)
wc2.generate(text2)
ax2.axis("off")
ax2.imshow(wc2, interpolation="bilinear")
ax2.set_title('Negative Reviews', fontsize=20)
text3 = ''
for i in data[data['Sentiment'] == 'Neutral']['Review'].values:
    text3 += i + ' '
wc3 = WordCloud(width=800, height=800, background_color="white",
                min_font_size=10, repeat=True, mask=mask)
wc3.generate(text3)
ax3.axis("off")
ax3.imshow(wc3, interpolation="bilinear")
ax3.set_title('Neutral Reviews', fontsize=20)
plt.show()
## Keyword extraction with gensim's TextRank-based keywords() (gensim<4.0).
## `input_1` was a notebook parameter; any reasonably long review works:
input_1 = data['Review'].iloc[0]
keywords(input_1).split("\n")
# Note: keywords() can be slow over the full dataset and may fail on very short texts
data["keywords"] = data["Review"].apply(keywords)
data["keywords"] = data["keywords"].astype("str").str.replace('\n', ',', regex=False)
words = []
for x in data.keywords.values:
    for i in x.split(","):
        words.append(i)
from collections import Counter
word_counter = Counter(words)
word_df = pd.DataFrame(list(word_counter.items()), columns=["keyword", "count"])
word_df["count"] = word_df["count"].astype(int)
word_df = word_df.sort_values(['count'], ascending=False)
top_20 = word_df[0:20]  # slicing 0:19 would keep only 19 rows
word_df.head(10)
sns.set(rc={'figure.figsize':(15,8)})
fig, ax = plt.subplots()
ax = sns.barplot(data=top_20,x="keyword",y="count",palette="pastel")
ax.patch.set_visible(False)
ax.tick_params(axis='x', labelrotation = 45)
ax.set_title("Top 20 Keywords",fontsize=20);
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # required by WordNetLemmatizer on newer NLTK releases
nltk.download('punkt')
# Dropping the most frequent, uninformative domain words. Longer forms go first
# so "hotels" is not left as a stray "s" once "hotel" has been replaced.
# (Plain substring replacement also hits words that merely contain these tokens,
# e.g. "bathroom" -> "bath"; a word-boundary regex would avoid that.)
common_words = ["hotels", "hotel", "rooms", "room"]
for x in common_words:
    data["Review"] = data["Review"].astype(str).str.replace(x, "", regex=False)
data.head(2)
data2=data.copy()
def removing_stop_words(txt):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(txt)
    # lowercase before the lookup: NLTK's stopword list is all lowercase
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
    return filtered_sentence
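## Illustrative check on a made-up sentence:
print(removing_stop_words("The staff at the front desk were very helpful"))
# expected: ['staff', 'front', 'desk', 'helpful']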
data2["Review"] = data2["Review"].apply(removing_stop_words)
# Making a function to lemmatize each token as noun, verb, adjective and adverb in turn
lemmatizer = WordNetLemmatizer()
def lemmatize(data):
    lema_data = []
    for j in data:
        # feed the previous result forward through each part-of-speech pass
        x = j.lower()
        x = lemmatizer.lemmatize(x, pos='n')
        x = lemmatizer.lemmatize(x, pos='v')
        x = lemmatizer.lemmatize(x, pos='a')
        x = lemmatizer.lemmatize(x, pos='r')
        lema_data.append(x)
    return lema_data
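## Illustrative check: plurals and verb forms reduce to base forms
print(lemmatize(["rooms", "staying", "better"]))
# expected: ['room', 'stay', 'good']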
data2["Review"] = data2["Review"].apply(lemmatize)
data2["Review"] = data2["Review"].apply(lambda x:" ".join(token for token in x))
data2.head(2)
X = data2["Review"].values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sns.set(rc={'figure.figsize': (12, 9)})  # set the size before creating the figure
fig, ax = plt.subplots()
length_dist = [len(x.split(" ")) for x in X]
sns.histplot(length_dist, color="#ff7373")  # palette= is ignored without hue
ax.patch.set_visible(False)
ax.set_xlim(0,600)
ax.set_ylim(0,1200)
ax.set_title("Sentence length distribution",fontsize=20);
plt.show()
X = tokenizer.texts_to_sequences(X)
max_length = max([len(x) for x in X])
vocab_size = len(tokenizer.word_index)+1
print("Vocabulary size: {}".format(vocab_size))
print("max length of sentence: {}".format(max_length))
# Padding (and truncating) every review to a fixed length of 600 tokens;
# max_length above is informative, but 600 is used as a practical cap
X = pad_sequences(X, padding='post', maxlen=600)
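## A toy illustration of pad_sequences with post-padding:
demo = pad_sequences([[1, 2, 3], [4, 5]], padding='post', maxlen=4)
print(demo)
# [[1 2 3 0]
#  [4 5 0 0]]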
labels = ['1', '2', '3', '4', '5']  # class names, kept for reference
# Shift ratings 1-5 to class indices 0-4 for SparseCategoricalCrossentropy
y = data['Rating'].map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
## Hyperparameters. These were notebook parameters; the values below are
## assumed defaults for illustration, not the author's original settings:
EPOCHS = 5
embedding_dim = 100
BATCH_SIZE = 64
units = 64
val_split = 0.1
model = tf.keras.Sequential([
    L.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]),
    L.Bidirectional(L.LSTM(units, return_sequences=True)),
    L.Conv1D(64, 3),
    L.MaxPool1D(),
    L.Flatten(),
    L.Dropout(0.2),
    L.Dense(128, activation="relu"),
    L.Dropout(0.2),
    L.Dense(64, activation="relu"),
    L.Dropout(0.2),
    L.Dense(5, activation="softmax")
])
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_split=val_split,
                    batch_size=BATCH_SIZE, verbose=2)
pred = model.predict(X_test)
pred_final = np.argmax(pred,axis=-1)
pred_final
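## Class indices 0-4 map back to the original 1-5 star ratings:
predicted_ratings = pred_final + 1
predicted_ratings[:10]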
from sklearn.metrics import accuracy_score
print('Accuracy: {}%'.format(round(accuracy_score(y_test, pred_final) * 100, 2)))
from sklearn.metrics import mean_squared_error
print("Root mean square error: {}".format(round(np.sqrt(mean_squared_error(y_test, pred_final)), 3)))
model.save("Tripadvisor_BiLSTM.h5")
new_model = tf.keras.models.load_model('Tripadvisor_BiLSTM.h5')
# Check its architecture
new_model.summary()
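## Inference sketch on a new review. Assumes the fitted `tokenizer` from
## above is still in scope; the review text is made up for illustration:
new_review = ["staff friendly room clean great location would stay again"]
seq = pad_sequences(tokenizer.texts_to_sequences(new_review), padding='post', maxlen=600)
probs = new_model.predict(seq)
print("Predicted rating: {}".format(np.argmax(probs, axis=-1)[0] + 1))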