# NLP Hotel reviews

# Descriptive statistics of the data frame.
df.describe()

# Count plot of hotels per average score (one row per hotel/score pair).
average = df[["Hotel_Name", "Average_Score"]].drop_duplicates()
rcParams.update({"figure.figsize": (50, 18), "axes.labelsize": 16})
sns.set(font_scale=2.5)
a4_dims = (30, 12)
fig, ax = plt.subplots(figsize=a4_dims)
sns.countplot(ax=ax, x="Average_Score", data=average)

# Columns shown in the two "top hotels" tables below.
summary_columns = ["Hotel_Name", "Average_Score", "Total_Number_of_Reviews"]

# Hotels with an average score of at least 8.4, ordered by their total
# number of reviews (20 most-reviewed).
(
    df.loc[df["Average_Score"] >= 8.4, summary_columns]
    .drop_duplicates()
    .sort_values(by="Total_Number_of_Reviews", ascending=False)
    .head(20)
)

# Same table with the score threshold raised to 8.8.
(
    df.loc[df["Average_Score"] >= 8.8, summary_columns]
    .drop_duplicates()
    .sort_values(by="Total_Number_of_Reviews", ascending=False)
    .head(20)
)

# Shorten "United Kingdom" to "UK" so the country is a single token, then
# take the last space-separated token of each address as the hotel's country
# and store it in a new "Locations" column.
df["Hotel_Address"] = df["Hotel_Address"].str.replace("United Kingdom", "UK")
df["Locations"] = [address.split(" ")[-1] for address in df["Hotel_Address"]]
df.sample(10)

# Horizontal bar chart of the number of rows per location, saved to disk.
plt.figure(figsize=(14, 5))
plt.title("Distribution of hotel locations")
location_counts = df["Locations"].value_counts()
location_counts.plot.barh(color="orange")
plt.savefig("distribution.png")

# Relationship between a hotel's average score (x) and its number of reviews (y).
average_scores = df["Average_Score"]
review_totals = df["Total_Number_of_Reviews"]
plt.scatter(average_scores, review_totals, c="#d9a99c")
plt.xlabel("Scoring")
plt.ylabel("Number of reviews")
plt.show()

# Word counts of the negative part (x) against the positive part (y) of each
# review, coloured by the score the reviewer gave.
plt.figure(figsize=(25, 10))
sns.scatterplot(
    data=df,
    x="Review_Total_Negative_Word_Counts",
    y="Review_Total_Positive_Word_Counts",
    hue="Reviewer_Score",
)

def black_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback for ``WordCloud.recolor``: the same near-black for every word."""
    return "hsl(0,100%, 1%)"


# Word cloud of reviewer nationalities: light grey background, at most 500 words.
nationality_text = " ".join(df.Reviewer_Nationality.astype(str))
# NOTE(review): this font path looks macOS-specific - confirm before running elsewhere.
font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'
wordcloud = WordCloud(
    font_path=font_path,
    background_color="#e3e6e0",
    width=3000,
    height=2000,
    max_words=500,
)
wordcloud = wordcloud.generate(nationality_text)
# Paint every word black.
wordcloud.recolor(color_func=black_color)

# Render the cloud without axes.
plt.figure(figsize=[15, 10])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

# Number of reviews per reviewer nationality and per hotel.
df.Reviewer_Nationality.value_counts()

df.Hotel_Name.value_counts()

# Reviews whose positive part has a word count of zero.
has_no_positive_words = df["Review_Total_Positive_Word_Counts"] == 0
all_neg = df.loc[has_no_positive_words]
print("Total number of completely negative reviews:", len(all_neg))

# Reviews whose negative part has a word count of zero.
has_no_negative_words = df["Review_Total_Negative_Word_Counts"] == 0
all_pos = df.loc[has_no_negative_words]
print("Total number of completely positive reviews:", len(all_pos))

# Concatenate the negative and positive texts into one "Reviews" column.
# A space is inserted between the two parts so that the last word of the
# negative text and the first word of the positive text stay separate tokens
# when the reviews are later split on whitespace.
df["Reviews"] = df["Negative_Review"] + " " + df["Positive_Review"]

# Quick check of the new column.
df.head(2)

# Work on a copy so the original data frame keeps all of its columns.
df2 = df.copy()

df2.head(2)

# Drop the columns that are not used for text processing.
# (The original list named "Total_Number_of_Reviews_Reviewer_Has_Given" twice;
# each column is now listed exactly once.)
unused_columns = [
    "Hotel_Address",
    "Additional_Number_of_Scoring",
    "Review_Date",
    "Review_Total_Negative_Word_Counts",
    "Review_Total_Positive_Word_Counts",
    "Total_Number_of_Reviews_Reviewer_Has_Given",
    "days_since_review",
    "lat",
    "lng",
]
df2.drop(unused_columns, inplace=True, axis=1)

# Check the remaining columns and the resulting shape.
df2.info()

df2.shape

# Goal: remove the words of "no positive" and "no negative".
# Start from the NLTK English stopword list and add the words it lacks.
stop_words = set(stopwords.words("english"))
new_stopwords = ["positive", "negative"]
# Kept under its own name: rebinding `stopwords` would replace the object that
# provides `stopwords.words(...)` above, so re-running this cell would fail.
review_stopwords = frozenset(stop_words.union(new_stopwords))

# Compiled once instead of on every call.
_NON_LETTERS = re.compile(r"[^a-z\s]")
_SHORT_WORDS = re.compile(r"\b\w{1,3}\b")


def preprocess(x):
    """Normalise one review text for lemmatisation.

    Steps, in order: lower-case the text; delete every character that is not
    a-z or whitespace; blank out words of one to three characters; drop the
    words contained in ``review_stopwords``; re-join the rest with single spaces.

    NOTE(review): the short-word filter also removes words such as "not" and
    "bad" - confirm this is intended before sentiment scoring.

    Args:
        x: The raw review text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    x = _NON_LETTERS.sub("", x.lower())
    x = _SHORT_WORDS.sub(" ", x)
    # No separate digit pass is needed: digits were already deleted by
    # _NON_LETTERS, so the original r"\d+" substitution could never match.
    return " ".join(w for w in x.split() if w not in review_stopwords)


df2["Clean"] = df2["Reviews"].apply(preprocess)

df2.head(2)

# Mapping POS tags to WordNet POS constants.
def get_wordnet_pos(pos_tag):
    """Return the WordNet POS constant for a POS tag string.

    The decision is made on the first character of the tag: "J" maps to
    ``wordnet.ADJ``, "V" to ``wordnet.VERB``, "N" to ``wordnet.NOUN`` and
    "R" to ``wordnet.ADV``. Any other tag falls back to ``wordnet.NOUN``.
    """
    first_letter_to_wordnet = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return first_letter_to_wordnet.get(pos_tag[:1], wordnet.NOUN)

# Lemmatizing
import string


def clean_text(text):
    """Lemmatise a whitespace-separated text and return it re-joined with spaces.

    Each token is stripped of leading/trailing punctuation, tagged with
    ``pos_tag``, and lemmatised with the WordNet POS that ``get_wordnet_pos``
    derives from its tag.
    """
    tokens = [token.strip(string.punctuation) for token in text.split()]
    tagged_tokens = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)


df2["Lemmas"] = df2["Clean"].apply(clean_text)

df2.head(2)

sia = SentimentIntensityAnalyzer()

# Score every lemmatised review exactly once and reuse the resulting dict for
# all four columns (previously each review was scored four times, once per column).
polarity = [sia.polarity_scores(lemmas) for lemmas in df2["Lemmas"]]

# Output column -> key in the polarity dict, in the original column order.
score_columns = (
    ("Positive", "pos"),
    ("Negative", "neg"),
    ("Neutral", "neu"),
    ("Compound", "compound"),
)
for column, key in score_columns:
    df2[column] = [scores[key] for scores in polarity]

df2.head(3)

def sentiment(score):
    """Classify a compound sentiment score into a label.

    Args:
        score: Numeric compound score.

    Returns:
        "Positive" when ``score >= 0.5``, "Negative" when ``score <= -0.5``,
        and "Neutral" otherwise (the label was previously misspelled "Neutural").
    """
    if score >= 0.5:
        return "Positive"
    if score <= -0.5:
        return "Negative"
    return "Neutral"

# Label every review from its compound score.
df2["Sentiment"] = df2["Compound"].map(sentiment)

df2.head(3)

# Visualising: donut chart of the number of reviews per sentiment label.
sentiments = df2["Sentiment"].value_counts()
text = sentiments.index
quantity = sentiments.values
custom_colors = ["orange", "lightblue", "purple"]

plt.figure(figsize=(5, 5))
plt.pie(quantity, labels=text, colors=custom_colors)
# A white disc drawn over the centre turns the pie into a donut.
central_circle = plt.Circle((0, 0), 0.5, color="white")
fig = plt.gcf()
current_axes = fig.gca()
current_axes.add_artist(central_circle)
plt.rc("font", size=12)
plt.title("Sentiments", fontsize=20)
plt.show()

def _print_sentiment_shares(compound, threshold):
    """Print the percentage of positive, negative and neutral reviews.

    Uses the same boundary rules as ``sentiment()``: positive is
    ``>= threshold``, negative is ``<= -threshold`` and neutral is strictly in
    between, so the three shares partition the reviews. The original code
    compared the negative class against ``+threshold``, which counted almost
    every review as negative.

    Args:
        compound: Series of compound scores.
        threshold: Positive cut-off; its negation is the negative cut-off.

    Returns:
        The tuple ``(pos, neg, neu)`` of review counts.
    """
    total = len(compound)
    pos = (compound >= threshold).sum()
    neg = (compound <= -threshold).sum()
    neu = ((compound > -threshold) & (compound < threshold)).sum()
    print("Percentage of positive review: {}%".format(pos / total * 100))
    print("Percentage of negative review: {}%".format(neg / total * 100))
    print("Percentage of neutral review: {}%".format(neu / total * 100))
    return pos, neg, neu


# Calculations in percentage, with the +/-0.5 cut-off used for the labels.
pos, neg, neu = _print_sentiment_shares(df2["Compound"], 0.5)

# Percentage: if the compound score cut-off is lowered to +/-0.2.
pos, neg, neu = _print_sentiment_shares(df2["Compound"], 0.2)

# Mean "Positive" score per reviewer nationality; keep the ten highest.
mean_positive = df2.groupby(by=["Reviewer_Nationality"])["Positive"].mean()
top_nationalities = mean_positive.sort_values(ascending=False).head(10)
reviewer_score = pd.DataFrame(top_nationalities.reset_index())

reviewer_score

# Bar chart of those ten nationalities, one colour per nationality.
px.bar(
    reviewer_score,
    x="Reviewer_Nationality",
    y="Positive",
    color="Reviewer_Nationality",
)

from itertools import islice

# Setting up CountVectorizer for unigrams and bigrams and fitting it on the
# lemmatised reviews to collect every n-gram found in the documents.
cvec = CountVectorizer(min_df=1, max_df=.5, ngram_range=(1, 2)).fit(df2["Lemmas"])
# Peek at the first 20 (term, index) entries of the vocabulary.
list(islice(cvec.vocabulary_.items(), 20))

# Number of unique n-grams.
len(cvec.vocabulary_)

# Refit with document-frequency bounds of 0.002 and 0.6 and compare the
# vocabulary size.
cvec = CountVectorizer(min_df=.002, max_df=.6, ngram_range=(1, 2)).fit(df2["Lemmas"])
len(cvec.vocabulary_)

# Document-term count matrix for the lemmatised reviews.
cvec_counts = cvec.transform(df2.Lemmas)
n_documents, n_terms = cvec_counts.shape
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
# nnz / (rows * columns) is the share of non-zero cells, i.e. the matrix
# density; it was previously printed under the label "sparsity".
print('density: %.2f%%' % (100.0 * cvec_counts.nnz / (n_documents * n_terms)))

# Vocabulary terms in column order. `get_feature_names_out` is preferred;
# `get_feature_names` is only used when the installed scikit-learn does not
# provide the newer method.
if hasattr(cvec, "get_feature_names_out"):
    feature_terms = list(cvec.get_feature_names_out())
else:
    feature_terms = cvec.get_feature_names()

# Top 20 most common terms: total occurrences of each term over all documents.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': feature_terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

# Calculating the TF-IDF weight of the terms from the raw counts.
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

# Top 20 terms by average TF-IDF weight over all documents.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': feature_terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

# Setting up side-by-side subplots: positive reviews left, negative reviews right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[15, 10], facecolor=None)


def black_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback for ``WordCloud.recolor``: the same near-black for every word."""
    return "hsl(0,100%, 1%)"


# NOTE(review): this font path looks macOS-specific - confirm before running elsewhere.
font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'


def _draw_review_cloud(axis, review_text, background, title):
    """Build a square 2000 x 2000 word cloud (max 500 words) and draw it on ``axis``.

    The words are recoloured black, the axes are hidden and ``title`` is set
    on the subplot. Returns the generated word cloud.
    """
    cloud = WordCloud(
        font_path=font_path,
        background_color=background,
        width=2000,
        height=2000,
        max_words=500,
    ).generate(review_text)
    cloud.recolor(color_func=black_color)
    axis.imshow(cloud, interpolation="bilinear")
    axis.axis("off")
    axis.set_title(title, fontsize=20)
    return cloud


# Positive reviews on a light yellow background for differentiation.
text1 = " ".join(df.Positive_Review.astype(str))
wordcloud1 = _draw_review_cloud(ax1, text1, "#fff7e0", 'Positive Reviews')

# Negative reviews on a light grey background.
text2 = " ".join(df.Negative_Review.astype(str))
wordcloud2 = _draw_review_cloud(ax2, text2, "#e7e8ea", 'Negative Reviews')