# Summary statistics of the dataframe.
df.describe()

# One row per distinct (Hotel_Name, Average_Score) pair.
average = df.loc[:, ["Hotel_Name", "Average_Score"]].drop_duplicates()

# Global plot styling.
# NOTE(review): sns.set() re-applies seaborn's own rcParams, so it may
# override the axes.labelsize chosen just above it -- confirm the order.
rcParams["figure.figsize"] = (50, 18)
rcParams["axes.labelsize"] = 16
sns.set(font_scale=2.5)

# Count plot of the de-duplicated average scores.
a4_dims = (30, 12)
fig, ax = plt.subplots(figsize=a4_dims)
sns.countplot(data=average, x="Average_Score", ax=ax)
# Hotels with an average score of at least 8.4: the 20 rows with the
# highest total number of reviews.
(
    df.loc[
        df["Average_Score"] >= 8.4,
        ["Hotel_Name", "Average_Score", "Total_Number_of_Reviews"],
    ]
    .drop_duplicates()
    .sort_values(by="Total_Number_of_Reviews", ascending=False)
    .head(20)
)

# Same table with the score cut-off raised to 8.8.
(
    df.loc[
        df["Average_Score"] >= 8.8,
        ["Hotel_Name", "Average_Score", "Total_Number_of_Reviews"],
    ]
    .drop_duplicates()
    .sort_values(by="Total_Number_of_Reviews", ascending=False)
    .head(20)
)
# Shorten "United Kingdom" to "UK", then take the last space-separated token
# of each address as the hotel's country and store it in "Locations".
df["Hotel_Address"] = df["Hotel_Address"].str.replace("United Kingdom", "UK")
df["Locations"] = df["Hotel_Address"].map(
    lambda address: address.rpartition(" ")[2]
)
df.sample(10)
# Horizontal bar chart of the number of rows per location, saved as a PNG.
plt.figure(figsize=(14, 5))
plt.title("Distribution of hotel locations")
df["Locations"].value_counts().plot(kind="barh", color="orange")
plt.savefig("distribution.png")
# Scatter plot of average score (x axis) against total number of reviews
# (y axis).
a = df["Average_Score"]
t = df["Total_Number_of_Reviews"]
plt.scatter(x=a, y=t, c="#d9a99c")
plt.ylabel("Number of reviews")
plt.xlabel("Scoring")
plt.show()
# Negative word count (x) against positive word count (y), with each point
# coloured by the reviewer's score.
plt.figure(figsize=(25, 10))
sns.scatterplot(
    data=df,
    x="Review_Total_Negative_Word_Counts",
    y="Review_Total_Positive_Word_Counts",
    hue="Reviewer_Score",
)
# Colour callback handed to WordCloud.recolor: every word gets the same
# near-black colour.
def black_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return one fixed near-black HSL colour string.

    All arguments are accepted only so the function can be used as a
    ``color_func`` callback; none of them influences the result.
    """
    return "hsl(0,100%, 1%)"
import os

# Word cloud of reviewer nationalities on a light grey background,
# limited to 500 words.
text = " ".join(df.Reviewer_Nationality.astype(str))

# The Arial path is a macOS system location. When the file is missing,
# pass font_path=None so WordCloud uses its own default font instead of
# failing while opening the font.
font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'
if not os.path.isfile(font_path):
    font_path = None

wordcloud = WordCloud(
    font_path=font_path,
    background_color="#e3e6e0",
    width=3000,
    height=2000,
    max_words=500,
).generate(text)

# Draw every word in black.
wordcloud.recolor(color_func=black_color)

# Show the cloud without axes.
plt.figure(figsize=[15, 10])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# Frequency tables for reviewer nationality and hotel name.
df["Reviewer_Nationality"].value_counts()
df["Hotel_Name"].value_counts()

# Rows whose positive part has a word count of zero.
all_neg = df[df["Review_Total_Positive_Word_Counts"] == 0]
print("Total number of completely negative reviews:", all_neg.shape[0])

# Rows whose negative part has a word count of zero.
all_pos = df[df["Review_Total_Negative_Word_Counts"] == 0]
print("Total number of completely positive reviews:", all_pos.shape[0])
# Concatenate the negative and positive texts into one "Reviews" column.
# The explicit space keeps the last word of the negative text and the first
# word of the positive text from being fused into a single token when
# neither side carries its own padding.
df["Reviews"] = df["Negative_Review"] + " " + df["Positive_Review"]

# Quick visual check of the new column.
df.head(2)
# Work on a copy so the original dataframe keeps all of its columns.
df2 = df.copy()
df2.head(2)

# Drop the columns that are not used for text processing.
# ("Total_Number_of_Reviews_Reviewer_Has_Given" was listed twice; once is
# enough.)
df2.drop(
    columns=[
        "Hotel_Address",
        "Additional_Number_of_Scoring",
        "Review_Date",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given",
        "days_since_review",
        "lat",
        "lng",
    ],
    inplace=True,
)

# Check the remaining columns and the resulting shape.
df2.info()
df2.shape
# Goal: remove the placeholder texts "no positive" / "no negative".
#
# The last line of this cell rebinds the name `stopwords` to a plain set,
# which would make `stopwords.words(...)` fail if the cell were run a second
# time. Reading the corpus through a separate alias keeps the cell
# re-runnable while `stopwords` remains the combined set that preprocess()
# looks up.
# NOTE(review): assumes `stopwords` was imported from nltk.corpus -- confirm
# against the import cell.
from nltk.corpus import stopwords as nltk_stopwords

stop_words = set(nltk_stopwords.words("english"))

# Extra words that are not in the NLTK stopword list.
new_stopwords = ["positive", "negative"]
stopwords = stop_words.union(new_stopwords)
def preprocess(x):
    """Normalise one raw review string for text processing.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, words of one to three characters are dropped, and
    the remaining words that appear in the module-level ``stopwords``
    collection are filtered out.

    Args:
        x: The raw review text.

    Returns:
        The surviving words joined by single spaces (possibly empty).
    """
    # Only a-z and whitespace survive this step, so digits are already gone
    # and no separate digit-removal pass is needed afterwards.
    x = re.sub(r'[^a-z\s]', '', x.lower())
    x = re.sub(r'\b\w{1,3}\b', ' ', x)
    # Build the lookup set once per call instead of once per word.
    stop = set(stopwords)
    return ' '.join(w for w in x.split() if w not in stop)
# Store the preprocessed text of every review in a new "Clean" column.
df2["Clean"] = df2["Reviews"].map(preprocess)
df2.head(2)
# Mapping POS tags to WordNet part-of-speech constants.
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into a WordNet part-of-speech constant.

    The tag is matched by its first letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wn_pos
    # Fallback for every tag that matched none of the prefixes.
    return wordnet.NOUN
#lemmatizing
import string
def clean_text(text):
    """Lemmatize every whitespace-separated word of *text*.

    Each word has leading and trailing punctuation stripped, the words are
    POS-tagged, and each one is lemmatized using the WordNet part of speech
    that ``get_wordnet_pos`` derives from its tag.

    Returns:
        The lemmas joined by single spaces.
    """
    tokens = [token.strip(string.punctuation) for token in text.split()]
    tagged_tokens = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)
# Store the lemmatized form of every cleaned review in a "Lemmas" column.
df2["Lemmas"] = df2["Clean"].map(clean_text)
df2.head(2)
# Score every lemmatized review with the sentiment analyzer.
sia = SentimentIntensityAnalyzer()

# polarity_scores() returns all four values in one dict, so call it once per
# review and split the result into columns, rather than re-scoring the same
# text separately for each column.
polarity = [sia.polarity_scores(lemmas) for lemmas in df2["Lemmas"]]
df2["Positive"] = [scores["pos"] for scores in polarity]
df2["Negative"] = [scores["neg"] for scores in polarity]
df2["Neutral"] = [scores["neu"] for scores in polarity]
df2["Compound"] = [scores["compound"] for scores in polarity]
df2.head(3)
def sentiment(score):
    """Turn a compound sentiment score into a label.

    Args:
        score: The compound score of one review.

    Returns:
        ``"Positive"`` for scores of 0.5 or more, ``"Negative"`` for scores of
        -0.5 or less, and ``"Neutral"`` for everything else.
    """
    if score >= 0.5:
        return "Positive"
    if score <= -0.5:
        return "Negative"
    # The label was previously misspelled "Neutural".
    return "Neutral"
# Label every review from its compound score.
df2["Sentiment"] = df2["Compound"].map(sentiment)
df2.head(3)

# Donut chart of how many reviews carry each label.
sentiments = df2["Sentiment"].value_counts()
text = sentiments.index
quantity = sentiments.values
custom_colors = ["orange", "lightblue", "purple"]

plt.figure(figsize=(5, 5))
plt.pie(x=quantity, labels=text, colors=custom_colors)

# A white disc drawn over the centre turns the pie into a ring.
central_circle = plt.Circle(xy=(0, 0), radius=0.5, color="white")
fig = plt.gcf()
fig.gca().add_artist(central_circle)

plt.rc("font", size=12)
plt.title("Sentiments", fontsize=20)
plt.show()
# Share of positive / negative / neutral reviews, first with a compound
# threshold of 0.5 and then with the threshold lowered to 0.2.
#
# A review is negative when its compound score is below the NEGATIVE
# threshold. The previous comparison against the positive threshold
# (`< 0.5` / `< 0.2`) counted almost every review as negative.
# NOTE(review): scores exactly equal to +/-threshold are counted as neutral
# here, whereas sentiment() labels exactly +/-0.5 as Positive/Negative.
total = len(df2["Compound"])
for threshold in (0.5, 0.2):
    pos = (df2.Compound > threshold).sum()
    neg = (df2.Compound < -threshold).sum()
    neu = ((df2.Compound >= -threshold) & (df2.Compound <= threshold)).sum()
    print("Percentage of positive review: {}%".format(pos / total * 100))
    print("Percentage of negative review: {}%".format(neg / total * 100))
    print("Percentage of neutral review: {}%".format(neu / total * 100))
# Mean "Positive" score per reviewer nationality: the ten highest means.
reviewer_score = (
    df2.groupby(by=["Reviewer_Nationality"])["Positive"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
reviewer_score

# One bar per nationality, coloured by nationality.
px.bar(
    reviewer_score,
    x="Reviewer_Nationality",
    y="Positive",
    color="Reviewer_Nationality",
)
from itertools import islice
# Exploratory vectorizer over unigrams and bigrams: keep every n-gram that
# occurs at least once, except those found in more than half the documents.
cvec = CountVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.5)

# Learn the vocabulary of all n-grams found in the lemmatized reviews.
cvec.fit(df2["Lemmas"])

# Peek at the first 20 (n-gram, column index) entries.
list(islice(cvec.vocabulary_.items(), 20))

# Number of unique n-grams.
len(cvec.vocabulary_)
# Stricter vectorizer: keep only n-grams present in at least 0.2% and at most
# 60% of the documents.
cvec = CountVectorizer(min_df=.002, max_df=.6, ngram_range=(1, 2))

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass, instead of tokenising the whole corpus once for fit() and again for
# transform().
cvec_counts = cvec.fit_transform(df2.Lemmas)
len(cvec.vocabulary_)

print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
# nnz / (rows * columns) is the share of NON-zero cells, i.e. the density of
# the matrix, so it is labelled as such (it was previously printed under the
# name "sparsity").
print('density: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out() (available since 1.0). Use the new method
# when it exists and fall back to the old one on older installations.
if hasattr(cvec, "get_feature_names_out"):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()

# Top 20 most common terms by total occurrences across all documents.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

# Calculate the TF-IDF weight of the terms from the raw counts.
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

# Top 20 terms by average TF-IDF weight over all documents.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)
# One figure with two axes side by side, one for each word cloud.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=[15, 10], facecolor=None)
# Colour callback handed to WordCloud.recolor: every word gets the same
# near-black colour.
def black_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return one fixed near-black HSL colour string.

    All arguments are accepted only so the function can be used as a
    ``color_func`` callback; none of them influences the result.
    """
    return "hsl(0,100%, 1%)"
import os

# The Arial path is a macOS system location. When the file is missing,
# pass font_path=None so WordCloud uses its own default font instead of
# failing while opening the font.
font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'
if not os.path.isfile(font_path):
    font_path = None

# Positive reviews: light yellow background to tell the two clouds apart,
# square 2000 x 2000 canvas, at most 500 words.
text1 = " ".join(df.Positive_Review.astype(str))
wordcloud1 = WordCloud(
    font_path=font_path,
    background_color="#fff7e0",
    width=2000,
    height=2000,
    max_words=500,
).generate(text1)
# Draw every word in black.
wordcloud1.recolor(color_func=black_color)
# Plot the cloud on the left axes, without axis decorations.
ax1.imshow(wordcloud1, interpolation="bilinear")
ax1.axis("off")
ax1.set_title('Positive Reviews', fontsize=20)

# Negative reviews: same settings on a light grey background.
text2 = " ".join(df.Negative_Review.astype(str))
wordcloud2 = WordCloud(
    font_path=font_path,
    background_color="#e7e8ea",
    width=2000,
    height=2000,
    max_words=500,
).generate(text2)
# Draw every word in black.
wordcloud2.recolor(color_func=black_color)
# Plot the cloud on the right axes, without axis decorations.
ax2.imshow(wordcloud2, interpolation="bilinear")
ax2.axis("off")
ax2.set_title('Negative Reviews', fontsize=20)