from ast import literal_eval
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import sidetable
# download nltk packages
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
df = pd.read_csv('TA_restaurants_curated.csv')
print('The dataset has %s rows and %s columns' % df.shape)
display("Initially dataset", df)
# Drop unnecessary columns
df.drop(columns=['Unnamed: 0','URL_TA','ID_TA', 'Ranking', 'Number of Reviews',
'Rating'], inplace=True)
# Rename columns to a consistent naming convention
df.rename(columns={'Name': 'name', 'City': 'city', 'Cuisine Style': 'style',
'Price Range': 'price', 'Reviews': 'reviews' }, inplace=True)
display('Dataset after attribute reduction and renaming', df.head())
display('Missing values', df.stb.missing())
# Entries with no reviews are stored as the literal string "[[], []]";
# replace them with pandas NA so they are dropped below
df['reviews'].replace("[[], []]", pd.NA, inplace=True)
print("Before drop: %s" % df.shape[0])
df.dropna(inplace=True)
print("After drop: %s" % df.shape[0])
display('Dataset after data reduction', df)
df['reviews'] = df['reviews'].str.lower()
df['style'] = df['style'].str.lower()
df['city'] = df['city'].str.lower()
display('Dataset after converting to lower case', df.head())
# Convert the stored string representation to a Python list
df['style'] = df['style'].apply(literal_eval)
# Join the list items together
df['style'] = df['style'].apply(lambda x: ', '.join(x))
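# For illustration (made-up value, not from the dataset): the CSV stores each
# style list as a string such as "['french', 'european']", which literal_eval
# parses safely back into a Python list before joining.
assert ', '.join(literal_eval("['french', 'european']")) == 'french, european'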
# The `reviews` column contains a handful of malformed entries (5 in this
# dataset) that cannot be parsed; mark them as NA and drop them.
def convert_to_list(x):
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        # literal_eval raises either error on malformed input
        return pd.NA
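# Illustration of convert_to_list (both values are made up): a well-formed
# entry parses into the [review_texts, review_dates] pair, while a malformed
# one becomes NA so it can be dropped below.
assert convert_to_list("[['great pasta'], ['01/02/2018']]") == [['great pasta'], ['01/02/2018']]
assert convert_to_list("[['unterminated") is pd.NA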
# Convert strings to python list and dirty data to pandas NA
df['reviews'] = df['reviews'].apply(convert_to_list)
# Drop dirty items
print("Before drop: %s" % df.shape[0])
df.dropna(inplace=True)
print("After drop: %s" % df.shape[0])
# Join the review texts together. Each parsed value is a pair of lists
# [review_texts, review_dates]; keep only the first element (the texts)
# and ignore the dates.
df['reviews'] = df['reviews'].apply(lambda x: ', '.join(x[0]))
display('Dataset after we clean `reviews` and `style` columns', df.head())
# Inspect the price symbols, then map them to descriptive labels
df['price'].unique()
df['price'].replace(['$', '$$ - $$$', '$$$$'], ['cheap-eats', 'mid-range', 'fine-dining'], inplace=True)
df
print('Number of cities: %s ' % df['city'].nunique())
cities_list = df['city'].unique()
print(cities_list)
# Load the world cities dataset as a pandas data frame:
world_cities_df = pd.read_csv('cities.csv')
# Convert all attributes to lowercase:
world_cities_df['country'] = world_cities_df['country'].str.lower()
world_cities_df['city'] = world_cities_df['city'].str.lower()
display("Sample of the world cities dataset", world_cities_df.head())
# Inner-merge to add the country column; note that restaurants in cities
# missing from the cities dataset are dropped as a side effect
df = pd.merge(df, world_cities_df, on='city', how='inner')
# Reindex data frame to move country next to city
df = df.reindex(columns=['name', 'city', 'country', 'price', 'style', 'reviews'])
display("Sample of our new data frame with country column", df.sample(5))
print('Number of countries: %s ' % df['country'].nunique())
countries_list = df['country'].unique()
print(countries_list)
# Set up the lemmatizer and the English stop-word list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def process_sentences(text):
    # Expand common contractions up front; after tokenization the fragments
    # ("n't", "'s", ...) are non-alphabetic and would be discarded below
    for contraction, expansion in (("n't", " not"), ("'m", " am"), ("'s", " is"),
                                   ("'re", " are"), ("'ll", " will"),
                                   ("'ve", " have"), ("'d", " would")):
        text = text.replace(contraction, expansion)
    temp_sent = []
    # Tokenize words
    words = nltk.word_tokenize(text)
    # Lemmatize each word according to its part-of-speech tag
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):  # only verbs
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        # Remove stop words and non-alphabetic tokens
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
    return ' '.join(temp_sent)
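# A quick illustration on a made-up sentence: stop words and punctuation are
# dropped, verbs are lemmatized via their POS tag ("served" -> "serve") and
# plural nouns are singularized ("pizzas" -> "pizza"); expect output along
# the lines of "pizza amazing staff serve quickly".
print(process_sentences("the pizzas were amazing and the staff served quickly"))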
df['reviews_processed'] = df['reviews'].apply(process_sentences)
df['style_processed'] = df['style'].apply(process_sentences)
df[['reviews', 'reviews_processed', 'style', 'style_processed']].sample(5)
df['bag_of_words'] = df['style_processed'] + ' ' + df['reviews_processed']
display('Sample of bag_of_words attribute', df[['style', 'reviews', 'bag_of_words']].sample(5))
price_map = {
'cheap-eats': ('cheap', 'inexpensive', 'low-price', 'low-cost', 'economical',
'economic', 'affordable'),
'mid-range': ('moderate', 'fair', 'mid-price', 'reasonable', 'average'),
'fine-dining': ('expensive', 'fancy', 'lavish')
}
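# A minimal sketch of the lookup used in recommend() below: the first price
# class whose synonyms appear in the text wins (the sample text is made up).
_sample = 'an affordable lunch spot'
print(next((key for key, words in price_map.items()
            if any(word in _sample for word in words)), None))  # -> cheap-eats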
def recommend(description):
# Convert user input to lowercase
description = description.lower()
data = df.copy()
# Extract cities
cities_input = []
for city in cities_list:
if city in description:
cities_input.append(city)
description = description.replace(city, "")
if cities_input:
data = data[data['city'].isin(cities_input)]
# Same thing for countries
countries_input = []
for country in countries_list:
if country in description:
countries_input.append(country)
description = description.replace(country, "")
if countries_input:
data = data[data['country'].isin(countries_input)]
# Extract price class
for key, value in price_map.items():
if any(v in description for v in value):
data = data[data['price'] == key]
break
# Process user description text input
description = process_sentences(description)
description = description.strip()
print('Processed user feedback:', description)
# Init a TF-IDF vectorizer
tfidfvec = TfidfVectorizer()
    # Fit the vectorizer on the bag_of_words column (cuisine styles + reviews)
vec = tfidfvec.fit(data["bag_of_words"])
features = vec.transform(data["bag_of_words"])
# Transform user input data based on fitted model
description_vector = vec.transform([description])
    # Compute cosine similarities between the user's processed input and each
    # restaurant's bag_of_words; TF-IDF rows are L2-normalized, so the linear
    # kernel (dot product) is exactly cosine similarity (see the check below)
    cos_sim = linear_kernel(description_vector, features)
# Add similarities to data frame
data['similarity'] = cos_sim[0]
# Sort data frame by similarities
data.sort_values(by='similarity', ascending=False, inplace=True)
return data[['name', 'city', 'country', 'price', 'style', 'reviews', 'similarity']]
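# Sanity check (made-up documents): TfidfVectorizer L2-normalizes rows by
# default, so the dot products from linear_kernel match cosine_similarity.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
_docs = ['cheap pizza', 'fancy sushi', 'cheap pasta']
_matrix = TfidfVectorizer().fit_transform(_docs)
assert np.allclose(linear_kernel(_matrix, _matrix), cosine_similarity(_matrix, _matrix))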
recommend('for vietnamese food')
# Feedback with multiple cities
recommend('good for party in Paris or London')
# Feedback with a country
recommend('vegan in Finland')
# Feedback with a price class
recommend('fancy for meetings')
# Feedback with a price class and a location
recommend('a reasonable breakfast in France')
recommend('romantic cafe for dating')
recommend('a bar with live music and good atmosphere')
recommend('nice place for valentine date')
recommend('good quality espresso in italy')
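# The recommender returns the full frame sorted by similarity; chain .head()
# to inspect only the closest matches, e.g. the top ten:
recommend('good quality espresso in italy').head(10)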