from ast import literal_eval
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import sidetable
# download nltk packages
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
df = pd.read_csv('TA_restaurants_curated.csv')
print('The dataset has %s rows and %s columns' % df.shape)
display("Initially dataset", df)
# Drop unnecessary columns
df.drop(columns=['Unnamed: 0','URL_TA','ID_TA', 'Ranking', 'Number of Reviews',
'Rating'], inplace=True)
# Rename columns to a consistent naming convention
df.rename(columns={'Name': 'name', 'City': 'city', 'Cuisine Style': 'style',
'Price Range': 'price', 'Reviews': 'reviews' }, inplace=True)
display('Dataset after attribute reduction and renaming', df.head())
display('Missing values', df.stb.missing())
# Entries with no reviews are stored as the literal string "[[], []]";
# replace them with pandas NA so they are dropped below
df['reviews'].replace("[[], []]", pd.NA, inplace=True)
print("Before drop: %s" % df.shape[0])
df.dropna(inplace=True)
print("After drop: %s" % df.shape[0])
display('Dataset after data reduction', df)
df['reviews'] = df['reviews'].str.lower()
df['style'] = df['style'].str.lower()
df['city'] = df['city'].str.lower()
display('Dataset after converting to lower case', df.head())
# Convert the stored string representation to a Python list
df['style'] = df['style'].apply(literal_eval)
# Join the list items together
df['style'] = df['style'].apply(lambda x: ', '.join(x))
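# For illustration (made-up value, not from the dataset): the CSV stores each
# style list as a string such as "['french', 'european']", which literal_eval
# parses safely back into a Python list before joining.
assert ', '.join(literal_eval("['french', 'european']")) == 'french, european'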
# The `reviews` column contains a handful of malformed entries (5 in this
# dataset) that cannot be parsed; mark them as NA and drop them.
def convert_to_list(x):
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        # literal_eval raises either error on malformed input
        return pd.NA
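# Illustration of convert_to_list (both values are made up): a well-formed
# entry parses into the [review_texts, review_dates] pair, while a malformed
# one becomes NA so it can be dropped below.
assert convert_to_list("[['great pasta'], ['01/02/2018']]") == [['great pasta'], ['01/02/2018']]
assert convert_to_list("[['unterminated") is pd.NA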
# Convert strings to python list and dirty data to pandas NA
df['reviews'] = df['reviews'].apply(convert_to_list)
# Drop dirty items
print("Before drop: %s" % df.shape[0])
df.dropna(inplace=True)
print("After drop: %s" % df.shape[0])
# Join the review texts together. Each parsed value is a pair of lists
# [review_texts, review_dates]; keep only the first element (the texts)
# and ignore the dates.
df['reviews'] = df['reviews'].apply(lambda x: ', '.join(x[0]))
display('Dataset after we clean `reviews` and `style` columns', df.head())
# Inspect the price symbols, then map them to descriptive labels
df['price'].unique()
df['price'].replace(['$', '$$ - $$$', '$$$$'], ['cheap-eats', 'mid-range', 'fine-dining'], inplace=True)
df
print('Number of cities: %s ' % df['city'].nunique())
cities_list = df['city'].unique()
print(cities_list)
# Load the world cities dataset as a pandas data frame:
world_cities_df = pd.read_csv('cities.csv')
# Convert all attributes to lowercase:
world_cities_df['country'] = world_cities_df['country'].str.lower()
world_cities_df['city'] = world_cities_df['city'].str.lower()
display("Sample of the world cities dataset", world_cities_df.head())
# Inner-merge to add the country column; note that restaurants in cities
# missing from the cities dataset are dropped as a side effect
df = pd.merge(df, world_cities_df, on='city', how='inner')
# Reindex data frame to move country next to city
df = df.reindex(columns=['name', 'city', 'country', 'price', 'style', 'reviews'])
display("Sample of our new data frame with country column", df.sample(5))
print('Number of countries: %s ' % df['country'].nunique())
countries_list = df['country'].unique()
print(countries_list)
# Set up the lemmatizer and the English stop-word list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def process_sentences(text):
    # Expand common contractions up front; after tokenization the fragments
    # ("n't", "'s", ...) are non-alphabetic and would be discarded below
    for contraction, expansion in (("n't", " not"), ("'m", " am"), ("'s", " is"),
                                   ("'re", " are"), ("'ll", " will"),
                                   ("'ve", " have"), ("'d", " would")):
        text = text.replace(contraction, expansion)
    temp_sent = []
    # Tokenize words
    words = nltk.word_tokenize(text)
    # Lemmatize each word according to its part-of-speech tag
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):  # only verbs
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        # Remove stop words and non-alphabetic tokens
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
    return ' '.join(temp_sent)
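# A quick illustration on a made-up sentence: stop words and punctuation are
# dropped, verbs are lemmatized via their POS tag ("served" -> "serve") and
# plural nouns are singularized ("pizzas" -> "pizza"); expect output along
# the lines of "pizza amazing staff serve quickly".
print(process_sentences("the pizzas were amazing and the staff served quickly"))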
df['reviews_processed'] = df['reviews'].apply(process_sentences)
df['style_processed'] = df['style'].apply(process_sentences)
df[['reviews', 'reviews_processed', 'style', 'style_processed']].sample(5)
df['bag_of_words'] = df['style_processed'] + ' ' + df['reviews_processed']
display('Sample of bag_of_words attribute', df[['style', 'reviews', 'bag_of_words']].sample(5))
price_map = {
'cheap-eats': ('cheap', 'inexpensive', 'low-price', 'low-cost', 'economical',
'economic', 'affordable'),
'mid-range': ('moderate', 'fair', 'mid-price', 'reasonable', 'average'),
'fine-dining': ('expensive', 'fancy', 'lavish')
}
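# A minimal sketch of the lookup used in recommend() below: the first price
# class whose synonyms appear in the text wins (the sample text is made up).
_sample = 'an affordable lunch spot'
print(next((key for key, words in price_map.items()
            if any(word in _sample for word in words)), None))  # -> cheap-eats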
def recommend(description):
# Convert user input to lowercase
description = description.lower()
data = df.copy()
# Extract cities
cities_input = []
for city in cities_list:
if city in description:
cities_input.append(city)
description = description.replace(city, "")
if cities_input:
data = data[data['city'].isin(cities_input)]
# Same thing for countries
countries_input = []
for country in countries_list:
if country in description:
countries_input.append(country)
description = description.replace(country, "")
if countries_input:
data = data[data['country'].isin(countries_input)]
# Extract price class
for key, value in price_map.items():
if any(v in description for v in value):
data = data[data['price'] == key]
break
# Process user description text input
description = process_sentences(description)
description = description.strip()
print('Processed user feedback:', description)
# Init a TF-IDF vectorizer
tfidfvec = TfidfVectorizer()
    # Fit the vectorizer on the bag_of_words column (cuisine styles + reviews)
vec = tfidfvec.fit(data["bag_of_words"])
features = vec.transform(data["bag_of_words"])
# Transform user input data based on fitted model
description_vector = vec.transform([description])
    # Compute cosine similarities between the user's processed input and each
    # restaurant's bag_of_words; TF-IDF rows are L2-normalized, so the linear
    # kernel (dot product) is exactly cosine similarity (see the check below)
    cos_sim = linear_kernel(description_vector, features)
# Add similarities to data frame
data['similarity'] = cos_sim[0]
# Sort data frame by similarities
data.sort_values(by='similarity', ascending=False, inplace=True)
return data[['name', 'city', 'country', 'price', 'style', 'reviews', 'similarity']]
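# Sanity check (made-up documents): TfidfVectorizer L2-normalizes rows by
# default, so the dot products from linear_kernel match cosine_similarity.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
_docs = ['cheap pizza', 'fancy sushi', 'cheap pasta']
_matrix = TfidfVectorizer().fit_transform(_docs)
assert np.allclose(linear_kernel(_matrix, _matrix), cosine_similarity(_matrix, _matrix))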
recommend('for vietnamese food')
# Feedback with multiple cities
recommend('good for party in Paris or London')
# Feedback with a country
recommend('vegan in Finland')
# Feedback with a price class
recommend('fancy for meetings')
# Feedback with a price class and a location
recommend('a reasonable breakfast in France')
recommend('romantic cafe for dating')
recommend('a bar with live music and good atmosphere')
recommend('nice place for valentine date')
recommend('good quality espresso in italy')
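# The recommender returns the full frame sorted by similarity; chain .head()
# to inspect only the closest matches, e.g. the top ten:
recommend('good quality espresso in italy').head(10)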