Challenge 2: Analyse sentiment from GoodReads vs IMDB reviews
What is sentiment analysis?
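In a nutshell, sentiment analysis assigns a piece of text a polarity score (how negative or positive it is) and, in the TextBlob flavour used throughout this notebook, a subjectivity score (how opinionated it is). A minimal illustrative sketch on a hand-made sentence (values are approximate):
from textblob import TextBlob
blob = TextBlob("What a wonderful book!")
blob.sentiment #polarity in [-1, 1] (here close to +1), subjectivity in [0, 1] (here close to 1)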
Brief presentation of GoodReads and IMDB
!python -m pip install --upgrade pip
!pip install -U spaCy #to install spaCy
!python -m spacy download en_core_web_lg #download the large English language model ('en_core_web_sm': small, 'en_core_web_md': medium, 'en_core_web_lg': large); the old 'en' shortcut and 'spacy.en.download' commands are deprecated in spaCy v3
!pip install spacytextblob
!pip install spacy-wordnet
!pip install bs4
!pip install selenium
!pip install requests
#!pip install Goodreads
!pip install ml_datasets #to download IMDB reviews data
import pandas as pd
import numpy as np
import collections
import string
import time
import tqdm
import re
import os
import random
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
from html.parser import HTMLParser
import html
from wordcloud import WordCloud
import spacy
#spacy.load("en_core_web_lg") #sm: small English model, md: medium English model, lg: large English model
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from spacy_wordnet.wordnet_annotator import WordnetAnnotator as wordnet
import textblob
import unicodedata
import nltk
nltk.download('wordnet')
#import pattern.en
from spacytextblob.spacytextblob import SpacyTextBlob #importing the module registers the 'spacytextblob' pipeline component
print(f'spaCy version : {spacy.__version__}') #spaCy v3.1.3
from sklearn import model_selection
#from sklearn import ensemble
from sklearn import metrics
from sklearn.feature_extraction import text
import ml_datasets
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('spacy_wordnet')
nlp.add_pipe('spacytextblob')
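As an optional sanity check (not required for the rest of the notebook), the pipeline components can be listed to confirm that the two custom ones were registered:
print(nlp.pipe_names) #the standard en_core_web_lg components followed by 'spacy_wordnet' and 'spacytextblob'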
CONTRACTIONS_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
Helper functions for book reviews (web) scraping
def get_book_title(soup):
try:
book_title = ' '.join(soup.find('h1', {'id': 'bookTitle'}).text.split())
return book_title
except:
return "NA"
def get_author_name(soup):
try:
author_name = ' '.join(soup.find('span', {'itemprop': 'name'}).text.split())
return author_name
except:
return "NA"
def get_edition_language(soup):
try:
edition_language = soup.find('div', {'itemprop': 'inLanguage'}).text.strip()
return edition_language
except:
return "NA"
def get_isbn(soup):
try:
isbn = re.findall(r'nisbn: [0-9]{10}' , str(soup))[0].split()[1]
return isbn
except:
return ""
def get_genres(soup):
genres = []
for node in soup.find_all('div', {'class': 'left'}):
current_genres = node.find_all('a', {'class': 'actionLinkLite bookPageGenreLink'})
current_genre = ' > '.join([g.text for g in current_genres])
if current_genre.strip():
genres.append(current_genre)
return genres
def get_series_name(soup):
    try:
        series = soup.find(id="bookSeries").find("a")
        if series:
            series_name = re.search(r'\((.*?)\)', series.text).group(1)
            return series_name
        return "NA"
    except:
        return "NA"
def get_num_pages(soup):
if soup.find('span', {'itemprop': 'numberOfPages'}):
num_pages = soup.find('span', {'itemprop': 'numberOfPages'}).text.strip()
return int(num_pages.split()[0])
return "NA"
def get_year_first_published(soup):
year_first_published = soup.find('nobr', attrs={'class':'greyText'})
try:
if year_first_published:
year_first_published = year_first_published.string
return re.search('([0-9]{3,4})', year_first_published).group(1)
else:
return "NA"
except:
return "NA"
def get_book_distinction(soup):
list_of_awards = []
try:
book_distinction = soup.find_all('a', {'class': 'award'})
for award in book_distinction:
list_of_awards.append(award.text)
return list_of_awards
except:
return "NA"
def get_num_ratings(soup):
try:
num_ratings = soup.find('meta', {'itemprop': 'ratingCount'})['content'].strip()
return int(num_ratings)
except:
return "NA"
def get_num_reviews(soup):
try:
num_reviews = soup.find('meta', {'itemprop': 'reviewCount'})['content'].strip()
return int(num_reviews)
except:
return "NA"
def get_average_ratings(soup):
try:
average_ratings = soup.find('span', {'itemprop': 'ratingValue'}).text.strip()
return float(average_ratings)
except:
return "NA"
def get_author_profile(soup):
try:
about_the_author = ' '.join(soup.find('div', {'class': 'bookAuthorProfile__about'}).text.split())
        return about_the_author[:-8] #drop the trailing ' ...more'
except:
return "NA"
def get_book_synopsys(soup):
try:
book_synopsys = ' '.join(soup.find('div', {'id': 'description'}).text.split())
return book_synopsys[:-8] #remove ' ...more'
except:
return "NA"
def get_book_review(soup):
    ''' scrape the first review among the 1-30 available on the page '''
try:
book_review = ' '.join(soup.find('div', {'id': 'bookReviews'}).find_all('span', {'class': 'readable'})[0].text.split())
return book_review
except:
return "NA"
def book_review_scraper(book_page_url):
url = 'https://www.goodreads.com/book/show/' + str(book_page_url)
page_source = requests.get(url).content
soup = BeautifulSoup(page_source,'html.parser')
    time.sleep(random.randint(1,4)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
return {
'title': get_book_title(soup),
'author': get_author_name(soup),
'author_profile': get_author_profile(soup),
'year_first_published': get_year_first_published(soup),
'book_series': get_series_name(soup),
'edition_language': get_edition_language(soup),
'isbn': get_isbn(soup),
'synopsys': get_book_synopsys(soup),
'num_pages': get_num_pages(soup),
'genres': get_genres(soup),
'awards': get_book_distinction(soup),
'num_ratings': get_num_ratings(soup),
'num_reviews': get_num_reviews(soup),
'average_rating': get_average_ratings(soup),
'review': get_book_review(soup)
}
def book_scraper(book_page_url):
url = 'https://www.goodreads.com/book/show/' + str(book_page_url)
page_source = requests.get(url).content
soup = BeautifulSoup(page_source,'html.parser')
    time.sleep(random.randint(1,3)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
return {
'title': get_book_title(soup),
'author': get_author_name(soup),
'review': get_book_review(soup),
'average_rating': get_average_ratings(soup)
}
def books_web_scraper(first_url, last_url):
''' function to scrape reviews of (last_url - first_url+1) books '''
scraped_books = []
for book_page_url in tqdm.tqdm(range(first_url, last_url+1)):
book_review = book_scraper(book_page_url)
scraped_books.append(book_review)
return pd.DataFrame(scraped_books)
book_page_url = 1
book_review_scraper(book_page_url) #long definition
book_page_url = 1
book_scraper(book_page_url) #short definition
def get_movie_title(movie):
try:
movie_title = movie.find_all("h3", {"class": "lister-item-header"})[0].find("a").getText()
return movie_title
except:
return 'NA'
def get_release_year(movie):
try:
release_year = movie.find_all("h3", {"class": "lister-item-header"})[0].find("span", {"class": "lister-item-year text-muted unbold"}).getText()
return release_year
except:
return 'NA'
def get_genre(movie):
try:
movie_genre = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "genre"}).getText()
movie_genre = movie_genre.split()
_list_ = []
for genre in movie_genre:
if genre[-1] in list('[@_!#$%^&*()<>?/\|}{~:,]'):
genre = genre[:-1]
_list_.append(genre)
return _list_
except:
return 'NA'
def get_duration_in_min(movie):
try:
movie_duration = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "runtime"}).getText()
movie_duration = int(movie_duration[:-3]) #remove 'min'
return movie_duration
except:
return 'NA'
def get_certificate(movie):
try:
movie_certificate = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "certificate"}).getText()
try:
movie_certificate = int(movie_certificate)
movie_certificate = str(movie_certificate)+' years and above'
return movie_certificate
except:
return 'All public'
except:
return 'NA'
def get_synopsys(movie):
try:
movie_synopsys = movie.find_all("p", {"class": "text-muted"})[1].getText()
return movie_synopsys[1:]
except:
return 'NA'
def get_covers_url(movie):
try:
movie_covers_url = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('loadlate')
return movie_covers_url
except:
return 'NA'
def get_covers_id(movie):
try:
movie_covers_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
return movie_covers_id
except:
return 'NA'
def get_director_or_creator(movie):
try:
movie_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
url = f'http://www.imdb.com/title/{movie_id}/?ref_=adv_li_tt'
p_source = requests.get(url).content
sp = BeautifulSoup(p_source,'html.parser')
dr_or_cr = sp.find("a", {"class": "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link"}).getText()
return dr_or_cr
except:
return "NA"
def get_movie_average_ratings(movie): #renamed so it does not overwrite the Goodreads get_average_ratings defined above
    try:
        average_ratings = movie.find_all("div", {"class": "inline-block ratings-imdb-rating"})[0].getText()
        return float(average_ratings.split()[0])
    except:
        return "NA"
def get_movie_num_reviews(movie): #renamed so it does not overwrite the Goodreads get_num_reviews defined above
    try:
        num_reviews = movie.find_all("p", {"class": "sort-num_votes-visible"})[0].find("span", {"name": "nv"}).getText()
        num_reviews = float(num_reviews.replace(',', '')) #vote counts come with thousands separators, e.g. '1,234'
        return num_reviews
    except:
        return "NA"
def get_movie_review(movie):
try:
movie_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
url = f'http://www.imdb.com/title/{movie_id}/reviews?spoiler=hide&sort=userRating&dir=desc&ratingFilter=0'
p_source = requests.get(url).content
sp = BeautifulSoup(p_source,'html.parser')
        review = ' '.join(sp.find("div", {"id": "main"}).find_all('div', {'class': 'text show-more__control'})[0].text.split())
return review
except:
return "NA"
def movie_review_scraper(movie):
return {
'title': get_movie_title(movie),
'movie_release_year': get_release_year(movie),
'movie_creator_or_director': get_director_or_creator(movie),
'movie_genre': get_genre(movie),
'movie_duration_in_min': get_duration_in_min(movie),
'movie_certificate': get_certificate(movie),
'movie_synopsys': get_synopsys(movie),
'movie_covers_url': get_covers_url(movie),
'movie_covers_id': get_covers_id(movie),
        'average_rating': get_movie_average_ratings(movie),
        'movie_num_reviews': get_movie_num_reviews(movie),
'review': get_movie_review(movie),
}
def movie_scraper(movie):
return {
'title': get_movie_title(movie),
'movie_release_year': get_release_year(movie),
'movie_creator_or_director': get_director_or_creator(movie),
'review': get_movie_review(movie),
        'average_rating': get_movie_average_ratings(movie),
}
def movies_web_scraper(max_page):
''' function to scrape reviews of (max_page*50) movies within a period of release '''
scraped_movies = []
for page_num in tqdm.tqdm(range(0, max_page)):
next_records_id = 0 if page_num==0 else (50*page_num)+1
url_being_queried = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2021-12-31&start={next_records_id}&ref_=adv_nxt'
page_source = requests.get(url_being_queried).content
soup = BeautifulSoup(page_source,'html.parser')
        time.sleep(random.randint(1,3)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})
for movie in movies_list:
_50_movies_reviews = movie_scraper(movie)
scraped_movies.append(_50_movies_reviews)
return pd.DataFrame(scraped_movies)
next_records_id = 0
url_being_queried = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2021-12-31&start={next_records_id}&ref_=adv_nxt'
page_source = requests.get(url_being_queried).content
soup = BeautifulSoup(page_source,'html.parser')
time.sleep(2)
movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})
movie = movies_list[4]
movie_review_scraper(movie) #long definition
movie_scraper(movie) #short definition
def get_sentiment_score(review):
''' function to return the sentiment score given a review. '''
doc = nlp(review)
return doc._.polarity
def get_sentiment_label(review):
''' function to return the sentiment label given a review. '''
doc = nlp(review)
### the polarity is scored on a scale from -1 (negative) to +1 (positive)
if doc._.polarity > 0:
result = 'positive'
elif doc._.polarity < 0:
result = 'negative'
else:
result = 'neutral'
return result
def get_subjectivity_score(review):
''' function to return the subjectivity score given a review. '''
### the subjectivity is scored on a scale from 0 (fact - objective) to +1 (opinion - subjective)
doc = nlp(review)
return doc._.subjectivity
reviewA = 'Black Panthers is not a well-known movie.'
get_sentiment_score(reviewA), get_sentiment_label(reviewA), get_subjectivity_score(reviewA)
reviewB = 'A Smell Of Burning tells a brilliant history.'
get_sentiment_score(reviewB), get_sentiment_label(reviewB), get_subjectivity_score(reviewB)
reviewC = 'Titanic is a movie.'
get_sentiment_score(reviewC), get_sentiment_label(reviewC), get_subjectivity_score(reviewC)
def prepare_data(file_path, path_to_save):
"""
file_path: r'data/df_name.csv'
path_to_save: r'data/df_name.csv'
    returns: df with additional features (labels & scores)
"""
#load
data = pd.read_csv(file_path)
#additional features
data['sentiment_score'] = data['review'].astype(str).apply(get_sentiment_score)
data['sentiment_label'] = data['review'].astype(str).apply(get_sentiment_label)
data['subjectivity_score'] = data['review'].astype(str).apply(get_subjectivity_score)
#keep only
data = data[['title','review', 'subjectivity_score', 'sentiment_score', 'sentiment_label']]
#save
data.to_csv(path_to_save, encoding='utf-8')
    #preview and return the enriched dataframe
    print(data.head(n=3))
    return data
def train_test_split(file_path, train_size = 0.8, binary=True):
"""
    split the data into train and validation sets.
    stratified sampling prevents the shortcomings of unbalanced data.
"""
data = pd.read_csv(file_path)
if (binary):
data = data[data.sentiment_label.isin(['positive', 'negative'])].reset_index()
test_rate = 1 - train_size
train, validation = model_selection.train_test_split(
data, test_size= test_rate, random_state=42, shuffle=True, stratify= data.sentiment_label
)
return train, validation
class MLStripper(HTMLParser):
"""
Function to remove unnecessary HTML characters from text documents
"""
def __init__(self):
super().__init__()
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ' '.join(self.fed)
def strip_html(text):
html_stripper = MLStripper()
html_stripper.feed(text)
return html_stripper.get_data()
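A small illustrative call on a hand-made string (not from the scraped data), to show what the stripper returns:
strip_html("<b>Great</b> movie!<br/>A must-see.") #-> roughly 'Great  movie! A must-see.' (tags removed, text chunks joined with spaces)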
def expand_contractions(text, contraction_mapping):
"""
function for expanding contractions. It takes in a body of text and returns
the same with its contractions expanded if there is a match.
"""
contractions_pattern = re.compile(
'({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL
)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
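An illustrative call on a hand-made sentence (not from the scraped data):
expand_contractions("I can't believe it's this good!", CONTRACTIONS_MAP) #-> 'I cannot believe it is this good!'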
def lemmatize_text(text):
"""
Function to lemmatize text based on POS tags.
It takes in a body of text data and lemmatizes each word of the text based
on its POS tag if it is present and then returns the lemmatized text back.
"""
lemmatized_tokens = [token.lemma_.lower()
for token in nlp(text)]
lemmatized_text = ' '.join(lemmatized_tokens)
return lemmatized_text
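An illustrative call on a hand-made sentence; the exact lemmas depend on the spaCy model version:
lemmatize_text("The movies were amazing") #-> roughly 'the movie be amazing'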
def normalize_accented_characters(text):
"""
Function to normalize special accented characters and convert
them into regular ASCII characters so as to standardize the text across all reviews.
"""
text = unicodedata.normalize( 'NFKD', text ).encode('ascii', 'ignore').decode('utf8')
return text
def tokenize_text(text):
tokens = nlp(text)
tokens = [token.text.strip() for token in tokens]
return tokens
def remove_stopwords(text):
tokens = tokenize_text(text)
filtered_tokens = [
token for token in tokens if token not in stopwords
]
filtered_text = ' '.join(filtered_tokens)
return filtered_text
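Note that spaCy's default stop-word list also contains negations such as 'not', which can flip the meaning of a review; a small hand-made illustration:
remove_stopwords("this is not a good movie") #-> 'good movie'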
def remove_special_characters(text):
"""
Function to remove unnecessary characters and symbols from the data.
"""
tokens = tokenize_text(text)
pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
filtered_text = ' '.join(filtered_tokens)
return filtered_text
def keep_text_characters(text):
"""
Function to only extract text tokens from a body of text for which we use regular expressions.
"""
filtered_tokens = []
tokens = tokenize_text(text)
for token in tokens:
if re.search('[a-zA-Z]', token):
filtered_tokens.append(token)
filtered_text = ' '.join(filtered_tokens)
return filtered_text
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
"""
Overall text normalization function for contractions, lemmatization, HTML unescaping,
special characters removal, and stopwords removal functions.
"""
normalized_corpus = []
for index, review in tqdm.tqdm(enumerate(corpus)):
review = normalize_accented_characters(review)
review = html.unescape(review)
review = strip_html(review)
review = expand_contractions(review, CONTRACTIONS_MAP)
if lemmatize:
review = lemmatize_text(review)
else:
review = review.lower()
review = remove_special_characters(review)
review = remove_stopwords(review)
if only_text_chars:
review = keep_text_characters(review)
if tokenize:
review = tokenize_text(review)
normalized_corpus.append(review)
else:
normalized_corpus.append(review)
return normalized_corpus
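An illustrative run on a tiny hand-made corpus (not the scraped reviews), just to see the whole chain at work; the exact output depends on the spaCy model version:
normalize_corpus(["I didn't like it... way too long!", "BEST movie <b>ever</b> :)"], lemmatize=True, only_text_chars=True)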
def clean_data(data, limit=0, normalize=True):
"""
this will take a list of texts and labels
and transform them in spacy documents
data: list(tuple(text, label))
returns: List(spacy.Doc.doc)
"""
# drop missing values
data.dropna(axis = 0, how ='any',inplace=True)
    # class distribution
print(data['sentiment_label'].value_counts())
# clean by removing 'emoji', 'url', etc.
if (normalize):
corpus = data['review']
data['review'] = normalize_corpus(corpus, lemmatize=True, only_text_chars=True, tokenize=False)
    # optionally truncate the data
    if limit:
        # shuffle first to eliminate any possible bias from the order in which reviews were loaded
        data = data.sample(frac=1, random_state=42).reset_index(drop=True)
        # rows to keep
        data = data[:limit]
    # transform into a tuple of (review, label) pairs
data_in_tuple_format = tuple(zip(data['review'].tolist(), data['sentiment_label'].tolist()))
return data, data_in_tuple_format
def preprocess_data(data, binary=True):
"""
Function to turn the reviews text and the labels into neat spaCy Doc Objects.
returns: List(spacy.Doc.doc)
"""
docs = []
if (binary):
for doc, label in tqdm.tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
# we need to set the (text)cat(egory) for each document
if (label=='positive'):
doc.cats['positive'] = 1
doc.cats['negative'] = 0
else:
doc.cats['positive'] = 0
doc.cats['negative'] = 1
# put them into a nice list
docs.append(doc)
else:
for doc, label in tqdm.tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
# we need to set the label for each review
if (label=='positive'):
doc.cats['positive'] = 1
doc.cats['negative'] = 0
doc.cats['neutral'] = 0
elif (label=='negative'):
doc.cats['positive'] = 0
doc.cats['negative'] = 1
doc.cats['neutral'] = 0
else:
doc.cats['positive'] = 0
doc.cats['negative'] = 0
doc.cats['neutral'] = 1
# put them into a nice list
docs.append(doc)
return docs
def display_evaluation_metrics(true_labels, predicted_labels, positive_class=1):
"""
We will be evaluating our model based on precision, recall, accuracy, and F1-score,
which are suitable metrics for text classification.
"""
accuracy = np.round( metrics.accuracy_score(true_labels, predicted_labels),2)
precision = np.round( metrics.precision_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
recall = np.round( metrics.recall_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
f1_score = np.round( metrics.f1_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
return {"accuracy": accuracy, "precision": precision, "recall": recall, "f-score": f1_score}
def make_predictions(dataframe):
""" Generate predictions """
scores = []
labels = []
for row in tqdm.tqdm(range(len(dataframe))):
scores.append(predict_score(dataframe['review'].iloc[row]))
labels.append(predict_label(dataframe['review'].iloc[row]))
dataframe['score_prediction'] = scores
dataframe['label_prediction'] = labels
return dataframe
_trained_model = None
def load_trained_model():
    """ Load the saved trained model ('model-best') once and cache it for subsequent calls. """
    global _trained_model
    if _trained_model is None:
        _trained_model = spacy.load("training/model-best") #best checkpoint written by 'spacy train'
    return _trained_model
def predict_score(input_data: str):
    """ Score assigned to the 'positive' category by the trained model. """
    return load_trained_model()(input_data).cats["positive"]
def predict_label(input_data: str):
    """ Predicted label: whichever of 'positive' / 'negative' gets the higher score. """
    cats = load_trained_model()(input_data).cats
    return 'positive' if cats["positive"] > cats["negative"] else 'negative'
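Once the pipeline has been trained and saved to 'training/model-best' (see the training step further down), these helpers can be called on any raw review string; the call below is illustrative and commented out because the trained model does not exist yet at this point in the notebook:
#predict_label("An absolute masterpiece, I loved every minute."), predict_score("An absolute masterpiece, I loved every minute.")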
first_url, last_url = 1, 2000
books_reviews_df = books_web_scraper(first_url, last_url)
books_reviews_df.head(n=3)
books_reviews_df.to_csv(r'books_reviews_df.csv', encoding='utf-8')
max_page = 40 #50 movies per page, i.e. 40*50 = 2,000 movie reviews
movies_reviews_df = movies_web_scraper(max_page)
movies_reviews_df.head(n=3)
movies_reviews_df.to_csv('movies_reviews_df.csv', encoding='utf-8')
prepare_data(file_path=r'movies_reviews_df.csv', path_to_save=r'movies_reviews.csv')
prepare_data(file_path=r'books_reviews_df.csv', path_to_save=r'books_reviews.csv')
data = pd.read_csv(r'movies_reviews.csv')
data.dropna(axis = 0, how ='any',inplace=True)
data = data[data.sentiment_label.isin(['positive', 'negative'])].reset_index(drop=True)
data['sentiment_label'].value_counts()
data['sentiment_label'].value_counts().plot(kind='bar')
fig, ax = plt.subplots(1, 3, figsize=(25,32))
wordcloud_global = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data['review']))
wordcloud_neg = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data[data['sentiment_label']=='negative']['review']))
wordcloud_pos = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data[data['sentiment_label']=='positive']['review']))
ax[0].imshow(wordcloud_global, interpolation="bilinear")
ax[0].axis('off')
ax[0].set_title('All movie reviews')
ax[1].imshow(wordcloud_pos, interpolation="bilinear")
ax[1].axis('off')
ax[1].set_title('Positive movie reviews')
ax[2].imshow(wordcloud_neg, interpolation="bilinear")
ax[2].axis('off')
ax[2].set_title('Negative movie reviews')
plt.show()
def get_adjectives(dataset):
"""
Here we create a new feature (called 'adjective_words') in the datatset.
"""
adjectives = []
for doc in dataset['review']:
row = nlp(doc)
tokens = [
token.lemma_.lower().strip() for token in row if token.pos_ == 'ADJ'
]
adjectives.append(tokens)
dataset['adjectives'] = pd.Series(adjectives)
return dataset
def adj_comprehensive_list(data):
""" unique values for adjectives used in reviews """
adj_values = []
for _list_ in data.adjectives:
for adj in _list_:
if adj_values.count(adj)==0:
adj_values.append(adj)
return adj_values
def get_dummy(sub_liste, _list_):
"""
Function that creates as many dummie variables as there are adjectives.
Input:
sub_list: adjectives used in a review.
_list_: extension definition of genre feature.
"""
results=[]
for elt in _list_:
y = (sub_liste.count(elt)!=0)*1 #is 'elt' in the list ?
results.append(y)
return results
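A toy illustration with hand-made lists (which adjectives from the full list appear in a given review?):
get_dummy(['good', 'long'], ['good', 'bad', 'long']) #-> [1, 0, 1]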
data = get_adjectives(data)
data.head(n=3)
adj_list = adj_comprehensive_list(data)
data_adj = data.adjectives.apply(lambda row: get_dummy(row, adj_list))
data_adj = data_adj.apply(pd.Series)
data_adj = data_adj.rename(columns=dict(zip(range(len(adj_list)), adj_list)))
data_adj['title'] = data.title
data_adj.head()
cols = data_adj.columns.tolist()[:-1]
corr = data_adj[cols].corr()
mask = np.zeros_like(corr, dtype=bool) #np.bool is deprecated/removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(15, 14))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.1, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})
cols = data_adj.columns.tolist()[:-1]
cum_adj = data_adj[cols].sum(axis = 0).sort_values(ascending=False)
top_adj = list(cum_adj[:10].index)
fig = plt.figure( figsize=(15,5))
cum_adj[:100].plot(kind = 'bar')
#top most frequent adjectives
list(cum_adj[:10].index)
#top least frequent adjectives
list(cum_adj[-10:].index)
cmap_cust = sns.color_palette("husl", n_colors = 2)
neg_data = data[data['sentiment_label'] == 'negative']
neg_data_text = ' '.join(text for text in neg_data['review'])
neg_data_list = neg_data_text.split()
neg_data_list = [w.lower() for w in neg_data_list if w.lower() not in stopwords and w.isalpha()]
neg_counts = collections.Counter(neg_data_list)
top_neg_words = pd.DataFrame(neg_counts.most_common(50),columns=['words', 'count'])
pos_data = data[data['sentiment_label'] == 'positive']
pos_data_text = ' '.join(text for text in pos_data['review'])
pos_data_list = pos_data_text.split()
pos_data_list = [w.lower() for w in pos_data_list if w.lower() not in stopwords and w.isalpha()]
pos_counts = collections.Counter(pos_data_list)
top_pos_words = pd.DataFrame(pos_counts.most_common(50),columns=['words', 'count'])
fig, ax = plt.subplots(1, 2, figsize=(25,32))
top_pos_words.sort_values(by='count').plot.barh(x='words',y='count', ax=ax[0], color=cmap_cust[1])
ax[0].set_title("Positive reviews")
top_neg_words.sort_values(by='count').plot.barh(x='words',y='count', ax=ax[1], color=cmap_cust[0])
ax[1].set_title("Negative reviews")
plt.show()
train_data, validation_data = train_test_split(file_path=r'movies_reviews.csv', train_size = 0.8, binary=True)
train_data.sentiment_label.value_counts()
validation_data.sentiment_label.value_counts()
train_data_cleaned, train_data_cleaned_in_tuple_format = clean_data(train_data, limit = 0, normalize=True)
validation_data_cleaned, validation_data_cleaned_in_tuple_format = clean_data(validation_data, limit = 0, normalize=True)
train_data_cleaned.head()
train_docs = preprocess_data(train_data_cleaned_in_tuple_format, binary=True)
validation_docs = preprocess_data(validation_data_cleaned_in_tuple_format, binary=True)
spacy.tokens.DocBin(docs=train_docs).to_disk("./train.spacy")
spacy.tokens.DocBin(docs=validation_docs).to_disk("./valid.spacy")
!python -m spacy init fill-config --help
# step 1: file configuration: language (en)| pipeline (textcat)| metric (accuracy)
!python -m spacy init config --force config.cfg --lang en --pipeline textcat --optimize accuracy
## (alternative) if you instead start from a base_config.cfg downloaded from spaCy's quickstart widget, auto-fill the remaining defaults with:
#!python -m spacy init fill-config base_config.cfg config.cfg
# step 2: train the pipeline
## 1.1. training config + custom registered functions and code + data paths config
!python -m spacy train config.cfg --verbose --output ./training --paths.train train.spacy --paths.dev valid.spacy
test_data = pd.read_csv(r'books_reviews.csv')
test_data.dropna(axis = 0, how ='any',inplace=True)
test_data = test_data[test_data.sentiment_label.isin(['positive', 'negative'])].reset_index(drop=True)
test_data['sentiment_label'].value_counts()
test_data.head()
# predictions
test_data = make_predictions(test_data)
label_predictions = test_data['label_prediction']
label_sentiments = test_data['sentiment_label']
test_data.head()
# show performance metrics (without cleaning data)
display_evaluation_metrics(true_labels=label_sentiments, predicted_labels=label_predictions, positive_class='positive')
test_data_cleaned, _ = clean_data(test_data, limit = 0, normalize=True)
test_data_cleaned.head()
# predictions
test_data_cleaned = make_predictions(test_data_cleaned)
label_predictions = test_data_cleaned['label_prediction']
label_sentiments = test_data_cleaned['sentiment_label']
display_evaluation_metrics(true_labels=label_sentiments, predicted_labels=label_predictions, positive_class='positive')