Challenge 2: Analyse sentiment from GoodReads vs IMDB reviews
What is sentiment analysis?
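In a nutshell, sentiment analysis assigns a piece of text a polarity score (how negative or positive it is) and, in the TextBlob flavour used throughout this notebook, a subjectivity score (how opinionated it is). A minimal illustrative sketch on a hand-made sentence (values are approximate):
from textblob import TextBlob
blob = TextBlob("What a wonderful book!")
blob.sentiment #polarity in [-1, 1] (here close to +1), subjectivity in [0, 1] (here close to 1)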
Brief presentation of GoodReads and IMDB
!python -m pip install --upgrade pip
!pip install -U spaCy #to install spaCy
!python -m spacy download en_core_web_lg #download the large English language model ('en_core_web_sm': small, 'en_core_web_md': medium, 'en_core_web_lg': large); the old 'en' shortcut and 'spacy.en.download' commands are deprecated in spaCy v3
!pip install spacytextblob
!pip install spacy-wordnet
!pip install bs4
!pip install selenium
!pip install requests
#!pip install Goodreads
!pip install ml_datasets #to download IMDB reviews data
import pandas as pd
import numpy as np
import collections
import string
import time
import tqdm
import re
import os
import random
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
from html.parser import HTMLParser
import html
from wordcloud import WordCloud
import spacy
#spacy.load("en_core_web_lg") #sm: small English model, md: medium English model, lg: large English model
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from spacy_wordnet.wordnet_annotator import WordnetAnnotator as wordnet
import textblob
import unicodedata
import nltk
nltk.download('wordnet')
#import pattern.en
from spacytextblob.spacytextblob import SpacyTextBlob #importing the module registers the 'spacytextblob' pipeline component
print(f'spaCy version : {spacy.__version__}') #spaCy v3.1.3
from sklearn import model_selection
#from sklearn import ensemble
from sklearn import metrics
from sklearn.feature_extraction import text
import ml_datasets
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('spacy_wordnet')
nlp.add_pipe('spacytextblob')
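As an optional sanity check (not required for the rest of the notebook), the pipeline components can be listed to confirm that the two custom ones were registered:
print(nlp.pipe_names) #the standard en_core_web_lg components followed by 'spacy_wordnet' and 'spacytextblob'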
CONTRACTIONS_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
Helper functions for book reviews (web) scraping
def get_book_title(soup):
try:
book_title = ' '.join(soup.find('h1', {'id': 'bookTitle'}).text.split())
return book_title
except:
return "NA"
def get_author_name(soup):
try:
author_name = ' '.join(soup.find('span', {'itemprop': 'name'}).text.split())
return author_name
except:
return "NA"
def get_edition_language(soup):
try:
edition_language = soup.find('div', {'itemprop': 'inLanguage'}).text.strip()
return edition_language
except:
return "NA"
def get_isbn(soup):
try:
isbn = re.findall(r'nisbn: [0-9]{10}' , str(soup))[0].split()[1]
return isbn
except:
return ""
def get_genres(soup):
genres = []
for node in soup.find_all('div', {'class': 'left'}):
current_genres = node.find_all('a', {'class': 'actionLinkLite bookPageGenreLink'})
current_genre = ' > '.join([g.text for g in current_genres])
if current_genre.strip():
genres.append(current_genre)
return genres
def get_series_name(soup):
    try:
        series = soup.find(id="bookSeries").find("a")
        if series:
            series_name = re.search(r'\((.*?)\)', series.text).group(1)
            return series_name
        return "NA"
    except:
        return "NA"
def get_num_pages(soup):
if soup.find('span', {'itemprop': 'numberOfPages'}):
num_pages = soup.find('span', {'itemprop': 'numberOfPages'}).text.strip()
return int(num_pages.split()[0])
return "NA"
def get_year_first_published(soup):
year_first_published = soup.find('nobr', attrs={'class':'greyText'})
try:
if year_first_published:
year_first_published = year_first_published.string
return re.search('([0-9]{3,4})', year_first_published).group(1)
else:
return "NA"
except:
return "NA"
def get_book_distinction(soup):
list_of_awards = []
try:
book_distinction = soup.find_all('a', {'class': 'award'})
for award in book_distinction:
list_of_awards.append(award.text)
return list_of_awards
except:
return "NA"
def get_num_ratings(soup):
try:
num_ratings = soup.find('meta', {'itemprop': 'ratingCount'})['content'].strip()
return int(num_ratings)
except:
return "NA"
def get_num_reviews(soup):
try:
num_reviews = soup.find('meta', {'itemprop': 'reviewCount'})['content'].strip()
return int(num_reviews)
except:
return "NA"
def get_average_ratings(soup):
try:
average_ratings = soup.find('span', {'itemprop': 'ratingValue'}).text.strip()
return float(average_ratings)
except:
return "NA"
def get_author_profile(soup):
try:
about_the_author = ' '.join(soup.find('div', {'class': 'bookAuthorProfile__about'}).text.split())
        return about_the_author[:-8] #drop the trailing ' ...more'
except:
return "NA"
def get_book_synopsys(soup):
try:
book_synopsys = ' '.join(soup.find('div', {'id': 'description'}).text.split())
return book_synopsys[:-8] #remove ' ...more'
except:
return "NA"
def get_book_review(soup):
    ''' scrape the first review among the 1-30 available on the page '''
try:
book_review = ' '.join(soup.find('div', {'id': 'bookReviews'}).find_all('span', {'class': 'readable'})[0].text.split())
return book_review
except:
return "NA"
def book_review_scraper(book_page_url):
url = 'https://www.goodreads.com/book/show/' + str(book_page_url)
page_source = requests.get(url).content
soup = BeautifulSoup(page_source,'html.parser')
    time.sleep(random.randint(1,4)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
return {
'title': get_book_title(soup),
'author': get_author_name(soup),
'author_profile': get_author_profile(soup),
'year_first_published': get_year_first_published(soup),
'book_series': get_series_name(soup),
'edition_language': get_edition_language(soup),
'isbn': get_isbn(soup),
'synopsys': get_book_synopsys(soup),
'num_pages': get_num_pages(soup),
'genres': get_genres(soup),
'awards': get_book_distinction(soup),
'num_ratings': get_num_ratings(soup),
'num_reviews': get_num_reviews(soup),
'average_rating': get_average_ratings(soup),
'review': get_book_review(soup)
}
def book_scraper(book_page_url):
url = 'https://www.goodreads.com/book/show/' + str(book_page_url)
page_source = requests.get(url).content
soup = BeautifulSoup(page_source,'html.parser')
    time.sleep(random.randint(1,3)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
return {
'title': get_book_title(soup),
'author': get_author_name(soup),
'review': get_book_review(soup),
'average_rating': get_average_ratings(soup)
}
def books_web_scraper(first_url, last_url):
''' function to scrape reviews of (last_url - first_url+1) books '''
scraped_books = []
for book_page_url in tqdm.tqdm(range(first_url, last_url+1)):
book_review = book_scraper(book_page_url)
scraped_books.append(book_review)
return pd.DataFrame(scraped_books)
book_page_url = 1
book_review_scraper(book_page_url) #long definition
book_page_url = 1
book_scraper(book_page_url) #short definition
def get_movie_title(movie):
try:
movie_title = movie.find_all("h3", {"class": "lister-item-header"})[0].find("a").getText()
return movie_title
except:
return 'NA'
def get_release_year(movie):
try:
release_year = movie.find_all("h3", {"class": "lister-item-header"})[0].find("span", {"class": "lister-item-year text-muted unbold"}).getText()
return release_year
except:
return 'NA'
def get_genre(movie):
try:
movie_genre = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "genre"}).getText()
movie_genre = movie_genre.split()
_list_ = []
for genre in movie_genre:
if genre[-1] in list('[@_!#$%^&*()<>?/\|}{~:,]'):
genre = genre[:-1]
_list_.append(genre)
return _list_
except:
return 'NA'
def get_duration_in_min(movie):
try:
movie_duration = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "runtime"}).getText()
movie_duration = int(movie_duration[:-3]) #remove 'min'
return movie_duration
except:
return 'NA'
def get_certificate(movie):
try:
movie_certificate = movie.find_all("p", {"class": "text-muted"})[0].find("span", {"class": "certificate"}).getText()
try:
movie_certificate = int(movie_certificate)
movie_certificate = str(movie_certificate)+' years and above'
return movie_certificate
except:
return 'All public'
except:
return 'NA'
def get_synopsys(movie):
try:
movie_synopsys = movie.find_all("p", {"class": "text-muted"})[1].getText()
return movie_synopsys[1:]
except:
return 'NA'
def get_covers_url(movie):
try:
movie_covers_url = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('loadlate')
return movie_covers_url
except:
return 'NA'
def get_covers_id(movie):
try:
movie_covers_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
return movie_covers_id
except:
return 'NA'
def get_director_or_creator(movie):
try:
movie_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
url = f'http://www.imdb.com/title/{movie_id}/?ref_=adv_li_tt'
p_source = requests.get(url).content
sp = BeautifulSoup(p_source,'html.parser')
dr_or_cr = sp.find("a", {"class": "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link"}).getText()
return dr_or_cr
except:
return "NA"
def get_movie_average_ratings(movie): #renamed so it does not overwrite the Goodreads get_average_ratings defined above
    try:
        average_ratings = movie.find_all("div", {"class": "inline-block ratings-imdb-rating"})[0].getText()
        return float(average_ratings.split()[0])
    except:
        return "NA"
def get_movie_num_reviews(movie): #renamed so it does not overwrite the Goodreads get_num_reviews defined above
    try:
        num_reviews = movie.find_all("p", {"class": "sort-num_votes-visible"})[0].find("span", {"name": "nv"}).getText()
        num_reviews = float(num_reviews.replace(',', '')) #vote counts come with thousands separators, e.g. '1,234'
        return num_reviews
    except:
        return "NA"
def get_movie_review(movie):
try:
movie_id = movie.find("div", {"class": "lister-item-image float-left"}).find("img", "loadlate").get('data-tconst')
url = f'http://www.imdb.com/title/{movie_id}/reviews?spoiler=hide&sort=userRating&dir=desc&ratingFilter=0'
p_source = requests.get(url).content
sp = BeautifulSoup(p_source,'html.parser')
        review = ' '.join(sp.find("div", {"id": "main"}).find_all('div', {'class': 'text show-more__control'})[0].text.split())
return review
except:
return "NA"
def movie_review_scraper(movie):
return {
'title': get_movie_title(movie),
'movie_release_year': get_release_year(movie),
'movie_creator_or_director': get_director_or_creator(movie),
'movie_genre': get_genre(movie),
'movie_duration_in_min': get_duration_in_min(movie),
'movie_certificate': get_certificate(movie),
'movie_synopsys': get_synopsys(movie),
'movie_covers_url': get_covers_url(movie),
'movie_covers_id': get_covers_id(movie),
        'average_rating': get_movie_average_ratings(movie),
        'movie_num_reviews': get_movie_num_reviews(movie),
'review': get_movie_review(movie),
}
def movie_scraper(movie):
return {
'title': get_movie_title(movie),
'movie_release_year': get_release_year(movie),
'movie_creator_or_director': get_director_or_creator(movie),
'review': get_movie_review(movie),
        'average_rating': get_movie_average_ratings(movie),
}
def movies_web_scraper(max_page):
''' function to scrape reviews of (max_page*50) movies within a period of release '''
scraped_movies = []
for page_num in tqdm.tqdm(range(0, max_page)):
next_records_id = 0 if page_num==0 else (50*page_num)+1
url_being_queried = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2021-12-31&start={next_records_id}&ref_=adv_nxt'
page_source = requests.get(url_being_queried).content
soup = BeautifulSoup(page_source,'html.parser')
        time.sleep(random.randint(1,3)) #pause so that the server does not refuse the connection after too many requests from the same IP address in a short period of time
movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})
for movie in movies_list:
_50_movies_reviews = movie_scraper(movie)
scraped_movies.append(_50_movies_reviews)
return pd.DataFrame(scraped_movies)
next_records_id = 0
url_being_queried = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2021-12-31&start={next_records_id}&ref_=adv_nxt'
page_source = requests.get(url_being_queried).content
soup = BeautifulSoup(page_source,'html.parser')
time.sleep(2)
movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})
movie = movies_list[4]
movie_review_scraper(movie) #long definition
movie_scraper(movie) #short definition
def get_sentiment_score(review):
''' function to return the sentiment score given a review. '''
doc = nlp(review)
return doc._.polarity
def get_sentiment_label(review):
''' function to return the sentiment label given a review. '''
doc = nlp(review)
### the polarity is scored on a scale from -1 (negative) to +1 (positive)
if doc._.polarity > 0:
result = 'positive'
elif doc._.polarity < 0:
result = 'negative'
else:
result = 'neutral'
return result
def get_subjectivity_score(review):
''' function to return the subjectivity score given a review. '''
### the subjectivity is scored on a scale from 0 (fact - objective) to +1 (opinion - subjective)
doc = nlp(review)
return doc._.subjectivity
reviewA = 'Black Panthers is not a well-known movie.'
get_sentiment_score(reviewA), get_sentiment_label(reviewA), get_subjectivity_score(reviewA)
reviewB = 'A Smell Of Burning tells a brilliant history.'
get_sentiment_score(reviewB), get_sentiment_label(reviewB), get_subjectivity_score(reviewB)
reviewC = 'Titanic is a movie.'
get_sentiment_score(reviewC), get_sentiment_label(reviewC), get_subjectivity_score(reviewC)
def prepare_data(file_path, path_to_save):
"""
file_path: r'data/df_name.csv'
path_to_save: r'data/df_name.csv'
    returns: df with additional features (labels & scores)
"""
#load
data = pd.read_csv(file_path)
#additional features
data['sentiment_score'] = data['review'].astype(str).apply(get_sentiment_score)
data['sentiment_label'] = data['review'].astype(str).apply(get_sentiment_label)
data['subjectivity_score'] = data['review'].astype(str).apply(get_subjectivity_score)
#keep only
data = data[['title','review', 'subjectivity_score', 'sentiment_score', 'sentiment_label']]
#save
data.to_csv(path_to_save, encoding='utf-8')
    #preview and return the enriched dataframe
    print(data.head(n=3))
    return data
def train_test_split(file_path, train_size = 0.8, binary=True):
"""
    split the data into train and validation sets.
    stratified sampling prevents the shortcomings of unbalanced data.
"""
data = pd.read_csv(file_path)
if (binary):
data = data[data.sentiment_label.isin(['positive', 'negative'])].reset_index()
test_rate = 1 - train_size
train, validation = model_selection.train_test_split(
data, test_size= test_rate, random_state=42, shuffle=True, stratify= data.sentiment_label
)
return train, validation
class MLStripper(HTMLParser):
"""
Function to remove unnecessary HTML characters from text documents
"""
def __init__(self):
super().__init__()
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ' '.join(self.fed)
def strip_html(text):
html_stripper = MLStripper()
html_stripper.feed(text)
return html_stripper.get_data()
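A small illustrative call on a hand-made string (not from the scraped data), to show what the stripper returns:
strip_html("<b>Great</b> movie!<br/>A must-see.") #-> roughly 'Great  movie! A must-see.' (tags removed, text chunks joined with spaces)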
def expand_contractions(text, contraction_mapping):
"""
function for expanding contractions. It takes in a body of text and returns
the same with its contractions expanded if there is a match.
"""
contractions_pattern = re.compile(
'({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL
)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
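An illustrative call on a hand-made sentence (not from the scraped data):
expand_contractions("I can't believe it's this good!", CONTRACTIONS_MAP) #-> 'I cannot believe it is this good!'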
def lemmatize_text(text):
"""
Function to lemmatize text based on POS tags.
It takes in a body of text data and lemmatizes each word of the text based
on its POS tag if it is present and then returns the lemmatized text back.
"""
lemmatized_tokens = [token.lemma_.lower()
for token in nlp(text)]
lemmatized_text = ' '.join(lemmatized_tokens)
return lemmatized_text
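An illustrative call on a hand-made sentence; the exact lemmas depend on the spaCy model version:
lemmatize_text("The movies were amazing") #-> roughly 'the movie be amazing'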
def normalize_accented_characters(text):
"""
Function to normalize special accented characters and convert
them into regular ASCII characters so as to standardize the text across all reviews.
"""
text = unicodedata.normalize( 'NFKD', text ).encode('ascii', 'ignore').decode('utf8')
return text
def tokenize_text(text):
tokens = nlp(text)
tokens = [token.text.strip() for token in tokens]
return tokens
def remove_stopwords(text):
tokens = tokenize_text(text)
filtered_tokens = [
token for token in tokens if token not in stopwords
]
filtered_text = ' '.join(filtered_tokens)
return filtered_text
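Note that spaCy's default stop-word list also contains negations such as 'not', which can flip the meaning of a review; a small hand-made illustration:
remove_stopwords("this is not a good movie") #-> 'good movie'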
def remove_special_characters(text):
"""
Function to remove unnecessary characters and symbols from the data.
"""
tokens = tokenize_text(text)
pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
filtered_text = ' '.join(filtered_tokens)
return filtered_text
def keep_text_characters(text):
"""
Function to only extract text tokens from a body of text for which we use regular expressions.
"""
filtered_tokens = []
tokens = tokenize_text(text)
for token in tokens:
if re.search('[a-zA-Z]', token):
filtered_tokens.append(token)
filtered_text = ' '.join(filtered_tokens)
return filtered_text
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
"""
Overall text normalization function for contractions, lemmatization, HTML unescaping,
special characters removal, and stopwords removal functions.
"""
normalized_corpus = []
for index, review in tqdm.tqdm(enumerate(corpus)):
review = normalize_accented_characters(review)
review = html.unescape(review)
review = strip_html(review)
review = expand_contractions(review, CONTRACTIONS_MAP)
if lemmatize:
review = lemmatize_text(review)
else:
review = review.lower()
review = remove_special_characters(review)
review = remove_stopwords(review)
if only_text_chars:
review = keep_text_characters(review)
if tokenize:
review = tokenize_text(review)
normalized_corpus.append(review)
else:
normalized_corpus.append(review)
return normalized_corpus
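An illustrative run on a tiny hand-made corpus (not the scraped reviews), just to see the whole chain at work; the exact output depends on the spaCy model version:
normalize_corpus(["I didn't like it... way too long!", "BEST movie <b>ever</b> :)"], lemmatize=True, only_text_chars=True)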
def clean_data(data, limit=0, normalize=True):
"""
this will take a list of texts and labels
and transform them in spacy documents
data: list(tuple(text, label))
returns: List(spacy.Doc.doc)
"""
# drop missing values
data.dropna(axis = 0, how ='any',inplace=True)
    # class distribution
print(data['sentiment_label'].value_counts())
# clean by removing 'emoji', 'url', etc.
if (normalize):
corpus = data['review']
data['review'] = normalize_corpus(corpus, lemmatize=True, only_text_chars=True, tokenize=False)
    # optionally truncate the data
    if limit:
        # shuffle first to eliminate any possible bias from the order in which reviews were loaded
        data = data.sample(frac=1, random_state=42).reset_index(drop=True)
        # rows to keep
        data = data[:limit]
    # transform into a tuple of (review, label) pairs
data_in_tuple_format = tuple(zip(data['review'].tolist(), data['sentiment_label'].tolist()))
return data, data_in_tuple_format
def preprocess_data(data, binary=True):
"""
Function to turn the reviews text and the labels into neat spaCy Doc Objects.
returns: List(spacy.Doc.doc)
"""
docs = []
if (binary):
for doc, label in tqdm.tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
# we need to set the (text)cat(egory) for each document
if (label=='positive'):
doc.cats['positive'] = 1
doc.cats['negative'] = 0
else:
doc.cats['positive'] = 0
doc.cats['negative'] = 1
# put them into a nice list
docs.append(doc)
else:
for doc, label in tqdm.tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
# we need to set the label for each review
if (label=='positive'):
doc.cats['positive'] = 1
doc.cats['negative'] = 0
doc.cats['neutral'] = 0
elif (label=='negative'):
doc.cats['positive'] = 0
doc.cats['negative'] = 1
doc.cats['neutral'] = 0
else:
doc.cats['positive'] = 0
doc.cats['negative'] = 0
doc.cats['neutral'] = 1
# put them into a nice list
docs.append(doc)
return docs
def display_evaluation_metrics(true_labels, predicted_labels, positive_class=1):
"""
We will be evaluating our model based on precision, recall, accuracy, and F1-score,
which are suitable metrics for text classification.
"""
accuracy = np.round( metrics.accuracy_score(true_labels, predicted_labels),2)
precision = np.round( metrics.precision_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
recall = np.round( metrics.recall_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
f1_score = np.round( metrics.f1_score(true_labels, predicted_labels, pos_label=positive_class, average='binary'), 2)
return {"accuracy": accuracy, "precision": precision, "recall": recall, "f-score": f1_score}
def make_predictions(dataframe):
""" Generate predictions """
scores = []
labels = []
for row in tqdm.tqdm(range(len(dataframe))):
scores.append(predict_score(dataframe['review'].iloc[row]))
labels.append(predict_label(dataframe['review'].iloc[row]))
dataframe['score_prediction'] = scores
dataframe['label_prediction'] = labels
return dataframe
_trained_model = None
def load_trained_model():
    """ Load the saved trained model ('model-best') once and cache it for subsequent calls. """
    global _trained_model
    if _trained_model is None:
        _trained_model = spacy.load("training/model-best") #best checkpoint written by 'spacy train'
    return _trained_model
def predict_score(input_data: str):
    """ Score assigned to the 'positive' category by the trained model. """
    return load_trained_model()(input_data).cats["positive"]
def predict_label(input_data: str):
    """ Predicted label: whichever of 'positive' / 'negative' gets the higher score. """
    cats = load_trained_model()(input_data).cats
    return 'positive' if cats["positive"] > cats["negative"] else 'negative'
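Once the pipeline has been trained and saved to 'training/model-best' (see the training step further down), these helpers can be called on any raw review string; the call below is illustrative and commented out because the trained model does not exist yet at this point in the notebook:
#predict_label("An absolute masterpiece, I loved every minute."), predict_score("An absolute masterpiece, I loved every minute.")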
first_url, last_url = 1, 2000
books_reviews_df = books_web_scraper(first_url, last_url)
books_reviews_df.head(n=3)
books_reviews_df.to_csv(r'books_reviews_df.csv', encoding='utf-8')
max_page = 40 #50 movies per page, i.e. 40*50 = 2,000 movie reviews
movies_reviews_df = movies_web_scraper(max_page)
movies_reviews_df.head(n=3)
movies_reviews_df.to_csv('movies_reviews_df.csv', encoding='utf-8')
prepare_data(file_path=r'movies_reviews_df.csv', path_to_save=r'movies_reviews.csv')
prepare_data(file_path=r'books_reviews_df.csv', path_to_save=r'books_reviews.csv')
data = pd.read_csv(r'movies_reviews.csv')
data.dropna(axis = 0, how ='any',inplace=True)
data = data[data.sentiment_label.isin(['positive', 'negative'])].reset_index(drop=True)
data['sentiment_label'].value_counts()
data['sentiment_label'].value_counts().plot(kind='bar')
fig, ax = plt.subplots(1, 3, figsize=(25,32))
wordcloud_global = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data['review']))
wordcloud_neg = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data[data['sentiment_label']=='negative']['review']))
wordcloud_pos = WordCloud(background_color='white',width=1000, height=1000).generate(' '.join(data[data['sentiment_label']=='positive']['review']))
ax[0].imshow(wordcloud_global, interpolation="bilinear")
ax[0].axis('off')
ax[0].set_title('All movie reviews')
ax[1].imshow(wordcloud_pos, interpolation="bilinear")
ax[1].axis('off')
ax[1].set_title('Positive movie reviews')
ax[2].imshow(wordcloud_neg, interpolation="bilinear")
ax[2].axis('off')
ax[2].set_title('Negative movie reviews')
plt.show()
def get_adjectives(dataset):
"""
Here we create a new feature (called 'adjective_words') in the datatset.
"""
adjectives = []
for doc in dataset['review']:
row = nlp(doc)
tokens = [
token.lemma_.lower().strip() for token in row if token.pos_ == 'ADJ'
]
adjectives.append(tokens)
dataset['adjectives'] = pd.Series(adjectives)
return dataset
def adj_comprehensive_list(data):
""" unique values for adjectives used in reviews """
adj_values = []
for _list_ in data.adjectives:
for adj in _list_:
if adj_values.count(adj)==0:
adj_values.append(adj)
return adj_values
def get_dummy(sub_liste, _list_):
"""
Function that creates as many dummie variables as there are adjectives.
Input:
sub_list: adjectives used in a review.
_list_: extension definition of genre feature.
"""
results=[]
for elt in _list_:
y = (sub_liste.count(elt)!=0)*1 #is 'elt' in the list ?
results.append(y)
return results
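A toy illustration with hand-made lists (which adjectives from the full list appear in a given review?):
get_dummy(['good', 'long'], ['good', 'bad', 'long']) #-> [1, 0, 1]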
data = get_adjectives(data)
data.head(n=3)
adj_list = adj_comprehensive_list(data)
data_adj = data.adjectives.apply(lambda row: get_dummy(row, adj_list))
data_adj = data_adj.apply(pd.Series)
data_adj = data_adj.rename(columns=dict(zip(range(len(adj_list)), adj_list)))
data_adj['title'] = data.title
data_adj.head()
cols = data_adj.columns.tolist()[:-1]
corr = data_adj[cols].corr()
mask = np.zeros_like(corr, dtype=bool) #np.bool is deprecated/removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(15, 14))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.1, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})
cols = data_adj.columns.tolist()[:-1]
cum_adj = data_adj[cols].sum(axis = 0).sort_values(ascending=False)
top_adj = list(cum_adj[:10].index)
fig = plt.figure( figsize=(15,5))
cum_adj[:100].plot(kind = 'bar')
#top most frequent adjectives
list(cum_adj[:10].index)
#top least frequent adjectives
list(cum_adj[-10:].index)
cmap_cust = sns.color_palette("husl", n_colors = 2)
neg_data = data[data['sentiment_label'] == 'negative']
neg_data_text = ' '.join(text for text in neg_data['review'])
neg_data_list = neg_data_text.split()
neg_data_list = [w.lower() for w in neg_data_list if w.lower() not in stopwords and w.isalpha()]
neg_counts = collections.Counter(neg_data_list)
top_neg_words = pd.DataFrame(neg_counts.most_common(50),columns=['words', 'count'])
pos_data = data[data['sentiment_label'] == 'positive']
pos_data_text = ' '.join(text for text in pos_data['review'])
pos_data_list = pos_data_text.split()
pos_data_list = [w.lower() for w in pos_data_list if w.lower() not in stopwords and w.isalpha()]
pos_counts = collections.Counter(pos_data_list)
top_pos_words = pd.DataFrame(pos_counts.most_common(50),columns=['words', 'count'])
fig, ax = plt.subplots(1, 2, figsize=(25,32))
top_pos_words.sort_values(by='count').plot.barh(x='words',y='count', ax=ax[0], color=cmap_cust[1])
ax[0].set_title("Positive reviews")
top_neg_words.sort_values(by='count').plot.barh(x='words',y='count', ax=ax[1], color=cmap_cust[0])
ax[1].set_title("Negative reviews")
plt.show()
train_data, validation_data = train_test_split(file_path=r'movies_reviews.csv', train_size = 0.8, binary=True)
train_data.sentiment_label.value_counts()
validation_data.sentiment_label.value_counts()
train_data_cleaned, train_data_cleaned_in_tuple_format = clean_data(train_data, limit = 0, normalize=True)
validation_data_cleaned, validation_data_cleaned_in_tuple_format = clean_data(validation_data, limit = 0, normalize=True)
train_data_cleaned.head()
train_docs = preprocess_data(train_data_cleaned_in_tuple_format, binary=True)
validation_docs = preprocess_data(validation_data_cleaned_in_tuple_format, binary=True)
spacy.tokens.DocBin(docs=train_docs).to_disk("./train.spacy")
spacy.tokens.DocBin(docs=validation_docs).to_disk("./valid.spacy")
!python -m spacy init fill-config --help
# step 1: file configuration: language (en)| pipeline (textcat)| metric (accuracy)
!python -m spacy init config --force config.cfg --lang en --pipeline textcat --optimize accuracy
## (alternative) if you instead start from a base_config.cfg downloaded from spaCy's quickstart widget, auto-fill the remaining defaults with:
#!python -m spacy init fill-config base_config.cfg config.cfg
# step 2: train the pipeline
## 1.1. training config + custom registered functions and code + data paths config
!python -m spacy train config.cfg --verbose --output ./training --paths.train train.spacy --paths.dev valid.spacy
test_data = pd.read_csv(r'books_reviews.csv')
test_data.dropna(axis = 0, how ='any',inplace=True)
test_data = test_data[test_data.sentiment_label.isin(['positive', 'negative'])].reset_index(drop=True)
test_data['sentiment_label'].value_counts()
test_data.head()
# predictions
test_data = make_predictions(test_data)
label_predictions = test_data['label_prediction']
label_sentiments = test_data['sentiment_label']
test_data.head()
# show performance metrics (without cleaning data)
display_evaluation_metrics(true_labels=label_sentiments, predicted_labels=label_predictions, positive_class='positive')
test_data_cleaned, _ = clean_data(test_data, limit = 0, normalize=True)
test_data_cleaned.head()
# predictions
test_data_cleaned = make_predictions(test_data_cleaned)
label_predictions = test_data_cleaned['label_prediction']
label_sentiments = test_data_cleaned['sentiment_label']
display_evaluation_metrics(true_labels=label_sentiments, predicted_labels=label_predictions, positive_class='positive')