# get IMDB Dataset: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# extract downloaded files
!tar xvzf aclImdb_v1.tar.gz
# After extraction, the aclImdb folder contains two sub-folders:
# * train
#     * pos reviews
#     * neg reviews
# * test
#     * pos reviews
#     * neg reviews
# Let's load the movie reviews into a pandas DataFrame and add the corresponding labels.
# https://stackoverflow.com/questions/33912773/python-read-txt-files-into-a-dataframe/33912971
import os
def fetch_reviews(path):
    """Read every review file under `path` and return the texts as a list."""
    data = []
    files = [f for f in os.listdir(path)]
    for file in files:
        with open(os.path.join(path, file), "r", encoding='utf8') as f:
            data.append(f.read())
    return data
import pandas as pd
df_train_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/train/pos/'), 'label': 1})
df_train_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/train/neg/'), 'label': 0})
df_test_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/test/pos/'), 'label': 1})
df_test_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/test/neg/'), 'label': 0})
# Merge all DataFrames for the data cleaning and preprocessing steps.
df = pd.concat([df_train_pos, df_train_neg, df_test_pos, df_test_neg], ignore_index=True)
print("Total reviews in df: ", df.shape)
df.head()
# Check for null values in the review column (there are none).
df['review'].isnull().sum()
print("Total Number of positive reviews in data: ", df[df['label']==1].shape[0])
print("Total Number of negative reviews in data: ", df[df['label']==0].shape[0])
# sample positive movie review
df[df['label']==1].sample(n=1)['review'].iloc[0]
# sample negative review
df[df['label']==0].sample(n=1)['review'].iloc[0]
# word count per review
import matplotlib.pyplot as plt
df['word_count'] = df['review'].apply(lambda x: len(x.split()))
df['word_count'].describe()
plt.boxplot(df['word_count'].values)
plt.grid(True)
plt.show()
import re
# import nltk
# nltk.download('punkt') # download these NLTK packages the first time you run this
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english') # defining stop_words
stop_words.remove('not') # keep 'not' out of the stop-word list, as it carries sentiment in negative reviews
lemmatizer = WordNetLemmatizer()
def data_preprocessing(review):
    # data cleaning
    review = re.sub(re.compile('<.*?>'), '', review)  # removing HTML tags
    review = re.sub('[^A-Za-z0-9]+', ' ', review)  # keeping only alphanumeric words
    # lowercase
    review = review.lower()
    # tokenization
    tokens = word_tokenize(review)  # converts the review to tokens
    # stop-word removal
    review = [word for word in tokens if word not in stop_words]
    # lemmatization
    review = [lemmatizer.lemmatize(word) for word in review]
    # join words in the preprocessed review
    review = ' '.join(review)
    return review
df['preprocessed_review'] = df['review'].apply(data_preprocessing)
df.head()
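# As a quick sanity check (a minimal sketch using the first row of df),
# compare one raw review with its preprocessed form:
sample = df['review'].iloc[0]
print('BEFORE:', sample[:200])
print('AFTER :', data_preprocessing(sample)[:200])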
from wordcloud import WordCloud
words_list = df[df['label']==1]['preprocessed_review'].unique().tolist()
pos_words = " ".join(words_list)
pos_wordcloud = WordCloud(width=800, height=500,
                          stopwords=stop_words).generate(pos_words)
plt.figure(figsize=(8, 8), facecolor = None)
plt.imshow(pos_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
words_list = df[df['label']==0]['preprocessed_review'].unique().tolist()
neg_words = " ".join(words_list)
neg_wordcloud = WordCloud(width=800, height=500,
                          stopwords=stop_words).generate(neg_words)
plt.figure(figsize=(8, 8), facecolor = None)
plt.imshow(neg_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
from sklearn.model_selection import train_test_split
data = df.copy()
y = data['label'].values
data.drop(['label'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, stratify=y)
print("Train data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(min_df=10)
X_train_review_bow = vect.fit_transform(X_train['preprocessed_review'])
X_test_review_bow = vect.transform(X_test['preprocessed_review'])
print('X_train_review_bow shape: ', X_train_review_bow.shape)
print('X_test_review_bow shape: ', X_test_review_bow.shape)
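# To see what features the bag-of-words model learned, inspect the fitted
# vocabulary (get_feature_names_out assumes scikit-learn >= 1.0):
print('Vocabulary size:', len(vect.vocabulary_))
print('Sample features:', vect.get_feature_names_out()[:10])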
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10)
X_train_review_tfidf = vectorizer.fit_transform(X_train['preprocessed_review'])
X_test_review_tfidf = vectorizer.transform(X_test['preprocessed_review'])
print('X_train_review_tfidf shape: ', X_train_review_tfidf.shape)
print('X_test_review_tfidf shape: ', X_test_review_tfidf.shape)
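# TF-IDF downweights words that appear in many reviews. As an illustrative
# sketch, list the terms with the lowest idf (i.e., the most common ones):
import numpy as np
terms = vectorizer.get_feature_names_out()
lowest_idf = np.argsort(vectorizer.idf_)[:10]
print([terms[i] for i in lowest_idf])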
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
clf = MultinomialNB()
clf.fit(X_train_review_bow, y_train)
y_pred = clf.predict(X_test_review_bow) #prediction from model
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
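# The smoothing parameter alpha could be tuned with a small grid search
# (a sketch; the grid values below are illustrative, not from the original):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(MultinomialNB(), {'alpha': [0.01, 0.1, 0.5, 1.0, 5.0]},
                    cv=5, scoring='accuracy')
grid.fit(X_train_review_bow, y_train)
print('Best alpha:', grid.best_params_['alpha'], 'CV accuracy:', grid.best_score_)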
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.show()
clf = MultinomialNB(alpha=1)  # alpha=1 (Laplace smoothing) is the default
clf.fit(X_train_review_tfidf, y_train)
y_pred = clf.predict(X_test_review_tfidf)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.show()
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='l1', solver='liblinear')  # liblinear supports the L1 penalty; the default lbfgs solver does not
clf.fit(X_train_review_tfidf, y_train)
y_pred = clf.predict(X_test_review_tfidf)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Test Confusion Matrix')
plt.show()
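# Logistic regression coefficients are interpretable: large positive weights
# push a review towards the positive class. A sketch of the top words per
# class (assumes the TF-IDF vectorizer fitted above):
import numpy as np
feature_names = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]
top_pos = np.argsort(coefs)[-10:][::-1]
top_neg = np.argsort(coefs)[:10]
print('Most positive words:', [feature_names[i] for i in top_pos])
print('Most negative words:', [feature_names[i] for i in top_neg])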
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ['Vectorizer', 'Model', 'Accuracy']
x.add_row(['BOW', 'Naive Bayes', '84.6%'])
x.add_row(['TFIDF', 'Naive Bayes', '85.3%'])
x.add_row(['TFIDF', 'Logistic Regression', '88.0%'])
print(x)
# Topic: Data Preprocessing
# Data Cleaning
import re
#sample review from the IMDB dataset.
review = "<b>A touching movie!!</b> It is full of emotions and wonderful acting.<br> I could have sat through it a second time."
cleaned_review = re.sub(re.compile('<.*?>'), '', review) #removing HTML tags
cleaned_review = re.sub('[^A-Za-z0-9]+', ' ', cleaned_review) #taking only words
print(cleaned_review)
# Lowercase
cleaned_review = cleaned_review.lower()
print(cleaned_review)
# Tokenization
# import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokens = word_tokenize(cleaned_review)  # uses the tokenizer imported above
print(tokens)
# Stop words removal
# nltk.download('stopwords') # you have to download the set of stop words the first time
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
filtered_review = [word for word in tokens if word not in stop_words] # removing stop words
print(filtered_review)
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_review = [stemmer.stem(word) for word in filtered_review]
print(stemmed_review)
# Lemmatization
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemm_review = [lemmatizer.lemmatize(word) for word in filtered_review]
print(lemm_review)
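# Stemming and lemmatization often disagree: stemming chops suffixes, while
# lemmatization maps to dictionary forms. A small sketch comparing the two
# outputs word by word (printing only the words where they differ):
for raw, stem, lemma in zip(filtered_review, stemmed_review, lemm_review):
    if stem != lemma:
        print(f'{raw} -> stem: {stem}, lemma: {lemma}')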