import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import seaborn as sns
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
# re-scraped dataset
df= pd.read_csv('https://raw.githubusercontent.com/emaleeeeeee/review_mini/main/REVIEWS%20RESCRAP.csv')
df.columns = ['Customer','Title','Review','Rating','Date']
df = df[['Title','Review']]
df = df.dropna()
# drop Spanish-language reviews by matching common Spanish words/phrases
spanish_markers = ['muy', 'bueno', 'en el', 'ninguno', 'tiempo', 'teléfonos', 'calidad de imagen']
spanish = df.index[df['Review'].str.contains('|'.join(spanish_markers))]
bad_df = df.index.isin(spanish)
df = df[~bad_df]
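# optional, illustrative check (not part of the pipeline): how many Spanish-looking reviews were filtered out
print(bad_df.sum())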
df
#Text cleaning
import re
import string
def text_cleaning(text):
    text = text.lower()
    # split clauses at "however"/"but" so each opinion ends up in its own sentence
    text = re.sub(r"however", ". however", text)
    text = re.sub(r"\bbut\b", ". but", text)
    # expand common contractions (straight and curly apostrophes)
    text = re.sub(r"won'?t", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub('\n', '', str(text))
    text = re.sub(r"’re", " are", text)
    text = re.sub(r"’s", " is", text)
    text = re.sub(r"’d", " would", text)
    text = re.sub(r"’ll", " will", text)
    text = re.sub(r"’t", " not", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"’ve", " have", text)
    text = re.sub(r"’m", " am", text)
    return text
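# quick illustrative check of the cleaning rules on a made-up sentence (not part of the pipeline):
print(text_cleaning("The screen’s great but I’m not sure it won't overheat"))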
df['Title']=df['Title'].apply(text_cleaning)
df['Review']=df['Review'].apply(text_cleaning)
df
# for example:
text = "this film is great but the movie was awful. The theatre was amazing"
print(sent_tokenize(text))
def nltk_sen(text):
    return sent_tokenize(text)
df['cut'] = df['Title'].apply(nltk_sen)
df
# strip the list brackets left over from sent_tokenize's output
df['cut'] = df['cut'].astype(str).str.replace("]", '', regex=False)
df['cut'] = df['cut'].str.replace("[", '', regex=False)
df['cut']
cut_df = pd.DataFrame(df['cut'])
#Link each review to an ID by matching the ID number to the index number
cut_df.index = np.arange(1, len(cut_df)+1)
cut_df['ID'] = cut_df.index
cut_df
#Splitting each review into one sentence per row & keeping the ID attached to the right review
new_df = pd.DataFrame(cut_df['cut'].str.split(',').tolist(), index=cut_df['ID']).stack()
new_df
#Reset index
new_df = new_df.reset_index(level='ID')
#Renaming columns
new_df.columns = ['ID', 'Sentence']
new_df['Sentence'] = new_df['Sentence'].astype(str).str.replace("'", '')
new_df
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
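# illustrative check on a few made-up words ('n' = noun, 'v' = verb, 'a' = adjective):
print(get_wordnet_pos('screen'), get_wordnet_pos('working'), get_wordnet_pos('great'))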
# POS-aware lemmatization with NLTK's WordNetLemmatizer
wnl = WordNetLemmatizer()
def lemmatize(s):
    lemmatized = [wnl.lemmatize(word, get_wordnet_pos(word)) for word in word_tokenize(s) if word not in string.punctuation]
    return lemmatized
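# illustrative only: lemmatize a made-up sentence to see the POS-aware output
print(lemmatize("the speakers were working better than expected"))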
new_df['Sentence'] = new_df['Sentence'].apply(lemmatize)
#flatten the token lists back into plain strings
new_df['Sentence'] = new_df['Sentence'].astype(str).str.replace("[", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace("]", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace("'", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace(",", '', regex=False)
new_df['Sentence']
! pip install rake_nltk
# one example of how it works:
from rake_nltk import Rake
r = Rake()  # by default uses NLTK's English stopwords and all punctuation characters as phrase delimiters
text = 'I bought this last Friday.'
r.extract_keywords_from_text(text)
b = r.get_ranked_phrases()
c = r.get_ranked_phrases_with_scores()
print(b)
print(c)
# define our stopwords
nltk.download('stopwords')
our_stopwords = ['also', 'would', 'one','still']
stopword = list(stopwords.words('english'))+our_stopwords
stopword.remove('not')
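# illustrative check (not part of the pipeline): 'not' should stay out of the stopword list so negation survives
print('not' in stopword, 'also' in stopword)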
def rake_package(text):
    r = Rake(stopwords=stopword)
    r.extract_keywords_from_text(text)
    extraction = r.get_ranked_phrases()
    return extraction
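# illustrative only: RAKE keyword extraction on one made-up lemmatized sentence
print(rake_package("the battery life be not good"))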
new_df['Rake'] = new_df['Sentence'].apply(rake_package)
new_df['Rake'] = new_df['Rake'].astype(str).str.replace("]", '', regex=False)
new_df['Rake'] = new_df['Rake'].str.replace("[", '', regex=False)
new_df['Rake']
df1 = new_df[['ID','Rake']]
#Splitting each RAKE result into one phrase per row & keeping the ID attached to the right review
df1 = pd.DataFrame(new_df['Rake'].str.split(',').tolist(), index=new_df['ID']).stack()
df1
#Reset index
df1 = df1.reset_index(level='ID')
#Renaming columns
df1.columns = ['ID', 'subject']
#Text cleaning
import re
import string
def text_cleaning(text):
    #strip all punctuation and collapse repeated whitespace
    text = re.sub('[%s]' % re.escape(string.punctuation), '', str(text))
    text = text.strip()
    text = " ".join(text.split())
    return text
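# illustrative only: the second cleaning pass on a made-up RAKE phrase
print(text_cleaning("  battery life!! "))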
df1['subject_clean'] = df1['subject'].apply(text_cleaning)
df1['Len'] = df1['subject_clean'].apply(lambda x: len(word_tokenize(x)))
df1
df1['subject_clean'].value_counts().head(10)
df1 = df1[df1.Len != 0]
df1
#single-word phrases, plus any phrase that mentions 'use'
Len = df1['Len'] == 1
Str = df1['subject_clean'].str.contains('use')
df1.loc[Str | Len].value_counts()
#inspect the most common phrases that contain exactly two words
df1.loc[df1['Len']==2,'subject_clean'].value_counts().head(30)
#spot-check the phrases extracted for a single review (ID 41)
df1.loc[df1['ID']==41]
df1
#2021/10/17
df1.to_csv('Title Extraction.csv',encoding='utf-8-sig')