import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import seaborn as sns
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
# re-scraped dataset
df= pd.read_csv('https://raw.githubusercontent.com/emaleeeeeee/review_mini/main/REVIEWS%20RESCRAP.csv')
df.columns = ['Customer','Title','Review','Rating','Date']
df = df[['Title','Review']]
df = df.dropna()
# drop Spanish-language reviews by matching common Spanish words/phrases
spanish_markers = ['muy', 'bueno', 'en el', 'ninguno', 'tiempo', 'teléfonos', 'calidad de imagen']
spanish = df.index[df['Review'].str.contains('|'.join(spanish_markers))]
bad_df = df.index.isin(spanish)
df = df[~bad_df]
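# optional, illustrative check (not part of the pipeline): how many Spanish-looking reviews were filtered out
print(bad_df.sum())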
df
#Text cleaning
import re
import string
def text_cleaning(text):
    text = text.lower()
    # split clauses at "however"/"but" so each opinion ends up in its own sentence
    text = re.sub(r"however", ". however", text)
    text = re.sub(r"\bbut\b", ". but", text)
    # expand common contractions (straight and curly apostrophes)
    text = re.sub(r"won'?t", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub('\n', '', str(text))
    text = re.sub(r"’re", " are", text)
    text = re.sub(r"’s", " is", text)
    text = re.sub(r"’d", " would", text)
    text = re.sub(r"’ll", " will", text)
    text = re.sub(r"’t", " not", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"’ve", " have", text)
    text = re.sub(r"’m", " am", text)
    return text
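# quick illustrative check of the cleaning rules on a made-up sentence (not part of the pipeline):
print(text_cleaning("The screen’s great but I’m not sure it won't overheat"))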
df['Title']=df['Title'].apply(text_cleaning)
df['Review']=df['Review'].apply(text_cleaning)
df
# for example:
text = "this film is great but the movie was awful. The theatre was amazing"
print(sent_tokenize(text))
def nltk_sen(text):
    return sent_tokenize(text)
df['cut'] = df['Title'].apply(nltk_sen)
df
# strip the list brackets left over from sent_tokenize's output
df['cut'] = df['cut'].astype(str).str.replace("]", '', regex=False)
df['cut'] = df['cut'].str.replace("[", '', regex=False)
df['cut']
cut_df = pd.DataFrame(df['cut'])
#Link each review to an ID by matching the ID number to the index number
cut_df.index = np.arange(1, len(cut_df)+1)
cut_df['ID'] = cut_df.index
cut_df
#Splitting each review into one sentence per row & keeping the ID attached to the right review
new_df = pd.DataFrame(cut_df['cut'].str.split(',').tolist(), index=cut_df['ID']).stack()
new_df
#Reset index
new_df = new_df.reset_index(level='ID')
#Renaming columns
new_df.columns = ['ID', 'Sentence']
new_df['Sentence'] = new_df['Sentence'].astype(str).str.replace("'", '')
new_df
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
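# illustrative check on a few made-up words ('n' = noun, 'v' = verb, 'a' = adjective):
print(get_wordnet_pos('screen'), get_wordnet_pos('working'), get_wordnet_pos('great'))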
# POS-aware lemmatization with NLTK's WordNetLemmatizer
wnl = WordNetLemmatizer()
def lemmatize(s):
    lemmatized = [wnl.lemmatize(word, get_wordnet_pos(word)) for word in word_tokenize(s) if word not in string.punctuation]
    return lemmatized
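# illustrative only: lemmatize a made-up sentence to see the POS-aware output
print(lemmatize("the speakers were working better than expected"))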
new_df['Sentence'] = new_df['Sentence'].apply(lemmatize)
#flatten the token lists back into plain strings
new_df['Sentence'] = new_df['Sentence'].astype(str).str.replace("[", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace("]", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace("'", '', regex=False)
new_df['Sentence'] = new_df['Sentence'].str.replace(",", '', regex=False)
new_df['Sentence']
! pip install rake_nltk
# one example of how it works:
from rake_nltk import Rake
r = Rake()  # by default uses NLTK's English stopwords and all punctuation characters as phrase delimiters
text = 'I bought this last Friday.'
r.extract_keywords_from_text(text)
b = r.get_ranked_phrases()
c = r.get_ranked_phrases_with_scores()
print(b)
print(c)
# define our stopwords
nltk.download('stopwords')
our_stopwords = ['also', 'would', 'one','still']
stopword = list(stopwords.words('english'))+our_stopwords
stopword.remove('not')
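# illustrative check (not part of the pipeline): 'not' should stay out of the stopword list so negation survives
print('not' in stopword, 'also' in stopword)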
def rake_package(text):
    r = Rake(stopwords=stopword)
    r.extract_keywords_from_text(text)
    extraction = r.get_ranked_phrases()
    return extraction
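# illustrative only: RAKE keyword extraction on one made-up lemmatized sentence
print(rake_package("the battery life be not good"))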
new_df['Rake'] = new_df['Sentence'].apply(rake_package)
new_df['Rake'] = new_df['Rake'].astype(str).str.replace("]", '', regex=False)
new_df['Rake'] = new_df['Rake'].str.replace("[", '', regex=False)
new_df['Rake']
df1 = new_df[['ID','Rake']]
#Splitting each RAKE result into one phrase per row & keeping the ID attached to the right review
df1 = pd.DataFrame(new_df['Rake'].str.split(',').tolist(), index=new_df['ID']).stack()
df1
#Reset index
df1 = df1.reset_index(level='ID')
#Renaming columns
df1.columns = ['ID', 'subject']
#Text cleaning
import re
import string
def text_cleaning(text):
    #strip all punctuation and collapse repeated whitespace
    text = re.sub('[%s]' % re.escape(string.punctuation), '', str(text))
    text = text.strip()
    text = " ".join(text.split())
    return text
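# illustrative only: the second cleaning pass on a made-up RAKE phrase
print(text_cleaning("  battery life!! "))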
df1['subject_clean'] = df1['subject'].apply(text_cleaning)
df1['Len'] = df1['subject_clean'].apply(lambda x: len(word_tokenize(x)))
df1
df1['subject_clean'].value_counts().head(10)
df1 = df1[df1.Len != 0]
df1
#single-word phrases, plus any phrase that mentions 'use'
Len = df1['Len'] == 1
Str = df1['subject_clean'].str.contains('use')
df1.loc[Str | Len].value_counts()
#inspect the most common phrases that contain exactly two words
df1.loc[df1['Len']==2,'subject_clean'].value_counts().head(30)
#spot-check the phrases extracted for a single review (ID 41)
df1.loc[df1['ID']==41]
df1
#2021/10/17
df1.to_csv('Title Extraction.csv',encoding='utf-8-sig')