!pip install contractions
!pip install unidecode

import re
import random
import warnings
from collections import Counter, defaultdict

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.gridspec as gridspec  # used by the word-cloud grids below
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
import contractions
import unidecode
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # used by lemmati() below

# Keep the wordcloud stopword set under its own name so it does not shadow
# nltk.corpus.stopwords, which supplies the English stopword list later on.
wc_stopwords = set(STOPWORDS)

warnings.filterwarnings('ignore')
%matplotlib inline

from google.colab import drive
drive.mount('/content/gdrive')
train=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/train.csv')
test=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/test.csv')
train.info()
test.info()
train.isnull().sum()
test.isnull().sum()
train.head()
test.head()
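# The slices train.iloc[:, 2:14] used throughout assume the 12 binary
# topic/sentiment labels sit in columns 2..13, after an id column and the
# 'Review' text. A quick sanity check (a minimal sketch under that assumption):
target_cols = train.columns[2:14]
print(target_cols.tolist())
print(train[target_cols].sum())  # positive-class count per label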
fig = plt.subplots(figsize=(15, 15))
for i, col in enumerate(train.iloc[:, 2:14].columns.values):
    _ = plt.subplot(6, 2, i + 1)
    _ = sns.countplot(x=train[col], hue=train[col])
    _ = plt.title(col + ' Topic Distribution', fontsize=15)
    _ = plt.xlabel(col, fontsize=10)
    _ = plt.xticks(fontsize=15)
    _ = plt.tight_layout()
plt.show()
for col in train.iloc[:, 2:14].columns.values:
    print("=" * 100)
    print(train[col].value_counts())
    print("-" * 100)
fig = plt.figure(figsize=(30, 30), constrained_layout=True)
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 2:8].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=outer[i],
                                             wspace=0.1, hspace=0.1)
    for x in train[col].unique():  # binary labels: x is 0 or 1
        ax = plt.Subplot(fig, inner[x])
        # join the raw texts (Series.to_string() would drag the index in)
        wc = WordCloud(background_color="white", max_words=100,
                       stopwords=wc_stopwords, max_font_size=40,
                       random_state=42).generate(
                           " ".join(train[train[col] == x]['Review'].astype(str)))
        fig.add_subplot(ax)
        plt.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
# constrained_layout already manages spacing, so tight_layout() is not needed
fig = plt.figure(figsize=(30, 30), constrained_layout=True)
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 8:14].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=outer[i],
                                             wspace=0.1, hspace=0.1)
    for x in train[col].unique():
        ax = plt.Subplot(fig, inner[x])
        wc = WordCloud(background_color="white", max_words=100,
                       stopwords=wc_stopwords, max_font_size=40,
                       random_state=42).generate(
                           " ".join(train[train[col] == x]['Review'].astype(str)))
        fig.add_subplot(ax)
        plt.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
_=plt.figure(figsize=(8, 5))
_=sns.histplot(train['Review'].str.len(),kde=True)
_=plt.title( 'Number of characters in Reviews',fontsize=15)
sns.boxplot(y=train['Review'].str.len());
train['Review'].str.len().describe()
_=plt.figure(figsize=(8, 5))
_=sns.histplot(train['Review'].str.split().map(lambda x: len(x)),kde=True,color='Green')
_=plt.title( 'Number of Words in Reviews',fontsize=15)
sns.boxplot(y=train['Review'].str.split().map(lambda x: len(x)),color='Green');
train['Review'].str.split().map(lambda x: len(x)).describe()
_ = plt.figure(figsize=(8, 5))
_ = sns.histplot(train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean),
                 kde=True, color='Orange')
_ = plt.title('Average Word Length in Reviews', fontsize=15)  # mean characters per word
sns.boxplot(y=train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean),
            color='Orange');
train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean).describe()
txt_info = train.iloc[:, 2:14].copy()  # .copy() avoids SettingWithCopyWarning
txt_info['characters'] = train['Review'].str.len()
txt_info['words'] = train['Review'].str.split().map(len)
txt_info['avg_words'] = train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean)
def target_grp_hist(df, valcol, title=''):
    fig = plt.subplots(figsize=(15, 15))
    for i, col in enumerate(df.iloc[:, 0:12].columns.values):
        _ = plt.subplot(6, 2, i + 1)
        _ = sns.histplot(x=df[valcol], hue=df[col])
        _ = plt.title(col + title, fontsize=15)
        _ = plt.xlabel(col, fontsize=10)
        _ = plt.xticks(fontsize=15)
        _ = plt.tight_layout()
    plt.show()
target_grp_hist(txt_info,'characters','-Number of Characters')
def target_grp_box(df, valcol, title=''):
    fig = plt.subplots(figsize=(15, 15))
    for i, col in enumerate(df.iloc[:, 0:12].columns.values):
        _ = plt.subplot(6, 2, i + 1)
        _ = sns.boxplot(x=df[col], y=df[valcol])
        _ = plt.title(col + title, fontsize=15)
        _ = plt.xlabel(col, fontsize=10)
        _ = plt.xticks(fontsize=15)
        _ = plt.tight_layout()
    plt.show()
target_grp_box(txt_info,'characters','-Number of Characters')
def target_grp_summary(df, valcol):
    for col in df.iloc[:, 0:12].columns.values:
        print("=" * 100)
        print(df.groupby([col])[valcol].describe())
        print("-" * 100)
target_grp_summary(txt_info,'characters')
target_grp_hist(txt_info,'words','-Number of Words')
target_grp_box(txt_info,'words','-Number of Words')
target_grp_summary(txt_info,'words')
target_grp_hist(txt_info, 'avg_words', '-Average Word Length')
target_grp_box(txt_info, 'avg_words', '-Average Word Length')
target_grp_summary(txt_info,'avg_words')
stopWords = set(stopwords.words('english'))
# Join the raw texts: Series.to_string() would truncate long reviews and
# leak the index numbers into the token counts.
words = word_tokenize(" ".join(train['Review'].astype(str)).lower())
dic = defaultdict(int)
for word in words:
    if word in stopWords:
        dic[word] += 1
Counter(dic).most_common(20)
top_stop = Counter(dic).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(x=[c for w, c in top_stop], y=[w for w, c in top_stop]);
dic1 = defaultdict(int)
for word in words:
    if word not in stopWords:
        dic1[word] += 1
Counter(dic1).most_common(20)
top_tokens = Counter(dic1).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(x=[c for w, c in top_tokens], y=[w for w, c in top_tokens]);
def top_ngram(txt=None, n=0):
    # value_counts() already sorts in descending order
    n_gram = pd.Series(nltk.ngrams(txt, n)).value_counts()[:10]
    return n_gram, sns.barplot(x=n_gram.values, y=n_gram.index)
top_ngram(words,2)
top_ngram(words,3)
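# CountVectorizer (imported above but otherwise unused) counts n-grams over
# the whole corpus in one pass. A minimal sketch, assuming scikit-learn >= 1.0
# (older versions spell the last call get_feature_names()); the variable
# names are illustrative only.
cv = CountVectorizer(ngram_range=(2, 2), stop_words='english')
bigram_counts = cv.fit_transform(train['Review'].astype(str))
bigram_freq = pd.Series(np.asarray(bigram_counts.sum(axis=0)).ravel(),
                        index=cv.get_feature_names_out()).nlargest(10)
print(bigram_freq)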
# spaCy refuses documents longer than nlp.max_length characters (1,000,000 by
# default); raise it if the joined corpus exceeds that limit.
nlp.max_length = 2_000_000
doc = nlp(" ".join([j for i in train['Review'].str.split() for j in i]))
all_ent = [(x.text, x.label_) for x in doc.ents]
cat_ents = pd.DataFrame(all_ent, columns=['txt', 'cat_ent'])
plt.figure(figsize=(8, 8))
sns.countplot(y=cat_ents['cat_ent'], order=cat_ents['cat_ent'].value_counts().index);
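# spacy.explain() decodes any unfamiliar entity label, e.g.:
print(spacy.explain('GPE'))  # -> 'Countries, cities, states'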
clr = []
for name, hex_code in matplotlib.colors.cnames.items():  # hex_code avoids shadowing hex()
    if 'dark' in name:
        clr.append(name)
fig = plt.subplots(figsize=(15, 15))
for i, (col, clrs) in enumerate(zip(cat_ents['cat_ent'].value_counts().nlargest(10).index.values,
                                    clr[9:19])):
    _ = plt.subplot(5, 2, i + 1)
    df = (cat_ents[cat_ents['cat_ent'] == col].groupby(['txt'])['txt'].agg(['count'])
          .reset_index().sort_values('count', ascending=False)[:10])
    df = df.sort_values('count')
    _ = plt.barh(df['txt'], df['count'], color=clrs)
    _ = plt.title(f"Top '{col}' Named-Entity", fontsize=15)
    _ = plt.ylabel("")
    _ = plt.yticks(fontsize=12)
    _ = plt.tight_layout()
plt.show()
pos = nltk.pos_tag(word_tokenize(" ".join([j for i in train['Review'].str.split() for j in i])))
pos_tag = pd.DataFrame(pos, columns=['txt', 'tag'])
plt.figure(figsize=(12, 12))
sns.countplot(y=pos_tag['tag'], order=pos_tag['tag'].value_counts().index);
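# nltk can decode Penn Treebank tags; the 'tagsets' resource name is assumed
# to be available in your nltk version.
nltk.download('tagsets')
nltk.help.upenn_tagset('NN')  # noun, common, singular or mass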
fig = plt.subplots(figsize=(15, 15))
for i, (col, clrs) in enumerate(zip(pos_tag['tag'].value_counts().nlargest(10).index.values,
                                    clr[6:16])):
    _ = plt.subplot(5, 2, i + 1)
    df = (pos_tag[pos_tag['tag'] == col].groupby(['txt'])['txt'].agg(['count'])
          .reset_index().sort_values('count', ascending=False)[:10])
    df = df.sort_values('count')
    _ = plt.barh(df['txt'], df['count'], color=clrs)
    _ = plt.title(f"Most Common '{col}'", fontsize=15)
    _ = plt.ylabel("")
    _ = plt.yticks(fontsize=12)
    _ = plt.tight_layout()
plt.show()
train[train['Review'].str.lower().str.contains(r"https?://(?:www\.)?\S+")]['Review']  # reviews containing urls
train[train['Review'].str.contains(r"\d+")]['Review'][6123]  # sample review containing digits
def digits(text):
    # try the decimal alternative first, otherwise '3.5' would match as '3'
    res = text.str.lower().str.extract(r"(\d+\.\d+|\d+)")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(15, 10))
digits(train['Review'])
def mixed_contraction(text):
    res = text.str.lower().str.extract(r"([a-zA-Z]+'[a-zA-Z]+)")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(15, 10))
mixed_contraction(train['Review'])
def non_ascii(text):
    res = text.str.lower().str.extract(r"([^\x00-\x7F]+)")
    top = res.dropna().value_counts()
    return top.nlargest(20), top.nlargest(10).sort_values().plot(kind='barh', figsize=(15, 10))
non_ascii(train['Review'])
def currency(text):
    res = text.str.lower().str.extract(
        r"([$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6])")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(8, 5))
currency(train['Review'])
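# The four helpers above repeat the same extract/count/plot pattern, so a
# single generic helper is one possible refactor. A minimal sketch; the name
# pattern_audit is illustrative, not part of the original notebook.
def pattern_audit(text, pattern, n=10, figsize=(15, 10)):
    res = text.str.lower().str.extract(pattern)
    top = res.dropna().value_counts().nlargest(n)
    return top, top.sort_values().plot(kind='barh', figsize=figsize)

# e.g. pattern_audit(train['Review'], r"(\d+\.\d+|\d+)")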
def remove_stopwords(x: str):
    lst = [i for i in x.split(" ") if i not in stopWords]
    return ' '.join(lst)

def lemmati(x: str):
    lst = [lemmatizer.lemmatize(i) for i in x.split(" ")]
    return " ".join(lst)

def stem(x: str):
    lst = [porter.stem(i) for i in x.split(" ")]
    return " ".join(lst)
train['Review'][100]
stem("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
lemmati("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
def text_pre_process(strings):
    txt = strings.lower()                             # convert text to lowercase
    txt = re.sub(r'https?://(?:www\.)?\S+', '', txt)  # remove urls
    txt = unidecode.unidecode(txt)                    # strip diacritics
    txt = contractions.fix(txt)                       # expand contractions
    txt = re.sub(r'\d+', ' ', txt)                    # remove numbers
    txt = re.sub(r'[^\w\s]', ' ', txt)                # remove punctuation
    txt = remove_stopwords(txt)
    txt = lemmati(txt)
    return txt
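# A natural next step is to run the pipeline over both splits. A minimal
# sketch; the 'clean_review' column name is an assumption, not from the
# original notebook.
train['clean_review'] = train['Review'].astype(str).apply(text_pre_process)
test['clean_review'] = test['Review'].astype(str).apply(text_pre_process)
train[['Review', 'clean_review']].head()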