!pip install contractions
!pip install unidecode

import re
import random
import warnings
from collections import Counter, defaultdict

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.gridspec as gridspec  # used by the word-cloud grids below
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
import contractions
import unidecode
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # used by lemmati() below

# Keep the wordcloud stopword set under its own name so it does not shadow
# nltk.corpus.stopwords, which supplies the English stopword list later on.
wc_stopwords = set(STOPWORDS)

warnings.filterwarnings('ignore')
%matplotlib inline

from google.colab import drive
drive.mount('/content/gdrive')
train=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/train.csv')
test=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/test.csv')
train.info()
test.info()
train.isnull().sum()
test.isnull().sum()
train.head()
test.head()
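# The slices train.iloc[:, 2:14] used throughout assume the 12 binary
# topic/sentiment labels sit in columns 2..13, after an id column and the
# 'Review' text. A quick sanity check (a minimal sketch under that assumption):
target_cols = train.columns[2:14]
print(target_cols.tolist())
print(train[target_cols].sum())  # positive-class count per label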
fig = plt.subplots(figsize=(15, 15))
for i, col in enumerate(train.iloc[:, 2:14].columns.values):
    _ = plt.subplot(6, 2, i + 1)
    _ = sns.countplot(x=train[col], hue=train[col])
    _ = plt.title(col + ' Topic Distribution', fontsize=15)
    _ = plt.xlabel(col, fontsize=10)
    _ = plt.xticks(fontsize=15)
    _ = plt.tight_layout()
plt.show()
for col in train.iloc[:, 2:14].columns.values:
    print("=" * 100)
    print(train[col].value_counts())
    print("-" * 100)
fig = plt.figure(figsize=(30, 30), constrained_layout=True)
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 2:8].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=outer[i],
                                             wspace=0.1, hspace=0.1)
    for x in train[col].unique():  # binary labels: x is 0 or 1
        ax = plt.Subplot(fig, inner[x])
        # join the raw texts (Series.to_string() would drag the index in)
        wc = WordCloud(background_color="white", max_words=100,
                       stopwords=wc_stopwords, max_font_size=40,
                       random_state=42).generate(
                           " ".join(train[train[col] == x]['Review'].astype(str)))
        fig.add_subplot(ax)
        plt.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
# constrained_layout already manages spacing, so tight_layout() is not needed
fig = plt.figure(figsize=(30, 30), constrained_layout=True)
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 8:14].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=outer[i],
                                             wspace=0.1, hspace=0.1)
    for x in train[col].unique():
        ax = plt.Subplot(fig, inner[x])
        wc = WordCloud(background_color="white", max_words=100,
                       stopwords=wc_stopwords, max_font_size=40,
                       random_state=42).generate(
                           " ".join(train[train[col] == x]['Review'].astype(str)))
        fig.add_subplot(ax)
        plt.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
_=plt.figure(figsize=(8, 5))
_=sns.histplot(train['Review'].str.len(),kde=True)
_=plt.title( 'Number of characters in Reviews',fontsize=15)
sns.boxplot(y=train['Review'].str.len());
train['Review'].str.len().describe()
_=plt.figure(figsize=(8, 5))
_=sns.histplot(train['Review'].str.split().map(lambda x: len(x)),kde=True,color='Green')
_=plt.title( 'Number of Words in Reviews',fontsize=15)
sns.boxplot(y=train['Review'].str.split().map(lambda x: len(x)),color='Green');
train['Review'].str.split().map(lambda x: len(x)).describe()
_ = plt.figure(figsize=(8, 5))
_ = sns.histplot(train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean),
                 kde=True, color='Orange')
_ = plt.title('Average Word Length in Reviews', fontsize=15)  # mean characters per word
sns.boxplot(y=train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean),
            color='Orange');
train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean).describe()
txt_info = train.iloc[:, 2:14].copy()  # .copy() avoids SettingWithCopyWarning
txt_info['characters'] = train['Review'].str.len()
txt_info['words'] = train['Review'].str.split().map(len)
txt_info['avg_words'] = train['Review'].str.split().apply(lambda x: [len(i) for i in x]).map(np.mean)
def target_grp_hist(df, valcol, title=''):
    fig = plt.subplots(figsize=(15, 15))
    for i, col in enumerate(df.iloc[:, 0:12].columns.values):
        _ = plt.subplot(6, 2, i + 1)
        _ = sns.histplot(x=df[valcol], hue=df[col])
        _ = plt.title(col + title, fontsize=15)
        _ = plt.xlabel(col, fontsize=10)
        _ = plt.xticks(fontsize=15)
        _ = plt.tight_layout()
    plt.show()
target_grp_hist(txt_info,'characters','-Number of Characters')
def target_grp_box(df, valcol, title=''):
    fig = plt.subplots(figsize=(15, 15))
    for i, col in enumerate(df.iloc[:, 0:12].columns.values):
        _ = plt.subplot(6, 2, i + 1)
        _ = sns.boxplot(x=df[col], y=df[valcol])
        _ = plt.title(col + title, fontsize=15)
        _ = plt.xlabel(col, fontsize=10)
        _ = plt.xticks(fontsize=15)
        _ = plt.tight_layout()
    plt.show()
target_grp_box(txt_info,'characters','-Number of Characters')
def target_grp_summary(df, valcol):
    for col in df.iloc[:, 0:12].columns.values:
        print("=" * 100)
        print(df.groupby([col])[valcol].describe())
        print("-" * 100)
target_grp_summary(txt_info,'characters')
target_grp_hist(txt_info,'words','-Number of Words')
target_grp_box(txt_info,'words','-Number of Words')
target_grp_summary(txt_info,'words')
target_grp_hist(txt_info, 'avg_words', '-Average Word Length')
target_grp_box(txt_info, 'avg_words', '-Average Word Length')
target_grp_summary(txt_info,'avg_words')
stopWords = set(stopwords.words('english'))
# Join the raw texts: Series.to_string() would truncate long reviews and
# leak the index numbers into the token counts.
words = word_tokenize(" ".join(train['Review'].astype(str)).lower())
dic = defaultdict(int)
for word in words:
    if word in stopWords:
        dic[word] += 1
Counter(dic).most_common(20)
top_stop = Counter(dic).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(x=[c for w, c in top_stop], y=[w for w, c in top_stop]);
dic1 = defaultdict(int)
for word in words:
    if word not in stopWords:
        dic1[word] += 1
Counter(dic1).most_common(20)
top_tokens = Counter(dic1).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(x=[c for w, c in top_tokens], y=[w for w, c in top_tokens]);
def top_ngram(txt=None, n=0):
    # value_counts() already sorts in descending order
    n_gram = pd.Series(nltk.ngrams(txt, n)).value_counts()[:10]
    return n_gram, sns.barplot(x=n_gram.values, y=n_gram.index)
top_ngram(words,2)
top_ngram(words,3)
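# CountVectorizer (imported above but otherwise unused) counts n-grams over
# the whole corpus in one pass. A minimal sketch, assuming scikit-learn >= 1.0
# (older versions spell the last call get_feature_names()); the variable
# names are illustrative only.
cv = CountVectorizer(ngram_range=(2, 2), stop_words='english')
bigram_counts = cv.fit_transform(train['Review'].astype(str))
bigram_freq = pd.Series(np.asarray(bigram_counts.sum(axis=0)).ravel(),
                        index=cv.get_feature_names_out()).nlargest(10)
print(bigram_freq)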
# spaCy refuses documents longer than nlp.max_length characters (1,000,000 by
# default); raise it if the joined corpus exceeds that limit.
nlp.max_length = 2_000_000
doc = nlp(" ".join([j for i in train['Review'].str.split() for j in i]))
all_ent = [(x.text, x.label_) for x in doc.ents]
cat_ents = pd.DataFrame(all_ent, columns=['txt', 'cat_ent'])
plt.figure(figsize=(8, 8))
sns.countplot(y=cat_ents['cat_ent'], order=cat_ents['cat_ent'].value_counts().index);
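# spacy.explain() decodes any unfamiliar entity label, e.g.:
print(spacy.explain('GPE'))  # -> 'Countries, cities, states'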
clr = []
for name, hex_code in matplotlib.colors.cnames.items():  # hex_code avoids shadowing hex()
    if 'dark' in name:
        clr.append(name)
fig = plt.subplots(figsize=(15, 15))
for i, (col, clrs) in enumerate(zip(cat_ents['cat_ent'].value_counts().nlargest(10).index.values,
                                    clr[9:19])):
    _ = plt.subplot(5, 2, i + 1)
    df = (cat_ents[cat_ents['cat_ent'] == col].groupby(['txt'])['txt'].agg(['count'])
          .reset_index().sort_values('count', ascending=False)[:10])
    df = df.sort_values('count')
    _ = plt.barh(df['txt'], df['count'], color=clrs)
    _ = plt.title(f"Top '{col}' Named-Entity", fontsize=15)
    _ = plt.ylabel("")
    _ = plt.yticks(fontsize=12)
    _ = plt.tight_layout()
plt.show()
pos = nltk.pos_tag(word_tokenize(" ".join([j for i in train['Review'].str.split() for j in i])))
pos_tag = pd.DataFrame(pos, columns=['txt', 'tag'])
plt.figure(figsize=(12, 12))
sns.countplot(y=pos_tag['tag'], order=pos_tag['tag'].value_counts().index);
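# nltk can decode Penn Treebank tags; the 'tagsets' resource name is assumed
# to be available in your nltk version.
nltk.download('tagsets')
nltk.help.upenn_tagset('NN')  # noun, common, singular or mass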
fig = plt.subplots(figsize=(15, 15))
for i, (col, clrs) in enumerate(zip(pos_tag['tag'].value_counts().nlargest(10).index.values,
                                    clr[6:16])):
    _ = plt.subplot(5, 2, i + 1)
    df = (pos_tag[pos_tag['tag'] == col].groupby(['txt'])['txt'].agg(['count'])
          .reset_index().sort_values('count', ascending=False)[:10])
    df = df.sort_values('count')
    _ = plt.barh(df['txt'], df['count'], color=clrs)
    _ = plt.title(f"Most Common '{col}'", fontsize=15)
    _ = plt.ylabel("")
    _ = plt.yticks(fontsize=12)
    _ = plt.tight_layout()
plt.show()
train[train['Review'].str.lower().str.contains(r"https?://(?:www\.)?\S+")]['Review']  # reviews containing urls
train[train['Review'].str.contains(r"\d+")]['Review'][6123]  # sample review containing digits
def digits(text):
    # try the decimal alternative first, otherwise '3.5' would match as '3'
    res = text.str.lower().str.extract(r"(\d+\.\d+|\d+)")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(15, 10))
digits(train['Review'])
def mixed_contraction(text):
    res = text.str.lower().str.extract(r"([a-zA-Z]+'[a-zA-Z]+)")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(15, 10))
mixed_contraction(train['Review'])
def non_ascii(text):
    res = text.str.lower().str.extract(r"([^\x00-\x7F]+)")
    top = res.dropna().value_counts()
    return top.nlargest(20), top.nlargest(10).sort_values().plot(kind='barh', figsize=(15, 10))
non_ascii(train['Review'])
def currency(text):
    res = text.str.lower().str.extract(
        r"([$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6])")
    top = res.dropna().value_counts().nlargest(10)
    return top, top.sort_values().plot(kind='barh', figsize=(8, 5))
currency(train['Review'])
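# The four helpers above repeat the same extract/count/plot pattern, so a
# single generic helper is one possible refactor. A minimal sketch; the name
# pattern_audit is illustrative, not part of the original notebook.
def pattern_audit(text, pattern, n=10, figsize=(15, 10)):
    res = text.str.lower().str.extract(pattern)
    top = res.dropna().value_counts().nlargest(n)
    return top, top.sort_values().plot(kind='barh', figsize=figsize)

# e.g. pattern_audit(train['Review'], r"(\d+\.\d+|\d+)")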
def remove_stopwords(x: str):
    lst = [i for i in x.split(" ") if i not in stopWords]
    return ' '.join(lst)

def lemmati(x: str):
    lst = [lemmatizer.lemmatize(i) for i in x.split(" ")]
    return " ".join(lst)

def stem(x: str):
    lst = [porter.stem(i) for i in x.split(" ")]
    return " ".join(lst)
train['Review'][100]
stem("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
lemmati("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
def text_pre_process(strings):
    txt = strings.lower()                             # convert text to lowercase
    txt = re.sub(r'https?://(?:www\.)?\S+', '', txt)  # remove urls
    txt = unidecode.unidecode(txt)                    # strip diacritics
    txt = contractions.fix(txt)                       # expand contractions
    txt = re.sub(r'\d+', ' ', txt)                    # remove numbers
    txt = re.sub(r'[^\w\s]', ' ', txt)                # remove punctuation
    txt = remove_stopwords(txt)
    txt = lemmati(txt)
    return txt
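# A natural next step is to run the pipeline over both splits. A minimal
# sketch; the 'clean_review' column name is an assumption, not from the
# original notebook.
train['clean_review'] = train['Review'].astype(str).apply(text_pre_process)
test['clean_review'] = test['Review'].astype(str).apply(text_pre_process)
train[['Review', 'clean_review']].head()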