pip install contractions
pip install unidecode
nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('wordnet')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
stopwords=set(STOPWORDS)
import re
import random
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
import warnings
import contractions
import unidecode
from collections import Counter
import spacy
from nltk.stem import PorterStemmer
porter = PorterStemmer()
warnings.filterwarnings('ignore')
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the CSV paths read below live under this mount point.
drive.mount('/content/gdrive')
Mounted at /content/gdrive
# Load the train and test CSVs from the mounted Drive folder.
train=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/train.csv')
test=pd.read_csv('/content/gdrive/MyDrive/uhack_sentiments_20_decode_code_words/test.csv')
# Print column names, non-null counts and dtypes of the training frame.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6136 entries, 0 to 6135
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 6136 non-null int64
1 Review 6136 non-null object
2 Components 6136 non-null int64
3 Delivery and Customer Support 6136 non-null int64
4 Design and Aesthetics 6136 non-null int64
5 Dimensions 6136 non-null int64
6 Features 6136 non-null int64
7 Functionality 6136 non-null int64
8 Installation 6136 non-null int64
9 Material 6136 non-null int64
10 Price 6136 non-null int64
11 Quality 6136 non-null int64
12 Usability 6136 non-null int64
13 Polarity 6136 non-null int64
dtypes: int64(13), object(1)
memory usage: 671.2+ KB
# Same schema summary for the test frame.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2631 entries, 0 to 2630
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 2631 non-null int64
1 Review 2631 non-null object
2 Components 0 non-null float64
3 Delivery and Customer Support 0 non-null float64
4 Design and Aesthetics 0 non-null float64
5 Dimensions 0 non-null float64
6 Features 0 non-null float64
7 Functionality 0 non-null float64
8 Installation 0 non-null float64
9 Material 0 non-null float64
10 Price 0 non-null float64
11 Quality 0 non-null float64
12 Usability 0 non-null float64
13 Polarity 0 non-null float64
dtypes: float64(12), int64(1), object(1)
memory usage: 287.9+ KB
# Per-column count of missing values in each frame.
train.isnull().sum()
test.isnull().sum()
# Peek at the first rows of each frame.
train.head()
test.head()
# Count plot for each of the 12 label columns (train columns 2..13), laid out
# on a 6x2 grid.
# plt.figure() is used instead of plt.subplots(): the latter also creates one
# full-figure Axes that ends up underneath the grid built with plt.subplot().
fig = plt.figure(figsize=(15, 15))
for i, col in enumerate(train.columns[2:14], start=1):
    plt.subplot(6, 2, i)
    sns.countplot(x=train[col], hue=train[col])
    plt.title(col + ' Topic Distribution', fontsize=15)
    plt.xlabel(col, fontsize=10)
    plt.xticks(fontsize=15)
# Lay the grid out once, after every subplot exists.
plt.tight_layout()
plt.show()
# Print the value counts of every label column (train columns 2..13),
# each framed by a row of '=' above and a row of '-' below.
heavy_rule = "=" * 100
light_rule = "-" * 100
for col in train.columns[2:14]:
    print(heavy_rule, train[col].value_counts(), light_rule, sep="\n")
====================================================================================================
0 5862
1 274
Name: Components, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5957
1 179
Name: Delivery and Customer Support, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5486
1 650
Name: Design and Aesthetics, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5441
1 695
Name: Dimensions, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5823
1 313
Name: Features, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 3598
1 2538
Name: Functionality, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5374
1 762
Name: Installation, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5981
1 155
Name: Material, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 5282
1 854
Name: Price, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 3959
1 2177
Name: Quality, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
0 4907
1 1229
Name: Usability, dtype: int64
----------------------------------------------------------------------------------------------------
====================================================================================================
1 4770
0 1366
Name: Polarity, dtype: int64
----------------------------------------------------------------------------------------------------
# Word clouds for label columns 2..7 of `train`: one outer grid cell per
# column, split into one panel per label value.
from matplotlib import gridspec  # not imported anywhere else in the notebook

# Use wordcloud's own stop-word set explicitly: the module-level name
# `stopwords` is rebound by `from nltk.corpus import stopwords`.
wc_stopwords = set(STOPWORDS)

# constrained_layout is not requested here because fig.tight_layout() is
# called at the end; the two layout managers should not be combined.
fig = plt.figure(figsize=(30, 30))
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 2:8].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=outer[i], wspace=0.1, hspace=0.1
    )
    for x in train[col].unique():
        # The label value itself selects the panel, so this relies on the
        # labels being 0 and 1.
        ax = plt.Subplot(fig, inner[x])
        wc = WordCloud(
            background_color="white", max_words=100, stopwords=wc_stopwords,
            max_font_size=40, random_state=42,
        ).generate(train[train[col] == x]['Review'].to_string())
        fig.add_subplot(ax)
        ax.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
fig.tight_layout()
# Word clouds for label columns 8..13 of `train`: one outer grid cell per
# column, split into one panel per label value.
from matplotlib import gridspec  # not imported anywhere else in the notebook

# Use wordcloud's own stop-word set explicitly: the module-level name
# `stopwords` is rebound by `from nltk.corpus import stopwords`.
wc_stopwords = set(STOPWORDS)

# constrained_layout is not requested here because fig.tight_layout() is
# called at the end; the two layout managers should not be combined.
fig = plt.figure(figsize=(30, 30))
outer = gridspec.GridSpec(6, 2, wspace=0.2, hspace=0.2)
for i, col in enumerate(train.iloc[:, 8:14].columns.values):
    inner = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=outer[i], wspace=0.1, hspace=0.1
    )
    for x in train[col].unique():
        # The label value itself selects the panel, so this relies on the
        # labels being 0 and 1.
        ax = plt.Subplot(fig, inner[x])
        wc = WordCloud(
            background_color="white", max_words=100, stopwords=wc_stopwords,
            max_font_size=40, random_state=42,
        ).generate(train[train[col] == x]['Review'].to_string())
        fig.add_subplot(ax)
        ax.imshow(wc)
        ax.set_title(f"{col}_{x}")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)
fig.tight_layout()
# Review length in characters: histogram with KDE, box plot, summary stats.
review_chars = train['Review'].str.len()
_ = plt.figure(figsize=(8, 5))
_ = sns.histplot(review_chars, kde=True)
_ = plt.title('Number of characters in Reviews', fontsize=15)
sns.boxplot(y=review_chars);
review_chars.describe()
# Review length in whitespace-separated words: histogram with KDE, box plot,
# summary stats.
review_words = train['Review'].str.split().map(len)
_ = plt.figure(figsize=(8, 5))
_ = sns.histplot(review_words, kde=True, color='Green')
_ = plt.title('Number of Words in Reviews', fontsize=15)
sns.boxplot(y=review_words, color='Green');
review_words.describe()
# Mean length (in characters) of the words of each review: histogram with
# KDE, box plot, summary stats.
# NOTE: the plot title below says "Average Number of Words", but the plotted
# quantity is the average word length.
mean_word_len = train['Review'].str.split().map(
    lambda tokens: np.mean([len(token) for token in tokens])
)
_ = plt.figure(figsize=(8, 5))
_ = sns.histplot(mean_word_len, kde=True, color='Orange')
_ = plt.title('Average Number of Words in Each Reviews', fontsize=15)
sns.boxplot(y=mean_word_len, color='Orange');
mean_word_len.describe()
# Frame used by the grouped plots below: the 12 label columns of `train`
# plus three per-review length features.
# .copy() makes this an independent frame, so adding columns neither warns
# about writing to a slice nor depends on `train` afterwards. The original
# `for i in range(3)` loop only repeated the same assignments and is dropped.
txt_info = train.iloc[:, 2:14].copy()
review_tokens = train['Review'].str.split()
txt_info['characters'] = train['Review'].str.len()      # characters per review
txt_info['words'] = review_tokens.map(len)              # whitespace-separated words per review
# Mean word length in characters (the column name 'avg_words' is kept for
# the callers below).
txt_info['avg_words'] = review_tokens.map(
    lambda tokens: np.mean([len(token) for token in tokens])
)
def target_grp_hist(df,valcol,title=''):
    """Plot a histogram of ``df[valcol]`` split by each of the first 12 columns.

    Args:
        df: frame whose first 12 columns are the grouping (label) columns and
            which also contains ``valcol``.
        valcol: name of the numeric column to histogram.
        title: suffix appended to the grouping column name in each subplot title.

    Draws a 6x2 grid of subplots and shows the figure; returns nothing.
    """
    # plt.figure(), not plt.subplots(): the latter would add a stray
    # full-figure Axes underneath the grid created with plt.subplot().
    plt.figure(figsize=(15, 15))
    for i, col in enumerate(df.columns[:12], start=1):
        plt.subplot(6, 2, i)
        # Colour by the label column of the frame that was passed in
        # (the original always read the global `txt_info`).
        sns.histplot(x=df[valcol], hue=df[col])
        plt.title(col + title, fontsize=15)
        # The x axis shows valcol; the label column already appears in the
        # title and the hue legend.
        plt.xlabel(valcol, fontsize=10)
        plt.xticks(fontsize=15)
    plt.tight_layout()
    plt.show()


target_grp_hist(txt_info,'characters','-Number of Characters')
def target_grp_box(df,valcol,title=''):
    """Draw box plots of ``df[valcol]`` grouped by each of the first 12 columns.

    Args:
        df: frame whose first 12 columns are the grouping (label) columns and
            which also contains ``valcol``.
        valcol: name of the numeric column shown on the y axis.
        title: suffix appended to the grouping column name in each subplot title.

    Draws a 6x2 grid of subplots and shows the figure; returns nothing.
    """
    # plt.figure(), not plt.subplots(): the latter would add a stray
    # full-figure Axes underneath the grid created with plt.subplot().
    plt.figure(figsize=(15, 15))
    for i, col in enumerate(df.columns[:12], start=1):
        plt.subplot(6, 2, i)
        # Group by the label column of the frame that was passed in
        # (the original always read the global `txt_info`).
        sns.boxplot(x=df[col], y=df[valcol])
        plt.title(col + title, fontsize=15)
        plt.xlabel(col, fontsize=10)
        plt.xticks(fontsize=15)
    plt.tight_layout()
    plt.show()


target_grp_box(txt_info,'characters','-Number of Characters')
def target_grp_summary(df,valcol):
    """Print ``describe()`` of ``df[valcol]`` grouped by each of the first 12 columns.

    Each summary is framed by a row of 100 '=' above and 100 '-' below.
    Returns nothing.
    """
    rule_open, rule_close = "=" * 100, "-" * 100
    for target in df.columns[:12]:
        stats = df.groupby([target])[valcol].describe()
        print(rule_open)
        print(f"{stats}")
        print(rule_close)
# Character-count statistics per label value, for every label column.
target_grp_summary(txt_info,'characters')
====================================================================================================
count mean std min 25% 50% 75% max
Components
0 5862.0 158.237632 173.755273 16.0 54.0 100.0 196.0 1987.0
1 274.0 220.368613 186.163422 21.0 92.0 170.0 278.5 1271.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean ... 75% max
Delivery and Customer Support ...
0 5957.0 160.752392 ... 200.0 1987.0
1 179.0 169.653631 ... 196.0 930.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Design and Aesthetics ...
0 5486.0 157.867481 169.660484 ... 100.0 198.0 1987.0
1 650.0 187.552308 211.475697 ... 113.0 227.0 1943.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Dimensions
0 5441.0 160.801507 175.820480 16.0 54.0 100.0 202.0 1987.0
1 695.0 162.660432 166.557966 17.0 59.0 107.0 189.0 1175.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Features
0 5823.0 156.749098 170.190766 16.0 54.0 100.0 195.0 1981.0
1 313.0 240.319489 231.442730 23.0 81.0 167.0 313.0 1987.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Functionality
0 3598.0 153.583658 170.566273 16.0 52.0 96.0 193.0 1981.0
1 2538.0 171.542947 180.103795 16.0 59.0 110.0 211.0 1987.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Installation
0 5374.0 158.811500 171.293599 16.0 54.0 101.0 197.00 1987.0
1 762.0 176.531496 197.085225 16.0 57.0 105.5 222.75 1943.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Material
0 5981.0 160.138438 174.569419 16.0 54.0 101.0 199.0 1987.0
1 155.0 194.722581 180.232805 22.0 62.0 127.0 263.0 985.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Price
0 5282.0 164.533321 179.952461 16.0 56.0 103.0 204.0 1987.0
1 854.0 139.233021 136.683308 17.0 48.0 89.5 180.0 986.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Quality
0 3959.0 146.207376 163.408017 16.0 51.5 92.0 173.5 1987.0
1 2177.0 187.935232 190.885675 16.0 64.0 125.0 241.0 1981.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Usability
0 4907.0 160.300591 171.240110 16.0 55.0 101.0 201.0 1987.0
1 1229.0 163.852726 188.316574 16.0 54.0 100.0 197.0 1981.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Polarity
0 1366.0 229.721816 213.061205 18.0 90.0 164.0 298.0 1987.0
1 4770.0 141.335430 156.739033 16.0 50.0 88.0 171.0 1981.0
----------------------------------------------------------------------------------------------------
# Repeat the grouped histogram, box plot and summary for the word count of each review.
target_grp_hist(txt_info,'words','-Number of Words')
target_grp_box(txt_info,'words','-Number of Words')
target_grp_summary(txt_info,'words')
====================================================================================================
count mean std min 25% 50% 75% max
Components
0 5862.0 29.480382 32.595648 2.0 10.0 18.0 37.00 374.0
1 274.0 41.306569 34.964045 4.0 17.0 32.0 54.75 246.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Delivery and Customer Support ...
0 5957.0 29.988921 32.820393 ... 19.0 38.0 374.0
1 179.0 30.659218 31.955922 ... 19.0 38.5 161.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Design and Aesthetics ...
0 5486.0 29.466642 31.867511 ... 18.0 37.0 354.0
1 650.0 34.581538 39.490011 ... 20.0 43.0 374.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Dimensions
0 5441.0 29.938614 32.932773 2.0 10.0 19.0 38.0 374.0
1 695.0 30.555396 31.696499 3.0 11.0 20.0 36.0 228.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Features
0 5823.0 29.239052 31.975008 2.0 10.0 18.0 37.0 374.0
1 313.0 44.322684 43.023637 4.0 15.0 31.0 58.0 354.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Functionality
0 3598.0 28.514175 31.820016 2.0 9.0 17.0 36.0 374.0
1 2538.0 32.126872 34.019211 2.0 11.0 20.0 40.0 354.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Installation
0 5374.0 29.623372 32.148049 2.0 10.0 19.0 37.0 354.0
1 762.0 32.724409 36.934508 2.0 10.0 19.0 42.0 374.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Material
0 5981.0 29.868082 32.762878 2.0 10.0 19.0 38.0 374.0
1 155.0 35.425806 33.602518 4.0 11.5 22.0 47.5 176.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Price
0 5282.0 30.655623 33.712886 2.0 10.0 19.0 38.0 374.0
1 854.0 26.005855 26.063791 3.0 9.0 16.0 34.0 179.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Quality
0 3959.0 27.411215 30.736679 2.0 9.0 17.0 33.0 354.0
1 2177.0 34.731741 35.762141 2.0 11.0 23.0 45.0 374.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Usability
0 4907.0 29.813532 32.135194 2.0 10.0 19.0 38.0 374.0
1 1229.0 30.786819 35.301264 2.0 10.0 19.0 37.0 351.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Polarity
0 1366.0 43.027086 39.695028 3.0 17.0 31.0 56.0 354.0
1 4770.0 26.280294 29.495237 2.0 9.0 16.0 32.0 374.0
----------------------------------------------------------------------------------------------------
# Repeat the grouped histogram, box plot and summary for the 'avg_words' column of txt_info.
target_grp_hist(txt_info,'avg_words','-Number of Average Words')
target_grp_box(txt_info,'avg_words','-Number of Average Words')
target_grp_summary(txt_info,'avg_words')
====================================================================================================
count mean std ... 50% 75% max
Components ...
0 5862.0 4.566210 0.730161 ... 4.428571 4.857143 11.5
1 274.0 4.431509 0.533932 ... 4.366987 4.665179 7.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean ... 75% max
Delivery and Customer Support ...
0 5957.0 4.552429 ... 4.833333 11.500000
1 179.0 4.818642 ... 5.193750 8.769231
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Design and Aesthetics ...
0 5486.0 4.546989 0.721873 ... 4.416667 4.833333 11.5
1 650.0 4.671654 0.723816 ... 4.512854 5.000000 9.6
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Dimensions ...
0 5441.0 4.567564 0.737115 ... 4.428571 4.857143 11.5
1 695.0 4.502508 0.598892 ... 4.418182 4.797436 8.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Features ...
0 5823.0 4.556931 0.725902 ... 4.423077 4.846154 11.5
1 313.0 4.620930 0.665685 ... 4.478261 4.868421 8.5
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Functionality ...
0 3598.0 4.598865 0.772253 ... 4.451732 4.906795 11.500000
1 2538.0 4.505376 0.643060 ... 4.400000 4.777778 9.272727
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Installation ...
0 5374.0 4.558645 0.728509 ... 4.421053 4.846154 11.5
1 762.0 4.571131 0.683574 ... 4.454545 4.873992 9.0
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Material ...
0 5981.0 4.555766 0.723324 ... 4.423077 4.846154 11.50
1 155.0 4.731097 0.692795 ... 4.548387 5.081140 6.75
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Price
0 5282.0 4.557533 0.725384 2.666667 4.120000 4.427619 4.838710 11.5
1 854.0 4.576659 0.708556 3.000000 4.100641 4.428571 4.888889 9.0
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std min 25% 50% 75% max
Quality
0 3959.0 4.510921 0.697646 2.666667 4.076923 4.40 4.800000 11.5
1 2177.0 4.649803 0.759073 2.888889 4.179104 4.48 4.958333 9.6
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Usability ...
0 4907.0 4.580598 0.703545 ... 4.454545 4.875000 9.5
1 1229.0 4.478732 0.791204 ... 4.333333 4.717391 11.5
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
====================================================================================================
count mean std ... 50% 75% max
Polarity ...
0 1366.0 4.420211 0.594495 ... 4.333333 4.666667 9.272727
1 4770.0 4.600283 0.751099 ... 4.454545 4.908562 11.500000
[2 rows x 8 columns]
----------------------------------------------------------------------------------------------------
# Frequency of English stop words in the training reviews.
from collections import defaultdict

stopWords = set(stopwords.words('english'))
# Tokenise the review text itself. Series.to_string() is a display rendering:
# it puts every row's index label into the text, so row numbers were being
# tokenised as words (and long strings may additionally be truncated by the
# pandas display options).
words = word_tokenize(train['Review'].str.cat(sep=' ').lower())
dic = defaultdict(int)
for word in words:
    if word in stopWords:
        dic[word] += 1
# The 20 most frequent stop words as (word, count) pairs.
Counter(dic).most_common(20)
# Horizontal bar chart of the 20 most frequent stop words.
top_stop_words = Counter(dic).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(
    x=[count for _token, count in top_stop_words],
    y=[token for token, _count in top_stop_words],
);
# Frequency of every token that is not an English stop word.
dic1 = defaultdict(int)
for token in words:
    if token in stopWords:
        continue
    dic1[token] += 1
# The 20 most frequent remaining tokens as (token, count) pairs.
Counter(dic1).most_common(20)
# Horizontal bar chart of the 20 most frequent non-stop-word tokens.
top_other_words = Counter(dic1).most_common(20)
_ = plt.figure(figsize=(8, 8))
sns.barplot(
    x=[count for _token, count in top_other_words],
    y=[token for token, _count in top_other_words],
);
def top_ngram(txt=None,n=0):
    """Return the 10 most frequent n-grams of ``txt`` and a bar chart of them.

    Args:
        txt: sequence of tokens.
        n: n-gram size.

    Returns:
        A tuple of (Series of counts indexed by n-gram tuple, the Axes
        returned by ``sns.barplot``).
    """
    counts = pd.Series(nltk.ngrams(txt, n)).value_counts()
    n_gram = counts.sort_values(ascending=False)[:10]
    chart = sns.barplot(x=n_gram.values, y=n_gram.index)
    return n_gram, chart


# Most frequent bigrams and trigrams of the tokenised training reviews.
top_ngram(words,2)
top_ngram(words,3)
# Named-entity recognition over the training reviews.
# Each review is whitespace-normalised and parsed as its own document with
# nlp.pipe(). Parsing the whole corpus as one string can exceed spaCy's
# per-document length limit (nlp.max_length) and lets entities span the
# boundary between two unrelated reviews.
review_texts = train['Review'].str.split().str.join(" ")
all_ent = [
    (ent.text, ent.label_)
    for review_doc in nlp.pipe(review_texts)
    for ent in review_doc.ents
]
# One row per detected entity: its label and its surface text.
cat_ents = pd.DataFrame()
cat_ents['cat_ent'] = [label for _text, label in all_ent]
cat_ents['txt'] = [text for text, _label in all_ent]
# Entity labels ordered from most to least frequent.
plt.figure(figsize=(8, 8))
sns.countplot(y=cat_ents['cat_ent'], order=cat_ents['cat_ent'].value_counts().index);
# Names of matplotlib's named colours that contain 'dark'; used as bar
# colours by the grids below.
# Imported here because only `pyplot` is imported at the top of the notebook,
# so the bare name `matplotlib` is undefined. The comprehension also avoids
# rebinding the builtin `hex` at module level.
from matplotlib import colors as mcolors

clr = [name for name in mcolors.cnames if 'dark' in name]
# For each of the 10 most frequent entity labels, plot its 10 most frequent
# entity texts as horizontal bars on a 5x2 grid.
# plt.figure(), not plt.subplots(): the latter would add a stray full-figure
# Axes underneath the grid created with plt.subplot().
fig = plt.figure(figsize=(15, 15))
top_labels = cat_ents['cat_ent'].value_counts().nlargest(10).index.values
for i, (col, clrs) in enumerate(zip(top_labels, clr[9:19]), start=1):
    plt.subplot(5, 2, i)
    # Occurrences per entity text, top 10, sorted ascending so the largest
    # bar is drawn at the top of the barh chart.
    df = (
        cat_ents[cat_ents['cat_ent'] == col]
        .groupby('txt')
        .size()
        .nlargest(10)
        .sort_values()
        .reset_index(name='count')
    )
    plt.barh(df['txt'], df['count'], color=clrs)
    plt.title(f"Top '{col}' Named-Entity", fontsize=15)
    plt.ylabel("")
    plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
# Part-of-speech tag every token of the whitespace-normalised training corpus.
corpus_text = " ".join(
    token for review in train['Review'].str.split() for token in review
)
pos = nltk.pos_tag(word_tokenize(corpus_text))
# One row per token: its tag and its text.
pos_tag = pd.DataFrame({
    'tag': [tag for _token, tag in pos],
    'txt': [token for token, _tag in pos],
})
# Tags ordered from most to least frequent.
plt.figure(figsize=(12, 12))
tag_order = pos_tag['tag'].value_counts().index
sns.countplot(y=pos_tag['tag'], order=tag_order);
# For each of the 10 most frequent POS tags, plot its 10 most frequent tokens
# as horizontal bars on a 5x2 grid.
# plt.figure(), not plt.subplots(): the latter would add a stray full-figure
# Axes underneath the grid created with plt.subplot().
fig = plt.figure(figsize=(15, 15))
top_tags = pos_tag['tag'].value_counts().nlargest(10).index.values
for i, (col, clrs) in enumerate(zip(top_tags, clr[6:16]), start=1):
    plt.subplot(5, 2, i)
    # Occurrences per token, top 10, sorted ascending so the largest bar is
    # drawn at the top of the barh chart.
    df = (
        pos_tag[pos_tag['tag'] == col]
        .groupby('txt')
        .size()
        .nlargest(10)
        .sort_values()
        .reset_index(name='count')
    )
    plt.barh(df['txt'], df['count'], color=clrs)
    plt.title(f"Most Common '{col}'", fontsize=15)
    plt.ylabel("")
    plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
# Reviews that contain a URL. `https?://` matches both http:// and https://
# (the original `https:?` required the "s" and made the colon optional).
# Raw strings avoid invalid escape sequences, and dropping the capture group
# avoids the pandas warning about match groups in str.contains.
train[train['Review'].str.lower().str.contains(r"https?://")]['Review']
# Among the reviews that contain digits, show the one with index label 6123.
train[train['Review'].str.contains(r"\d+")]['Review'][6123]
def digits(text):
    """Count the first number found in each review and chart the top 10.

    Args:
        text: pandas Series of strings.

    Returns:
        A tuple of (the 10 most frequent extracted numbers with their
        counts, the Axes of a horizontal bar chart of the same data).
    """
    # The decimal alternative must come first: with `\d+|\d+\.\d+` the plain
    # `\d+` branch always wins, so "2.5" was extracted as "2".
    res = text.str.lower().str.extract(r"(\d+\.\d+|\d+)")
    top = res.dropna().value_counts().nlargest(10)
    chart = top.sort_values(ascending=True).plot(kind='barh', figsize=(15, 10))
    return top, chart


digits(train['Review'])
def mixed_contraction(text):
    """Count the first apostrophe-joined word of each review and chart the top 10.

    Args:
        text: pandas Series of strings.

    Returns:
        A tuple of (the 10 most frequent matches with their counts, the Axes
        of a horizontal bar chart of the same data).
    """
    lowered = text.str.lower()
    res = lowered.str.extract("([a-zA-Z]+'[a-zA-Z]+)")
    top = res.dropna().value_counts().nlargest(10)
    chart = top.sort_values(ascending=True).plot(kind='barh', figsize=(15, 10))
    return top, chart


mixed_contraction(train['Review'])
def non_ascii(text):
    """Count the first run of non-ASCII characters of each review.

    Args:
        text: pandas Series of strings.

    Returns:
        A tuple of (the 20 most frequent runs with their counts, the Axes of
        a horizontal bar chart of the 10 most frequent runs).
    """
    lowered = text.str.lower()
    res = lowered.str.extract("([^\x00-\x7F]+)")
    counts = res.dropna().value_counts()
    top20 = counts.nlargest(20)
    chart = counts.nlargest(10).sort_values(ascending=True).plot(kind='barh', figsize=(15, 10))
    return top20, chart


non_ascii(train['Review'])
def currency(text):
    """Count the first currency symbol of each review and chart the top 10.

    Args:
        text: pandas Series of strings.

    Returns:
        A tuple of (the 10 most frequent symbols with their counts, the Axes
        of a horizontal bar chart of the same data).
    """
    lowered = text.str.lower()
    res = lowered.str.extract("([$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6])")
    top = res.dropna().value_counts().nlargest(10)
    chart = top.sort_values(ascending=True).plot(kind='barh', figsize=(8, 5))
    return top, chart


currency(train['Review'])
def remove_stopwords(x:str):
    """Return ``x`` with every single-space-separated token found in ``stopWords`` removed.

    The text is split on single spaces and the kept tokens are re-joined
    with single spaces.
    """
    kept = (token for token in x.split(" ") if token not in stopWords)
    return ' '.join(kept)
def lemmati(x:str):
    """Return ``x`` with every single-space-separated token lemmatised.

    Tokens are lemmatised with NLTK's WordNet lemmatiser and re-joined with
    single spaces.
    """
    # No `lemmatizer` object is defined in the visible part of the notebook,
    # so the original raised NameError; build the WordNet lemmatiser here
    # (the 'wordnet' resource is downloaded during setup).
    from nltk.stem import WordNetLemmatizer

    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join(wordnet_lemmatizer.lemmatize(token) for token in x.split(" "))
def stem(x:str):
    """Return ``x`` with every single-space-separated token stemmed by ``porter``.

    The stemmed tokens are re-joined with single spaces.
    """
    return " ".join(map(porter.stem, x.split(" ")))
# Show the raw text of the review with index label 100.
train['Review'][100]
# Run the stemmer on a sample two-sentence review.
stem("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
# Run the lemmatiser on the same sample for comparison.
lemmati("""Cushions holding up well after a month of use. Color and size is accurate.
Seller provides great customer service with quick communication and follow up.""")
def text_pre_process(strings):
    """Normalise one review string for modelling.

    Steps, in order: lowercase, strip URLs, transliterate to ASCII, expand
    contractions, drop digits and punctuation, collapse whitespace, remove
    stop words, lemmatise.

    Args:
        strings: the raw review text (a single ``str``).

    Returns:
        The cleaned text as a single string.
    """
    txt = strings.lower()                                  # lowercase
    # Remove only the URL token itself. The original pattern required
    # "https", so it missed http:// links, and its trailing `.+` deleted
    # everything after the URL up to the end of the line.
    txt = re.sub(r'https?://\S+|www\.\S+', ' ', txt)
    txt = unidecode.unidecode(txt)                         # strip diacritics / non-ASCII
    txt = contractions.fix(txt)                            # expand contractions
    txt = re.sub(r'\d+', ' ', txt)                         # remove numbers
    txt = re.sub(r'[^\w\s]', ' ', txt)                     # remove punctuation
    # Collapse the whitespace runs left by the substitutions above (and any
    # newlines): the helpers below split on single spaces, so otherwise they
    # see empty tokens and miss stop words next to a line break.
    txt = re.sub(r'\s+', ' ', txt).strip()
    txt = remove_stopwords(txt)
    txt = lemmati(txt)
    return txt