import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print('Setup completed!!')

# Read the reviews CSV and take a first look at its rows and column dtypes.
data = pd.read_csv('/work/Womens Clothing E-Commerce Reviews.csv')
data.head(10)
data.dtypes

# Discard the 'Unnamed: 0' and 'Clothing ID' columns.
data.drop(columns=['Unnamed: 0', 'Clothing ID'], inplace=True)

# Give the remaining long column names shorter aliases.
data.rename(
    columns={
        'Review Text': 'text',
        'Positive Feedback Count': 'feedback_count',
        'Division Name': 'Division',
        'Department Name': 'Department',
        'Class Name': 'Class',
        'Recommended IND': 'Recommended',
    },
    inplace=True,
)
print(data.columns)

# Count missing values per column, remove every row that contains at least
# one missing value, then count again to confirm none are left.
data.isna().sum()
data.dropna(axis=0, inplace=True)
data.isna().sum()
# Every chart below is closed with plt.show(). Axes-level seaborn functions
# draw on the *current* axes, so without a figure boundary between them the
# charts would be painted on top of one another when these statements run
# in sequence.

# Distribution of reviewer age with a kernel-density overlay.
sns.histplot(data=data, x='Age', kde=True)
plt.show()

# Distribution of the ratings (plain counts, no density curve).
sns.histplot(data=data, x='Rating', kde=False)
plt.show()

# Rating per class; barplot aggregates with its default estimator.
sns.set_style('white')
sns.barplot(x=data['Rating'], y=data['Class'], palette="Blues_d")
plt.show()

# Age per class, aggregated the same way.
sns.set_theme(style="white")
# NOTE(review): the returned palette is not stored or applied anywhere;
# kept as-is in case it is meant to be displayed as notebook cell output.
sns.color_palette("flare", as_cmap=False)
sns.barplot(x=data['Age'], y=data['Class'], palette="Reds_d")
plt.show()

# Relative density of each class across the rating axis. displot is
# figure-level and creates its own figure.
sns.displot(
    data=data,
    x='Rating', hue="Class",
    kind="kde", height=6,
    multiple="fill", clip=(0, None),
)
plt.show()

# Age spread per department.
sns.boxplot(x=data['Department'], y=data['Age'])
plt.show()
# Pie chart of the summed ratings per department.
# Only 'Rating' is selected before aggregating: summing the whole frame also
# sums the free-text and categorical columns, which is wasted work and is
# handled differently across pandas versions. Double brackets keep the result
# a DataFrame so the y='Rating' plot call renders as before.
(
    data.groupby('Department')[['Rating']]
    .sum()
    .plot(
        kind='pie',
        y='Rating',
        subplots=False,
        shadow=False,
        startangle=90,
        figsize=(15, 10),
    )
)
# Close the figure so later axes-level plots do not draw onto the pie's axes.
plt.show()
# Review counts split by the 'Recommended' flag, once per grouping column.
# countplot is axes-level and draws on the current axes, so each chart gets
# its own figure and is shown before the next one is started; otherwise the
# three charts would be stacked on a single set of axes.
for grouping_column in ('Class', 'Department', 'Division'):
    plt.figure()
    sns.countplot(data=data, y=grouping_column, hue='Recommended', palette="Set1")
    plt.show()
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Display the current column names.
data.columns
# Copy the review text into a separate 'all_text' column; the original 'text'
# column is left untouched.
data['all_text'] = data['text']
# Download NLTK's 'punkt' resource (presumably the tokenizer models needed by
# nltk.word_tokenize - confirm against the installed NLTK version).
nltk.download('punkt');
def tokenize(column):
    """Split a text into its purely alphabetic word tokens.

    Args:
        column: The text to tokenize (a single string, e.g. one cell of a
            DataFrame text column).

    Returns:
        A list with the tokens produced by ``nltk.word_tokenize`` that
        consist only of alphabetic characters, in their original order.
        Punctuation, numbers and mixed tokens are dropped.
    """
    tokens = nltk.word_tokenize(column)
    # ``isalpha`` has to be *called*: the bare bound method is always truthy,
    # which would let every token through unfiltered.
    return [w for w in tokens if w.isalpha()]
# Tokenize every review. The function is mapped over the text column itself;
# a row-wise DataFrame.apply(axis=1) would materialise a full row object per
# review only to read this single field.
data['tokenized'] = data['all_text'].apply(tokenize)
data[['text', 'tokenized']].head()

# Download the 'stopwords' corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords');
def remove_stopwords(tokenized_column):
    """Drop English stopwords from a list of tokens.

    Args:
        tokenized_column: Iterable of word tokens (strings).

    Returns:
        A list with the tokens, in their original order and original
        casing, whose lower-cased form is not in NLTK's English stopword
        list.
    """
    stops = set(stopwords.words('english'))
    # NLTK's English stopword list is lower-case, so the membership test is
    # done on the lower-cased token; a case-sensitive test would let
    # capitalised stopwords such as "I" or "The" slip through.
    return [word for word in tokenized_column if word.lower() not in stops]
# Filter the stopwords out of every token list. The function is mapped over
# the 'tokenized' column directly instead of going through a row-wise
# DataFrame.apply(axis=1), which would build a row object per review just to
# read one field.
data['stopwords_removed'] = data['tokenized'].apply(remove_stopwords)
data[['text', 'stopwords_removed']].head()
def apply_stemming(tokenized_column):
    """Stem each token with NLTK's ``PorterStemmer``.

    Args:
        tokenized_column: Iterable of word tokens (strings).

    Returns:
        A list holding the stemmed form of every token, in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokenized_column))
# Stem the stopword-free tokens of every review. Mapping over the column
# itself avoids the per-row object a DataFrame.apply(axis=1) would construct
# only to read this one field.
data['porter_stemmed'] = data['stopwords_removed'].apply(apply_stemming)
data[['text', 'porter_stemmed']].head()
def rejoin_words(tokenized_column):
    """Join a sequence of word tokens into one space-separated string.

    Args:
        tokenized_column: Iterable of string tokens.

    Returns:
        The tokens concatenated in order with a single space between
        neighbours; an empty input yields an empty string.
    """
    separator = " "
    return separator.join(tokenized_column)
# Turn every list of stemmed tokens back into a single string. The function
# is mapped over the 'porter_stemmed' column directly rather than through a
# row-wise DataFrame.apply(axis=1), which would build a row object per review
# just to read one field.
data['rejoined'] = data['porter_stemmed'].apply(rejoin_words)
data[['text', 'rejoined']].head()