Exploratory Data Analysis of the Top 50 Bestselling Amazon Books, 2009-2019
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import matplotlib.image as img
# Open the project's cover image from disk and hand it to the default
# image viewer.
from PIL import Image
image = Image.open(fp="/work/amazon_books.jpg")
image.show()
Overview of the data set
# Load the best-sellers table and take a first look at it.
df = pd.read_csv('/work/amazon_best_sellers_books.csv')
df.head(5)
df.shape
# Missing values per column.
df.isna().sum()
df.info()
# Number of distinct values per column.
df.nunique()
df['Genre'].value_counts()
# Store 'Genre' as a categorical column from here on.
df = df.astype({'Genre': 'category'})
df.describe()
Analysing the data set's variables
Name (books)
# How many rows each title has in the data set, i.e. how often the book
# shows up across the yearly top-50 lists; display the 45 most frequent.
books_count = df['Name'].value_counts()
books_count.head(45)
# The 20 rows with the highest user rating.
rating_descending = df.sort_values(by='User Rating', ascending=False)
top20_books_rating = rating_descending.head(20)
top20_books_rating
# The 20 rows with the lowest user rating.
rating_ascending = df.sort_values(by='User Rating')
bottom20_books_rating = rating_ascending.head(20)
bottom20_books_rating
Author
# The 20 authors with the highest mean user rating.
top20_authors_rating = (
    df.groupby('Author')['User Rating']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
top20_authors_rating
# The 20 authors with the highest mean price.
df_top20_authors_price = (
    df.groupby('Author')['Price']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
df_top20_authors_price
# The 20 authors with the most rows in the data set (one row per list entry,
# so a title that recurs is counted every time it appears).
top20_authors_number_books = df['Author'].value_counts().head(20)
top20_authors_number_books
# Rows belonging to the 20 most frequent authors, restricted to the author
# name and the three numeric measures.
top20_authors_columns = df[df['Author'].isin(top20_authors_number_books.index)][
    ['Author', 'User Rating', 'Reviews', 'Price']
]
top20_authors_columns
# Per-author means of the numeric measures, most expensive authors first.
# Only the numeric columns are aggregated: 'Author' is already the group key
# (it becomes the index), and asking for the mean of that string column
# raises TypeError on pandas >= 2.0.
top20_authors_columns_grouped = (
    top20_authors_columns.groupby('Author')[['User Rating', 'Reviews', 'Price']]
    .mean()
    .sort_values(by='Price', ascending=False)
)
top20_authors_columns_grouped
Genre
# Overall number of rows per genre.
fig = px.histogram(df, x='Genre', title='Genre', color='Genre')
fig.show()
# Genre counts within each year: indexed by genre, with 'Year' and
# 'Genre Count' columns. The counts Series is named explicitly before
# 'Year' is moved out of the index, because the default name of a
# value_counts() result differs between pandas versions (the source column
# name before 2.0, 'count' from 2.0 on), which made a column rename after
# reset_index() a silent no-op on newer versions.
genre_by_year = (
    df.groupby('Year')['Genre']
    .value_counts()
    .rename('Genre Count')
    .reset_index(level=0)
)
genre_by_year
# Same information as a chart: rows per year, split by genre.
genre_by_year_hist = px.histogram(df, x='Year', color='Genre', pattern_shape='Genre')
genre_by_year_hist.show()
User Rating
# Summary statistics of the user ratings.
df['User Rating'].describe()
# Distribution of user ratings with a marginal box plot. hover_data is given
# as a list of column names, which is the documented form; passing a
# DataFrame slice only worked because it gets coerced to its column labels.
fig = px.histogram(
    df,
    x='User Rating',
    title='User Rating',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()
# User rating of every row against its year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='User Rating',
    title='User Rating per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()
Reviews
# Summary statistics of the review counts.
df['Reviews'].describe()
# Distribution of review counts with a marginal box plot. hover_data is
# given as a list of column names, which is the documented form; passing a
# DataFrame slice only worked because it gets coerced to its column labels.
fig = px.histogram(
    df,
    x='Reviews',
    title='Reviews',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()
# Review count of every row against its year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='Reviews',
    title='Number of Reviews per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()
Price
# Summary statistics of the prices.
df['Price'].describe()
# Distribution of prices with a marginal box plot. hover_data is given as a
# list of column names, which is the documented form; passing a DataFrame
# slice only worked because it gets coerced to its column labels.
fig = px.histogram(
    df,
    x='Price',
    title='Price',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()
# The 12 rows with the lowest price.
df.sort_values(by='Price').head(12)
# Price of every row against its year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='Price',
    title='Price of Books per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()
Correlations
# Pairwise correlation table of the three numeric measures
# (DataFrame.corr with its default settings).
df[['User Rating', 'Reviews', 'Price']].corr()
# Scatter-plot matrix of the same three measures, coloured by genre.
# hover_data is given as a list of column names, which is the documented
# form; passing a DataFrame slice only worked because it gets coerced to its
# column labels.
fig = px.scatter_matrix(
    df,
    dimensions=['User Rating', 'Reviews', 'Price'],
    color='Genre',
    hover_data=['Name', 'Author'],
    opacity=0.6,
    width=1000,
    height=1000,
    title='Correlations',
)
fig.show()