Exploratory Data Analysis of the top 50 bestselling Amazon books, 2009-2019

import matplotlib.image as img
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
from PIL import Image
from plotly.offline import iplot
from plotly.subplots import make_subplots

# Show the project's cover image in the default image viewer.
# The statements used to be fused into the comment line and never ran.
# Opening the file in a ``with`` block also releases the file handle once
# the viewer has been launched, instead of leaving it open.
with Image.open("/work/amazon_books.jpg") as image:
    image.show()

Overview of the data set

# Load the best-sellers table and preview its first rows.
df = pd.read_csv('/work/amazon_best_sellers_books.csv')
df.head(5)

# (rows, columns)
df.shape

# Missing values per column.
df.isna().sum()

# Column dtypes and non-null counts.
df.info()

# Number of distinct values per column.
df.nunique()

# Row count per genre.
df['Genre'].value_counts()

# Store Genre as a categorical column from here on.
df['Genre'] = df['Genre'].astype('category')

# Summary statistics of the numeric columns.
df.describe()

Analysing the data set's variables

Name (books)

# Number of times each title appears on the yearly top-50 lists;
# display the 45 most frequent titles.
books_count = df['Name'].value_counts()
books_count.head(45)

# The 20 rows with the highest user rating.
top20_books_rating = df.sort_values('User Rating', ascending=False).head(20)
top20_books_rating

# The 20 rows with the lowest user rating.
bottom20_books_rating = df.sort_values('User Rating').head(20)
bottom20_books_rating

Author

# Top 20 authors by mean user rating.
top20_authors_rating = (
    df.groupby('Author')['User Rating']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
top20_authors_rating

# Top 20 authors by mean price.
df_top20_authors_price = (
    df.groupby('Author')['Price']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)
df_top20_authors_price

# Top 20 authors by number of rows (ranked books) in the data set.
top20_authors_number_books = df['Author'].value_counts().head(20)
top20_authors_number_books

# Rows written by the 20 most frequently ranked authors, restricted to the
# columns needed for the per-author averages below.
is_top20_author = df['Author'].isin(top20_authors_number_books.index)
top20_authors_columns = df.loc[
    is_top20_author, ['Author', 'User Rating', 'Reviews', 'Price']
]
top20_authors_columns

# Per-author means, most expensive authors first.
# 'Author' is the grouping key and becomes the index of the result, so it is
# deliberately left out of the column selection. Including it asks pandas to
# average a non-numeric column (presumably author names), which raises
# TypeError on pandas >= 2.0, where ``numeric_only`` defaults to False.
top20_authors_columns_grouped = (
    top20_authors_columns
    .groupby('Author')[['User Rating', 'Reviews', 'Price']]
    .mean()
    .sort_values(by='Price', ascending=False)
)
top20_authors_columns_grouped

Genre

# Overall number of rows per genre.
fig = px.histogram(df, x='Genre', title='Genre', color='Genre')
fig.show()

# Genre counts per year: 'Year' becomes a column, 'Genre' stays as the index.
# The counts Series is named explicitly *before* reset_index. pandas >= 2.0
# names the value_counts() result 'count' rather than 'Genre', so the former
# ``.rename(columns={'Genre': 'Genre Count'})`` silently did nothing there.
genre_by_year = (
    df.groupby('Year')['Genre']
    .value_counts()
    .rename('Genre Count')
    .reset_index(level=0)
)
genre_by_year

# Same information as a stacked histogram, one bar per year.
genre_by_year_hist = px.histogram(
    df,
    x='Year',
    color='Genre',
    pattern_shape='Genre',
)
genre_by_year_hist.show()

User Rating

# Summary statistics of the user ratings.
df['User Rating'].describe()

# Distribution of user ratings with a box plot in the margin.
# hover_data is given as a list of column names (the documented form)
# rather than a DataFrame slice, which only worked because iterating a
# DataFrame yields its column labels.
fig = px.histogram(
    df,
    x='User Rating',
    title='User Rating',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()

# User rating of every row by year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='User Rating',
    title='User Rating per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()

Reviews

# Summary statistics of the review counts.
df['Reviews'].describe()

# Distribution of review counts with a box plot in the margin.
# hover_data is given as a list of column names (the documented form)
# rather than a DataFrame slice, which only worked because iterating a
# DataFrame yields its column labels.
fig = px.histogram(
    df,
    x='Reviews',
    title='Reviews',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()

# Review count of every row by year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='Reviews',
    title='Number of Reviews per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()

Price

# Summary statistics of the prices.
df['Price'].describe()

# Distribution of prices with a box plot in the margin.
# hover_data is given as a list of column names (the documented form)
# rather than a DataFrame slice, which only worked because iterating a
# DataFrame yields its column labels.
fig = px.histogram(
    df,
    x='Price',
    title='Price',
    marginal='box',
    hover_data=['Name', 'Author'],
)
fig.show()

# The 12 cheapest rows.
df.sort_values(by='Price').head(12)

# Price of every row by year, coloured by genre.
fig = px.scatter(
    df,
    x='Year',
    y='Price',
    title='Price of Books per Year',
    color='Genre',
    hover_data=['Name', 'Author'],
)
fig.show()

Correlations

# Pairwise correlation between the three numeric variables.
df[['User Rating', 'Reviews', 'Price']].corr()

# Scatter-plot matrix of the same variables, coloured by genre.
# hover_data is given as a list of column names (the documented form)
# rather than a DataFrame slice, which only worked because iterating a
# DataFrame yields its column labels.
fig = px.scatter_matrix(
    df,
    dimensions=['User Rating', 'Reviews', 'Price'],
    color='Genre',
    hover_data=['Name', 'Author'],
    opacity=0.6,
    width=1000,
    height=1000,
    title='Correlations',
)
fig.show()

Exploratory Data Analysis of the top 50 bestselling Amazon books, 2009-2019