#Yandex JupyterHub plotly version is 4.1.0, which has no heatmap plot
!pip install plotly==5.1.0
Requirement already satisfied: plotly==5.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (5.1.0)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from plotly==5.1.0) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotly==5.1.0) (8.0.1)
WARNING: You are using pip version 21.2.2; however, version 21.2.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import pandas as pd
import numpy as np
import scipy
from scipy import stats as st
from scipy.stats import levene
import math
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
# Load the raw games dataset.
# Catch only the errors read_csv can realistically raise — a bare `except:`
# would also hide programming errors (NameError, KeyboardInterrupt, ...).
try:
    game_raw = pd.read_csv('/work/games.csv')
except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    print("Some error")
    print(err)  # surface the cause instead of swallowing it silently
else:
    print("No exception, everything good")
No exception, everything good
# First look at the raw data: a random handful of rows, the numeric
# summary statistics, and the dtype / non-null profile of each column.
game_raw.sample(n=5)
game_raw.describe()
game_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 16713 non-null object
1 Platform 16715 non-null object
2 Year_of_Release 16446 non-null float64
3 Genre 16713 non-null object
4 NA_sales 16715 non-null float64
5 EU_sales 16715 non-null float64
6 JP_sales 16715 non-null float64
7 Other_sales 16715 non-null float64
8 Critic_Score 8137 non-null float64
9 User_Score 10014 non-null object
10 Rating 9949 non-null object
dtypes: float64(6), object(5)
memory usage: 1.4+ MB
#Estimate the proportion of missing values in each column
(game_raw.isnull().sum() / game_raw.shape[0] * 100).round(2)
#check duplicated rows
game_raw.duplicated().sum()
#Check Datatyoe in user_score
game_raw.User_Score.apply(type).unique()
#Check Datatyoe in rating
game_raw.Rating.apply(type).unique()
#Check Datatyoe in critic_score
game_raw.Critic_Score.apply(type).unique()
game_raw.columns = map(str.lower, game_raw.columns)
game_raw.columns
game_raw.dropna(subset=['year_of_release'], inplace=True)
game_raw.dropna(subset=['name'], inplace=True)
game_raw.isnull().sum()
#Check percent of each values in user_score columns
game_raw['critic_score'].value_counts()/len(game_raw) * 100
#Check percent of each values in user_score columns
game_raw['user_score'].value_counts()/len(game_raw) * 100
game_raw['user_score'] = game_raw['user_score'].replace(['tbd'], np.nan)
#Change 'user_score' to integer first
game_raw['user_score'] = pd.to_numeric(game_raw['user_score'], errors='coerce')
game_raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16444 entries, 0 to 16714
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 16444 non-null object
1 platform 16444 non-null object
2 year_of_release 16444 non-null float64
3 genre 16444 non-null object
4 na_sales 16444 non-null float64
5 eu_sales 16444 non-null float64
6 jp_sales 16444 non-null float64
7 other_sales 16444 non-null float64
8 critic_score 7983 non-null float64
9 user_score 7463 non-null float64
10 rating 9768 non-null object
dtypes: float64(7), object(4)
memory usage: 1.5+ MB
# Fill missing scores with the per-title mean (same game on several
# platforms shares scores).  groupby(...).transform('mean') keeps the
# result aligned with the frame's index — groupby(...).apply(lambda ...)
# can return a differently-indexed series in newer pandas — and is faster.
game_raw['user_score'] = game_raw['user_score'].fillna(
    game_raw.groupby('name')['user_score'].transform('mean'))
game_raw['critic_score'] = game_raw['critic_score'].fillna(
    game_raw.groupby('name')['critic_score'].transform('mean'))

# NaNs that remain belong to titles with no scored copy at all.
print('No. of NaN in user score: ', game_raw.user_score.isna().sum())
print('No. of NaN in critic_score: ', game_raw.critic_score.isna().sum())
No. of NaN in user score: 7920
No. of NaN in critic_score: 7522
# Second imputation pass: remaining NaNs are filled with the mean score
# of the game's *genre* (the original comment said "name", but the
# grouping here is by genre).  transform keeps index alignment.
game_raw['user_score'] = game_raw['user_score'].fillna(
    game_raw.groupby('genre')['user_score'].transform('mean'))
game_raw['critic_score'] = game_raw['critic_score'].fillna(
    game_raw.groupby('genre')['critic_score'].transform('mean'))

# Expect zero: every remaining row has a genre (see info() above).
print('No. of NaN in user score: ', game_raw.user_score.isna().sum())
print('No. of NaN in critic_score: ', game_raw.critic_score.isna().sum())
No. of NaN in user score: 0
No. of NaN in critic_score: 0
# Imputed scores carry long decimals; round back to one decimal place,
# the precision used in the source data.
for score_col in ('user_score', 'critic_score'):
    game_raw[score_col] = game_raw[score_col].round(1)
game_raw['user_score'].value_counts()

# ESRB rating: value distribution and how many rows are still unrated.
game_raw.rating.value_counts()
game_raw.rating.isnull().sum()

# With the NaN years dropped, release year can be a plain integer.
game_raw['year_of_release'] = game_raw['year_of_release'].astype('int')

# Total sales = sum of the four regional sales columns, per row.
sales_cols = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales']
game_raw['total_sales'] = game_raw[sales_cols].sum(axis=1)
game_raw.sample(1)

# Final check, then work on a copy and keep the raw frame intact.
game_raw.info()
game = game_raw.copy()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16444 entries, 0 to 16714
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 16444 non-null object
1 platform 16444 non-null object
2 year_of_release 16444 non-null int64
3 genre 16444 non-null object
4 na_sales 16444 non-null float64
5 eu_sales 16444 non-null float64
6 jp_sales 16444 non-null float64
7 other_sales 16444 non-null float64
8 critic_score 16444 non-null float64
9 user_score 16444 non-null float64
10 rating 9768 non-null object
11 total_sales 16444 non-null float64
dtypes: float64(7), int64(1), object(4)
memory usage: 1.6+ MB
# Count of released games per year.
game_year = game.pivot_table(index='year_of_release', values='name',
                             aggfunc='count').reset_index()
game_year.columns = ['year', 'games_count']
game_year = game_year.sort_values('games_count', ascending=False)
game_year.head(5)

# Bar chart of releases per year.
fig = px.bar(game_year, x='year', y='games_count',
             title='Histogram of released game in each years',
             labels=dict(year='Year', games_count="Number of game"))
fig.show()

# Top-10 platforms by lifetime total sales.
top_10 = (game.pivot_table(index=['platform'], values='total_sales', aggfunc='sum')
          .reset_index()
          .sort_values("total_sales", ascending=False))
top_10 = top_10.nlargest(10, 'total_sales')
top_10.head(10)

# Yearly total sales for the top platforms only.
top_list = top_10['platform'].tolist()
only_top = game.query('platform in @top_list')
platform = only_top.pivot_table(index=['platform', 'year_of_release'],
                                values='total_sales', aggfunc='sum').reset_index()
platform.head()

# Scatter showing in which years each platform had sales.
# BUG FIX: `height=500` was originally passed *inside* the labels dict,
# where plotly treats it as a (meaningless) column-rename entry, so the
# figure height was never applied; it belongs as a px.scatter kwarg.
fig = px.scatter(platform, x='year_of_release', y='platform',
                 title='Platform sale in each year',
                 labels=dict(year_of_release='Year', platform="Platform"),
                 height=500)
fig.show()

# Area chart: total sales per platform per year.
fig = px.area(platform, x="year_of_release", y="total_sales", color="platform",
              title='Total sales from each platform per year',
              labels=dict(year_of_release='Year', platform="Platform",
                          total_sales="Total Sale"),
              line_group="platform",
              height=500)
fig.show()
# Restrict the analysis to the recent period (2012 onwards).
filtered = game.query('year_of_release >= 2012')
filtered.sort_values(by='total_sales')
filtered.head()

# Total sales per platform per year (years as rows, platforms as columns).
# aggfunc='sum' (string) rather than the builtin `sum`: it is the
# documented spelling and maps to pandas' fast internal implementation.
game_year = pd.pivot_table(filtered, index='year_of_release', columns='platform',
                           values='total_sales', aggfunc='sum', fill_value=0)
game_year.head()

# Year-over-year change: each year's sales minus the previous year's.
trend = game_year - game_year.shift(1)
trend.head(5)

# Heatmap of the yearly sales trend per platform.
fig = px.imshow(trend.T,
                labels=dict(year_of_release="Year", platform="Platform"),
                title='Sale trend each platform 2012-2016',
                height=500)
fig.update_xaxes(side="top")
fig.show()

# Lifetime (2012-2016) sales per platform.
avg_sale = filtered.groupby(['platform'])['total_sales'].sum().reset_index()
avg_sale.sort_values('total_sales', ascending=True)

# Yearly sales per platform in long format, for the boxplot below.
total_sale = filtered.groupby(['platform', 'year_of_release'])['total_sales'].sum().reset_index()
total_sale.sample(5)

# Boxplot of yearly sales per platform, ordered by total.
fig = px.box(total_sale, x="platform", y="total_sales", color='platform',
             title='Average sales on various platforms 2012-2016',
             labels=dict(total_sales='Total Sales', platform="Platform"),
             height=500).update_xaxes(categoryorder='total descending')
fig.show()
# How do user/critic scores relate to total sales on the Xbox 360?
x360 = filtered.query(' platform == "X360"')
x360.head()

# Side-by-side regression plots: user score (left), critic score (right).
fig, axs = plt.subplots(1, 2, figsize=(15, 4))
sns.regplot(data=x360, x="user_score", y="total_sales", ax=axs[0], color='r')
sns.regplot(data=x360, x="critic_score", y="total_sales", ax=axs[1], color='b')
axs[0].set_title('User Score and Total Sale 2012-2016 (Xbox 360)', fontsize=10, color='r')
axs[1].set_title('Critic Score and Total Sale 2012-2016 (Xbox 360)', fontsize=10, color='b')
plt.show()

# Correlation of sales with mean scores (pivot_table defaults to mean).
x360_corr = x360.pivot_table(index='total_sales',
                             values=['user_score', 'critic_score']).reset_index()
x360_corr.corr()

# Same check across a wider set of major platforms.
selected_plat = filtered.query(' platform in ("PS2","PS3", "X360", "Wii", "DS", "PS4","GBA")')
other_plat = selected_plat.pivot_table(index='total_sales',
                                       values=['user_score', 'critic_score']).reset_index()
other_plat.tail()

fig, axs = plt.subplots(1, 2, figsize=(15, 4))
sns.regplot(data=other_plat, x="user_score", y="total_sales", ax=axs[0], color='r')
sns.regplot(data=other_plat, x="critic_score", y="total_sales", ax=axs[1], color='b')
axs[0].set_title('User Score and Total Sale 2012-2016 (Top platforms)', fontsize=10, color='r')
axs[1].set_title('Critic Score and Total Sale 2012-2016 (Top platforms)', fontsize=10, color='b')
plt.show()
other_plat.corr()
# Top-5 platforms of the 2012-2016 period.
Pop_plat = filtered.query(' platform in ("PS4","PS3", "X360", "3DS", "XOne")')

# Regional + total sales per (platform, genre).
# aggfunc='sum' (string) instead of the builtin `sum` — the documented,
# deprecation-safe spelling.
genre_pivot = (Pop_plat.pivot_table(index=['platform', 'genre'],
                                    values=['na_sales', 'eu_sales', 'jp_sales',
                                            'other_sales', 'total_sales'],
                                    aggfunc='sum')
               .reset_index()
               .sort_values(by='total_sales', ascending=False))

# Stacked bars: total sales per genre, split by platform.
# The original passed barmode='group' and then immediately overrode it
# with update_layout(barmode='stack'); set 'stack' once, directly.
fig = px.bar(genre_pivot, x="genre", y='total_sales', barmode='stack',
             color='platform',
             title='Total sales in each genre by platform 2012-2016',
             labels=dict(total_sales='Total Sales', genre='Genre'),
             height=500)
fig.show()

# Regional + total sales per genre, all platforms.
genre_comp = (filtered.pivot_table(index=['genre'],
                                   values=['na_sales', 'eu_sales', 'jp_sales',
                                           'total_sales', 'other_sales'],
                                   aggfunc='sum')
              .reset_index()
              .sort_values(by='total_sales', ascending=False))
genre_comp.sample(5)

# Stacked bars: regional sales per genre.
fig = px.bar(genre_comp, x="genre", y=['eu_sales', 'jp_sales', 'na_sales', 'other_sales'],
             barmode='stack',
             title='Total sales in each platform by region 2012-2016',
             labels=dict(value='Total Sales', genre='Genre'),
             height=500)
fig.show()
# Regional + total sales per platform; keep the top five by total.
# aggfunc='sum' (string) instead of the builtin `sum`.
region = (filtered.pivot_table(index=['platform'],
                               values=['na_sales', 'eu_sales', 'jp_sales',
                                       'other_sales', 'total_sales'],
                               aggfunc='sum')
          .reset_index()
          .sort_values(by='total_sales', ascending=False))
top_plat_region = region.head(5)
top_plat_region

# Stacked bars: regional sales for the top five platforms.
# barmode='stack' set directly instead of 'group' overridden afterwards.
fig = px.bar(top_plat_region, x="platform",
             y=['eu_sales', 'jp_sales', 'na_sales', 'other_sales'],
             barmode='stack',
             title='Total sales in each platform by region 2012-2016',
             labels=dict(value='Total Sales', platform='Platform'),
             height=500)
fig.show()

# Market share of the top platforms: one pie per region.
labels = top_plat_region['platform']
# 'domain'-type subplot cells are required for pie traces.
fig = make_subplots(rows=1, cols=4,
                    specs=[[{'type': 'domain'}, {'type': 'domain'},
                            {'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=top_plat_region['eu_sales'], name="Europe Sale"), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=top_plat_region['jp_sales'], name="Japan Sale"), 1, 2)
fig.add_trace(go.Pie(labels=labels, values=top_plat_region['na_sales'], name="North America Sale"), 1, 3)
fig.add_trace(go.Pie(labels=labels, values=top_plat_region['other_sales'], name="Other Sale"), 1, 4)
fig.update_layout(
    title_text="Market share in each region 2012-2016",
    # Region captions positioned above each pie.
    annotations=[dict(text='Europe Sale', x=0.02, y=0.9, font_size=20, showarrow=False),
                 dict(text='Japan Sale', x=0.36, y=0.9, font_size=20, showarrow=False),
                 dict(text='North America Sale', x=0.76, y=0.9, font_size=20, showarrow=False),
                 dict(text='Other Sale', x=0.97, y=0.9, font_size=20, showarrow=False)])
fig.show()
# Regional + total sales per genre; keep the top five by total.
# aggfunc='sum' (string) instead of the builtin `sum`.
genre = (filtered.pivot_table(index=['genre'],
                              values=['na_sales', 'eu_sales', 'jp_sales',
                                      'other_sales', 'total_sales'],
                              aggfunc='sum')
         .reset_index()
         .sort_values(by='total_sales', ascending=False))
top_plat_genre = genre.head(5)
top_plat_genre

# Stacked bars: regional sales for the top five genres.
# barmode='stack' set directly (was 'group' overridden by update_layout).
fig = px.bar(top_plat_genre, x="genre",
             y=['eu_sales', 'jp_sales', 'na_sales', 'other_sales'],
             barmode='stack',
             title='Total sales in each genre by region 2012-2016',
             labels=dict(value='Total Sales', genre='Genre'),
             height=500)
fig.show()
# Market share of the top genres: one pie per region.
# BUG FIX: the pie values were copy-pasted from the platform chart and
# still read top_plat_region, while the labels came from top_plat_genre —
# i.e. platform sales rendered under genre labels. All values now come
# from top_plat_genre so labels and values describe the same rows.
labels = top_plat_genre['genre']
# 'domain'-type subplot cells are required for pie traces.
fig = make_subplots(rows=1, cols=4,
                    specs=[[{'type': 'domain'}, {'type': 'domain'},
                            {'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=top_plat_genre['eu_sales'], name="Europe Sale"), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=top_plat_genre['jp_sales'], name="Japan Sale"), 1, 2)
fig.add_trace(go.Pie(labels=labels, values=top_plat_genre['na_sales'], name="North America Sale"), 1, 3)
fig.add_trace(go.Pie(labels=labels, values=top_plat_genre['other_sales'], name="Other Sale"), 1, 4)
fig.update_layout(
    title_text="Market share in each region 2012-2016",
    # Region captions positioned above each pie.
    annotations=[dict(text='Europe Sale', x=0.02, y=0.9, font_size=20, showarrow=False),
                 dict(text='Japan Sale', x=0.36, y=0.9, font_size=20, showarrow=False),
                 dict(text='North America Sale', x=0.76, y=0.9, font_size=20, showarrow=False),
                 dict(text='Other Sale', x=0.97, y=0.9, font_size=20, showarrow=False)])
fig.show()
# Regional + total sales per ESRB rating.
# NOTE(review): rows with NaN rating are silently excluded by the pivot —
# intentional here, but worth remembering when comparing totals.
# aggfunc='sum' (string) instead of the builtin `sum`.
grouped_rating = (filtered.pivot_table(index=['rating'],
                                       values=['na_sales', 'eu_sales', 'jp_sales',
                                               'other_sales', 'total_sales'],
                                       aggfunc='sum')
                  .reset_index()
                  .sort_values(by='total_sales', ascending=False))
grouped_rating

# Stacked bars: regional sales per rating.
# barmode='stack' set directly (was 'group' overridden by update_layout).
fig = px.bar(grouped_rating, x="rating",
             y=['eu_sales', 'jp_sales', 'na_sales', 'other_sales'],
             barmode='stack',
             title='Total sales in each rating by region 2012-2016',
             labels=dict(value='Total Sales', rating='Rating'),
             height=500)
fig.show()
# --- Hypothesis test 1 prep: average user rating, Xbox One vs PC ---
filtered_xbox = filtered.query('platform == "XOne"')
filtered_pc = filtered.query('platform == "PC"')

# pivot_table's default aggfunc is the mean, so these hold the average
# user score per release year for each platform.
user_rating_xbox = filtered_xbox.pivot_table(index=['year_of_release', 'platform'],
                                             values=['user_score']).reset_index()
user_rating_pc = filtered_pc.pivot_table(index=['year_of_release', 'platform'],
                                         values=['user_score']).reset_index()

xbox_avg = user_rating_xbox['user_score']
pc_avg = user_rating_pc['user_score']

# Levene's test: H0 = both samples come from populations with equal
# variances; the result decides equal_var for the t-test that follows.
stat, p = levene(xbox_avg, pc_avg)
print("p-value from levene test: ", p.round(5))
p-value from levene test: 0.69916
# Significance threshold for the test.
alpha = 0.05
# Levene's p-value exceeded 0.05, so equal variances are assumed and the
# standard (pooled-variance) two-sample t-test is used.
# H0: the mean user scores of XOne and PC are equal.
results = st.ttest_ind(xbox_avg, pc_avg, equal_var=True)
print('p-value:', results.pvalue)
if results.pvalue < alpha:
    print("We reject the null hypothesis")
else:
    print("We can't reject the null hypothesis")
p-value: 0.866426147777448
We can't reject the null hypothesis
# --- Hypothesis test 2 prep: average user rating, Action vs Sports ---
filtered_action = filtered.query('genre == "Action"')
filtered_sports = filtered.query('genre == "Sports"')

# Default aggfunc is the mean -> average user score per release year.
user_rating_action = filtered_action.pivot_table(index=['year_of_release', 'genre'],
                                                 values=['user_score']).reset_index()
user_rating_sports = filtered_sports.pivot_table(index=['year_of_release', 'genre'],
                                                 values=['user_score']).reset_index()

action_avg = user_rating_action['user_score']
sports_avg = user_rating_sports['user_score']

# Levene's test for equality of variances (informs equal_var below).
stat, p = levene(action_avg, sports_avg)
print("p-value from levene test: ", p.round(5))
p-value from levene test: 0.15704
# Significance threshold for the test.
alpha = 0.05
# Levene's p-value exceeded 0.05, so equal variances are assumed.
# H0: the mean user scores of the Action and Sports genres are equal.
results = st.ttest_ind(action_avg, sports_avg, equal_var=True)
print('p-value:', results.pvalue)
if results.pvalue < alpha:
    print("We reject the null hypothesis")
else:
    print("We can't reject the null hypothesis")
p-value: 0.0010111707670376574
We reject the null hypothesis