import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# Load the raw games table and give the three awkwardly named columns
# identifier-friendly names in a single rename pass.
df = pd.read_csv('db/android-games.csv')
df = df.rename(
    columns={
        'total ratings': 'total_ratings',
        'growth (30 days)': 'growth_30_days',
        'growth (60 days)': 'growth_60_days',
    }
)
# Notebook-style inspection: table preview, null count per column, schema.
df
df.isnull().sum()
df.info()
# Output of df.info():
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1730 entries, 0 to 1729
# Data columns (total 15 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 rank 1730 non-null int64
# 1 title 1730 non-null object
# 2 total_ratings 1730 non-null int64
# 3 installs 1730 non-null object
# 4 average rating 1730 non-null int64
# 5 growth_30_days 1730 non-null float64
# 6 growth_60_days 1730 non-null float64
# 7 price 1730 non-null float64
# 8 category 1730 non-null object
# 9 5 star ratings 1730 non-null int64
# 10 4 star ratings 1730 non-null int64
# 11 3 star ratings 1730 non-null int64
# 12 2 star ratings 1730 non-null int64
# 13 1 star ratings 1730 non-null int64
# 14 paid 1730 non-null bool
# dtypes: bool(1), float64(3), int64(8), object(3)
# memory usage: 191.0+ KB
# Multiplier for each magnitude suffix in the raw 'installs' strings.
# 'M' (millions) is the suffix the original code tested for; 'k'/'K' are
# assumed to mark thousands -- TODO confirm against the CSV.
_INSTALL_UNIT_FACTORS = {'M': 1000000.0, 'k': 1000.0, 'K': 1000.0}


def _installs_to_count(text):
    """Convert a raw installs string such as ``"500.0 M"`` to a float count.

    The text is split on whitespace into a number and a magnitude suffix;
    the number is scaled by the factor registered for that suffix.

    Raises:
        ValueError: if the number cannot be parsed or the suffix is missing
            or not recognised, so bad rows are not silently mis-scaled.
    """
    parts = text.split()
    number = float(parts[0])
    suffix = parts[1] if len(parts) > 1 else ''
    try:
        factor = _INSTALL_UNIT_FACTORS[suffix]
    except KeyError:
        raise ValueError('unrecognised installs value: %r' % (text,)) from None
    return number * factor


# Previously both branches of the conditional multiplied by 1000000, so
# every non-'M' value was scaled as if it were millions.
df['installs'] = df['installs'].apply(_installs_to_count)
# NOTE(review): despite the new name, the values are absolute install counts
# (e.g. "500.0 M" -> 500000000.0); the name is kept because later code
# refers to it.
df = df.rename(columns={'installs': 'installs_in_million'})
df['installs_in_million'].value_counts()
# Inspect the price and paid columns, then drop 'price' and keep 'paid'.
df['price'].value_counts()
df['paid'].value_counts()
df.drop('price', axis=1, inplace=True)
df.info()
# Output of df.info():
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1730 entries, 0 to 1729
# Data columns (total 14 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 rank 1730 non-null int64
# 1 title 1730 non-null object
# 2 total_ratings 1730 non-null int64
# 3 installs_in_million 1730 non-null float64
# 4 average rating 1730 non-null int64
# 5 growth_30_days 1730 non-null float64
# 6 growth_60_days 1730 non-null float64
# 7 category 1730 non-null object
# 8 5 star ratings 1730 non-null int64
# 9 4 star ratings 1730 non-null int64
# 10 3 star ratings 1730 non-null int64
# 11 2 star ratings 1730 non-null int64
# 12 1 star ratings 1730 non-null int64
# 13 paid 1730 non-null bool
# dtypes: bool(1), float64(3), int64(8), object(2)
# memory usage: 177.5+ KB
# Relative frequency of each category, followed by a histogram of the same
# column with the x axis ordered by descending total.
df['category'].value_counts(normalize=True)
fig = px.histogram(df, x='category', title='Categorias de Juegos')
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
# Summary statistics for the rating counts. The former `include='all'`
# argument is dropped because pandas ignores it for a Series.
df['total_ratings'].describe()

# Distribution of rating counts across all games.
fig = px.histogram(
    df,
    x='total_ratings',
    title='Total Ratings sobre los Juegos',
)
fig.show()

# Box plot of the same column. hover_data is given the column names
# directly; the previous DataFrame slice only worked because iterating a
# DataFrame yields its column labels.
fig = px.box(df, x='total_ratings', hover_data=['title', 'category'])
fig.show()
# Summary statistics for the cleaned installs column.
df['installs_in_million'].describe()

# Distribution of installs across all games.
fig = px.histogram(
    df,
    x='installs_in_million',
    title='Numeros de Instalaciones en Millones de los Juegos',
)
fig.show()

# Box plot of installs. hover_data is given the column names directly; the
# previous DataFrame slice only worked because iterating a DataFrame
# yields its column labels.
fig = px.box(df, x='installs_in_million', hover_data=['title', 'category'])
# Use the 'inclusive' quartile computation instead of plotly's default.
fig.update_traces(quartilemethod='inclusive')
fig.show()
# Free vs. paid split: relative frequencies, a 60-dash rule, then raw counts.
print(
    df['paid'].value_counts(normalize=True),
    '-' * 60,
    df['paid'].value_counts(),
    sep='\n',
)
# Output:
# False 0.995954
# True 0.004046
# Name: paid, dtype: float64
# ------------------------------------------------------------
# False 1723
# True 7
# Name: paid, dtype: int64
# Number of free (False) and paid (True) games.
paid_free = df['paid'].value_counts()
# Derive each slice label from the boolean it counts instead of from its
# position: value_counts orders by frequency, so a fixed
# ['Gratis', 'Paga'] list would be swapped if paid games ever outnumbered
# free ones.
label = ['Paga' if is_paid else 'Gratis' for is_paid in paid_free.index]
fig = px.pie(
    df,
    values=paid_free.values,
    names=label,
    title='Juegos Gratis y de Pago',
)
# Show the percentage and the label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Mean number of ratings per category, drawn as bars with the x axis
# ordered by descending total.
total_rating_by_category = df.groupby('category')['total_ratings'].mean()
total_rating_by_category
fig = px.bar(
    total_rating_by_category,
    x=total_rating_by_category.index,
    y=total_rating_by_category.values,
    labels={'y': 'Total Ratings'},
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
# Mean of the installs column per category, drawn as bars with the x axis
# ordered by descending total.
install_by_category = df.groupby('category')['installs_in_million'].mean()
install_by_category
fig = px.bar(
    install_by_category,
    x=install_by_category.index,
    y=install_by_category.values,
    title='Numeros de juegos instalados por categoria',
    labels={'y': 'Total en millones'},
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
# Mean 30-day growth per category, drawn as bars with the x axis ordered
# by descending total.
growth_by_category_30 = df.groupby('category')['growth_30_days'].mean()
growth_by_category_30
fig = px.bar(
    growth_by_category_30,
    x=growth_by_category_30.index,
    y=growth_by_category_30,
    title='Crecimiento de las categorias en 30 dias',
    labels={'y': 'Crecimiento en 30 dias'},
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
# Mean 60-day growth per category, drawn as bars with the x axis ordered
# by descending total.
growth_by_category_60 = df.groupby('category')['growth_60_days'].mean()
growth_by_category_60
fig = px.bar(
    growth_by_category_60,
    x=growth_by_category_60.index,
    y=growth_by_category_60,
    title='Crecimiento de las categorias en 60 dias',
    labels={'y': 'Crecimiento en 60 dias'},
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
# Keep only rows whose rank is 1, 2 or 3, restricted to the columns used by
# the plots that follow.
top_ranked_games = df.loc[
    df['rank'] < 4,
    [
        'rank',
        'title',
        'category',
        'total_ratings',
        'installs_in_million',
        '5 star ratings',
    ],
]
top_ranked_games
# One point per top-ranked game: total ratings on x, title on y, coloured by
# category. hover_data is given the column names directly; the previous
# DataFrame slice only worked because iterating a DataFrame yields its
# column labels.
fig = px.scatter(
    top_ranked_games,
    y='title',
    x='total_ratings',
    hover_data=['category', 'rank'],
    color='category',
    title='Top 3 de Juegos por Categoria',
    labels={
        "title": "Juegos",
        "total_ratings": "Total Rating",
    },
)
fig.show()
# One point per top-ranked game: 5-star rating count on x, title on y,
# coloured by category. hover_data is given the column names directly; the
# previous DataFrame slice only worked because iterating a DataFrame yields
# its column labels.
fig = px.scatter(
    top_ranked_games,
    y='title',
    x='5 star ratings',
    hover_data=['category', 'rank'],
    color='category',
    title='Top 3 de Juegos con 5 Estrellas en Rankings',
    labels={
        "title": "Juegos",
        "5 star ratings": "5 star ratings",
    },
)
fig.show()
df.columns
# Sort the games by installs, largest first, and keep the first 20 rows.
top_20 = df.sort_values(by='installs_in_million', ascending=False).head(20)
top_20
# Bar per game, coloured by category, with rank and 5-star count on hover;
# the x axis is ordered by descending total.
fig = px.bar(
    top_20,
    x='title',
    y='installs_in_million',
    hover_data=['rank', '5 star ratings'],
    color='category',
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()