import urllib.request as Req
from zipfile import ZipFile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
# import plotly.offline as pyo
# pyo.init_notebook_mode()
# The DataSet comes from this link.
# https://www.kaggle.com/datasets/nikdavis/steam-store-games?select=steam.csv
url = r'https://storage.googleapis.com/kaggle-data-sets/227895/490308/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220719%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220719T185204Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=99634b254d574ff871661e8efcfe3b1f56b8dc9600aca3fa0b55dcd3d5561f90a00292cd68b6a5f55f04ef6002c18b83c420681a6d0b6fe166b679e931172b982907d8da4a4a0f4ba22e4e7c92adf4f8fb41d742b6388ae6661d53f558b289ebba7d24354d90805bde0e37e06497a7d8c0744df384cd658791de8e33d7f026a9744c4e48d74516f053ceccf23e044a9c69e0871841ee50cbb952008955336e87452d41509349c0af6d9b69dca9960a23aaf2bda371a2bf227664ebedb4402e817708c32d4ea65ee21ffbd80c70a89d55fb216c45c14d705f611b351c4ea5cf3b2edd95ff32e2e3bae8316843e19ca7faef2fe78158f5315a880af5e3cde409e3'
# Req.urlretrieve(url, r'steam_games.zip')
## You can comment all the previous lines and directly execute the following ones to
## save time, since the entire dataset is 36,052 KB.
steam_zip = ZipFile(r'steam_games.zip')
steam_zip.filelist
archivo = steam_zip.open('steam.csv')
archivo_support = steam_zip.open('steam_support_info.csv')
df = pd.read_csv(archivo)
df.head()
df_support = pd.read_csv(archivo_support)
df_support.head()
df.duplicated().any()
df.duplicated().unique()
df.columns
df_support.duplicated().any()
df_support.duplicated().unique()
df_support.columns
len(df['appid'].unique())
df.drop(columns='appid', inplace=True)
df_support.drop(columns='steam_appid', inplace=True)
df.select_dtypes(include=[object])
df.select_dtypes(include=[object]).columns
cols = ['platforms', 'categories', 'genres', 'steamspy_tags']
df[cols] = df[cols].apply(lambda x: x.str.lower())
df.select_dtypes(include=[object])
df['owners'].unique()
# pat_inferior = '-.*'
# owners_inferior = df['owners'].str.replace(pat_inferior, '', regex=True).apply(int)
pat_superior = '.*-'
owners_superior = df['owners'].str.replace(pat_superior, '', regex=True).apply(int)
df['owners_median'] = owners_superior / 2
df['owners_median'].apply(int)
df.drop(columns='owners', inplace=True)
df['english'].unique()
map_dict = {0: 'non english', 1: 'english'}
df['english'] = df['english'].map(map_dict)
# df['categories'].value_counts().count()
len(df['categories'].unique())
df_categories = df[['categories']].copy()
pat_inferior_categories = ';.*'
categories = df['categories'].str.replace(pat_inferior_categories, '', regex=True)
# genres.value_counts()
df['categories'] = categories
df.head()
df['genres'].value_counts().count()
df_genres = df[['genres']].copy()
pat_inferior_genres = ';.*'
genres = df['genres'].str.replace(pat_inferior_genres, '', regex=True)
# genres.value_counts()
df['genres'] = genres
df.head()
df.info()
df['release_date'] = pd.to_datetime(df['release_date']) # Transforma Dtype a datetime64[ns]
df.info()
# df['release_date'] = df['release_date'].dt.strftime('%d/%m/%Y') # ordena el formato de la fecha, pero se pierde el formato datetime64[ns]
df.describe()
print('''It is not necessary to eliminate the outliers, it is better to apply other techniques so that they do not affect
our conclusions, if required.''')
df.isna()
df.isna().any()
df_support.isna()
df_support.isna().any()
len(df_support[df_support['support_url'].isna()])
len(df_support[df_support['website'].isna()])
len(df_support[df_support['support_email'].isna()])
df_support['support_url'].fillna('none', inplace=True)
len(df_support[df_support['support_url'].isna()])
df_support['website'].fillna('none', inplace=True)
len(df_support[df_support['website'].isna()])
df_support['support_email'].fillna('none', inplace=True)
len(df_support[df_support['support_email'].isna()])
df_support.isna().any()
df.info()
df['mas_jugados'] = df['owners_median'] * df['median_playtime']
mas_jugados = df.sort_values(by='mas_jugados',
ascending=False)[['name', 'mas_jugados', 'median_playtime', 'owners_median']][:10].set_index('name')
mas_jugados
df.groupby('genres')['owners_median'].sum().max()
mas_vendido = df.groupby('genres')['owners_median'].sum().sort_values(ascending=False)[:5]
# mas_vendido
# mas_vendido.index
fig = px.bar(x=mas_vendido.index,
y=mas_vendido,
color=mas_vendido.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Video game genres with the most owners',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Genres",
yaxis_title="Games sales")
fig.show()
mas_reviews = df.groupby('genres')['positive_ratings'].mean().apply(int).sort_values(ascending=False)[:10]
# mas_reviews
fig = px.bar(x=mas_reviews.index,
y=mas_reviews,
color=mas_reviews.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Chart of video game genres with the most reviews on average',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Genres",
yaxis_title="Games sales")
fig.show()
juegos_vendidos = df.groupby('name')['owners_median'].sum().sort_values(ascending=False)[:5]
fig = px.bar(x=juegos_vendidos.index,
y=juegos_vendidos,
color=juegos_vendidos.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='TOP games that have more owners',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Desarrollador",
yaxis_title="Juegos vendidos")
fig.show()
mas_vendidos = df.groupby('developer')['owners_median'].sum().sort_values(ascending=False)[:5]
fig = px.bar(x=mas_vendidos.index,
y=mas_vendidos,
color=mas_vendidos.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='TOP of developers who have more owners in their games',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Desarrollador",
yaxis_title="Juegos vendidos")
fig.show()
df['ganancias_totales'] = df['price'] * df['owners_median']
juegos_recaudacion = df.groupby('name')['ganancias_totales'].sum().sort_values(ascending=False)[:5]
fig = px.bar(x=juegos_recaudacion.index,
y=juegos_recaudacion,
color=juegos_recaudacion.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='TOP games that have raised the most',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Desarrollador",
yaxis_title="Dolares totales juego")
fig.show()
mas_gana_developers = df.groupby('developer')['ganancias_totales'].sum().sort_values(ascending=False)[:5]
mas_gana_publisher = df.groupby('publisher')['ganancias_totales'].sum().sort_values(ascending=False)[:5]
fig1 = px.bar(x=mas_gana_developers.index,
y=mas_gana_developers,
color=mas_gana_developers.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig1.update_layout(title='Developers who have earned the most money from games sold',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Desarrollador",
yaxis_title="Dolares totales desarrollador")
fig1.show()
fig2 = px.bar(x=mas_gana_publisher.index,
y=mas_gana_publisher,
color=mas_gana_publisher.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig2.update_layout(title='Publisher who have earned the most money for games sold',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Publisher",
yaxis_title="Dolares totales desarrollador")
fig2.show()
# cantidad_desarrollados = df.groupby('developer')['developer'].size().sort_values(ascending=False)[:5]
# cantidad_desarrollados = df['developer'].value_counts()[:10]
cantidad_desarrollados = df.groupby('developer')['developer'].count().sort_values(ascending=False)[:10]
fig = px.bar(x=cantidad_desarrollados.index,
y=cantidad_desarrollados,
color=cantidad_desarrollados.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Chart of developers who have developed the most number of games',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Meses",
yaxis_title="Cantidad de juegos")
fig.show()
day_games = df.groupby('release_date')['release_date'].count().sort_values(ascending=False)[:5]
day_games
# df.groupby(['release_date', pd.Grouper(key='date', freq='M')])
montly_games = df.groupby(df['release_date'].dt.strftime('%B'))['genres'].count().sort_values(ascending=False)
## Fuente: https://foroayuda.es/como-puedo-agrupar-por-mes-desde-un-campo-de-fecha-usando-python-pandas/
fig = px.bar(x=montly_games.index,
y=montly_games,
color=montly_games.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Month in which more games have been developed',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Meses",
yaxis_title="Cantidad de juegos")
fig.show()
year_games = df.groupby(df['release_date'].dt.strftime('%Y'))['genres'].count().sort_values(ascending=False)
fig = px.bar(x=year_games.index,
y=year_games,
color=year_games.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Year in which more games have been developed',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Años",
yaxis_title="Cantidad de juegos")
fig.show()
df.groupby('genres')[['positive_ratings', 'negative_ratings']].mean().sort_values(by='positive_ratings' ,ascending=False)
achievements, non_achievements = len(df[df['achievements'] != 0]), len(df[df['achievements'] == 0])
fig = px.pie(values=[achievements, non_achievements],
names=['achievements', 'non achievements'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Reviews',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
evolution_achievements = df.groupby('release_date')['achievements'].count()
# fig = px.bar(x=evolution_achievements.index,
# y=evolution_achievements,
# opacity=0.8)
# fig.update_layout(title='Evolution over time of the amount of achievements in the games',
# font=dict(family='verdana', size=16, color='white'),
# template='plotly_dark',
# height=500,
# width=900,
# legend_bgcolor='rgb(117, 112, 179)',
# xaxis_title="Años",
# yaxis_title="Cantidad de juegos")
# fig.show()
year_achievements = df.groupby(df['release_date'].dt.strftime('%Y'))['achievements'].count()
fig = px.bar(x=year_achievements.index,
y=year_achievements,
color=year_achievements.index,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Annual evolution of the number of game achievements',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Años",
yaxis_title="Cantidad de juegos")
fig.show()
df[['name',
'negative_ratings',
'positive_ratings']].query('negative_ratings > positive_ratings').sort_values(by='negative_ratings', ascending=False)[:5]
df[['name',
'positive_ratings',
'negative_ratings']].query('positive_ratings > negative_ratings').sort_values(by='positive_ratings', ascending=False)[:5]
comparison_review = ['negative_ratings > positive_ratings',
'positive_ratings > negative_ratings',
'positive_ratings == negative_ratings']
list_review = [len(df[['name', 'positive_ratings', 'negative_ratings']].query(comparison)) for comparison in comparison_review]
fig = px.pie(values=list_review,
names=comparison_review,
color_discrete_sequence=['rgb(175, 100, 88)', '#479B55', '#FA0087'])
fig.update_layout(title='Reviews',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
cantidad_desarrollados = df.groupby('developer')['developer'].count().sort_values(ascending=False)[:10]
linea = pd.DataFrame(dict(x=[0,500000],
y=[0,500000]))
linea
fig1 = px.scatter(df,
x='negative_ratings',
y='positive_ratings',
hover_data=['name'],
color='genres',
color_discrete_sequence=px.colors.qualitative.Light24)
fig2 = px.line(linea,
x='x',
y='y')
fig2.update_traces(line_color='white')
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.update_layout(title='Reviews positive/negative',
xaxis_title='Positivo',
yaxis_title='Negativas',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig3.show()
free, non_free = len(df[df['price'] == 0]), len(df[df['price'] != 0])
fig = px.pie(values=[free, non_free],
names=['Free', 'Non free'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Free vs Non Free',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
english, non_english = len(df[df['english'] == 'english']), len(df[df['english'] == 'non english'])
fig = px.pie(values=[english, non_english],
names=['English', 'Non English'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='English vs Non English',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
non_required_age, required_age = len(df[df['required_age'] == 0]), len(df[df['required_age'] != 0])
fig = px.pie(values=[non_required_age, required_age],
names=['Non required age', 'Required age'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Required age vs Non required age',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
df.required_age.value_counts()[1:]
fig = px.pie(values=df.required_age.value_counts()[1:],
names=['18+', '16+', '12+', '7+', '3+'],
color_discrete_sequence=px.colors.qualitative.Light24)
fig.update_layout(title='Required age',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
platform_set = set()
for i in df['platforms'].str.split(';'):
platform_set.update(i)
for platform in platform_set:
df[platform] = df['platforms'].apply(lambda x: platform in x)
df.drop(columns='platforms', inplace=True)
true_list_0 = [df[platform].values.sum() for platform in platform_set]
fig = px.bar(x=list(platform_set),
y=true_list_0,
color=list(platform_set),
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Platforms for which Steam games are available',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Plataformas",
yaxis_title="Cantidad")
fig.show()
df_categories['multiplayer'] = df_categories['categories'].apply(lambda x: 'multi-player' in x.lower())
fig = px.bar(x=['No-multiplayer','Multiplayer'],
y=df_categories['multiplayer'].value_counts().unique(),
color=['No-multiplayer','Multiplayer'],
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Games with all types of multiplayer vs individual games',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Plataformas",
yaxis_title="Cantidad")
fig.show()
genres_set = set()
for i in df_genres['genres'].str.split(';'):
genres_set.update(i)
for genre in genres_set:
df_genres[genre] = df_genres['genres'].apply(lambda x: genre in x)
df_genres.drop(columns='genres', inplace=True)
true_list = [df_genres[genre].values.sum() for genre in df_genres.columns]
fig = px.bar(x=df_genres.columns,
y=true_list,
color=df_genres.columns,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Presence of each genre in Steam games',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=700,
width=1500,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Generos",
yaxis_title="Cantidad",
xaxis={'categoryorder':'min descending'})
fig.show()
categorie_set = set()
for i in df_categories['categories'].str.split(';'):
categorie_set.update(i)
for categorie in categorie_set:
df_categories[categorie] = df_categories['categories'].apply(lambda x: categorie in x)
df_categories.drop(columns='categories', inplace=True)
true_list_2 = [df_categories[categorie].values.sum() for categorie in df_categories.columns]
fig = px.bar(x=df_categories.columns,
y=true_list_2,
color=df_categories.columns,
opacity=0.8,
color_discrete_sequence=px.colors.qualitative.Dark24)
fig.update_layout(title='Presence of each genre in Steam games',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=700,
width=1500,
legend_bgcolor='rgb(117, 112, 179)',
xaxis_title="Categorias",
yaxis_title="Cantidad",
xaxis={'categoryorder':'min descending'})
fig.show()
columns_corr_high = ['required_age', 'achievements', 'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime', 'owners_median', 'price']
df[columns_corr_high].corr(method ='pearson').style.background_gradient(cmap = 'coolwarm', axis=None)
fig1 = px.imshow(df[columns_corr_high].corr(),
color_continuous_scale='Portland')
fig1.update_layout(title = 'Correlation heatmaps of our dataset by Pearson method',
font = dict(family = 'verdana', size = 16, color = 'white'),
template = 'plotly_dark',
height = 750,
width = 900)
fig1.show()
df[columns_corr_high].corr()[['owners_median']].sort_values(by='owners_median', ascending=False).style.background_gradient(cmap = 'coolwarm',axis=None)
fig1 = px.violin(df,
x='genres',
y='owners_median',
color='genres',
box=True,
points='all',
hover_data=df.columns,
color_discrete_sequence=px.colors.qualitative.Light24)
fig1.update_layout(title = 'Graph to determine the effect of the variable owners ine unique gender',
font = dict(family = 'verdana', size = 16, color = 'white'),
template = 'plotly_dark',
height = 500,
width = 900,
legend_bgcolor='rgb(117, 112, 179)')
fig1.show()
df_support.head()
non_website_game, website_game = len(df_support[df_support['website'] == 'none']), len(df_support[df_support['website'] != 'none'])
fig = px.pie(values=[non_website_game, website_game],
names=['Non website game', 'Website game'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Website game vs Non website game',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
fig.show()
non_support_email_game, support_email_game = len(df_support[df_support['support_email'] == 'none']), len(df_support[df_support['support_email'] != 'none'])
fig = px.pie(values=[non_support_email_game, support_email_game],
names=['Non support email game', 'Support email game'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Support email game vs Non support email game',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')
non_support_url_game, support_url_game = len(df_support[df_support['support_url'] == 'none']), len(df_support[df_support['support_url'] != 'none'])
fig = px.pie(values=[non_support_url_game, support_url_game],
names=['Non support url game', 'Support url game'],
color_discrete_sequence=['#479B55', '#FA0087'])
fig.update_layout(title='Support url game vs Non support url game',
font=dict(family='verdana', size=16, color='white'),
template='plotly_dark',
height=500,
width=900,
legend_bgcolor='rgb(117, 112, 179)')