Proyecto DataScience Steam

import urllib.request as Req from zipfile import ZipFile import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import plotly.graph_objects as go import plotly.express as px # import plotly.offline as pyo # pyo.init_notebook_mode()

# The DataSet comes from this link. # https://www.kaggle.com/datasets/nikdavis/steam-store-games?select=steam.csv url = r'https://storage.googleapis.com/kaggle-data-sets/227895/490308/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220719%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220719T185204Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=99634b254d574ff871661e8efcfe3b1f56b8dc9600aca3fa0b55dcd3d5561f90a00292cd68b6a5f55f04ef6002c18b83c420681a6d0b6fe166b679e931172b982907d8da4a4a0f4ba22e4e7c92adf4f8fb41d742b6388ae6661d53f558b289ebba7d24354d90805bde0e37e06497a7d8c0744df384cd658791de8e33d7f026a9744c4e48d74516f053ceccf23e044a9c69e0871841ee50cbb952008955336e87452d41509349c0af6d9b69dca9960a23aaf2bda371a2bf227664ebedb4402e817708c32d4ea65ee21ffbd80c70a89d55fb216c45c14d705f611b351c4ea5cf3b2edd95ff32e2e3bae8316843e19ca7faef2fe78158f5315a880af5e3cde409e3'

# Req.urlretrieve(url, r'steam_games.zip') ## You can comment all the previous lines and directly execute the following ones to ## save time, since the entire dataset is 36,052 KB.

steam_zip = ZipFile(r'steam_games.zip') steam_zip.filelist archivo = steam_zip.open('steam.csv') archivo_support = steam_zip.open('steam_support_info.csv')

df = pd.read_csv(archivo) df.head()

df_support = pd.read_csv(archivo_support) df_support.head()

df.duplicated().any()

df.duplicated().unique()

df.columns

df_support.duplicated().any()

df_support.duplicated().unique()

df_support.columns

len(df['appid'].unique())

df.drop(columns='appid', inplace=True)

df_support.drop(columns='steam_appid', inplace=True)

df.select_dtypes(include=[object])

df.select_dtypes(include=[object]).columns

cols = ['platforms', 'categories', 'genres', 'steamspy_tags']

df[cols] = df[cols].apply(lambda x: x.str.lower())

df.select_dtypes(include=[object])

df['owners'].unique()

# pat_inferior = '-.*' # owners_inferior = df['owners'].str.replace(pat_inferior, '', regex=True).apply(int)

pat_superior = '.*-' owners_superior = df['owners'].str.replace(pat_superior, '', regex=True).apply(int)

df['owners_median'] = owners_superior / 2 df['owners_median'].apply(int)

df.drop(columns='owners', inplace=True)

df['english'].unique()

map_dict = {0: 'non english', 1: 'english'} df['english'] = df['english'].map(map_dict)

# df['categories'].value_counts().count() len(df['categories'].unique())

df_categories = df[['categories']].copy()

pat_inferior_categories = ';.*' categories = df['categories'].str.replace(pat_inferior_categories, '', regex=True) # genres.value_counts()

df['categories'] = categories df.head()

df['genres'].value_counts().count()

df_genres = df[['genres']].copy()

pat_inferior_genres = ';.*' genres = df['genres'].str.replace(pat_inferior_genres, '', regex=True) # genres.value_counts()

df['genres'] = genres df.head()

df.info()

df['release_date'] = pd.to_datetime(df['release_date']) # Transforma Dtype a datetime64[ns]

df.info() # df['release_date'] = df['release_date'].dt.strftime('%d/%m/%Y') # ordena el formato de la fecha, pero se pierde el formato datetime64[ns]

df.describe()

print('''It is not necessary to eliminate the outliers, it is better to apply other techniques so that they do not affect our conclusions, if required.''')

df.isna()

df.isna().any()

df_support.isna()

df_support.isna().any()

len(df_support[df_support['support_url'].isna()])

len(df_support[df_support['website'].isna()])

len(df_support[df_support['support_email'].isna()])

df_support['support_url'].fillna('none', inplace=True) len(df_support[df_support['support_url'].isna()])

df_support['website'].fillna('none', inplace=True) len(df_support[df_support['website'].isna()])

df_support['support_email'].fillna('none', inplace=True) len(df_support[df_support['support_email'].isna()])

df_support.isna().any()

df.info()

df['mas_jugados'] = df['owners_median'] * df['median_playtime'] mas_jugados = df.sort_values(by='mas_jugados', ascending=False)[['name', 'mas_jugados', 'median_playtime', 'owners_median']][:10].set_index('name') mas_jugados

df.groupby('genres')['owners_median'].sum().max()

mas_vendido = df.groupby('genres')['owners_median'].sum().sort_values(ascending=False)[:5] # mas_vendido # mas_vendido.index

fig = px.bar(x=mas_vendido.index, y=mas_vendido, color=mas_vendido.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Video game genres with the most owners', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Genres", yaxis_title="Games sales") fig.show()

mas_reviews = df.groupby('genres')['positive_ratings'].mean().apply(int).sort_values(ascending=False)[:10] # mas_reviews

fig = px.bar(x=mas_reviews.index, y=mas_reviews, color=mas_reviews.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Chart of video game genres with the most reviews on average', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Genres", yaxis_title="Games sales") fig.show()

juegos_vendidos = df.groupby('name')['owners_median'].sum().sort_values(ascending=False)[:5]

fig = px.bar(x=juegos_vendidos.index, y=juegos_vendidos, color=juegos_vendidos.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='TOP games that have more owners', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Desarrollador", yaxis_title="Juegos vendidos") fig.show()

mas_vendidos = df.groupby('developer')['owners_median'].sum().sort_values(ascending=False)[:5]

fig = px.bar(x=mas_vendidos.index, y=mas_vendidos, color=mas_vendidos.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='TOP of developers who have more owners in their games', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Desarrollador", yaxis_title="Juegos vendidos") fig.show()

df['ganancias_totales'] = df['price'] * df['owners_median']

juegos_recaudacion = df.groupby('name')['ganancias_totales'].sum().sort_values(ascending=False)[:5]

fig = px.bar(x=juegos_recaudacion.index, y=juegos_recaudacion, color=juegos_recaudacion.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='TOP games that have raised the most', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Desarrollador", yaxis_title="Dolares totales juego") fig.show()

mas_gana_developers = df.groupby('developer')['ganancias_totales'].sum().sort_values(ascending=False)[:5] mas_gana_publisher = df.groupby('publisher')['ganancias_totales'].sum().sort_values(ascending=False)[:5]

fig1 = px.bar(x=mas_gana_developers.index, y=mas_gana_developers, color=mas_gana_developers.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig1.update_layout(title='Developers who have earned the most money from games sold', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Desarrollador", yaxis_title="Dolares totales desarrollador") fig1.show()

fig2 = px.bar(x=mas_gana_publisher.index, y=mas_gana_publisher, color=mas_gana_publisher.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig2.update_layout(title='Publisher who have earned the most money for games sold', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Publisher", yaxis_title="Dolares totales desarrollador") fig2.show()

# cantidad_desarrollados = df.groupby('developer')['developer'].size().sort_values(ascending=False)[:5] # cantidad_desarrollados = df['developer'].value_counts()[:10] cantidad_desarrollados = df.groupby('developer')['developer'].count().sort_values(ascending=False)[:10]

fig = px.bar(x=cantidad_desarrollados.index, y=cantidad_desarrollados, color=cantidad_desarrollados.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Chart of developers who have developed the most number of games', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Meses", yaxis_title="Cantidad de juegos") fig.show()

day_games = df.groupby('release_date')['release_date'].count().sort_values(ascending=False)[:5] day_games

# df.groupby(['release_date', pd.Grouper(key='date', freq='M')]) montly_games = df.groupby(df['release_date'].dt.strftime('%B'))['genres'].count().sort_values(ascending=False) ## Fuente: https://foroayuda.es/como-puedo-agrupar-por-mes-desde-un-campo-de-fecha-usando-python-pandas/

fig = px.bar(x=montly_games.index, y=montly_games, color=montly_games.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Month in which more games have been developed', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Meses", yaxis_title="Cantidad de juegos") fig.show()

year_games = df.groupby(df['release_date'].dt.strftime('%Y'))['genres'].count().sort_values(ascending=False)

fig = px.bar(x=year_games.index, y=year_games, color=year_games.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Year in which more games have been developed', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Años", yaxis_title="Cantidad de juegos") fig.show()

df.groupby('genres')[['positive_ratings', 'negative_ratings']].mean().sort_values(by='positive_ratings' ,ascending=False)

achievements, non_achievements = len(df[df['achievements'] != 0]), len(df[df['achievements'] == 0])

fig = px.pie(values=[achievements, non_achievements], names=['achievements', 'non achievements'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Reviews', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

evolution_achievements = df.groupby('release_date')['achievements'].count()

# fig = px.bar(x=evolution_achievements.index, # y=evolution_achievements, # opacity=0.8) # fig.update_layout(title='Evolution over time of the amount of achievements in the games', # font=dict(family='verdana', size=16, color='white'), # template='plotly_dark', # height=500, # width=900, # legend_bgcolor='rgb(117, 112, 179)', # xaxis_title="Años", # yaxis_title="Cantidad de juegos") # fig.show()

year_achievements = df.groupby(df['release_date'].dt.strftime('%Y'))['achievements'].count()

fig = px.bar(x=year_achievements.index, y=year_achievements, color=year_achievements.index, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Annual evolution of the number of game achievements', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Años", yaxis_title="Cantidad de juegos") fig.show()

df[['name', 'negative_ratings', 'positive_ratings']].query('negative_ratings > positive_ratings').sort_values(by='negative_ratings', ascending=False)[:5]

df[['name', 'positive_ratings', 'negative_ratings']].query('positive_ratings > negative_ratings').sort_values(by='positive_ratings', ascending=False)[:5]

comparison_review = ['negative_ratings > positive_ratings', 'positive_ratings > negative_ratings', 'positive_ratings == negative_ratings'] list_review = [len(df[['name', 'positive_ratings', 'negative_ratings']].query(comparison)) for comparison in comparison_review]

fig = px.pie(values=list_review, names=comparison_review, color_discrete_sequence=['rgb(175, 100, 88)', '#479B55', '#FA0087']) fig.update_layout(title='Reviews', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

cantidad_desarrollados = df.groupby('developer')['developer'].count().sort_values(ascending=False)[:10]

linea = pd.DataFrame(dict(x=[0,500000], y=[0,500000]))

linea

fig1 = px.scatter(df, x='negative_ratings', y='positive_ratings', hover_data=['name'], color='genres', color_discrete_sequence=px.colors.qualitative.Light24) fig2 = px.line(linea, x='x', y='y') fig2.update_traces(line_color='white') fig3 = go.Figure(data=fig1.data + fig2.data) fig3.update_layout(title='Reviews positive/negative', xaxis_title='Positivo', yaxis_title='Negativas', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig3.show()

free, non_free = len(df[df['price'] == 0]), len(df[df['price'] != 0])

fig = px.pie(values=[free, non_free], names=['Free', 'Non free'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Free vs Non Free', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

english, non_english = len(df[df['english'] == 'english']), len(df[df['english'] == 'non english'])

fig = px.pie(values=[english, non_english], names=['English', 'Non English'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='English vs Non English', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

non_required_age, required_age = len(df[df['required_age'] == 0]), len(df[df['required_age'] != 0]) fig = px.pie(values=[non_required_age, required_age], names=['Non required age', 'Required age'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Required age vs Non required age', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

df.required_age.value_counts()[1:] fig = px.pie(values=df.required_age.value_counts()[1:], names=['18+', '16+', '12+', '7+', '3+'], color_discrete_sequence=px.colors.qualitative.Light24) fig.update_layout(title='Required age', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

platform_set = set() for i in df['platforms'].str.split(';'): platform_set.update(i) for platform in platform_set: df[platform] = df['platforms'].apply(lambda x: platform in x) df.drop(columns='platforms', inplace=True) true_list_0 = [df[platform].values.sum() for platform in platform_set]

fig = px.bar(x=list(platform_set), y=true_list_0, color=list(platform_set), opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Platforms for which Steam games are available', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Plataformas", yaxis_title="Cantidad") fig.show()

df_categories['multiplayer'] = df_categories['categories'].apply(lambda x: 'multi-player' in x.lower()) fig = px.bar(x=['No-multiplayer','Multiplayer'], y=df_categories['multiplayer'].value_counts().unique(), color=['No-multiplayer','Multiplayer'], opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Games with all types of multiplayer vs individual games', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Plataformas", yaxis_title="Cantidad") fig.show()

genres_set = set() for i in df_genres['genres'].str.split(';'): genres_set.update(i) for genre in genres_set: df_genres[genre] = df_genres['genres'].apply(lambda x: genre in x) df_genres.drop(columns='genres', inplace=True) true_list = [df_genres[genre].values.sum() for genre in df_genres.columns]

fig = px.bar(x=df_genres.columns, y=true_list, color=df_genres.columns, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Presence of each genre in Steam games', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=700, width=1500, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Generos", yaxis_title="Cantidad", xaxis={'categoryorder':'min descending'}) fig.show()

categorie_set = set() for i in df_categories['categories'].str.split(';'): categorie_set.update(i) for categorie in categorie_set: df_categories[categorie] = df_categories['categories'].apply(lambda x: categorie in x) df_categories.drop(columns='categories', inplace=True) true_list_2 = [df_categories[categorie].values.sum() for categorie in df_categories.columns]

fig = px.bar(x=df_categories.columns, y=true_list_2, color=df_categories.columns, opacity=0.8, color_discrete_sequence=px.colors.qualitative.Dark24) fig.update_layout(title='Presence of each genre in Steam games', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=700, width=1500, legend_bgcolor='rgb(117, 112, 179)', xaxis_title="Categorias", yaxis_title="Cantidad", xaxis={'categoryorder':'min descending'}) fig.show()

columns_corr_high = ['required_age', 'achievements', 'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime', 'owners_median', 'price'] df[columns_corr_high].corr(method ='pearson').style.background_gradient(cmap = 'coolwarm', axis=None)

fig1 = px.imshow(df[columns_corr_high].corr(), color_continuous_scale='Portland') fig1.update_layout(title = 'Correlation heatmaps of our dataset by Pearson method', font = dict(family = 'verdana', size = 16, color = 'white'), template = 'plotly_dark', height = 750, width = 900) fig1.show()

df[columns_corr_high].corr()[['owners_median']].sort_values(by='owners_median', ascending=False).style.background_gradient(cmap = 'coolwarm',axis=None)

fig1 = px.violin(df, x='genres', y='owners_median', color='genres', box=True, points='all', hover_data=df.columns, color_discrete_sequence=px.colors.qualitative.Light24) fig1.update_layout(title = 'Graph to determine the effect of the variable owners ine unique gender', font = dict(family = 'verdana', size = 16, color = 'white'), template = 'plotly_dark', height = 500, width = 900, legend_bgcolor='rgb(117, 112, 179)') fig1.show()

df_support.head()

non_website_game, website_game = len(df_support[df_support['website'] == 'none']), len(df_support[df_support['website'] != 'none']) fig = px.pie(values=[non_website_game, website_game], names=['Non website game', 'Website game'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Website game vs Non website game', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)') fig.show()

non_support_email_game, support_email_game = len(df_support[df_support['support_email'] == 'none']), len(df_support[df_support['support_email'] != 'none']) fig = px.pie(values=[non_support_email_game, support_email_game], names=['Non support email game', 'Support email game'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Support email game vs Non support email game', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)')

non_support_url_game, support_url_game = len(df_support[df_support['support_url'] == 'none']), len(df_support[df_support['support_url'] != 'none']) fig = px.pie(values=[non_support_url_game, support_url_game], names=['Non support url game', 'Support url game'], color_discrete_sequence=['#479B55', '#FA0087']) fig.update_layout(title='Support url game vs Non support url game', font=dict(family='verdana', size=16, color='white'), template='plotly_dark', height=500, width=900, legend_bgcolor='rgb(117, 112, 179)')