# libraries imports
import pandas as pd
import datetime as dt
import plotly.express as exp
import numpy as np
# changing the plotting backend for better
pd.options.plotting.backend = "plotly"
# loading dataset
dataset = pd.read_csv('game_info.csv')
dataset
dataset["genres"].unique()
# get all uniques in a column with multiple values per case
def get_uniques_from_column(column, sep: str = '||', remove_under: int = None) -> list:
"""
Get unique values from columns containing multiple values in the same raw.
column: (iterable) container for your values
sep: (string) separator for the content of each element in the column
remove_under: (int) prevent from returning a unique value if its occurs less than remove_under times in column.
Returns a list containing once each unique value contained in the column.
"""
#print("column:", len(column))
uniques = []
for elem in column:
if sep in str(elem):
parsed = elem.split(sep)
else:
parsed = [elem]
uniques.extend(parsed)
#print("uniques:", len(uniques))
set_uniques = list(set(uniques)) # converting to a set automatically removes duplicates from the parsed values
#print("set_uniques:", set_uniques)
if remove_under is not None:
counts = {i:uniques.count(i) for i in set_uniques}
#print("counts:", counts)
for key, val in counts.items():
if val < remove_under:
set_uniques.remove(key)
return set_uniques
#get_uniques_from_column(dataset["platforms"]) # giving the Series.unique() reduces the number of values to process, but should not be used with argument remove_under
# First, remove unwanted/useless columns
dataset.drop(
[
"id", # unique id on RAWG
"slug", # unique slug on RAWG
"website", # official website of the game
"tba", # to be announced
"updated",
"rating_top",
"added_status_yet",
"added_status_owned",
"added_status_beaten",
"added_status_toplay",
"added_status_dropped",
"added_status_playing"],
inplace=True,
axis=1
)
# Rename oddly named or unclear column names
dataset.rename(
{
"esrb_rating": "esrb_category", # could be confused with apprecition rating
},
inplace=True,
axis=1
)
dataset
# Formatting date columns to manipulate them more easily
dataset["released"] = pd.to_datetime(dataset["released"], format="%Y-%m-%d")
prev_len = len(dataset)
dataset = dataset[(dataset["released"] <= dt.datetime.today())] # removing games released after today
print(f"Removed {prev_len - len(dataset)} games that are not released yet.")
Removed 24328 games that are not released yet.
# check for inconsistencies
erroneous_ratings = dataset[(dataset["ratings_count"] == 0) & (dataset["rating"] != 0)]
print(f"There are {len(erroneous_ratings)} games with inconsistencies in their RAWG ratings.")
# removing games without any critics
prev_len = len(dataset)
dataset = dataset[(dataset["ratings_count"] == 0) & dataset["metacritic"].isnull() & dataset["reviews_count"] <= 0] # removing games without rating
print(f"Removed {prev_len - len(dataset)} games that have no ratings.")
dataset
There are 0 games with inconsistencies in their RAWG ratings.
Removed 628 games that have no ratings.
# Check the columns' dtypes
dataset.dtypes
dataset.isnull().sum()
dataset["rating"] = dataset["rating"].replace(0, np.nan)
def fill_empty_name(name):
if str(name) == "nan":
name = "Unknown game " + str(fill_empty_name.counter)
fill_empty_name.counter += 1
return name
fill_empty_name.counter = 0 # static variable to give each game a unique name
# fill empty names
dataset["name"] = dataset["name"].apply(fill_empty_name)
print(f"There are {dataset['name'].isnull().sum()} empty names left.")
There are 0 empty names left.
import matplotlib.pyplot as plt
f = plt.figure(figsize=(12, 7))
plt.matshow(dataset.corr(), fignum=f.number)
plt.xticks(range(dataset.select_dtypes(['number']).shape[1]), dataset.select_dtypes(['number']).columns, fontsize=12, rotation=75)
plt.yticks(range(dataset.select_dtypes(['number']).shape[1]), dataset.select_dtypes(['number']).columns, fontsize=12)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=12)
plt.title('Correlation Matrix', fontsize=16);
fig = exp.scatter(x="rating", y="metacritic", data_frame=dataset, trendline="ols") # ols = Ordinary Least Squares
fig.data = [t for t in fig.data if t.mode == "lines"]
fig
dataset["release_year"] = pd.DatetimeIndex(dataset['released']).year
by_release = dataset[["release_year", "metacritic", "rating"]].groupby("release_year").describe()
by_release["metacritic"].plot.line(y=["count"], title="Count of Metacritic ratings by release year")
by_release["rating"].plot.line(y=["count"], title="Count of RAWG ratings by release year")
prev_len = len(by_release["rating"])
by_release = by_release.drop(by_release[(by_release['rating']['count'] <= 10)].index)
#by_release["rating"] = by_release["rating"][(by_release["rating"]["count"] >= 10)]
print(f"Removed {prev_len - len(by_release['rating'])} years that do not have enough games rated on RAWG.")
prev_len = len(by_release["metacritic"])
by_release = by_release.drop(by_release[(by_release['metacritic']['count'] <= 10)].index)
#by_release["metacritic"] = by_release["metacritic"][(by_release["metacritic"]["count"] >= 10)]
print(f"Removed {prev_len - len(by_release['metacritic'])} years that do not have enough games rated on Metacritic.")
Removed 13 years that do not have enough games rated on RAWG.
Removed 15 years that do not have enough games rated on Metacritic.
by_release["metacritic"].plot.scatter(x=by_release.index, y=["mean", "min", "max"], title="Mean, Min and Max Metacritic ratings by release year", trendline="lowess")
by_release["rating"].plot.scatter(x=by_release.index, y=["mean", "min", "max"], title="Mean, Min and Max RAWG ratings by release year", trendline="lowess")
def get_describe_by(mean_col: str, by_col: str, dataframe: pd.DataFrame):
def wrapped(elem):
tmp = dataframe.where(dataframe[elem[by_col]] == True)[mean_col].describe()
for col in tmp.index:
elem[mean_col + '-' + col] = tmp[col]
return elem
return wrapped
def get_rating_by_column(dataset: pd.DataFrame, column: str, remove_under: int = None):
uniques = get_uniques_from_column(dataset[column], remove_under=remove_under)
uniques.remove(np.nan) # removing NaN from game publishers
new_dataset = dataset.copy()
for unique in uniques:
new_dataset[unique] = False
new_dataset[unique] = new_dataset[column].str.contains(r"(^|\|\|)" + unique + r"($|\|\|)", na=False)
return (pd.DataFrame(data={column[:-1] if column[-1] == 's' else column: uniques}), new_dataset)
genres_base, dataset_genres = get_rating_by_column(dataset, "genres")
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/strings/accessor.py:101: UserWarning:
This pattern has match groups. To actually get the groups, use str.extract.
metacritic_genres = genres_base.apply(get_describe_by("metacritic", "genre", dataset_genres), axis=1)
metacritic_genres.set_index('genre', inplace=True)
metacritic_genres.sort_values("metacritic-count", inplace=True)
metacritic_genres.plot.bar(x=metacritic_genres.index, y=["metacritic-count"], title="Count of metacritic ratings by genre")
metacritic_genres.sort_values("metacritic-mean", inplace=True)
metacritic_genres.plot.scatter(x=metacritic_genres.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max Metacritic ratings by genre")
rating_genres = genres_base.apply(get_describe_by("rating", "genre", dataset_genres), axis=1)
rating_genres.set_index('genre', inplace=True)
rating_genres.sort_values("rating-count", inplace=True)
rating_genres.plot.bar(x=rating_genres.index, y="rating-count", title="Count of ratings by genre")
rating_genres.sort_values("rating-mean", inplace=True)
rating_genres.plot.scatter(x=rating_genres.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by genre")
platforms_new_df, platforms_df = get_rating_by_column(dataset, "platforms", 5)
platforms_metacritic = platforms_new_df.apply(get_describe_by("metacritic", "platform", platforms_df), axis=1)
platforms_metacritic.set_index("platform", inplace=True)
platforms_metacritic = platforms_metacritic.drop(platforms_metacritic[(platforms_metacritic["metacritic-count"] < 25)].index)
platforms_metacritic.sort_values("metacritic-count", inplace=True)
platforms_metacritic.plot.bar(x=platforms_metacritic.index, y="metacritic-count", title="Count of metacritic ratings by platform")
platforms_metacritic.sort_values("metacritic-mean", inplace=True)
platforms_metacritic.plot.scatter(x=platforms_metacritic.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max metacritic ratings by platform")
platforms_rating = platforms_new_df.apply(get_describe_by("rating", "platform", platforms_df), axis=1)
platforms_rating.set_index("platform", inplace=True)
platforms_rating = platforms_rating.drop(platforms_rating[(platforms_rating["rating-count"] < 25)].index)
platforms_rating.sort_values("rating-count", inplace=True)
platforms_rating.plot.bar(x=platforms_rating.index, y="rating-count", title="Count of ratings by platfom")
platforms_rating.sort_values("rating-mean", inplace=True)
platforms_rating.plot.scatter(x=platforms_rating.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by platform")
categories_base, dataset_categories = get_rating_by_column(dataset, "esrb_category")
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/strings/accessor.py:101: UserWarning:
This pattern has match groups. To actually get the groups, use str.extract.
metacritic_categories = categories_base.apply(get_describe_by("metacritic", "esrb_category", dataset_categories), axis=1)
metacritic_categories.set_index('esrb_category', inplace=True)
metacritic_categories.sort_values("metacritic-count", inplace=True)
metacritic_categories = metacritic_categories.drop(metacritic_categories[(metacritic_categories['metacritic-count'] <= 10)].index)
metacritic_categories.plot.bar(x=metacritic_categories.index, y=["metacritic-count"], title="Count of metacritic ratings by ESRB")
metacritic_categories.sort_values("metacritic-mean", inplace=True)
metacritic_categories.plot.scatter(x=metacritic_categories.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max metacritic ratings by ESRB")
rating_categories = categories_base.apply(get_describe_by("rating", "esrb_category", dataset_categories), axis=1)
rating_categories.set_index('esrb_category', inplace=True)
rating_categories.sort_values("rating-count", inplace=True)
rating_categories = rating_categories.drop(rating_categories[(rating_categories['rating-count'] <= 10)].index)
rating_categories.plot.bar(x=rating_categories.index, y=["rating-count"], title="Count of ratings by ESRB")
rating_categories.sort_values("rating-mean", inplace=True)
rating_categories.plot.scatter(x=rating_categories.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by ESRB")