# libraries imports
import pandas as pd
import datetime as dt
import plotly.express as exp
import numpy as np
# changing the plotting backend for better
pd.options.plotting.backend = "plotly"
# loading dataset
dataset = pd.read_csv('game_info.csv')
dataset
dataset["genres"].unique()
# get all uniques in a column with multiple values per case
def get_uniques_from_column(column, sep: str = '||', remove_under: int = None) -> list:
"""
Get unique values from columns containing multiple values in the same raw.
column: (iterable) container for your values
sep: (string) separator for the content of each element in the column
remove_under: (int) prevent from returning a unique value if its occurs less than remove_under times in column.
Returns a list containing once each unique value contained in the column.
"""
#print("column:", len(column))
uniques = []
for elem in column:
if sep in str(elem):
parsed = elem.split(sep)
else:
parsed = [elem]
uniques.extend(parsed)
#print("uniques:", len(uniques))
set_uniques = list(set(uniques)) # converting to a set automatically removes duplicates from the parsed values
#print("set_uniques:", set_uniques)
if remove_under is not None:
counts = {i:uniques.count(i) for i in set_uniques}
#print("counts:", counts)
for key, val in counts.items():
if val < remove_under:
set_uniques.remove(key)
return set_uniques
#get_uniques_from_column(dataset["platforms"]) # giving the Series.unique() reduces the number of values to process, but should not be used with argument remove_under
# First, remove unwanted/useless columns
dataset.drop(
[
"id", # unique id on RAWG
"slug", # unique slug on RAWG
"website", # official website of the game
"tba", # to be announced
"updated",
"rating_top",
"added_status_yet",
"added_status_owned",
"added_status_beaten",
"added_status_toplay",
"added_status_dropped",
"added_status_playing"],
inplace=True,
axis=1
)
# Rename oddly named or unclear column names
dataset.rename(
{
"esrb_rating": "esrb_category", # could be confused with apprecition rating
},
inplace=True,
axis=1
)
dataset
# Formatting date columns to manipulate them more easily
dataset["released"] = pd.to_datetime(dataset["released"], format="%Y-%m-%d")
prev_len = len(dataset)
dataset = dataset[(dataset["released"] <= dt.datetime.today())] # removing games released after today
print(f"Removed {prev_len - len(dataset)} games that are not released yet.")
# check for inconsistencies
erroneous_ratings = dataset[(dataset["ratings_count"] == 0) & (dataset["rating"] != 0)]
print(f"There are {len(erroneous_ratings)} games with inconsistencies in their RAWG ratings.")
# removing games without any critics
prev_len = len(dataset)
dataset = dataset[(dataset["ratings_count"] == 0) & dataset["metacritic"].isnull() & dataset["reviews_count"] <= 0] # removing games without rating
print(f"Removed {prev_len - len(dataset)} games that have no ratings.")
dataset
# Check the columns' dtypes
dataset.dtypes
dataset.isnull().sum()
dataset["rating"] = dataset["rating"].replace(0, np.nan)
def fill_empty_name(name):
if str(name) == "nan":
name = "Unknown game " + str(fill_empty_name.counter)
fill_empty_name.counter += 1
return name
fill_empty_name.counter = 0 # static variable to give each game a unique name
# fill empty names
dataset["name"] = dataset["name"].apply(fill_empty_name)
print(f"There are {dataset['name'].isnull().sum()} empty names left.")
import matplotlib.pyplot as plt
f = plt.figure(figsize=(12, 7))
plt.matshow(dataset.corr(), fignum=f.number)
plt.xticks(range(dataset.select_dtypes(['number']).shape[1]), dataset.select_dtypes(['number']).columns, fontsize=12, rotation=75)
plt.yticks(range(dataset.select_dtypes(['number']).shape[1]), dataset.select_dtypes(['number']).columns, fontsize=12)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=12)
plt.title('Correlation Matrix', fontsize=16);
fig = exp.scatter(x="rating", y="metacritic", data_frame=dataset, trendline="ols") # ols = Ordinary Least Squares
fig.data = [t for t in fig.data if t.mode == "lines"]
fig
dataset["release_year"] = pd.DatetimeIndex(dataset['released']).year
by_release = dataset[["release_year", "metacritic", "rating"]].groupby("release_year").describe()
by_release["metacritic"].plot.line(y=["count"], title="Count of Metacritic ratings by release year")
by_release["rating"].plot.line(y=["count"], title="Count of RAWG ratings by release year")
prev_len = len(by_release["rating"])
by_release = by_release.drop(by_release[(by_release['rating']['count'] <= 10)].index)
#by_release["rating"] = by_release["rating"][(by_release["rating"]["count"] >= 10)]
print(f"Removed {prev_len - len(by_release['rating'])} years that do not have enough games rated on RAWG.")
prev_len = len(by_release["metacritic"])
by_release = by_release.drop(by_release[(by_release['metacritic']['count'] <= 10)].index)
#by_release["metacritic"] = by_release["metacritic"][(by_release["metacritic"]["count"] >= 10)]
print(f"Removed {prev_len - len(by_release['metacritic'])} years that do not have enough games rated on Metacritic.")
by_release["metacritic"].plot.scatter(x=by_release.index, y=["mean", "min", "max"], title="Mean, Min and Max Metacritic ratings by release year", trendline="lowess")
by_release["rating"].plot.scatter(x=by_release.index, y=["mean", "min", "max"], title="Mean, Min and Max RAWG ratings by release year", trendline="lowess")
def get_describe_by(mean_col: str, by_col: str, dataframe: pd.DataFrame):
def wrapped(elem):
tmp = dataframe.where(dataframe[elem[by_col]] == True)[mean_col].describe()
for col in tmp.index:
elem[mean_col + '-' + col] = tmp[col]
return elem
return wrapped
def get_rating_by_column(dataset: pd.DataFrame, column: str, remove_under: int = None):
uniques = get_uniques_from_column(dataset[column], remove_under=remove_under)
uniques.remove(np.nan) # removing NaN from game publishers
new_dataset = dataset.copy()
for unique in uniques:
new_dataset[unique] = False
new_dataset[unique] = new_dataset[column].str.contains(r"(^|\|\|)" + unique + r"($|\|\|)", na=False)
return (pd.DataFrame(data={column[:-1] if column[-1] == 's' else column: uniques}), new_dataset)
genres_base, dataset_genres = get_rating_by_column(dataset, "genres")
metacritic_genres = genres_base.apply(get_describe_by("metacritic", "genre", dataset_genres), axis=1)
metacritic_genres.set_index('genre', inplace=True)
metacritic_genres.sort_values("metacritic-count", inplace=True)
metacritic_genres.plot.bar(x=metacritic_genres.index, y=["metacritic-count"], title="Count of metacritic ratings by genre")
metacritic_genres.sort_values("metacritic-mean", inplace=True)
metacritic_genres.plot.scatter(x=metacritic_genres.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max Metacritic ratings by genre")
rating_genres = genres_base.apply(get_describe_by("rating", "genre", dataset_genres), axis=1)
rating_genres.set_index('genre', inplace=True)
rating_genres.sort_values("rating-count", inplace=True)
rating_genres.plot.bar(x=rating_genres.index, y="rating-count", title="Count of ratings by genre")
rating_genres.sort_values("rating-mean", inplace=True)
rating_genres.plot.scatter(x=rating_genres.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by genre")
platforms_new_df, platforms_df = get_rating_by_column(dataset, "platforms", 5)
platforms_metacritic = platforms_new_df.apply(get_describe_by("metacritic", "platform", platforms_df), axis=1)
platforms_metacritic.set_index("platform", inplace=True)
platforms_metacritic = platforms_metacritic.drop(platforms_metacritic[(platforms_metacritic["metacritic-count"] < 25)].index)
platforms_metacritic.sort_values("metacritic-count", inplace=True)
platforms_metacritic.plot.bar(x=platforms_metacritic.index, y="metacritic-count", title="Count of metacritic ratings by platform")
platforms_metacritic.sort_values("metacritic-mean", inplace=True)
platforms_metacritic.plot.scatter(x=platforms_metacritic.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max metacritic ratings by platform")
platforms_rating = platforms_new_df.apply(get_describe_by("rating", "platform", platforms_df), axis=1)
platforms_rating.set_index("platform", inplace=True)
platforms_rating = platforms_rating.drop(platforms_rating[(platforms_rating["rating-count"] < 25)].index)
platforms_rating.sort_values("rating-count", inplace=True)
platforms_rating.plot.bar(x=platforms_rating.index, y="rating-count", title="Count of ratings by platfom")
platforms_rating.sort_values("rating-mean", inplace=True)
platforms_rating.plot.scatter(x=platforms_rating.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by platform")
categories_base, dataset_categories = get_rating_by_column(dataset, "esrb_category")
metacritic_categories = categories_base.apply(get_describe_by("metacritic", "esrb_category", dataset_categories), axis=1)
metacritic_categories.set_index('esrb_category', inplace=True)
metacritic_categories.sort_values("metacritic-count", inplace=True)
metacritic_categories = metacritic_categories.drop(metacritic_categories[(metacritic_categories['metacritic-count'] <= 10)].index)
metacritic_categories.plot.bar(x=metacritic_categories.index, y=["metacritic-count"], title="Count of metacritic ratings by ESRB")
metacritic_categories.sort_values("metacritic-mean", inplace=True)
metacritic_categories.plot.scatter(x=metacritic_categories.index, y=["metacritic-min", "metacritic-mean", "metacritic-max"], title="Mean, Min and Max metacritic ratings by ESRB")
rating_categories = categories_base.apply(get_describe_by("rating", "esrb_category", dataset_categories), axis=1)
rating_categories.set_index('esrb_category', inplace=True)
rating_categories.sort_values("rating-count", inplace=True)
rating_categories = rating_categories.drop(rating_categories[(rating_categories['rating-count'] <= 10)].index)
rating_categories.plot.bar(x=rating_categories.index, y=["rating-count"], title="Count of ratings by ESRB")
rating_categories.sort_values("rating-mean", inplace=True)
rating_categories.plot.scatter(x=rating_categories.index, y=["rating-min", "rating-mean", "rating-max"], title="Mean, Min and Max ratings by ESRB")