# import libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the survey responses, decoding the CSV as ISO-8859-1.
star_wars_df = pd.read_csv("StarWars.csv", encoding="ISO-8859-1")
# Discard rows that have no RespondentID.
star_wars_df = star_wars_df.dropna(subset=["RespondentID"])
# Print summary statistics for every column. A bare expression is silently
# discarded when the file runs as a script, so the result is printed explicitly.
print(star_wars_df.describe(include="all"))
# Drop the unneeded columns at positions 15 through 32.
star_wars_df = star_wars_df.drop(columns=star_wars_df.columns[15:33])
# Convert the two Yes/No survey questions to True/False booleans.
# Series.map turns any answer that is not a key of the mapping into NaN.
yes_no_to_bool = {"Yes": True, "No": False}
for col in [
    "Have you seen any of the 6 films in the Star Wars franchise?",
    "Do you consider yourself to be a fan of the Star Wars film franchise?",
]:
    star_wars_df[col] = star_wars_df[col].map(yes_no_to_bool)
# Columns 3 through 8 are the per-movie "seen" answers.
seen_cols = star_wars_df.columns[3:9]
for col in seen_cols:
    # A string (the movie name) means the movie was seen -> True;
    # anything else, such as a missing value, becomes False.
    star_wars_df[col] = star_wars_df[col].apply(lambda value: isinstance(value, str))
# Rename all of them in one pass to seen_movie_1 ... seen_movie_6, instead of
# copying the DataFrame once per column.
star_wars_df = star_wars_df.rename(
    columns={col: f"seen_movie_{count}" for count, col in enumerate(seen_cols, start=1)}
)
# Columns 9 through 14 are the per-movie ranking answers.
ranking_cols = star_wars_df.columns[9:15]
# Convert every ranking column to float in a single call.
star_wars_df = star_wars_df.astype({col: float for col in ranking_cols})
# Rename them in one pass to ranking_movie_1 ... ranking_movie_6.
star_wars_df = star_wars_df.rename(
    columns={col: f"ranking_movie_{count}" for count, col in enumerate(ranking_cols, start=1)}
)
# Keep only the respondents whose fan answer is exactly True.
is_fan = star_wars_df["Do you consider yourself to be a fan of the Star Wars film franchise?"].eq(True)
fans_df = star_wars_df[is_fan]
# Bar chart of the mean ranking that fans gave each movie.
mean_rankings = fans_df.loc[:, "ranking_movie_1":"ranking_movie_6"].mean()
ax = mean_rankings.plot.bar(figsize=(10, 5))
# Axis labels and title.
ax.set_xlabel("Movie")
ax.set_ylabel("Ranking")
ax.set_title("Best Ranking Movies Among Fans")
# Display the figure.
plt.show()
# Bar chart of how many fans have seen each movie: summing a boolean column
# counts its True values.
seen_counts = fans_df.loc[:, "seen_movie_1":"seen_movie_6"].sum()
ax = seen_counts.plot(kind="bar", figsize=(10, 5))
# Axis labels and title. The counts come from fans_df, so the title names
# that population, matching the ranking chart.
ax.set_xlabel("Movie")
ax.set_ylabel("Count")
ax.set_title("Most Viewed Movies Among Fans")
# Display the figure.
plt.show()