# Use this cell to begin your analysis, and add as many as you would like!
import pandas as pd
import matplotlib.pyplot as plt
# Load the Office episodes CSV into a DataFrame; every step below reads from `df`.
df = pd.read_csv('/work/Investigating-Netflix-Movies-and-Guest-Stars-in-The-Office/datasets/office_episodes.csv')
# Define a function that maps a scaled rating to a marker color
def scale_ratings(rating):
    """Return the marker color name for a scaled rating.

    Thresholds (upper bounds are exclusive):
        rating < 0.25          -> 'red'
        0.25 <= rating < 0.50  -> 'orange'
        0.50 <= rating < 0.75  -> 'lightgreen'
        anything else          -> 'darkgreen'

    Args:
        rating: A numeric rating; the thresholds above are compared
            against it directly.

    Returns:
        One of 'red', 'orange', 'lightgreen', or 'darkgreen'.

    Note:
        A value for which every comparison is False (for example NaN)
        falls through to 'darkgreen'.
    """
    if rating < 0.25:
        return 'red'
    elif rating < 0.5:
        return 'orange'
    elif rating < 0.75:
        return 'lightgreen'
    else:
        return 'darkgreen'
# Translate each scaled rating into its marker color
df['colors'] = df['scaled_ratings'].map(scale_ratings)
# Marker size: 250 when the episode is flagged as having guests, 25 otherwise
df['sizes'] = df['has_guests'].map(lambda has_guest: 250 if has_guest else 25)
# Split the rows by guest flag (done after the color/size columns exist,
# so both subsets carry them)
guest_df = df[df['has_guests'].eq(True)]
non_guest_df = df[df['has_guests'].eq(False)]
# Set the default figure size BEFORE creating the figure: rcParams are
# consulted when a figure is created, so assigning them afterwards would
# leave `fig` at the previous default size.
plt.rcParams['figure.figsize'] = [11, 7]
# Initialize the figure object
fig = plt.figure()
# Episodes without guest stars, drawn with the default marker
plt.scatter(x=non_guest_df['episode_number'],
            y=non_guest_df['viewership_mil'],
            c=non_guest_df['colors'],
            s=non_guest_df['sizes'])
# Episodes with guest stars, drawn with star markers
plt.scatter(x=guest_df['episode_number'],
            y=guest_df['viewership_mil'],
            c=guest_df['colors'],
            s=guest_df['sizes'],
            marker='*')
# Set the plot title and axis labels
plt.title('Popularity, Quality, and Guest Appearances on the Office')
plt.xlabel('Episode Number')
plt.ylabel('Viewership (Millions)')
# Show the plot
plt.show()
# Row of the episode with the highest viewership
most_watched = df.loc[df['viewership_mil'].idxmax()]
# Keep only the text before the first comma in that row's guest_stars value
top_star = most_watched['guest_stars'].partition(',')[0]
top_star