Hypothesis Testing of Movie Ratings Data
By Mary Nwangwu
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import re
from cliffs_delta import cliffs_delta
from statistics import mean, stdev
from math import sqrt
import seaborn as sns
%matplotlib inline
# Load the replication data set; the first 400 columns are read as the movie
# rating columns throughout this notebook.
data_df = pd.read_csv('movieReplicationSet.csv')
data_df

# --- Popularity: are movies with more ratings rated differently? ---
# Popularity of a movie is measured by the number of (non-missing) ratings.
movies = data_df.iloc[:, 0:400]  # ratings for the 400 movies
movies_total_rating = movies.count(axis=0)  # number of ratings per movie
mtr_reshape = movies_total_rating.values.reshape(1, -1)

# Append the counts as an extra last row so the columns can be sorted by it.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
mtr_append = pd.concat(
    [movies, pd.DataFrame(mtr_reshape, columns=movies.columns)],
    ignore_index=True,
)
sorted_movies = mtr_append.sort_values(mtr_append.last_valid_index(), axis=1)

# Median split on the rating counts. The threshold is computed from the data
# instead of being hard-coded (the original notebook used the literal 197.5).
# A movie whose count equals the median exactly lands in neither group.
median_count = movies_total_rating.median()
rating_counts = sorted_movies.iloc[-1]  # last row = appended counts
less_popular = sorted_movies.loc[:, rating_counts < median_count]
more_popular = sorted_movies.loc[:, rating_counts > median_count]

# Drop the appended count row again, pool all ratings of a group into one
# flat array and remove the missing ratings.
less_popular_drop = less_popular.iloc[:-1, :]
less_popular_arr = less_popular_drop.to_numpy().flatten()
less_popular_arr = less_popular_arr[~np.isnan(less_popular_arr)]

more_popular_drop = more_popular.iloc[:-1, :]
more_popular_arr = more_popular_drop.to_numpy().flatten()
more_popular_arr = more_popular_arr[~np.isnan(more_popular_arr)]

# Effect size (Cliff's delta) and Mann-Whitney U test, more vs less popular.
d, res = cliffs_delta(more_popular_arr, less_popular_arr)
print(d, res)
print(scipy.stats.mannwhitneyu(more_popular_arr, less_popular_arr))  # author's note: significant
print("Median Arr 1: " + str(np.median(more_popular_arr)))
print("Sample Size 1: " + str(len(more_popular_arr)))
print("Median Arr 2: " + str(np.median(less_popular_arr)))
print("Sample Size 2: " + str(len(less_popular_arr)))
# Notched horizontal box plots comparing the pooled ratings of the two
# popularity groups (first box: more popular, second box: less popular).
popular_combinedData = np.array([more_popular_arr, less_popular_arr], dtype=object).T

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot(popular_combinedData, patch_artist=True, notch='True', vert=0)

# One fill colour per box, in data order.
colors = ['orchid', 'paleturquoise']
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

# Look of the remaining box-plot artists, applied group by group.
artist_styles = [
    ('whiskers', {'color': '#8B008B', 'linewidth': 1.5, 'linestyle': ":"}),
    ('caps', {'color': '#8B008B', 'linewidth': 2}),
    ('medians', {'color': 'red', 'linewidth': 3}),
    ('fliers', {'marker': 'D', 'color': '#e7298a', 'alpha': 0.5}),
]
for group, style in artist_styles:
    for artist in bp[group]:
        artist.set(**style)

# Group names on the y-axis (the plot is horizontal), title and axis label.
ax.set_yticklabels(['More Popular', 'Less Popular'])
plt.title("Ratings: Less Popular vs More Popular Movies")

# Keep the ticks on the bottom and left sides only.
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
plt.xlabel('Ratings')

plt.show()
# --- Release year: are newer movies rated differently from older ones? ---
# Work on a copy: a row of release years is appended below, and that write
# must not target a slice of data_df.
movie_years = data_df.iloc[:, 0:400].copy()
column_headers = list(movie_years.columns.values)  # movie titles

# Titles are expected to carry the release year as "(YYYY)". Parse each title
# on its own instead of searching one concatenated string of all titles, so a
# title without a year is reported instead of silently shifting the list.
year_pattern = re.compile(r'\((\d{4})\)')
years = []
for title in column_headers:
    matches = year_pattern.findall(str(title))
    if not matches:
        raise ValueError('No release year found in column name: ' + repr(title))
    years.append(matches[-1])
years_int = [int(year) for year in years]

# Append the years as an extra last row and sort the columns by it.
movie_years.loc[len(movie_years)] = years_int
movie_years_new = movie_years
median_year = movie_years_new.iloc[-1, :].median()  # author's note: 1999
sorted_movie_years = movie_years.sort_values(movie_years.last_valid_index(), axis=1)

# Median split using the computed median (the original hard-coded 1999).
# Movies released in the median year go to the "new" group, which the author
# chose to make the sample sizes more comparable.
release_years = sorted_movie_years.iloc[-1]
old_movies = sorted_movie_years.loc[:, release_years < median_year]
new_movies = sorted_movie_years.loc[:, release_years >= median_year]

# Drop the appended year row, pool the ratings of each group into one flat
# array and remove the missing ratings.
old_movies_drop = old_movies.iloc[:-1, :]
old_movies_arr = old_movies_drop.to_numpy().flatten()
old_movies_arr = old_movies_arr[~np.isnan(old_movies_arr)]

new_movies_drop = new_movies.iloc[:-1, :]
new_movies_arr = new_movies_drop.to_numpy().flatten()
new_movies_arr = new_movies_arr[~np.isnan(new_movies_arr)]

# Effect size (Cliff's delta) and Mann-Whitney U test, new vs old.
d, res = cliffs_delta(new_movies_arr, old_movies_arr)
print(d, res)
print(scipy.stats.mannwhitneyu(new_movies_arr, old_movies_arr))  # author's note: significant
print("Median Arr 1: " + str(np.median(new_movies_arr)))
print("Sample Size 1: " + str(len(new_movies_arr)))
# These are sums of the raw ratings, not rank sums, so label them as such.
print("Sum of ratings 1: " + str(np.sum(new_movies_arr)))
print("Median Arr 2: " + str(np.median(old_movies_arr)))
print("Sample Size 2: " + str(len(old_movies_arr)))
print("Sum of ratings 2: " + str(np.sum(old_movies_arr)))
# Notched horizontal box plots of the pooled ratings of older vs newer movies.
# The order of the arrays fixes the order of the boxes: older first, newer
# second. The colours and tick labels below must follow the same order.
popular_combinedData = np.transpose(np.array([old_movies_arr, new_movies_arr], dtype=object))
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot(popular_combinedData, patch_artist=True, notch=True, vert=False)

colors = ['yellowgreen', 'lightsalmon']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

# Whiskers: dotted, dark magenta.
for whisker in bp['whiskers']:
    whisker.set(color='#8B008B', linewidth=1.5, linestyle=":")

# Caps: same colour, slightly thicker.
for cap in bp['caps']:
    cap.set(color='#8B008B', linewidth=2)

# Medians: thick red line.
for median in bp['medians']:
    median.set(color='red', linewidth=3)

# Outliers: semi-transparent diamonds.
for flier in bp['fliers']:
    flier.set(marker='D', color='#e7298a', alpha=0.5)

# Tick labels in data order. The original listed them as
# ['Newer Movies', 'Older Movies'], which labelled each box with the wrong group.
ax.set_yticklabels(['Older Movies', 'Newer Movies'])
plt.title("Ratings: Older vs Newer Movies")

# Keep the ticks on the bottom and left sides only.
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
plt.xlabel('Ratings')
plt.show()
# --- Shrek (2001): do female and male viewers rate it differently? ---
shrek_col = 'Shrek (2001)'
gender_col = 'Gender identity (1 = female; 2 = male; 3 = self-described)'

shrek_gender = data_df.loc[:, [shrek_col, gender_col]]
# Keep only viewers who both rated the movie and reported a gender.
df3 = shrek_gender.dropna()
df3 = df3.reset_index(drop=True)

# One rating Series per group (1 = female, 2 = male, per the column name).
female = df3.loc[df3[gender_col] == 1.0, shrek_col].reset_index(drop=True)
male = df3.loc[df3[gender_col] == 2.0, shrek_col].reset_index(drop=True)
shrek_combinedData = np.transpose(np.array([female, male], dtype=object))

# Effect size (Cliff's delta) and Mann-Whitney U test, female vs male.
d, res = cliffs_delta(female, male)
print(d, res)
print(scipy.stats.mannwhitneyu(female, male))  # author's note: not significant
print("Median Arr 1: " + str(female.median()))
print("Sample Size 1: " + str(len(female)))
print("Sum 1: " + str(female.sum()))
print("Median Arr 2: " + str(male.median()))
print("Sample Size 2: " + str(len(male)))
# The original printed this under "Sum 1", duplicating the female label.
print("Sum 2: " + str(male.sum()))
# Two-sample Kolmogorov-Smirnov test on the same two groups.
print(scipy.stats.kstest(female, male))
# colors = ['pink', 'blue']
# labels = ['Females', 'Males']
# lines = ['black', 'black']
# plt.boxplot(shrek_combinedData)
# plt.title('Shrek (2001) Ratings: Female vs Male Viewers')
# plt.legend(labels)
# plt.show()
# Notched horizontal box plots of the Shrek ratings
# (first box: female viewers, second box: male viewers).
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot(shrek_combinedData, patch_artist=True, notch='True', vert=0)

# One fill colour per box, in data order.
colors = ['pink', 'blue']
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

# Look of the remaining box-plot artists, applied group by group.
artist_styles = [
    ('whiskers', {'color': '#8B008B', 'linewidth': 1.5, 'linestyle': ":"}),
    ('caps', {'color': '#8B008B', 'linewidth': 2}),
    ('medians', {'color': 'red', 'linewidth': 3}),
    ('fliers', {'marker': 'D', 'color': '#e7298a', 'alpha': 0.5}),
]
for group, style in artist_styles:
    for artist in bp[group]:
        artist.set(**style)

# Group names on the y-axis (the plot is horizontal), title and axis label.
ax.set_yticklabels(['Females', 'Males'])
plt.title("Shrek (2001) Ratings: Male vs Female Viewers")

# Keep the ticks on the bottom and left sides only.
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
plt.xlabel('Ratings')

plt.show()
# --- For what share of movies do female and male ratings differ? ---
gender_col = 'Gender identity (1 = female; 2 = male; 3 = self-described)'
alpha = 0.005  # significance threshold used throughout this notebook

# The 400 movie columns plus the gender column, which is selected by name
# rather than by position so it is guaranteed to be the last column.
df4 = data_df[list(data_df.columns[:400]) + [gender_col]]

# Female viewers: one row per movie after transposing; the last row (the
# gender column itself) is dropped.
female = df4[df4[gender_col] == 1.0].reset_index(drop=True)
female_t = np.transpose(female).iloc[:-1, :]
female_arr = female_t.to_numpy()
# Element-wise removal of missing ratings, separately for every movie.
l1 = [ratings[np.isfinite(ratings)] for ratings in female_arr]
female_arr_new = np.array(l1, dtype='object')

# Male viewers, same treatment.
male = df4[df4[gender_col] == 2.0].reset_index(drop=True)
male_t = np.transpose(male).iloc[:-1, :]
male_arr = male_t.to_numpy()
l2 = [ratings[np.isfinite(ratings)] for ratings in male_arr]
male_arr_new = np.array(l2, dtype='object')

# One Mann-Whitney U test and one |Cliff's delta| per movie. The test is run
# once per movie and unpacked (the original ran it twice, once for the
# statistic and once for the p-value).
l3 = []          # p-values
test_stats = []  # U statistics
eff_s = []       # absolute effect sizes
for i, j in zip(female_arr_new, male_arr_new):
    test_val, U_test = scipy.stats.mannwhitneyu(i, j)
    effect_size = abs(cliffs_delta(i, j)[0])
    l3.append(U_test)
    test_stats.append(test_val)
    eff_s.append(effect_size)

# Share of movies whose p-value is at or below the threshold.
total = len(l3)
sig_count = sum(1 for p_value in l3 if p_value <= alpha)
p_male_fem = sig_count / total
print(sig_count)
print(p_male_fem)

# Distribution of the per-movie p-values.
plt.hist(l3, 200, color='darkviolet')
plt.title('Movies Rated by Female and Male Viewers')
plt.xlabel('p-value')
plt.ylabel('frequency');
# --- The Lion King (1994): only children vs viewers with siblings ---
lion_col = 'The Lion King (1994)'
only_child_col = 'Are you an only child? (1: Yes; 0: No; -1: Did not respond)'

# Viewers with both a rating and an answer; -1 (no response) matches neither
# of the two group filters below.
lion_child = data_df.loc[:, [lion_col, only_child_col]]
df5 = lion_child.dropna().reset_index(drop=True)

only = df5.loc[df5[only_child_col] == 1, lion_col].reset_index(drop=True)
sib = df5.loc[df5[only_child_col] == 0, lion_col].reset_index(drop=True)
lion_combinedData = np.array([only, sib], dtype=object).T

# Effect size and one-sided Mann-Whitney U test (only children rate higher).
d, res = cliffs_delta(only, sib)
print(d, res)
print(scipy.stats.mannwhitneyu(only, sib, alternative="greater"))  # author's note: not significant
print(f"Median Arr 1: {only.median()}")
print(f"Sample Size 1: {len(only)}")
print(f"Sum 1: {only.sum()}")
print(f"Median Arr 2: {sib.median()}")
print(f"Sample Size 2: {len(sib)}")
print(f"Sum 2: {sib.sum()}")
# Notched horizontal box plots of the Lion King ratings
# (first box: only children, second box: viewers with siblings).
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot(lion_combinedData, patch_artist=True, notch='True', vert=0)

# One fill colour per box, in data order.
colors = ['purple', 'orange']
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

# Look of the remaining box-plot artists, applied group by group.
artist_styles = [
    ('whiskers', {'color': '#8B008B', 'linewidth': 1.5, 'linestyle': ":"}),
    ('caps', {'color': '#8B008B', 'linewidth': 2}),
    ('medians', {'color': 'red', 'linewidth': 3}),
    ('fliers', {'marker': 'D', 'color': '#e7298a', 'alpha': 0.5}),
]
for group, style in artist_styles:
    for artist in bp[group]:
        artist.set(**style)

# Group names on the y-axis (the plot is horizontal), title and axis label.
ax.set_yticklabels(['Only Child', 'Has Siblings'])
plt.title("The Lion King (1994) Ratings: Viewer Has Siblings vs Is Only Child")

# Keep the ticks on the bottom and left sides only.
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
plt.xlabel('Ratings')

plt.show()
# --- For what share of movies do only children and viewers with siblings differ? ---
only_child_col = 'Are you an only child? (1: Yes; 0: No; -1: Did not respond)'

# First 476 columns with columns 400-474 removed: the 400 movie columns plus
# the last remaining column (dropped again after transposing below).
df6 = data_df.iloc[:, :476]
df6 = df6.drop(columns=df6.columns[400:475])
df6

# Only children: one row per movie after transposing, last row dropped.
only = df6.loc[df6[only_child_col] == 1].reset_index(drop=True)
only_t = only.T.iloc[:-1, :]
only_arr = only_t.to_numpy()
# Element-wise removal of missing ratings, separately for every movie.
l4 = [ratings[np.isfinite(ratings)] for ratings in only_arr]
only_arr_new = np.array(l4, dtype='object')

# Viewers with siblings, same treatment.
sib = df6.loc[df6[only_child_col] == 0].reset_index(drop=True)
sib_t = sib.T.iloc[:-1, :]
sib_arr = sib_t.to_numpy()
l5 = [ratings[np.isfinite(ratings)] for ratings in sib_arr]
sib_arr_new = np.array(l5, dtype='object')

# p-value of one Mann-Whitney U test per movie.
l6 = [
    scipy.stats.mannwhitneyu(only_ratings, sib_ratings)[1]
    for only_ratings, sib_ratings in zip(only_arr_new, sib_arr_new)
]

# Share of movies whose p-value is at or below 0.005.
total = len(l6)
sig_count = sum(1 for p_value in l6 if p_value <= 0.005)
p_only_sib = sig_count / total
print(sig_count)
print(p_only_sib)

# Distribution of the per-movie p-values.
plt.hist(l6, 200, color='green')
plt.title('Movies Rated by Viewers with Siblings vs Those Without')
plt.xlabel('p-value')
plt.ylabel('frequency');
# Plot the effect sizes with a forest plot; do not plot the p-values, because they are testing different things.
# Run a separate Mann-Whitney U test for every movie between male and female viewers.
# Append the values to a chart keyed by the corresponding movie.
# Find the total count of significant p-values and divide it by the total number of movies (400).
# The answer is the fraction of movies rated differently between male and female viewers.
# Group by?
# --- The Wolf of Wall Street (2013): social viewers vs viewers who prefer watching alone ---
wolf_col = 'The Wolf of Wall Street (2013)'
alone_col = 'Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)'

# Viewers with both a rating and an answer; -1 (no response) matches neither
# of the two group filters below.
wolf_social = data_df.loc[:, [wolf_col, alone_col]]
df7 = wolf_social.dropna().reset_index(drop=True)

# "social" answered 0 (No), "anti" answered 1 (Yes).
social = df7.loc[df7[alone_col] == 0, wolf_col].reset_index(drop=True)
anti = df7.loc[df7[alone_col] == 1, wolf_col].reset_index(drop=True)

# One-sided Mann-Whitney U test (social viewers rate higher) and effect size.
print(scipy.stats.mannwhitneyu(social, anti, alternative="greater"))  # author's note: not significant
d, res = cliffs_delta(social, anti)
print(d, res)
print(f"Median Arr 1: {social.median()}")
print(f"Sample Size 1: {len(social)}")
print(f"Sum 1: {social.sum()}")
print(f"Median Arr 2: {anti.median()}")
print(f"Sample Size 2: {len(anti)}")
print(f"Sum 2: {anti.sum()}")
# Notched horizontal box plots of the Wolf of Wall Street ratings
# (first box: social viewers, second box: viewers who prefer watching alone).
wolf_combinedData = np.array([social, anti], dtype=object).T

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot(wolf_combinedData, patch_artist=True, notch='True', vert=0)

# One fill colour per box, in data order.
colors = ['cyan', 'green']
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

# Look of the remaining box-plot artists, applied group by group.
artist_styles = [
    ('whiskers', {'color': '#8B008B', 'linewidth': 1.5, 'linestyle': ":"}),
    ('caps', {'color': '#8B008B', 'linewidth': 2}),
    ('medians', {'color': 'red', 'linewidth': 3}),
    ('fliers', {'marker': 'D', 'color': '#e7298a', 'alpha': 0.5}),
]
for group, style in artist_styles:
    for artist in bp[group]:
        artist.set(**style)

# Group names on the y-axis (the plot is horizontal), title and axis label.
ax.set_yticklabels(['Social Viewers', 'Anti-Social Viewers'])
plt.title("The Wolf of Wall Street (2013) Ratings: Anti-Social vs Social Viewers")

# Keep the ticks on the bottom and left sides only.
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
plt.xlabel('Ratings')

plt.show()
# --- For what share of movies do social viewers and solo viewers differ? ---
alone_col = 'Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)'

# First 477 columns with columns 400-475 removed: the 400 movie columns plus
# the last remaining column (dropped again after transposing below).
df8 = data_df.iloc[:, :477]
df8 = df8.drop(columns=df8.columns[400:476])
# Author's note: cannot drop row-wise.
df8

# Social viewers (answered 0): one row per movie after transposing, last row dropped.
social = df8.loc[df8[alone_col] == 0].reset_index(drop=True)
social_t = social.T.iloc[:-1, :]
social_arr = social_t.to_numpy()
# Element-wise removal of missing ratings, separately for every movie.
l7 = [ratings[np.isfinite(ratings)] for ratings in social_arr]
social_arr_new = np.array(l7, dtype='object')

# Viewers who prefer watching alone (answered 1), same treatment.
anti = df8.loc[df8[alone_col] == 1].reset_index(drop=True)
anti_t = anti.T.iloc[:-1, :]
anti_arr = anti_t.to_numpy()
l8 = [ratings[np.isfinite(ratings)] for ratings in anti_arr]
anti_arr_new = np.array(l8, dtype='object')

# p-value of one Mann-Whitney U test per movie.
l9 = [
    scipy.stats.mannwhitneyu(social_ratings, anti_ratings)[1]
    for social_ratings, anti_ratings in zip(social_arr_new, anti_arr_new)
]

# Share of movies whose p-value is at or below 0.005.
total = len(l9)
sig_count = sum(1 for p_value in l9 if p_value <= 0.005)
p_social_anti = sig_count / total
print(sig_count)
print(p_social_anti)

# Distribution of the per-movie p-values.
plt.hist(l9, 200, color='darkorange')
plt.title('Movies Rated by Social Viewers vs Anti-Social Viewers')
plt.xlabel('p-value')
plt.ylabel('frequency');
# Run a separate Mann-Whitney U test for every movie between male and female viewers.
# Append the values to a chart keyed by the corresponding movie.
# Find the total count of significant p-values and divide it by the total number of movies (400).
# The answer is the fraction of movies rated differently between male and female viewers.
# Pie chart?
# --- Home Alone (1990) vs Finding Nemo (2003): do the rating distributions differ? ---
home_nemo = data_df.loc[:, ['Home Alone (1990)', 'Finding Nemo (2003)']]
HN1 = pd.to_numeric(data_df['Home Alone (1990)'], errors='coerce').values
HN2 = pd.to_numeric(data_df['Finding Nemo (2003)'], errors='coerce').values

# Element-wise deletion: each movie keeps all of its own finite ratings, so
# the two samples may have different sizes.
HN1 = HN1[np.isfinite(HN1)]
HN2 = HN2[np.isfinite(HN2)]

# Two-sample Kolmogorov-Smirnov test.
print(scipy.stats.kstest(HN1, HN2))  # author's note: significant
print("Median Arr 1: " + str(np.median(HN1)))
print("Sample Size 1: " + str(len(HN1)))
print("Median Arr 2: " + str(np.median(HN2)))
print("Sample Size 2: " + str(len(HN2)))
HN_combinedData = np.transpose(np.array([HN1, HN2], dtype=object))

# Number of data points per movie, taken from the data. The original
# hard-coded 857 and 1014, which makes plt.plot fail with a length mismatch
# as soon as the data set changes.
N = len(HN1)
N2 = len(HN2)

data = HN1
data2 = HN2
# Sorted ratings on the x-axis ...
x = np.sort(data)
x2 = np.sort(data2)
# ... against their cumulative proportion on the y-axis.
y = np.arange(N) / float(N)
y2 = np.arange(N2) / float(N2)

# Empirical cumulative distribution of both movies in one figure.
plt.xlabel('Ratings')
plt.ylabel('Cumulative Probability')
plt.title('Ratings: Home Alone (1990) vs Finding Nemo (2003)')
plt.plot(x, y, color='darkviolet', label='Home Alone')
plt.plot(x2, y2, color='lightskyblue', label='Finding Nemo')
# NOTE(review): the position of this marker is hard-coded, not computed from
# the data - re-check it if the ratings change.
plt.axvline(x=3.25, ymin=0.33, ymax=0.47, color='red', label='Max Distance')
plt.legend()
plt.show()
# inconsistent quality, ratings vary as the series continues
# row wise deletion because we only care about the viewers who watched all movies
# Kruskal Wallis
# report the size of the viewers that watched all the movies in a franchise! get the shape of the combinedData
# address concerns about independence assumption for this test, since we are looking at ratings from the same people
# would use an ANOVA but this is rank data, lose some power
def _complete_case_ratings(frame, titles):
    """Return the ratings of the viewers who rated every movie in ``titles``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one column per movie title.
    titles : sequence of str
        Column names of the movies of one franchise.

    Returns
    -------
    numpy.ndarray
        One row per title (in the order given) and one column per viewer.
        Viewers with a missing or non-numeric rating for any of the titles
        are removed (row-wise deletion).
    """
    per_movie = np.array(
        [pd.to_numeric(frame[title], errors='coerce').values for title in titles]
    )
    rated_all = ~pd.isnull(per_movie).any(axis=0)
    return per_movie[:, rated_all]


# Each franchise below: keep the viewers who rated all listed movies, show a
# histogram with one series per movie, and run a Kruskal-Wallis test across
# the movies. The "significant" remarks are the author's recorded results.

# Star Wars
S_titles = [
    'Star Wars: Episode IV - A New Hope (1977)',
    'Star Wars: Episode V - The Empire Strikes Back (1980)',
    'Star Wars: Episode VI - The Return of the Jedi (1983)',
    'Star Wars: Episode 1 - The Phantom Menace (1999)',
    'Star Wars: Episode II - Attack of the Clones (2002)',
    'Star Wars: Episode VII - The Force Awakens (2015)',
]
S1, S2, S3, S4, S5, S6 = _complete_case_ratings(data_df, S_titles)
S_combinedData = np.transpose(np.array([S1, S2, S3, S4, S5, S6]))
plt.hist(S_combinedData, bins=9)
plt.show()
print('Star Wars: ' + str(scipy.stats.kruskal(S1, S2, S3, S4, S5, S6)))  # significant

# Harry Potter
H_titles = [
    "Harry Potter and the Sorcerer's Stone (2001)",
    'Harry Potter and the Chamber of Secrets (2002)',
    'Harry Potter and the Goblet of Fire (2005)',
    'Harry Potter and the Deathly Hallows: Part 2 (2011)',
]
H1, H2, H3, H4 = _complete_case_ratings(data_df, H_titles)
H_combinedData = np.transpose(np.array([H1, H2, H3, H4]))
plt.hist(H_combinedData, bins=9)
plt.show()
print('Harry Potter: ' + str(scipy.stats.kruskal(H1, H2, H3, H4)))  # not significant

# The Matrix
M_titles = [
    'The Matrix (1999)',
    'The Matrix Reloaded (2003)',
    'The Matrix Revolutions (2003)',
]
M1, M2, M3 = _complete_case_ratings(data_df, M_titles)
M_combinedData = np.transpose(np.array([M1, M2, M3]))
plt.hist(M_combinedData, bins=9)
plt.show()
print('Matrix: ' + str(scipy.stats.kruskal(M1, M2, M3)))  # significant

# Indiana Jones
I_titles = [
    'Indiana Jones and the Raiders of the Lost Ark (1981)',
    'Indiana Jones and the Temple of Doom (1984)',
    'Indiana Jones and the Last Crusade (1989)',
    'Indiana Jones and the Kingdom of the Crystal Skull (2008)',
]
I1, I2, I3, I4 = _complete_case_ratings(data_df, I_titles)
I_combinedData = np.transpose(np.array([I1, I2, I3, I4]))
plt.hist(I_combinedData, bins=9)
plt.show()
print('Indiana: ' + str(scipy.stats.kruskal(I1, I2, I3, I4)))  # significant

# Jurassic Park
J_titles = [
    'Jurassic Park (1993)',
    'The Lost World: Jurassic Park (1997)',
    'Jurassic Park III (2001)',
]
J1, J2, J3 = _complete_case_ratings(data_df, J_titles)
J_combinedData = np.transpose(np.array([J1, J2, J3]))
plt.hist(J_combinedData, bins=9)
plt.show()
print('Jurassic: ' + str(scipy.stats.kruskal(J1, J2, J3)))  # significant

# Pirates of the Caribbean (this histogram also gets colours and a title)
P_titles = [
    'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
    "Pirates of the Caribbean: Dead Man's Chest (2006)",
    "Pirates of the Caribbean: At World's End (2007)",
]
P1, P2, P3 = _complete_case_ratings(data_df, P_titles)
colors = ['darkgoldenrod', 'teal', 'firebrick']
P_combinedData = np.transpose(np.array([P1, P2, P3]))
plt.hist(P_combinedData, bins=9, color=colors)
plt.title('Pirates of the Caribbean')
plt.xlabel('Ratings')
plt.ylabel('')
plt.show()
print('Pirates: ' + str(scipy.stats.kruskal(P1, P2, P3)))  # not significant

# Toy Story
T_titles = [
    'Toy Story (1995)',
    'Toy Story 2 (1999)',
    'Toy Story 3 (2010)',
]
T1, T2, T3 = _complete_case_ratings(data_df, T_titles)
T_combinedData = np.transpose(np.array([T1, T2, T3]))
plt.hist(T_combinedData, bins=9)
plt.show()
print('Toy Story: ' + str(scipy.stats.kruskal(T1, T2, T3)))  # significant

# Batman
B_titles = [
    'Batman (1989)',
    'Batman & Robin (1997)',
    'Batman: The Dark Knight (2008)',
]
B1, B2, B3 = _complete_case_ratings(data_df, B_titles)
B_combinedData = np.transpose(np.array([B1, B2, B3]))
plt.hist(B_combinedData, bins=9)
plt.show()
print('Batman: ' + str(scipy.stats.kruskal(B1, B2, B3)))  # significant
# Side-by-side histograms: Harry Potter in the left panel, Batman in the
# right panel. The right panel is drawn first, as in the original.
colors = ['gold', 'gray', 'mediumblue']
labels = ['Batman (1989)', 'Batman & Robin (1997)', 'Batman: The Dark Knight (2008)']
colors2 = ['maroon', 'darkgreen', 'midnightblue', 'goldenrod']
labels2 = ["Sorcerer's Stone", 'Chamber of Secrets', 'Goblet of Fire', 'Deathly Hallows: Part 2']

# (panel position, ratings, colours, legend labels, title, y-axis label or None)
panels = [
    (2, B_combinedData, colors, labels, 'Inconsistent: Batman', 'Frequency'),
    (1, H_combinedData, colors2, labels2, 'Consistent: Harry Potter', None),
]
for position, ratings, palette, legend_names, heading, y_caption in panels:
    plt.subplot(1, 2, position)
    plt.hist(ratings, bins=9, color=palette, label=legend_names)
    plt.title(heading)
    plt.xlabel('Ratings')
    # Only the Batman panel carries a y-axis label.
    if y_caption is not None:
        plt.ylabel(y_caption)
    plt.legend(loc=9, fontsize='x-small')
plt.show()
# --- Exploratory: does a sheltered upbringing relate to the rating of any movie? ---
sheltered_col = 'I had a sheltered upbringing'
alpha = 0.005  # significance threshold used throughout this notebook

# Locate the column index (displayed when run as a notebook cell).
data_df.columns.get_loc(sheltered_col)

# The 400 movie columns plus the upbringing column, which is selected by name
# so it is guaranteed to be the last column.
dfx = data_df[list(data_df.columns[:400]) + [sheltered_col]]

# BUG FIX: the original filters were `== 1 | 2` and `== 4 | 5`. In Python `|`
# binds tighter than `==`, so they compared against 3 and 5 respectively.
# isin() selects the intended answer sets.
# NOTE(review): assumes answers 1-2 mean "not sheltered" and 4-5 mean
# "sheltered" - confirm against the survey codebook.
no = dfx[dfx[sheltered_col].isin([1, 2])].reset_index(drop=True)
no_t = np.transpose(no).iloc[:-1, :]  # one row per movie; drop the upbringing row
no_arr = no_t.to_numpy()
# Element-wise removal of missing ratings, separately for every movie.
l1 = [ratings[np.isfinite(ratings)] for ratings in no_arr]
no_arr_new = np.array(l1, dtype='object')

yes = dfx[dfx[sheltered_col].isin([4, 5])].reset_index(drop=True)
yes_t = np.transpose(yes).iloc[:-1, :]
yes_arr = yes_t.to_numpy()
l2 = [ratings[np.isfinite(ratings)] for ratings in yes_arr]
yes_arr_new = np.array(l2, dtype='object')

# One Mann-Whitney U test and one |Cliff's delta| per movie; the test is run
# once per movie and unpacked (the original ran it twice).
l3 = []          # p-values
test_stats = []  # U statistics
eff_s = []       # absolute effect sizes
names = list(dfx.columns[:-1])  # movie titles, in the same order as the rows above
for i, j in zip(yes_arr_new, no_arr_new):
    test_val, U_test = scipy.stats.mannwhitneyu(i, j)
    effect_size = abs(cliffs_delta(i, j)[0])
    l3.append(U_test)
    test_stats.append(test_val)
    eff_s.append(effect_size)

# Print every movie whose p-value is at or below the threshold, then the
# count and the share of such movies.
sig_count = 0
for p_value, movie_name in zip(l3, names):
    if p_value <= alpha:
        print(movie_name)
        sig_count += 1
total = len(l3)
p = sig_count / total
print(sig_count)
print(p)
# --- Sheltered upbringing vs ratings of three selected movies ---
sheltered_col = 'I had a sheltered upbringing'
# NOTE(review): assumes answers 1-2 mean "not sheltered" and 4-5 mean
# "sheltered" - confirm against the survey codebook.
not_sheltered_answers = [1, 2]
sheltered_answers = [4, 5]


def _split_by_upbringing(pair_df, movie):
    """Return ``(sheltered, not_sheltered)`` rating Series for ``movie``.

    ``pair_df`` holds the upbringing column and the movie column with missing
    rows already removed.

    BUG FIX: the original filters were ``== 1 | 2`` and ``== 4 | 5``. ``|``
    binds tighter than ``==``, so they compared against 3 and 5. ``isin``
    selects the intended answer sets.
    """
    answers = pair_df[sheltered_col]
    not_sheltered = pair_df.loc[answers.isin(not_sheltered_answers), movie].reset_index(drop=True)
    sheltered = pair_df.loc[answers.isin(sheltered_answers), movie].reset_index(drop=True)
    return sheltered, not_sheltered


def _report_upbringing_test(sheltered, not_sheltered):
    """Print the Mann-Whitney U test, Cliff's delta and summary numbers.

    Group 1 in the printed labels is the sheltered group, group 2 the
    not-sheltered group.
    """
    print(scipy.stats.mannwhitneyu(not_sheltered, sheltered))
    delta, magnitude = cliffs_delta(sheltered, not_sheltered)
    print(delta, magnitude)
    print("Median Arr 1: " + str(np.median(sheltered)))
    print("Sample Size 1: " + str(len(sheltered)))
    print("Sum 1: " + str(np.sum(sheltered)))
    print("Median Arr 2: " + str(np.median(not_sheltered)))
    print("Sample Size 2: " + str(len(not_sheltered)))
    print("Sum 2: " + str(np.sum(not_sheltered)))


# Pair the upbringing answer with each movie and keep complete rows only.
comp1 = data_df.loc[:, [sheltered_col, 'American Psycho (2000)']]
comp2 = data_df.loc[:, [sheltered_col, 'Kill Bill: Vol. 1 (2003)']]
comp3 = data_df.loc[:, [sheltered_col, 'Bowling For Columbine (2002)']]
dfa = comp1.dropna().reset_index(drop=True)
dfk = comp2.dropna().reset_index(drop=True)
dfbc = comp3.dropna().reset_index(drop=True)

# American Psycho
A_yes, A_no = _split_by_upbringing(dfa, 'American Psycho (2000)')
_report_upbringing_test(A_yes, A_no)
A_combinedData = np.transpose(np.array([A_yes, A_no], dtype=object))

# Kill Bill
K_yes, K_no = _split_by_upbringing(dfk, 'Kill Bill: Vol. 1 (2003)')
_report_upbringing_test(K_yes, K_no)
K_combinedData = np.transpose(np.array([K_yes, K_no], dtype=object))

# Bowling for Columbine
BC_yes, BC_no = _split_by_upbringing(dfbc, 'Bowling For Columbine (2002)')
_report_upbringing_test(BC_yes, BC_no)
BC_combinedData = np.transpose(np.array([BC_yes, BC_no], dtype=object))
# Fill colours of the two boxes, shared by the three figures below.
colors = ['green', 'lightblue']

# Look of the non-box artists, shared by the three figures below.
_artist_styles = [
    ('whiskers', {'color': '#8B008B', 'linewidth': 1.5, 'linestyle': ":"}),
    ('caps', {'color': '#8B008B', 'linewidth': 2}),
    ('medians', {'color': 'red', 'linewidth': 3}),
    ('fliers', {'marker': 'D', 'color': '#e7298a', 'alpha': 0.5}),
]


def _draw_rating_boxplot(figure, samples, heading, tick_labels=None):
    """Draw notched horizontal box plots of ``samples`` on ``figure`` and show it.

    ``tick_labels`` replaces the y tick labels when given; with ``None`` the
    default tick labels are left untouched. Returns the axes and the
    dictionary of artists created by ``boxplot``.
    """
    axes = figure.add_subplot(111)
    artists = axes.boxplot(samples, patch_artist=True, notch='True', vert=0)
    for box, fill in zip(artists['boxes'], colors):
        box.set_facecolor(fill)
    for group, style in _artist_styles:
        for artist in artists[group]:
            artist.set(**style)
    if tick_labels is not None:
        axes.set_yticklabels(tick_labels)
    plt.title(heading)
    # Keep the ticks on the bottom and left sides only.
    axes.xaxis.tick_bottom()
    axes.yaxis.tick_left()
    plt.xlabel('Ratings')
    plt.show()
    return axes, artists


# Bowling for Columbine: the only figure with named groups on the y-axis.
fig1 = plt.figure()
ax, bp = _draw_rating_boxplot(
    fig1,
    BC_combinedData,
    "Bowling For Columbine (2002) Ratings",
    tick_labels=['Sheltered', 'Not Sheltered'],
)

# Kill Bill: default tick labels (the author left the group names off).
fig2 = plt.figure()
ax, bp = _draw_rating_boxplot(fig2, K_combinedData, "Kill Bill: Vol. 1 (2003) Ratings")

# American Psycho: default tick labels as well.
fig3 = plt.figure()
ax, bp = _draw_rating_boxplot(fig3, A_combinedData, "American Psycho (2000) Ratings")