!pip install statsmodels==0.12.2
Requirement already satisfied: statsmodels==0.12.2 in /root/venv/lib/python3.7/site-packages (0.12.2)
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.2.4)
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.6.3)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.19.5)
Requirement already satisfied: patsy>=0.5 in /root/venv/lib/python3.7/site-packages (from statsmodels==0.12.2) (0.5.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2021.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels==0.12.2) (1.16.0)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
# Global plot styling: seaborn dark grid and a wide default figure size.
sns.set_style("darkgrid")
mpl.rcParams['figure.figsize'] = (20, 5)

# Load the three datasets used throughout the notebook.
# NOTE: 'Bechdel Complete.csv' spells its columns with spaces
# (e.g. "bechdel test"), while the two IMDB-merged files use underscores
# (e.g. "bechdel_test").
bechdel_only = pd.read_csv('Bechdel Complete.csv')
bechdel_imdb = pd.read_csv('Bechdel plus IMDB.csv')
bechdel_imdb_budget = pd.read_csv('Bechdel with IMDB and Budget.csv')

# Preview each frame. DataFrame.info() prints its own report and returns
# None, so it is called directly; wrapping it in print() emitted a stray
# "None" line after every summary.
for label, frame in (
    ('Bechdel Only', bechdel_only),
    ('Bechdel with IMDB Ratings', bechdel_imdb),
    ('Bechdel with Ratings and Budget', bechdel_imdb_budget),
):
    print(label)
    print(frame.head())
    frame.info()
Bechdel Only
title imdbid bechdel test \
0 Roundhay Garden Scene 392728.0 FAIL
1 Pauvre Pierrot 3.0 FAIL
2 Blacksmith Scene 5.0 FAIL
3 Execution of Mary, Queen of Scots, The 132134.0 FAIL
4 Tables Turned on the Gardener 14.0 FAIL
2 named women talk to each other not about men rating year
0 False False False 0 1888
1 False False False 0 1892
2 False False False 0 1893
3 False False False 0 1895
4 False False False 0 1895
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8930 entries, 0 to 8929
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 8930 non-null object
1 imdbid 8926 non-null float64
2 bechdel test 8930 non-null object
3 2 named women 8930 non-null bool
4 talk to each other 8930 non-null bool
5 not about men 8930 non-null bool
6 rating 8930 non-null int64
7 year 8930 non-null int64
dtypes: bool(3), float64(1), int64(2), object(2)
memory usage: 375.1+ KB
None
Bechdel with IMDB Ratings
bechdel.title imdbid bechdel_test 2_named_women \
0 Only the Brave 3829920 FAIL True
1 Sand Castle 2582576 FAIL False
2 Beauty and the Beast 2771200 PASS True
3 Disaster Artist, The 3521126 PASS True
4 Dance Academy: The Movie 5834660 PASS True
talk_to_each_other not_about_men rating year genre \
0 True False 2 2017 Action, Biography, Drama
1 False False 0 2017 Action, Drama, War
2 True True 3 2017 Family, Fantasy, Musical
3 True True 3 2017 Biography, Comedy, Drama
4 True True 3 2017 Drama
duration ... females_18age_avg_vote females_18age_votes \
0 134 ... 8.0 1420.0
1 113 ... 6.4 541.0
2 129 ... 7.6 28300.0
3 104 ... 7.4 6497.0
4 101 ... 7.4 324.0
females_30age_avg_vote females_30age_votes females_45age_avg_vote \
0 7.8 2595 7.8
1 6.3 741 6.5
2 7.5 27857 7.7
3 7.2 6376 7.0
4 7.0 234 6.3
females_45age_votes us_voters_rating us_voters_votes \
0 1044.0 7.8 7248.0
1 237.0 6.3 1988.0
2 4978.0 7.2 34926.0
3 1272.0 7.5 20064.0
4 44.0 7.5 172.0
non_us_voters_rating non_us_voters_votes
0 7.5 24036
1 6.3 8114
2 7.0 93811
3 7.3 53873
4 6.6 516
[5 rows x 62 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7956 entries, 0 to 7955
Data columns (total 62 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 bechdel.title 7956 non-null object
1 imdbid 7956 non-null int64
2 bechdel_test 7956 non-null object
3 2_named_women 7956 non-null bool
4 talk_to_each_other 7956 non-null bool
5 not_about_men 7956 non-null bool
6 rating 7956 non-null int64
7 year 7956 non-null int64
8 genre 7956 non-null object
9 duration 7956 non-null int64
10 country 7955 non-null object
11 language 7931 non-null object
12 director 7956 non-null object
13 writer 7946 non-null object
14 production_company 7913 non-null object
15 description 7938 non-null object
16 weighted_average_vote 7956 non-null float64
17 total_votes 7956 non-null int64
18 mean_vote 7956 non-null float64
19 median_vote 7956 non-null float64
20 votes_10 7956 non-null int64
21 votes_9 7956 non-null int64
22 votes_8 7956 non-null int64
23 votes_7 7956 non-null int64
24 votes_6 7956 non-null int64
25 votes_5 7956 non-null int64
26 votes_4 7956 non-null int64
27 votes_3 7956 non-null int64
28 votes_2 7956 non-null int64
29 votes_1 7956 non-null int64
30 allgenders_0age_avg_vote 6791 non-null float64
31 allgenders_0age_votes 6791 non-null float64
32 allgenders_18age_avg_vote 7955 non-null float64
33 allgenders_18age_votes 7955 non-null float64
34 allgenders_30age_avg_vote 7956 non-null float64
35 allgenders_30age_votes 7956 non-null int64
36 allgenders_45age_avg_vote 7956 non-null float64
37 allgenders_45age_votes 7956 non-null int64
38 males_allages_avg_vote 7956 non-null float64
39 males_allages_votes 7956 non-null int64
40 males_0age_avg_vote 6339 non-null float64
41 males_0age_votes 6339 non-null float64
42 males_18age_avg_vote 7953 non-null float64
43 males_18age_votes 7953 non-null float64
44 males_30age_avg_vote 7956 non-null float64
45 males_30age_votes 7956 non-null int64
46 males_45age_avg_vote 7956 non-null float64
47 males_45age_votes 7956 non-null int64
48 females_allages_avg_vote 7956 non-null float64
49 females_allages_votes 7956 non-null int64
50 females_0age_avg_vote 5822 non-null float64
51 females_0age_votes 5822 non-null float64
52 females_18age_avg_vote 7939 non-null float64
53 females_18age_votes 7939 non-null float64
54 females_30age_avg_vote 7956 non-null float64
55 females_30age_votes 7956 non-null int64
56 females_45age_avg_vote 7952 non-null float64
57 females_45age_votes 7952 non-null float64
58 us_voters_rating 7955 non-null float64
59 us_voters_votes 7955 non-null float64
60 non_us_voters_rating 7956 non-null float64
61 non_us_voters_votes 7956 non-null int64
dtypes: bool(3), float64(27), int64(23), object(9)
memory usage: 3.6+ MB
None
Bechdel with Ratings and Budget
bechdel.title imdbid bechdel_test 2_named_women \
0 Invisible Man, The 1051906 PASS True
1 Onward 7146812 PASS True
2 Emma. 9214832 PASS True
3 Honey Boy 8151874 FAIL False
4 Fantasy Island 983946 PASS True
talk_to_each_other not_about_men rating year \
0 True True 3 2020
1 True True 3 2020
2 True True 3 2020
3 False False 0 2020
4 True True 3 2020
genre duration ... females_30age_votes \
0 Horror, Mystery, Sci-Fi 124 ... 6778
1 Animation, Adventure, Comedy 102 ... 4173
2 Comedy, Drama 124 ... 2313
3 Drama 94 ... 1126
4 Action, Adventure, Fantasy 109 ... 1554
females_45age_avg_vote females_45age_votes us_voters_rating us_voters_votes \
0 7.2 1603.0 7.2 15568
1 7.6 814.0 7.5 12884
2 6.8 708.0 6.9 2954
3 7.1 404.0 7.4 5663
4 5.3 494.0 4.9 3820
non_us_voters_rating non_us_voters_votes budget dom_gross \
0 7.0 49787 $7,000,000 $64,914,050
1 7.3 25321 $200,000,000 $61,555,145
2 6.6 7433 $10,000,000 $10,055,355
3 7.1 6725 $3,500,000 $3,012,615
4 4.8 9940 $7,000,000 $26,441,782
int_gross
0 $138,997,923
1 $127,912,438
2 $26,377,603
3 $3,415,542
4 $47,954,473
[5 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3275 entries, 0 to 3274
Data columns (total 65 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 bechdel.title 3275 non-null object
1 imdbid 3275 non-null int64
2 bechdel_test 3275 non-null object
3 2_named_women 3275 non-null bool
4 talk_to_each_other 3275 non-null bool
5 not_about_men 3275 non-null bool
6 rating 3275 non-null int64
7 year 3275 non-null int64
8 genre 3275 non-null object
9 duration 3275 non-null int64
10 country 3275 non-null object
11 language 3272 non-null object
12 director 3275 non-null object
13 writer 3275 non-null object
14 production_company 3273 non-null object
15 description 3268 non-null object
16 weighted_average_vote 3275 non-null float64
17 total_votes 3275 non-null int64
18 mean_vote 3275 non-null float64
19 median_vote 3275 non-null float64
20 votes_10 3275 non-null int64
21 votes_9 3275 non-null int64
22 votes_8 3275 non-null int64
23 votes_7 3275 non-null int64
24 votes_6 3275 non-null int64
25 votes_5 3275 non-null int64
26 votes_4 3275 non-null int64
27 votes_3 3275 non-null int64
28 votes_2 3275 non-null int64
29 votes_1 3275 non-null int64
30 allgenders_0age_avg_vote 3185 non-null float64
31 allgenders_0age_votes 3185 non-null float64
32 allgenders_18age_avg_vote 3275 non-null float64
33 allgenders_18age_votes 3275 non-null int64
34 allgenders_30age_avg_vote 3275 non-null float64
35 allgenders_30age_votes 3275 non-null int64
36 allgenders_45age_avg_vote 3275 non-null float64
37 allgenders_45age_votes 3275 non-null int64
38 males_allages_avg_vote 3275 non-null float64
39 males_allages_votes 3275 non-null int64
40 males_0age_avg_vote 3109 non-null float64
41 males_0age_votes 3109 non-null float64
42 males_18age_avg_vote 3275 non-null float64
43 males_18age_votes 3275 non-null int64
44 males_30age_avg_vote 3275 non-null float64
45 males_30age_votes 3275 non-null int64
46 males_45age_avg_vote 3275 non-null float64
47 males_45age_votes 3275 non-null int64
48 females_allages_avg_vote 3275 non-null float64
49 females_allages_votes 3275 non-null int64
50 females_0age_avg_vote 2960 non-null float64
51 females_0age_votes 2960 non-null float64
52 females_18age_avg_vote 3274 non-null float64
53 females_18age_votes 3274 non-null float64
54 females_30age_avg_vote 3275 non-null float64
55 females_30age_votes 3275 non-null int64
56 females_45age_avg_vote 3274 non-null float64
57 females_45age_votes 3274 non-null float64
58 us_voters_rating 3275 non-null float64
59 us_voters_votes 3275 non-null int64
60 non_us_voters_rating 3275 non-null float64
61 non_us_voters_votes 3275 non-null int64
62 budget 3275 non-null object
63 dom_gross 3275 non-null object
64 int_gross 3275 non-null object
dtypes: bool(3), float64(24), int64(26), object(12)
memory usage: 1.6+ MB
None
# Pie chart: share of all films that pass vs. fail the Bechdel test.
#
# `bechdel_only` names its result column "bechdel test" (with a space);
# the underscore spelling only exists in the IMDB-merged frames, which is
# why indexing with 'bechdel_test' raised a KeyError here.

# Only movies that pass the Bechdel test.
bechdel_movies = bechdel_only[bechdel_only["bechdel test"] == "PASS"]

pass_rate = bechdel_only['bechdel test'].value_counts(normalize=True)
print(pass_rate)

# value_counts() orders results by frequency, not alphabetically, so the
# slice labels, colours and explode offsets are derived from the index
# instead of assuming PASS always comes first.
labels = list(pass_rate.index)
color_by_result = {'PASS': 'green', 'FAIL': 'red'}
colors = [color_by_result.get(result, 'grey') for result in labels]
explode = [0.1 if result == 'FAIL' else 0 for result in labels]

plt.pie(
    pass_rate,
    labels=labels,
    autopct='%1.1f%%',
    startangle=15,
    shadow=True,
    colors=colors,
    explode=explode,
)
plt.axis('equal')
plt.title('Percent of Total Films Passing or Failing Bechdel Test')
plt.show()
KeyError: 'bechdel_test'
# Line chart: fraction of films passing the Bechdel test, per release year.
#
# `bechdel_only` names its result column "bechdel test" (with a space), so
# the underscore spelling raised a KeyError. Plotting the raw PASS/FAIL
# label of every film as a line just zig-zags between two categories, so
# the labels are aggregated into a yearly pass rate first.
year = bechdel_only['year']
pass_fail = bechdel_only['bechdel test']

# 1 for PASS, 0 otherwise; the per-year mean is then the pass rate.
passed = (pass_fail == 'PASS').astype(int)
yearly_pass_rate = passed.groupby(year).mean()

plt.plot(yearly_pass_rate.index, yearly_pass_rate.values)
plt.xlabel('Year')
plt.ylabel('Share of films passing')
plt.title('Bechdel Test Pass Rate by Year')
plt.show()
# Correlation heatmap between the Bechdel result, release year, vote
# averages and the money columns of the budget dataset.
money_cols = ['budget', 'dom_gross', 'int_gross']
corr_var = bechdel_imdb_budget[
    ['year', 'mean_vote', 'males_allages_avg_vote',
     'females_allages_avg_vote'] + money_cols
].copy()

# The loaded frame has no 'bechdel_binary' column, so it is derived here.
# Presumably 1 = PASS and 0 = FAIL was the intended encoding -- confirm.
corr_var.insert(
    0,
    'bechdel_binary',
    (bechdel_imdb_budget['bechdel_test'] == 'PASS').astype(int),
)

# The money columns are read as strings such as "$7,000,000"; strip the
# currency formatting so they can take part in the correlation. Values
# that still fail to parse become NaN and are ignored by corr().
for col in money_cols:
    cleaned = corr_var[col].astype(str).str.replace(r'[$,]', '', regex=True)
    corr_var[col] = pd.to_numeric(cleaned, errors='coerce')

bechdel_corr = corr_var.corr()
plt.figure(figsize=(15, 7))
sns.heatmap(bechdel_corr, annot=True)
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.show()
# Bar charts: total box-office gross (domestic + international) summed per
# genre, for the primary, secondary and tertiary genre of each film.

# The gross columns are read as strings such as "$64,914,050"; adding them
# directly would concatenate text, so convert to numbers first.
gross_parts = {}
for col in ('dom_gross', 'int_gross'):
    cleaned = bechdel_imdb_budget[col].astype(str).str.replace(
        r'[$,]', '', regex=True)
    gross_parts[col] = pd.to_numeric(cleaned, errors='coerce')
total_gross = gross_parts['dom_gross'] + gross_parts['int_gross']

# The loaded frame only has a combined 'genre' column ("Action, Drama, War").
# Use genre1/genre2/genre3 if an earlier step already added them; otherwise
# split the combined column. Films with fewer than three genres get NaN in
# the missing slots.
genre_parts = (
    bechdel_imdb_budget['genre']
    .str.split(', ', expand=True)
    .reindex(columns=range(3))
)
genre1 = bechdel_imdb_budget.get('genre1', genre_parts[0])
genre2 = bechdel_imdb_budget.get('genre2', genre_parts[1])
genre3 = bechdel_imdb_budget.get('genre3', genre_parts[2])

for genre_col, chart_title in (
    (genre1, 'Box Office Gross: Primary Genre'),
    (genre2, 'Box Office Gross: Secondary Genre'),
    (genre3, 'Box Office Gross: Tertiary Genre'),
):
    # groupby() drops NaN keys, which avoids the matplotlib TypeError raised
    # when a float NaN appears among string category labels, and summing
    # gives one bar per genre instead of thousands of overdrawn bars.
    gross_by_genre = total_gross.groupby(genre_col).sum()
    plt.bar(gross_by_genre.index, gross_by_genre.values)
    plt.title(chart_title)
    plt.show()
TypeError: 'value' must be an instance of str or bytes, not a float
# Bar chart: profit margin per primary genre, where
# margin = (total gross - budget) / total gross, computed on per-genre sums.

# Parse the "$1,234,567"-style money columns into numbers. Unparseable
# values become NaN.
money_cols = ['budget', 'dom_gross', 'int_gross']
money = pd.DataFrame(index=bechdel_imdb_budget.index)
for col in money_cols:
    cleaned = bechdel_imdb_budget[col].astype(str).str.replace(
        r'[$,]', '', regex=True)
    money[col] = pd.to_numeric(cleaned, errors='coerce')

# Drop only rows missing a money value. A bare dropna() on the full frame
# would also discard films that merely lack an unrelated vote column.
money = money.dropna()
null_budget_gone = bechdel_imdb_budget.loc[money.index]

budget = money['budget']
total_gross = money['dom_gross'] + money['int_gross']

# Use an existing genre1 column if an earlier step added one; otherwise
# take the first entry of the combined 'genre' column.
genre1 = null_budget_gone.get(
    'genre1', null_budget_gone['genre'].str.split(', ').str[0])

gross_by_genre = total_gross.groupby(genre1).sum()
budget_by_genre = budget.groupby(genre1).sum()

# Profit and margin are computed per genre. Dividing one global profit
# figure by each film's gross (as before) does not yield a margin.
profit = gross_by_genre - budget_by_genre
# Genres with zero gross have no defined margin; mask them to NaN rather
# than dividing by zero.
profit_margin = profit / gross_by_genre.where(gross_by_genre != 0)

plt.bar(profit_margin.index, profit_margin.values)
plt.title('Profit Margin: Primary Genre')
plt.show()
# NOTE(review): plot() is called with no data here -- looks like a leftover
# or the start of an unfinished cell; confirm whether it can be removed.
plt.plot()