# Don't change this cell; just run it.
# Load the Numpy array module with variable name 'np'.
import numpy as np
# Load Pandas data table module with variable name 'pd'.
import pandas as pd
# Turn on a setting to use Pandas more safely.
# We will discuss this setting later.
pd.set_option('mode.chained_assignment', 'raise')
# Load "pyplot" submodule of Matplotlib plotting module
# with variable name "plt".
import matplotlib.pyplot as plt
# Make plots look a little more fancy.
plt.style.use('fivethirtyeight')
# Tell plotting package to display plots inside the notebook.
%matplotlib inline
# Load the OKpy testing library
from client.api.notebook import Notebook
ok = Notebook('race_policing.ok')
=====================================================================
Assignment: demographics
OK, version v1.18.1
=====================================================================
# Load the data from a file in the same directory as the notebook.
# "regions_by_eth" holds the parsed CSV as a Pandas data frame.
regions_by_eth = pd.read_csv('areas-of-england-and-wales-by-ethnicity.csv')
# Show the first five rows.
regions_by_eth.head()
#- Put the "Region" column of the data frame into a new variable "regions".
#- 2 marks / 100 (total 2 so far)
# Indexing the data frame with one column label selects that column.
regions = regions_by_eth['Region']
# Show the result
regions
# Check answer is on right track.
# The return value of the grader is bound to the throwaway name "_".
_ = ok.grade('q_1_regions')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- "is_west_midlands" should have True for elements corresponding to West
#- Midlands, False otherwise.
#- 3 marks / 100 (total 5 so far)
# Element-wise equality test of the "Region" column against the region name.
is_west_midlands = regions_by_eth['Region'].eq('West Midlands')
# Show the first five values
is_west_midlands.head(5)
# Check answer is on right track.
_ = ok.grade('q_2_is_wm')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 5 marks / 100 (total 10 so far)
# Keep only the rows whose "Region" value is West Midlands.
west_midlands_by_eth = regions_by_eth.loc[
    regions_by_eth['Region'].eq('West Midlands')
]
west_midlands_by_eth.head(5)
# Check answer is on right track.
_ = ok.grade('q_3_wm_by_eth')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- Make a new data frame "west_midlands_sorted" by sorting
#- "west_midlands_by_eth" by the "%" column. Largest values should come first.
#- 5 marks / 100 (total 15 so far)
# Descending sort on the percentage column; returns a new data frame.
west_midlands_sorted = west_midlands_by_eth.sort_values(by='%', ascending=False)
# Show first five rows of the sorted data frame.
west_midlands_sorted.head(5)
# Check answer is on right track.
_ = ok.grade('q_4_wm_sorted')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 0
[ooooooooook] 100.0% passed
#- 5 marks / 100 (total 20 so far)
# Horizontal bar chart: one bar per "Standard Ethnicity" value, with the bar
# length given by the "%" column.
west_midlands_sorted.plot.barh(x='Standard Ethnicity', y='%')
plt.ylabel('Standard Ethnicity')
plt.xlabel('%')
# "plt.show" without parentheses only names the function; it must be called
# to have any effect.
plt.show()
#- Put the West Midlands percentage of "White British" into a new variable
#- "white_british_pct". You will probably need more than one line of code.
#- 5 marks / 100 (total 25 so far)
# Select the row by its ethnicity label rather than by a hard-coded row label
# (38), which would silently pick the wrong row if the file were reordered.
# NOTE(review): assumes the "Standard Ethnicity" column contains the label
# 'White British' for West Midlands - confirm against the data file.
is_white_british = west_midlands_sorted['Standard Ethnicity'] == 'White British'
# Take the first (expected to be the only) matching percentage as a scalar.
white_british_pct = west_midlands_sorted.loc[is_white_british, '%'].iloc[0]
# Show the result converted to a simple floating point value.
float(white_british_pct)
_ = ok.grade('q_6_wb_pct')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 2
Failed: 0
[ooooooooook] 100.0% passed
#- Load the file "2020-09-west-midlands-stop-and-search.csv"
#- as a data frame.
#- Put the new data frame into a variable "stop_search".
#- 2 marks / 100 (total 27 so far)
# The file name is relative, so the CSV is read from the working directory.
stop_search = pd.read_csv('2020-09-west-midlands-stop-and-search.csv')
# Show the first 5 rows.
stop_search.head()
# Check you are on the right track.
_ = ok.grade('q_7_stop_search')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- Make a new variable "ethnicity" that has the values from
#- the "Self-defined ethnicity" column of "stop_search".
#- 2 marks / 100 (total 29 so far)
# Select the single column by its label.
ethnicity = stop_search['Self-defined ethnicity']
# Show the first five values
ethnicity.head()
# Check you are on the right track.
_ = ok.grade('q_8_ethnicity')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
# Run this cell.
type(ethnicity)
#- 3 marks / 100 (total 32 so far)
# Count how often each distinct self-defined ethnicity occurs. Counting on the
# column (a Series) gives a result indexed directly by the ethnicity labels;
# the data frame method's index type for a single column label differs
# between Pandas versions.
eth_counts = stop_search['Self-defined ethnicity'].value_counts()
# Show the result
eth_counts
# Check you are on the right track.
_ = ok.grade('q_9_eth_counts')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
# A function to recode ethnicity.
# Run this cell to define the function.
def recode_eth(value):
    """Recode a "Self-defined ethnicity" value to the standard categories.

    Parameters
    ----------
    value : str or missing value
        Ethnicity string as used in "Self-defined ethnicity", or a missing
        value (anything for which ``pd.isna`` is True).

    Returns
    -------
    str or float
        Matching ethnicity used by the government statistics on ethnicity,
        or ``np.nan`` when `value` is missing.

    Raises
    ------
    ValueError
        If `value` does not start with any of the expected prefixes.
    """
    if pd.isna(value):  # Missing value
        return np.nan
    # (prefix, recoded value) pairs, checked in this order; the first prefix
    # that matches the start of `value` decides the result.
    prefix_to_standard = (
        ('Asian', 'Asian'),
        ('Mixed', 'Mixed'),
        ('Black', 'Black'),
        ('White - English', 'White British'),
        ('White - Irish', 'White British'),
        ('White - Any other', 'White other'),
        ('Other ethnic group', 'Other'),
    )
    for prefix, standard in prefix_to_standard:
        if value.startswith(prefix):
            return standard
    # If we didn't already return something, stop here
    # with an error.
    raise ValueError('Did not expect ethnicity ' + value)
# Run this cell.
# Show example of the recoding function in action
recode_eth('Asian/Asian British - Indian')
# Run this cell.
# Another example
recode_eth('White - Irish')
# Run this cell to use the "apply" method of "ethnicity",
# and the "recode_eth" function above, to make a new variable
# "recoded_eth". This has the ethnicity values recoded to the
# form used in the UK government demographics.
# "recode_eth" returns np.nan for missing values, so missing stays missing.
recoded_eth = ethnicity.apply(recode_eth)
# Display the first five values of "recoded_eth"
recoded_eth.head()
#- Make a new variable "recoded_eth_counts" that has the unique values of
#- "recoded_eth", and their counts.
#- 2 marks / 100 (total 34 so far)
# One entry per distinct recoded ethnicity, holding its number of occurrences.
recoded_eth_counts = recoded_eth.value_counts()
# Show the values of "recoded_eth_counts"
recoded_eth_counts
# Check you are on the right track.
_ = ok.grade('q_10_recoded_eth_counts')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- Use a "plot." method of "recoded_eth_counts" to show a
#- horizontal bar chart of the counts for each ethnicity.
#- The ethnicity should display on the vertical axis
#- and the count on the horizontal axis.
#- 3 marks / 100 (total 37 so far)
# The method must be called: a bare "recoded_eth_counts.plot.barh" without
# parentheses only names the method and draws nothing.
recoded_eth_counts.plot.barh()
# "count()" gives the number of non-missing values; "len" gives the number
# of all values, missing or not.
print('Number of valid values in "recoded_eth"', recoded_eth.count())
print('Number of values in "recoded_eth"', len(recoded_eth))
Number of valid values in "recoded_eth" 1885
Number of values in "recoded_eth" 2051
# Run this cell.
# Build a two-column data frame in one step: the recoded ethnicity values
# and a copy of the outcome series from the original data frame.
# The .copy() calls are for safety. More later in the course.
eth_outcome = pd.DataFrame({
    'recoded_eth': recoded_eth.copy(),
    'Outcome': stop_search['Outcome'].copy(),
})
# Show the first five rows.
eth_outcome.head(5)
#- Make a new variable "valid_eth_outcome" containing a data frame restricted
#- to rows with no missing values.
#- 3 marks / 100 (total 40 so far)
# "dropna" returns a new data frame without the rows that have a missing
# value in any column; "eth_outcome" itself is left unchanged.
valid_eth_outcome = eth_outcome.dropna()
# Show the first 5 rows
valid_eth_outcome.head()
# Check you are on the right track.
_ = ok.grade('q_12_valid_eth')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 0
[ooooooooook] 100.0% passed
# Run this cell.
# Row count of the table once rows with missing values are gone.
n_valid_eth = valid_eth_outcome.shape[0]
n_valid_eth
#- 3 marks / 100 (total 43 so far)
#- Calculate the proportion of 'White British' values in "valid_eth_outcome".
# Count 'White British' among the valid rows only. Counting in the full
# "recoded_eth" series would also include rows that "dropna" removed from
# "valid_eth_outcome", and a pasted-in number would go stale if the data
# changed.
prop_wb = (
    np.count_nonzero(valid_eth_outcome['recoded_eth'] == 'White British')
    / n_valid_eth
)
prop_wb
# Check you are on the right track.
_ = ok.grade('q_12a_prop_wb')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
# Simulation of proportion from a single month of stop-search incidents.
# One uniform random number between 0 and 1 for each valid incident.
randoms = np.random.uniform(0, 1, size=n_valid_eth)
# An incident counts as "White British" when its random number is below 0.792.
# NOTE(review): 0.792 is presumably the West Midlands White British
# proportion (white_british_pct / 100) - confirm.
wb_incidents = randoms < 0.792
# Proportion of simulated incidents that were White British.
simulated_prop = np.count_nonzero(wb_incidents) / n_valid_eth
simulated_prop
#- 5 marks / 100 (total 48 so far)
#- Run 10000 trials. Collect the proportions at each trial.
#- Store the proportions in an array sim_wb_proportions.
sim_wb_proportions = np.zeros(10000)
for i in np.arange(10000):
    # One simulated month: a uniform random number per valid incident.
    trial_randoms = np.random.uniform(0, 1, size=n_valid_eth)
    # Number of simulated incidents that came out as White British.
    trial_count = np.count_nonzero(trial_randoms < 0.792)
    # Store the proportion, not the raw count: dividing by the number of
    # incidents keeps every stored value between 0 and 1.
    sim_wb_proportions[i] = trial_count / n_valid_eth
# Show the simulated proportions
sim_wb_proportions[:5]
# Check you are on the right track.
_ = ok.grade('q_13_sim_wb_proportions')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Question 13_sim_wb_proportions > Suite 1 > Case 5
>>> # At least some of your values are suprisingly high.
>>> np.all(sim_wb_proportions < 1)
False
# Error: expected
# True
# but got
# False
Run only this test case with "python3 ok -q q_13_sim_wb_proportions --suite 1 --case 5"
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 1
[ooooooook..] 80.0% passed
#- Show a histogram of the proportions in the "sim_wb_proportions" array.
#- 2 marks / 100 (total 50 so far)
# No "bins" argument is given, so Matplotlib's default binning applies.
plt.hist(sim_wb_proportions)
#- 5 marks / 100 (total 55 so far)
# NOTE(review): 4 looks like the number of a multiple-choice option; the
# question text is not visible here - confirm against the assignment.
likely_no_bias = 4
# Check you are on the right track.
_ = ok.grade('q_14_1_no_bias')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 2 marks / 100 (total 57 so far)
# The "Outcome" column of the table restricted to rows without missing values.
outcomes = valid_eth_outcome['Outcome']
# Show the first five values in outcomes
outcomes.head()
# Check you are on the right track.
_ = ok.grade('q_15_outcomes')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 0
[ooooooooook] 100.0% passed
#- 2 marks / 100 (total 59 so far)
# Count each distinct outcome with the Series method; the top-level
# "pd.value_counts" function is deprecated in recent Pandas releases.
outcomes_counts = valid_eth_outcome['Outcome'].value_counts()
# Show the result.
outcomes_counts
# Check you are on the right track.
_ = ok.grade('q_16_outcomes_counts')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 6 marks / 100 (total 65 so far)
# Outcomes for the rows recoded as White British, picked with one ".loc"
# call: row mask first, column label second.
white_outcomes = valid_eth_outcome.loc[
    valid_eth_outcome["recoded_eth"] == "White British", "Outcome"
]
# Show the first five values.
white_outcomes.head(5)
# Check you are on the right track.
_ = ok.grade('q_19_white_outcomes')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- Calculate the unique values and counts of the different outcomes, for
#- White British people who were stopped and searched.
#- 2 marks / 100 (total 67 so far)
# Use the Series method; the top-level "pd.value_counts" function is
# deprecated in recent Pandas releases.
white_outcomes_counts = white_outcomes.value_counts()
white_outcomes_counts
# Check you are on the right track.
_ = ok.grade('q_20_white_outcomes_counts')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 5 marks / 100 (total 72 so far)
# Number of White British stops with a recorded outcome. "len" counts the
# rows directly instead of relying on every outcome string being "truthy".
white_arrested_valid = len(white_outcomes)
# True where the outcome was an arrest, False otherwise.
white_arrested_only = white_outcomes == 'Arrest'
# Counting the True values gives the number of arrests.
white_arrests = np.count_nonzero(white_arrested_only)
# Proportion of White British stops that ended in arrest.
white_arrested_p = white_arrests / white_arrested_valid
# Show the proportion (rather than a pasted-in number).
white_arrested_p
# Check you are on the right track.
_ = ok.grade('q_20_1_white_arrested_p')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 5 marks / 100 (total 77 so far)
# Build the mask from "valid_eth_outcome" so it has exactly the same rows as
# "outcomes". A mask made from the full "recoded_eth" series also covers rows
# that "dropna" removed, so its index would not match.
not_white = valid_eth_outcome['recoded_eth'] != 'White British'
not_white_outcomes = outcomes[not_white]
not_white_outcomes.head()
# Check you are on the right track.
_ = ok.grade('q_22_not_white_outcomes')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 2 marks / 100 (total 79 so far)
# Use the Series method; the top-level "pd.value_counts" function is
# deprecated in recent Pandas releases.
not_white_outcomes_counts = not_white_outcomes.value_counts()
not_white_outcomes_counts
# Check you are on the right track.
_ = ok.grade('q_23_not_white_outcomes_counts')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
#- 3 marks / 100 (total 82 so far)
# Number of not-White-British stops with a recorded outcome. "len" counts
# the rows directly instead of relying on every outcome string being "truthy".
not_white_arrested_valid = len(not_white_outcomes)
# True where the outcome was an arrest, False otherwise.
not_white_arrested = not_white_outcomes == 'Arrest'
# Counting the True values gives the number of arrests.
not_white_arrests = np.count_nonzero(not_white_arrested)
# Proportion of not-White-British stops that ended in arrest.
not_white_arrested_p = not_white_arrests / not_white_arrested_valid
not_white_arrested_p
# Check you are on the right track.
_ = ok.grade('q_23_1_not_white_arrested_p')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
# Observed difference in arrest proportion: White British minus not White
# British.
p_diff = white_arrested_p - not_white_arrested_p
p_diff
# Number of valid rows recoded as White British.
n_wb = np.count_nonzero(valid_eth_outcome['recoded_eth'] == 'White British')
n_wb
# Number of valid rows with any other recoded ethnicity.
n_nwb = np.count_nonzero(valid_eth_outcome['recoded_eth'] != 'White British')
n_nwb
# Number of valid rows whose outcome was an arrest.
n_arrests = np.count_nonzero(valid_eth_outcome['Outcome'] == 'Arrest')
n_arrests
# Overall arrest proportion across all valid rows, regardless of ethnicity.
p_arrest = n_arrests / len(valid_eth_outcome)
p_arrest
# One simulated trial for the White British group: each of the n_wb stops is
# an arrest when its uniform random number falls below p_arrest.
wb_randoms = np.random.uniform(0, 1, size=n_wb)
n_wb_arrests = np.count_nonzero(wb_randoms < p_arrest)
n_wb_arrests
# The same simulation for the n_nwb stops of the other group, using the same
# arrest probability.
nwb_randoms = np.random.uniform(0, 1, size=n_nwb)
n_nwb_arrests = np.count_nonzero(nwb_randoms < p_arrest)
n_nwb_arrests
#- 15 marks / 100 (total 97 so far)
# Simulated arrest proportions for each group, one value per trial.
p_white_random_arrests = np.zeros(10000)
p_not_white_random_arrests = np.zeros(10000)
for i in np.arange(10000):
    # White British group: n_wb stops, each an arrest with chance p_arrest.
    wb_randoms = np.random.uniform(0, 1, size=n_wb)
    p_white_random_arrests[i] = np.count_nonzero(wb_randoms < p_arrest) / n_wb
    # Other group: n_nwb stops with the same arrest chance. This array must
    # be filled from its own simulation; left at zero it would contribute
    # nothing to the differences.
    nwb_randoms = np.random.uniform(0, 1, size=n_nwb)
    p_not_white_random_arrests[i] = (
        np.count_nonzero(nwb_randoms < p_arrest) / n_nwb
    )
# Difference in simulated proportions, in the same direction as "p_diff"
# (White British minus not White British). Each count was already divided by
# its own group size, so no further scaling is needed.
p_differences = p_white_random_arrests - p_not_white_random_arrests
# Show the first five differences
p_differences[:5]
# Test you are on the right track.
_ = ok.grade('q_23_2_p_differences')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 0
[ooooooooook] 100.0% passed
# Code to plot a histogram of p_differences
# No "bins" argument is given, so Matplotlib's default binning applies.
plt.hist(p_differences)
#- 3 marks / 100 (total 100 so far)
# NOTE(review): 3 looks like the number of a multiple-choice option; the
# question text is not visible here - confirm against the assignment.
likely_outcome_same = 3
# Check you are on the right track.
_ = ok.grade('q_24_outcome_same')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
# For your convenience, you can run this cell to run all the tests at once!
import os
# Grade every test file in "tests" whose name starts with 'q'. A plain loop
# is used because the calls are made for their side effects only, and the
# listing is sorted so the tests run in the same order each time.
# NOTE(review): assumes the test files are named "<test name>.py" - confirm.
for test_file in sorted(os.listdir("tests")):
    test_name, extension = os.path.splitext(test_file)
    if test_file.startswith('q') and extension == '.py':
        _ = ok.grade(test_name)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Question 13_sim_wb_proportions > Suite 1 > Case 5
>>> # At least some of your values are suprisingly high.
>>> np.all(sim_wb_proportions < 1)
False
# Error: expected
# True
# but got
# False
Run only this test case with "python3 ok -q q_13_sim_wb_proportions --suite 1 --case 5"
---------------------------------------------------------------------
Test summary
Passed: 4
Failed: 1
[ooooooook..] 80.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 2
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running tests
---------------------------------------------------------------------
Test summary
Passed: 3
Failed: 0
[ooooooooook] 100.0% passed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.
Current values:
NotebookApp.iopub_msg_rate_limit=50.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)