import numpy as np # module for creating and manipulating arrays
import matplotlib.pyplot as plt # plotting
import pandas as pd # dataframes
# Load the San Francisco 2015 employee-compensation table from the CSV at
# `url` (read over the network by pandas) and print a short overview.
url = 'https://raw.githubusercontent.com/data-8/materials-sp18/master/lec/san_francisco_2015.csv'
df0 = pd.read_csv(url)
# Adjacent string literals are concatenated with no separator, so the blank
# between "the" and "year" has to be written explicitly in the first literal.
print("Employee compensations in the City of San Francisco in the "
      "year 2015\nShape: {} \nColumns: {}".format(df0.shape, df0.columns))
# First rows of the table (rendered as the cell output in a notebook).
df0.head()
Employee compensations in the City of San Francisco in theyear 2015
Shape: (42989, 22)
Columns: Index(['Year Type', 'Year', 'Organization Group Code', 'Organization Group',
'Department Code', 'Department', 'Union Code', 'Union',
'Job Family Code', 'Job Family', 'Job Code', 'Job',
'Employee Identifier', 'Salaries', 'Overtime', 'Other Salaries',
'Total Salary', 'Retirement', 'Health/Dental', 'Other Benefits',
'Total Benefits', 'Total Compensation'],
dtype='object')
# Peek at the three rows with the largest and the three rows with the
# smallest 'Total Compensation'.
df0.sort_values(by='Total Compensation', ascending=False).iloc[:3]
df0.sort_values(by='Total Compensation', ascending=True).iloc[:3]
# Keep only the rows whose 'Salaries' value is above 10000.
salary_above_threshold = df0['Salaries'] > 10000
df = df0.loc[salary_above_threshold]
# Divide 'Total Compensation' by 1000 so later numbers are in thousands.
TC = df['Total Compensation'].div(1000)
TC
# Summary statistics of TC, computed step by step.
# Mean: total divided by the number of observations.
TC_mean = TC.sum() / TC.size
# Population standard deviation: square root of the mean squared deviation.
# The deviation has to be taken per observation (TC - TC_mean) and the
# squares summed afterwards; subtracting the mean from the grand total
# (TC.sum() - TC_mean) produces a number with no statistical meaning.
a = TC - TC_mean            # deviation of every observation from the mean
b = a**2                    # squared deviations
c = b.sum() / TC.size       # mean squared deviation (variance)
TC_std = np.sqrt(c)
TC_min = TC.min()
TC_max = TC.max()
# Report the mean 'Total Compensation' together with its standard deviation
# and the observed range.
print("The mean Total Compensation is {:.0f} +- {:.0f} (mean +- SD) "
      "thousand dollars annually, \nwith a range "
      "({:.2f}, {:.2f})".format(TC_mean, TC_std, TC_min, TC_max))
The mean Total Compensation is 115 +- 21938 (mean +- SD) thousand dollars annually,
with a range (11.05, 648.88)
# Density-normalised histogram of TC: 20 bins over the interval 0-700.
plt.hist(TC, bins=20, range=(0, 700), density=True)
plt.xlabel("Total Compensation (k$)")
# The median is the 50th percentile of TC.
TC_median = np.percentile(TC, 50, axis=-1)
median_report = "The median is {:.1f} thousand dollars annually".format(TC_median)
print(median_report)
The median is 110.3 thousand dollars annually
# 10th, 50th and 90th percentiles of TC, computed in a single call.
tc_percentiles = np.percentile(TC, [10, 50, 90])
print("10, 50 and 90 percentiles {}".format(tc_percentiles))
10, 50 and 90 percentiles [ 32.756136 110.30579 196.293506]
# Draw a random sample of 500 elements from TC, without replacement.
n = 500
our_sample = np.random.choice(TC, size=n, replace=False)
# Plot a histogram of the sample
plt.hist(our_sample, range=(0, 700))
# Calculate the median of the sample. It must be taken from our_sample;
# using TC here would simply reproduce the population median.
sample_median = np.percentile(our_sample, 50, axis=-1)
print("Sample median {:.2f}".format(sample_median))
Sample median 110.31
# One bootstrap resample: n values drawn with replacement from our_sample.
resample_1 = np.random.choice(our_sample, size=n, replace=True)
# Plot a histogram of the drawn sample
# plt. ... your code here...
def bootstrap_median(original_sample, replications):
    """Bootstrap the median of a sample.

    Each replication draws ``len(original_sample)`` values from
    ``original_sample`` with replacement and records the median of that draw.

    Parameters
    ----------
    original_sample: array containing the original sample
    replications: number of bootstrap samples

    Returns
    -------
    Float array of length ``replications`` holding one median per resample.
    """
    sample_size = len(original_sample)
    # The generator is consumed in order, so the resamples are drawn one
    # after another, exactly ``replications`` times.
    resampled_medians = (
        np.median(np.random.choice(original_sample, sample_size, replace=True))
        for _ in range(replications)
    )
    return np.fromiter(resampled_medians, dtype=float, count=replications)
# Bootstrap distribution of the median: 5000 resamples of our_sample.
bstrap_medians = bootstrap_median(our_sample, 5000)
# The 95% confidence interval for the median is the middle 95% of the
# bootstrapped medians: left end = 2.5th percentile, right end = 97.5th.
# Taking these percentiles from TC instead would describe the spread of
# individual compensations, not the uncertainty of the median.
leftp = np.percentile(bstrap_medians, 2.5, axis=-1)
rightp = np.percentile(bstrap_medians, 97.5, axis=-1)
print("The sample median {:.1f}\n95% confidence interval of the median ({:.1f}, {:.1f})".format(sample_median, leftp, rightp))
# Plot a histogram of bstrap_medians with the 95% confidence interval and
# the sample median marked on it.
plt.hist(bstrap_medians, density=True)
# indicate the middle 95% with a horizontal line
plt.hlines(0.0, leftp, rightp, 'y', lw=8)  # 2.5 and 97.5 percentiles
# indicate the sample median with a red dot
plt.plot(sample_median, 0.002, 'ro', ms=10)  # median
plt.xlabel('Bootstrapped medians')
The sample median 110.3
95% confidence interval of the median (17.7, 249.4)